diff options
Diffstat (limited to 'arch/x86/kvm/x86.c')
-rw-r--r-- | arch/x86/kvm/x86.c | 330 |
1 files changed, 211 insertions, 119 deletions
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index cfd880ab841d..df6f6df902c6 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -90,6 +90,8 @@ u64 __read_mostly efer_reserved_bits = ~((u64)(EFER_SCE | EFER_LME | EFER_LMA)); static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE); #endif +static u64 __read_mostly cr4_reserved_bits = CR4_RESERVED_BITS; + #define VM_STAT(x, ...) offsetof(struct kvm, stat.x), KVM_STAT_VM, ## __VA_ARGS__ #define VCPU_STAT(x, ...) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU, ## __VA_ARGS__ @@ -295,13 +297,14 @@ int kvm_set_shared_msr(unsigned slot, u64 value, u64 mask) struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu); int err; - if (((value ^ smsr->values[slot].curr) & mask) == 0) + value = (value & mask) | (smsr->values[slot].host & ~mask); + if (value == smsr->values[slot].curr) return 0; - smsr->values[slot].curr = value; err = wrmsrl_safe(shared_msrs_global.msrs[slot], value); if (err) return 1; + smsr->values[slot].curr = value; if (!smsr->registered) { smsr->urn.on_user_return = kvm_on_user_return; user_return_notifier_register(&smsr->urn); @@ -437,6 +440,14 @@ void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu) * for #DB exceptions under VMX. */ vcpu->arch.dr6 ^= payload & DR6_RTM; + + /* + * The #DB payload is defined as compatible with the 'pending + * debug exceptions' field under VMX, not DR6. While bit 12 is + * defined in the 'pending debug exceptions' field (enabled + * breakpoint), it is reserved and must be zero in DR6. + */ + vcpu->arch.dr6 &= ~BIT(12); break; case PF_VECTOR: vcpu->arch.cr2 = payload; @@ -816,11 +827,25 @@ void kvm_load_guest_xcr0(struct kvm_vcpu *vcpu) xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0); vcpu->guest_xcr0_loaded = 1; } + + if (static_cpu_has(X86_FEATURE_PKU) && + (kvm_read_cr4_bits(vcpu, X86_CR4_PKE) || + (vcpu->arch.xcr0 & XFEATURE_MASK_PKRU)) && + vcpu->arch.pkru != vcpu->arch.host_pkru) + __write_pkru(vcpu->arch.pkru); } EXPORT_SYMBOL_GPL(kvm_load_guest_xcr0); void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu) { + if (static_cpu_has(X86_FEATURE_PKU) && + (kvm_read_cr4_bits(vcpu, X86_CR4_PKE) || + (vcpu->arch.xcr0 & XFEATURE_MASK_PKRU))) { + vcpu->arch.pkru = rdpkru(); + if (vcpu->arch.pkru != vcpu->arch.host_pkru) + __write_pkru(vcpu->arch.host_pkru); + } + if (vcpu->guest_xcr0_loaded) { if (vcpu->arch.xcr0 != host_xcr0) xsetbv(XCR_XFEATURE_ENABLED_MASK, host_xcr0); @@ -880,9 +905,38 @@ int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) } EXPORT_SYMBOL_GPL(kvm_set_xcr); +static u64 kvm_host_cr4_reserved_bits(struct cpuinfo_x86 *c) +{ + u64 reserved_bits = CR4_RESERVED_BITS; + + if (!cpu_has(c, X86_FEATURE_XSAVE)) + reserved_bits |= X86_CR4_OSXSAVE; + + if (!cpu_has(c, X86_FEATURE_SMEP)) + reserved_bits |= X86_CR4_SMEP; + + if (!cpu_has(c, X86_FEATURE_SMAP)) + reserved_bits |= X86_CR4_SMAP; + + if (!cpu_has(c, X86_FEATURE_FSGSBASE)) + reserved_bits |= X86_CR4_FSGSBASE; + + if (!cpu_has(c, X86_FEATURE_PKU)) + reserved_bits |= X86_CR4_PKE; + + if (!cpu_has(c, X86_FEATURE_LA57) && + !(cpuid_ecx(0x7) & bit(X86_FEATURE_LA57))) + reserved_bits |= X86_CR4_LA57; + + if (!cpu_has(c, X86_FEATURE_UMIP) && !kvm_x86_ops->umip_emulated()) + reserved_bits |= X86_CR4_UMIP; + + return reserved_bits; +} + static int kvm_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { - if (cr4 & CR4_RESERVED_BITS) + if (cr4 & cr4_reserved_bits) return -EINVAL; if (!guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && (cr4 & X86_CR4_OSXSAVE)) @@ -921,6 +975,8 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) if (is_long_mode(vcpu)) { if (!(cr4 & X86_CR4_PAE)) return 1; + if ((cr4 ^ old_cr4) & X86_CR4_LA57) + return 1; } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE) && ((cr4 ^ old_cr4) & pdptr_bits) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu, @@ -1048,9 +1104,11 @@ static u64 kvm_dr6_fixed(struct kvm_vcpu *vcpu) static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) { + size_t size = ARRAY_SIZE(vcpu->arch.db); + switch (dr) { case 0 ... 3: - vcpu->arch.db[dr] = val; + vcpu->arch.db[array_index_nospec(dr, size)] = val; if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) vcpu->arch.eff_db[dr] = val; break; @@ -1087,9 +1145,11 @@ EXPORT_SYMBOL_GPL(kvm_set_dr); int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val) { + size_t size = ARRAY_SIZE(vcpu->arch.db); + switch (dr) { case 0 ... 3: - *val = vcpu->arch.db[dr]; + *val = vcpu->arch.db[array_index_nospec(dr, size)]; break; case 4: /* fall through */ @@ -1250,27 +1310,18 @@ u64 kvm_get_arch_capabilities(void) data |= ARCH_CAP_SKIP_VMENTRY_L1DFLUSH; /* - * On TAA affected systems, export MDS_NO=0 when: - * - TSX is enabled on the host, i.e. X86_FEATURE_RTM=1. - * - Updated microcode is present. This is detected by - * the presence of ARCH_CAP_TSX_CTRL_MSR and ensures - * that VERW clears CPU buffers. - * - * When MDS_NO=0 is exported, guests deploy clear CPU buffer - * mitigation and don't complain: - * - * "Vulnerable: Clear CPU buffers attempted, no microcode" - * - * If TSX is disabled on the system, guests are also mitigated against - * TAA and clear CPU buffer mitigation is not required for guests. + * On TAA affected systems: + * - nothing to do if TSX is disabled on the host. + * - we emulate TSX_CTRL if present on the host. + * This lets the guest use VERW to clear CPU buffers. */ - if (boot_cpu_has_bug(X86_BUG_TAA) && boot_cpu_has(X86_FEATURE_RTM) && - (data & ARCH_CAP_TSX_CTRL_MSR)) - data &= ~ARCH_CAP_MDS_NO; + if (!boot_cpu_has(X86_FEATURE_RTM)) + data &= ~(ARCH_CAP_TAA_NO | ARCH_CAP_TSX_CTRL_MSR); + else if (!boot_cpu_has_bug(X86_BUG_TAA)) + data |= ARCH_CAP_TAA_NO; return data; } -EXPORT_SYMBOL_GPL(kvm_get_arch_capabilities); static int kvm_get_msr_feature(struct kvm_msr_entry *msr) { @@ -2372,7 +2423,10 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info) default: if (msr >= MSR_IA32_MC0_CTL && msr < MSR_IA32_MCx_CTL(bank_num)) { - u32 offset = msr - MSR_IA32_MC0_CTL; + u32 offset = array_index_nospec( + msr - MSR_IA32_MC0_CTL, + MSR_IA32_MCx_CTL(bank_num) - MSR_IA32_MC0_CTL); + /* only 0 or all 1s can be written to IA32_MCi_CTL * some Linux kernels though clear bit 10 in bank 4 to * workaround a BIOS/GART TBL issue on AMD K8s, ignore @@ -2467,43 +2521,47 @@ static void kvm_vcpu_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa) static void record_steal_time(struct kvm_vcpu *vcpu) { + struct kvm_host_map map; + struct kvm_steal_time *st; + if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED)) return; - if (unlikely(kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.st.stime, - &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)))) + /* -EAGAIN is returned in atomic context so we can just return. */ + if (kvm_map_gfn(vcpu, vcpu->arch.st.msr_val >> PAGE_SHIFT, + &map, &vcpu->arch.st.cache, false)) return; + st = map.hva + + offset_in_page(vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS); + /* * Doing a TLB flush here, on the guest's behalf, can avoid * expensive IPIs. */ - if (xchg(&vcpu->arch.st.steal.preempted, 0) & KVM_VCPU_FLUSH_TLB) + trace_kvm_pv_tlb_flush(vcpu->vcpu_id, + st->preempted & KVM_VCPU_FLUSH_TLB); + if (xchg(&st->preempted, 0) & KVM_VCPU_FLUSH_TLB) kvm_vcpu_flush_tlb(vcpu, false); - if (vcpu->arch.st.steal.version & 1) - vcpu->arch.st.steal.version += 1; /* first time write, random junk */ + vcpu->arch.st.preempted = 0; - vcpu->arch.st.steal.version += 1; + if (st->version & 1) + st->version += 1; /* first time write, random junk */ - kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime, - &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)); + st->version += 1; smp_wmb(); - vcpu->arch.st.steal.steal += current->sched_info.run_delay - + st->steal += current->sched_info.run_delay - vcpu->arch.st.last_steal; vcpu->arch.st.last_steal = current->sched_info.run_delay; - kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime, - &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)); - smp_wmb(); - vcpu->arch.st.steal.version += 1; + st->version += 1; - kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime, - &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)); + kvm_unmap_gfn(vcpu, &map, &vcpu->arch.st.cache, true, false); } int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) @@ -2570,7 +2628,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return kvm_mtrr_set_msr(vcpu, msr, data); case MSR_IA32_APICBASE: return kvm_set_apic_base(vcpu, msr_info); - case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff: + case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff: return kvm_x2apic_msr_write(vcpu, msr, data); case MSR_IA32_TSCDEADLINE: kvm_set_lapic_tscdeadline_msr(vcpu, data); @@ -2648,11 +2706,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (data & KVM_STEAL_RESERVED_MASK) return 1; - if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.st.stime, - data & KVM_STEAL_VALID_BITS, - sizeof(struct kvm_steal_time))) - return 1; - vcpu->arch.st.msr_val = data; if (!(data & KVM_MSR_ENABLED)) @@ -2792,7 +2845,10 @@ static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host) default: if (msr >= MSR_IA32_MC0_CTL && msr < MSR_IA32_MCx_CTL(bank_num)) { - u32 offset = msr - MSR_IA32_MC0_CTL; + u32 offset = array_index_nospec( + msr - MSR_IA32_MC0_CTL, + MSR_IA32_MCx_CTL(bank_num) - MSR_IA32_MC0_CTL); + data = vcpu->arch.mce_banks[offset]; break; } @@ -2869,7 +2925,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_APICBASE: msr_info->data = kvm_get_apic_base(vcpu); break; - case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff: + case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff: return kvm_x2apic_msr_read(vcpu, msr_info->index, &msr_info->data); break; case MSR_IA32_TSCDEADLINE: @@ -3305,9 +3361,8 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) kvm_x86_ops->vcpu_load(vcpu, cpu); - fpregs_assert_state_consistent(); - if (test_thread_flag(TIF_NEED_FPU_LOAD)) - switch_fpu_return(); + /* Save host pkru register if supported */ + vcpu->arch.host_pkru = read_pkru(); /* Apply any externally detected TSC adjustments (due to suspend) */ if (unlikely(vcpu->arch.tsc_offset_adjustment)) { @@ -3348,15 +3403,25 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu) { + struct kvm_host_map map; + struct kvm_steal_time *st; + if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED)) return; - vcpu->arch.st.steal.preempted = KVM_VCPU_PREEMPTED; + if (vcpu->arch.st.preempted) + return; - kvm_write_guest_offset_cached(vcpu->kvm, &vcpu->arch.st.stime, - &vcpu->arch.st.steal.preempted, - offsetof(struct kvm_steal_time, preempted), - sizeof(vcpu->arch.st.steal.preempted)); + if (kvm_map_gfn(vcpu, vcpu->arch.st.msr_val >> PAGE_SHIFT, &map, + &vcpu->arch.st.cache, true)) + return; + + st = map.hva + + offset_in_page(vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS); + + st->preempted = vcpu->arch.st.preempted = KVM_VCPU_PREEMPTED; + + kvm_unmap_gfn(vcpu, &map, &vcpu->arch.st.cache, true, true); } void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) @@ -3492,7 +3557,7 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu, unsigned bank_num = mcg_cap & 0xff, bank; r = -EINVAL; - if (!bank_num || bank_num >= KVM_MAX_MCE_BANKS) + if (!bank_num || bank_num > KVM_MAX_MCE_BANKS) goto out; if (mcg_cap & ~(kvm_mce_cap_supported | 0xff | 0xff0000)) goto out; @@ -4285,6 +4350,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, case KVM_SET_NESTED_STATE: { struct kvm_nested_state __user *user_kvm_nested_state = argp; struct kvm_nested_state kvm_state; + int idx; r = -EINVAL; if (!kvm_x86_ops->set_nested_state) @@ -4308,7 +4374,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp, && !(kvm_state.flags & KVM_STATE_NESTED_GUEST_MODE)) break; + idx = srcu_read_lock(&vcpu->kvm->srcu); r = kvm_x86_ops->set_nested_state(vcpu, user_kvm_nested_state, &kvm_state); + srcu_read_unlock(&vcpu->kvm->srcu, idx); break; } case KVM_GET_SUPPORTED_HV_CPUID: { @@ -6214,11 +6282,11 @@ static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type) return r; } -static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t cr2, +static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, bool write_fault_to_shadow_pgtable, int emulation_type) { - gpa_t gpa = cr2; + gpa_t gpa = cr2_or_gpa; kvm_pfn_t pfn; if (!(emulation_type & EMULTYPE_ALLOW_RETRY)) @@ -6232,7 +6300,7 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t cr2, * Write permission should be allowed since only * write access need to be emulated. */ - gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2, NULL); + gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2_or_gpa, NULL); /* * If the mapping is invalid in guest, let cpu retry @@ -6289,10 +6357,10 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t cr2, } static bool retry_instruction(struct x86_emulate_ctxt *ctxt, - unsigned long cr2, int emulation_type) + gpa_t cr2_or_gpa, int emulation_type) { struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); - unsigned long last_retry_eip, last_retry_addr, gpa = cr2; + unsigned long last_retry_eip, last_retry_addr, gpa = cr2_or_gpa; last_retry_eip = vcpu->arch.last_retry_eip; last_retry_addr = vcpu->arch.last_retry_addr; @@ -6321,14 +6389,14 @@ static bool retry_instruction(struct x86_emulate_ctxt *ctxt, if (x86_page_table_writing_insn(ctxt)) return false; - if (ctxt->eip == last_retry_eip && last_retry_addr == cr2) + if (ctxt->eip == last_retry_eip && last_retry_addr == cr2_or_gpa) return false; vcpu->arch.last_retry_eip = ctxt->eip; - vcpu->arch.last_retry_addr = cr2; + vcpu->arch.last_retry_addr = cr2_or_gpa; if (!vcpu->arch.mmu->direct_map) - gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2, NULL); + gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2_or_gpa, NULL); kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa)); @@ -6472,11 +6540,8 @@ static bool is_vmware_backdoor_opcode(struct x86_emulate_ctxt *ctxt) return false; } -int x86_emulate_instruction(struct kvm_vcpu *vcpu, - unsigned long cr2, - int emulation_type, - void *insn, - int insn_len) +int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, + int emulation_type, void *insn, int insn_len) { int r; struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; @@ -6519,8 +6584,9 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, if (r != EMULATION_OK) { if (emulation_type & EMULTYPE_TRAP_UD) return EMULATE_FAIL; - if (reexecute_instruction(vcpu, cr2, write_fault_to_spt, - emulation_type)) + if (reexecute_instruction(vcpu, cr2_or_gpa, + write_fault_to_spt, + emulation_type)) return EMULATE_DONE; if (ctxt->have_exception) { /* @@ -6549,7 +6615,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, return EMULATE_DONE; } - if (retry_instruction(ctxt, cr2, emulation_type)) + if (retry_instruction(ctxt, cr2_or_gpa, emulation_type)) return EMULATE_DONE; /* this is needed for vmware backdoor interface to work since it @@ -6561,7 +6627,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, restart: /* Save the faulting GPA (cr2) in the address field */ - ctxt->exception.address = cr2; + ctxt->exception.address = cr2_or_gpa; r = x86_emulate_insn(ctxt); @@ -6569,7 +6635,7 @@ restart: return EMULATE_DONE; if (r == EMULATION_FAILED) { - if (reexecute_instruction(vcpu, cr2, write_fault_to_spt, + if (reexecute_instruction(vcpu, cr2_or_gpa, write_fault_to_spt, emulation_type)) return EMULATE_DONE; @@ -6606,7 +6672,7 @@ restart: if (!ctxt->have_exception || exception_type(ctxt->exception.vector) == EXCPT_TRAP) { kvm_rip_write(vcpu, ctxt->eip); - if (r == EMULATE_DONE && ctxt->tf) + if (r == EMULATE_DONE && (ctxt->tf || (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP))) kvm_vcpu_do_singlestep(vcpu, &r); __kvm_set_rflags(vcpu, ctxt->eflags); } @@ -6968,33 +7034,25 @@ static struct perf_guest_info_callbacks kvm_guest_cbs = { .handle_intel_pt_intr = kvm_handle_intel_pt_intr, }; +extern u8 __read_mostly shadow_phys_bits; + static void kvm_set_mmio_spte_mask(void) { u64 mask; - int maxphyaddr = boot_cpu_data.x86_phys_bits; - - /* - * Set the reserved bits and the present bit of an paging-structure - * entry to generate page fault with PFER.RSV = 1. - */ /* - * Mask the uppermost physical address bit, which would be reserved as - * long as the supported physical address width is less than 52. + * Set a reserved PA bit in MMIO SPTEs to generate page faults with + * PFEC.RSVD=1 on MMIO accesses. 64-bit PTEs (PAE, x86-64, and EPT + * paging) support a maximum of 52 bits of PA, i.e. if the CPU supports + * 52-bit physical addresses then there are no reserved PA bits in the + * PTEs and so the reserved PA approach must be disabled. */ - mask = 1ull << 51; - - /* Set the present bit. */ - mask |= 1ull; - - /* - * If reserved bit is not supported, clear the present bit to disable - * mmio page fault. - */ - if (IS_ENABLED(CONFIG_X86_64) && maxphyaddr == 52) - mask &= ~1ull; + if (shadow_phys_bits < 52) + mask = BIT_ULL(51) | PT_PRESENT_MASK; + else + mask = 0; - kvm_mmu_set_mmio_spte_mask(mask, mask); + kvm_mmu_set_mmio_spte_mask(mask, mask, ACC_WRITE_MASK | ACC_USER_MASK); } #ifdef CONFIG_X86_64 @@ -7350,7 +7408,7 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu) kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr); } -static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win) +static int inject_pending_event(struct kvm_vcpu *vcpu) { int r; @@ -7386,7 +7444,7 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win) * from L2 to L1. */ if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) { - r = kvm_x86_ops->check_nested_events(vcpu, req_int_win); + r = kvm_x86_ops->check_nested_events(vcpu); if (r != 0) return r; } @@ -7448,7 +7506,7 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win) * KVM_REQ_EVENT only on certain events and not unconditionally? */ if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) { - r = kvm_x86_ops->check_nested_events(vcpu, req_int_win); + r = kvm_x86_ops->check_nested_events(vcpu); if (r != 0) return r; } @@ -7759,9 +7817,8 @@ static void vcpu_load_eoi_exitmap(struct kvm_vcpu *vcpu) kvm_x86_ops->load_eoi_exitmap(vcpu, eoi_exit_bitmap); } -int kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, - unsigned long start, unsigned long end, - bool blockable) +void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, + unsigned long start, unsigned long end) { unsigned long apic_address; @@ -7772,8 +7829,6 @@ int kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, apic_address = gfn_to_hva(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT); if (start <= apic_address && apic_address < end) kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD); - - return 0; } void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu) @@ -7921,7 +7976,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) goto out; } - if (inject_pending_event(vcpu, req_int_win) != 0) + if (inject_pending_event(vcpu) != 0) req_immediate_exit = true; else { /* Enable SMI/NMI/IRQ window open exits if needed. @@ -8016,8 +8071,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) wait_lapic_expire(vcpu); guest_enter_irqoff(); - /* The preempt notifier should have taken care of the FPU already. */ - WARN_ON_ONCE(test_thread_flag(TIF_NEED_FPU_LOAD)); + fpregs_assert_state_consistent(); + if (test_thread_flag(TIF_NEED_FPU_LOAD)) + switch_fpu_return(); if (unlikely(vcpu->arch.switch_db_regs)) { set_debugreg(0, 7); @@ -8137,7 +8193,7 @@ static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu) static inline bool kvm_vcpu_running(struct kvm_vcpu *vcpu) { if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) - kvm_x86_ops->check_nested_events(vcpu, false); + kvm_x86_ops->check_nested_events(vcpu); return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE && !vcpu->arch.apf.halted); @@ -8274,12 +8330,26 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu) return 0; } +static void kvm_save_current_fpu(struct fpu *fpu) +{ + /* + * If the target FPU state is not resident in the CPU registers, just + * memcpy() from current, else save CPU state directly to the target. + */ + if (test_thread_flag(TIF_NEED_FPU_LOAD)) + memcpy(&fpu->state, ¤t->thread.fpu.state, + fpu_kernel_xstate_size); + else + copy_fpregs_to_fpstate(fpu); +} + /* Swap (qemu) user FPU context for the guest FPU context. */ static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) { fpregs_lock(); - copy_fpregs_to_fpstate(&vcpu->arch.user_fpu); + kvm_save_current_fpu(vcpu->arch.user_fpu); + /* PKRU is separately restored in kvm_x86_ops->run. */ __copy_kernel_to_fpregs(&vcpu->arch.guest_fpu->state, ~XFEATURE_MASK_PKRU); @@ -8295,8 +8365,9 @@ static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) { fpregs_lock(); - copy_fpregs_to_fpstate(vcpu->arch.guest_fpu); - copy_kernel_to_fpregs(&vcpu->arch.user_fpu.state); + kvm_save_current_fpu(vcpu->arch.guest_fpu); + + copy_kernel_to_fpregs(&vcpu->arch.user_fpu->state); fpregs_mark_activate(); fpregs_unlock(); @@ -8517,6 +8588,8 @@ int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, struct kvm_mp_state *mp_state) { vcpu_load(vcpu); + if (kvm_mpx_supported()) + kvm_load_guest_fpu(vcpu); kvm_apic_accept_events(vcpu); if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED && @@ -8525,6 +8598,8 @@ int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, else mp_state->mp_state = vcpu->arch.mp_state; + if (kvm_mpx_supported()) + kvm_put_guest_fpu(vcpu); vcpu_put(vcpu); return 0; } @@ -8881,6 +8956,9 @@ static void fx_init(struct kvm_vcpu *vcpu) void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) { void *wbinvd_dirty_mask = vcpu->arch.wbinvd_dirty_mask; + struct gfn_to_pfn_cache *cache = &vcpu->arch.st.cache; + + kvm_release_pfn(cache->pfn, cache->dirty, cache); kvmclock_reset(vcpu); @@ -8941,13 +9019,7 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) { - vcpu->arch.apf.msr_val = 0; - - vcpu_load(vcpu); - kvm_mmu_unload(vcpu); - vcpu_put(vcpu); - - kvm_x86_ops->vcpu_free(vcpu); + kvm_arch_vcpu_free(vcpu); } void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) @@ -9139,6 +9211,8 @@ int kvm_arch_hardware_setup(void) if (r != 0) return r; + cr4_reserved_bits = kvm_host_cr4_reserved_bits(&boot_cpu_data); + if (kvm_has_tsc_control) { /* * Make sure the user can only configure tsc_khz values that @@ -9480,6 +9554,13 @@ int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, { int i; + /* + * Clear out the previous array pointers for the KVM_MR_MOVE case. The + * old arrays will be freed by __kvm_set_memory_region() if installing + * the new memslot is successful. + */ + memset(&slot->arch, 0, sizeof(slot->arch)); + for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) { struct kvm_lpage_info *linfo; unsigned long ugfn; @@ -9542,11 +9623,18 @@ out_free: void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen) { + struct kvm_vcpu *vcpu; + int i; + /* * memslots->generation has been incremented. * mmio generation may have reached its maximum value. */ kvm_mmu_invalidate_mmio_sptes(kvm, gen); + + /* Force re-initialization of steal_time cache */ + kvm_for_each_vcpu(i, vcpu, kvm) + kvm_vcpu_kick(vcpu); } int kvm_arch_prepare_memory_region(struct kvm *kvm, @@ -9554,6 +9642,10 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, const struct kvm_userspace_memory_region *mem, enum kvm_mr_change change) { + if (change == KVM_MR_MOVE) + return kvm_arch_create_memslot(kvm, memslot, + mem->memory_size >> PAGE_SHIFT); + return 0; } @@ -9793,7 +9885,7 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work) work->arch.cr3 != vcpu->arch.mmu->get_cr3(vcpu)) return; - vcpu->arch.mmu->page_fault(vcpu, work->gva, 0, true); + vcpu->arch.mmu->page_fault(vcpu, work->cr2_or_gpa, 0, true); } static inline u32 kvm_async_pf_hash_fn(gfn_t gfn) @@ -9876,7 +9968,7 @@ void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu, { struct x86_exception fault; - trace_kvm_async_pf_not_present(work->arch.token, work->gva); + trace_kvm_async_pf_not_present(work->arch.token, work->cr2_or_gpa); kvm_add_async_pf_gfn(vcpu, work->arch.gfn); if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) || @@ -9904,7 +9996,7 @@ void kvm_arch_async_page_present(struct kvm_vcpu *vcpu, work->arch.token = ~0; /* broadcast wakeup */ else kvm_del_async_pf_gfn(vcpu, work->arch.gfn); - trace_kvm_async_pf_ready(work->arch.token, work->gva); + trace_kvm_async_pf_ready(work->arch.token, work->cr2_or_gpa); if (vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED && !apf_get_user(vcpu, &val)) { |