diff options
Diffstat (limited to 'arch/x86/kvm')
-rw-r--r-- | arch/x86/kvm/kvm_cache_regs.h | 2 | ||||
-rw-r--r-- | arch/x86/kvm/lapic.c | 2 | ||||
-rw-r--r-- | arch/x86/kvm/mmu.c | 94 | ||||
-rw-r--r-- | arch/x86/kvm/mmu.h | 2 | ||||
-rw-r--r-- | arch/x86/kvm/mmutrace.h | 2 | ||||
-rw-r--r-- | arch/x86/kvm/paging_tmpl.h | 7 | ||||
-rw-r--r-- | arch/x86/kvm/svm.c | 6 | ||||
-rw-r--r-- | arch/x86/kvm/vmx.c | 17 | ||||
-rw-r--r-- | arch/x86/kvm/x86.c | 50 |
9 files changed, 109 insertions, 73 deletions
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h index f500293dad8d..ce4a9f1f845e 100644 --- a/arch/x86/kvm/kvm_cache_regs.h +++ b/arch/x86/kvm/kvm_cache_regs.h @@ -5,7 +5,7 @@ #define KVM_POSSIBLE_CR0_GUEST_BITS X86_CR0_TS #define KVM_POSSIBLE_CR4_GUEST_BITS \ (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \ - | X86_CR4_OSXMMEXCPT | X86_CR4_LA57 | X86_CR4_PGE) + | X86_CR4_OSXMMEXCPT | X86_CR4_LA57 | X86_CR4_PGE | X86_CR4_TSD) static inline unsigned long kvm_register_read(struct kvm_vcpu *vcpu, enum kvm_reg reg) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 537c36b55b5d..d4fdf0e52144 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1918,7 +1918,7 @@ void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data) { struct kvm_lapic *apic = vcpu->arch.apic; - if (!lapic_in_kernel(vcpu) || apic_lvtt_oneshot(apic) || + if (!kvm_apic_present(vcpu) || apic_lvtt_oneshot(apic) || apic_lvtt_period(apic)) return; diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index e5af08b58132..1cceee0ed580 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -263,6 +263,11 @@ static const u64 shadow_nonpresent_or_rsvd_mask_len = 5; */ static u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask; +/* + * The number of non-reserved physical address bits irrespective of features + * that repurpose legal bits, e.g. MKTME. + */ +static u8 __read_mostly shadow_phys_bits; static void mmu_spte_set(u64 *sptep, u64 spte); static void mmu_free_roots(struct kvm_vcpu *vcpu); @@ -275,11 +280,18 @@ static bool is_executable_pte(u64 spte); void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value) { BUG_ON((mmio_mask & mmio_value) != mmio_value); + WARN_ON(mmio_value & (shadow_nonpresent_or_rsvd_mask << shadow_nonpresent_or_rsvd_mask_len)); + WARN_ON(mmio_value & shadow_nonpresent_or_rsvd_lower_gfn_mask); shadow_mmio_value = mmio_value | SPTE_SPECIAL_MASK; shadow_mmio_mask = mmio_mask | SPTE_SPECIAL_MASK; } EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask); +static bool is_mmio_spte(u64 spte) +{ + return (spte & shadow_mmio_mask) == shadow_mmio_value; +} + static inline bool sp_ad_disabled(struct kvm_mmu_page *sp) { return sp->role.ad_disabled; @@ -287,7 +299,7 @@ static inline bool sp_ad_disabled(struct kvm_mmu_page *sp) static inline bool spte_ad_enabled(u64 spte) { - MMU_WARN_ON((spte & shadow_mmio_mask) == shadow_mmio_value); + MMU_WARN_ON(is_mmio_spte(spte)); return !(spte & shadow_acc_track_value); } @@ -298,13 +310,13 @@ static bool is_nx_huge_page_enabled(void) static inline u64 spte_shadow_accessed_mask(u64 spte) { - MMU_WARN_ON((spte & shadow_mmio_mask) == shadow_mmio_value); + MMU_WARN_ON(is_mmio_spte(spte)); return spte_ad_enabled(spte) ? shadow_accessed_mask : 0; } static inline u64 spte_shadow_dirty_mask(u64 spte) { - MMU_WARN_ON((spte & shadow_mmio_mask) == shadow_mmio_value); + MMU_WARN_ON(is_mmio_spte(spte)); return spte_ad_enabled(spte) ? shadow_dirty_mask : 0; } @@ -374,11 +386,6 @@ static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn, mmu_spte_set(sptep, mask); } -static bool is_mmio_spte(u64 spte) -{ - return (spte & shadow_mmio_mask) == shadow_mmio_value; -} - static gfn_t get_mmio_spte_gfn(u64 spte) { u64 gpa = spte & shadow_nonpresent_or_rsvd_lower_gfn_mask; @@ -443,6 +450,21 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, } EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes); +static u8 kvm_get_shadow_phys_bits(void) +{ + /* + * boot_cpu_data.x86_phys_bits is reduced when MKTME is detected + * in CPU detection code, but MKTME treats those reduced bits as + * 'keyID' thus they are not reserved bits. Therefore for MKTME + * we should still return physical address bits reported by CPUID. + */ + if (!boot_cpu_has(X86_FEATURE_TME) || + WARN_ON_ONCE(boot_cpu_data.extended_cpuid_level < 0x80000008)) + return boot_cpu_data.x86_phys_bits; + + return cpuid_eax(0x80000008) & 0xff; +} + static void kvm_mmu_reset_all_pte_masks(void) { u8 low_phys_bits; @@ -456,20 +478,29 @@ static void kvm_mmu_reset_all_pte_masks(void) shadow_present_mask = 0; shadow_acc_track_mask = 0; + shadow_phys_bits = kvm_get_shadow_phys_bits(); + /* * If the CPU has 46 or less physical address bits, then set an * appropriate mask to guard against L1TF attacks. Otherwise, it is * assumed that the CPU is not vulnerable to L1TF. + * + * Some Intel CPUs address the L1 cache using more PA bits than are + * reported by CPUID. Use the PA width of the L1 cache when possible + * to achieve more effective mitigation, e.g. if system RAM overlaps + * the most significant bits of legal physical address space. */ + shadow_nonpresent_or_rsvd_mask = 0; low_phys_bits = boot_cpu_data.x86_phys_bits; - if (boot_cpu_data.x86_phys_bits < - 52 - shadow_nonpresent_or_rsvd_mask_len) { + if (boot_cpu_has_bug(X86_BUG_L1TF) && + !WARN_ON_ONCE(boot_cpu_data.x86_cache_bits >= + 52 - shadow_nonpresent_or_rsvd_mask_len)) { + low_phys_bits = boot_cpu_data.x86_cache_bits + - shadow_nonpresent_or_rsvd_mask_len; shadow_nonpresent_or_rsvd_mask = - rsvd_bits(boot_cpu_data.x86_phys_bits - - shadow_nonpresent_or_rsvd_mask_len, - boot_cpu_data.x86_phys_bits - 1); - low_phys_bits -= shadow_nonpresent_or_rsvd_mask_len; + rsvd_bits(low_phys_bits, boot_cpu_data.x86_cache_bits - 1); } + shadow_nonpresent_or_rsvd_lower_gfn_mask = GENMASK_ULL(low_phys_bits - 1, PAGE_SHIFT); } @@ -1682,10 +1713,10 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, * Emulate arch specific page modification logging for the * nested hypervisor */ -int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu) +int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu, gpa_t l2_gpa) { if (kvm_x86_ops->write_log_dirty) - return kvm_x86_ops->write_log_dirty(vcpu); + return kvm_x86_ops->write_log_dirty(vcpu, l2_gpa); return 0; } @@ -4213,7 +4244,7 @@ __reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, nonleaf_bit8_rsvd | rsvd_bits(7, 7) | rsvd_bits(maxphyaddr, 51); rsvd_check->rsvd_bits_mask[0][2] = exb_bit_rsvd | - nonleaf_bit8_rsvd | gbpages_bit_rsvd | + gbpages_bit_rsvd | rsvd_bits(maxphyaddr, 51); rsvd_check->rsvd_bits_mask[0][1] = exb_bit_rsvd | rsvd_bits(maxphyaddr, 51); @@ -4305,7 +4336,7 @@ reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context) */ shadow_zero_check = &context->shadow_zero_check; __reset_rsvds_bits_mask(vcpu, shadow_zero_check, - boot_cpu_data.x86_phys_bits, + shadow_phys_bits, context->shadow_root_level, uses_nx, guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES), is_pse(vcpu), true); @@ -4342,13 +4373,13 @@ reset_tdp_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, if (boot_cpu_is_amd()) __reset_rsvds_bits_mask(vcpu, shadow_zero_check, - boot_cpu_data.x86_phys_bits, + shadow_phys_bits, context->shadow_root_level, false, boot_cpu_has(X86_FEATURE_GBPAGES), true, true); else __reset_rsvds_bits_mask_ept(shadow_zero_check, - boot_cpu_data.x86_phys_bits, + shadow_phys_bits, false); if (!shadow_me_mask) @@ -4369,7 +4400,7 @@ reset_ept_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context, bool execonly) { __reset_rsvds_bits_mask_ept(&context->shadow_zero_check, - boot_cpu_data.x86_phys_bits, execonly); + shadow_phys_bits, execonly); } #define BYTE_MASK(access) \ @@ -5666,6 +5697,25 @@ static int set_nx_huge_pages(const char *val, const struct kernel_param *kp) return 0; } +static void kvm_set_mmio_spte_mask(void) +{ + u64 mask; + + /* + * Set a reserved PA bit in MMIO SPTEs to generate page faults with + * PFEC.RSVD=1 on MMIO accesses. 64-bit PTEs (PAE, x86-64, and EPT + * paging) support a maximum of 52 bits of PA, i.e. if the CPU supports + * 52-bit physical addresses then there are no reserved PA bits in the + * PTEs and so the reserved PA approach must be disabled. + */ + if (shadow_phys_bits < 52) + mask = BIT_ULL(51) | PT_PRESENT_MASK; + else + mask = 0; + + kvm_mmu_set_mmio_spte_mask(mask, mask); +} + int kvm_mmu_module_init(void) { if (nx_huge_pages == -1) @@ -5673,6 +5723,8 @@ int kvm_mmu_module_init(void) kvm_mmu_reset_all_pte_masks(); + kvm_set_mmio_spte_mask(); + pte_list_desc_cache = kmem_cache_create("pte_list_desc", sizeof(struct pte_list_desc), 0, SLAB_ACCOUNT, NULL); diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 068feab64acf..816a626b6250 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -194,7 +194,7 @@ void kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn); void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn); bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, struct kvm_memory_slot *slot, u64 gfn); -int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu); +int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu, gpa_t l2_gpa); int kvm_mmu_post_init_vm(struct kvm *kvm); void kvm_mmu_pre_destroy_vm(struct kvm *kvm); diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h index 918b0d5bf272..1c1c2649829b 100644 --- a/arch/x86/kvm/mmutrace.h +++ b/arch/x86/kvm/mmutrace.h @@ -339,7 +339,7 @@ TRACE_EVENT( /* These depend on page entry type, so compute them now. */ __field(bool, r) __field(bool, x) - __field(u8, u) + __field(signed char, u) ), TP_fast_assign( diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 8cf7a09bdd73..7260a165488d 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -202,7 +202,7 @@ static inline unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, u64 gpte) static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, struct guest_walker *walker, - int write_fault) + gpa_t addr, int write_fault) { unsigned level, index; pt_element_t pte, orig_pte; @@ -227,7 +227,7 @@ static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu, !(pte & PT_GUEST_DIRTY_MASK)) { trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte)); #if PTTYPE == PTTYPE_EPT - if (kvm_arch_write_log_dirty(vcpu)) + if (kvm_arch_write_log_dirty(vcpu, addr)) return -EINVAL; #endif pte |= PT_GUEST_DIRTY_MASK; @@ -424,7 +424,8 @@ retry_walk: (PT_GUEST_DIRTY_SHIFT - PT_GUEST_ACCESSED_SHIFT); if (unlikely(!accessed_dirty)) { - ret = FNAME(update_accessed_dirty_bits)(vcpu, mmu, walker, write_fault); + ret = FNAME(update_accessed_dirty_bits)(vcpu, mmu, walker, + addr, write_fault); if (unlikely(ret < 0)) goto error; else if (ret) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index d63621386418..78826d123fb8 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -2757,8 +2757,8 @@ static int nested_svm_exit_special(struct vcpu_svm *svm) return NESTED_EXIT_HOST; break; case SVM_EXIT_EXCP_BASE + PF_VECTOR: - /* When we're shadowing, trap PFs, but not async PF */ - if (!npt_enabled && svm->vcpu.arch.apf.host_apf_reason == 0) + /* Trap async PF even if not shadowing */ + if (!npt_enabled || svm->vcpu.arch.apf.host_apf_reason) return NESTED_EXIT_HOST; break; default: @@ -2847,7 +2847,7 @@ static inline void copy_vmcb_control_area(struct vmcb *dst_vmcb, struct vmcb *fr dst->iopm_base_pa = from->iopm_base_pa; dst->msrpm_base_pa = from->msrpm_base_pa; dst->tsc_offset = from->tsc_offset; - dst->asid = from->asid; + /* asid not copied, it is handled manually for svm->vmcb. */ dst->tlb_ctl = from->tlb_ctl; dst->int_ctl = from->int_ctl; dst->int_vector = from->int_vector; diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index c139dedec12b..396d41f192ca 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -5592,6 +5592,8 @@ static void vmx_set_constant_host_state(struct vcpu_vmx *vmx) static void set_cr4_guest_host_mask(struct vcpu_vmx *vmx) { + BUILD_BUG_ON(KVM_CR4_GUEST_OWNED_BITS & ~KVM_POSSIBLE_CR4_GUEST_BITS); + vmx->vcpu.arch.cr4_guest_owned_bits = KVM_CR4_GUEST_OWNED_BITS; if (enable_ept) vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE; @@ -8711,7 +8713,7 @@ static bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason) vmcs_read32(VM_EXIT_INTR_ERROR_CODE), KVM_ISA_VMX); - switch (exit_reason) { + switch ((u16)exit_reason) { case EXIT_REASON_EXCEPTION_NMI: if (is_nmi(intr_info)) return false; @@ -9142,6 +9144,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) (exit_reason != EXIT_REASON_EXCEPTION_NMI && exit_reason != EXIT_REASON_EPT_VIOLATION && exit_reason != EXIT_REASON_PML_FULL && + exit_reason != EXIT_REASON_APIC_ACCESS && exit_reason != EXIT_REASON_TASK_SWITCH)) { vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV; @@ -9280,15 +9283,16 @@ static void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu) if (!lapic_in_kernel(vcpu)) return; + if (!flexpriority_enabled && + !cpu_has_vmx_virtualize_x2apic_mode()) + return; + /* Postpone execution until vmcs01 is the current VMCS. */ if (is_guest_mode(vcpu)) { to_vmx(vcpu)->nested.change_vmcs01_virtual_apic_mode = true; return; } - if (!cpu_need_tpr_shadow(vcpu)) - return; - sec_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); sec_exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE); @@ -12461,11 +12465,10 @@ static void vmx_flush_log_dirty(struct kvm *kvm) kvm_flush_pml_buffers(kvm); } -static int vmx_write_pml_buffer(struct kvm_vcpu *vcpu) +static int vmx_write_pml_buffer(struct kvm_vcpu *vcpu, gpa_t gpa) { struct vmcs12 *vmcs12; struct vcpu_vmx *vmx = to_vmx(vcpu); - gpa_t gpa; struct page *page = NULL; u64 *pml_address; @@ -12486,7 +12489,7 @@ static int vmx_write_pml_buffer(struct kvm_vcpu *vcpu) return 1; } - gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS) & ~0xFFFull; + gpa &= ~0xFFFull; page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->pml_address); if (is_error_page(page)) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 1477e23827c2..79fa55de635c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -806,6 +806,8 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) if (is_long_mode(vcpu)) { if (!(cr4 & X86_CR4_PAE)) return 1; + if ((cr4 ^ old_cr4) & X86_CR4_LA57) + return 1; } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE) && ((cr4 ^ old_cr4) & pdptr_bits) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu, @@ -2344,7 +2346,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return kvm_mtrr_set_msr(vcpu, msr, data); case MSR_IA32_APICBASE: return kvm_set_apic_base(vcpu, msr_info); - case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff: + case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff: return kvm_x2apic_msr_write(vcpu, msr, data); case MSR_IA32_TSCDEADLINE: kvm_set_lapic_tscdeadline_msr(vcpu, data); @@ -2629,7 +2631,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_APICBASE: msr_info->data = kvm_get_apic_base(vcpu); break; - case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff: + case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff: return kvm_x2apic_msr_read(vcpu, msr_info->index, &msr_info->data); break; case MSR_IA32_TSCDEADLINE: @@ -3073,6 +3075,9 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu) if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED)) return; + if (vcpu->arch.st.steal.preempted) + return; + vcpu->arch.st.steal.preempted = 1; kvm_write_guest_offset_cached(vcpu->kvm, &vcpu->arch.st.stime, @@ -4365,10 +4370,13 @@ long kvm_arch_vm_ioctl(struct file *filp, r = -EFAULT; if (copy_from_user(&u.ps, argp, sizeof u.ps)) goto out; + mutex_lock(&kvm->lock); r = -ENXIO; if (!kvm->arch.vpit) - goto out; + goto set_pit_out; r = kvm_vm_ioctl_set_pit(kvm, &u.ps); +set_pit_out: + mutex_unlock(&kvm->lock); break; } case KVM_GET_PIT2: { @@ -4388,10 +4396,13 @@ long kvm_arch_vm_ioctl(struct file *filp, r = -EFAULT; if (copy_from_user(&u.ps2, argp, sizeof(u.ps2))) goto out; + mutex_lock(&kvm->lock); r = -ENXIO; if (!kvm->arch.vpit) - goto out; + goto set_pit2_out; r = kvm_vm_ioctl_set_pit2(kvm, &u.ps2); +set_pit2_out: + mutex_unlock(&kvm->lock); break; } case KVM_REINJECT_CONTROL: { @@ -6291,35 +6302,6 @@ void kvm_after_handle_nmi(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_after_handle_nmi); -static void kvm_set_mmio_spte_mask(void) -{ - u64 mask; - int maxphyaddr = boot_cpu_data.x86_phys_bits; - - /* - * Set the reserved bits and the present bit of an paging-structure - * entry to generate page fault with PFER.RSV = 1. - */ - - /* - * Mask the uppermost physical address bit, which would be reserved as - * long as the supported physical address width is less than 52. - */ - mask = 1ull << 51; - - /* Set the present bit. */ - mask |= 1ull; - - /* - * If reserved bit is not supported, clear the present bit to disable - * mmio page fault. - */ - if (maxphyaddr == 52) - mask &= ~1ull; - - kvm_mmu_set_mmio_spte_mask(mask, mask); -} - #ifdef CONFIG_X86_64 static void pvclock_gtod_update_fn(struct work_struct *work) { @@ -6397,8 +6379,6 @@ int kvm_arch_init(void *opaque) if (r) goto out_free_percpu; - kvm_set_mmio_spte_mask(); - kvm_x86_ops = ops; kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, |