Diffstat (limited to 'arch/x86/kvm')
 -rw-r--r--  arch/x86/kvm/Makefile                                                 |   2
 -rw-r--r--  arch/x86/kvm/cpuid.c                                                  |   6
 -rw-r--r--  arch/x86/kvm/emulate.c                                                | 133
 -rw-r--r--  arch/x86/kvm/hyperv.c                                                 |  20
 -rw-r--r--  arch/x86/kvm/lapic.c                                                  |  17
 -rw-r--r--  arch/x86/kvm/pmu.c                                                    |   2
 -rw-r--r--  arch/x86/kvm/pmu_amd.c                                                |   8
 -rw-r--r--  arch/x86/kvm/svm.c                                                    |  13
 -rw-r--r--  arch/x86/kvm/vmx/pmu_intel.c (renamed from arch/x86/kvm/pmu_intel.c)  |   0
 -rw-r--r--  arch/x86/kvm/vmx/vmx.c (renamed from arch/x86/kvm/vmx.c)              | 209
 -rw-r--r--  arch/x86/kvm/vmx/vmx_evmcs.h (renamed from arch/x86/kvm/vmx_evmcs.h)  |   0
 -rw-r--r--  arch/x86/kvm/vmx/vmx_shadow_fields.h (renamed from arch/x86/kvm/vmx_shadow_fields.h) |   0
 -rw-r--r--  arch/x86/kvm/x86.c                                                    |  18
 13 files changed, 327 insertions(+), 101 deletions(-)
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index dc4f2fdf5e57..13fd54de5449 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -16,7 +16,7 @@ kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \
i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o \
hyperv.o page_track.o debugfs.o
-kvm-intel-y += vmx.o pmu_intel.o
+kvm-intel-y += vmx/vmx.o vmx/pmu_intel.o
kvm-amd-y += svm.o pmu_amd.o
obj-$(CONFIG_KVM) += kvm.o
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 097eef712cdc..768a765c49b0 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -532,6 +532,11 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
union cpuid10_eax eax;
union cpuid10_edx edx;
+ if (!static_cpu_has(X86_FEATURE_ARCH_PERFMON)) {
+ entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
+ break;
+ }
+
perf_get_x86_pmu_capability(&cap);
/*
@@ -675,6 +680,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
g_phys_as = phys_as;
entry->eax = g_phys_as | (virt_as << 8);
+ entry->ecx &= ~(GENMASK(31, 16) | GENMASK(11, 8));
entry->edx = 0;
/*
* IBRS, IBPB and VIRT_SSBD aren't necessarily present in
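[Editor's note] The `entry->ecx &= ~(GENMASK(31, 16) | GENMASK(11, 8))` line in the cpuid.c hunk above clears every CPUID.80000008H:ECX bit outside 15:12 and 7:0. The following is a minimal userspace sketch of that bit arithmetic only; the 32-bit GENMASK macro is a simplified stand-in for the kernel's, and the sample value is arbitrary, not KVM code.

/*
 * Illustrative sketch (not kernel code): how the GENMASK-based clearing of
 * reserved CPUID.80000008H:ECX bits in the hunk above works.  This simplified
 * GENMASK only handles 32-bit values.
 */
#include <stdint.h>
#include <stdio.h>

#define GENMASK(h, l)	(((~0u) << (l)) & (~0u >> (31 - (h))))

int main(void)
{
	uint32_t ecx = 0xdeadbeef;	/* arbitrary raw leaf value */
	uint32_t reserved = GENMASK(31, 16) | GENMASK(11, 8);

	printf("mask of cleared bits: 0x%08x\n", reserved);	/* 0xffff0f00 */
	ecx &= ~reserved;		/* keep only bits 15:12 and 7:0 */
	printf("sanitized ecx:        0x%08x\n", ecx);
	return 0;
}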
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 3e182c7ae771..0cb75d29e67b 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -759,8 +759,7 @@ static int linearize(struct x86_emulate_ctxt *ctxt,
ctxt->mode, linear);
}
-static inline int assign_eip(struct x86_emulate_ctxt *ctxt, ulong dst,
- enum x86emul_mode mode)
+static inline int assign_eip(struct x86_emulate_ctxt *ctxt, ulong dst)
{
ulong linear;
int rc;
@@ -770,41 +769,71 @@ static inline int assign_eip(struct x86_emulate_ctxt *ctxt, ulong dst,
if (ctxt->op_bytes != sizeof(unsigned long))
addr.ea = dst & ((1UL << (ctxt->op_bytes << 3)) - 1);
- rc = __linearize(ctxt, addr, &max_size, 1, false, true, mode, &linear);
+ rc = __linearize(ctxt, addr, &max_size, 1, false, true, ctxt->mode, &linear);
if (rc == X86EMUL_CONTINUE)
ctxt->_eip = addr.ea;
return rc;
}
+static inline int emulator_recalc_and_set_mode(struct x86_emulate_ctxt *ctxt)
+{
+ u64 efer;
+ struct desc_struct cs;
+ u16 selector;
+ u32 base3;
+
+ ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
+
+ if (!(ctxt->ops->get_cr(ctxt, 0) & X86_CR0_PE)) {
+ /* Real mode. cpu must not have long mode active */
+ if (efer & EFER_LMA)
+ return X86EMUL_UNHANDLEABLE;
+ ctxt->mode = X86EMUL_MODE_REAL;
+ return X86EMUL_CONTINUE;
+ }
+
+ if (ctxt->eflags & X86_EFLAGS_VM) {
+ /* Protected/VM86 mode. cpu must not have long mode active */
+ if (efer & EFER_LMA)
+ return X86EMUL_UNHANDLEABLE;
+ ctxt->mode = X86EMUL_MODE_VM86;
+ return X86EMUL_CONTINUE;
+ }
+
+ if (!ctxt->ops->get_segment(ctxt, &selector, &cs, &base3, VCPU_SREG_CS))
+ return X86EMUL_UNHANDLEABLE;
+
+ if (efer & EFER_LMA) {
+ if (cs.l) {
+ /* Proper long mode */
+ ctxt->mode = X86EMUL_MODE_PROT64;
+ } else if (cs.d) {
+ /* 32 bit compatibility mode */
+ ctxt->mode = X86EMUL_MODE_PROT32;
+ } else {
+ ctxt->mode = X86EMUL_MODE_PROT16;
+ }
+ } else {
+ /* Legacy 32 bit / 16 bit mode */
+ ctxt->mode = cs.d ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
+ }
+
+ return X86EMUL_CONTINUE;
+}
+
static inline int assign_eip_near(struct x86_emulate_ctxt *ctxt, ulong dst)
{
- return assign_eip(ctxt, dst, ctxt->mode);
+ return assign_eip(ctxt, dst);
}
-static int assign_eip_far(struct x86_emulate_ctxt *ctxt, ulong dst,
- const struct desc_struct *cs_desc)
+static int assign_eip_far(struct x86_emulate_ctxt *ctxt, ulong dst)
{
- enum x86emul_mode mode = ctxt->mode;
- int rc;
+ int rc = emulator_recalc_and_set_mode(ctxt);
-#ifdef CONFIG_X86_64
- if (ctxt->mode >= X86EMUL_MODE_PROT16) {
- if (cs_desc->l) {
- u64 efer = 0;
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
- ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
- if (efer & EFER_LMA)
- mode = X86EMUL_MODE_PROT64;
- } else
- mode = X86EMUL_MODE_PROT32; /* temporary value */
- }
-#endif
- if (mode == X86EMUL_MODE_PROT16 || mode == X86EMUL_MODE_PROT32)
- mode = cs_desc->d ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
- rc = assign_eip(ctxt, dst, mode);
- if (rc == X86EMUL_CONTINUE)
- ctxt->mode = mode;
- return rc;
+ return assign_eip(ctxt, dst);
}
static inline int jmp_rel(struct x86_emulate_ctxt *ctxt, int rel)
@@ -1669,11 +1698,6 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
goto exception;
}
- if (!seg_desc.p) {
- err_vec = (seg == VCPU_SREG_SS) ? SS_VECTOR : NP_VECTOR;
- goto exception;
- }
-
dpl = seg_desc.dpl;
switch (seg) {
@@ -1713,12 +1737,6 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
case VCPU_SREG_TR:
if (seg_desc.s || (seg_desc.type != 1 && seg_desc.type != 9))
goto exception;
- old_desc = seg_desc;
- seg_desc.type |= 2; /* busy */
- ret = ctxt->ops->cmpxchg_emulated(ctxt, desc_addr, &old_desc, &seg_desc,
- sizeof(seg_desc), &ctxt->exception);
- if (ret != X86EMUL_CONTINUE)
- return ret;
break;
case VCPU_SREG_LDTR:
if (seg_desc.s || seg_desc.type != 2)
@@ -1737,6 +1755,11 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
break;
}
+ if (!seg_desc.p) {
+ err_vec = (seg == VCPU_SREG_SS) ? SS_VECTOR : NP_VECTOR;
+ goto exception;
+ }
+
if (seg_desc.s) {
/* mark segment as accessed */
if (!(seg_desc.type & 1)) {
@@ -1751,8 +1774,17 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
if (ret != X86EMUL_CONTINUE)
return ret;
if (emul_is_noncanonical_address(get_desc_base(&seg_desc) |
- ((u64)base3 << 32), ctxt))
- return emulate_gp(ctxt, 0);
+ ((u64)base3 << 32), ctxt))
+ return emulate_gp(ctxt, err_code);
+ }
+
+ if (seg == VCPU_SREG_TR) {
+ old_desc = seg_desc;
+ seg_desc.type |= 2; /* busy */
+ ret = ctxt->ops->cmpxchg_emulated(ctxt, desc_addr, &old_desc, &seg_desc,
+ sizeof(seg_desc), &ctxt->exception);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
}
load:
ctxt->ops->set_segment(ctxt, selector, &seg_desc, base3, seg);
@@ -1972,7 +2004,7 @@ static int em_pop_sreg(struct x86_emulate_ctxt *ctxt)
if (rc != X86EMUL_CONTINUE)
return rc;
- if (ctxt->modrm_reg == VCPU_SREG_SS)
+ if (seg == VCPU_SREG_SS)
ctxt->interruptibility = KVM_X86_SHADOW_INT_MOV_SS;
if (ctxt->op_bytes > 2)
rsp_increment(ctxt, ctxt->op_bytes - 2);
@@ -2189,7 +2221,7 @@ static int em_jmp_far(struct x86_emulate_ctxt *ctxt)
if (rc != X86EMUL_CONTINUE)
return rc;
- rc = assign_eip_far(ctxt, ctxt->src.val, &new_desc);
+ rc = assign_eip_far(ctxt, ctxt->src.val);
/* Error handling is not implemented. */
if (rc != X86EMUL_CONTINUE)
return X86EMUL_UNHANDLEABLE;
@@ -2270,7 +2302,7 @@ static int em_ret_far(struct x86_emulate_ctxt *ctxt)
&new_desc);
if (rc != X86EMUL_CONTINUE)
return rc;
- rc = assign_eip_far(ctxt, eip, &new_desc);
+ rc = assign_eip_far(ctxt, eip);
/* Error handling is not implemented. */
if (rc != X86EMUL_CONTINUE)
return X86EMUL_UNHANDLEABLE;
@@ -2892,6 +2924,7 @@ static int em_sysexit(struct x86_emulate_ctxt *ctxt)
ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS);
ctxt->_eip = rdx;
+ ctxt->mode = usermode;
*reg_write(ctxt, VCPU_REGS_RSP) = rcx;
return X86EMUL_CONTINUE;
@@ -3488,7 +3521,7 @@ static int em_call_far(struct x86_emulate_ctxt *ctxt)
if (rc != X86EMUL_CONTINUE)
return rc;
- rc = assign_eip_far(ctxt, ctxt->src.val, &new_desc);
+ rc = assign_eip_far(ctxt, ctxt->src.val);
if (rc != X86EMUL_CONTINUE)
goto fail;
@@ -3635,11 +3668,25 @@ static int em_movbe(struct x86_emulate_ctxt *ctxt)
static int em_cr_write(struct x86_emulate_ctxt *ctxt)
{
- if (ctxt->ops->set_cr(ctxt, ctxt->modrm_reg, ctxt->src.val))
+ int cr_num = ctxt->modrm_reg;
+ int r;
+
+ if (ctxt->ops->set_cr(ctxt, cr_num, ctxt->src.val))
return emulate_gp(ctxt, 0);
/* Disable writeback. */
ctxt->dst.type = OP_NONE;
+
+ if (cr_num == 0) {
+ /*
+ * CR0 write might have updated CR0.PE and/or CR0.PG
+ * which can affect the cpu's execution mode.
+ */
+ r = emulator_recalc_and_set_mode(ctxt);
+ if (r != X86EMUL_CONTINUE)
+ return r;
+ }
+
return X86EMUL_CONTINUE;
}
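[Editor's note] The new emulator_recalc_and_set_mode() above rederives the emulator's execution mode from CR0.PE, EFLAGS.VM, EFER.LMA and the CS descriptor's L/D bits instead of trusting a possibly stale cached mode, and em_cr_write()/assign_eip_far() now invoke it after mode-changing events. Below is a standalone sketch of that decision tree with simplified boolean inputs rather than KVM's context structures; it is illustrative only.

/*
 * Illustrative sketch of the mode selection done by
 * emulator_recalc_and_set_mode() above; not KVM code.
 */
#include <stdbool.h>
#include <stdio.h>

static const char *recalc_mode(bool cr0_pe, bool eflags_vm, bool efer_lma,
			       bool cs_l, bool cs_d)
{
	if (!cr0_pe)			/* real mode: long mode must be inactive */
		return efer_lma ? "unhandleable" : "real";
	if (eflags_vm)			/* virtual-8086: long mode must be inactive */
		return efer_lma ? "unhandleable" : "vm86";
	if (efer_lma)			/* long mode active: CS.L/CS.D pick the sub-mode */
		return cs_l ? "prot64" : (cs_d ? "prot32" : "prot16");
	return cs_d ? "prot32" : "prot16";	/* legacy protected mode */
}

int main(void)
{
	printf("%s\n", recalc_mode(true, false, true, true, false));	/* prot64 */
	printf("%s\n", recalc_mode(true, false, false, false, true));	/* prot32 */
	printf("%s\n", recalc_mode(false, false, false, false, false));/* real   */
	return 0;
}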
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index 3fd6c4b2c2b7..2047edb5ff74 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -237,7 +237,7 @@ static int synic_set_msr(struct kvm_vcpu_hv_synic *synic,
struct kvm_vcpu *vcpu = synic_to_vcpu(synic);
int ret;
- if (!synic->active && !host)
+ if (!synic->active && (!host || data))
return 1;
trace_kvm_hv_synic_set_msr(vcpu->vcpu_id, msr, data, host);
@@ -283,6 +283,9 @@ static int synic_set_msr(struct kvm_vcpu_hv_synic *synic,
case HV_X64_MSR_EOM: {
int i;
+ if (!synic->active)
+ break;
+
for (i = 0; i < ARRAY_SIZE(synic->sint); i++)
kvm_hv_notify_acked_sint(vcpu, i);
break;
@@ -338,6 +341,9 @@ static int synic_set_irq(struct kvm_vcpu_hv_synic *synic, u32 sint)
struct kvm_lapic_irq irq;
int ret, vector;
+ if (KVM_BUG_ON(!lapic_in_kernel(vcpu), vcpu->kvm))
+ return -EINVAL;
+
if (sint >= ARRAY_SIZE(synic->sint))
return -EINVAL;
@@ -544,6 +550,12 @@ static int stimer_start(struct kvm_vcpu_hv_stimer *stimer)
static int stimer_set_config(struct kvm_vcpu_hv_stimer *stimer, u64 config,
bool host)
{
+ struct kvm_vcpu *vcpu = stimer_to_vcpu(stimer);
+ struct kvm_vcpu_hv_synic *synic = vcpu_to_synic(vcpu);
+
+ if (!synic->active && (!host || config))
+ return 1;
+
trace_kvm_hv_stimer_set_config(stimer_to_vcpu(stimer)->vcpu_id,
stimer->index, config, host);
@@ -558,6 +570,12 @@ static int stimer_set_config(struct kvm_vcpu_hv_stimer *stimer, u64 config,
static int stimer_set_count(struct kvm_vcpu_hv_stimer *stimer, u64 count,
bool host)
{
+ struct kvm_vcpu *vcpu = stimer_to_vcpu(stimer);
+ struct kvm_vcpu_hv_synic *synic = vcpu_to_synic(vcpu);
+
+ if (!synic->active && (!host || count))
+ return 1;
+
trace_kvm_hv_stimer_set_count(stimer_to_vcpu(stimer)->vcpu_id,
stimer->index, count, host);
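[Editor's note] The hyperv.c hunks above tighten when SynIC and stimer MSR writes are accepted: guest writes are rejected whenever the SynIC is inactive, while host (userspace) writes are still allowed if they only write zero. The sketch below is just a truth-table illustration of that "!active && (!host || value)" gate; the function name and harness are hypothetical.

/*
 * Illustrative sketch of the SynIC/stimer MSR write gate used above:
 * returns true when the write is refused.  Not KVM code.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool write_refused(bool active, bool host, uint64_t value)
{
	return !active && (!host || value != 0);
}

int main(void)
{
	printf("inactive, guest, any value : %d\n", write_refused(false, false, 1)); /* refused */
	printf("inactive, host,  zero      : %d\n", write_refused(false, true, 0));  /* allowed */
	printf("inactive, host,  nonzero   : %d\n", write_refused(false, true, 1));  /* refused */
	printf("active,   guest, any value : %d\n", write_refused(true, false, 1));  /* allowed */
	return 0;
}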
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 56a4b9762b0c..256b00f456e6 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -961,6 +961,10 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
*r = -1;
if (irq->shorthand == APIC_DEST_SELF) {
+ if (KVM_BUG_ON(!src, kvm)) {
+ *r = 0;
+ return true;
+ }
*r = kvm_apic_set_irq(src->vcpu, irq, dest_map);
return true;
}
@@ -2045,10 +2049,7 @@ void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data)
void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8)
{
- struct kvm_lapic *apic = vcpu->arch.apic;
-
- apic_set_tpr(apic, ((cr8 & 0x0f) << 4)
- | (kvm_lapic_get_reg(apic, APIC_TASKPRI) & 4));
+ apic_set_tpr(vcpu->arch.apic, (cr8 & 0x0f) << 4);
}
u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu)
@@ -2200,13 +2201,17 @@ int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type)
{
u32 reg = kvm_lapic_get_reg(apic, lvt_type);
int vector, mode, trig_mode;
+ int r;
if (kvm_apic_hw_enabled(apic) && !(reg & APIC_LVT_MASKED)) {
vector = reg & APIC_VECTOR_MASK;
mode = reg & APIC_MODE_MASK;
trig_mode = reg & APIC_LVT_LEVEL_TRIGGER;
- return __apic_accept_irq(apic, mode, vector, 1, trig_mode,
- NULL);
+
+ r = __apic_accept_irq(apic, mode, vector, 1, trig_mode, NULL);
+ if (r && lvt_type == APIC_LVTPC)
+ kvm_lapic_set_reg(apic, APIC_LVTPC, reg | APIC_LVT_MASKED);
+ return r;
}
return 0;
}
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index acc8d217f656..ad3d39c00d7f 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -171,7 +171,7 @@ void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
}
if (type == PERF_TYPE_RAW)
- config = eventsel & X86_RAW_EVENT_MASK;
+ config = eventsel & AMD64_RAW_EVENT_MASK;
pmc_reprogram_counter(pmc, type, config,
!(eventsel & ARCH_PERFMON_EVENTSEL_USR),
diff --git a/arch/x86/kvm/pmu_amd.c b/arch/x86/kvm/pmu_amd.c
index 41dff881e0f0..93a135f216b2 100644
--- a/arch/x86/kvm/pmu_amd.c
+++ b/arch/x86/kvm/pmu_amd.c
@@ -247,12 +247,10 @@ static int amd_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
/* MSR_EVNTSELn */
pmc = get_gp_pmc_amd(pmu, msr, PMU_TYPE_EVNTSEL);
if (pmc) {
- if (data == pmc->eventsel)
- return 0;
- if (!(data & pmu->reserved_bits)) {
+ data &= ~pmu->reserved_bits;
+ if (data != pmc->eventsel)
reprogram_gp_counter(pmc, data);
- return 0;
- }
+ return 0;
}
return 1;
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 85181457413e..cd3432df0d24 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -50,6 +50,7 @@
#include <asm/kvm_para.h>
#include <asm/irq_remapping.h>
#include <asm/spec-ctrl.h>
+#include <asm/cpu_device_id.h>
#include <asm/virtext.h>
#include "trace.h"
@@ -4154,9 +4155,9 @@ static int svm_get_msr_feature(struct kvm_msr_entry *msr)
msr->data = 0;
switch (msr->index) {
- case MSR_F10H_DECFG:
- if (boot_cpu_has(X86_FEATURE_LFENCE_RDTSC))
- msr->data |= MSR_F10H_DECFG_LFENCE_SERIALIZE;
+ case MSR_AMD64_DE_CFG:
+ if (cpu_feature_enabled(X86_FEATURE_LFENCE_RDTSC))
+ msr->data |= MSR_AMD64_DE_CFG_LFENCE_SERIALIZE;
break;
default:
return 1;
@@ -4258,7 +4259,7 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
msr_info->data = 0x1E;
}
break;
- case MSR_F10H_DECFG:
+ case MSR_AMD64_DE_CFG:
msr_info->data = svm->msr_decfg;
break;
default:
@@ -4445,7 +4446,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
case MSR_VM_IGNNE:
vcpu_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data);
break;
- case MSR_F10H_DECFG: {
+ case MSR_AMD64_DE_CFG: {
struct kvm_msr_entry msr_entry;
msr_entry.index = msr->index;
@@ -5142,8 +5143,6 @@ static void svm_set_irq(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
- BUG_ON(!(gif_set(svm)));
-
trace_kvm_inj_virq(vcpu->arch.interrupt.nr);
++vcpu->stat.irq_injections;
diff --git a/arch/x86/kvm/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
index 611f9e60f815..611f9e60f815 100644
--- a/arch/x86/kvm/pmu_intel.c
+++ b/arch/x86/kvm/vmx/pmu_intel.c
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 77b9ed5223f3..55e52064c4ec 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -41,6 +41,7 @@
#include <asm/asm.h>
#include <asm/cpu.h>
+#include <asm/cpu_device_id.h>
#include <asm/io.h>
#include <asm/desc.h>
#include <asm/vmx.h>
@@ -212,6 +213,9 @@ static const struct {
#define L1D_CACHE_ORDER 4
static void *vmx_l1d_flush_pages;
+/* Control for disabling CPU Fill buffer clear */
+static bool __read_mostly vmx_fb_clear_ctrl_available;
+
static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf)
{
struct page *page;
@@ -1045,6 +1049,8 @@ struct vcpu_vmx {
u64 msr_ia32_feature_control;
u64 msr_ia32_feature_control_valid_bits;
u64 ept_pointer;
+ u64 msr_ia32_mcu_opt_ctrl;
+ bool disable_fb_clear;
};
enum segment_cache_field {
@@ -2056,6 +2062,12 @@ static inline bool nested_cpu_has_shadow_vmcs(struct vmcs12 *vmcs12)
return nested_cpu_has2(vmcs12, SECONDARY_EXEC_SHADOW_VMCS);
}
+static inline bool nested_cpu_has_save_preemption_timer(struct vmcs12 *vmcs12)
+{
+ return vmcs12->vm_exit_controls &
+ VM_EXIT_SAVE_VMX_PREEMPTION_TIMER;
+}
+
static inline bool is_nmi(u32 intr_info)
{
return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
@@ -2107,6 +2119,60 @@ static inline void __invept(unsigned long ext, u64 eptp, gpa_t gpa)
BUG_ON(error);
}
+static void vmx_setup_fb_clear_ctrl(void)
+{
+ u64 msr;
+
+ if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES) &&
+ !boot_cpu_has_bug(X86_BUG_MDS) &&
+ !boot_cpu_has_bug(X86_BUG_TAA)) {
+ rdmsrl(MSR_IA32_ARCH_CAPABILITIES, msr);
+ if (msr & ARCH_CAP_FB_CLEAR_CTRL)
+ vmx_fb_clear_ctrl_available = true;
+ }
+}
+
+static __always_inline void vmx_disable_fb_clear(struct vcpu_vmx *vmx)
+{
+ u64 msr;
+
+ if (!vmx->disable_fb_clear)
+ return;
+
+ msr = __rdmsr(MSR_IA32_MCU_OPT_CTRL);
+ msr |= FB_CLEAR_DIS;
+ native_wrmsrl(MSR_IA32_MCU_OPT_CTRL, msr);
+ /* Cache the MSR value to avoid reading it later */
+ vmx->msr_ia32_mcu_opt_ctrl = msr;
+}
+
+static __always_inline void vmx_enable_fb_clear(struct vcpu_vmx *vmx)
+{
+ if (!vmx->disable_fb_clear)
+ return;
+
+ vmx->msr_ia32_mcu_opt_ctrl &= ~FB_CLEAR_DIS;
+ native_wrmsrl(MSR_IA32_MCU_OPT_CTRL, vmx->msr_ia32_mcu_opt_ctrl);
+}
+
+static void vmx_update_fb_clear_dis(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx)
+{
+ vmx->disable_fb_clear = vmx_fb_clear_ctrl_available;
+
+ /*
+ * If guest will not execute VERW, there is no need to set FB_CLEAR_DIS
+ * at VMEntry. Skip the MSR read/write when a guest has no use case to
+ * execute VERW.
+ */
+ if ((vcpu->arch.arch_capabilities & ARCH_CAP_FB_CLEAR) ||
+ ((vcpu->arch.arch_capabilities & ARCH_CAP_MDS_NO) &&
+ (vcpu->arch.arch_capabilities & ARCH_CAP_TAA_NO) &&
+ (vcpu->arch.arch_capabilities & ARCH_CAP_PSDP_NO) &&
+ (vcpu->arch.arch_capabilities & ARCH_CAP_FBSDP_NO) &&
+ (vcpu->arch.arch_capabilities & ARCH_CAP_SBDR_SSDP_NO)))
+ vmx->disable_fb_clear = false;
+}
+
static struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr)
{
int i;
@@ -4314,9 +4380,13 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
}
break;
}
- ret = kvm_set_msr_common(vcpu, msr_info);
+ ret = kvm_set_msr_common(vcpu, msr_info);
}
+ /* FB_CLEAR may have changed, also update the FB_CLEAR_DIS behavior */
+ if (msr_index == MSR_IA32_ARCH_CAPABILITIES)
+ vmx_update_fb_clear_dis(vcpu, vmx);
+
return ret;
}
@@ -4670,9 +4740,6 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
}
}
- if (boot_cpu_has(X86_FEATURE_XSAVES))
- rdmsrl(MSR_IA32_XSS, host_xss);
-
return 0;
}
@@ -5454,18 +5521,15 @@ static u32 vmx_segment_access_rights(struct kvm_segment *var)
{
u32 ar;
- if (var->unusable || !var->present)
- ar = 1 << 16;
- else {
- ar = var->type & 15;
- ar |= (var->s & 1) << 4;
- ar |= (var->dpl & 3) << 5;
- ar |= (var->present & 1) << 7;
- ar |= (var->avl & 1) << 12;
- ar |= (var->l & 1) << 13;
- ar |= (var->db & 1) << 14;
- ar |= (var->g & 1) << 15;
- }
+ ar = var->type & 15;
+ ar |= (var->s & 1) << 4;
+ ar |= (var->dpl & 3) << 5;
+ ar |= (var->present & 1) << 7;
+ ar |= (var->avl & 1) << 12;
+ ar |= (var->l & 1) << 13;
+ ar |= (var->db & 1) << 14;
+ ar |= (var->g & 1) << 15;
+ ar |= (var->unusable || !var->present) << 16;
return ar;
}
@@ -6761,6 +6825,8 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
vpid_sync_context(vmx->vpid);
if (init_event)
vmx_clear_hlt(vcpu);
+
+ vmx_update_fb_clear_dis(vcpu, vmx);
}
/*
@@ -7885,6 +7951,9 @@ static __init int hardware_setup(void)
WARN_ONCE(host_bndcfgs, "KVM: BNDCFGS in host will be lost");
}
+ if (boot_cpu_has(X86_FEATURE_XSAVES))
+ rdmsrl(MSR_IA32_XSS, host_xss);
+
if (!cpu_has_vmx_vpid() || !cpu_has_vmx_invvpid() ||
!(cpu_has_vmx_invvpid_single() || cpu_has_vmx_invvpid_global()))
enable_vpid = 0;
@@ -10694,10 +10763,35 @@ static void vmx_update_hv_timer(struct kvm_vcpu *vcpu)
vmx->loaded_vmcs->hv_timer_armed = false;
}
+u64 __always_inline vmx_spec_ctrl_restore_host(struct vcpu_vmx *vmx)
+{
+ u64 guestval, hostval = this_cpu_read(x86_spec_ctrl_current);
+
+ if (!cpu_feature_enabled(X86_FEATURE_MSR_SPEC_CTRL))
+ return 0;
+
+ guestval = __rdmsr(MSR_IA32_SPEC_CTRL);
+
+ /*
+ *
+ * For legacy IBRS, the IBRS bit always needs to be written after
+ * transitioning from a less privileged predictor mode, regardless of
+ * whether the guest/host values differ.
+ */
+ if (cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS) ||
+ guestval != hostval)
+ native_wrmsrl(MSR_IA32_SPEC_CTRL, hostval);
+
+ barrier_nospec();
+
+ return guestval;
+}
+
static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
unsigned long cr3, cr4, evmcs_rsp;
+ u64 spec_ctrl;
/* Record the guest's net vcpu time for enforced NMI injections. */
if (unlikely(!enable_vnmi &&
@@ -10773,6 +10867,11 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
vmx_l1d_flush(vcpu);
else if (static_branch_unlikely(&mds_user_clear))
mds_clear_cpu_buffers();
+ else if (static_branch_unlikely(&mmio_stale_data_clear) &&
+ kvm_arch_has_assigned_device(vcpu->kvm))
+ mds_clear_cpu_buffers();
+
+ vmx_disable_fb_clear(vmx);
asm volatile (
/* Store host registers */
@@ -10919,6 +11018,26 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
);
/*
+ * IMPORTANT: RSB filling and SPEC_CTRL handling must be done before
+ * the first unbalanced RET after vmexit!
+ *
+ * For retpoline or IBRS, RSB filling is needed to prevent poisoned RSB
+ * entries and (in some cases) RSB underflow.
+ *
+ * eIBRS has its own protection against poisoned RSB, so it doesn't
+ * need the RSB filling sequence. But it does need to be enabled, and a
+ * single call to retire, before the first unbalanced RET.
+ *
+ * So no RETs before vmx_spec_ctrl_restore_host() below.
+ */
+ vmexit_fill_RSB();
+
+ /* Save this for below */
+ spec_ctrl = vmx_spec_ctrl_restore_host(vmx);
+
+ vmx_enable_fb_clear(vmx);
+
+ /*
* We do not use IBRS in the kernel. If this vCPU has used the
* SPEC_CTRL MSR it may have left it on; save the value and
* turn it off. This is much more efficient than blindly adding
@@ -10934,12 +11053,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
* save it.
*/
if (unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL)))
- vmx->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL);
-
- x86_spec_ctrl_restore_host(vmx->spec_ctrl, 0);
-
- /* Eliminate branch target predictions from guest mode */
- vmexit_fill_RSB();
+ vmx->spec_ctrl = spec_ctrl;
/* All fields are clean at this point */
if (static_branch_unlikely(&enable_evmcs))
@@ -12498,6 +12612,10 @@ static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
if (nested_vmx_check_msr_switch_controls(vcpu, vmcs12))
return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
+ if (!nested_cpu_has_preemption_timer(vmcs12) &&
+ nested_cpu_has_save_preemption_timer(vmcs12))
+ return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
+
if (nested_vmx_check_pml_controls(vcpu, vmcs12))
return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
@@ -12634,7 +12752,7 @@ static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu,
static int check_vmentry_postreqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
u32 *exit_qual)
{
- bool ia32e;
+ bool ia32e = !!(vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE);
*exit_qual = ENTRY_FAIL_DEFAULT;
@@ -12647,6 +12765,13 @@ static int check_vmentry_postreqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
return 1;
}
+ if ((vmcs12->guest_cr0 & (X86_CR0_PG | X86_CR0_PE)) == X86_CR0_PG)
+ return 1;
+
+ if ((ia32e && !(vmcs12->guest_cr4 & X86_CR4_PAE)) ||
+ (ia32e && !(vmcs12->guest_cr0 & X86_CR0_PG)))
+ return 1;
+
/*
* If the load IA32_EFER VM-entry control is 1, the following checks
* are performed on the field for the IA32_EFER MSR:
@@ -12658,7 +12783,6 @@ static int check_vmentry_postreqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
*/
if (to_vmx(vcpu)->nested.nested_run_pending &&
(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) {
- ia32e = (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) != 0;
if (!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer) ||
ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA) ||
((vmcs12->guest_cr0 & X86_CR0_PG) &&
@@ -13204,14 +13328,6 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
*/
vmcs12_save_pending_event(vcpu, vmcs12);
}
-
- /*
- * Drop what we picked up for L2 via vmx_complete_interrupts. It is
- * preserved above and would only end up incorrectly in L1.
- */
- vcpu->arch.nmi_injected = false;
- kvm_clear_exception_queue(vcpu);
- kvm_clear_interrupt_queue(vcpu);
}
/*
@@ -13545,6 +13661,17 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
nested_vmx_abort(vcpu, VMX_ABORT_SAVE_GUEST_MSR_FAIL);
}
+ /*
+ * Drop events/exceptions that were queued for re-injection to L2
+ * (picked up via vmx_complete_interrupts()), as well as exceptions
+ * that were pending for L2. Note, this must NOT be hoisted above
+ * prepare_vmcs12(), events/exceptions queued for re-injection need to
+ * be captured in vmcs12 (see vmcs12_save_pending_event()).
+ */
+ vcpu->arch.nmi_injected = false;
+ kvm_clear_exception_queue(vcpu);
+ kvm_clear_interrupt_queue(vcpu);
+
vmx_switch_vmcs(vcpu, &vmx->vmcs01);
vm_entry_controls_reset_shadow(vmx);
vm_exit_controls_reset_shadow(vmx);
@@ -13751,6 +13878,21 @@ static int vmx_check_intercept(struct kvm_vcpu *vcpu,
/* FIXME: produce nested vmexit and return X86EMUL_INTERCEPTED. */
break;
+ case x86_intercept_pause:
+ /*
+ * PAUSE is a single-byte NOP with a REPE prefix, i.e. collides
+ * with vanilla NOPs in the emulator. Apply the interception
+ * check only to actual PAUSE instructions. Don't check
+ * PAUSE-loop-exiting, software can't expect a given PAUSE to
+ * exit, i.e. KVM is within its rights to allow L2 to execute
+ * the PAUSE.
+ */
+ if ((info->rep_prefix != REPE_PREFIX) ||
+ !nested_cpu_has2(vmcs12, CPU_BASED_PAUSE_EXITING))
+ return X86EMUL_CONTINUE;
+
+ break;
+
/* TODO: check more intercepts... */
default:
break;
@@ -14623,8 +14765,11 @@ static int __init vmx_init(void)
}
}
+ vmx_setup_fb_clear_ctrl();
+
for_each_possible_cpu(cpu) {
INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
+
INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu));
spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
}
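[Editor's note] Among the vmx.c hunks above, the reworked vmx_segment_access_rights() packs the segment attributes into the VMCS access-rights layout unconditionally and folds the unusable case into bit 16 instead of special-casing it. The sketch below reproduces only that bit packing with a simplified stand-in for struct kvm_segment; it is illustrative, not KVM code.

/*
 * Illustrative sketch of the VMCS access-rights packing done by
 * vmx_segment_access_rights() above.  "struct seg" is a simplified stand-in
 * for struct kvm_segment.
 */
#include <stdint.h>
#include <stdio.h>

struct seg {
	uint8_t type, s, dpl, present, avl, l, db, g, unusable;
};

static uint32_t access_rights(const struct seg *var)
{
	uint32_t ar;

	ar  = var->type & 15;			/* bits 3:0   segment type */
	ar |= (var->s & 1) << 4;		/* bit 4      descriptor type */
	ar |= (var->dpl & 3) << 5;		/* bits 6:5   DPL */
	ar |= (var->present & 1) << 7;		/* bit 7      present */
	ar |= (var->avl & 1) << 12;		/* bit 12     available for software */
	ar |= (var->l & 1) << 13;		/* bit 13     64-bit code segment */
	ar |= (var->db & 1) << 14;		/* bit 14     default operation size */
	ar |= (var->g & 1) << 15;		/* bit 15     granularity */
	ar |= (var->unusable || !var->present) << 16;	/* bit 16  unusable */
	return ar;
}

int main(void)
{
	struct seg cs = { .type = 11, .s = 1, .dpl = 0, .present = 1,
			  .avl = 0, .l = 1, .db = 0, .g = 1 };

	printf("CS access rights: 0x%x\n", access_rights(&cs));
	return 0;
}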
diff --git a/arch/x86/kvm/vmx_evmcs.h b/arch/x86/kvm/vmx/vmx_evmcs.h
index 210a884090ad..210a884090ad 100644
--- a/arch/x86/kvm/vmx_evmcs.h
+++ b/arch/x86/kvm/vmx/vmx_evmcs.h
diff --git a/arch/x86/kvm/vmx_shadow_fields.h b/arch/x86/kvm/vmx/vmx_shadow_fields.h
index cd0c75f6d037..cd0c75f6d037 100644
--- a/arch/x86/kvm/vmx_shadow_fields.h
+++ b/arch/x86/kvm/vmx/vmx_shadow_fields.h
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 417abc9ba1ad..0548ae57826f 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1156,7 +1156,7 @@ static u32 msr_based_features[] = {
MSR_IA32_VMX_EPT_VPID_CAP,
MSR_IA32_VMX_VMFUNC,
- MSR_F10H_DECFG,
+ MSR_AMD64_DE_CFG,
MSR_IA32_UCODE_REV,
MSR_IA32_ARCH_CAPABILITIES,
};
@@ -1220,6 +1220,13 @@ u64 kvm_get_arch_capabilities(void)
/* KVM does not emulate MSR_IA32_TSX_CTRL. */
data &= ~ARCH_CAP_TSX_CTRL_MSR;
+
+ /* Guests don't need to know "Fill buffer clear control" exists */
+ data &= ~ARCH_CAP_FB_CLEAR_CTRL;
+
+ if (!boot_cpu_has_bug(X86_BUG_GDS) || gds_ucode_mitigated())
+ data |= ARCH_CAP_GDS_NO;
+
return data;
}
@@ -2453,6 +2460,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_AMD64_PATCH_LOADER:
case MSR_AMD64_BU_CFG2:
case MSR_AMD64_DC_CFG:
+ case MSR_AMD64_TW_CFG:
case MSR_F15H_EX_CFG:
break;
@@ -2756,6 +2764,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_AMD64_BU_CFG2:
case MSR_IA32_PERF_CTL:
case MSR_AMD64_DC_CFG:
+ case MSR_AMD64_TW_CFG:
case MSR_F15H_EX_CFG:
msr_info->data = 0;
break;
@@ -3633,12 +3642,11 @@ static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu,
{
unsigned long val;
+ memset(dbgregs, 0, sizeof(*dbgregs));
memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db));
kvm_get_dr(vcpu, 6, &val);
dbgregs->dr6 = val;
dbgregs->dr7 = vcpu->arch.dr7;
- dbgregs->flags = 0;
- memset(&dbgregs->reserved, 0, sizeof(dbgregs->reserved));
}
static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
@@ -9732,9 +9740,9 @@ void kvm_arch_end_assignment(struct kvm *kvm)
}
EXPORT_SYMBOL_GPL(kvm_arch_end_assignment);
-bool kvm_arch_has_assigned_device(struct kvm *kvm)
+bool noinstr kvm_arch_has_assigned_device(struct kvm *kvm)
{
- return atomic_read(&kvm->arch.assigned_device_count);
+ return arch_atomic_read(&kvm->arch.assigned_device_count);
}
EXPORT_SYMBOL_GPL(kvm_arch_has_assigned_device);
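[Editor's note] The kvm_vcpu_ioctl_x86_get_debugregs() change above replaces field-by-field zeroing with a single memset of the whole structure, which also covers padding and any fields added later before the struct is copied out to userspace. The sketch below shows only the general pattern; the struct and helper here are hypothetical, not the KVM ABI.

/*
 * Illustrative sketch of the "memset the whole struct first" pattern used in
 * kvm_vcpu_ioctl_x86_get_debugregs() above.  Hypothetical struct, not KVM code.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct demo_regs {
	uint64_t db[4];
	uint64_t dr6;
	uint64_t dr7;
	uint64_t flags;
	uint64_t reserved[9];
};

static void fill_regs(struct demo_regs *out, const uint64_t db[4],
		      uint64_t dr6, uint64_t dr7)
{
	/*
	 * Zero everything up front so flags, reserved[] and any padding are
	 * never left holding stale stack contents.
	 */
	memset(out, 0, sizeof(*out));
	memcpy(out->db, db, sizeof(out->db));
	out->dr6 = dr6;
	out->dr7 = dr7;
}

int main(void)
{
	uint64_t db[4] = { 0x1000, 0, 0, 0 };
	struct demo_regs regs;

	fill_regs(&regs, db, 0xffff0ff0, 0x400);
	printf("dr6=%#llx dr7=%#llx flags=%#llx\n",
	       (unsigned long long)regs.dr6, (unsigned long long)regs.dr7,
	       (unsigned long long)regs.flags);
	return 0;
}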