aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86/kvm/mmu/mmu.c
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kvm/mmu/mmu.c')
-rw-r--r--arch/x86/kvm/mmu/mmu.c462
1 files changed, 248 insertions, 214 deletions
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index cf89328780b1..b1beacae81a7 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -18,6 +18,7 @@
#include "irq.h"
#include "ioapic.h"
#include "mmu.h"
+#include "mmu_internal.h"
#include "x86.h"
#include "kvm_cache_regs.h"
#include "kvm_emulate.h"
@@ -91,7 +92,8 @@ module_param_named(flush_on_reuse, force_flush_and_sync_on_reuse, bool, 0644);
*/
bool tdp_enabled = false;
-static int max_page_level __read_mostly;
+static int max_huge_page_level __read_mostly;
+static int max_tdp_level __read_mostly;
enum {
AUDIT_PRE_PAGE_FAULT,
@@ -515,6 +517,18 @@ static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte)
return likely(kvm_gen == spte_gen);
}
+static gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access,
+ struct x86_exception *exception)
+{
+ /* Check if guest physical address doesn't exceed guest maximum */
+ if (kvm_mmu_is_illegal_gpa(vcpu, gpa)) {
+ exception->error_code |= PFERR_RSVD_MASK;
+ return UNMAPPED_GVA;
+ }
+
+ return gpa;
+}
+
/*
* Sets the shadow PTE masks used by the MMU.
*
@@ -676,7 +690,7 @@ union split_spte {
static void count_spte_clear(u64 *sptep, u64 spte)
{
- struct kvm_mmu_page *sp = page_header(__pa(sptep));
+ struct kvm_mmu_page *sp = sptep_to_sp(sptep);
if (is_shadow_present_pte(spte))
return;
@@ -760,7 +774,7 @@ static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
*/
static u64 __get_spte_lockless(u64 *sptep)
{
- struct kvm_mmu_page *sp = page_header(__pa(sptep));
+ struct kvm_mmu_page *sp = sptep_to_sp(sptep);
union split_spte spte, *orig = (union split_spte *)sptep;
int count;
@@ -1060,94 +1074,40 @@ static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
local_irq_enable();
}
-static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
- struct kmem_cache *base_cache, int min)
-{
- void *obj;
-
- if (cache->nobjs >= min)
- return 0;
- while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
- obj = kmem_cache_zalloc(base_cache, GFP_KERNEL_ACCOUNT);
- if (!obj)
- return cache->nobjs >= min ? 0 : -ENOMEM;
- cache->objects[cache->nobjs++] = obj;
- }
- return 0;
-}
-
-static int mmu_memory_cache_free_objects(struct kvm_mmu_memory_cache *cache)
-{
- return cache->nobjs;
-}
-
-static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc,
- struct kmem_cache *cache)
-{
- while (mc->nobjs)
- kmem_cache_free(cache, mc->objects[--mc->nobjs]);
-}
-
-static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache,
- int min)
-{
- void *page;
-
- if (cache->nobjs >= min)
- return 0;
- while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
- page = (void *)__get_free_page(GFP_KERNEL_ACCOUNT);
- if (!page)
- return cache->nobjs >= min ? 0 : -ENOMEM;
- cache->objects[cache->nobjs++] = page;
- }
- return 0;
-}
-
-static void mmu_free_memory_cache_page(struct kvm_mmu_memory_cache *mc)
-{
- while (mc->nobjs)
- free_page((unsigned long)mc->objects[--mc->nobjs]);
-}
-
-static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
+static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect)
{
int r;
- r = mmu_topup_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache,
- pte_list_desc_cache, 8 + PTE_PREFETCH_NUM);
+ /* 1 rmap, 1 parent PTE per level, and the prefetched rmaps. */
+ r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache,
+ 1 + PT64_ROOT_MAX_LEVEL + PTE_PREFETCH_NUM);
if (r)
- goto out;
- r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8);
+ return r;
+ r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_shadow_page_cache,
+ PT64_ROOT_MAX_LEVEL);
if (r)
- goto out;
- r = mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache,
- mmu_page_header_cache, 4);
-out:
- return r;
+ return r;
+ if (maybe_indirect) {
+ r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_gfn_array_cache,
+ PT64_ROOT_MAX_LEVEL);
+ if (r)
+ return r;
+ }
+ return kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache,
+ PT64_ROOT_MAX_LEVEL);
}
static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
{
- mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache,
- pte_list_desc_cache);
- mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache);
- mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache,
- mmu_page_header_cache);
-}
-
-static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
-{
- void *p;
-
- BUG_ON(!mc->nobjs);
- p = mc->objects[--mc->nobjs];
- return p;
+ kvm_mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache);
+ kvm_mmu_free_memory_cache(&vcpu->arch.mmu_shadow_page_cache);
+ kvm_mmu_free_memory_cache(&vcpu->arch.mmu_gfn_array_cache);
+ kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache);
}
static struct pte_list_desc *mmu_alloc_pte_list_desc(struct kvm_vcpu *vcpu)
{
- return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_list_desc_cache);
+ return kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_list_desc_cache);
}
static void mmu_free_pte_list_desc(struct pte_list_desc *pte_list_desc)
@@ -1415,10 +1375,10 @@ static struct kvm_rmap_head *gfn_to_rmap(struct kvm *kvm, gfn_t gfn,
static bool rmap_can_add(struct kvm_vcpu *vcpu)
{
- struct kvm_mmu_memory_cache *cache;
+ struct kvm_mmu_memory_cache *mc;
- cache = &vcpu->arch.mmu_pte_list_desc_cache;
- return mmu_memory_cache_free_objects(cache);
+ mc = &vcpu->arch.mmu_pte_list_desc_cache;
+ return kvm_mmu_memory_cache_nr_free_objects(mc);
}
static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
@@ -1426,7 +1386,7 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
struct kvm_mmu_page *sp;
struct kvm_rmap_head *rmap_head;
- sp = page_header(__pa(spte));
+ sp = sptep_to_sp(spte);
kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn);
rmap_head = gfn_to_rmap(vcpu->kvm, gfn, sp);
return pte_list_add(vcpu, spte, rmap_head);
@@ -1438,7 +1398,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
gfn_t gfn;
struct kvm_rmap_head *rmap_head;
- sp = page_header(__pa(spte));
+ sp = sptep_to_sp(spte);
gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
rmap_head = gfn_to_rmap(kvm, gfn, sp);
__pte_list_remove(spte, rmap_head);
@@ -1530,7 +1490,7 @@ static void drop_spte(struct kvm *kvm, u64 *sptep)
static bool __drop_large_spte(struct kvm *kvm, u64 *sptep)
{
if (is_large_pte(*sptep)) {
- WARN_ON(page_header(__pa(sptep))->role.level == PG_LEVEL_4K);
+ WARN_ON(sptep_to_sp(sptep)->role.level == PG_LEVEL_4K);
drop_spte(kvm, sptep);
--kvm->stat.lpages;
return true;
@@ -1542,7 +1502,7 @@ static bool __drop_large_spte(struct kvm *kvm, u64 *sptep)
static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
{
if (__drop_large_spte(vcpu->kvm, sptep)) {
- struct kvm_mmu_page *sp = page_header(__pa(sptep));
+ struct kvm_mmu_page *sp = sptep_to_sp(sptep);
kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn,
KVM_PAGES_PER_HPAGE(sp->role.level));
@@ -1738,21 +1698,6 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
}
-/**
- * kvm_arch_write_log_dirty - emulate dirty page logging
- * @vcpu: Guest mode vcpu
- *
- * Emulate arch specific page modification logging for the
- * nested hypervisor
- */
-int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu, gpa_t l2_gpa)
-{
- if (kvm_x86_ops.write_log_dirty)
- return kvm_x86_ops.write_log_dirty(vcpu, l2_gpa);
-
- return 0;
-}
-
bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
struct kvm_memory_slot *slot, u64 gfn)
{
@@ -2016,7 +1961,7 @@ static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
struct kvm_rmap_head *rmap_head;
struct kvm_mmu_page *sp;
- sp = page_header(__pa(spte));
+ sp = sptep_to_sp(spte);
rmap_head = gfn_to_rmap(vcpu->kvm, gfn, sp);
@@ -2105,10 +2050,10 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, int direct
{
struct kvm_mmu_page *sp;
- sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
- sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache);
+ sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
+ sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
if (!direct)
- sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache);
+ sp->gfns = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_gfn_array_cache);
set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
/*
@@ -2138,7 +2083,7 @@ static void mark_unsync(u64 *spte)
struct kvm_mmu_page *sp;
unsigned int index;
- sp = page_header(__pa(spte));
+ sp = sptep_to_sp(spte);
index = spte - sp->spt;
if (__test_and_set_bit(index, sp->unsync_child_bitmap))
return;
@@ -2207,7 +2152,7 @@ static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
continue;
}
- child = page_header(ent & PT64_BASE_ADDR_MASK);
+ child = to_shadow_page(ent & PT64_BASE_ADDR_MASK);
if (child->unsync_children) {
if (mmu_pages_add(pvec, child, i))
@@ -2258,15 +2203,14 @@ static bool kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
static void kvm_mmu_commit_zap_page(struct kvm *kvm,
struct list_head *invalid_list);
-
-#define for_each_valid_sp(_kvm, _sp, _gfn) \
- hlist_for_each_entry(_sp, \
- &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)], hash_link) \
+#define for_each_valid_sp(_kvm, _sp, _list) \
+ hlist_for_each_entry(_sp, _list, hash_link) \
if (is_obsolete_sp((_kvm), (_sp))) { \
} else
#define for_each_gfn_indirect_valid_sp(_kvm, _sp, _gfn) \
- for_each_valid_sp(_kvm, _sp, _gfn) \
+ for_each_valid_sp(_kvm, _sp, \
+ &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)]) \
if ((_sp)->gfn != (_gfn) || (_sp)->role.direct) {} else
static inline bool is_ept_sp(struct kvm_mmu_page *sp)
@@ -2464,9 +2408,7 @@ static void __clear_sp_write_flooding_count(struct kvm_mmu_page *sp)
static void clear_sp_write_flooding_count(u64 *spte)
{
- struct kvm_mmu_page *sp = page_header(__pa(spte));
-
- __clear_sp_write_flooding_count(sp);
+ __clear_sp_write_flooding_count(sptep_to_sp(spte));
}
static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
@@ -2476,7 +2418,9 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
int direct,
unsigned int access)
{
+ bool direct_mmu = vcpu->arch.mmu->direct_map;
union kvm_mmu_page_role role;
+ struct hlist_head *sp_list;
unsigned quadrant;
struct kvm_mmu_page *sp;
bool need_sync = false;
@@ -2490,13 +2434,14 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
if (role.direct)
role.gpte_is_8_bytes = true;
role.access = access;
- if (!vcpu->arch.mmu->direct_map
- && vcpu->arch.mmu->root_level <= PT32_ROOT_LEVEL) {
+ if (!direct_mmu && vcpu->arch.mmu->root_level <= PT32_ROOT_LEVEL) {
quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
role.quadrant = quadrant;
}
- for_each_valid_sp(vcpu->kvm, sp, gfn) {
+
+ sp_list = &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)];
+ for_each_valid_sp(vcpu->kvm, sp, sp_list) {
if (sp->gfn != gfn) {
collisions++;
continue;
@@ -2508,6 +2453,9 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
if (sp->role.word != role.word)
continue;
+ if (direct_mmu)
+ goto trace_get_page;
+
if (sp->unsync) {
/* The page is good, but __kvm_sync_page might still end
* up zapping it. If so, break in order to rebuild it.
@@ -2523,6 +2471,8 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
__clear_sp_write_flooding_count(sp);
+
+trace_get_page:
trace_kvm_mmu_get_page(sp, false);
goto out;
}
@@ -2533,8 +2483,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
sp->gfn = gfn;
sp->role = role;
- hlist_add_head(&sp->hash_link,
- &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]);
+ hlist_add_head(&sp->hash_link, sp_list);
if (!direct) {
/*
* we should do write protection before syncing pages
@@ -2548,8 +2497,6 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
if (level > PG_LEVEL_4K && need_sync)
flush |= kvm_sync_pages(vcpu, gfn, &invalid_list);
}
- sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen;
- clear_page(sp->spt);
trace_kvm_mmu_get_page(sp, true);
kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush);
@@ -2658,7 +2605,7 @@ static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep,
* so we should update the spte at this point to get
* a new sp with the correct access.
*/
- child = page_header(*sptep & PT64_BASE_ADDR_MASK);
+ child = to_shadow_page(*sptep & PT64_BASE_ADDR_MASK);
if (child->role.access == direct_access)
return;
@@ -2680,7 +2627,7 @@ static bool mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
if (is_large_pte(pte))
--kvm->stat.lpages;
} else {
- child = page_header(pte & PT64_BASE_ADDR_MASK);
+ child = to_shadow_page(pte & PT64_BASE_ADDR_MASK);
drop_parent_pte(child, spte);
}
return true;
@@ -2758,10 +2705,23 @@ static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm,
if (!sp->root_count) {
/* Count self */
(*nr_zapped)++;
- list_move(&sp->link, invalid_list);
+
+ /*
+ * Already invalid pages (previously active roots) are not on
+ * the active page list. See list_del() in the "else" case of
+ * !sp->root_count.
+ */
+ if (sp->role.invalid)
+ list_add(&sp->link, invalid_list);
+ else
+ list_move(&sp->link, invalid_list);
kvm_mod_used_mmu_pages(kvm, -1);
} else {
- list_move(&sp->link, &kvm->arch.active_mmu_pages);
+ /*
+ * Remove the active root from the active page list, the root
+ * will be explicitly freed when the root_count hits zero.
+ */
+ list_del(&sp->link);
/*
* Obsolete pages cannot be used on any vCPUs, see the comment
@@ -2813,33 +2773,60 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
}
}
-static bool prepare_zap_oldest_mmu_page(struct kvm *kvm,
- struct list_head *invalid_list)
+static unsigned long kvm_mmu_zap_oldest_mmu_pages(struct kvm *kvm,
+ unsigned long nr_to_zap)
{
- struct kvm_mmu_page *sp;
+ unsigned long total_zapped = 0;
+ struct kvm_mmu_page *sp, *tmp;
+ LIST_HEAD(invalid_list);
+ bool unstable;
+ int nr_zapped;
if (list_empty(&kvm->arch.active_mmu_pages))
- return false;
+ return 0;
+
+restart:
+ list_for_each_entry_safe(sp, tmp, &kvm->arch.active_mmu_pages, link) {
+ /*
+ * Don't zap active root pages, the page itself can't be freed
+ * and zapping it will just force vCPUs to realloc and reload.
+ */
+ if (sp->root_count)
+ continue;
+
+ unstable = __kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list,
+ &nr_zapped);
+ total_zapped += nr_zapped;
+ if (total_zapped >= nr_to_zap)
+ break;
+
+ if (unstable)
+ goto restart;
+ }
- sp = list_last_entry(&kvm->arch.active_mmu_pages,
- struct kvm_mmu_page, link);
- return kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
+ kvm_mmu_commit_zap_page(kvm, &invalid_list);
+
+ kvm->stat.mmu_recycled += total_zapped;
+ return total_zapped;
+}
+
+static inline unsigned long kvm_mmu_available_pages(struct kvm *kvm)
+{
+ if (kvm->arch.n_max_mmu_pages > kvm->arch.n_used_mmu_pages)
+ return kvm->arch.n_max_mmu_pages -
+ kvm->arch.n_used_mmu_pages;
+
+ return 0;
}
static int make_mmu_pages_available(struct kvm_vcpu *vcpu)
{
- LIST_HEAD(invalid_list);
+ unsigned long avail = kvm_mmu_available_pages(vcpu->kvm);
- if (likely(kvm_mmu_available_pages(vcpu->kvm) >= KVM_MIN_FREE_MMU_PAGES))
+ if (likely(avail >= KVM_MIN_FREE_MMU_PAGES))
return 0;
- while (kvm_mmu_available_pages(vcpu->kvm) < KVM_REFILL_PAGES) {
- if (!prepare_zap_oldest_mmu_page(vcpu->kvm, &invalid_list))
- break;
-
- ++vcpu->kvm->stat.mmu_recycled;
- }
- kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
+ kvm_mmu_zap_oldest_mmu_pages(vcpu->kvm, KVM_REFILL_PAGES - avail);
if (!kvm_mmu_available_pages(vcpu->kvm))
return -ENOSPC;
@@ -2852,17 +2839,12 @@ static int make_mmu_pages_available(struct kvm_vcpu *vcpu)
*/
void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long goal_nr_mmu_pages)
{
- LIST_HEAD(invalid_list);
-
spin_lock(&kvm->mmu_lock);
if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) {
- /* Need to free some mmu pages to achieve the goal. */
- while (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages)
- if (!prepare_zap_oldest_mmu_page(kvm, &invalid_list))
- break;
+ kvm_mmu_zap_oldest_mmu_pages(kvm, kvm->arch.n_used_mmu_pages -
+ goal_nr_mmu_pages);
- kvm_mmu_commit_zap_page(kvm, &invalid_list);
goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages;
}
@@ -3000,7 +2982,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
if (set_mmio_spte(vcpu, sptep, gfn, pfn, pte_access))
return 0;
- sp = page_header(__pa(sptep));
+ sp = sptep_to_sp(sptep);
if (sp_ad_disabled(sp))
spte |= SPTE_AD_DISABLED_MASK;
else if (kvm_vcpu_ad_need_write_protect(vcpu))
@@ -3103,7 +3085,7 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
struct kvm_mmu_page *child;
u64 pte = *sptep;
- child = page_header(pte & PT64_BASE_ADDR_MASK);
+ child = to_shadow_page(pte & PT64_BASE_ADDR_MASK);
drop_parent_pte(child, sptep);
flush = true;
} else if (pfn != spte_to_pfn(*sptep)) {
@@ -3213,7 +3195,7 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
{
struct kvm_mmu_page *sp;
- sp = page_header(__pa(sptep));
+ sp = sptep_to_sp(sptep);
/*
* Without accessed bits, there's no way to distinguish between
@@ -3275,7 +3257,7 @@ static int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn,
if (!slot)
return PG_LEVEL_4K;
- max_level = min(max_level, max_page_level);
+ max_level = min(max_level, max_huge_page_level);
for ( ; max_level > PG_LEVEL_4K; max_level--) {
linfo = lpage_info_slot(gfn, slot, max_level);
if (!linfo->disallow_lpage)
@@ -3521,7 +3503,7 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
if (!is_shadow_present_pte(spte))
break;
- sp = page_header(__pa(iterator.sptep));
+ sp = sptep_to_sp(iterator.sptep);
if (!is_last_spte(spte, sp->role.level))
break;
@@ -3608,7 +3590,7 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
if (!VALID_PAGE(*root_hpa))
return;
- sp = page_header(*root_hpa & PT64_BASE_ADDR_MASK);
+ sp = to_shadow_page(*root_hpa & PT64_BASE_ADDR_MASK);
--sp->root_count;
if (!sp->root_count && sp->role.invalid)
kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
@@ -3669,7 +3651,7 @@ static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn)
{
int ret = 0;
- if (!kvm_is_visible_gfn(vcpu->kvm, root_gfn)) {
+ if (!kvm_vcpu_is_visible_gfn(vcpu, root_gfn)) {
kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
ret = 1;
}
@@ -3838,7 +3820,7 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
if (vcpu->arch.mmu->root_level >= PT64_ROOT_4LEVEL) {
hpa_t root = vcpu->arch.mmu->root_hpa;
- sp = page_header(root);
+ sp = to_shadow_page(root);
/*
* Even if another CPU was marking the SP as unsync-ed
@@ -3872,7 +3854,7 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
if (root && VALID_PAGE(root)) {
root &= PT64_BASE_ADDR_MASK;
- sp = page_header(root);
+ sp = to_shadow_page(root);
mmu_sync_children(vcpu, sp);
}
}
@@ -4046,8 +4028,8 @@ static void shadow_page_table_clear_flood(struct kvm_vcpu *vcpu, gva_t addr)
walk_shadow_page_lockless_end(vcpu);
}
-static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
- gfn_t gfn)
+static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
+ gfn_t gfn)
{
struct kvm_arch_async_pf arch;
@@ -4109,16 +4091,16 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
if (page_fault_handle_page_track(vcpu, error_code, gfn))
return RET_PF_EMULATE;
- r = mmu_topup_memory_caches(vcpu);
+ if (fast_page_fault(vcpu, gpa, error_code))
+ return RET_PF_RETRY;
+
+ r = mmu_topup_memory_caches(vcpu, false);
if (r)
return r;
if (lpage_disallowed)
max_level = PG_LEVEL_4K;
- if (fast_page_fault(vcpu, gpa, error_code))
- return RET_PF_RETRY;
-
mmu_seq = vcpu->kvm->mmu_notifier_seq;
smp_rmb();
@@ -4132,7 +4114,8 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
spin_lock(&vcpu->kvm->mmu_lock);
if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
goto out_unlock;
- if (make_mmu_pages_available(vcpu) < 0)
+ r = make_mmu_pages_available(vcpu);
+ if (r)
goto out_unlock;
r = __direct_map(vcpu, gpa, write, map_writable, max_level, pfn,
prefault, is_tdp && lpage_disallowed);
@@ -4157,6 +4140,7 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
u64 fault_address, char *insn, int insn_len)
{
int r = 1;
+ u32 flags = vcpu->arch.apf.host_apf_flags;
#ifndef CONFIG_X86_64
/* A 64-bit CR2 should be impossible on 32-bit KVM. */
@@ -4165,28 +4149,22 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
#endif
vcpu->arch.l1tf_flush_l1d = true;
- switch (vcpu->arch.apf.host_apf_flags) {
- default:
+ if (!flags) {
trace_kvm_page_fault(fault_address, error_code);
if (kvm_event_needs_reinjection(vcpu))
kvm_mmu_unprotect_page_virt(vcpu, fault_address);
r = kvm_mmu_page_fault(vcpu, fault_address, error_code, insn,
insn_len);
- break;
- case KVM_PV_REASON_PAGE_NOT_PRESENT:
+ } else if (flags & KVM_PV_REASON_PAGE_NOT_PRESENT) {
vcpu->arch.apf.host_apf_flags = 0;
local_irq_disable();
kvm_async_pf_task_wait_schedule(fault_address);
local_irq_enable();
- break;
- case KVM_PV_REASON_PAGE_READY:
- vcpu->arch.apf.host_apf_flags = 0;
- local_irq_disable();
- kvm_async_pf_task_wake(fault_address);
- local_irq_enable();
- break;
+ } else {
+ WARN_ONCE(1, "Unexpected host async PF flags: %x\n", flags);
}
+
return r;
}
EXPORT_SYMBOL_GPL(kvm_handle_page_fault);
@@ -4228,8 +4206,8 @@ static inline bool is_root_usable(struct kvm_mmu_root_info *root, gpa_t pgd,
union kvm_mmu_page_role role)
{
return (role.direct || pgd == root->pgd) &&
- VALID_PAGE(root->hpa) && page_header(root->hpa) &&
- role.word == page_header(root->hpa)->role.word;
+ VALID_PAGE(root->hpa) && to_shadow_page(root->hpa) &&
+ role.word == to_shadow_page(root->hpa)->role.word;
}
/*
@@ -4278,8 +4256,7 @@ static bool fast_pgd_switch(struct kvm_vcpu *vcpu, gpa_t new_pgd,
*/
if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL &&
mmu->root_level >= PT64_ROOT_4LEVEL)
- return !mmu_check_root(vcpu, new_pgd >> PAGE_SHIFT) &&
- cached_root_available(vcpu, new_pgd, new_role);
+ return cached_root_available(vcpu, new_pgd, new_role);
return false;
}
@@ -4314,7 +4291,7 @@ static void __kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd,
*/
vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
- __clear_sp_write_flooding_count(page_header(vcpu->arch.mmu->root_hpa));
+ __clear_sp_write_flooding_count(to_shadow_page(vcpu->arch.mmu->root_hpa));
}
void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd, bool skip_tlb_flush,
@@ -4870,13 +4847,22 @@ static union kvm_mmu_role kvm_calc_mmu_role_common(struct kvm_vcpu *vcpu,
return role;
}
+static inline int kvm_mmu_get_tdp_level(struct kvm_vcpu *vcpu)
+{
+ /* Use 5-level TDP if and only if it's useful/necessary. */
+ if (max_tdp_level == 5 && cpuid_maxphyaddr(vcpu) <= 48)
+ return 4;
+
+ return max_tdp_level;
+}
+
static union kvm_mmu_role
kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only)
{
union kvm_mmu_role role = kvm_calc_mmu_role_common(vcpu, base_only);
role.base.ad_disabled = (shadow_accessed_mask == 0);
- role.base.level = vcpu->arch.tdp_level;
+ role.base.level = kvm_mmu_get_tdp_level(vcpu);
role.base.direct = true;
role.base.gpte_is_8_bytes = true;
@@ -4885,7 +4871,7 @@ kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only)
static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
{
- struct kvm_mmu *context = vcpu->arch.mmu;
+ struct kvm_mmu *context = &vcpu->arch.root_mmu;
union kvm_mmu_role new_role =
kvm_calc_tdp_mmu_root_page_role(vcpu, false);
@@ -4897,7 +4883,7 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
context->sync_page = nonpaging_sync_page;
context->invlpg = NULL;
context->update_pte = nonpaging_update_pte;
- context->shadow_root_level = vcpu->arch.tdp_level;
+ context->shadow_root_level = kvm_mmu_get_tdp_level(vcpu);
context->direct_map = true;
context->get_guest_pgd = get_cr3;
context->get_pdptr = kvm_pdptr_read;
@@ -4932,7 +4918,7 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
}
static union kvm_mmu_role
-kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only)
+kvm_calc_shadow_root_page_role_common(struct kvm_vcpu *vcpu, bool base_only)
{
union kvm_mmu_role role = kvm_calc_mmu_role_common(vcpu, base_only);
@@ -4940,9 +4926,19 @@ kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only)
!is_write_protection(vcpu);
role.base.smap_andnot_wp = role.ext.cr4_smap &&
!is_write_protection(vcpu);
- role.base.direct = !is_paging(vcpu);
role.base.gpte_is_8_bytes = !!is_pae(vcpu);
+ return role;
+}
+
+static union kvm_mmu_role
+kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only)
+{
+ union kvm_mmu_role role =
+ kvm_calc_shadow_root_page_role_common(vcpu, base_only);
+
+ role.base.direct = !is_paging(vcpu);
+
if (!is_long_mode(vcpu))
role.base.level = PT32E_ROOT_LEVEL;
else if (is_la57_mode(vcpu))
@@ -4953,15 +4949,10 @@ kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only)
return role;
}
-void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, u32 cr0, u32 cr4, u32 efer)
+static void shadow_mmu_init_context(struct kvm_vcpu *vcpu, struct kvm_mmu *context,
+ u32 cr0, u32 cr4, u32 efer,
+ union kvm_mmu_role new_role)
{
- struct kvm_mmu *context = vcpu->arch.mmu;
- union kvm_mmu_role new_role =
- kvm_calc_shadow_mmu_root_page_role(vcpu, false);
-
- if (new_role.as_u64 == context->mmu_role.as_u64)
- return;
-
if (!(cr0 & X86_CR0_PG))
nonpaging_init_context(vcpu, context);
else if (efer & EFER_LMA)
@@ -4974,7 +4965,43 @@ void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, u32 cr0, u32 cr4, u32 efer)
context->mmu_role.as_u64 = new_role.as_u64;
reset_shadow_zero_bits_mask(vcpu, context);
}
-EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu);
+
+static void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, u32 cr0, u32 cr4, u32 efer)
+{
+ struct kvm_mmu *context = &vcpu->arch.root_mmu;
+ union kvm_mmu_role new_role =
+ kvm_calc_shadow_mmu_root_page_role(vcpu, false);
+
+ if (new_role.as_u64 != context->mmu_role.as_u64)
+ shadow_mmu_init_context(vcpu, context, cr0, cr4, efer, new_role);
+}
+
+static union kvm_mmu_role
+kvm_calc_shadow_npt_root_page_role(struct kvm_vcpu *vcpu)
+{
+ union kvm_mmu_role role =
+ kvm_calc_shadow_root_page_role_common(vcpu, false);
+
+ role.base.direct = false;
+ role.base.level = kvm_mmu_get_tdp_level(vcpu);
+
+ return role;
+}
+
+void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, u32 cr0, u32 cr4, u32 efer,
+ gpa_t nested_cr3)
+{
+ struct kvm_mmu *context = &vcpu->arch.guest_mmu;
+ union kvm_mmu_role new_role = kvm_calc_shadow_npt_root_page_role(vcpu);
+
+ context->shadow_root_level = new_role.base.level;
+
+ __kvm_mmu_new_pgd(vcpu, nested_cr3, new_role.base, false, false);
+
+ if (new_role.as_u64 != context->mmu_role.as_u64)
+ shadow_mmu_init_context(vcpu, context, cr0, cr4, efer, new_role);
+}
+EXPORT_SYMBOL_GPL(kvm_init_shadow_npt_mmu);
static union kvm_mmu_role
kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty,
@@ -5008,7 +5035,7 @@ kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty,
void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
bool accessed_dirty, gpa_t new_eptp)
{
- struct kvm_mmu *context = vcpu->arch.mmu;
+ struct kvm_mmu *context = &vcpu->arch.guest_mmu;
u8 level = vmx_eptp_page_walk_level(new_eptp);
union kvm_mmu_role new_role =
kvm_calc_shadow_ept_root_page_role(vcpu, accessed_dirty,
@@ -5042,7 +5069,7 @@ EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu);
static void init_kvm_softmmu(struct kvm_vcpu *vcpu)
{
- struct kvm_mmu *context = vcpu->arch.mmu;
+ struct kvm_mmu *context = &vcpu->arch.root_mmu;
kvm_init_shadow_mmu(vcpu,
kvm_read_cr0_bits(vcpu, X86_CR0_PG),
@@ -5152,7 +5179,7 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
{
int r;
- r = mmu_topup_memory_caches(vcpu);
+ r = mmu_topup_memory_caches(vcpu, !vcpu->arch.mmu->direct_map);
if (r)
goto out;
r = mmu_alloc_roots(vcpu);
@@ -5346,7 +5373,7 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
* or not since pte prefetch is skiped if it does not have
* enough objects in the cache.
*/
- mmu_topup_memory_caches(vcpu);
+ mmu_topup_memory_caches(vcpu, true);
spin_lock(&vcpu->kvm->mmu_lock);
@@ -5554,23 +5581,25 @@ void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid)
}
EXPORT_SYMBOL_GPL(kvm_mmu_invpcid_gva);
-void kvm_configure_mmu(bool enable_tdp, int tdp_page_level)
+void kvm_configure_mmu(bool enable_tdp, int tdp_max_root_level,
+ int tdp_huge_page_level)
{
tdp_enabled = enable_tdp;
+ max_tdp_level = tdp_max_root_level;
/*
- * max_page_level reflects the capabilities of KVM's MMU irrespective
+ * max_huge_page_level reflects KVM's MMU capabilities irrespective
* of kernel support, e.g. KVM may be capable of using 1GB pages when
* the kernel is not. But, KVM never creates a page size greater than
* what is used by the kernel for any given HVA, i.e. the kernel's
* capabilities are ultimately consulted by kvm_mmu_hugepage_adjust().
*/
if (tdp_enabled)
- max_page_level = tdp_page_level;
+ max_huge_page_level = tdp_huge_page_level;
else if (boot_cpu_has(X86_FEATURE_GBPAGES))
- max_page_level = PG_LEVEL_1G;
+ max_huge_page_level = PG_LEVEL_1G;
else
- max_page_level = PG_LEVEL_2M;
+ max_huge_page_level = PG_LEVEL_2M;
}
EXPORT_SYMBOL_GPL(kvm_configure_mmu);
@@ -5666,7 +5695,7 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
* SVM's 32-bit NPT support, TDP paging doesn't use PAE paging and can
* skip allocating the PDP table.
*/
- if (tdp_enabled && vcpu->arch.tdp_level > PT32E_ROOT_LEVEL)
+ if (tdp_enabled && kvm_mmu_get_tdp_level(vcpu) > PT32E_ROOT_LEVEL)
return 0;
page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_DMA32);
@@ -5685,6 +5714,14 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu)
uint i;
int ret;
+ vcpu->arch.mmu_pte_list_desc_cache.kmem_cache = pte_list_desc_cache;
+ vcpu->arch.mmu_pte_list_desc_cache.gfp_zero = __GFP_ZERO;
+
+ vcpu->arch.mmu_page_header_cache.kmem_cache = mmu_page_header_cache;
+ vcpu->arch.mmu_page_header_cache.gfp_zero = __GFP_ZERO;
+
+ vcpu->arch.mmu_shadow_page_cache.gfp_zero = __GFP_ZERO;
+
vcpu->arch.mmu = &vcpu->arch.root_mmu;
vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
@@ -5733,12 +5770,11 @@ restart:
break;
/*
- * Skip invalid pages with a non-zero root count, zapping pages
- * with a non-zero root count will never succeed, i.e. the page
- * will get thrown back on active_mmu_pages and we'll get stuck
- * in an infinite loop.
+ * Invalid pages should never land back on the list of active
+ * pages. Skip the bogus page, otherwise we'll get stuck in an
+ * infinite loop if the page gets put back on the list (again).
*/
- if (sp->role.invalid && sp->root_count)
+ if (WARN_ON(sp->role.invalid))
continue;
/*
@@ -5908,7 +5944,7 @@ static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
restart:
for_each_rmap_spte(rmap_head, &iter, sptep) {
- sp = page_header(__pa(sptep));
+ sp = sptep_to_sp(sptep);
pfn = spte_to_pfn(*sptep);
/*
@@ -6019,7 +6055,7 @@ void kvm_mmu_zap_all(struct kvm *kvm)
spin_lock(&kvm->mmu_lock);
restart:
list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) {
- if (sp->role.invalid && sp->root_count)
+ if (WARN_ON(sp->role.invalid))
continue;
if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign))
goto restart;
@@ -6096,9 +6132,7 @@ mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
goto unlock;
}
- if (prepare_zap_oldest_mmu_page(kvm, &invalid_list))
- freed++;
- kvm_mmu_commit_zap_page(kvm, &invalid_list);
+ freed = kvm_mmu_zap_oldest_mmu_pages(kvm, sc->nr_to_scan);
unlock:
spin_unlock(&kvm->mmu_lock);