From c5cd21271fbd17c27cb4dbfa0a70b9108529d184 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 9 Sep 2017 21:27:32 -0700 Subject: [PATCH 019/103] kaiser: vmstat show NR_KAISERTABLE as nr_overhead The kaiser update made an interesting choice, never to free any shadow page tables. Contention on the global spinlock was worrying, particularly with it held across page table scans when freeing. Something had to be done: I was going to add refcounting; but simply never to free them is an appealing choice, minimizing contention without complicating the code (the more a page table is found already, the less the spinlock is used). But leaking pages in this way is also a worry: can we get away with it? At the very least, we need a count to show how bad it actually gets: in principle, one might end up wasting about 1/256 of memory that way (1/512 for when direct-mapped pages have to be user-mapped, plus 1/512 for when they are user-mapped from the vmalloc area on another occasion (but we don't have vmalloc'ed stacks, so only large ldts are vmalloc'ed)). Add per-cpu stat NR_KAISERTABLE: including 256 at startup for the shared pgd entries, and 1 for each intermediate page table added thereafter for user-mapping - but leave out the 1 per mm, for its shadow pgd, because that distracts from the monotonic increase. Shown in /proc/vmstat as nr_overhead (0 if kaiser not enabled). In practice, it doesn't look so bad so far: more like 1/12000 after nine hours of gtests below; and movable pageblock segregation should tend to cluster the kaiser tables into a subset of the address space (if not, they will be bad for compaction too). But production may tell a different story: keep an eye on this number, and bring back lighter freeing if it gets out of control (maybe a shrinker). 
["nr_overhead" should of course say "nr_kaisertable", if it needs to stay; but for the moment we are being coy, preferring that when Joe Blow notices a new line in his /proc/vmstat, he does not get too curious about what this "kaiser" stuff might be.] Signed-off-by: Hugh Dickins Signed-off-by: Greg Kroah-Hartman --- arch/x86/mm/kaiser.c | 16 +++++++++++----- include/linux/mmzone.h | 3 ++- mm/vmstat.c | 1 + 3 files changed, 14 insertions(+), 6 deletions(-) diff --git a/arch/x86/mm/kaiser.c b/arch/x86/mm/kaiser.c index 7a7e850..bd22ef5 100644 --- a/arch/x86/mm/kaiser.c +++ b/arch/x86/mm/kaiser.c @@ -121,9 +121,11 @@ static pte_t *kaiser_pagetable_walk(unsigned long address, bool is_atomic) if (!new_pmd_page) return NULL; spin_lock(&shadow_table_allocation_lock); - if (pud_none(*pud)) + if (pud_none(*pud)) { set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page))); - else + __inc_zone_page_state(virt_to_page((void *) + new_pmd_page), NR_KAISERTABLE); + } else free_page(new_pmd_page); spin_unlock(&shadow_table_allocation_lock); } @@ -139,9 +141,11 @@ static pte_t *kaiser_pagetable_walk(unsigned long address, bool is_atomic) if (!new_pte_page) return NULL; spin_lock(&shadow_table_allocation_lock); - if (pmd_none(*pmd)) + if (pmd_none(*pmd)) { set_pmd(pmd, __pmd(_KERNPG_TABLE | __pa(new_pte_page))); - else + __inc_zone_page_state(virt_to_page((void *) + new_pte_page), NR_KAISERTABLE); + } else free_page(new_pte_page); spin_unlock(&shadow_table_allocation_lock); } @@ -205,11 +209,13 @@ static void __init kaiser_init_all_pgds(void) pgd = native_get_shadow_pgd(pgd_offset_k((unsigned long )0)); for (i = PTRS_PER_PGD / 2; i < PTRS_PER_PGD; i++) { pgd_t new_pgd; - pud_t *pud = pud_alloc_one(&init_mm, PAGE_OFFSET + i * PGDIR_SIZE); + pud_t *pud = pud_alloc_one(&init_mm, + PAGE_OFFSET + i * PGDIR_SIZE); if (!pud) { WARN_ON(1); break; } + inc_zone_page_state(virt_to_page(pud), NR_KAISERTABLE); new_pgd = __pgd(_KERNPG_TABLE |__pa(pud)); /* * Make sure not to stomp on some other pgd 
entry. diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 7e273e2..0547d4f 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -124,8 +124,9 @@ enum zone_stat_item { NR_SLAB_UNRECLAIMABLE, NR_PAGETABLE, /* used for pagetables */ NR_KERNEL_STACK_KB, /* measured in KiB */ - /* Second 128 byte cacheline */ + NR_KAISERTABLE, NR_BOUNCE, + /* Second 128 byte cacheline */ #if IS_ENABLED(CONFIG_ZSMALLOC) NR_ZSPAGES, /* allocated in zsmalloc */ #endif diff --git a/mm/vmstat.c b/mm/vmstat.c index 604f26a..6a088df 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -932,6 +932,7 @@ const char * const vmstat_text[] = { "nr_slab_unreclaimable", "nr_page_table_pages", "nr_kernel_stack", + "nr_overhead", "nr_bounce", #if IS_ENABLED(CONFIG_ZSMALLOC) "nr_zspages", -- 2.7.4