From 6ceca45ce264990a8831d3e5f7ff6e8c0d10df3a Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sun, 24 Sep 2017 16:59:49 -0700 Subject: [PATCH 027/103] kaiser: add "nokaiser" boot option, using ALTERNATIVE Added "nokaiser" boot option: an early param like "noinvpcid". Most places now check int kaiser_enabled (#defined 0 when not CONFIG_KAISER) instead of #ifdef CONFIG_KAISER; but entry_64.S and entry_64_compat.S are using the ALTERNATIVE technique, which patches in the preferred instructions at runtime. That technique is tied to x86 cpu features, so X86_FEATURE_KAISER is fabricated. Prior to "nokaiser", Kaiser #defined _PAGE_GLOBAL 0: revert that, but be careful with both _PAGE_GLOBAL and CR4.PGE: setting them when nokaiser like when !CONFIG_KAISER, but not setting either when kaiser - neither matters on its own, but it's hard to be sure that _PAGE_GLOBAL won't get set in some obscure corner, or something add PGE into CR4. By omitting _PAGE_GLOBAL from __supported_pte_mask when kaiser_enabled, all page table setup which uses pte_pfn() masks it out of the ptes. It's slightly shameful that the same declaration versus definition of kaiser_enabled appears in not one, not two, but in three header files (asm/kaiser.h, asm/pgtable.h, asm/tlbflush.h). I felt safer that way, than with #including any of those in any of the others; and did not feel it worth an asm/kaiser_enabled.h - kernel/cpu/common.c includes them all, so we shall hear about it if they get out of synch. Cleanups while in the area: removed the silly #ifdef CONFIG_KAISER from kaiser.c; removed the unused native_get_normal_pgd(); removed the spurious reg clutter from SWITCH_*_CR3 macro stubs; corrected some comments. But more interestingly, set CR4.PSE in secondary_startup_64: the manual is clear that it does not matter whether it's 0 or 1 when 4-level-pts are enabled, but I was distracted to find cr4 different on BSP and auxiliaries - BSP alone was adding PSE, in probe_page_size_mask(). Signed-off-by: Hugh Dickins Acked-by: Jiri Kosina Signed-off-by: Greg Kroah-Hartman --- Documentation/kernel-parameters.txt | 2 ++ arch/x86/entry/entry_64.S | 15 ++++++------ arch/x86/include/asm/cpufeatures.h | 3 +++ arch/x86/include/asm/kaiser.h | 27 ++++++++++++++++------ arch/x86/include/asm/pgtable.h | 20 +++++++++++----- arch/x86/include/asm/pgtable_64.h | 13 ++++------- arch/x86/include/asm/pgtable_types.h | 4 ---- arch/x86/include/asm/tlbflush.h | 39 ++++++++++++++++++++------------ arch/x86/kernel/cpu/common.c | 28 ++++++++++++++++++++++- arch/x86/kernel/espfix_64.c | 3 ++- arch/x86/kernel/head_64.S | 4 ++-- arch/x86/mm/init.c | 2 +- arch/x86/mm/init_64.c | 10 ++++++++ arch/x86/mm/kaiser.c | 26 +++++++++++++++++---- arch/x86/mm/pgtable.c | 8 ++----- arch/x86/mm/tlb.c | 4 +--- tools/arch/x86/include/asm/cpufeatures.h | 3 +++ 17 files changed, 146 insertions(+), 65 deletions(-) diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index a303387..e2642ec 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -2753,6 +2753,8 @@ bytes respectively. Such letter suffixes can also be entirely omitted. nojitter [IA-64] Disables jitter checking for ITC timers. + nokaiser [X86-64] Disable KAISER isolation of kernel from user. + no-kvmclock [X86,KVM] Disable paravirtualized KVM clock driver no-kvmapf [X86,KVM] Disable paravirtualized asynchronous page diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 41bf650..bbb38ac 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -1079,7 +1079,7 @@ ENTRY(paranoid_entry) * unconditionally, but we need to find out whether the reverse * should be done on return (conveyed to paranoid_exit in %ebx). */ - movq %cr3, %rax + ALTERNATIVE "jmp 2f", "movq %cr3, %rax", X86_FEATURE_KAISER testl $KAISER_SHADOW_PGD_OFFSET, %eax jz 2f orl $2, %ebx @@ -1111,6 +1111,7 @@ ENTRY(paranoid_exit) TRACE_IRQS_OFF_DEBUG TRACE_IRQS_IRETQ_DEBUG #ifdef CONFIG_KAISER + /* No ALTERNATIVE for X86_FEATURE_KAISER: paranoid_entry sets %ebx */ testl $2, %ebx /* SWITCH_USER_CR3 needed? */ jz paranoid_exit_no_switch SWITCH_USER_CR3 @@ -1339,13 +1340,14 @@ ENTRY(nmi) #ifdef CONFIG_KAISER /* Unconditionally use kernel CR3 for do_nmi() */ /* %rax is saved above, so OK to clobber here */ - movq %cr3, %rax + ALTERNATIVE "jmp 2f", "movq %cr3, %rax", X86_FEATURE_KAISER /* If PCID enabled, NOFLUSH now and NOFLUSH on return */ orq x86_cr3_pcid_noflush, %rax pushq %rax /* mask off "user" bit of pgd address and 12 PCID bits: */ andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax movq %rax, %cr3 +2: #endif call do_nmi @@ -1355,8 +1357,7 @@ ENTRY(nmi) * kernel code that needs user CR3, but do we ever return * to "user mode" where we need the kernel CR3? */ - popq %rax - mov %rax, %cr3 + ALTERNATIVE "", "popq %rax; movq %rax, %cr3", X86_FEATURE_KAISER #endif /* @@ -1583,13 +1584,14 @@ end_repeat_nmi: #ifdef CONFIG_KAISER /* Unconditionally use kernel CR3 for do_nmi() */ /* %rax is saved above, so OK to clobber here */ - movq %cr3, %rax + ALTERNATIVE "jmp 2f", "movq %cr3, %rax", X86_FEATURE_KAISER /* If PCID enabled, NOFLUSH now and NOFLUSH on return */ orq x86_cr3_pcid_noflush, %rax pushq %rax /* mask off "user" bit of pgd address and 12 PCID bits: */ andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax movq %rax, %cr3 +2: #endif /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */ @@ -1601,8 +1603,7 @@ end_repeat_nmi: * kernel code that needs user CR3, like just just before * a sysret. */ - popq %rax - mov %rax, %cr3 + ALTERNATIVE "", "popq %rax; movq %rax, %cr3", X86_FEATURE_KAISER #endif testl %ebx, %ebx /* swapgs needed? */ diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index dc50883..20271d6 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -198,6 +198,9 @@ #define X86_FEATURE_AVX512_4VNNIW (7*32+16) /* AVX-512 Neural Network Instructions */ #define X86_FEATURE_AVX512_4FMAPS (7*32+17) /* AVX-512 Multiply Accumulation Single precision */ +/* Because the ALTERNATIVE scheme is for members of the X86_FEATURE club... */ +#define X86_FEATURE_KAISER ( 7*32+31) /* CONFIG_KAISER w/o nokaiser */ + /* Virtualization flags: Linux defined, word 8 */ #define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */ #define X86_FEATURE_VNMI ( 8*32+ 1) /* Intel Virtual NMI */ diff --git a/arch/x86/include/asm/kaiser.h b/arch/x86/include/asm/kaiser.h index 3dc5f4c..96643a9 100644 --- a/arch/x86/include/asm/kaiser.h +++ b/arch/x86/include/asm/kaiser.h @@ -46,28 +46,33 @@ movq \reg, %cr3 .endm .macro SWITCH_KERNEL_CR3 -pushq %rax +ALTERNATIVE "jmp 8f", "pushq %rax", X86_FEATURE_KAISER _SWITCH_TO_KERNEL_CR3 %rax popq %rax +8: .endm .macro SWITCH_USER_CR3 -pushq %rax +ALTERNATIVE "jmp 8f", "pushq %rax", X86_FEATURE_KAISER _SWITCH_TO_USER_CR3 %rax %al popq %rax +8: .endm .macro SWITCH_KERNEL_CR3_NO_STACK -movq %rax, PER_CPU_VAR(unsafe_stack_register_backup) +ALTERNATIVE "jmp 8f", \ + __stringify(movq %rax, PER_CPU_VAR(unsafe_stack_register_backup)), \ + X86_FEATURE_KAISER _SWITCH_TO_KERNEL_CR3 %rax movq PER_CPU_VAR(unsafe_stack_register_backup), %rax +8: .endm #else /* CONFIG_KAISER */ -.macro SWITCH_KERNEL_CR3 reg +.macro SWITCH_KERNEL_CR3 .endm -.macro SWITCH_USER_CR3 reg regb +.macro SWITCH_USER_CR3 .endm .macro SWITCH_KERNEL_CR3_NO_STACK .endm @@ -90,6 +95,16 @@ DECLARE_PER_CPU(unsigned long, x86_cr3_pcid_user); extern char __per_cpu_user_mapped_start[], __per_cpu_user_mapped_end[]; +extern int kaiser_enabled; +#else +#define kaiser_enabled 0 +#endif /* CONFIG_KAISER */ + +/* + * Kaiser function prototypes are needed even when CONFIG_KAISER is not set, + * so as to build with tests on kaiser_enabled instead of #ifdefs. + */ + /** * kaiser_add_mapping - map a virtual memory part to the shadow (user) mapping * @addr: the start address of the range @@ -119,8 +134,6 @@ extern void kaiser_remove_mapping(unsigned long start, unsigned long size); */ extern void kaiser_init(void); -#endif /* CONFIG_KAISER */ - #endif /* __ASSEMBLY */ #endif /* _ASM_X86_KAISER_H */ diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 1cee98e..217e83a 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -18,6 +18,12 @@ #ifndef __ASSEMBLY__ #include +#ifdef CONFIG_KAISER +extern int kaiser_enabled; +#else +#define kaiser_enabled 0 +#endif + void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd); void ptdump_walk_pgd_level_checkwx(void); @@ -697,7 +703,7 @@ static inline int pgd_bad(pgd_t pgd) * page table by accident; it will fault on the first * instruction it tries to run. See native_set_pgd(). */ - if (IS_ENABLED(CONFIG_KAISER)) + if (kaiser_enabled) ignore_flags |= _PAGE_NX; return (pgd_flags(pgd) & ~ignore_flags) != _KERNPG_TABLE; @@ -913,12 +919,14 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm, */ static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count) { - memcpy(dst, src, count * sizeof(pgd_t)); + memcpy(dst, src, count * sizeof(pgd_t)); #ifdef CONFIG_KAISER - /* Clone the shadow pgd part as well */ - memcpy(native_get_shadow_pgd(dst), - native_get_shadow_pgd(src), - count * sizeof(pgd_t)); + if (kaiser_enabled) { + /* Clone the shadow pgd part as well */ + memcpy(native_get_shadow_pgd(dst), + native_get_shadow_pgd(src), + count * sizeof(pgd_t)); + } #endif } diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 177caf3..cf68b5c 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -111,13 +111,12 @@ extern pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd); static inline pgd_t *native_get_shadow_pgd(pgd_t *pgdp) { +#ifdef CONFIG_DEBUG_VM + /* linux/mmdebug.h may not have been included at this point */ + BUG_ON(!kaiser_enabled); +#endif return (pgd_t *)((unsigned long)pgdp | (unsigned long)PAGE_SIZE); } - -static inline pgd_t *native_get_normal_pgd(pgd_t *pgdp) -{ - return (pgd_t *)((unsigned long)pgdp & ~(unsigned long)PAGE_SIZE); -} #else static inline pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd) { @@ -128,10 +127,6 @@ static inline pgd_t *native_get_shadow_pgd(pgd_t *pgdp) BUILD_BUG_ON(1); return NULL; } -static inline pgd_t *native_get_normal_pgd(pgd_t *pgdp) -{ - return pgdp; -} #endif /* CONFIG_KAISER */ static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd) diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 7cf2883..f0d9a1a 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -45,11 +45,7 @@ #define _PAGE_ACCESSED (_AT(pteval_t, 1) << _PAGE_BIT_ACCESSED) #define _PAGE_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_DIRTY) #define _PAGE_PSE (_AT(pteval_t, 1) << _PAGE_BIT_PSE) -#ifdef CONFIG_KAISER -#define _PAGE_GLOBAL (_AT(pteval_t, 0)) -#else #define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL) -#endif #define _PAGE_SOFTW1 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW1) #define _PAGE_SOFTW2 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW2) #define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT) diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 4fff696..13a74f6 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -138,9 +138,11 @@ static inline void cr4_set_bits_and_update_boot(unsigned long mask) * to avoid the need for asm/kaiser.h in unexpected places. */ #ifdef CONFIG_KAISER +extern int kaiser_enabled; extern void kaiser_setup_pcid(void); extern void kaiser_flush_tlb_on_return_to_user(void); #else +#define kaiser_enabled 0 static inline void kaiser_setup_pcid(void) { } @@ -165,7 +167,7 @@ static inline void __native_flush_tlb(void) * back: */ preempt_disable(); - if (this_cpu_has(X86_FEATURE_PCID)) + if (kaiser_enabled && this_cpu_has(X86_FEATURE_PCID)) kaiser_flush_tlb_on_return_to_user(); native_write_cr3(native_read_cr3()); preempt_enable(); @@ -176,20 +178,30 @@ static inline void __native_flush_tlb_global_irq_disabled(void) unsigned long cr4; cr4 = this_cpu_read(cpu_tlbstate.cr4); - /* clear PGE */ - native_write_cr4(cr4 & ~X86_CR4_PGE); - /* write old PGE again and flush TLBs */ - native_write_cr4(cr4); + if (cr4 & X86_CR4_PGE) { + /* clear PGE and flush TLB of all entries */ + native_write_cr4(cr4 & ~X86_CR4_PGE); + /* restore PGE as it was before */ + native_write_cr4(cr4); + } else { + /* + * x86_64 microcode update comes this way when CR4.PGE is not + * enabled, and it's safer for all callers to allow this case. + */ + native_write_cr3(native_read_cr3()); + } } static inline void __native_flush_tlb_global(void) { -#ifdef CONFIG_KAISER - /* Globals are not used at all */ - __native_flush_tlb(); -#else unsigned long flags; + if (kaiser_enabled) { + /* Globals are not used at all */ + __native_flush_tlb(); + return; + } + if (this_cpu_has(X86_FEATURE_INVPCID)) { /* * Using INVPCID is considerably faster than a pair of writes @@ -209,7 +221,6 @@ static inline void __native_flush_tlb_global(void) raw_local_irq_save(flags); __native_flush_tlb_global_irq_disabled(); raw_local_irq_restore(flags); -#endif } static inline void __native_flush_tlb_single(unsigned long addr) @@ -224,7 +235,7 @@ static inline void __native_flush_tlb_single(unsigned long addr) */ if (!this_cpu_has(X86_FEATURE_INVPCID_SINGLE)) { - if (this_cpu_has(X86_FEATURE_PCID)) + if (kaiser_enabled && this_cpu_has(X86_FEATURE_PCID)) kaiser_flush_tlb_on_return_to_user(); asm volatile("invlpg (%0)" ::"r" (addr) : "memory"); return; @@ -239,9 +250,9 @@ static inline void __native_flush_tlb_single(unsigned long addr) * Make sure to do only a single invpcid when KAISER is * disabled and we have only a single ASID. */ - if (X86_CR3_PCID_ASID_KERN != X86_CR3_PCID_ASID_USER) - invpcid_flush_one(X86_CR3_PCID_ASID_KERN, addr); - invpcid_flush_one(X86_CR3_PCID_ASID_USER, addr); + if (kaiser_enabled) + invpcid_flush_one(X86_CR3_PCID_ASID_USER, addr); + invpcid_flush_one(X86_CR3_PCID_ASID_KERN, addr); } static inline void __flush_tlb_all(void) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index e6be5f3..8b03874 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -179,6 +179,20 @@ static int __init x86_pcid_setup(char *s) return 1; } __setup("nopcid", x86_pcid_setup); + +static int __init x86_nokaiser_setup(char *s) +{ + /* nokaiser doesn't accept parameters */ + if (s) + return -EINVAL; +#ifdef CONFIG_KAISER + kaiser_enabled = 0; + setup_clear_cpu_cap(X86_FEATURE_KAISER); + pr_info("nokaiser: KAISER feature disabled\n"); +#endif + return 0; +} +early_param("nokaiser", x86_nokaiser_setup); #endif static int __init x86_noinvpcid_setup(char *s) @@ -327,7 +341,7 @@ static __always_inline void setup_smap(struct cpuinfo_x86 *c) static void setup_pcid(struct cpuinfo_x86 *c) { if (cpu_has(c, X86_FEATURE_PCID)) { - if (cpu_has(c, X86_FEATURE_PGE)) { + if (cpu_has(c, X86_FEATURE_PGE) || kaiser_enabled) { cr4_set_bits(X86_CR4_PCIDE); /* * INVPCID has two "groups" of types: @@ -799,6 +813,10 @@ void get_cpu_cap(struct cpuinfo_x86 *c) c->x86_capability[CPUID_8000_000A_EDX] = cpuid_edx(0x8000000a); init_scattered_cpuid_features(c); +#ifdef CONFIG_KAISER + if (kaiser_enabled) + set_cpu_cap(c, X86_FEATURE_KAISER); +#endif } static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c) @@ -1537,6 +1555,14 @@ void cpu_init(void) * try to read it. */ cr4_init_shadow(); + if (!kaiser_enabled) { + /* + * secondary_startup_64() deferred setting PGE in cr4: + * probe_page_size_mask() sets it on the boot cpu, + * but it needs to be set on each secondary cpu. + */ + cr4_set_bits(X86_CR4_PGE); + } /* * Load microcode on this cpu if a valid microcode is available. diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c index 560c2fd..e33b385 100644 --- a/arch/x86/kernel/espfix_64.c +++ b/arch/x86/kernel/espfix_64.c @@ -132,9 +132,10 @@ void __init init_espfix_bsp(void) * area to ensure it is mapped into the shadow user page * tables. */ - if (IS_ENABLED(CONFIG_KAISER)) + if (kaiser_enabled) { set_pgd(native_get_shadow_pgd(pgd_p), __pgd(_KERNPG_TABLE | __pa((pud_t *)espfix_pud_page))); + } /* Randomize the locations */ init_espfix_random(); diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 5775379..d04479b 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -190,8 +190,8 @@ ENTRY(secondary_startup_64) movq $(init_level4_pgt - __START_KERNEL_map), %rax 1: - /* Enable PAE mode and PGE */ - movl $(X86_CR4_PAE | X86_CR4_PGE), %ecx + /* Enable PAE and PSE, but defer PGE until kaiser_enabled is decided */ + movl $(X86_CR4_PAE | X86_CR4_PSE), %ecx movq %rcx, %cr4 /* Setup early boot stage 4 level pagetables. */ diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 22af912..05a9855 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -177,7 +177,7 @@ static void __init probe_page_size_mask(void) cr4_set_bits_and_update_boot(X86_CR4_PSE); /* Enable PGE if available */ - if (boot_cpu_has(X86_FEATURE_PGE)) { + if (boot_cpu_has(X86_FEATURE_PGE) && !kaiser_enabled) { cr4_set_bits_and_update_boot(X86_CR4_PGE); __supported_pte_mask |= _PAGE_GLOBAL; } else diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 14b9dd7..a0e8df6 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -324,6 +324,16 @@ void __init cleanup_highmap(void) continue; if (vaddr < (unsigned long) _text || vaddr > end) set_pmd(pmd, __pmd(0)); + else if (kaiser_enabled) { + /* + * level2_kernel_pgt is initialized with _PAGE_GLOBAL: + * clear that now. This is not important, so long as + * CR4.PGE remains clear, but it removes an anomaly. + * Physical mapping setup below avoids _PAGE_GLOBAL + * by use of massage_pgprot() inside pfn_pte() etc. + */ + set_pmd(pmd, pmd_clear_flags(*pmd, _PAGE_GLOBAL)); + } } } diff --git a/arch/x86/mm/kaiser.c b/arch/x86/mm/kaiser.c index cc0950f..11032dc 100644 --- a/arch/x86/mm/kaiser.c +++ b/arch/x86/mm/kaiser.c @@ -16,7 +16,9 @@ #include #include -#ifdef CONFIG_KAISER +int kaiser_enabled __read_mostly = 1; +EXPORT_SYMBOL(kaiser_enabled); /* for inlined TLB flush functions */ + __visible DEFINE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup); @@ -167,8 +169,8 @@ static pte_t *kaiser_pagetable_walk(unsigned long address, bool is_atomic) return pte_offset_kernel(pmd, address); } -int kaiser_add_user_map(const void *__start_addr, unsigned long size, - unsigned long flags) +static int kaiser_add_user_map(const void *__start_addr, unsigned long size, + unsigned long flags) { int ret = 0; pte_t *pte; @@ -177,6 +179,15 @@ int kaiser_add_user_map(const void *__start_addr, unsigned long size, unsigned long end_addr = PAGE_ALIGN(start_addr + size); unsigned long target_address; + /* + * It is convenient for callers to pass in __PAGE_KERNEL etc, + * and there is no actual harm from setting _PAGE_GLOBAL, so + * long as CR4.PGE is not set. But it is nonetheless troubling + * to see Kaiser itself setting _PAGE_GLOBAL (now that "nokaiser" + * requires that not to be #defined to 0): so mask it off here. + */ + flags &= ~_PAGE_GLOBAL; + for (; address < end_addr; address += PAGE_SIZE) { target_address = get_pa_from_mapping(address); if (target_address == -1) { @@ -263,6 +274,8 @@ void __init kaiser_init(void) { int cpu; + if (!kaiser_enabled) + return; kaiser_init_all_pgds(); for_each_possible_cpu(cpu) { @@ -311,6 +324,8 @@ void __init kaiser_init(void) /* Add a mapping to the shadow mapping, and synchronize the mappings */ int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags) { + if (!kaiser_enabled) + return 0; return kaiser_add_user_map((const void *)addr, size, flags); } @@ -322,6 +337,8 @@ void kaiser_remove_mapping(unsigned long start, unsigned long size) unsigned long addr, next; pgd_t *pgd; + if (!kaiser_enabled) + return; pgd = native_get_shadow_pgd(pgd_offset_k(start)); for (addr = start; addr < end; pgd++, addr = next) { next = pgd_addr_end(addr, end); @@ -343,6 +360,8 @@ static inline bool is_userspace_pgd(pgd_t *pgdp) pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd) { + if (!kaiser_enabled) + return pgd; /* * Do we need to also populate the shadow pgd? Check _PAGE_USER to * skip cases like kexec and EFI which make temporary low mappings. @@ -399,4 +418,3 @@ void kaiser_flush_tlb_on_return_to_user(void) X86_CR3_PCID_USER_FLUSH | KAISER_SHADOW_PGD_OFFSET); } EXPORT_SYMBOL(kaiser_flush_tlb_on_return_to_user); -#endif /* CONFIG_KAISER */ diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 352fd01..5aaec8e 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -345,16 +345,12 @@ static inline void _pgd_free(pgd_t *pgd) } #else -#ifdef CONFIG_KAISER /* - * Instead of one pmd, we aquire two pmds. Being order-1, it is + * Instead of one pgd, Kaiser acquires two pgds. Being order-1, it is * both 8k in size and 8k-aligned. That lets us just flip bit 12 * in a pointer to swap between the two 4k halves. */ -#define PGD_ALLOCATION_ORDER 1 -#else -#define PGD_ALLOCATION_ORDER 0 -#endif +#define PGD_ALLOCATION_ORDER kaiser_enabled static inline pgd_t *_pgd_alloc(void) { diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 852c665..fde44bb 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -41,8 +41,7 @@ static void load_new_mm_cr3(pgd_t *pgdir) { unsigned long new_mm_cr3 = __pa(pgdir); -#ifdef CONFIG_KAISER - if (this_cpu_has(X86_FEATURE_PCID)) { + if (kaiser_enabled && this_cpu_has(X86_FEATURE_PCID)) { /* * We reuse the same PCID for different tasks, so we must * flush all the entries for the PCID out when we change tasks. @@ -59,7 +58,6 @@ static void load_new_mm_cr3(pgd_t *pgdir) new_mm_cr3 |= X86_CR3_PCID_KERN_FLUSH; kaiser_flush_tlb_on_return_to_user(); } -#endif /* CONFIG_KAISER */ /* * Caution: many callers of this function expect diff --git a/tools/arch/x86/include/asm/cpufeatures.h b/tools/arch/x86/include/asm/cpufeatures.h index a396292..67c93d9 100644 --- a/tools/arch/x86/include/asm/cpufeatures.h +++ b/tools/arch/x86/include/asm/cpufeatures.h @@ -197,6 +197,9 @@ #define X86_FEATURE_AVX512_4VNNIW (7*32+16) /* AVX-512 Neural Network Instructions */ #define X86_FEATURE_AVX512_4FMAPS (7*32+17) /* AVX-512 Multiply Accumulation Single precision */ +/* Because the ALTERNATIVE scheme is for members of the X86_FEATURE club... */ +#define X86_FEATURE_KAISER ( 7*32+31) /* CONFIG_KAISER w/o nokaiser */ + /* Virtualization flags: Linux defined, word 8 */ #define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */ #define X86_FEATURE_VNMI ( 8*32+ 1) /* Intel Virtual NMI */ -- 2.7.4