diff options
Diffstat (limited to 'common/recipes-kernel/linux/linux-yocto-4.9.21/0027-kaiser-add-nokaiser-boot-option-using-ALTERNATIVE.patch')
-rw-r--r-- | common/recipes-kernel/linux/linux-yocto-4.9.21/0027-kaiser-add-nokaiser-boot-option-using-ALTERNATIVE.patch | 686 |
1 files changed, 686 insertions, 0 deletions
diff --git a/common/recipes-kernel/linux/linux-yocto-4.9.21/0027-kaiser-add-nokaiser-boot-option-using-ALTERNATIVE.patch b/common/recipes-kernel/linux/linux-yocto-4.9.21/0027-kaiser-add-nokaiser-boot-option-using-ALTERNATIVE.patch new file mode 100644 index 00000000..8243cf1f --- /dev/null +++ b/common/recipes-kernel/linux/linux-yocto-4.9.21/0027-kaiser-add-nokaiser-boot-option-using-ALTERNATIVE.patch @@ -0,0 +1,686 @@ +From 98cbbfe8b0e5e38dac94986ffa4b09da9860a9af Mon Sep 17 00:00:00 2001 +From: Hugh Dickins <hughd@google.com> +Date: Sun, 24 Sep 2017 16:59:49 -0700 +Subject: [PATCH 027/102] kaiser: add "nokaiser" boot option, using ALTERNATIVE + +Added "nokaiser" boot option: an early param like "noinvpcid". +Most places now check int kaiser_enabled (#defined 0 when not +CONFIG_KAISER) instead of #ifdef CONFIG_KAISER; but entry_64.S +and entry_64_compat.S are using the ALTERNATIVE technique, which +patches in the preferred instructions at runtime. That technique +is tied to x86 cpu features, so X86_FEATURE_KAISER is fabricated. + +Prior to "nokaiser", Kaiser #defined _PAGE_GLOBAL 0: revert that, +but be careful with both _PAGE_GLOBAL and CR4.PGE: setting them when +nokaiser like when !CONFIG_KAISER, but not setting either when kaiser - +neither matters on its own, but it's hard to be sure that _PAGE_GLOBAL +won't get set in some obscure corner, or something add PGE into CR4. +By omitting _PAGE_GLOBAL from __supported_pte_mask when kaiser_enabled, +all page table setup which uses pte_pfn() masks it out of the ptes. + +It's slightly shameful that the same declaration versus definition of +kaiser_enabled appears in not one, not two, but in three header files +(asm/kaiser.h, asm/pgtable.h, asm/tlbflush.h). I felt safer that way, +than with #including any of those in any of the others; and did not +feel it worth an asm/kaiser_enabled.h - kernel/cpu/common.c includes +them all, so we shall hear about it if they get out of synch. + +Cleanups while in the area: removed the silly #ifdef CONFIG_KAISER +from kaiser.c; removed the unused native_get_normal_pgd(); removed +the spurious reg clutter from SWITCH_*_CR3 macro stubs; corrected some +comments. But more interestingly, set CR4.PSE in secondary_startup_64: +the manual is clear that it does not matter whether it's 0 or 1 when +4-level-pts are enabled, but I was distracted to find cr4 different on +BSP and auxiliaries - BSP alone was adding PSE, in probe_page_size_mask(). + +Signed-off-by: Hugh Dickins <hughd@google.com> +Acked-by: Jiri Kosina <jkosina@suse.cz> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + Documentation/kernel-parameters.txt | 2 ++ + arch/x86/entry/entry_64.S | 15 ++++++------ + arch/x86/include/asm/cpufeatures.h | 3 +++ + arch/x86/include/asm/kaiser.h | 27 ++++++++++++++++------ + arch/x86/include/asm/pgtable.h | 20 +++++++++++----- + arch/x86/include/asm/pgtable_64.h | 13 ++++------- + arch/x86/include/asm/pgtable_types.h | 4 ---- + arch/x86/include/asm/tlbflush.h | 39 ++++++++++++++++++++------------ + arch/x86/kernel/cpu/common.c | 28 ++++++++++++++++++++++- + arch/x86/kernel/espfix_64.c | 3 ++- + arch/x86/kernel/head_64.S | 4 ++-- + arch/x86/mm/init.c | 2 +- + arch/x86/mm/init_64.c | 10 ++++++++ + arch/x86/mm/kaiser.c | 26 +++++++++++++++++---- + arch/x86/mm/pgtable.c | 8 ++----- + arch/x86/mm/tlb.c | 4 +--- + tools/arch/x86/include/asm/cpufeatures.h | 3 +++ + 17 files changed, 146 insertions(+), 65 deletions(-) + +diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt +index a303387..e2642ec 100644 +--- a/Documentation/kernel-parameters.txt ++++ b/Documentation/kernel-parameters.txt +@@ -2753,6 +2753,8 @@ bytes respectively. Such letter suffixes can also be entirely omitted. + + nojitter [IA-64] Disables jitter checking for ITC timers. + ++ nokaiser [X86-64] Disable KAISER isolation of kernel from user. ++ + no-kvmclock [X86,KVM] Disable paravirtualized KVM clock driver + + no-kvmapf [X86,KVM] Disable paravirtualized asynchronous page +diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S +index 41bf650..bbb38ac 100644 +--- a/arch/x86/entry/entry_64.S ++++ b/arch/x86/entry/entry_64.S +@@ -1079,7 +1079,7 @@ ENTRY(paranoid_entry) + * unconditionally, but we need to find out whether the reverse + * should be done on return (conveyed to paranoid_exit in %ebx). + */ +- movq %cr3, %rax ++ ALTERNATIVE "jmp 2f", "movq %cr3, %rax", X86_FEATURE_KAISER + testl $KAISER_SHADOW_PGD_OFFSET, %eax + jz 2f + orl $2, %ebx +@@ -1111,6 +1111,7 @@ ENTRY(paranoid_exit) + TRACE_IRQS_OFF_DEBUG + TRACE_IRQS_IRETQ_DEBUG + #ifdef CONFIG_KAISER ++ /* No ALTERNATIVE for X86_FEATURE_KAISER: paranoid_entry sets %ebx */ + testl $2, %ebx /* SWITCH_USER_CR3 needed? */ + jz paranoid_exit_no_switch + SWITCH_USER_CR3 +@@ -1339,13 +1340,14 @@ ENTRY(nmi) + #ifdef CONFIG_KAISER + /* Unconditionally use kernel CR3 for do_nmi() */ + /* %rax is saved above, so OK to clobber here */ +- movq %cr3, %rax ++ ALTERNATIVE "jmp 2f", "movq %cr3, %rax", X86_FEATURE_KAISER + /* If PCID enabled, NOFLUSH now and NOFLUSH on return */ + orq x86_cr3_pcid_noflush, %rax + pushq %rax + /* mask off "user" bit of pgd address and 12 PCID bits: */ + andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax + movq %rax, %cr3 ++2: + #endif + call do_nmi + +@@ -1355,8 +1357,7 @@ ENTRY(nmi) + * kernel code that needs user CR3, but do we ever return + * to "user mode" where we need the kernel CR3? + */ +- popq %rax +- mov %rax, %cr3 ++ ALTERNATIVE "", "popq %rax; movq %rax, %cr3", X86_FEATURE_KAISER + #endif + + /* +@@ -1583,13 +1584,14 @@ end_repeat_nmi: + #ifdef CONFIG_KAISER + /* Unconditionally use kernel CR3 for do_nmi() */ + /* %rax is saved above, so OK to clobber here */ +- movq %cr3, %rax ++ ALTERNATIVE "jmp 2f", "movq %cr3, %rax", X86_FEATURE_KAISER + /* If PCID enabled, NOFLUSH now and NOFLUSH on return */ + orq x86_cr3_pcid_noflush, %rax + pushq %rax + /* mask off "user" bit of pgd address and 12 PCID bits: */ + andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax + movq %rax, %cr3 ++2: + #endif + + /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */ +@@ -1601,8 +1603,7 @@ end_repeat_nmi: + * kernel code that needs user CR3, like just just before + * a sysret. + */ +- popq %rax +- mov %rax, %cr3 ++ ALTERNATIVE "", "popq %rax; movq %rax, %cr3", X86_FEATURE_KAISER + #endif + + testl %ebx, %ebx /* swapgs needed? */ +diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h +index dc50883..20271d6 100644 +--- a/arch/x86/include/asm/cpufeatures.h ++++ b/arch/x86/include/asm/cpufeatures.h +@@ -198,6 +198,9 @@ + #define X86_FEATURE_AVX512_4VNNIW (7*32+16) /* AVX-512 Neural Network Instructions */ + #define X86_FEATURE_AVX512_4FMAPS (7*32+17) /* AVX-512 Multiply Accumulation Single precision */ + ++/* Because the ALTERNATIVE scheme is for members of the X86_FEATURE club... */ ++#define X86_FEATURE_KAISER ( 7*32+31) /* CONFIG_KAISER w/o nokaiser */ ++ + /* Virtualization flags: Linux defined, word 8 */ + #define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */ + #define X86_FEATURE_VNMI ( 8*32+ 1) /* Intel Virtual NMI */ +diff --git a/arch/x86/include/asm/kaiser.h b/arch/x86/include/asm/kaiser.h +index 3dc5f4c..96643a9 100644 +--- a/arch/x86/include/asm/kaiser.h ++++ b/arch/x86/include/asm/kaiser.h +@@ -46,28 +46,33 @@ movq \reg, %cr3 + .endm + + .macro SWITCH_KERNEL_CR3 +-pushq %rax ++ALTERNATIVE "jmp 8f", "pushq %rax", X86_FEATURE_KAISER + _SWITCH_TO_KERNEL_CR3 %rax + popq %rax ++8: + .endm + + .macro SWITCH_USER_CR3 +-pushq %rax ++ALTERNATIVE "jmp 8f", "pushq %rax", X86_FEATURE_KAISER + _SWITCH_TO_USER_CR3 %rax %al + popq %rax ++8: + .endm + + .macro SWITCH_KERNEL_CR3_NO_STACK +-movq %rax, PER_CPU_VAR(unsafe_stack_register_backup) ++ALTERNATIVE "jmp 8f", \ ++ __stringify(movq %rax, PER_CPU_VAR(unsafe_stack_register_backup)), \ ++ X86_FEATURE_KAISER + _SWITCH_TO_KERNEL_CR3 %rax + movq PER_CPU_VAR(unsafe_stack_register_backup), %rax ++8: + .endm + + #else /* CONFIG_KAISER */ + +-.macro SWITCH_KERNEL_CR3 reg ++.macro SWITCH_KERNEL_CR3 + .endm +-.macro SWITCH_USER_CR3 reg regb ++.macro SWITCH_USER_CR3 + .endm + .macro SWITCH_KERNEL_CR3_NO_STACK + .endm +@@ -90,6 +95,16 @@ DECLARE_PER_CPU(unsigned long, x86_cr3_pcid_user); + + extern char __per_cpu_user_mapped_start[], __per_cpu_user_mapped_end[]; + ++extern int kaiser_enabled; ++#else ++#define kaiser_enabled 0 ++#endif /* CONFIG_KAISER */ ++ ++/* ++ * Kaiser function prototypes are needed even when CONFIG_KAISER is not set, ++ * so as to build with tests on kaiser_enabled instead of #ifdefs. ++ */ ++ + /** + * kaiser_add_mapping - map a virtual memory part to the shadow (user) mapping + * @addr: the start address of the range +@@ -119,8 +134,6 @@ extern void kaiser_remove_mapping(unsigned long start, unsigned long size); + */ + extern void kaiser_init(void); + +-#endif /* CONFIG_KAISER */ +- + #endif /* __ASSEMBLY */ + + #endif /* _ASM_X86_KAISER_H */ +diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h +index 1cee98e..217e83a 100644 +--- a/arch/x86/include/asm/pgtable.h ++++ b/arch/x86/include/asm/pgtable.h +@@ -18,6 +18,12 @@ + #ifndef __ASSEMBLY__ + #include <asm/x86_init.h> + ++#ifdef CONFIG_KAISER ++extern int kaiser_enabled; ++#else ++#define kaiser_enabled 0 ++#endif ++ + void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd); + void ptdump_walk_pgd_level_checkwx(void); + +@@ -697,7 +703,7 @@ static inline int pgd_bad(pgd_t pgd) + * page table by accident; it will fault on the first + * instruction it tries to run. See native_set_pgd(). + */ +- if (IS_ENABLED(CONFIG_KAISER)) ++ if (kaiser_enabled) + ignore_flags |= _PAGE_NX; + + return (pgd_flags(pgd) & ~ignore_flags) != _KERNPG_TABLE; +@@ -913,12 +919,14 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm, + */ + static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count) + { +- memcpy(dst, src, count * sizeof(pgd_t)); ++ memcpy(dst, src, count * sizeof(pgd_t)); + #ifdef CONFIG_KAISER +- /* Clone the shadow pgd part as well */ +- memcpy(native_get_shadow_pgd(dst), +- native_get_shadow_pgd(src), +- count * sizeof(pgd_t)); ++ if (kaiser_enabled) { ++ /* Clone the shadow pgd part as well */ ++ memcpy(native_get_shadow_pgd(dst), ++ native_get_shadow_pgd(src), ++ count * sizeof(pgd_t)); ++ } + #endif + } + +diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h +index 177caf3..cf68b5c 100644 +--- a/arch/x86/include/asm/pgtable_64.h ++++ b/arch/x86/include/asm/pgtable_64.h +@@ -111,13 +111,12 @@ extern pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd); + + static inline pgd_t *native_get_shadow_pgd(pgd_t *pgdp) + { ++#ifdef CONFIG_DEBUG_VM ++ /* linux/mmdebug.h may not have been included at this point */ ++ BUG_ON(!kaiser_enabled); ++#endif + return (pgd_t *)((unsigned long)pgdp | (unsigned long)PAGE_SIZE); + } +- +-static inline pgd_t *native_get_normal_pgd(pgd_t *pgdp) +-{ +- return (pgd_t *)((unsigned long)pgdp & ~(unsigned long)PAGE_SIZE); +-} + #else + static inline pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd) + { +@@ -128,10 +127,6 @@ static inline pgd_t *native_get_shadow_pgd(pgd_t *pgdp) + BUILD_BUG_ON(1); + return NULL; + } +-static inline pgd_t *native_get_normal_pgd(pgd_t *pgdp) +-{ +- return pgdp; +-} + #endif /* CONFIG_KAISER */ + + static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd) +diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h +index 7cf2883..f0d9a1a 100644 +--- a/arch/x86/include/asm/pgtable_types.h ++++ b/arch/x86/include/asm/pgtable_types.h +@@ -45,11 +45,7 @@ + #define _PAGE_ACCESSED (_AT(pteval_t, 1) << _PAGE_BIT_ACCESSED) + #define _PAGE_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_DIRTY) + #define _PAGE_PSE (_AT(pteval_t, 1) << _PAGE_BIT_PSE) +-#ifdef CONFIG_KAISER +-#define _PAGE_GLOBAL (_AT(pteval_t, 0)) +-#else + #define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL) +-#endif + #define _PAGE_SOFTW1 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW1) + #define _PAGE_SOFTW2 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW2) + #define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT) +diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h +index 4fff696..13a74f6 100644 +--- a/arch/x86/include/asm/tlbflush.h ++++ b/arch/x86/include/asm/tlbflush.h +@@ -138,9 +138,11 @@ static inline void cr4_set_bits_and_update_boot(unsigned long mask) + * to avoid the need for asm/kaiser.h in unexpected places. + */ + #ifdef CONFIG_KAISER ++extern int kaiser_enabled; + extern void kaiser_setup_pcid(void); + extern void kaiser_flush_tlb_on_return_to_user(void); + #else ++#define kaiser_enabled 0 + static inline void kaiser_setup_pcid(void) + { + } +@@ -165,7 +167,7 @@ static inline void __native_flush_tlb(void) + * back: + */ + preempt_disable(); +- if (this_cpu_has(X86_FEATURE_PCID)) ++ if (kaiser_enabled && this_cpu_has(X86_FEATURE_PCID)) + kaiser_flush_tlb_on_return_to_user(); + native_write_cr3(native_read_cr3()); + preempt_enable(); +@@ -176,20 +178,30 @@ static inline void __native_flush_tlb_global_irq_disabled(void) + unsigned long cr4; + + cr4 = this_cpu_read(cpu_tlbstate.cr4); +- /* clear PGE */ +- native_write_cr4(cr4 & ~X86_CR4_PGE); +- /* write old PGE again and flush TLBs */ +- native_write_cr4(cr4); ++ if (cr4 & X86_CR4_PGE) { ++ /* clear PGE and flush TLB of all entries */ ++ native_write_cr4(cr4 & ~X86_CR4_PGE); ++ /* restore PGE as it was before */ ++ native_write_cr4(cr4); ++ } else { ++ /* ++ * x86_64 microcode update comes this way when CR4.PGE is not ++ * enabled, and it's safer for all callers to allow this case. ++ */ ++ native_write_cr3(native_read_cr3()); ++ } + } + + static inline void __native_flush_tlb_global(void) + { +-#ifdef CONFIG_KAISER +- /* Globals are not used at all */ +- __native_flush_tlb(); +-#else + unsigned long flags; + ++ if (kaiser_enabled) { ++ /* Globals are not used at all */ ++ __native_flush_tlb(); ++ return; ++ } ++ + if (this_cpu_has(X86_FEATURE_INVPCID)) { + /* + * Using INVPCID is considerably faster than a pair of writes +@@ -209,7 +221,6 @@ static inline void __native_flush_tlb_global(void) + raw_local_irq_save(flags); + __native_flush_tlb_global_irq_disabled(); + raw_local_irq_restore(flags); +-#endif + } + + static inline void __native_flush_tlb_single(unsigned long addr) +@@ -224,7 +235,7 @@ static inline void __native_flush_tlb_single(unsigned long addr) + */ + + if (!this_cpu_has(X86_FEATURE_INVPCID_SINGLE)) { +- if (this_cpu_has(X86_FEATURE_PCID)) ++ if (kaiser_enabled && this_cpu_has(X86_FEATURE_PCID)) + kaiser_flush_tlb_on_return_to_user(); + asm volatile("invlpg (%0)" ::"r" (addr) : "memory"); + return; +@@ -239,9 +250,9 @@ static inline void __native_flush_tlb_single(unsigned long addr) + * Make sure to do only a single invpcid when KAISER is + * disabled and we have only a single ASID. + */ +- if (X86_CR3_PCID_ASID_KERN != X86_CR3_PCID_ASID_USER) +- invpcid_flush_one(X86_CR3_PCID_ASID_KERN, addr); +- invpcid_flush_one(X86_CR3_PCID_ASID_USER, addr); ++ if (kaiser_enabled) ++ invpcid_flush_one(X86_CR3_PCID_ASID_USER, addr); ++ invpcid_flush_one(X86_CR3_PCID_ASID_KERN, addr); + } + + static inline void __flush_tlb_all(void) +diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c +index e6be5f3..8b03874 100644 +--- a/arch/x86/kernel/cpu/common.c ++++ b/arch/x86/kernel/cpu/common.c +@@ -179,6 +179,20 @@ static int __init x86_pcid_setup(char *s) + return 1; + } + __setup("nopcid", x86_pcid_setup); ++ ++static int __init x86_nokaiser_setup(char *s) ++{ ++ /* nokaiser doesn't accept parameters */ ++ if (s) ++ return -EINVAL; ++#ifdef CONFIG_KAISER ++ kaiser_enabled = 0; ++ setup_clear_cpu_cap(X86_FEATURE_KAISER); ++ pr_info("nokaiser: KAISER feature disabled\n"); ++#endif ++ return 0; ++} ++early_param("nokaiser", x86_nokaiser_setup); + #endif + + static int __init x86_noinvpcid_setup(char *s) +@@ -327,7 +341,7 @@ static __always_inline void setup_smap(struct cpuinfo_x86 *c) + static void setup_pcid(struct cpuinfo_x86 *c) + { + if (cpu_has(c, X86_FEATURE_PCID)) { +- if (cpu_has(c, X86_FEATURE_PGE)) { ++ if (cpu_has(c, X86_FEATURE_PGE) || kaiser_enabled) { + cr4_set_bits(X86_CR4_PCIDE); + /* + * INVPCID has two "groups" of types: +@@ -799,6 +813,10 @@ void get_cpu_cap(struct cpuinfo_x86 *c) + c->x86_capability[CPUID_8000_000A_EDX] = cpuid_edx(0x8000000a); + + init_scattered_cpuid_features(c); ++#ifdef CONFIG_KAISER ++ if (kaiser_enabled) ++ set_cpu_cap(c, X86_FEATURE_KAISER); ++#endif + } + + static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c) +@@ -1537,6 +1555,14 @@ void cpu_init(void) + * try to read it. + */ + cr4_init_shadow(); ++ if (!kaiser_enabled) { ++ /* ++ * secondary_startup_64() deferred setting PGE in cr4: ++ * probe_page_size_mask() sets it on the boot cpu, ++ * but it needs to be set on each secondary cpu. ++ */ ++ cr4_set_bits(X86_CR4_PGE); ++ } + + /* + * Load microcode on this cpu if a valid microcode is available. +diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c +index 560c2fd..e33b385 100644 +--- a/arch/x86/kernel/espfix_64.c ++++ b/arch/x86/kernel/espfix_64.c +@@ -132,9 +132,10 @@ void __init init_espfix_bsp(void) + * area to ensure it is mapped into the shadow user page + * tables. + */ +- if (IS_ENABLED(CONFIG_KAISER)) ++ if (kaiser_enabled) { + set_pgd(native_get_shadow_pgd(pgd_p), + __pgd(_KERNPG_TABLE | __pa((pud_t *)espfix_pud_page))); ++ } + + /* Randomize the locations */ + init_espfix_random(); +diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S +index 5775379..d04479b 100644 +--- a/arch/x86/kernel/head_64.S ++++ b/arch/x86/kernel/head_64.S +@@ -190,8 +190,8 @@ ENTRY(secondary_startup_64) + movq $(init_level4_pgt - __START_KERNEL_map), %rax + 1: + +- /* Enable PAE mode and PGE */ +- movl $(X86_CR4_PAE | X86_CR4_PGE), %ecx ++ /* Enable PAE and PSE, but defer PGE until kaiser_enabled is decided */ ++ movl $(X86_CR4_PAE | X86_CR4_PSE), %ecx + movq %rcx, %cr4 + + /* Setup early boot stage 4 level pagetables. */ +diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c +index 22af912..05a9855 100644 +--- a/arch/x86/mm/init.c ++++ b/arch/x86/mm/init.c +@@ -177,7 +177,7 @@ static void __init probe_page_size_mask(void) + cr4_set_bits_and_update_boot(X86_CR4_PSE); + + /* Enable PGE if available */ +- if (boot_cpu_has(X86_FEATURE_PGE)) { ++ if (boot_cpu_has(X86_FEATURE_PGE) && !kaiser_enabled) { + cr4_set_bits_and_update_boot(X86_CR4_PGE); + __supported_pte_mask |= _PAGE_GLOBAL; + } else +diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c +index 14b9dd7..a0e8df6 100644 +--- a/arch/x86/mm/init_64.c ++++ b/arch/x86/mm/init_64.c +@@ -324,6 +324,16 @@ void __init cleanup_highmap(void) + continue; + if (vaddr < (unsigned long) _text || vaddr > end) + set_pmd(pmd, __pmd(0)); ++ else if (kaiser_enabled) { ++ /* ++ * level2_kernel_pgt is initialized with _PAGE_GLOBAL: ++ * clear that now. This is not important, so long as ++ * CR4.PGE remains clear, but it removes an anomaly. ++ * Physical mapping setup below avoids _PAGE_GLOBAL ++ * by use of massage_pgprot() inside pfn_pte() etc. ++ */ ++ set_pmd(pmd, pmd_clear_flags(*pmd, _PAGE_GLOBAL)); ++ } + } + } + +diff --git a/arch/x86/mm/kaiser.c b/arch/x86/mm/kaiser.c +index cc0950f..11032dc 100644 +--- a/arch/x86/mm/kaiser.c ++++ b/arch/x86/mm/kaiser.c +@@ -16,7 +16,9 @@ + #include <asm/pgalloc.h> + #include <asm/desc.h> + +-#ifdef CONFIG_KAISER ++int kaiser_enabled __read_mostly = 1; ++EXPORT_SYMBOL(kaiser_enabled); /* for inlined TLB flush functions */ ++ + __visible + DEFINE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup); + +@@ -167,8 +169,8 @@ static pte_t *kaiser_pagetable_walk(unsigned long address, bool is_atomic) + return pte_offset_kernel(pmd, address); + } + +-int kaiser_add_user_map(const void *__start_addr, unsigned long size, +- unsigned long flags) ++static int kaiser_add_user_map(const void *__start_addr, unsigned long size, ++ unsigned long flags) + { + int ret = 0; + pte_t *pte; +@@ -177,6 +179,15 @@ int kaiser_add_user_map(const void *__start_addr, unsigned long size, + unsigned long end_addr = PAGE_ALIGN(start_addr + size); + unsigned long target_address; + ++ /* ++ * It is convenient for callers to pass in __PAGE_KERNEL etc, ++ * and there is no actual harm from setting _PAGE_GLOBAL, so ++ * long as CR4.PGE is not set. But it is nonetheless troubling ++ * to see Kaiser itself setting _PAGE_GLOBAL (now that "nokaiser" ++ * requires that not to be #defined to 0): so mask it off here. ++ */ ++ flags &= ~_PAGE_GLOBAL; ++ + for (; address < end_addr; address += PAGE_SIZE) { + target_address = get_pa_from_mapping(address); + if (target_address == -1) { +@@ -263,6 +274,8 @@ void __init kaiser_init(void) + { + int cpu; + ++ if (!kaiser_enabled) ++ return; + kaiser_init_all_pgds(); + + for_each_possible_cpu(cpu) { +@@ -311,6 +324,8 @@ void __init kaiser_init(void) + /* Add a mapping to the shadow mapping, and synchronize the mappings */ + int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags) + { ++ if (!kaiser_enabled) ++ return 0; + return kaiser_add_user_map((const void *)addr, size, flags); + } + +@@ -322,6 +337,8 @@ void kaiser_remove_mapping(unsigned long start, unsigned long size) + unsigned long addr, next; + pgd_t *pgd; + ++ if (!kaiser_enabled) ++ return; + pgd = native_get_shadow_pgd(pgd_offset_k(start)); + for (addr = start; addr < end; pgd++, addr = next) { + next = pgd_addr_end(addr, end); +@@ -343,6 +360,8 @@ static inline bool is_userspace_pgd(pgd_t *pgdp) + + pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd) + { ++ if (!kaiser_enabled) ++ return pgd; + /* + * Do we need to also populate the shadow pgd? Check _PAGE_USER to + * skip cases like kexec and EFI which make temporary low mappings. +@@ -399,4 +418,3 @@ void kaiser_flush_tlb_on_return_to_user(void) + X86_CR3_PCID_USER_FLUSH | KAISER_SHADOW_PGD_OFFSET); + } + EXPORT_SYMBOL(kaiser_flush_tlb_on_return_to_user); +-#endif /* CONFIG_KAISER */ +diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c +index 352fd01..5aaec8e 100644 +--- a/arch/x86/mm/pgtable.c ++++ b/arch/x86/mm/pgtable.c +@@ -345,16 +345,12 @@ static inline void _pgd_free(pgd_t *pgd) + } + #else + +-#ifdef CONFIG_KAISER + /* +- * Instead of one pmd, we aquire two pmds. Being order-1, it is ++ * Instead of one pgd, Kaiser acquires two pgds. Being order-1, it is + * both 8k in size and 8k-aligned. That lets us just flip bit 12 + * in a pointer to swap between the two 4k halves. + */ +-#define PGD_ALLOCATION_ORDER 1 +-#else +-#define PGD_ALLOCATION_ORDER 0 +-#endif ++#define PGD_ALLOCATION_ORDER kaiser_enabled + + static inline pgd_t *_pgd_alloc(void) + { +diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c +index 852c665..fde44bb 100644 +--- a/arch/x86/mm/tlb.c ++++ b/arch/x86/mm/tlb.c +@@ -41,8 +41,7 @@ static void load_new_mm_cr3(pgd_t *pgdir) + { + unsigned long new_mm_cr3 = __pa(pgdir); + +-#ifdef CONFIG_KAISER +- if (this_cpu_has(X86_FEATURE_PCID)) { ++ if (kaiser_enabled && this_cpu_has(X86_FEATURE_PCID)) { + /* + * We reuse the same PCID for different tasks, so we must + * flush all the entries for the PCID out when we change tasks. +@@ -59,7 +58,6 @@ static void load_new_mm_cr3(pgd_t *pgdir) + new_mm_cr3 |= X86_CR3_PCID_KERN_FLUSH; + kaiser_flush_tlb_on_return_to_user(); + } +-#endif /* CONFIG_KAISER */ + + /* + * Caution: many callers of this function expect +diff --git a/tools/arch/x86/include/asm/cpufeatures.h b/tools/arch/x86/include/asm/cpufeatures.h +index a396292..67c93d9 100644 +--- a/tools/arch/x86/include/asm/cpufeatures.h ++++ b/tools/arch/x86/include/asm/cpufeatures.h +@@ -197,6 +197,9 @@ + #define X86_FEATURE_AVX512_4VNNIW (7*32+16) /* AVX-512 Neural Network Instructions */ + #define X86_FEATURE_AVX512_4FMAPS (7*32+17) /* AVX-512 Multiply Accumulation Single precision */ + ++/* Because the ALTERNATIVE scheme is for members of the X86_FEATURE club... */ ++#define X86_FEATURE_KAISER ( 7*32+31) /* CONFIG_KAISER w/o nokaiser */ ++ + /* Virtualization flags: Linux defined, word 8 */ + #define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */ + #define X86_FEATURE_VNMI ( 8*32+ 1) /* Intel Virtual NMI */ +-- +2.7.4 + |