Diffstat (limited to 'common/recipes-kernel/linux/linux-yocto-4.9.21/0027-kaiser-add-nokaiser-boot-option-using-ALTERNATIVE.patch')
-rw-r--r-- common/recipes-kernel/linux/linux-yocto-4.9.21/0027-kaiser-add-nokaiser-boot-option-using-ALTERNATIVE.patch | 686
1 file changed, 686 insertions(+), 0 deletions(-)
diff --git a/common/recipes-kernel/linux/linux-yocto-4.9.21/0027-kaiser-add-nokaiser-boot-option-using-ALTERNATIVE.patch b/common/recipes-kernel/linux/linux-yocto-4.9.21/0027-kaiser-add-nokaiser-boot-option-using-ALTERNATIVE.patch
new file mode 100644
index 00000000..8243cf1f
--- /dev/null
+++ b/common/recipes-kernel/linux/linux-yocto-4.9.21/0027-kaiser-add-nokaiser-boot-option-using-ALTERNATIVE.patch
@@ -0,0 +1,686 @@
+From 98cbbfe8b0e5e38dac94986ffa4b09da9860a9af Mon Sep 17 00:00:00 2001
+From: Hugh Dickins <hughd@google.com>
+Date: Sun, 24 Sep 2017 16:59:49 -0700
+Subject: [PATCH 027/102] kaiser: add "nokaiser" boot option, using ALTERNATIVE
+
+Added "nokaiser" boot option: an early param like "noinvpcid".
+Most places now check int kaiser_enabled (#defined 0 when not
+CONFIG_KAISER) instead of #ifdef CONFIG_KAISER; but entry_64.S
+and entry_64_compat.S are using the ALTERNATIVE technique, which
+patches in the preferred instructions at runtime. That technique
+is tied to x86 cpu features, so X86_FEATURE_KAISER is fabricated.
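For illustration, the runtime choice that ALTERNATIVE makes can be sketched in plain C: a feature word is settled once at boot, and later code either takes the patched-in path or falls through, instead of being compiled in or out with #ifdef. This is a hypothetical user-space sketch, not the kernel's alternatives machinery; the bit value and function names below are made up.

    #include <stdio.h>

    /* Illustrative feature word; bit 31 stands in for X86_FEATURE_KAISER. */
    static unsigned int boot_cpu_features;
    #define FEATURE_KAISER (1u << 31)

    /* Stand-in for a patched site such as paranoid_entry's CR3 handling:
     * feature clear behaves like ALTERNATIVE's "jmp 2f" (skip the work),
     * feature set behaves like the "movq %cr3, %rax" path. */
    static void cr3_switch_if_needed(void)
    {
        if (!(boot_cpu_features & FEATURE_KAISER))
            return;                              /* "jmp 2f": nothing to do */
        printf("kaiser: read CR3, switch to kernel pgd if on shadow pgd\n");
    }

    int main(void)
    {
        cr3_switch_if_needed();                  /* as if booted with "nokaiser" */
        boot_cpu_features |= FEATURE_KAISER;     /* CONFIG_KAISER, no "nokaiser" */
        cr3_switch_if_needed();
        return 0;
    }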
+
+Prior to "nokaiser", Kaiser #defined _PAGE_GLOBAL 0: revert that,
+but be careful with both _PAGE_GLOBAL and CR4.PGE: setting them when
+nokaiser like when !CONFIG_KAISER, but not setting either when kaiser -
+neither matters on its own, but it's hard to be sure that _PAGE_GLOBAL
+won't get set in some obscure corner, or something add PGE into CR4.
+By omitting _PAGE_GLOBAL from __supported_pte_mask when kaiser_enabled,
+all page table setup which uses pte_pfn() masks it out of the ptes.
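The effect of dropping _PAGE_GLOBAL from __supported_pte_mask can be seen in a small stand-alone sketch: flags requested for a pte are filtered through the supported mask, so the global bit never survives when kaiser_enabled. The bit positions and helper below are illustrative, not the real pgtable_types.h definitions.

    #include <stdio.h>
    #include <stdint.h>

    /* Illustrative bit positions only. */
    #define _PAGE_PRESENT  (UINT64_C(1) << 0)
    #define _PAGE_RW       (UINT64_C(1) << 1)
    #define _PAGE_GLOBAL   (UINT64_C(1) << 8)

    /* Stand-in for __supported_pte_mask. */
    static uint64_t supported_pte_mask = ~UINT64_C(0);

    /* Mirrors the massage_pgprot()/pfn_pte() filtering step. */
    static uint64_t pte_flags_for(uint64_t requested)
    {
        return requested & supported_pte_mask;
    }

    int main(void)
    {
        int kaiser_enabled = 1;                  /* "nokaiser" not on the cmdline */
        if (kaiser_enabled)
            supported_pte_mask &= ~_PAGE_GLOBAL;

        uint64_t f = pte_flags_for(_PAGE_PRESENT | _PAGE_RW | _PAGE_GLOBAL);
        printf("_PAGE_GLOBAL %s\n", (f & _PAGE_GLOBAL) ? "kept" : "masked off");
        return 0;
    }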
+
+It's slightly shameful that the same declaration versus definition of
+kaiser_enabled appears in not one, not two, but in three header files
+(asm/kaiser.h, asm/pgtable.h, asm/tlbflush.h). I felt safer that way,
+than with #including any of those in any of the others; and did not
+feel it worth an asm/kaiser_enabled.h - kernel/cpu/common.c includes
+them all, so we shall hear about it if they get out of synch.
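The idiom repeated in those three headers is a plain extern under CONFIG_KAISER and a #define to 0 otherwise, so a single C body serves both configurations and "if (kaiser_enabled)" folds to dead code in the disabled build. A compilable sketch of that idiom, with names from the patch but the surrounding file and flag values purely hypothetical:

    /* Header side: declaration when the feature is built in, constant 0 when not. */
    #ifdef CONFIG_KAISER
    extern int kaiser_enabled;      /* the one definition lives in arch/x86/mm/kaiser.c */
    #else
    #define kaiser_enabled 0        /* "if (kaiser_enabled)" becomes dead code */
    #endif

    /* Caller side: no #ifdef needed around the test. */
    static inline unsigned long long pgd_ignore_flags(void)
    {
        unsigned long long ignore_flags = 1ULL << 1;    /* illustrative base flag */
        if (kaiser_enabled)
            ignore_flags |= 1ULL << 63;                 /* stand-in for _PAGE_NX */
        return ignore_flags;
    }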
+
+Cleanups while in the area: removed the silly #ifdef CONFIG_KAISER
+from kaiser.c; removed the unused native_get_normal_pgd(); removed
+the spurious reg clutter from SWITCH_*_CR3 macro stubs; corrected some
+comments. But more interestingly, set CR4.PSE in secondary_startup_64:
+the manual is clear that it does not matter whether it's 0 or 1 when
+4-level-pts are enabled, but I was distracted to find cr4 different on
+BSP and auxiliaries - BSP alone was adding PSE, in probe_page_size_mask().
+
+Signed-off-by: Hugh Dickins <hughd@google.com>
+Acked-by: Jiri Kosina <jkosina@suse.cz>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ Documentation/kernel-parameters.txt | 2 ++
+ arch/x86/entry/entry_64.S | 15 ++++++------
+ arch/x86/include/asm/cpufeatures.h | 3 +++
+ arch/x86/include/asm/kaiser.h | 27 ++++++++++++++++------
+ arch/x86/include/asm/pgtable.h | 20 +++++++++++-----
+ arch/x86/include/asm/pgtable_64.h | 13 ++++-------
+ arch/x86/include/asm/pgtable_types.h | 4 ----
+ arch/x86/include/asm/tlbflush.h | 39 ++++++++++++++++++++------------
+ arch/x86/kernel/cpu/common.c | 28 ++++++++++++++++++++++-
+ arch/x86/kernel/espfix_64.c | 3 ++-
+ arch/x86/kernel/head_64.S | 4 ++--
+ arch/x86/mm/init.c | 2 +-
+ arch/x86/mm/init_64.c | 10 ++++++++
+ arch/x86/mm/kaiser.c | 26 +++++++++++++++++----
+ arch/x86/mm/pgtable.c | 8 ++-----
+ arch/x86/mm/tlb.c | 4 +---
+ tools/arch/x86/include/asm/cpufeatures.h | 3 +++
+ 17 files changed, 146 insertions(+), 65 deletions(-)
+
+diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
+index a303387..e2642ec 100644
+--- a/Documentation/kernel-parameters.txt
++++ b/Documentation/kernel-parameters.txt
+@@ -2753,6 +2753,8 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
+
+ nojitter [IA-64] Disables jitter checking for ITC timers.
+
++ nokaiser [X86-64] Disable KAISER isolation of kernel from user.
++
+ no-kvmclock [X86,KVM] Disable paravirtualized KVM clock driver
+
+ no-kvmapf [X86,KVM] Disable paravirtualized asynchronous page
+diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
+index 41bf650..bbb38ac 100644
+--- a/arch/x86/entry/entry_64.S
++++ b/arch/x86/entry/entry_64.S
+@@ -1079,7 +1079,7 @@ ENTRY(paranoid_entry)
+ * unconditionally, but we need to find out whether the reverse
+ * should be done on return (conveyed to paranoid_exit in %ebx).
+ */
+- movq %cr3, %rax
++ ALTERNATIVE "jmp 2f", "movq %cr3, %rax", X86_FEATURE_KAISER
+ testl $KAISER_SHADOW_PGD_OFFSET, %eax
+ jz 2f
+ orl $2, %ebx
+@@ -1111,6 +1111,7 @@ ENTRY(paranoid_exit)
+ TRACE_IRQS_OFF_DEBUG
+ TRACE_IRQS_IRETQ_DEBUG
+ #ifdef CONFIG_KAISER
++ /* No ALTERNATIVE for X86_FEATURE_KAISER: paranoid_entry sets %ebx */
+ testl $2, %ebx /* SWITCH_USER_CR3 needed? */
+ jz paranoid_exit_no_switch
+ SWITCH_USER_CR3
+@@ -1339,13 +1340,14 @@ ENTRY(nmi)
+ #ifdef CONFIG_KAISER
+ /* Unconditionally use kernel CR3 for do_nmi() */
+ /* %rax is saved above, so OK to clobber here */
+- movq %cr3, %rax
++ ALTERNATIVE "jmp 2f", "movq %cr3, %rax", X86_FEATURE_KAISER
+ /* If PCID enabled, NOFLUSH now and NOFLUSH on return */
+ orq x86_cr3_pcid_noflush, %rax
+ pushq %rax
+ /* mask off "user" bit of pgd address and 12 PCID bits: */
+ andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax
+ movq %rax, %cr3
++2:
+ #endif
+ call do_nmi
+
+@@ -1355,8 +1357,7 @@ ENTRY(nmi)
+ * kernel code that needs user CR3, but do we ever return
+ * to "user mode" where we need the kernel CR3?
+ */
+- popq %rax
+- mov %rax, %cr3
++ ALTERNATIVE "", "popq %rax; movq %rax, %cr3", X86_FEATURE_KAISER
+ #endif
+
+ /*
+@@ -1583,13 +1584,14 @@ end_repeat_nmi:
+ #ifdef CONFIG_KAISER
+ /* Unconditionally use kernel CR3 for do_nmi() */
+ /* %rax is saved above, so OK to clobber here */
+- movq %cr3, %rax
++ ALTERNATIVE "jmp 2f", "movq %cr3, %rax", X86_FEATURE_KAISER
+ /* If PCID enabled, NOFLUSH now and NOFLUSH on return */
+ orq x86_cr3_pcid_noflush, %rax
+ pushq %rax
+ /* mask off "user" bit of pgd address and 12 PCID bits: */
+ andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax
+ movq %rax, %cr3
++2:
+ #endif
+
+ /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
+@@ -1601,8 +1603,7 @@ end_repeat_nmi:
+ * kernel code that needs user CR3, like just just before
+ * a sysret.
+ */
+- popq %rax
+- mov %rax, %cr3
++ ALTERNATIVE "", "popq %rax; movq %rax, %cr3", X86_FEATURE_KAISER
+ #endif
+
+ testl %ebx, %ebx /* swapgs needed? */
+diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
+index dc50883..20271d6 100644
+--- a/arch/x86/include/asm/cpufeatures.h
++++ b/arch/x86/include/asm/cpufeatures.h
+@@ -198,6 +198,9 @@
+ #define X86_FEATURE_AVX512_4VNNIW (7*32+16) /* AVX-512 Neural Network Instructions */
+ #define X86_FEATURE_AVX512_4FMAPS (7*32+17) /* AVX-512 Multiply Accumulation Single precision */
+
++/* Because the ALTERNATIVE scheme is for members of the X86_FEATURE club... */
++#define X86_FEATURE_KAISER ( 7*32+31) /* CONFIG_KAISER w/o nokaiser */
++
+ /* Virtualization flags: Linux defined, word 8 */
+ #define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */
+ #define X86_FEATURE_VNMI ( 8*32+ 1) /* Intel Virtual NMI */
+diff --git a/arch/x86/include/asm/kaiser.h b/arch/x86/include/asm/kaiser.h
+index 3dc5f4c..96643a9 100644
+--- a/arch/x86/include/asm/kaiser.h
++++ b/arch/x86/include/asm/kaiser.h
+@@ -46,28 +46,33 @@ movq \reg, %cr3
+ .endm
+
+ .macro SWITCH_KERNEL_CR3
+-pushq %rax
++ALTERNATIVE "jmp 8f", "pushq %rax", X86_FEATURE_KAISER
+ _SWITCH_TO_KERNEL_CR3 %rax
+ popq %rax
++8:
+ .endm
+
+ .macro SWITCH_USER_CR3
+-pushq %rax
++ALTERNATIVE "jmp 8f", "pushq %rax", X86_FEATURE_KAISER
+ _SWITCH_TO_USER_CR3 %rax %al
+ popq %rax
++8:
+ .endm
+
+ .macro SWITCH_KERNEL_CR3_NO_STACK
+-movq %rax, PER_CPU_VAR(unsafe_stack_register_backup)
++ALTERNATIVE "jmp 8f", \
++ __stringify(movq %rax, PER_CPU_VAR(unsafe_stack_register_backup)), \
++ X86_FEATURE_KAISER
+ _SWITCH_TO_KERNEL_CR3 %rax
+ movq PER_CPU_VAR(unsafe_stack_register_backup), %rax
++8:
+ .endm
+
+ #else /* CONFIG_KAISER */
+
+-.macro SWITCH_KERNEL_CR3 reg
++.macro SWITCH_KERNEL_CR3
+ .endm
+-.macro SWITCH_USER_CR3 reg regb
++.macro SWITCH_USER_CR3
+ .endm
+ .macro SWITCH_KERNEL_CR3_NO_STACK
+ .endm
+@@ -90,6 +95,16 @@ DECLARE_PER_CPU(unsigned long, x86_cr3_pcid_user);
+
+ extern char __per_cpu_user_mapped_start[], __per_cpu_user_mapped_end[];
+
++extern int kaiser_enabled;
++#else
++#define kaiser_enabled 0
++#endif /* CONFIG_KAISER */
++
++/*
++ * Kaiser function prototypes are needed even when CONFIG_KAISER is not set,
++ * so as to build with tests on kaiser_enabled instead of #ifdefs.
++ */
++
+ /**
+ * kaiser_add_mapping - map a virtual memory part to the shadow (user) mapping
+ * @addr: the start address of the range
+@@ -119,8 +134,6 @@ extern void kaiser_remove_mapping(unsigned long start, unsigned long size);
+ */
+ extern void kaiser_init(void);
+
+-#endif /* CONFIG_KAISER */
+-
+ #endif /* __ASSEMBLY */
+
+ #endif /* _ASM_X86_KAISER_H */
+diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
+index 1cee98e..217e83a 100644
+--- a/arch/x86/include/asm/pgtable.h
++++ b/arch/x86/include/asm/pgtable.h
+@@ -18,6 +18,12 @@
+ #ifndef __ASSEMBLY__
+ #include <asm/x86_init.h>
+
++#ifdef CONFIG_KAISER
++extern int kaiser_enabled;
++#else
++#define kaiser_enabled 0
++#endif
++
+ void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd);
+ void ptdump_walk_pgd_level_checkwx(void);
+
+@@ -697,7 +703,7 @@ static inline int pgd_bad(pgd_t pgd)
+ * page table by accident; it will fault on the first
+ * instruction it tries to run. See native_set_pgd().
+ */
+- if (IS_ENABLED(CONFIG_KAISER))
++ if (kaiser_enabled)
+ ignore_flags |= _PAGE_NX;
+
+ return (pgd_flags(pgd) & ~ignore_flags) != _KERNPG_TABLE;
+@@ -913,12 +919,14 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm,
+ */
+ static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
+ {
+- memcpy(dst, src, count * sizeof(pgd_t));
++ memcpy(dst, src, count * sizeof(pgd_t));
+ #ifdef CONFIG_KAISER
+- /* Clone the shadow pgd part as well */
+- memcpy(native_get_shadow_pgd(dst),
+- native_get_shadow_pgd(src),
+- count * sizeof(pgd_t));
++ if (kaiser_enabled) {
++ /* Clone the shadow pgd part as well */
++ memcpy(native_get_shadow_pgd(dst),
++ native_get_shadow_pgd(src),
++ count * sizeof(pgd_t));
++ }
+ #endif
+ }
+
+diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
+index 177caf3..cf68b5c 100644
+--- a/arch/x86/include/asm/pgtable_64.h
++++ b/arch/x86/include/asm/pgtable_64.h
+@@ -111,13 +111,12 @@ extern pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd);
+
+ static inline pgd_t *native_get_shadow_pgd(pgd_t *pgdp)
+ {
++#ifdef CONFIG_DEBUG_VM
++ /* linux/mmdebug.h may not have been included at this point */
++ BUG_ON(!kaiser_enabled);
++#endif
+ return (pgd_t *)((unsigned long)pgdp | (unsigned long)PAGE_SIZE);
+ }
+-
+-static inline pgd_t *native_get_normal_pgd(pgd_t *pgdp)
+-{
+- return (pgd_t *)((unsigned long)pgdp & ~(unsigned long)PAGE_SIZE);
+-}
+ #else
+ static inline pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd)
+ {
+@@ -128,10 +127,6 @@ static inline pgd_t *native_get_shadow_pgd(pgd_t *pgdp)
+ BUILD_BUG_ON(1);
+ return NULL;
+ }
+-static inline pgd_t *native_get_normal_pgd(pgd_t *pgdp)
+-{
+- return pgdp;
+-}
+ #endif /* CONFIG_KAISER */
+
+ static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd)
+diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
+index 7cf2883..f0d9a1a 100644
+--- a/arch/x86/include/asm/pgtable_types.h
++++ b/arch/x86/include/asm/pgtable_types.h
+@@ -45,11 +45,7 @@
+ #define _PAGE_ACCESSED (_AT(pteval_t, 1) << _PAGE_BIT_ACCESSED)
+ #define _PAGE_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_DIRTY)
+ #define _PAGE_PSE (_AT(pteval_t, 1) << _PAGE_BIT_PSE)
+-#ifdef CONFIG_KAISER
+-#define _PAGE_GLOBAL (_AT(pteval_t, 0))
+-#else
+ #define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL)
+-#endif
+ #define _PAGE_SOFTW1 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW1)
+ #define _PAGE_SOFTW2 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW2)
+ #define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT)
+diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
+index 4fff696..13a74f6 100644
+--- a/arch/x86/include/asm/tlbflush.h
++++ b/arch/x86/include/asm/tlbflush.h
+@@ -138,9 +138,11 @@ static inline void cr4_set_bits_and_update_boot(unsigned long mask)
+ * to avoid the need for asm/kaiser.h in unexpected places.
+ */
+ #ifdef CONFIG_KAISER
++extern int kaiser_enabled;
+ extern void kaiser_setup_pcid(void);
+ extern void kaiser_flush_tlb_on_return_to_user(void);
+ #else
++#define kaiser_enabled 0
+ static inline void kaiser_setup_pcid(void)
+ {
+ }
+@@ -165,7 +167,7 @@ static inline void __native_flush_tlb(void)
+ * back:
+ */
+ preempt_disable();
+- if (this_cpu_has(X86_FEATURE_PCID))
++ if (kaiser_enabled && this_cpu_has(X86_FEATURE_PCID))
+ kaiser_flush_tlb_on_return_to_user();
+ native_write_cr3(native_read_cr3());
+ preempt_enable();
+@@ -176,20 +178,30 @@ static inline void __native_flush_tlb_global_irq_disabled(void)
+ unsigned long cr4;
+
+ cr4 = this_cpu_read(cpu_tlbstate.cr4);
+- /* clear PGE */
+- native_write_cr4(cr4 & ~X86_CR4_PGE);
+- /* write old PGE again and flush TLBs */
+- native_write_cr4(cr4);
++ if (cr4 & X86_CR4_PGE) {
++ /* clear PGE and flush TLB of all entries */
++ native_write_cr4(cr4 & ~X86_CR4_PGE);
++ /* restore PGE as it was before */
++ native_write_cr4(cr4);
++ } else {
++ /*
++ * x86_64 microcode update comes this way when CR4.PGE is not
++ * enabled, and it's safer for all callers to allow this case.
++ */
++ native_write_cr3(native_read_cr3());
++ }
+ }
+
+ static inline void __native_flush_tlb_global(void)
+ {
+-#ifdef CONFIG_KAISER
+- /* Globals are not used at all */
+- __native_flush_tlb();
+-#else
+ unsigned long flags;
+
++ if (kaiser_enabled) {
++ /* Globals are not used at all */
++ __native_flush_tlb();
++ return;
++ }
++
+ if (this_cpu_has(X86_FEATURE_INVPCID)) {
+ /*
+ * Using INVPCID is considerably faster than a pair of writes
+@@ -209,7 +221,6 @@ static inline void __native_flush_tlb_global(void)
+ raw_local_irq_save(flags);
+ __native_flush_tlb_global_irq_disabled();
+ raw_local_irq_restore(flags);
+-#endif
+ }
+
+ static inline void __native_flush_tlb_single(unsigned long addr)
+@@ -224,7 +235,7 @@ static inline void __native_flush_tlb_single(unsigned long addr)
+ */
+
+ if (!this_cpu_has(X86_FEATURE_INVPCID_SINGLE)) {
+- if (this_cpu_has(X86_FEATURE_PCID))
++ if (kaiser_enabled && this_cpu_has(X86_FEATURE_PCID))
+ kaiser_flush_tlb_on_return_to_user();
+ asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
+ return;
+@@ -239,9 +250,9 @@ static inline void __native_flush_tlb_single(unsigned long addr)
+ * Make sure to do only a single invpcid when KAISER is
+ * disabled and we have only a single ASID.
+ */
+- if (X86_CR3_PCID_ASID_KERN != X86_CR3_PCID_ASID_USER)
+- invpcid_flush_one(X86_CR3_PCID_ASID_KERN, addr);
+- invpcid_flush_one(X86_CR3_PCID_ASID_USER, addr);
++ if (kaiser_enabled)
++ invpcid_flush_one(X86_CR3_PCID_ASID_USER, addr);
++ invpcid_flush_one(X86_CR3_PCID_ASID_KERN, addr);
+ }
+
+ static inline void __flush_tlb_all(void)
+diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
+index e6be5f3..8b03874 100644
+--- a/arch/x86/kernel/cpu/common.c
++++ b/arch/x86/kernel/cpu/common.c
+@@ -179,6 +179,20 @@ static int __init x86_pcid_setup(char *s)
+ return 1;
+ }
+ __setup("nopcid", x86_pcid_setup);
++
++static int __init x86_nokaiser_setup(char *s)
++{
++ /* nokaiser doesn't accept parameters */
++ if (s)
++ return -EINVAL;
++#ifdef CONFIG_KAISER
++ kaiser_enabled = 0;
++ setup_clear_cpu_cap(X86_FEATURE_KAISER);
++ pr_info("nokaiser: KAISER feature disabled\n");
++#endif
++ return 0;
++}
++early_param("nokaiser", x86_nokaiser_setup);
+ #endif
+
+ static int __init x86_noinvpcid_setup(char *s)
+@@ -327,7 +341,7 @@ static __always_inline void setup_smap(struct cpuinfo_x86 *c)
+ static void setup_pcid(struct cpuinfo_x86 *c)
+ {
+ if (cpu_has(c, X86_FEATURE_PCID)) {
+- if (cpu_has(c, X86_FEATURE_PGE)) {
++ if (cpu_has(c, X86_FEATURE_PGE) || kaiser_enabled) {
+ cr4_set_bits(X86_CR4_PCIDE);
+ /*
+ * INVPCID has two "groups" of types:
+@@ -799,6 +813,10 @@ void get_cpu_cap(struct cpuinfo_x86 *c)
+ c->x86_capability[CPUID_8000_000A_EDX] = cpuid_edx(0x8000000a);
+
+ init_scattered_cpuid_features(c);
++#ifdef CONFIG_KAISER
++ if (kaiser_enabled)
++ set_cpu_cap(c, X86_FEATURE_KAISER);
++#endif
+ }
+
+ static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c)
+@@ -1537,6 +1555,14 @@ void cpu_init(void)
+ * try to read it.
+ */
+ cr4_init_shadow();
++ if (!kaiser_enabled) {
++ /*
++ * secondary_startup_64() deferred setting PGE in cr4:
++ * probe_page_size_mask() sets it on the boot cpu,
++ * but it needs to be set on each secondary cpu.
++ */
++ cr4_set_bits(X86_CR4_PGE);
++ }
+
+ /*
+ * Load microcode on this cpu if a valid microcode is available.
+diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c
+index 560c2fd..e33b385 100644
+--- a/arch/x86/kernel/espfix_64.c
++++ b/arch/x86/kernel/espfix_64.c
+@@ -132,9 +132,10 @@ void __init init_espfix_bsp(void)
+ * area to ensure it is mapped into the shadow user page
+ * tables.
+ */
+- if (IS_ENABLED(CONFIG_KAISER))
++ if (kaiser_enabled) {
+ set_pgd(native_get_shadow_pgd(pgd_p),
+ __pgd(_KERNPG_TABLE | __pa((pud_t *)espfix_pud_page)));
++ }
+
+ /* Randomize the locations */
+ init_espfix_random();
+diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
+index 5775379..d04479b 100644
+--- a/arch/x86/kernel/head_64.S
++++ b/arch/x86/kernel/head_64.S
+@@ -190,8 +190,8 @@ ENTRY(secondary_startup_64)
+ movq $(init_level4_pgt - __START_KERNEL_map), %rax
+ 1:
+
+- /* Enable PAE mode and PGE */
+- movl $(X86_CR4_PAE | X86_CR4_PGE), %ecx
++ /* Enable PAE and PSE, but defer PGE until kaiser_enabled is decided */
++ movl $(X86_CR4_PAE | X86_CR4_PSE), %ecx
+ movq %rcx, %cr4
+
+ /* Setup early boot stage 4 level pagetables. */
+diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
+index 22af912..05a9855 100644
+--- a/arch/x86/mm/init.c
++++ b/arch/x86/mm/init.c
+@@ -177,7 +177,7 @@ static void __init probe_page_size_mask(void)
+ cr4_set_bits_and_update_boot(X86_CR4_PSE);
+
+ /* Enable PGE if available */
+- if (boot_cpu_has(X86_FEATURE_PGE)) {
++ if (boot_cpu_has(X86_FEATURE_PGE) && !kaiser_enabled) {
+ cr4_set_bits_and_update_boot(X86_CR4_PGE);
+ __supported_pte_mask |= _PAGE_GLOBAL;
+ } else
+diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
+index 14b9dd7..a0e8df6 100644
+--- a/arch/x86/mm/init_64.c
++++ b/arch/x86/mm/init_64.c
+@@ -324,6 +324,16 @@ void __init cleanup_highmap(void)
+ continue;
+ if (vaddr < (unsigned long) _text || vaddr > end)
+ set_pmd(pmd, __pmd(0));
++ else if (kaiser_enabled) {
++ /*
++ * level2_kernel_pgt is initialized with _PAGE_GLOBAL:
++ * clear that now. This is not important, so long as
++ * CR4.PGE remains clear, but it removes an anomaly.
++ * Physical mapping setup below avoids _PAGE_GLOBAL
++ * by use of massage_pgprot() inside pfn_pte() etc.
++ */
++ set_pmd(pmd, pmd_clear_flags(*pmd, _PAGE_GLOBAL));
++ }
+ }
+ }
+
+diff --git a/arch/x86/mm/kaiser.c b/arch/x86/mm/kaiser.c
+index cc0950f..11032dc 100644
+--- a/arch/x86/mm/kaiser.c
++++ b/arch/x86/mm/kaiser.c
+@@ -16,7 +16,9 @@
+ #include <asm/pgalloc.h>
+ #include <asm/desc.h>
+
+-#ifdef CONFIG_KAISER
++int kaiser_enabled __read_mostly = 1;
++EXPORT_SYMBOL(kaiser_enabled); /* for inlined TLB flush functions */
++
+ __visible
+ DEFINE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);
+
+@@ -167,8 +169,8 @@ static pte_t *kaiser_pagetable_walk(unsigned long address, bool is_atomic)
+ return pte_offset_kernel(pmd, address);
+ }
+
+-int kaiser_add_user_map(const void *__start_addr, unsigned long size,
+- unsigned long flags)
++static int kaiser_add_user_map(const void *__start_addr, unsigned long size,
++ unsigned long flags)
+ {
+ int ret = 0;
+ pte_t *pte;
+@@ -177,6 +179,15 @@ int kaiser_add_user_map(const void *__start_addr, unsigned long size,
+ unsigned long end_addr = PAGE_ALIGN(start_addr + size);
+ unsigned long target_address;
+
++ /*
++ * It is convenient for callers to pass in __PAGE_KERNEL etc,
++ * and there is no actual harm from setting _PAGE_GLOBAL, so
++ * long as CR4.PGE is not set. But it is nonetheless troubling
++ * to see Kaiser itself setting _PAGE_GLOBAL (now that "nokaiser"
++ * requires that not to be #defined to 0): so mask it off here.
++ */
++ flags &= ~_PAGE_GLOBAL;
++
+ for (; address < end_addr; address += PAGE_SIZE) {
+ target_address = get_pa_from_mapping(address);
+ if (target_address == -1) {
+@@ -263,6 +274,8 @@ void __init kaiser_init(void)
+ {
+ int cpu;
+
++ if (!kaiser_enabled)
++ return;
+ kaiser_init_all_pgds();
+
+ for_each_possible_cpu(cpu) {
+@@ -311,6 +324,8 @@ void __init kaiser_init(void)
+ /* Add a mapping to the shadow mapping, and synchronize the mappings */
+ int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags)
+ {
++ if (!kaiser_enabled)
++ return 0;
+ return kaiser_add_user_map((const void *)addr, size, flags);
+ }
+
+@@ -322,6 +337,8 @@ void kaiser_remove_mapping(unsigned long start, unsigned long size)
+ unsigned long addr, next;
+ pgd_t *pgd;
+
++ if (!kaiser_enabled)
++ return;
+ pgd = native_get_shadow_pgd(pgd_offset_k(start));
+ for (addr = start; addr < end; pgd++, addr = next) {
+ next = pgd_addr_end(addr, end);
+@@ -343,6 +360,8 @@ static inline bool is_userspace_pgd(pgd_t *pgdp)
+
+ pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd)
+ {
++ if (!kaiser_enabled)
++ return pgd;
+ /*
+ * Do we need to also populate the shadow pgd? Check _PAGE_USER to
+ * skip cases like kexec and EFI which make temporary low mappings.
+@@ -399,4 +418,3 @@ void kaiser_flush_tlb_on_return_to_user(void)
+ X86_CR3_PCID_USER_FLUSH | KAISER_SHADOW_PGD_OFFSET);
+ }
+ EXPORT_SYMBOL(kaiser_flush_tlb_on_return_to_user);
+-#endif /* CONFIG_KAISER */
+diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
+index 352fd01..5aaec8e 100644
+--- a/arch/x86/mm/pgtable.c
++++ b/arch/x86/mm/pgtable.c
+@@ -345,16 +345,12 @@ static inline void _pgd_free(pgd_t *pgd)
+ }
+ #else
+
+-#ifdef CONFIG_KAISER
+ /*
+- * Instead of one pmd, we aquire two pmds. Being order-1, it is
++ * Instead of one pgd, Kaiser acquires two pgds. Being order-1, it is
+ * both 8k in size and 8k-aligned. That lets us just flip bit 12
+ * in a pointer to swap between the two 4k halves.
+ */
+-#define PGD_ALLOCATION_ORDER 1
+-#else
+-#define PGD_ALLOCATION_ORDER 0
+-#endif
++#define PGD_ALLOCATION_ORDER kaiser_enabled
+
+ static inline pgd_t *_pgd_alloc(void)
+ {
+diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
+index 852c665..fde44bb 100644
+--- a/arch/x86/mm/tlb.c
++++ b/arch/x86/mm/tlb.c
+@@ -41,8 +41,7 @@ static void load_new_mm_cr3(pgd_t *pgdir)
+ {
+ unsigned long new_mm_cr3 = __pa(pgdir);
+
+-#ifdef CONFIG_KAISER
+- if (this_cpu_has(X86_FEATURE_PCID)) {
++ if (kaiser_enabled && this_cpu_has(X86_FEATURE_PCID)) {
+ /*
+ * We reuse the same PCID for different tasks, so we must
+ * flush all the entries for the PCID out when we change tasks.
+@@ -59,7 +58,6 @@ static void load_new_mm_cr3(pgd_t *pgdir)
+ new_mm_cr3 |= X86_CR3_PCID_KERN_FLUSH;
+ kaiser_flush_tlb_on_return_to_user();
+ }
+-#endif /* CONFIG_KAISER */
+
+ /*
+ * Caution: many callers of this function expect
+diff --git a/tools/arch/x86/include/asm/cpufeatures.h b/tools/arch/x86/include/asm/cpufeatures.h
+index a396292..67c93d9 100644
+--- a/tools/arch/x86/include/asm/cpufeatures.h
++++ b/tools/arch/x86/include/asm/cpufeatures.h
+@@ -197,6 +197,9 @@
+ #define X86_FEATURE_AVX512_4VNNIW (7*32+16) /* AVX-512 Neural Network Instructions */
+ #define X86_FEATURE_AVX512_4FMAPS (7*32+17) /* AVX-512 Multiply Accumulation Single precision */
+
++/* Because the ALTERNATIVE scheme is for members of the X86_FEATURE club... */
++#define X86_FEATURE_KAISER ( 7*32+31) /* CONFIG_KAISER w/o nokaiser */
++
+ /* Virtualization flags: Linux defined, word 8 */
+ #define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */
+ #define X86_FEATURE_VNMI ( 8*32+ 1) /* Intel Virtual NMI */
+--
+2.7.4
+