From 6ceca45ce264990a8831d3e5f7ff6e8c0d10df3a Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Sun, 24 Sep 2017 16:59:49 -0700
Subject: [PATCH 027/103] kaiser: add "nokaiser" boot option, using ALTERNATIVE

Added "nokaiser" boot option: an early param like "noinvpcid".
Most places now check int kaiser_enabled (#defined 0 when not
CONFIG_KAISER) instead of #ifdef CONFIG_KAISER; but entry_64.S
and entry_64_compat.S are using the ALTERNATIVE technique, which
patches in the preferred instructions at runtime.  That technique
is tied to x86 cpu features, so X86_FEATURE_KAISER is fabricated.
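
For illustration only (this helper is hypothetical, not part of the
patch - the real users are the assembly stubs below): the same
ALTERNATIVE macro can be driven from inline asm in C.  Where the CPU
has X86_FEATURE_KAISER set, apply_alternatives() overwrites the "jmp"
at boot; with "nokaiser" the jmp stays and the CR3 read never runs.

	#include <asm/alternative.h>
	#include <asm/cpufeatures.h>

	/* Hypothetical sketch: returns %cr3 when kaiser is on, else 0. */
	static inline unsigned long kaiser_read_cr3(void)
	{
		unsigned long cr3 = 0;

		asm volatile(ALTERNATIVE("jmp 1f",
					 "movq %%cr3, %0",
					 X86_FEATURE_KAISER)
			     "\n1:" : "=r" (cr3) : : "memory");
		return cr3;
	}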

Prior to "nokaiser", Kaiser #defined _PAGE_GLOBAL 0: revert that, but
be careful with both _PAGE_GLOBAL and CR4.PGE: set them when nokaiser,
just as when !CONFIG_KAISER, but set neither when kaiser is enabled -
neither matters on its own, but it's hard to be sure that _PAGE_GLOBAL
won't get set in some obscure corner, or that something will add PGE
into CR4.  By omitting _PAGE_GLOBAL from __supported_pte_mask when
kaiser_enabled, all page table setup which uses pfn_pte() masks it out
of the ptes.
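
To see why that masking is sufficient, this is roughly the 4.9 shape
of the pte constructors in arch/x86/include/asm/pgtable.h (quoted
only to show the mechanism; not changed by this patch):

	static inline pgprot_t massage_pgprot(pgprot_t pgprot)
	{
		pgprotval_t protval = pgprot_val(pgprot);

		if (protval & _PAGE_PRESENT)
			protval &= __supported_pte_mask; /* drops _PAGE_GLOBAL */

		return __pgprot(protval);
	}

	static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot)
	{
		return __pte(((phys_addr_t)page_nr << PAGE_SHIFT) |
			     massage_pgprot(pgprot));
	}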

It's slightly shameful that the same declaration versus definition of
kaiser_enabled appears in not one, not two, but three header files
(asm/kaiser.h, asm/pgtable.h, asm/tlbflush.h).  I felt safer that way
than with #including any of those in any of the others; and did not
feel it worth an asm/kaiser_enabled.h - kernel/cpu/common.c includes
them all, so we shall hear about it if they get out of synch.
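
The duplicated fragment boils down to this (as it appears in
asm/pgtable.h below):

	#ifdef CONFIG_KAISER
	extern int kaiser_enabled;	/* defined in arch/x86/mm/kaiser.c */
	#else
	#define kaiser_enabled 0	/* so if (kaiser_enabled) compiles away */
	#endif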

Cleanups while in the area: removed the silly #ifdef CONFIG_KAISER
from kaiser.c; removed the unused native_get_normal_pgd(); removed
the spurious reg clutter from SWITCH_*_CR3 macro stubs; corrected some
comments.  But more interestingly, set CR4.PSE in secondary_startup_64:
the manual is clear that it does not matter whether it's 0 or 1 when
4-level page tables are enabled, but it was distracting to find cr4
differing between the BSP and the secondaries - the BSP alone was
adding PSE, in probe_page_size_mask().

Signed-off-by: Hugh Dickins <hughd@google.com>
Acked-by: Jiri Kosina <jkosina@suse.cz>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 Documentation/kernel-parameters.txt      |  2 ++
 arch/x86/entry/entry_64.S                | 15 ++++++------
 arch/x86/include/asm/cpufeatures.h       |  3 +++
 arch/x86/include/asm/kaiser.h            | 27 ++++++++++++++++------
 arch/x86/include/asm/pgtable.h           | 20 +++++++++++-----
 arch/x86/include/asm/pgtable_64.h        | 13 ++++-------
 arch/x86/include/asm/pgtable_types.h     |  4 ----
 arch/x86/include/asm/tlbflush.h          | 39 ++++++++++++++++++++------------
 arch/x86/kernel/cpu/common.c             | 28 ++++++++++++++++++++++-
 arch/x86/kernel/espfix_64.c              |  3 ++-
 arch/x86/kernel/head_64.S                |  4 ++--
 arch/x86/mm/init.c                       |  2 +-
 arch/x86/mm/init_64.c                    | 10 ++++++++
 arch/x86/mm/kaiser.c                     | 26 +++++++++++++++++----
 arch/x86/mm/pgtable.c                    |  8 ++-----
 arch/x86/mm/tlb.c                        |  4 +---
 tools/arch/x86/include/asm/cpufeatures.h |  3 +++
 17 files changed, 146 insertions(+), 65 deletions(-)

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index a303387..e2642ec 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -2753,6 +2753,8 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 
 	nojitter	[IA-64] Disables jitter checking for ITC timers.
 
+	nokaiser	[X86-64] Disable KAISER isolation of kernel from user.
+
 	no-kvmclock	[X86,KVM] Disable paravirtualized KVM clock driver
 
 	no-kvmapf	[X86,KVM] Disable paravirtualized asynchronous page
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 41bf650..bbb38ac 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -1079,7 +1079,7 @@ ENTRY(paranoid_entry)
 	 * unconditionally, but we need to find out whether the reverse
 	 * should be done on return (conveyed to paranoid_exit in %ebx).
 	 */
-	movq	%cr3, %rax
+	ALTERNATIVE "jmp 2f", "movq %cr3, %rax", X86_FEATURE_KAISER
 	testl	$KAISER_SHADOW_PGD_OFFSET, %eax
 	jz	2f
 	orl	$2, %ebx
@@ -1111,6 +1111,7 @@ ENTRY(paranoid_exit)
 	TRACE_IRQS_OFF_DEBUG
 	TRACE_IRQS_IRETQ_DEBUG
 #ifdef CONFIG_KAISER
+	/* No ALTERNATIVE for X86_FEATURE_KAISER: paranoid_entry sets %ebx */
 	testl	$2, %ebx			/* SWITCH_USER_CR3 needed? */
 	jz	paranoid_exit_no_switch
 	SWITCH_USER_CR3
@@ -1339,13 +1340,14 @@ ENTRY(nmi)
 #ifdef CONFIG_KAISER
 	/* Unconditionally use kernel CR3 for do_nmi() */
 	/* %rax is saved above, so OK to clobber here */
-	movq	%cr3, %rax
+	ALTERNATIVE "jmp 2f", "movq %cr3, %rax", X86_FEATURE_KAISER
 	/* If PCID enabled, NOFLUSH now and NOFLUSH on return */
 	orq	x86_cr3_pcid_noflush, %rax
 	pushq	%rax
 	/* mask off "user" bit of pgd address and 12 PCID bits: */
 	andq	$(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax
 	movq	%rax, %cr3
+2:
 #endif
 	call	do_nmi
 
@@ -1355,8 +1357,7 @@ ENTRY(nmi)
 	 * kernel code that needs user CR3, but do we ever return
 	 * to "user mode" where we need the kernel CR3?
 	 */
-	popq	%rax
-	mov	%rax, %cr3
+	ALTERNATIVE "", "popq %rax; movq %rax, %cr3", X86_FEATURE_KAISER
 #endif
 
 	/*
@@ -1583,13 +1584,14 @@ end_repeat_nmi:
 #ifdef CONFIG_KAISER
 	/* Unconditionally use kernel CR3 for do_nmi() */
 	/* %rax is saved above, so OK to clobber here */
-	movq	%cr3, %rax
+	ALTERNATIVE "jmp 2f", "movq %cr3, %rax", X86_FEATURE_KAISER
 	/* If PCID enabled, NOFLUSH now and NOFLUSH on return */
 	orq	x86_cr3_pcid_noflush, %rax
 	pushq	%rax
 	/* mask off "user" bit of pgd address and 12 PCID bits: */
 	andq	$(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax
 	movq	%rax, %cr3
+2:
 #endif
 
 	/* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
@@ -1601,8 +1603,7 @@ end_repeat_nmi:
 	 * kernel code that needs user CR3, like just just before
 	 * a sysret.
 	 */
-	popq	%rax
-	mov	%rax, %cr3
+	ALTERNATIVE "", "popq %rax; movq %rax, %cr3", X86_FEATURE_KAISER
 #endif
 
 	testl	%ebx, %ebx			/* swapgs needed? */
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index dc50883..20271d6 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -198,6 +198,9 @@
 #define X86_FEATURE_AVX512_4VNNIW (7*32+16) /* AVX-512 Neural Network Instructions */
 #define X86_FEATURE_AVX512_4FMAPS (7*32+17) /* AVX-512 Multiply Accumulation Single precision */
 
+/* Because the ALTERNATIVE scheme is for members of the X86_FEATURE club... */
+#define X86_FEATURE_KAISER	( 7*32+31) /* CONFIG_KAISER w/o nokaiser */
+
 /* Virtualization flags: Linux defined, word 8 */
 #define X86_FEATURE_TPR_SHADOW  ( 8*32+ 0) /* Intel TPR Shadow */
 #define X86_FEATURE_VNMI        ( 8*32+ 1) /* Intel Virtual NMI */
diff --git a/arch/x86/include/asm/kaiser.h b/arch/x86/include/asm/kaiser.h
index 3dc5f4c..96643a9 100644
--- a/arch/x86/include/asm/kaiser.h
+++ b/arch/x86/include/asm/kaiser.h
@@ -46,28 +46,33 @@ movq \reg, %cr3
 .endm
 
 .macro SWITCH_KERNEL_CR3
-pushq %rax
+ALTERNATIVE "jmp 8f", "pushq %rax", X86_FEATURE_KAISER
 _SWITCH_TO_KERNEL_CR3 %rax
 popq %rax
+8:
 .endm
 
 .macro SWITCH_USER_CR3
-pushq %rax
+ALTERNATIVE "jmp 8f", "pushq %rax", X86_FEATURE_KAISER
 _SWITCH_TO_USER_CR3 %rax %al
 popq %rax
+8:
 .endm
 
 .macro SWITCH_KERNEL_CR3_NO_STACK
-movq %rax, PER_CPU_VAR(unsafe_stack_register_backup)
+ALTERNATIVE "jmp 8f", \
+	__stringify(movq %rax, PER_CPU_VAR(unsafe_stack_register_backup)), \
+	X86_FEATURE_KAISER
 _SWITCH_TO_KERNEL_CR3 %rax
 movq PER_CPU_VAR(unsafe_stack_register_backup), %rax
+8:
 .endm
 
 #else /* CONFIG_KAISER */
 
-.macro SWITCH_KERNEL_CR3 reg
+.macro SWITCH_KERNEL_CR3
 .endm
-.macro SWITCH_USER_CR3 reg regb
+.macro SWITCH_USER_CR3
 .endm
 .macro SWITCH_KERNEL_CR3_NO_STACK
 .endm
@@ -90,6 +95,16 @@ DECLARE_PER_CPU(unsigned long, x86_cr3_pcid_user);
 
 extern char __per_cpu_user_mapped_start[], __per_cpu_user_mapped_end[];
 
+extern int kaiser_enabled;
+#else
+#define kaiser_enabled	0
+#endif /* CONFIG_KAISER */
+
+/*
+ * Kaiser function prototypes are needed even when CONFIG_KAISER is not set,
+ * so as to build with tests on kaiser_enabled instead of #ifdefs.
+ */
+
 /**
  *  kaiser_add_mapping - map a virtual memory part to the shadow (user) mapping
  *  @addr: the start address of the range
@@ -119,8 +134,6 @@ extern void kaiser_remove_mapping(unsigned long start, unsigned long size);
  */
 extern void kaiser_init(void);
 
-#endif /* CONFIG_KAISER */
-
 #endif /* __ASSEMBLY */
 
 #endif /* _ASM_X86_KAISER_H */
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 1cee98e..217e83a 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -18,6 +18,12 @@
 #ifndef __ASSEMBLY__
 #include <asm/x86_init.h>
 
+#ifdef CONFIG_KAISER
+extern int kaiser_enabled;
+#else
+#define kaiser_enabled 0
+#endif
+
 void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd);
 void ptdump_walk_pgd_level_checkwx(void);
 
@@ -697,7 +703,7 @@ static inline int pgd_bad(pgd_t pgd)
 	 * page table by accident; it will fault on the first
 	 * instruction it tries to run.  See native_set_pgd().
 	 */
-	if (IS_ENABLED(CONFIG_KAISER))
+	if (kaiser_enabled)
 		ignore_flags |= _PAGE_NX;
 
 	return (pgd_flags(pgd) & ~ignore_flags) != _KERNPG_TABLE;
@@ -913,12 +919,14 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm,
  */
 static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
 {
-       memcpy(dst, src, count * sizeof(pgd_t));
+	memcpy(dst, src, count * sizeof(pgd_t));
 #ifdef CONFIG_KAISER
-	/* Clone the shadow pgd part as well */
-	memcpy(native_get_shadow_pgd(dst),
-	       native_get_shadow_pgd(src),
-	       count * sizeof(pgd_t));
+	if (kaiser_enabled) {
+		/* Clone the shadow pgd part as well */
+		memcpy(native_get_shadow_pgd(dst),
+			native_get_shadow_pgd(src),
+			count * sizeof(pgd_t));
+	}
 #endif
 }
 
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index 177caf3..cf68b5c 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -111,13 +111,12 @@ extern pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd);
 
 static inline pgd_t *native_get_shadow_pgd(pgd_t *pgdp)
 {
+#ifdef CONFIG_DEBUG_VM
+	/* linux/mmdebug.h may not have been included at this point */
+	BUG_ON(!kaiser_enabled);
+#endif
 	return (pgd_t *)((unsigned long)pgdp | (unsigned long)PAGE_SIZE);
 }
-
-static inline pgd_t *native_get_normal_pgd(pgd_t *pgdp)
-{
-	return (pgd_t *)((unsigned long)pgdp & ~(unsigned long)PAGE_SIZE);
-}
 #else
 static inline pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd)
 {
@@ -128,10 +127,6 @@ static inline pgd_t *native_get_shadow_pgd(pgd_t *pgdp)
 	BUILD_BUG_ON(1);
 	return NULL;
 }
-static inline pgd_t *native_get_normal_pgd(pgd_t *pgdp)
-{
-	return pgdp;
-}
 #endif /* CONFIG_KAISER */
 
 static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd)
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 7cf2883..f0d9a1a 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -45,11 +45,7 @@
 #define _PAGE_ACCESSED	(_AT(pteval_t, 1) << _PAGE_BIT_ACCESSED)
 #define _PAGE_DIRTY	(_AT(pteval_t, 1) << _PAGE_BIT_DIRTY)
 #define _PAGE_PSE	(_AT(pteval_t, 1) << _PAGE_BIT_PSE)
-#ifdef CONFIG_KAISER
-#define _PAGE_GLOBAL	(_AT(pteval_t, 0))
-#else
 #define _PAGE_GLOBAL	(_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL)
-#endif
 #define _PAGE_SOFTW1	(_AT(pteval_t, 1) << _PAGE_BIT_SOFTW1)
 #define _PAGE_SOFTW2	(_AT(pteval_t, 1) << _PAGE_BIT_SOFTW2)
 #define _PAGE_PAT	(_AT(pteval_t, 1) << _PAGE_BIT_PAT)
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 4fff696..13a74f6 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -138,9 +138,11 @@ static inline void cr4_set_bits_and_update_boot(unsigned long mask)
  * to avoid the need for asm/kaiser.h in unexpected places.
  */
 #ifdef CONFIG_KAISER
+extern int kaiser_enabled;
 extern void kaiser_setup_pcid(void);
 extern void kaiser_flush_tlb_on_return_to_user(void);
 #else
+#define kaiser_enabled 0
 static inline void kaiser_setup_pcid(void)
 {
 }
@@ -165,7 +167,7 @@ static inline void __native_flush_tlb(void)
 	 * back:
 	 */
 	preempt_disable();
-	if (this_cpu_has(X86_FEATURE_PCID))
+	if (kaiser_enabled && this_cpu_has(X86_FEATURE_PCID))
 		kaiser_flush_tlb_on_return_to_user();
 	native_write_cr3(native_read_cr3());
 	preempt_enable();
@@ -176,20 +178,30 @@ static inline void __native_flush_tlb_global_irq_disabled(void)
 	unsigned long cr4;
 
 	cr4 = this_cpu_read(cpu_tlbstate.cr4);
-	/* clear PGE */
-	native_write_cr4(cr4 & ~X86_CR4_PGE);
-	/* write old PGE again and flush TLBs */
-	native_write_cr4(cr4);
+	if (cr4 & X86_CR4_PGE) {
+		/* clear PGE and flush TLB of all entries */
+		native_write_cr4(cr4 & ~X86_CR4_PGE);
+		/* restore PGE as it was before */
+		native_write_cr4(cr4);
+	} else {
+		/*
+		 * x86_64 microcode update comes this way when CR4.PGE is not
+		 * enabled, and it's safer for all callers to allow this case.
+		 */
+		native_write_cr3(native_read_cr3());
+	}
 }
 
 static inline void __native_flush_tlb_global(void)
 {
-#ifdef CONFIG_KAISER
-	/* Globals are not used at all */
-	__native_flush_tlb();
-#else
 	unsigned long flags;
 
+	if (kaiser_enabled) {
+		/* Globals are not used at all */
+		__native_flush_tlb();
+		return;
+	}
+
 	if (this_cpu_has(X86_FEATURE_INVPCID)) {
 		/*
 		 * Using INVPCID is considerably faster than a pair of writes
@@ -209,7 +221,6 @@ static inline void __native_flush_tlb_global(void)
 	raw_local_irq_save(flags);
 	__native_flush_tlb_global_irq_disabled();
 	raw_local_irq_restore(flags);
-#endif
 }
 
 static inline void __native_flush_tlb_single(unsigned long addr)
@@ -224,7 +235,7 @@ static inline void __native_flush_tlb_single(unsigned long addr)
 	 */
 
 	if (!this_cpu_has(X86_FEATURE_INVPCID_SINGLE)) {
-		if (this_cpu_has(X86_FEATURE_PCID))
+		if (kaiser_enabled && this_cpu_has(X86_FEATURE_PCID))
 			kaiser_flush_tlb_on_return_to_user();
 		asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
 		return;
@@ -239,9 +250,9 @@ static inline void __native_flush_tlb_single(unsigned long addr)
 	 * Make sure to do only a single invpcid when KAISER is
 	 * disabled and we have only a single ASID.
 	 */
-	if (X86_CR3_PCID_ASID_KERN != X86_CR3_PCID_ASID_USER)
-		invpcid_flush_one(X86_CR3_PCID_ASID_KERN, addr);
-	invpcid_flush_one(X86_CR3_PCID_ASID_USER, addr);
+	if (kaiser_enabled)
+		invpcid_flush_one(X86_CR3_PCID_ASID_USER, addr);
+	invpcid_flush_one(X86_CR3_PCID_ASID_KERN, addr);
 }
 
 static inline void __flush_tlb_all(void)
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index e6be5f3..8b03874 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -179,6 +179,20 @@ static int __init x86_pcid_setup(char *s)
 	return 1;
 }
 __setup("nopcid", x86_pcid_setup);
+
+static int __init x86_nokaiser_setup(char *s)
+{
+	/* nokaiser doesn't accept parameters */
+	if (s)
+		return -EINVAL;
+#ifdef CONFIG_KAISER
+	kaiser_enabled = 0;
+	setup_clear_cpu_cap(X86_FEATURE_KAISER);
+	pr_info("nokaiser: KAISER feature disabled\n");
+#endif
+	return 0;
+}
+early_param("nokaiser", x86_nokaiser_setup);
 #endif
 
 static int __init x86_noinvpcid_setup(char *s)
@@ -327,7 +341,7 @@ static __always_inline void setup_smap(struct cpuinfo_x86 *c)
 static void setup_pcid(struct cpuinfo_x86 *c)
 {
 	if (cpu_has(c, X86_FEATURE_PCID)) {
-		if (cpu_has(c, X86_FEATURE_PGE)) {
+		if (cpu_has(c, X86_FEATURE_PGE) || kaiser_enabled) {
 			cr4_set_bits(X86_CR4_PCIDE);
 			/*
 			 * INVPCID has two "groups" of types:
@@ -799,6 +813,10 @@ void get_cpu_cap(struct cpuinfo_x86 *c)
 		c->x86_capability[CPUID_8000_000A_EDX] = cpuid_edx(0x8000000a);
 
 	init_scattered_cpuid_features(c);
+#ifdef CONFIG_KAISER
+	if (kaiser_enabled)
+		set_cpu_cap(c, X86_FEATURE_KAISER);
+#endif
 }
 
 static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c)
@@ -1537,6 +1555,14 @@ void cpu_init(void)
 	 * try to read it.
 	 */
 	cr4_init_shadow();
+	if (!kaiser_enabled) {
+		/*
+		 * secondary_startup_64() deferred setting PGE in cr4:
+		 * probe_page_size_mask() sets it on the boot cpu,
+		 * but it needs to be set on each secondary cpu.
+		 */
+		cr4_set_bits(X86_CR4_PGE);
+	}
 
 	/*
 	 * Load microcode on this cpu if a valid microcode is available.
diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c
index 560c2fd..e33b385 100644
--- a/arch/x86/kernel/espfix_64.c
+++ b/arch/x86/kernel/espfix_64.c
@@ -132,9 +132,10 @@ void __init init_espfix_bsp(void)
 	 * area to ensure it is mapped into the shadow user page
 	 * tables.
 	 */
-	if (IS_ENABLED(CONFIG_KAISER))
+	if (kaiser_enabled) {
 		set_pgd(native_get_shadow_pgd(pgd_p),
 			__pgd(_KERNPG_TABLE | __pa((pud_t *)espfix_pud_page)));
+	}
 
 	/* Randomize the locations */
 	init_espfix_random();
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 5775379..d04479b 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -190,8 +190,8 @@ ENTRY(secondary_startup_64)
 	movq	$(init_level4_pgt - __START_KERNEL_map), %rax
 1:
 
-	/* Enable PAE mode and PGE */
-	movl	$(X86_CR4_PAE | X86_CR4_PGE), %ecx
+	/* Enable PAE and PSE, but defer PGE until kaiser_enabled is decided */
+	movl	$(X86_CR4_PAE | X86_CR4_PSE), %ecx
 	movq	%rcx, %cr4
 
 	/* Setup early boot stage 4 level pagetables. */
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 22af912..05a9855 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -177,7 +177,7 @@ static void __init probe_page_size_mask(void)
 		cr4_set_bits_and_update_boot(X86_CR4_PSE);
 
 	/* Enable PGE if available */
-	if (boot_cpu_has(X86_FEATURE_PGE)) {
+	if (boot_cpu_has(X86_FEATURE_PGE) && !kaiser_enabled) {
 		cr4_set_bits_and_update_boot(X86_CR4_PGE);
 		__supported_pte_mask |= _PAGE_GLOBAL;
 	} else
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 14b9dd7..a0e8df6 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -324,6 +324,16 @@ void __init cleanup_highmap(void)
 			continue;
 		if (vaddr < (unsigned long) _text || vaddr > end)
 			set_pmd(pmd, __pmd(0));
+		else if (kaiser_enabled) {
+			/*
+			 * level2_kernel_pgt is initialized with _PAGE_GLOBAL:
+			 * clear that now.  This is not important, so long as
+			 * CR4.PGE remains clear, but it removes an anomaly.
+			 * Physical mapping setup below avoids _PAGE_GLOBAL
+			 * by use of massage_pgprot() inside pfn_pte() etc.
+			 */
+			set_pmd(pmd, pmd_clear_flags(*pmd, _PAGE_GLOBAL));
+		}
 	}
 }
 
diff --git a/arch/x86/mm/kaiser.c b/arch/x86/mm/kaiser.c
index cc0950f..11032dc 100644
--- a/arch/x86/mm/kaiser.c
+++ b/arch/x86/mm/kaiser.c
@@ -16,7 +16,9 @@
 #include <asm/pgalloc.h>
 #include <asm/desc.h>
 
-#ifdef CONFIG_KAISER
+int kaiser_enabled __read_mostly = 1;
+EXPORT_SYMBOL(kaiser_enabled);	/* for inlined TLB flush functions */
+
 __visible
 DEFINE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);
 
@@ -167,8 +169,8 @@ static pte_t *kaiser_pagetable_walk(unsigned long address, bool is_atomic)
 	return pte_offset_kernel(pmd, address);
 }
 
-int kaiser_add_user_map(const void *__start_addr, unsigned long size,
-			unsigned long flags)
+static int kaiser_add_user_map(const void *__start_addr, unsigned long size,
+			       unsigned long flags)
 {
 	int ret = 0;
 	pte_t *pte;
@@ -177,6 +179,15 @@ int kaiser_add_user_map(const void *__start_addr, unsigned long size,
 	unsigned long end_addr = PAGE_ALIGN(start_addr + size);
 	unsigned long target_address;
 
+	/*
+	 * It is convenient for callers to pass in __PAGE_KERNEL etc,
+	 * and there is no actual harm from setting _PAGE_GLOBAL, so
+	 * long as CR4.PGE is not set.  But it is nonetheless troubling
+	 * to see Kaiser itself setting _PAGE_GLOBAL (now that "nokaiser"
+	 * requires that not to be #defined to 0): so mask it off here.
+	 */
+	flags &= ~_PAGE_GLOBAL;
+
 	for (; address < end_addr; address += PAGE_SIZE) {
 		target_address = get_pa_from_mapping(address);
 		if (target_address == -1) {
@@ -263,6 +274,8 @@ void __init kaiser_init(void)
 {
 	int cpu;
 
+	if (!kaiser_enabled)
+		return;
 	kaiser_init_all_pgds();
 
 	for_each_possible_cpu(cpu) {
@@ -311,6 +324,8 @@ void __init kaiser_init(void)
 /* Add a mapping to the shadow mapping, and synchronize the mappings */
 int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags)
 {
+	if (!kaiser_enabled)
+		return 0;
 	return kaiser_add_user_map((const void *)addr, size, flags);
 }
 
@@ -322,6 +337,8 @@ void kaiser_remove_mapping(unsigned long start, unsigned long size)
 	unsigned long addr, next;
 	pgd_t *pgd;
 
+	if (!kaiser_enabled)
+		return;
 	pgd = native_get_shadow_pgd(pgd_offset_k(start));
 	for (addr = start; addr < end; pgd++, addr = next) {
 		next = pgd_addr_end(addr, end);
@@ -343,6 +360,8 @@ static inline bool is_userspace_pgd(pgd_t *pgdp)
 
 pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd)
 {
+	if (!kaiser_enabled)
+		return pgd;
 	/*
 	 * Do we need to also populate the shadow pgd?  Check _PAGE_USER to
 	 * skip cases like kexec and EFI which make temporary low mappings.
@@ -399,4 +418,3 @@ void kaiser_flush_tlb_on_return_to_user(void)
 			X86_CR3_PCID_USER_FLUSH | KAISER_SHADOW_PGD_OFFSET);
 }
 EXPORT_SYMBOL(kaiser_flush_tlb_on_return_to_user);
-#endif /* CONFIG_KAISER */
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 352fd01..5aaec8e 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -345,16 +345,12 @@ static inline void _pgd_free(pgd_t *pgd)
 }
 #else
 
-#ifdef CONFIG_KAISER
 /*
- * Instead of one pmd, we aquire two pmds.  Being order-1, it is
+ * Instead of one pgd, Kaiser acquires two pgds.  Being order-1, it is
  * both 8k in size and 8k-aligned.  That lets us just flip bit 12
  * in a pointer to swap between the two 4k halves.
  */
-#define PGD_ALLOCATION_ORDER 1
-#else
-#define PGD_ALLOCATION_ORDER 0
-#endif
+#define PGD_ALLOCATION_ORDER	kaiser_enabled
 
 static inline pgd_t *_pgd_alloc(void)
 {
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 852c665..fde44bb 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -41,8 +41,7 @@ static void load_new_mm_cr3(pgd_t *pgdir)
 {
 	unsigned long new_mm_cr3 = __pa(pgdir);
 
-#ifdef CONFIG_KAISER
-	if (this_cpu_has(X86_FEATURE_PCID)) {
+	if (kaiser_enabled && this_cpu_has(X86_FEATURE_PCID)) {
 		/*
 		 * We reuse the same PCID for different tasks, so we must
 		 * flush all the entries for the PCID out when we change tasks.
@@ -59,7 +58,6 @@ static void load_new_mm_cr3(pgd_t *pgdir)
 		new_mm_cr3 |= X86_CR3_PCID_KERN_FLUSH;
 		kaiser_flush_tlb_on_return_to_user();
 	}
-#endif /* CONFIG_KAISER */
 
 	/*
 	 * Caution: many callers of this function expect
diff --git a/tools/arch/x86/include/asm/cpufeatures.h b/tools/arch/x86/include/asm/cpufeatures.h
index a396292..67c93d9 100644
--- a/tools/arch/x86/include/asm/cpufeatures.h
+++ b/tools/arch/x86/include/asm/cpufeatures.h
@@ -197,6 +197,9 @@
 #define X86_FEATURE_AVX512_4VNNIW (7*32+16) /* AVX-512 Neural Network Instructions */
 #define X86_FEATURE_AVX512_4FMAPS (7*32+17) /* AVX-512 Multiply Accumulation Single precision */
 
+/* Because the ALTERNATIVE scheme is for members of the X86_FEATURE club... */
+#define X86_FEATURE_KAISER	( 7*32+31) /* CONFIG_KAISER w/o nokaiser */
+
 /* Virtualization flags: Linux defined, word 8 */
 #define X86_FEATURE_TPR_SHADOW  ( 8*32+ 0) /* Intel TPR Shadow */
 #define X86_FEATURE_VNMI        ( 8*32+ 1) /* Intel Virtual NMI */
-- 
2.7.4