aboutsummaryrefslogtreecommitdiffstats
path: root/common/recipes-kernel/linux/linux-yocto-4.9.21/0005-kaiser-merged-update.patch
diff options
context:
space:
mode:
Diffstat (limited to 'common/recipes-kernel/linux/linux-yocto-4.9.21/0005-kaiser-merged-update.patch')
-rw-r--r--common/recipes-kernel/linux/linux-yocto-4.9.21/0005-kaiser-merged-update.patch1327
1 files changed, 1327 insertions, 0 deletions
diff --git a/common/recipes-kernel/linux/linux-yocto-4.9.21/0005-kaiser-merged-update.patch b/common/recipes-kernel/linux/linux-yocto-4.9.21/0005-kaiser-merged-update.patch
new file mode 100644
index 00000000..52bf5963
--- /dev/null
+++ b/common/recipes-kernel/linux/linux-yocto-4.9.21/0005-kaiser-merged-update.patch
@@ -0,0 +1,1327 @@
+From 48523e23d22e5a66009d404caca4721b84cde67a Mon Sep 17 00:00:00 2001
+From: Dave Hansen <dave.hansen@linux.intel.com>
+Date: Wed, 30 Aug 2017 16:23:00 -0700
+Subject: [PATCH 005/102] kaiser: merged update
+
+Merged fixes and cleanups, rebased to 4.9.51 tree (no 5-level paging).
+
+Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
+Signed-off-by: Hugh Dickins <hughd@google.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/entry/entry_64.S | 105 ++++++++++--
+ arch/x86/include/asm/kaiser.h | 43 +++--
+ arch/x86/include/asm/pgtable.h | 18 +-
+ arch/x86/include/asm/pgtable_64.h | 48 +++++-
+ arch/x86/include/asm/pgtable_types.h | 6 +-
+ arch/x86/kernel/espfix_64.c | 13 +-
+ arch/x86/kernel/head_64.S | 19 ++-
+ arch/x86/kernel/ldt.c | 27 ++-
+ arch/x86/kernel/tracepoint.c | 2 +
+ arch/x86/mm/kaiser.c | 313 +++++++++++++++++++++++++----------
+ arch/x86/mm/pageattr.c | 63 +++++--
+ arch/x86/mm/pgtable.c | 40 ++---
+ include/linux/kaiser.h | 26 +++
+ kernel/fork.c | 9 +-
+ security/Kconfig | 5 +
+ 15 files changed, 549 insertions(+), 188 deletions(-)
+ create mode 100644 include/linux/kaiser.h
+
+diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
+index 6c880dc..d84e3a7 100644
+--- a/arch/x86/entry/entry_64.S
++++ b/arch/x86/entry/entry_64.S
+@@ -230,6 +230,13 @@ entry_SYSCALL_64_fastpath:
+ movq RIP(%rsp), %rcx
+ movq EFLAGS(%rsp), %r11
+ RESTORE_C_REGS_EXCEPT_RCX_R11
++ /*
++ * This opens a window where we have a user CR3, but are
++ * running in the kernel. This makes using the CS
++ * register useless for telling whether or not we need to
++ * switch CR3 in NMIs. Normal interrupts are OK because
++ * they are off here.
++ */
+ SWITCH_USER_CR3
+ movq RSP(%rsp), %rsp
+ USERGS_SYSRET64
+@@ -326,11 +333,25 @@ return_from_SYSCALL_64:
+ syscall_return_via_sysret:
+ /* rcx and r11 are already restored (see code above) */
+ RESTORE_C_REGS_EXCEPT_RCX_R11
++ /*
++ * This opens a window where we have a user CR3, but are
++ * running in the kernel. This makes using the CS
++ * register useless for telling whether or not we need to
++ * switch CR3 in NMIs. Normal interrupts are OK because
++ * they are off here.
++ */
+ SWITCH_USER_CR3
+ movq RSP(%rsp), %rsp
+ USERGS_SYSRET64
+
+ opportunistic_sysret_failed:
++ /*
++ * This opens a window where we have a user CR3, but are
++ * running in the kernel. This makes using the CS
++ * register useless for telling whether or not we need to
++ * switch CR3 in NMIs. Normal interrupts are OK because
++ * they are off here.
++ */
+ SWITCH_USER_CR3
+ SWAPGS
+ jmp restore_c_regs_and_iret
+@@ -1087,6 +1108,13 @@ ENTRY(error_entry)
+ cld
+ SAVE_C_REGS 8
+ SAVE_EXTRA_REGS 8
++ /*
++ * error_entry() always returns with a kernel gsbase and
++ * CR3. We must also have a kernel CR3/gsbase before
++ * calling TRACE_IRQS_*. Just unconditionally switch to
++ * the kernel CR3 here.
++ */
++ SWITCH_KERNEL_CR3
+ xorl %ebx, %ebx
+ testb $3, CS+8(%rsp)
+ jz .Lerror_kernelspace
+@@ -1096,7 +1124,6 @@ ENTRY(error_entry)
+ * from user mode due to an IRET fault.
+ */
+ SWAPGS
+- SWITCH_KERNEL_CR3
+
+ .Lerror_entry_from_usermode_after_swapgs:
+ /*
+@@ -1148,7 +1175,6 @@ ENTRY(error_entry)
+ * Switch to kernel gsbase:
+ */
+ SWAPGS
+- SWITCH_KERNEL_CR3
+
+ /*
+ * Pretend that the exception came from user mode: set up pt_regs
+@@ -1247,7 +1273,10 @@ ENTRY(nmi)
+ */
+
+ SWAPGS_UNSAFE_STACK
+- SWITCH_KERNEL_CR3_NO_STACK
++ /*
++ * percpu variables are mapped with user CR3, so no need
++ * to switch CR3 here.
++ */
+ cld
+ movq %rsp, %rdx
+ movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
+@@ -1281,14 +1310,33 @@ ENTRY(nmi)
+
+ movq %rsp, %rdi
+ movq $-1, %rsi
++#ifdef CONFIG_KAISER
++ /* Unconditionally use kernel CR3 for do_nmi() */
++ /* %rax is saved above, so OK to clobber here */
++ movq %cr3, %rax
++ pushq %rax
++#ifdef CONFIG_KAISER_REAL_SWITCH
++ andq $(~0x1000), %rax
++#endif
++ movq %rax, %cr3
++#endif
+ call do_nmi
++ /*
++ * Unconditionally restore CR3. I know we return to
++ * kernel code that needs user CR3, but do we ever return
++ * to "user mode" where we need the kernel CR3?
++ */
++#ifdef CONFIG_KAISER
++ popq %rax
++ mov %rax, %cr3
++#endif
+
+ /*
+ * Return back to user mode. We must *not* do the normal exit
+- * work, because we don't want to enable interrupts. Fortunately,
+- * do_nmi doesn't modify pt_regs.
++ * work, because we don't want to enable interrupts. Do not
++ * switch to user CR3: we might be going back to kernel code
++ * that had a user CR3 set.
+ */
+- SWITCH_USER_CR3
+ SWAPGS
+ jmp restore_c_regs_and_iret
+
+@@ -1484,23 +1532,54 @@ end_repeat_nmi:
+ ALLOC_PT_GPREGS_ON_STACK
+
+ /*
+- * Use paranoid_entry to handle SWAPGS, but no need to use paranoid_exit
+- * as we should not be calling schedule in NMI context.
+- * Even with normal interrupts enabled. An NMI should not be
+- * setting NEED_RESCHED or anything that normal interrupts and
+- * exceptions might do.
++ * Use the same approach as paranoid_entry to handle SWAPGS, but
++ * without CR3 handling since we do that differently in NMIs. No
++ * need to use paranoid_exit as we should not be calling schedule
++ * in NMI context. Even with normal interrupts enabled. An NMI
++ * should not be setting NEED_RESCHED or anything that normal
++ * interrupts and exceptions might do.
+ */
+- call paranoid_entry
++ cld
++ SAVE_C_REGS
++ SAVE_EXTRA_REGS
++ movl $1, %ebx
++ movl $MSR_GS_BASE, %ecx
++ rdmsr
++ testl %edx, %edx
++ js 1f /* negative -> in kernel */
++ SWAPGS
++ xorl %ebx, %ebx
++1:
++#ifdef CONFIG_KAISER
++ /* Unconditionally use kernel CR3 for do_nmi() */
++ /* %rax is saved above, so OK to clobber here */
++ movq %cr3, %rax
++ pushq %rax
++#ifdef CONFIG_KAISER_REAL_SWITCH
++ andq $(~0x1000), %rax
++#endif
++ movq %rax, %cr3
++#endif
+
+ /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
+ movq %rsp, %rdi
++ addq $8, %rdi /* point %rdi at ptregs, fixed up for CR3 */
+ movq $-1, %rsi
+ call do_nmi
++ /*
++ * Unconditionally restore CR3. We might be returning to
++ * kernel code that needs user CR3, like just just before
++ * a sysret.
++ */
++#ifdef CONFIG_KAISER
++ popq %rax
++ mov %rax, %cr3
++#endif
+
+ testl %ebx, %ebx /* swapgs needed? */
+ jnz nmi_restore
+ nmi_swapgs:
+- SWITCH_USER_CR3_NO_STACK
++ /* We fixed up CR3 above, so no need to switch it here */
+ SWAPGS_UNSAFE_STACK
+ nmi_restore:
+ RESTORE_EXTRA_REGS
+diff --git a/arch/x86/include/asm/kaiser.h b/arch/x86/include/asm/kaiser.h
+index 63ee830..0703f48 100644
+--- a/arch/x86/include/asm/kaiser.h
++++ b/arch/x86/include/asm/kaiser.h
+@@ -16,13 +16,17 @@
+
+ .macro _SWITCH_TO_KERNEL_CR3 reg
+ movq %cr3, \reg
++#ifdef CONFIG_KAISER_REAL_SWITCH
+ andq $(~0x1000), \reg
++#endif
+ movq \reg, %cr3
+ .endm
+
+ .macro _SWITCH_TO_USER_CR3 reg
+ movq %cr3, \reg
++#ifdef CONFIG_KAISER_REAL_SWITCH
+ orq $(0x1000), \reg
++#endif
+ movq \reg, %cr3
+ .endm
+
+@@ -65,48 +69,53 @@ movq PER_CPU_VAR(unsafe_stack_register_backup), %rax
+ .endm
+
+ #endif /* CONFIG_KAISER */
++
+ #else /* __ASSEMBLY__ */
+
+
+ #ifdef CONFIG_KAISER
+-// Upon kernel/user mode switch, it may happen that
+-// the address space has to be switched before the registers have been stored.
+-// To change the address space, another register is needed.
+-// A register therefore has to be stored/restored.
+-//
+-DECLARE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);
++/*
++ * Upon kernel/user mode switch, it may happen that the address
++ * space has to be switched before the registers have been
++ * stored. To change the address space, another register is
++ * needed. A register therefore has to be stored/restored.
++*/
+
+-#endif /* CONFIG_KAISER */
++DECLARE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);
+
+ /**
+- * shadowmem_add_mapping - map a virtual memory part to the shadow mapping
++ * kaiser_add_mapping - map a virtual memory part to the shadow (user) mapping
+ * @addr: the start address of the range
+ * @size: the size of the range
+ * @flags: The mapping flags of the pages
+ *
+- * the mapping is done on a global scope, so no bigger synchronization has to be done.
+- * the pages have to be manually unmapped again when they are not needed any longer.
++ * The mapping is done on a global scope, so no bigger
++ * synchronization has to be done. the pages have to be
++ * manually unmapped again when they are not needed any longer.
+ */
+-extern void kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags);
++extern int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags);
+
+
+ /**
+- * shadowmem_remove_mapping - unmap a virtual memory part of the shadow mapping
++ * kaiser_remove_mapping - unmap a virtual memory part of the shadow mapping
+ * @addr: the start address of the range
+ * @size: the size of the range
+ */
+ extern void kaiser_remove_mapping(unsigned long start, unsigned long size);
+
+ /**
+- * shadowmem_initialize_mapping - Initalize the shadow mapping
++ * kaiser_initialize_mapping - Initalize the shadow mapping
+ *
+- * most parts of the shadow mapping can be mapped upon boot time.
+- * only the thread stacks have to be mapped on runtime.
+- * the mapped regions are not unmapped at all.
++ * Most parts of the shadow mapping can be mapped upon boot
++ * time. Only per-process things like the thread stacks
++ * or a new LDT have to be mapped at runtime. These boot-
++ * time mappings are permanent and nevertunmapped.
+ */
+ extern void kaiser_init(void);
+
+-#endif
++#endif /* CONFIG_KAISER */
++
++#endif /* __ASSEMBLY */
+
+
+
+diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
+index 4b479c9..1cee98e 100644
+--- a/arch/x86/include/asm/pgtable.h
++++ b/arch/x86/include/asm/pgtable.h
+@@ -690,7 +690,17 @@ static inline pud_t *pud_offset(pgd_t *pgd, unsigned long address)
+
+ static inline int pgd_bad(pgd_t pgd)
+ {
+- return (pgd_flags(pgd) & ~_PAGE_USER) != _KERNPG_TABLE;
++ pgdval_t ignore_flags = _PAGE_USER;
++ /*
++ * We set NX on KAISER pgds that map userspace memory so
++ * that userspace can not meaningfully use the kernel
++ * page table by accident; it will fault on the first
++ * instruction it tries to run. See native_set_pgd().
++ */
++ if (IS_ENABLED(CONFIG_KAISER))
++ ignore_flags |= _PAGE_NX;
++
++ return (pgd_flags(pgd) & ~ignore_flags) != _KERNPG_TABLE;
+ }
+
+ static inline int pgd_none(pgd_t pgd)
+@@ -905,8 +915,10 @@ static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
+ {
+ memcpy(dst, src, count * sizeof(pgd_t));
+ #ifdef CONFIG_KAISER
+- // clone the shadow pgd part as well
+- memcpy(native_get_shadow_pgd(dst), native_get_shadow_pgd(src), count * sizeof(pgd_t));
++ /* Clone the shadow pgd part as well */
++ memcpy(native_get_shadow_pgd(dst),
++ native_get_shadow_pgd(src),
++ count * sizeof(pgd_t));
+ #endif
+ }
+
+diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
+index e6ea39f..000265c 100644
+--- a/arch/x86/include/asm/pgtable_64.h
++++ b/arch/x86/include/asm/pgtable_64.h
+@@ -107,26 +107,58 @@ static inline void native_pud_clear(pud_t *pud)
+ }
+
+ #ifdef CONFIG_KAISER
+-static inline pgd_t * native_get_shadow_pgd(pgd_t *pgdp) {
++static inline pgd_t * native_get_shadow_pgd(pgd_t *pgdp)
++{
+ return (pgd_t *)(void*)((unsigned long)(void*)pgdp | (unsigned long)PAGE_SIZE);
+ }
+
+-static inline pgd_t * native_get_normal_pgd(pgd_t *pgdp) {
++static inline pgd_t * native_get_normal_pgd(pgd_t *pgdp)
++{
+ return (pgd_t *)(void*)((unsigned long)(void*)pgdp & ~(unsigned long)PAGE_SIZE);
+ }
++#else
++static inline pgd_t * native_get_shadow_pgd(pgd_t *pgdp)
++{
++ BUILD_BUG_ON(1);
++ return NULL;
++}
++static inline pgd_t * native_get_normal_pgd(pgd_t *pgdp)
++{
++ return pgdp;
++}
+ #endif /* CONFIG_KAISER */
+
++/*
++ * Page table pages are page-aligned. The lower half of the top
++ * level is used for userspace and the top half for the kernel.
++ * This returns true for user pages that need to get copied into
++ * both the user and kernel copies of the page tables, and false
++ * for kernel pages that should only be in the kernel copy.
++ */
++static inline bool is_userspace_pgd(void *__ptr)
++{
++ unsigned long ptr = (unsigned long)__ptr;
++
++ return ((ptr % PAGE_SIZE) < (PAGE_SIZE / 2));
++}
++
+ static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd)
+ {
+ #ifdef CONFIG_KAISER
+- // We know that a pgd is page aligned.
+- // Therefore the lower indices have to be mapped to user space.
+- // These pages are mapped to the shadow mapping.
+- if ((((unsigned long)pgdp) % PAGE_SIZE) < (PAGE_SIZE / 2)) {
++ pteval_t extra_kern_pgd_flags = 0;
++ /* Do we need to also populate the shadow pgd? */
++ if (is_userspace_pgd(pgdp)) {
+ native_get_shadow_pgd(pgdp)->pgd = pgd.pgd;
++ /*
++ * Even if the entry is *mapping* userspace, ensure
++ * that userspace can not use it. This way, if we
++ * get out to userspace running on the kernel CR3,
++ * userspace will crash instead of running.
++ */
++ extra_kern_pgd_flags = _PAGE_NX;
+ }
+-
+- pgdp->pgd = pgd.pgd & ~_PAGE_USER;
++ pgdp->pgd = pgd.pgd;
++ pgdp->pgd |= extra_kern_pgd_flags;
+ #else /* CONFIG_KAISER */
+ *pgdp = pgd;
+ #endif
+diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
+index 00fecbb..8bc8d02 100644
+--- a/arch/x86/include/asm/pgtable_types.h
++++ b/arch/x86/include/asm/pgtable_types.h
+@@ -48,7 +48,7 @@
+ #ifdef CONFIG_KAISER
+ #define _PAGE_GLOBAL (_AT(pteval_t, 0))
+ #else
+-#define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL)
++#define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL)
+ #endif
+ #define _PAGE_SOFTW1 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW1)
+ #define _PAGE_SOFTW2 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW2)
+@@ -123,11 +123,7 @@
+ #define _PAGE_DEVMAP (_AT(pteval_t, 0))
+ #endif
+
+-#ifdef CONFIG_KAISER
+-#define _PAGE_PROTNONE (_AT(pteval_t, 0))
+-#else
+ #define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
+-#endif
+
+ #define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \
+ _PAGE_ACCESSED | _PAGE_DIRTY)
+diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c
+index 9ff875a..560c2fd 100644
+--- a/arch/x86/kernel/espfix_64.c
++++ b/arch/x86/kernel/espfix_64.c
+@@ -127,11 +127,14 @@ void __init init_espfix_bsp(void)
+ /* Install the espfix pud into the kernel page directory */
+ pgd_p = &init_level4_pgt[pgd_index(ESPFIX_BASE_ADDR)];
+ pgd_populate(&init_mm, pgd_p, (pud_t *)espfix_pud_page);
+-#ifdef CONFIG_KAISER
+- // add the esp stack pud to the shadow mapping here.
+- // This can be done directly, because the fixup stack has its own pud
+- set_pgd(native_get_shadow_pgd(pgd_p), __pgd(_PAGE_TABLE | __pa((pud_t *)espfix_pud_page)));
+-#endif
++ /*
++ * Just copy the top-level PGD that is mapping the espfix
++ * area to ensure it is mapped into the shadow user page
++ * tables.
++ */
++ if (IS_ENABLED(CONFIG_KAISER))
++ set_pgd(native_get_shadow_pgd(pgd_p),
++ __pgd(_KERNPG_TABLE | __pa((pud_t *)espfix_pud_page)));
+
+ /* Randomize the locations */
+ init_espfix_random();
+diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
+index 9e849b5..5775379 100644
+--- a/arch/x86/kernel/head_64.S
++++ b/arch/x86/kernel/head_64.S
+@@ -406,11 +406,24 @@ GLOBAL(early_recursion_flag)
+ GLOBAL(name)
+
+ #ifdef CONFIG_KAISER
++/*
++ * Each PGD needs to be 8k long and 8k aligned. We do not
++ * ever go out to userspace with these, so we do not
++ * strictly *need* the second page, but this allows us to
++ * have a single set_pgd() implementation that does not
++ * need to worry about whether it has 4k or 8k to work
++ * with.
++ *
++ * This ensures PGDs are 8k long:
++ */
++#define KAISER_USER_PGD_FILL 512
++/* This ensures they are 8k-aligned: */
+ #define NEXT_PGD_PAGE(name) \
+ .balign 2 * PAGE_SIZE; \
+ GLOBAL(name)
+ #else
+ #define NEXT_PGD_PAGE(name) NEXT_PAGE(name)
++#define KAISER_USER_PGD_FILL 0
+ #endif
+
+ /* Automate the creation of 1 to 1 mapping pmd entries */
+@@ -425,6 +438,7 @@ GLOBAL(name)
+ NEXT_PGD_PAGE(early_level4_pgt)
+ .fill 511,8,0
+ .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
++ .fill KAISER_USER_PGD_FILL,8,0
+
+ NEXT_PAGE(early_dynamic_pgts)
+ .fill 512*EARLY_DYNAMIC_PAGE_TABLES,8,0
+@@ -433,7 +447,8 @@ NEXT_PAGE(early_dynamic_pgts)
+
+ #ifndef CONFIG_XEN
+ NEXT_PGD_PAGE(init_level4_pgt)
+- .fill 2*512,8,0
++ .fill 512,8,0
++ .fill KAISER_USER_PGD_FILL,8,0
+ #else
+ NEXT_PGD_PAGE(init_level4_pgt)
+ .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
+@@ -442,6 +457,7 @@ NEXT_PGD_PAGE(init_level4_pgt)
+ .org init_level4_pgt + L4_START_KERNEL*8, 0
+ /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
+ .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
++ .fill KAISER_USER_PGD_FILL,8,0
+
+ NEXT_PAGE(level3_ident_pgt)
+ .quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
+@@ -452,6 +468,7 @@ NEXT_PAGE(level2_ident_pgt)
+ */
+ PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD)
+ #endif
++ .fill KAISER_USER_PGD_FILL,8,0
+
+ NEXT_PAGE(level3_kernel_pgt)
+ .fill L3_START_KERNEL,8,0
+diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
+index 6707039..3c2d55b 100644
+--- a/arch/x86/kernel/ldt.c
++++ b/arch/x86/kernel/ldt.c
+@@ -17,6 +17,7 @@
+ #include <linux/uaccess.h>
+
+ #include <asm/ldt.h>
++#include <asm/kaiser.h>
+ #include <asm/desc.h>
+ #include <asm/mmu_context.h>
+ #include <asm/syscalls.h>
+@@ -33,11 +34,21 @@ static void flush_ldt(void *current_mm)
+ set_ldt(pc->ldt->entries, pc->ldt->size);
+ }
+
++static void __free_ldt_struct(struct ldt_struct *ldt)
++{
++ if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE)
++ vfree(ldt->entries);
++ else
++ free_page((unsigned long)ldt->entries);
++ kfree(ldt);
++}
++
+ /* The caller must call finalize_ldt_struct on the result. LDT starts zeroed. */
+ static struct ldt_struct *alloc_ldt_struct(int size)
+ {
+ struct ldt_struct *new_ldt;
+ int alloc_size;
++ int ret = 0;
+
+ if (size > LDT_ENTRIES)
+ return NULL;
+@@ -65,6 +76,14 @@ static struct ldt_struct *alloc_ldt_struct(int size)
+ return NULL;
+ }
+
++ // FIXME: make kaiser_add_mapping() return an error code
++ // when it fails
++ kaiser_add_mapping((unsigned long)new_ldt->entries, alloc_size,
++ __PAGE_KERNEL);
++ if (ret) {
++ __free_ldt_struct(new_ldt);
++ return NULL;
++ }
+ new_ldt->size = size;
+ return new_ldt;
+ }
+@@ -91,12 +110,10 @@ static void free_ldt_struct(struct ldt_struct *ldt)
+ if (likely(!ldt))
+ return;
+
++ kaiser_remove_mapping((unsigned long)ldt->entries,
++ ldt->size * LDT_ENTRY_SIZE);
+ paravirt_free_ldt(ldt->entries, ldt->size);
+- if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE)
+- vfree(ldt->entries);
+- else
+- free_page((unsigned long)ldt->entries);
+- kfree(ldt);
++ __free_ldt_struct(ldt);
+ }
+
+ /*
+diff --git a/arch/x86/kernel/tracepoint.c b/arch/x86/kernel/tracepoint.c
+index 1c113db..2bb5ee4 100644
+--- a/arch/x86/kernel/tracepoint.c
++++ b/arch/x86/kernel/tracepoint.c
+@@ -9,10 +9,12 @@
+ #include <linux/atomic.h>
+
+ atomic_t trace_idt_ctr = ATOMIC_INIT(0);
++__aligned(PAGE_SIZE)
+ struct desc_ptr trace_idt_descr = { NR_VECTORS * 16 - 1,
+ (unsigned long) trace_idt_table };
+
+ /* No need to be aligned, but done to keep all IDTs defined the same way. */
++__aligned(PAGE_SIZE)
+ gate_desc trace_idt_table[NR_VECTORS] __page_aligned_bss;
+
+ static int trace_irq_vector_refcount;
+diff --git a/arch/x86/mm/kaiser.c b/arch/x86/mm/kaiser.c
+index cf1bb92..7270a29 100644
+--- a/arch/x86/mm/kaiser.c
++++ b/arch/x86/mm/kaiser.c
+@@ -1,160 +1,305 @@
+-
+-
++#include <linux/bug.h>
+ #include <linux/kernel.h>
+ #include <linux/errno.h>
+ #include <linux/string.h>
+ #include <linux/types.h>
+ #include <linux/bug.h>
+ #include <linux/init.h>
++#include <linux/interrupt.h>
+ #include <linux/spinlock.h>
+ #include <linux/mm.h>
+-
+ #include <linux/uaccess.h>
++
++#include <asm/kaiser.h>
+ #include <asm/pgtable.h>
+ #include <asm/pgalloc.h>
+ #include <asm/desc.h>
+ #ifdef CONFIG_KAISER
+
+ __visible DEFINE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);
++/*
++ * At runtime, the only things we map are some things for CPU
++ * hotplug, and stacks for new processes. No two CPUs will ever
++ * be populating the same addresses, so we only need to ensure
++ * that we protect between two CPUs trying to allocate and
++ * populate the same page table page.
++ *
++ * Only take this lock when doing a set_p[4um]d(), but it is not
++ * needed for doing a set_pte(). We assume that only the *owner*
++ * of a given allocation will be doing this for _their_
++ * allocation.
++ *
++ * This ensures that once a system has been running for a while
++ * and there have been stacks all over and these page tables
++ * are fully populated, there will be no further acquisitions of
++ * this lock.
++ */
++static DEFINE_SPINLOCK(shadow_table_allocation_lock);
+
+-/**
+- * Get the real ppn from a address in kernel mapping.
+- * @param address The virtual adrress
+- * @return the physical address
++/*
++ * Returns -1 on error.
+ */
+-static inline unsigned long get_pa_from_mapping (unsigned long address)
++static inline unsigned long get_pa_from_mapping(unsigned long vaddr)
+ {
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+
+- pgd = pgd_offset_k(address);
+- BUG_ON(pgd_none(*pgd) || pgd_large(*pgd));
+-
+- pud = pud_offset(pgd, address);
+- BUG_ON(pud_none(*pud));
++ pgd = pgd_offset_k(vaddr);
++ /*
++ * We made all the kernel PGDs present in kaiser_init().
++ * We expect them to stay that way.
++ */
++ BUG_ON(pgd_none(*pgd));
++ /*
++ * PGDs are either 512GB or 128TB on all x86_64
++ * configurations. We don't handle these.
++ */
++ BUG_ON(pgd_large(*pgd));
+
+- if (pud_large(*pud)) {
+- return (pud_pfn(*pud) << PAGE_SHIFT) | (address & ~PUD_PAGE_MASK);
++ pud = pud_offset(pgd, vaddr);
++ if (pud_none(*pud)) {
++ WARN_ON_ONCE(1);
++ return -1;
+ }
+
+- pmd = pmd_offset(pud, address);
+- BUG_ON(pmd_none(*pmd));
++ if (pud_large(*pud))
++ return (pud_pfn(*pud) << PAGE_SHIFT) | (vaddr & ~PUD_PAGE_MASK);
+
+- if (pmd_large(*pmd)) {
+- return (pmd_pfn(*pmd) << PAGE_SHIFT) | (address & ~PMD_PAGE_MASK);
++ pmd = pmd_offset(pud, vaddr);
++ if (pmd_none(*pmd)) {
++ WARN_ON_ONCE(1);
++ return -1;
+ }
+
+- pte = pte_offset_kernel(pmd, address);
+- BUG_ON(pte_none(*pte));
++ if (pmd_large(*pmd))
++ return (pmd_pfn(*pmd) << PAGE_SHIFT) | (vaddr & ~PMD_PAGE_MASK);
+
+- return (pte_pfn(*pte) << PAGE_SHIFT) | (address & ~PAGE_MASK);
++ pte = pte_offset_kernel(pmd, vaddr);
++ if (pte_none(*pte)) {
++ WARN_ON_ONCE(1);
++ return -1;
++ }
++
++ return (pte_pfn(*pte) << PAGE_SHIFT) | (vaddr & ~PAGE_MASK);
+ }
+
+-void _kaiser_copy (unsigned long start_addr, unsigned long size,
+- unsigned long flags)
++/*
++ * This is a relatively normal page table walk, except that it
++ * also tries to allocate page tables pages along the way.
++ *
++ * Returns a pointer to a PTE on success, or NULL on failure.
++ */
++static pte_t *kaiser_pagetable_walk(unsigned long address, bool is_atomic)
+ {
+- pgd_t *pgd;
+- pud_t *pud;
+ pmd_t *pmd;
+- pte_t *pte;
+- unsigned long address;
+- unsigned long end_addr = start_addr + size;
+- unsigned long target_address;
++ pud_t *pud;
++ pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(address));
++ gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
+
+- for (address = PAGE_ALIGN(start_addr - (PAGE_SIZE - 1));
+- address < PAGE_ALIGN(end_addr); address += PAGE_SIZE) {
+- target_address = get_pa_from_mapping(address);
++ might_sleep();
++ if (is_atomic) {
++ gfp &= ~GFP_KERNEL;
++ gfp |= __GFP_HIGH | __GFP_ATOMIC;
++ }
+
+- pgd = native_get_shadow_pgd(pgd_offset_k(address));
++ if (pgd_none(*pgd)) {
++ WARN_ONCE(1, "All shadow pgds should have been populated");
++ return NULL;
++ }
++ BUILD_BUG_ON(pgd_large(*pgd) != 0);
+
+- BUG_ON(pgd_none(*pgd) && "All shadow pgds should be mapped at this time\n");
+- BUG_ON(pgd_large(*pgd));
++ pud = pud_offset(pgd, address);
++ /* The shadow page tables do not use large mappings: */
++ if (pud_large(*pud)) {
++ WARN_ON(1);
++ return NULL;
++ }
++ if (pud_none(*pud)) {
++ unsigned long new_pmd_page = __get_free_page(gfp);
++ if (!new_pmd_page)
++ return NULL;
++ spin_lock(&shadow_table_allocation_lock);
++ if (pud_none(*pud))
++ set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page)));
++ else
++ free_page(new_pmd_page);
++ spin_unlock(&shadow_table_allocation_lock);
++ }
+
+- pud = pud_offset(pgd, address);
+- if (pud_none(*pud)) {
+- set_pud(pud, __pud(_PAGE_TABLE | __pa(pmd_alloc_one(0, address))));
+- }
+- BUG_ON(pud_large(*pud));
++ pmd = pmd_offset(pud, address);
++ /* The shadow page tables do not use large mappings: */
++ if (pmd_large(*pmd)) {
++ WARN_ON(1);
++ return NULL;
++ }
++ if (pmd_none(*pmd)) {
++ unsigned long new_pte_page = __get_free_page(gfp);
++ if (!new_pte_page)
++ return NULL;
++ spin_lock(&shadow_table_allocation_lock);
++ if (pmd_none(*pmd))
++ set_pmd(pmd, __pmd(_KERNPG_TABLE | __pa(new_pte_page)));
++ else
++ free_page(new_pte_page);
++ spin_unlock(&shadow_table_allocation_lock);
++ }
+
+- pmd = pmd_offset(pud, address);
+- if (pmd_none(*pmd)) {
+- set_pmd(pmd, __pmd(_PAGE_TABLE | __pa(pte_alloc_one_kernel(0, address))));
+- }
+- BUG_ON(pmd_large(*pmd));
++ return pte_offset_kernel(pmd, address);
++}
+
+- pte = pte_offset_kernel(pmd, address);
++int kaiser_add_user_map(const void *__start_addr, unsigned long size,
++ unsigned long flags)
++{
++ int ret = 0;
++ pte_t *pte;
++ unsigned long start_addr = (unsigned long )__start_addr;
++ unsigned long address = start_addr & PAGE_MASK;
++ unsigned long end_addr = PAGE_ALIGN(start_addr + size);
++ unsigned long target_address;
++
++ for (;address < end_addr; address += PAGE_SIZE) {
++ target_address = get_pa_from_mapping(address);
++ if (target_address == -1) {
++ ret = -EIO;
++ break;
++ }
++ pte = kaiser_pagetable_walk(address, false);
+ if (pte_none(*pte)) {
+ set_pte(pte, __pte(flags | target_address));
+ } else {
+- BUG_ON(__pa(pte_page(*pte)) != target_address);
++ pte_t tmp;
++ set_pte(&tmp, __pte(flags | target_address));
++ WARN_ON_ONCE(!pte_same(*pte, tmp));
+ }
+ }
++ return ret;
++}
++
++static int kaiser_add_user_map_ptrs(const void *start, const void *end, unsigned long flags)
++{
++ unsigned long size = end - start;
++
++ return kaiser_add_user_map(start, size, flags);
+ }
+
+-// at first, add a pmd for every pgd entry in the shadowmem-kernel-part of the kernel mapping
+-static inline void __init _kaiser_init(void)
++/*
++ * Ensure that the top level of the (shadow) page tables are
++ * entirely populated. This ensures that all processes that get
++ * forked have the same entries. This way, we do not have to
++ * ever go set up new entries in older processes.
++ *
++ * Note: we never free these, so there are no updates to them
++ * after this.
++ */
++static void __init kaiser_init_all_pgds(void)
+ {
+ pgd_t *pgd;
+ int i = 0;
+
+ pgd = native_get_shadow_pgd(pgd_offset_k((unsigned long )0));
+ for (i = PTRS_PER_PGD / 2; i < PTRS_PER_PGD; i++) {
+- set_pgd(pgd + i, __pgd(_PAGE_TABLE |__pa(pud_alloc_one(0, 0))));
++ pgd_t new_pgd;
++ pud_t *pud = pud_alloc_one(&init_mm, PAGE_OFFSET + i * PGDIR_SIZE);
++ if (!pud) {
++ WARN_ON(1);
++ break;
++ }
++ new_pgd = __pgd(_KERNPG_TABLE |__pa(pud));
++ /*
++ * Make sure not to stomp on some other pgd entry.
++ */
++ if (!pgd_none(pgd[i])) {
++ WARN_ON(1);
++ continue;
++ }
++ set_pgd(pgd + i, new_pgd);
+ }
+ }
+
++#define kaiser_add_user_map_early(start, size, flags) do { \
++ int __ret = kaiser_add_user_map(start, size, flags); \
++ WARN_ON(__ret); \
++} while (0)
++
++#define kaiser_add_user_map_ptrs_early(start, end, flags) do { \
++ int __ret = kaiser_add_user_map_ptrs(start, end, flags); \
++ WARN_ON(__ret); \
++} while (0)
++
+ extern char __per_cpu_user_mapped_start[], __per_cpu_user_mapped_end[];
+-spinlock_t shadow_table_lock;
++/*
++ * If anything in here fails, we will likely die on one of the
++ * first kernel->user transitions and init will die. But, we
++ * will have most of the kernel up by then and should be able to
++ * get a clean warning out of it. If we BUG_ON() here, we run
++ * the risk of being before we have good console output.
++ */
+ void __init kaiser_init(void)
+ {
+ int cpu;
+- spin_lock_init(&shadow_table_lock);
+-
+- spin_lock(&shadow_table_lock);
+
+- _kaiser_init();
++ kaiser_init_all_pgds();
+
+ for_each_possible_cpu(cpu) {
+- // map the per cpu user variables
+- _kaiser_copy(
+- (unsigned long) (__per_cpu_user_mapped_start + per_cpu_offset(cpu)),
+- (unsigned long) __per_cpu_user_mapped_end - (unsigned long) __per_cpu_user_mapped_start,
+- __PAGE_KERNEL);
++ void *percpu_vaddr = __per_cpu_user_mapped_start +
++ per_cpu_offset(cpu);
++ unsigned long percpu_sz = __per_cpu_user_mapped_end -
++ __per_cpu_user_mapped_start;
++ kaiser_add_user_map_early(percpu_vaddr, percpu_sz,
++ __PAGE_KERNEL);
+ }
+
+- // map the entry/exit text section, which is responsible to switch between user- and kernel mode
+- _kaiser_copy(
+- (unsigned long) __entry_text_start,
+- (unsigned long) __entry_text_end - (unsigned long) __entry_text_start,
+- __PAGE_KERNEL_RX);
++ /*
++ * Map the entry/exit text section, which is needed at
++ * switches from user to and from kernel.
++ */
++ kaiser_add_user_map_ptrs_early(__entry_text_start, __entry_text_end,
++ __PAGE_KERNEL_RX);
+
+- // the fixed map address of the idt_table
+- _kaiser_copy(
+- (unsigned long) idt_descr.address,
+- sizeof(gate_desc) * NR_VECTORS,
+- __PAGE_KERNEL_RO);
+-
+- spin_unlock(&shadow_table_lock);
++#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN)
++ kaiser_add_user_map_ptrs_early(__irqentry_text_start,
++ __irqentry_text_end,
++ __PAGE_KERNEL_RX);
++#endif
++ kaiser_add_user_map_early((void *)idt_descr.address,
++ sizeof(gate_desc) * NR_VECTORS,
++ __PAGE_KERNEL_RO);
++#ifdef CONFIG_TRACING
++ kaiser_add_user_map_early(&trace_idt_descr,
++ sizeof(trace_idt_descr),
++ __PAGE_KERNEL);
++ kaiser_add_user_map_early(&trace_idt_table,
++ sizeof(gate_desc) * NR_VECTORS,
++ __PAGE_KERNEL);
++#endif
++ kaiser_add_user_map_early(&debug_idt_descr, sizeof(debug_idt_descr),
++ __PAGE_KERNEL);
++ kaiser_add_user_map_early(&debug_idt_table,
++ sizeof(gate_desc) * NR_VECTORS,
++ __PAGE_KERNEL);
+ }
+
++extern void unmap_pud_range_nofree(pgd_t *pgd, unsigned long start, unsigned long end);
+ // add a mapping to the shadow-mapping, and synchronize the mappings
+-void kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags)
++int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags)
+ {
+- spin_lock(&shadow_table_lock);
+- _kaiser_copy(addr, size, flags);
+- spin_unlock(&shadow_table_lock);
++ return kaiser_add_user_map((const void *)addr, size, flags);
+ }
+
+-extern void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end);
+ void kaiser_remove_mapping(unsigned long start, unsigned long size)
+ {
+- pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(start));
+- spin_lock(&shadow_table_lock);
+- do {
+- unmap_pud_range(pgd, start, start + size);
+- } while (pgd++ != native_get_shadow_pgd(pgd_offset_k(start + size)));
+- spin_unlock(&shadow_table_lock);
++ unsigned long end = start + size;
++ unsigned long addr;
++
++ for (addr = start; addr < end; addr += PGDIR_SIZE) {
++ pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(addr));
++ /*
++ * unmap_p4d_range() handles > P4D_SIZE unmaps,
++ * so no need to trim 'end'.
++ */
++ unmap_pud_range_nofree(pgd, addr, end);
++ }
+ }
+ #endif /* CONFIG_KAISER */
+diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
+index c17412f..73dcb0e1 100644
+--- a/arch/x86/mm/pageattr.c
++++ b/arch/x86/mm/pageattr.c
+@@ -52,6 +52,7 @@ static DEFINE_SPINLOCK(cpa_lock);
+ #define CPA_FLUSHTLB 1
+ #define CPA_ARRAY 2
+ #define CPA_PAGES_ARRAY 4
++#define CPA_FREE_PAGETABLES 8
+
+ #ifdef CONFIG_PROC_FS
+ static unsigned long direct_pages_count[PG_LEVEL_NUM];
+@@ -729,10 +730,13 @@ static int split_large_page(struct cpa_data *cpa, pte_t *kpte,
+ return 0;
+ }
+
+-static bool try_to_free_pte_page(pte_t *pte)
++static bool try_to_free_pte_page(struct cpa_data *cpa, pte_t *pte)
+ {
+ int i;
+
++ if (!(cpa->flags & CPA_FREE_PAGETABLES))
++ return false;
++
+ for (i = 0; i < PTRS_PER_PTE; i++)
+ if (!pte_none(pte[i]))
+ return false;
+@@ -741,10 +745,13 @@ static bool try_to_free_pte_page(pte_t *pte)
+ return true;
+ }
+
+-static bool try_to_free_pmd_page(pmd_t *pmd)
++static bool try_to_free_pmd_page(struct cpa_data *cpa, pmd_t *pmd)
+ {
+ int i;
+
++ if (!(cpa->flags & CPA_FREE_PAGETABLES))
++ return false;
++
+ for (i = 0; i < PTRS_PER_PMD; i++)
+ if (!pmd_none(pmd[i]))
+ return false;
+@@ -753,7 +760,9 @@ static bool try_to_free_pmd_page(pmd_t *pmd)
+ return true;
+ }
+
+-static bool unmap_pte_range(pmd_t *pmd, unsigned long start, unsigned long end)
++static bool unmap_pte_range(struct cpa_data *cpa, pmd_t *pmd,
++ unsigned long start,
++ unsigned long end)
+ {
+ pte_t *pte = pte_offset_kernel(pmd, start);
+
+@@ -764,22 +773,23 @@ static bool unmap_pte_range(pmd_t *pmd, unsigned long start, unsigned long end)
+ pte++;
+ }
+
+- if (try_to_free_pte_page((pte_t *)pmd_page_vaddr(*pmd))) {
++ if (try_to_free_pte_page(cpa, (pte_t *)pmd_page_vaddr(*pmd))) {
+ pmd_clear(pmd);
+ return true;
+ }
+ return false;
+ }
+
+-static void __unmap_pmd_range(pud_t *pud, pmd_t *pmd,
++static void __unmap_pmd_range(struct cpa_data *cpa, pud_t *pud, pmd_t *pmd,
+ unsigned long start, unsigned long end)
+ {
+- if (unmap_pte_range(pmd, start, end))
+- if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud)))
++ if (unmap_pte_range(cpa, pmd, start, end))
++ if (try_to_free_pmd_page(cpa, (pmd_t *)pud_page_vaddr(*pud)))
+ pud_clear(pud);
+ }
+
+-static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
++static void unmap_pmd_range(struct cpa_data *cpa, pud_t *pud,
++ unsigned long start, unsigned long end)
+ {
+ pmd_t *pmd = pmd_offset(pud, start);
+
+@@ -790,7 +800,7 @@ static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
+ unsigned long next_page = (start + PMD_SIZE) & PMD_MASK;
+ unsigned long pre_end = min_t(unsigned long, end, next_page);
+
+- __unmap_pmd_range(pud, pmd, start, pre_end);
++ __unmap_pmd_range(cpa, pud, pmd, start, pre_end);
+
+ start = pre_end;
+ pmd++;
+@@ -803,7 +813,8 @@ static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
+ if (pmd_large(*pmd))
+ pmd_clear(pmd);
+ else
+- __unmap_pmd_range(pud, pmd, start, start + PMD_SIZE);
++ __unmap_pmd_range(cpa, pud, pmd,
++ start, start + PMD_SIZE);
+
+ start += PMD_SIZE;
+ pmd++;
+@@ -813,17 +824,19 @@ static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
+ * 4K leftovers?
+ */
+ if (start < end)
+- return __unmap_pmd_range(pud, pmd, start, end);
++ return __unmap_pmd_range(cpa, pud, pmd, start, end);
+
+ /*
+ * Try again to free the PMD page if haven't succeeded above.
+ */
+ if (!pud_none(*pud))
+- if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud)))
++ if (try_to_free_pmd_page(cpa, (pmd_t *)pud_page_vaddr(*pud)))
+ pud_clear(pud);
+ }
+
+-void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
++static void __unmap_pud_range(struct cpa_data *cpa, pgd_t *pgd,
++ unsigned long start,
++ unsigned long end)
+ {
+ pud_t *pud = pud_offset(pgd, start);
+
+@@ -834,7 +847,7 @@ void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
+ unsigned long next_page = (start + PUD_SIZE) & PUD_MASK;
+ unsigned long pre_end = min_t(unsigned long, end, next_page);
+
+- unmap_pmd_range(pud, start, pre_end);
++ unmap_pmd_range(cpa, pud, start, pre_end);
+
+ start = pre_end;
+ pud++;
+@@ -848,7 +861,7 @@ void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
+ if (pud_large(*pud))
+ pud_clear(pud);
+ else
+- unmap_pmd_range(pud, start, start + PUD_SIZE);
++ unmap_pmd_range(cpa, pud, start, start + PUD_SIZE);
+
+ start += PUD_SIZE;
+ pud++;
+@@ -858,7 +871,7 @@ void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
+ * 2M leftovers?
+ */
+ if (start < end)
+- unmap_pmd_range(pud, start, end);
++ unmap_pmd_range(cpa, pud, start, end);
+
+ /*
+ * No need to try to free the PUD page because we'll free it in
+@@ -866,6 +879,24 @@ void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
+ */
+ }
+
++static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
++{
++ struct cpa_data cpa = {
++ .flags = CPA_FREE_PAGETABLES,
++ };
++
++ __unmap_pud_range(&cpa, pgd, start, end);
++}
++
++void unmap_pud_range_nofree(pgd_t *pgd, unsigned long start, unsigned long end)
++{
++ struct cpa_data cpa = {
++ .flags = 0,
++ };
++
++ __unmap_pud_range(&cpa, pgd, start, end);
++}
++
+ static int alloc_pte_page(pmd_t *pmd)
+ {
+ pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK);
+diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
+index 27d218b..352fd01 100644
+--- a/arch/x86/mm/pgtable.c
++++ b/arch/x86/mm/pgtable.c
+@@ -344,40 +344,26 @@ static inline void _pgd_free(pgd_t *pgd)
+ kmem_cache_free(pgd_cache, pgd);
+ }
+ #else
+-static inline pgd_t *_pgd_alloc(void)
+-{
++
+ #ifdef CONFIG_KAISER
+- // Instead of one PML4, we aquire two PML4s and, thus, an 8kb-aligned memory
+- // block. Therefore, we have to allocate at least 3 pages. However, the
+- // __get_free_pages returns us 4 pages. Hence, we store the base pointer at
+- // the beginning of the page of our 8kb-aligned memory block in order to
+- // correctly free it afterwars.
+-
+- unsigned long pages = __get_free_pages(PGALLOC_GFP, get_order(4*PAGE_SIZE));
+-
+- if(native_get_normal_pgd((pgd_t*) pages) == (pgd_t*) pages)
+- {
+- *((unsigned long*)(pages + 2 * PAGE_SIZE)) = pages;
+- return (pgd_t *) pages;
+- }
+- else
+- {
+- *((unsigned long*)(pages + 3 * PAGE_SIZE)) = pages;
+- return (pgd_t *) (pages + PAGE_SIZE);
+- }
++/*
++ * Instead of one pmd, we aquire two pmds. Being order-1, it is
++ * both 8k in size and 8k-aligned. That lets us just flip bit 12
++ * in a pointer to swap between the two 4k halves.
++ */
++#define PGD_ALLOCATION_ORDER 1
+ #else
+- return (pgd_t *)__get_free_page(PGALLOC_GFP);
++#define PGD_ALLOCATION_ORDER 0
+ #endif
++
++static inline pgd_t *_pgd_alloc(void)
++{
++ return (pgd_t *)__get_free_pages(PGALLOC_GFP, PGD_ALLOCATION_ORDER);
+ }
+
+ static inline void _pgd_free(pgd_t *pgd)
+ {
+-#ifdef CONFIG_KAISER
+- unsigned long pages = *((unsigned long*) ((char*) pgd + 2 * PAGE_SIZE));
+- free_pages(pages, get_order(4*PAGE_SIZE));
+-#else
+- free_page((unsigned long)pgd);
+-#endif
++ free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER);
+ }
+ #endif /* CONFIG_X86_PAE */
+
+diff --git a/include/linux/kaiser.h b/include/linux/kaiser.h
+new file mode 100644
+index 0000000..9db5433
+--- /dev/null
++++ b/include/linux/kaiser.h
+@@ -0,0 +1,26 @@
++#ifndef _INCLUDE_KAISER_H
++#define _INCLUDE_KAISER_H
++
++#ifdef CONFIG_KAISER
++#include <asm/kaiser.h>
++#else
++
++/*
++ * These stubs are used whenever CONFIG_KAISER is off, which
++ * includes architectures that support KAISER, but have it
++ * disabled.
++ */
++
++static inline void kaiser_init(void)
++{
++}
++static inline void kaiser_remove_mapping(unsigned long start, unsigned long size)
++{
++}
++static inline int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags)
++{
++ return 0;
++}
++
++#endif /* !CONFIG_KAISER */
++#endif /* _INCLUDE_KAISER_H */
+diff --git a/kernel/fork.c b/kernel/fork.c
+index d34394e..8013f22 100644
+--- a/kernel/fork.c
++++ b/kernel/fork.c
+@@ -58,6 +58,7 @@
+ #include <linux/tsacct_kern.h>
+ #include <linux/cn_proc.h>
+ #include <linux/freezer.h>
++#include <linux/kaiser.h>
+ #include <linux/delayacct.h>
+ #include <linux/taskstats_kern.h>
+ #include <linux/random.h>
+@@ -472,7 +473,6 @@ void set_task_stack_end_magic(struct task_struct *tsk)
+ *stackend = STACK_END_MAGIC; /* for overflow detection */
+ }
+
+-extern void kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags);
+ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
+ {
+ struct task_struct *tsk;
+@@ -500,9 +500,10 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
+ * functions again.
+ */
+ tsk->stack = stack;
+-#ifdef CONFIG_KAISER
+- kaiser_add_mapping((unsigned long)tsk->stack, THREAD_SIZE, __PAGE_KERNEL);
+-#endif
++
++ err= kaiser_add_mapping((unsigned long)tsk->stack, THREAD_SIZE, __PAGE_KERNEL);
++ if (err)
++ goto free_stack;
+ #ifdef CONFIG_VMAP_STACK
+ tsk->stack_vm_area = stack_vm_area;
+ #endif
+diff --git a/security/Kconfig b/security/Kconfig
+index f515ac3..334d2e8 100644
+--- a/security/Kconfig
++++ b/security/Kconfig
+@@ -32,12 +32,17 @@ config SECURITY
+ If you are unsure how to answer this question, answer N.
+ config KAISER
+ bool "Remove the kernel mapping in user mode"
++ default y
+ depends on X86_64
+ depends on !PARAVIRT
+ help
+ This enforces a strict kernel and user space isolation in order to close
+ hardware side channels on kernel address information.
+
++config KAISER_REAL_SWITCH
++ bool "KAISER: actually switch page tables"
++ default y
++
+ config SECURITYFS
+ bool "Enable the securityfs filesystem"
+ help
+--
+2.7.4
+