Diffstat (limited to 'common/recipes-kernel/linux/linux-yocto-4.9.21/0005-kaiser-merged-update.patch')
-rw-r--r-- | common/recipes-kernel/linux/linux-yocto-4.9.21/0005-kaiser-merged-update.patch | 1327 |
1 file changed, 0 insertions, 1327 deletions
diff --git a/common/recipes-kernel/linux/linux-yocto-4.9.21/0005-kaiser-merged-update.patch b/common/recipes-kernel/linux/linux-yocto-4.9.21/0005-kaiser-merged-update.patch deleted file mode 100644 index 0a554805..00000000 --- a/common/recipes-kernel/linux/linux-yocto-4.9.21/0005-kaiser-merged-update.patch +++ /dev/null @@ -1,1327 +0,0 @@ -From 63e6d8f6f8a48f02da9fbd55819b1154efad82ba Mon Sep 17 00:00:00 2001 -From: Dave Hansen <dave.hansen@linux.intel.com> -Date: Wed, 30 Aug 2017 16:23:00 -0700 -Subject: [PATCH 005/103] kaiser: merged update - -Merged fixes and cleanups, rebased to 4.9.51 tree (no 5-level paging). - -Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com> -Signed-off-by: Hugh Dickins <hughd@google.com> -Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> ---- - arch/x86/entry/entry_64.S | 105 ++++++++++-- - arch/x86/include/asm/kaiser.h | 43 +++-- - arch/x86/include/asm/pgtable.h | 18 +- - arch/x86/include/asm/pgtable_64.h | 48 +++++- - arch/x86/include/asm/pgtable_types.h | 6 +- - arch/x86/kernel/espfix_64.c | 13 +- - arch/x86/kernel/head_64.S | 19 ++- - arch/x86/kernel/ldt.c | 27 ++- - arch/x86/kernel/tracepoint.c | 2 + - arch/x86/mm/kaiser.c | 313 +++++++++++++++++++++++++---------- - arch/x86/mm/pageattr.c | 63 +++++-- - arch/x86/mm/pgtable.c | 40 ++--- - include/linux/kaiser.h | 26 +++ - kernel/fork.c | 9 +- - security/Kconfig | 5 + - 15 files changed, 549 insertions(+), 188 deletions(-) - create mode 100644 include/linux/kaiser.h - -diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S -index 6c880dc..d84e3a7 100644 ---- a/arch/x86/entry/entry_64.S -+++ b/arch/x86/entry/entry_64.S -@@ -230,6 +230,13 @@ entry_SYSCALL_64_fastpath: - movq RIP(%rsp), %rcx - movq EFLAGS(%rsp), %r11 - RESTORE_C_REGS_EXCEPT_RCX_R11 -+ /* -+ * This opens a window where we have a user CR3, but are -+ * running in the kernel. This makes using the CS -+ * register useless for telling whether or not we need to -+ * switch CR3 in NMIs. Normal interrupts are OK because -+ * they are off here. -+ */ - SWITCH_USER_CR3 - movq RSP(%rsp), %rsp - USERGS_SYSRET64 -@@ -326,11 +333,25 @@ return_from_SYSCALL_64: - syscall_return_via_sysret: - /* rcx and r11 are already restored (see code above) */ - RESTORE_C_REGS_EXCEPT_RCX_R11 -+ /* -+ * This opens a window where we have a user CR3, but are -+ * running in the kernel. This makes using the CS -+ * register useless for telling whether or not we need to -+ * switch CR3 in NMIs. Normal interrupts are OK because -+ * they are off here. -+ */ - SWITCH_USER_CR3 - movq RSP(%rsp), %rsp - USERGS_SYSRET64 - - opportunistic_sysret_failed: -+ /* -+ * This opens a window where we have a user CR3, but are -+ * running in the kernel. This makes using the CS -+ * register useless for telling whether or not we need to -+ * switch CR3 in NMIs. Normal interrupts are OK because -+ * they are off here. -+ */ - SWITCH_USER_CR3 - SWAPGS - jmp restore_c_regs_and_iret -@@ -1087,6 +1108,13 @@ ENTRY(error_entry) - cld - SAVE_C_REGS 8 - SAVE_EXTRA_REGS 8 -+ /* -+ * error_entry() always returns with a kernel gsbase and -+ * CR3. We must also have a kernel CR3/gsbase before -+ * calling TRACE_IRQS_*. Just unconditionally switch to -+ * the kernel CR3 here. -+ */ -+ SWITCH_KERNEL_CR3 - xorl %ebx, %ebx - testb $3, CS+8(%rsp) - jz .Lerror_kernelspace -@@ -1096,7 +1124,6 @@ ENTRY(error_entry) - * from user mode due to an IRET fault. 
- */ - SWAPGS -- SWITCH_KERNEL_CR3 - - .Lerror_entry_from_usermode_after_swapgs: - /* -@@ -1148,7 +1175,6 @@ ENTRY(error_entry) - * Switch to kernel gsbase: - */ - SWAPGS -- SWITCH_KERNEL_CR3 - - /* - * Pretend that the exception came from user mode: set up pt_regs -@@ -1247,7 +1273,10 @@ ENTRY(nmi) - */ - - SWAPGS_UNSAFE_STACK -- SWITCH_KERNEL_CR3_NO_STACK -+ /* -+ * percpu variables are mapped with user CR3, so no need -+ * to switch CR3 here. -+ */ - cld - movq %rsp, %rdx - movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp -@@ -1281,14 +1310,33 @@ ENTRY(nmi) - - movq %rsp, %rdi - movq $-1, %rsi -+#ifdef CONFIG_KAISER -+ /* Unconditionally use kernel CR3 for do_nmi() */ -+ /* %rax is saved above, so OK to clobber here */ -+ movq %cr3, %rax -+ pushq %rax -+#ifdef CONFIG_KAISER_REAL_SWITCH -+ andq $(~0x1000), %rax -+#endif -+ movq %rax, %cr3 -+#endif - call do_nmi -+ /* -+ * Unconditionally restore CR3. I know we return to -+ * kernel code that needs user CR3, but do we ever return -+ * to "user mode" where we need the kernel CR3? -+ */ -+#ifdef CONFIG_KAISER -+ popq %rax -+ mov %rax, %cr3 -+#endif - - /* - * Return back to user mode. We must *not* do the normal exit -- * work, because we don't want to enable interrupts. Fortunately, -- * do_nmi doesn't modify pt_regs. -+ * work, because we don't want to enable interrupts. Do not -+ * switch to user CR3: we might be going back to kernel code -+ * that had a user CR3 set. - */ -- SWITCH_USER_CR3 - SWAPGS - jmp restore_c_regs_and_iret - -@@ -1484,23 +1532,54 @@ end_repeat_nmi: - ALLOC_PT_GPREGS_ON_STACK - - /* -- * Use paranoid_entry to handle SWAPGS, but no need to use paranoid_exit -- * as we should not be calling schedule in NMI context. -- * Even with normal interrupts enabled. An NMI should not be -- * setting NEED_RESCHED or anything that normal interrupts and -- * exceptions might do. -+ * Use the same approach as paranoid_entry to handle SWAPGS, but -+ * without CR3 handling since we do that differently in NMIs. No -+ * need to use paranoid_exit as we should not be calling schedule -+ * in NMI context. Even with normal interrupts enabled. An NMI -+ * should not be setting NEED_RESCHED or anything that normal -+ * interrupts and exceptions might do. - */ -- call paranoid_entry -+ cld -+ SAVE_C_REGS -+ SAVE_EXTRA_REGS -+ movl $1, %ebx -+ movl $MSR_GS_BASE, %ecx -+ rdmsr -+ testl %edx, %edx -+ js 1f /* negative -> in kernel */ -+ SWAPGS -+ xorl %ebx, %ebx -+1: -+#ifdef CONFIG_KAISER -+ /* Unconditionally use kernel CR3 for do_nmi() */ -+ /* %rax is saved above, so OK to clobber here */ -+ movq %cr3, %rax -+ pushq %rax -+#ifdef CONFIG_KAISER_REAL_SWITCH -+ andq $(~0x1000), %rax -+#endif -+ movq %rax, %cr3 -+#endif - - /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */ - movq %rsp, %rdi -+ addq $8, %rdi /* point %rdi at ptregs, fixed up for CR3 */ - movq $-1, %rsi - call do_nmi -+ /* -+ * Unconditionally restore CR3. We might be returning to -+ * kernel code that needs user CR3, like just just before -+ * a sysret. -+ */ -+#ifdef CONFIG_KAISER -+ popq %rax -+ mov %rax, %cr3 -+#endif - - testl %ebx, %ebx /* swapgs needed? 
*/ - jnz nmi_restore - nmi_swapgs: -- SWITCH_USER_CR3_NO_STACK -+ /* We fixed up CR3 above, so no need to switch it here */ - SWAPGS_UNSAFE_STACK - nmi_restore: - RESTORE_EXTRA_REGS -diff --git a/arch/x86/include/asm/kaiser.h b/arch/x86/include/asm/kaiser.h -index 63ee830..0703f48 100644 ---- a/arch/x86/include/asm/kaiser.h -+++ b/arch/x86/include/asm/kaiser.h -@@ -16,13 +16,17 @@ - - .macro _SWITCH_TO_KERNEL_CR3 reg - movq %cr3, \reg -+#ifdef CONFIG_KAISER_REAL_SWITCH - andq $(~0x1000), \reg -+#endif - movq \reg, %cr3 - .endm - - .macro _SWITCH_TO_USER_CR3 reg - movq %cr3, \reg -+#ifdef CONFIG_KAISER_REAL_SWITCH - orq $(0x1000), \reg -+#endif - movq \reg, %cr3 - .endm - -@@ -65,48 +69,53 @@ movq PER_CPU_VAR(unsafe_stack_register_backup), %rax - .endm - - #endif /* CONFIG_KAISER */ -+ - #else /* __ASSEMBLY__ */ - - - #ifdef CONFIG_KAISER --// Upon kernel/user mode switch, it may happen that --// the address space has to be switched before the registers have been stored. --// To change the address space, another register is needed. --// A register therefore has to be stored/restored. --// --DECLARE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup); -+/* -+ * Upon kernel/user mode switch, it may happen that the address -+ * space has to be switched before the registers have been -+ * stored. To change the address space, another register is -+ * needed. A register therefore has to be stored/restored. -+*/ - --#endif /* CONFIG_KAISER */ -+DECLARE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup); - - /** -- * shadowmem_add_mapping - map a virtual memory part to the shadow mapping -+ * kaiser_add_mapping - map a virtual memory part to the shadow (user) mapping - * @addr: the start address of the range - * @size: the size of the range - * @flags: The mapping flags of the pages - * -- * the mapping is done on a global scope, so no bigger synchronization has to be done. -- * the pages have to be manually unmapped again when they are not needed any longer. -+ * The mapping is done on a global scope, so no bigger -+ * synchronization has to be done. the pages have to be -+ * manually unmapped again when they are not needed any longer. - */ --extern void kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags); -+extern int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags); - - - /** -- * shadowmem_remove_mapping - unmap a virtual memory part of the shadow mapping -+ * kaiser_remove_mapping - unmap a virtual memory part of the shadow mapping - * @addr: the start address of the range - * @size: the size of the range - */ - extern void kaiser_remove_mapping(unsigned long start, unsigned long size); - - /** -- * shadowmem_initialize_mapping - Initalize the shadow mapping -+ * kaiser_initialize_mapping - Initalize the shadow mapping - * -- * most parts of the shadow mapping can be mapped upon boot time. -- * only the thread stacks have to be mapped on runtime. -- * the mapped regions are not unmapped at all. -+ * Most parts of the shadow mapping can be mapped upon boot -+ * time. Only per-process things like the thread stacks -+ * or a new LDT have to be mapped at runtime. These boot- -+ * time mappings are permanent and nevertunmapped. 
- */ - extern void kaiser_init(void); - --#endif -+#endif /* CONFIG_KAISER */ -+ -+#endif /* __ASSEMBLY */ - - - -diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h -index 4b479c9..1cee98e 100644 ---- a/arch/x86/include/asm/pgtable.h -+++ b/arch/x86/include/asm/pgtable.h -@@ -690,7 +690,17 @@ static inline pud_t *pud_offset(pgd_t *pgd, unsigned long address) - - static inline int pgd_bad(pgd_t pgd) - { -- return (pgd_flags(pgd) & ~_PAGE_USER) != _KERNPG_TABLE; -+ pgdval_t ignore_flags = _PAGE_USER; -+ /* -+ * We set NX on KAISER pgds that map userspace memory so -+ * that userspace can not meaningfully use the kernel -+ * page table by accident; it will fault on the first -+ * instruction it tries to run. See native_set_pgd(). -+ */ -+ if (IS_ENABLED(CONFIG_KAISER)) -+ ignore_flags |= _PAGE_NX; -+ -+ return (pgd_flags(pgd) & ~ignore_flags) != _KERNPG_TABLE; - } - - static inline int pgd_none(pgd_t pgd) -@@ -905,8 +915,10 @@ static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count) - { - memcpy(dst, src, count * sizeof(pgd_t)); - #ifdef CONFIG_KAISER -- // clone the shadow pgd part as well -- memcpy(native_get_shadow_pgd(dst), native_get_shadow_pgd(src), count * sizeof(pgd_t)); -+ /* Clone the shadow pgd part as well */ -+ memcpy(native_get_shadow_pgd(dst), -+ native_get_shadow_pgd(src), -+ count * sizeof(pgd_t)); - #endif - } - -diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h -index e6ea39f..000265c 100644 ---- a/arch/x86/include/asm/pgtable_64.h -+++ b/arch/x86/include/asm/pgtable_64.h -@@ -107,26 +107,58 @@ static inline void native_pud_clear(pud_t *pud) - } - - #ifdef CONFIG_KAISER --static inline pgd_t * native_get_shadow_pgd(pgd_t *pgdp) { -+static inline pgd_t * native_get_shadow_pgd(pgd_t *pgdp) -+{ - return (pgd_t *)(void*)((unsigned long)(void*)pgdp | (unsigned long)PAGE_SIZE); - } - --static inline pgd_t * native_get_normal_pgd(pgd_t *pgdp) { -+static inline pgd_t * native_get_normal_pgd(pgd_t *pgdp) -+{ - return (pgd_t *)(void*)((unsigned long)(void*)pgdp & ~(unsigned long)PAGE_SIZE); - } -+#else -+static inline pgd_t * native_get_shadow_pgd(pgd_t *pgdp) -+{ -+ BUILD_BUG_ON(1); -+ return NULL; -+} -+static inline pgd_t * native_get_normal_pgd(pgd_t *pgdp) -+{ -+ return pgdp; -+} - #endif /* CONFIG_KAISER */ - -+/* -+ * Page table pages are page-aligned. The lower half of the top -+ * level is used for userspace and the top half for the kernel. -+ * This returns true for user pages that need to get copied into -+ * both the user and kernel copies of the page tables, and false -+ * for kernel pages that should only be in the kernel copy. -+ */ -+static inline bool is_userspace_pgd(void *__ptr) -+{ -+ unsigned long ptr = (unsigned long)__ptr; -+ -+ return ((ptr % PAGE_SIZE) < (PAGE_SIZE / 2)); -+} -+ - static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd) - { - #ifdef CONFIG_KAISER -- // We know that a pgd is page aligned. -- // Therefore the lower indices have to be mapped to user space. -- // These pages are mapped to the shadow mapping. -- if ((((unsigned long)pgdp) % PAGE_SIZE) < (PAGE_SIZE / 2)) { -+ pteval_t extra_kern_pgd_flags = 0; -+ /* Do we need to also populate the shadow pgd? */ -+ if (is_userspace_pgd(pgdp)) { - native_get_shadow_pgd(pgdp)->pgd = pgd.pgd; -+ /* -+ * Even if the entry is *mapping* userspace, ensure -+ * that userspace can not use it. This way, if we -+ * get out to userspace running on the kernel CR3, -+ * userspace will crash instead of running. 
-+ */ -+ extra_kern_pgd_flags = _PAGE_NX; - } -- -- pgdp->pgd = pgd.pgd & ~_PAGE_USER; -+ pgdp->pgd = pgd.pgd; -+ pgdp->pgd |= extra_kern_pgd_flags; - #else /* CONFIG_KAISER */ - *pgdp = pgd; - #endif -diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h -index 00fecbb..8bc8d02 100644 ---- a/arch/x86/include/asm/pgtable_types.h -+++ b/arch/x86/include/asm/pgtable_types.h -@@ -48,7 +48,7 @@ - #ifdef CONFIG_KAISER - #define _PAGE_GLOBAL (_AT(pteval_t, 0)) - #else --#define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL) -+#define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL) - #endif - #define _PAGE_SOFTW1 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW1) - #define _PAGE_SOFTW2 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW2) -@@ -123,11 +123,7 @@ - #define _PAGE_DEVMAP (_AT(pteval_t, 0)) - #endif - --#ifdef CONFIG_KAISER --#define _PAGE_PROTNONE (_AT(pteval_t, 0)) --#else - #define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE) --#endif - - #define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \ - _PAGE_ACCESSED | _PAGE_DIRTY) -diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c -index 9ff875a..560c2fd 100644 ---- a/arch/x86/kernel/espfix_64.c -+++ b/arch/x86/kernel/espfix_64.c -@@ -127,11 +127,14 @@ void __init init_espfix_bsp(void) - /* Install the espfix pud into the kernel page directory */ - pgd_p = &init_level4_pgt[pgd_index(ESPFIX_BASE_ADDR)]; - pgd_populate(&init_mm, pgd_p, (pud_t *)espfix_pud_page); --#ifdef CONFIG_KAISER -- // add the esp stack pud to the shadow mapping here. -- // This can be done directly, because the fixup stack has its own pud -- set_pgd(native_get_shadow_pgd(pgd_p), __pgd(_PAGE_TABLE | __pa((pud_t *)espfix_pud_page))); --#endif -+ /* -+ * Just copy the top-level PGD that is mapping the espfix -+ * area to ensure it is mapped into the shadow user page -+ * tables. -+ */ -+ if (IS_ENABLED(CONFIG_KAISER)) -+ set_pgd(native_get_shadow_pgd(pgd_p), -+ __pgd(_KERNPG_TABLE | __pa((pud_t *)espfix_pud_page))); - - /* Randomize the locations */ - init_espfix_random(); -diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S -index 9e849b5..5775379 100644 ---- a/arch/x86/kernel/head_64.S -+++ b/arch/x86/kernel/head_64.S -@@ -406,11 +406,24 @@ GLOBAL(early_recursion_flag) - GLOBAL(name) - - #ifdef CONFIG_KAISER -+/* -+ * Each PGD needs to be 8k long and 8k aligned. We do not -+ * ever go out to userspace with these, so we do not -+ * strictly *need* the second page, but this allows us to -+ * have a single set_pgd() implementation that does not -+ * need to worry about whether it has 4k or 8k to work -+ * with. 
-+ * -+ * This ensures PGDs are 8k long: -+ */ -+#define KAISER_USER_PGD_FILL 512 -+/* This ensures they are 8k-aligned: */ - #define NEXT_PGD_PAGE(name) \ - .balign 2 * PAGE_SIZE; \ - GLOBAL(name) - #else - #define NEXT_PGD_PAGE(name) NEXT_PAGE(name) -+#define KAISER_USER_PGD_FILL 0 - #endif - - /* Automate the creation of 1 to 1 mapping pmd entries */ -@@ -425,6 +438,7 @@ GLOBAL(name) - NEXT_PGD_PAGE(early_level4_pgt) - .fill 511,8,0 - .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE -+ .fill KAISER_USER_PGD_FILL,8,0 - - NEXT_PAGE(early_dynamic_pgts) - .fill 512*EARLY_DYNAMIC_PAGE_TABLES,8,0 -@@ -433,7 +447,8 @@ NEXT_PAGE(early_dynamic_pgts) - - #ifndef CONFIG_XEN - NEXT_PGD_PAGE(init_level4_pgt) -- .fill 2*512,8,0 -+ .fill 512,8,0 -+ .fill KAISER_USER_PGD_FILL,8,0 - #else - NEXT_PGD_PAGE(init_level4_pgt) - .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE -@@ -442,6 +457,7 @@ NEXT_PGD_PAGE(init_level4_pgt) - .org init_level4_pgt + L4_START_KERNEL*8, 0 - /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ - .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE -+ .fill KAISER_USER_PGD_FILL,8,0 - - NEXT_PAGE(level3_ident_pgt) - .quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE -@@ -452,6 +468,7 @@ NEXT_PAGE(level2_ident_pgt) - */ - PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD) - #endif -+ .fill KAISER_USER_PGD_FILL,8,0 - - NEXT_PAGE(level3_kernel_pgt) - .fill L3_START_KERNEL,8,0 -diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c -index 6707039..3c2d55b 100644 ---- a/arch/x86/kernel/ldt.c -+++ b/arch/x86/kernel/ldt.c -@@ -17,6 +17,7 @@ - #include <linux/uaccess.h> - - #include <asm/ldt.h> -+#include <asm/kaiser.h> - #include <asm/desc.h> - #include <asm/mmu_context.h> - #include <asm/syscalls.h> -@@ -33,11 +34,21 @@ static void flush_ldt(void *current_mm) - set_ldt(pc->ldt->entries, pc->ldt->size); - } - -+static void __free_ldt_struct(struct ldt_struct *ldt) -+{ -+ if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE) -+ vfree(ldt->entries); -+ else -+ free_page((unsigned long)ldt->entries); -+ kfree(ldt); -+} -+ - /* The caller must call finalize_ldt_struct on the result. LDT starts zeroed. 
*/ - static struct ldt_struct *alloc_ldt_struct(int size) - { - struct ldt_struct *new_ldt; - int alloc_size; -+ int ret = 0; - - if (size > LDT_ENTRIES) - return NULL; -@@ -65,6 +76,14 @@ static struct ldt_struct *alloc_ldt_struct(int size) - return NULL; - } - -+ // FIXME: make kaiser_add_mapping() return an error code -+ // when it fails -+ kaiser_add_mapping((unsigned long)new_ldt->entries, alloc_size, -+ __PAGE_KERNEL); -+ if (ret) { -+ __free_ldt_struct(new_ldt); -+ return NULL; -+ } - new_ldt->size = size; - return new_ldt; - } -@@ -91,12 +110,10 @@ static void free_ldt_struct(struct ldt_struct *ldt) - if (likely(!ldt)) - return; - -+ kaiser_remove_mapping((unsigned long)ldt->entries, -+ ldt->size * LDT_ENTRY_SIZE); - paravirt_free_ldt(ldt->entries, ldt->size); -- if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE) -- vfree(ldt->entries); -- else -- free_page((unsigned long)ldt->entries); -- kfree(ldt); -+ __free_ldt_struct(ldt); - } - - /* -diff --git a/arch/x86/kernel/tracepoint.c b/arch/x86/kernel/tracepoint.c -index 1c113db..2bb5ee4 100644 ---- a/arch/x86/kernel/tracepoint.c -+++ b/arch/x86/kernel/tracepoint.c -@@ -9,10 +9,12 @@ - #include <linux/atomic.h> - - atomic_t trace_idt_ctr = ATOMIC_INIT(0); -+__aligned(PAGE_SIZE) - struct desc_ptr trace_idt_descr = { NR_VECTORS * 16 - 1, - (unsigned long) trace_idt_table }; - - /* No need to be aligned, but done to keep all IDTs defined the same way. */ -+__aligned(PAGE_SIZE) - gate_desc trace_idt_table[NR_VECTORS] __page_aligned_bss; - - static int trace_irq_vector_refcount; -diff --git a/arch/x86/mm/kaiser.c b/arch/x86/mm/kaiser.c -index cf1bb92..7270a29 100644 ---- a/arch/x86/mm/kaiser.c -+++ b/arch/x86/mm/kaiser.c -@@ -1,160 +1,305 @@ -- -- -+#include <linux/bug.h> - #include <linux/kernel.h> - #include <linux/errno.h> - #include <linux/string.h> - #include <linux/types.h> - #include <linux/bug.h> - #include <linux/init.h> -+#include <linux/interrupt.h> - #include <linux/spinlock.h> - #include <linux/mm.h> -- - #include <linux/uaccess.h> -+ -+#include <asm/kaiser.h> - #include <asm/pgtable.h> - #include <asm/pgalloc.h> - #include <asm/desc.h> - #ifdef CONFIG_KAISER - - __visible DEFINE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup); -+/* -+ * At runtime, the only things we map are some things for CPU -+ * hotplug, and stacks for new processes. No two CPUs will ever -+ * be populating the same addresses, so we only need to ensure -+ * that we protect between two CPUs trying to allocate and -+ * populate the same page table page. -+ * -+ * Only take this lock when doing a set_p[4um]d(), but it is not -+ * needed for doing a set_pte(). We assume that only the *owner* -+ * of a given allocation will be doing this for _their_ -+ * allocation. -+ * -+ * This ensures that once a system has been running for a while -+ * and there have been stacks all over and these page tables -+ * are fully populated, there will be no further acquisitions of -+ * this lock. -+ */ -+static DEFINE_SPINLOCK(shadow_table_allocation_lock); - --/** -- * Get the real ppn from a address in kernel mapping. -- * @param address The virtual adrress -- * @return the physical address -+/* -+ * Returns -1 on error. 
- */ --static inline unsigned long get_pa_from_mapping (unsigned long address) -+static inline unsigned long get_pa_from_mapping(unsigned long vaddr) - { - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - -- pgd = pgd_offset_k(address); -- BUG_ON(pgd_none(*pgd) || pgd_large(*pgd)); -- -- pud = pud_offset(pgd, address); -- BUG_ON(pud_none(*pud)); -+ pgd = pgd_offset_k(vaddr); -+ /* -+ * We made all the kernel PGDs present in kaiser_init(). -+ * We expect them to stay that way. -+ */ -+ BUG_ON(pgd_none(*pgd)); -+ /* -+ * PGDs are either 512GB or 128TB on all x86_64 -+ * configurations. We don't handle these. -+ */ -+ BUG_ON(pgd_large(*pgd)); - -- if (pud_large(*pud)) { -- return (pud_pfn(*pud) << PAGE_SHIFT) | (address & ~PUD_PAGE_MASK); -+ pud = pud_offset(pgd, vaddr); -+ if (pud_none(*pud)) { -+ WARN_ON_ONCE(1); -+ return -1; - } - -- pmd = pmd_offset(pud, address); -- BUG_ON(pmd_none(*pmd)); -+ if (pud_large(*pud)) -+ return (pud_pfn(*pud) << PAGE_SHIFT) | (vaddr & ~PUD_PAGE_MASK); - -- if (pmd_large(*pmd)) { -- return (pmd_pfn(*pmd) << PAGE_SHIFT) | (address & ~PMD_PAGE_MASK); -+ pmd = pmd_offset(pud, vaddr); -+ if (pmd_none(*pmd)) { -+ WARN_ON_ONCE(1); -+ return -1; - } - -- pte = pte_offset_kernel(pmd, address); -- BUG_ON(pte_none(*pte)); -+ if (pmd_large(*pmd)) -+ return (pmd_pfn(*pmd) << PAGE_SHIFT) | (vaddr & ~PMD_PAGE_MASK); - -- return (pte_pfn(*pte) << PAGE_SHIFT) | (address & ~PAGE_MASK); -+ pte = pte_offset_kernel(pmd, vaddr); -+ if (pte_none(*pte)) { -+ WARN_ON_ONCE(1); -+ return -1; -+ } -+ -+ return (pte_pfn(*pte) << PAGE_SHIFT) | (vaddr & ~PAGE_MASK); - } - --void _kaiser_copy (unsigned long start_addr, unsigned long size, -- unsigned long flags) -+/* -+ * This is a relatively normal page table walk, except that it -+ * also tries to allocate page tables pages along the way. -+ * -+ * Returns a pointer to a PTE on success, or NULL on failure. 
-+ */ -+static pte_t *kaiser_pagetable_walk(unsigned long address, bool is_atomic) - { -- pgd_t *pgd; -- pud_t *pud; - pmd_t *pmd; -- pte_t *pte; -- unsigned long address; -- unsigned long end_addr = start_addr + size; -- unsigned long target_address; -+ pud_t *pud; -+ pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(address)); -+ gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO); - -- for (address = PAGE_ALIGN(start_addr - (PAGE_SIZE - 1)); -- address < PAGE_ALIGN(end_addr); address += PAGE_SIZE) { -- target_address = get_pa_from_mapping(address); -+ might_sleep(); -+ if (is_atomic) { -+ gfp &= ~GFP_KERNEL; -+ gfp |= __GFP_HIGH | __GFP_ATOMIC; -+ } - -- pgd = native_get_shadow_pgd(pgd_offset_k(address)); -+ if (pgd_none(*pgd)) { -+ WARN_ONCE(1, "All shadow pgds should have been populated"); -+ return NULL; -+ } -+ BUILD_BUG_ON(pgd_large(*pgd) != 0); - -- BUG_ON(pgd_none(*pgd) && "All shadow pgds should be mapped at this time\n"); -- BUG_ON(pgd_large(*pgd)); -+ pud = pud_offset(pgd, address); -+ /* The shadow page tables do not use large mappings: */ -+ if (pud_large(*pud)) { -+ WARN_ON(1); -+ return NULL; -+ } -+ if (pud_none(*pud)) { -+ unsigned long new_pmd_page = __get_free_page(gfp); -+ if (!new_pmd_page) -+ return NULL; -+ spin_lock(&shadow_table_allocation_lock); -+ if (pud_none(*pud)) -+ set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page))); -+ else -+ free_page(new_pmd_page); -+ spin_unlock(&shadow_table_allocation_lock); -+ } - -- pud = pud_offset(pgd, address); -- if (pud_none(*pud)) { -- set_pud(pud, __pud(_PAGE_TABLE | __pa(pmd_alloc_one(0, address)))); -- } -- BUG_ON(pud_large(*pud)); -+ pmd = pmd_offset(pud, address); -+ /* The shadow page tables do not use large mappings: */ -+ if (pmd_large(*pmd)) { -+ WARN_ON(1); -+ return NULL; -+ } -+ if (pmd_none(*pmd)) { -+ unsigned long new_pte_page = __get_free_page(gfp); -+ if (!new_pte_page) -+ return NULL; -+ spin_lock(&shadow_table_allocation_lock); -+ if (pmd_none(*pmd)) -+ set_pmd(pmd, __pmd(_KERNPG_TABLE | __pa(new_pte_page))); -+ else -+ free_page(new_pte_page); -+ spin_unlock(&shadow_table_allocation_lock); -+ } - -- pmd = pmd_offset(pud, address); -- if (pmd_none(*pmd)) { -- set_pmd(pmd, __pmd(_PAGE_TABLE | __pa(pte_alloc_one_kernel(0, address)))); -- } -- BUG_ON(pmd_large(*pmd)); -+ return pte_offset_kernel(pmd, address); -+} - -- pte = pte_offset_kernel(pmd, address); -+int kaiser_add_user_map(const void *__start_addr, unsigned long size, -+ unsigned long flags) -+{ -+ int ret = 0; -+ pte_t *pte; -+ unsigned long start_addr = (unsigned long )__start_addr; -+ unsigned long address = start_addr & PAGE_MASK; -+ unsigned long end_addr = PAGE_ALIGN(start_addr + size); -+ unsigned long target_address; -+ -+ for (;address < end_addr; address += PAGE_SIZE) { -+ target_address = get_pa_from_mapping(address); -+ if (target_address == -1) { -+ ret = -EIO; -+ break; -+ } -+ pte = kaiser_pagetable_walk(address, false); - if (pte_none(*pte)) { - set_pte(pte, __pte(flags | target_address)); - } else { -- BUG_ON(__pa(pte_page(*pte)) != target_address); -+ pte_t tmp; -+ set_pte(&tmp, __pte(flags | target_address)); -+ WARN_ON_ONCE(!pte_same(*pte, tmp)); - } - } -+ return ret; -+} -+ -+static int kaiser_add_user_map_ptrs(const void *start, const void *end, unsigned long flags) -+{ -+ unsigned long size = end - start; -+ -+ return kaiser_add_user_map(start, size, flags); - } - --// at first, add a pmd for every pgd entry in the shadowmem-kernel-part of the kernel mapping --static inline void __init _kaiser_init(void) -+/* -+ * Ensure 
that the top level of the (shadow) page tables are -+ * entirely populated. This ensures that all processes that get -+ * forked have the same entries. This way, we do not have to -+ * ever go set up new entries in older processes. -+ * -+ * Note: we never free these, so there are no updates to them -+ * after this. -+ */ -+static void __init kaiser_init_all_pgds(void) - { - pgd_t *pgd; - int i = 0; - - pgd = native_get_shadow_pgd(pgd_offset_k((unsigned long )0)); - for (i = PTRS_PER_PGD / 2; i < PTRS_PER_PGD; i++) { -- set_pgd(pgd + i, __pgd(_PAGE_TABLE |__pa(pud_alloc_one(0, 0)))); -+ pgd_t new_pgd; -+ pud_t *pud = pud_alloc_one(&init_mm, PAGE_OFFSET + i * PGDIR_SIZE); -+ if (!pud) { -+ WARN_ON(1); -+ break; -+ } -+ new_pgd = __pgd(_KERNPG_TABLE |__pa(pud)); -+ /* -+ * Make sure not to stomp on some other pgd entry. -+ */ -+ if (!pgd_none(pgd[i])) { -+ WARN_ON(1); -+ continue; -+ } -+ set_pgd(pgd + i, new_pgd); - } - } - -+#define kaiser_add_user_map_early(start, size, flags) do { \ -+ int __ret = kaiser_add_user_map(start, size, flags); \ -+ WARN_ON(__ret); \ -+} while (0) -+ -+#define kaiser_add_user_map_ptrs_early(start, end, flags) do { \ -+ int __ret = kaiser_add_user_map_ptrs(start, end, flags); \ -+ WARN_ON(__ret); \ -+} while (0) -+ - extern char __per_cpu_user_mapped_start[], __per_cpu_user_mapped_end[]; --spinlock_t shadow_table_lock; -+/* -+ * If anything in here fails, we will likely die on one of the -+ * first kernel->user transitions and init will die. But, we -+ * will have most of the kernel up by then and should be able to -+ * get a clean warning out of it. If we BUG_ON() here, we run -+ * the risk of being before we have good console output. -+ */ - void __init kaiser_init(void) - { - int cpu; -- spin_lock_init(&shadow_table_lock); -- -- spin_lock(&shadow_table_lock); - -- _kaiser_init(); -+ kaiser_init_all_pgds(); - - for_each_possible_cpu(cpu) { -- // map the per cpu user variables -- _kaiser_copy( -- (unsigned long) (__per_cpu_user_mapped_start + per_cpu_offset(cpu)), -- (unsigned long) __per_cpu_user_mapped_end - (unsigned long) __per_cpu_user_mapped_start, -- __PAGE_KERNEL); -+ void *percpu_vaddr = __per_cpu_user_mapped_start + -+ per_cpu_offset(cpu); -+ unsigned long percpu_sz = __per_cpu_user_mapped_end - -+ __per_cpu_user_mapped_start; -+ kaiser_add_user_map_early(percpu_vaddr, percpu_sz, -+ __PAGE_KERNEL); - } - -- // map the entry/exit text section, which is responsible to switch between user- and kernel mode -- _kaiser_copy( -- (unsigned long) __entry_text_start, -- (unsigned long) __entry_text_end - (unsigned long) __entry_text_start, -- __PAGE_KERNEL_RX); -+ /* -+ * Map the entry/exit text section, which is needed at -+ * switches from user to and from kernel. 
-+ */ -+ kaiser_add_user_map_ptrs_early(__entry_text_start, __entry_text_end, -+ __PAGE_KERNEL_RX); - -- // the fixed map address of the idt_table -- _kaiser_copy( -- (unsigned long) idt_descr.address, -- sizeof(gate_desc) * NR_VECTORS, -- __PAGE_KERNEL_RO); -- -- spin_unlock(&shadow_table_lock); -+#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN) -+ kaiser_add_user_map_ptrs_early(__irqentry_text_start, -+ __irqentry_text_end, -+ __PAGE_KERNEL_RX); -+#endif -+ kaiser_add_user_map_early((void *)idt_descr.address, -+ sizeof(gate_desc) * NR_VECTORS, -+ __PAGE_KERNEL_RO); -+#ifdef CONFIG_TRACING -+ kaiser_add_user_map_early(&trace_idt_descr, -+ sizeof(trace_idt_descr), -+ __PAGE_KERNEL); -+ kaiser_add_user_map_early(&trace_idt_table, -+ sizeof(gate_desc) * NR_VECTORS, -+ __PAGE_KERNEL); -+#endif -+ kaiser_add_user_map_early(&debug_idt_descr, sizeof(debug_idt_descr), -+ __PAGE_KERNEL); -+ kaiser_add_user_map_early(&debug_idt_table, -+ sizeof(gate_desc) * NR_VECTORS, -+ __PAGE_KERNEL); - } - -+extern void unmap_pud_range_nofree(pgd_t *pgd, unsigned long start, unsigned long end); - // add a mapping to the shadow-mapping, and synchronize the mappings --void kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags) -+int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags) - { -- spin_lock(&shadow_table_lock); -- _kaiser_copy(addr, size, flags); -- spin_unlock(&shadow_table_lock); -+ return kaiser_add_user_map((const void *)addr, size, flags); - } - --extern void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end); - void kaiser_remove_mapping(unsigned long start, unsigned long size) - { -- pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(start)); -- spin_lock(&shadow_table_lock); -- do { -- unmap_pud_range(pgd, start, start + size); -- } while (pgd++ != native_get_shadow_pgd(pgd_offset_k(start + size))); -- spin_unlock(&shadow_table_lock); -+ unsigned long end = start + size; -+ unsigned long addr; -+ -+ for (addr = start; addr < end; addr += PGDIR_SIZE) { -+ pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(addr)); -+ /* -+ * unmap_p4d_range() handles > P4D_SIZE unmaps, -+ * so no need to trim 'end'. 
-+ */ -+ unmap_pud_range_nofree(pgd, addr, end); -+ } - } - #endif /* CONFIG_KAISER */ -diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c -index c17412f..73dcb0e1 100644 ---- a/arch/x86/mm/pageattr.c -+++ b/arch/x86/mm/pageattr.c -@@ -52,6 +52,7 @@ static DEFINE_SPINLOCK(cpa_lock); - #define CPA_FLUSHTLB 1 - #define CPA_ARRAY 2 - #define CPA_PAGES_ARRAY 4 -+#define CPA_FREE_PAGETABLES 8 - - #ifdef CONFIG_PROC_FS - static unsigned long direct_pages_count[PG_LEVEL_NUM]; -@@ -729,10 +730,13 @@ static int split_large_page(struct cpa_data *cpa, pte_t *kpte, - return 0; - } - --static bool try_to_free_pte_page(pte_t *pte) -+static bool try_to_free_pte_page(struct cpa_data *cpa, pte_t *pte) - { - int i; - -+ if (!(cpa->flags & CPA_FREE_PAGETABLES)) -+ return false; -+ - for (i = 0; i < PTRS_PER_PTE; i++) - if (!pte_none(pte[i])) - return false; -@@ -741,10 +745,13 @@ static bool try_to_free_pte_page(pte_t *pte) - return true; - } - --static bool try_to_free_pmd_page(pmd_t *pmd) -+static bool try_to_free_pmd_page(struct cpa_data *cpa, pmd_t *pmd) - { - int i; - -+ if (!(cpa->flags & CPA_FREE_PAGETABLES)) -+ return false; -+ - for (i = 0; i < PTRS_PER_PMD; i++) - if (!pmd_none(pmd[i])) - return false; -@@ -753,7 +760,9 @@ static bool try_to_free_pmd_page(pmd_t *pmd) - return true; - } - --static bool unmap_pte_range(pmd_t *pmd, unsigned long start, unsigned long end) -+static bool unmap_pte_range(struct cpa_data *cpa, pmd_t *pmd, -+ unsigned long start, -+ unsigned long end) - { - pte_t *pte = pte_offset_kernel(pmd, start); - -@@ -764,22 +773,23 @@ static bool unmap_pte_range(pmd_t *pmd, unsigned long start, unsigned long end) - pte++; - } - -- if (try_to_free_pte_page((pte_t *)pmd_page_vaddr(*pmd))) { -+ if (try_to_free_pte_page(cpa, (pte_t *)pmd_page_vaddr(*pmd))) { - pmd_clear(pmd); - return true; - } - return false; - } - --static void __unmap_pmd_range(pud_t *pud, pmd_t *pmd, -+static void __unmap_pmd_range(struct cpa_data *cpa, pud_t *pud, pmd_t *pmd, - unsigned long start, unsigned long end) - { -- if (unmap_pte_range(pmd, start, end)) -- if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud))) -+ if (unmap_pte_range(cpa, pmd, start, end)) -+ if (try_to_free_pmd_page(cpa, (pmd_t *)pud_page_vaddr(*pud))) - pud_clear(pud); - } - --static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end) -+static void unmap_pmd_range(struct cpa_data *cpa, pud_t *pud, -+ unsigned long start, unsigned long end) - { - pmd_t *pmd = pmd_offset(pud, start); - -@@ -790,7 +800,7 @@ static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end) - unsigned long next_page = (start + PMD_SIZE) & PMD_MASK; - unsigned long pre_end = min_t(unsigned long, end, next_page); - -- __unmap_pmd_range(pud, pmd, start, pre_end); -+ __unmap_pmd_range(cpa, pud, pmd, start, pre_end); - - start = pre_end; - pmd++; -@@ -803,7 +813,8 @@ static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end) - if (pmd_large(*pmd)) - pmd_clear(pmd); - else -- __unmap_pmd_range(pud, pmd, start, start + PMD_SIZE); -+ __unmap_pmd_range(cpa, pud, pmd, -+ start, start + PMD_SIZE); - - start += PMD_SIZE; - pmd++; -@@ -813,17 +824,19 @@ static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end) - * 4K leftovers? - */ - if (start < end) -- return __unmap_pmd_range(pud, pmd, start, end); -+ return __unmap_pmd_range(cpa, pud, pmd, start, end); - - /* - * Try again to free the PMD page if haven't succeeded above. 
- */ - if (!pud_none(*pud)) -- if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud))) -+ if (try_to_free_pmd_page(cpa, (pmd_t *)pud_page_vaddr(*pud))) - pud_clear(pud); - } - --void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end) -+static void __unmap_pud_range(struct cpa_data *cpa, pgd_t *pgd, -+ unsigned long start, -+ unsigned long end) - { - pud_t *pud = pud_offset(pgd, start); - -@@ -834,7 +847,7 @@ void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end) - unsigned long next_page = (start + PUD_SIZE) & PUD_MASK; - unsigned long pre_end = min_t(unsigned long, end, next_page); - -- unmap_pmd_range(pud, start, pre_end); -+ unmap_pmd_range(cpa, pud, start, pre_end); - - start = pre_end; - pud++; -@@ -848,7 +861,7 @@ void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end) - if (pud_large(*pud)) - pud_clear(pud); - else -- unmap_pmd_range(pud, start, start + PUD_SIZE); -+ unmap_pmd_range(cpa, pud, start, start + PUD_SIZE); - - start += PUD_SIZE; - pud++; -@@ -858,7 +871,7 @@ void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end) - * 2M leftovers? - */ - if (start < end) -- unmap_pmd_range(pud, start, end); -+ unmap_pmd_range(cpa, pud, start, end); - - /* - * No need to try to free the PUD page because we'll free it in -@@ -866,6 +879,24 @@ void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end) - */ - } - -+static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end) -+{ -+ struct cpa_data cpa = { -+ .flags = CPA_FREE_PAGETABLES, -+ }; -+ -+ __unmap_pud_range(&cpa, pgd, start, end); -+} -+ -+void unmap_pud_range_nofree(pgd_t *pgd, unsigned long start, unsigned long end) -+{ -+ struct cpa_data cpa = { -+ .flags = 0, -+ }; -+ -+ __unmap_pud_range(&cpa, pgd, start, end); -+} -+ - static int alloc_pte_page(pmd_t *pmd) - { - pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK); -diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c -index 27d218b..352fd01 100644 ---- a/arch/x86/mm/pgtable.c -+++ b/arch/x86/mm/pgtable.c -@@ -344,40 +344,26 @@ static inline void _pgd_free(pgd_t *pgd) - kmem_cache_free(pgd_cache, pgd); - } - #else --static inline pgd_t *_pgd_alloc(void) --{ -+ - #ifdef CONFIG_KAISER -- // Instead of one PML4, we aquire two PML4s and, thus, an 8kb-aligned memory -- // block. Therefore, we have to allocate at least 3 pages. However, the -- // __get_free_pages returns us 4 pages. Hence, we store the base pointer at -- // the beginning of the page of our 8kb-aligned memory block in order to -- // correctly free it afterwars. -- -- unsigned long pages = __get_free_pages(PGALLOC_GFP, get_order(4*PAGE_SIZE)); -- -- if(native_get_normal_pgd((pgd_t*) pages) == (pgd_t*) pages) -- { -- *((unsigned long*)(pages + 2 * PAGE_SIZE)) = pages; -- return (pgd_t *) pages; -- } -- else -- { -- *((unsigned long*)(pages + 3 * PAGE_SIZE)) = pages; -- return (pgd_t *) (pages + PAGE_SIZE); -- } -+/* -+ * Instead of one pmd, we aquire two pmds. Being order-1, it is -+ * both 8k in size and 8k-aligned. That lets us just flip bit 12 -+ * in a pointer to swap between the two 4k halves. 
-+ */ -+#define PGD_ALLOCATION_ORDER 1 - #else -- return (pgd_t *)__get_free_page(PGALLOC_GFP); -+#define PGD_ALLOCATION_ORDER 0 - #endif -+ -+static inline pgd_t *_pgd_alloc(void) -+{ -+ return (pgd_t *)__get_free_pages(PGALLOC_GFP, PGD_ALLOCATION_ORDER); - } - - static inline void _pgd_free(pgd_t *pgd) - { --#ifdef CONFIG_KAISER -- unsigned long pages = *((unsigned long*) ((char*) pgd + 2 * PAGE_SIZE)); -- free_pages(pages, get_order(4*PAGE_SIZE)); --#else -- free_page((unsigned long)pgd); --#endif -+ free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER); - } - #endif /* CONFIG_X86_PAE */ - -diff --git a/include/linux/kaiser.h b/include/linux/kaiser.h -new file mode 100644 -index 0000000..9db5433 ---- /dev/null -+++ b/include/linux/kaiser.h -@@ -0,0 +1,26 @@ -+#ifndef _INCLUDE_KAISER_H -+#define _INCLUDE_KAISER_H -+ -+#ifdef CONFIG_KAISER -+#include <asm/kaiser.h> -+#else -+ -+/* -+ * These stubs are used whenever CONFIG_KAISER is off, which -+ * includes architectures that support KAISER, but have it -+ * disabled. -+ */ -+ -+static inline void kaiser_init(void) -+{ -+} -+static inline void kaiser_remove_mapping(unsigned long start, unsigned long size) -+{ -+} -+static inline int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags) -+{ -+ return 0; -+} -+ -+#endif /* !CONFIG_KAISER */ -+#endif /* _INCLUDE_KAISER_H */ -diff --git a/kernel/fork.c b/kernel/fork.c -index 61748d1..7ba50f1 100644 ---- a/kernel/fork.c -+++ b/kernel/fork.c -@@ -58,6 +58,7 @@ - #include <linux/tsacct_kern.h> - #include <linux/cn_proc.h> - #include <linux/freezer.h> -+#include <linux/kaiser.h> - #include <linux/delayacct.h> - #include <linux/taskstats_kern.h> - #include <linux/random.h> -@@ -472,7 +473,6 @@ void set_task_stack_end_magic(struct task_struct *tsk) - *stackend = STACK_END_MAGIC; /* for overflow detection */ - } - --extern void kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags); - static struct task_struct *dup_task_struct(struct task_struct *orig, int node) - { - struct task_struct *tsk; -@@ -500,9 +500,10 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) - * functions again. - */ - tsk->stack = stack; --#ifdef CONFIG_KAISER -- kaiser_add_mapping((unsigned long)tsk->stack, THREAD_SIZE, __PAGE_KERNEL); --#endif -+ -+ err= kaiser_add_mapping((unsigned long)tsk->stack, THREAD_SIZE, __PAGE_KERNEL); -+ if (err) -+ goto free_stack; - #ifdef CONFIG_VMAP_STACK - tsk->stack_vm_area = stack_vm_area; - #endif -diff --git a/security/Kconfig b/security/Kconfig -index f515ac3..334d2e8 100644 ---- a/security/Kconfig -+++ b/security/Kconfig -@@ -32,12 +32,17 @@ config SECURITY - If you are unsure how to answer this question, answer N. - config KAISER - bool "Remove the kernel mapping in user mode" -+ default y - depends on X86_64 - depends on !PARAVIRT - help - This enforces a strict kernel and user space isolation in order to close - hardware side channels on kernel address information. - -+config KAISER_REAL_SWITCH -+ bool "KAISER: actually switch page tables" -+ default y -+ - config SECURITYFS - bool "Enable the securityfs filesystem" - help --- -2.7.4 - |