Diffstat (limited to 'common/recipes-kernel/linux/linux-yocto-4.9.21/0005-kaiser-merged-update.patch')
-rw-r--r-- | common/recipes-kernel/linux/linux-yocto-4.9.21/0005-kaiser-merged-update.patch | 1327 |
1 file changed, 0 insertions, 1327 deletions
diff --git a/common/recipes-kernel/linux/linux-yocto-4.9.21/0005-kaiser-merged-update.patch b/common/recipes-kernel/linux/linux-yocto-4.9.21/0005-kaiser-merged-update.patch deleted file mode 100644 index 0a554805..00000000 --- a/common/recipes-kernel/linux/linux-yocto-4.9.21/0005-kaiser-merged-update.patch +++ /dev/null @@ -1,1327 +0,0 @@ -From 63e6d8f6f8a48f02da9fbd55819b1154efad82ba Mon Sep 17 00:00:00 2001 -From: Dave Hansen <dave.hansen@linux.intel.com> -Date: Wed, 30 Aug 2017 16:23:00 -0700 -Subject: [PATCH 005/103] kaiser: merged update - -Merged fixes and cleanups, rebased to 4.9.51 tree (no 5-level paging). - -Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com> -Signed-off-by: Hugh Dickins <hughd@google.com> -Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> ---- - arch/x86/entry/entry_64.S | 105 ++++++++++-- - arch/x86/include/asm/kaiser.h | 43 +++-- - arch/x86/include/asm/pgtable.h | 18 +- - arch/x86/include/asm/pgtable_64.h | 48 +++++- - arch/x86/include/asm/pgtable_types.h | 6 +- - arch/x86/kernel/espfix_64.c | 13 +- - arch/x86/kernel/head_64.S | 19 ++- - arch/x86/kernel/ldt.c | 27 ++- - arch/x86/kernel/tracepoint.c | 2 + - arch/x86/mm/kaiser.c | 313 +++++++++++++++++++++++++---------- - arch/x86/mm/pageattr.c | 63 +++++-- - arch/x86/mm/pgtable.c | 40 ++--- - include/linux/kaiser.h | 26 +++ - kernel/fork.c | 9 +- - security/Kconfig | 5 + - 15 files changed, 549 insertions(+), 188 deletions(-) - create mode 100644 include/linux/kaiser.h - -diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S -index 6c880dc..d84e3a7 100644 ---- a/arch/x86/entry/entry_64.S -+++ b/arch/x86/entry/entry_64.S -@@ -230,6 +230,13 @@ entry_SYSCALL_64_fastpath: - movq RIP(%rsp), %rcx - movq EFLAGS(%rsp), %r11 - RESTORE_C_REGS_EXCEPT_RCX_R11 -+ /* -+ * This opens a window where we have a user CR3, but are -+ * running in the kernel. This makes using the CS -+ * register useless for telling whether or not we need to -+ * switch CR3 in NMIs. Normal interrupts are OK because -+ * they are off here. -+ */ - SWITCH_USER_CR3 - movq RSP(%rsp), %rsp - USERGS_SYSRET64 -@@ -326,11 +333,25 @@ return_from_SYSCALL_64: - syscall_return_via_sysret: - /* rcx and r11 are already restored (see code above) */ - RESTORE_C_REGS_EXCEPT_RCX_R11 -+ /* -+ * This opens a window where we have a user CR3, but are -+ * running in the kernel. This makes using the CS -+ * register useless for telling whether or not we need to -+ * switch CR3 in NMIs. Normal interrupts are OK because -+ * they are off here. -+ */ - SWITCH_USER_CR3 - movq RSP(%rsp), %rsp - USERGS_SYSRET64 - - opportunistic_sysret_failed: -+ /* -+ * This opens a window where we have a user CR3, but are -+ * running in the kernel. This makes using the CS -+ * register useless for telling whether or not we need to -+ * switch CR3 in NMIs. Normal interrupts are OK because -+ * they are off here. -+ */ - SWITCH_USER_CR3 - SWAPGS - jmp restore_c_regs_and_iret -@@ -1087,6 +1108,13 @@ ENTRY(error_entry) - cld - SAVE_C_REGS 8 - SAVE_EXTRA_REGS 8 -+ /* -+ * error_entry() always returns with a kernel gsbase and -+ * CR3. We must also have a kernel CR3/gsbase before -+ * calling TRACE_IRQS_*. Just unconditionally switch to -+ * the kernel CR3 here. -+ */ -+ SWITCH_KERNEL_CR3 - xorl %ebx, %ebx - testb $3, CS+8(%rsp) - jz .Lerror_kernelspace -@@ -1096,7 +1124,6 @@ ENTRY(error_entry) - * from user mode due to an IRET fault. 
- */ - SWAPGS -- SWITCH_KERNEL_CR3 - - .Lerror_entry_from_usermode_after_swapgs: - /* -@@ -1148,7 +1175,6 @@ ENTRY(error_entry) - * Switch to kernel gsbase: - */ - SWAPGS -- SWITCH_KERNEL_CR3 - - /* - * Pretend that the exception came from user mode: set up pt_regs -@@ -1247,7 +1273,10 @@ ENTRY(nmi) - */ - - SWAPGS_UNSAFE_STACK -- SWITCH_KERNEL_CR3_NO_STACK -+ /* -+ * percpu variables are mapped with user CR3, so no need -+ * to switch CR3 here. -+ */ - cld - movq %rsp, %rdx - movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp -@@ -1281,14 +1310,33 @@ ENTRY(nmi) - - movq %rsp, %rdi - movq $-1, %rsi -+#ifdef CONFIG_KAISER -+ /* Unconditionally use kernel CR3 for do_nmi() */ -+ /* %rax is saved above, so OK to clobber here */ -+ movq %cr3, %rax -+ pushq %rax -+#ifdef CONFIG_KAISER_REAL_SWITCH -+ andq $(~0x1000), %rax -+#endif -+ movq %rax, %cr3 -+#endif - call do_nmi -+ /* -+ * Unconditionally restore CR3. I know we return to -+ * kernel code that needs user CR3, but do we ever return -+ * to "user mode" where we need the kernel CR3? -+ */ -+#ifdef CONFIG_KAISER -+ popq %rax -+ mov %rax, %cr3 -+#endif - - /* - * Return back to user mode. We must *not* do the normal exit -- * work, because we don't want to enable interrupts. Fortunately, -- * do_nmi doesn't modify pt_regs. -+ * work, because we don't want to enable interrupts. Do not -+ * switch to user CR3: we might be going back to kernel code -+ * that had a user CR3 set. - */ -- SWITCH_USER_CR3 - SWAPGS - jmp restore_c_regs_and_iret - -@@ -1484,23 +1532,54 @@ end_repeat_nmi: - ALLOC_PT_GPREGS_ON_STACK - - /* -- * Use paranoid_entry to handle SWAPGS, but no need to use paranoid_exit -- * as we should not be calling schedule in NMI context. -- * Even with normal interrupts enabled. An NMI should not be -- * setting NEED_RESCHED or anything that normal interrupts and -- * exceptions might do. -+ * Use the same approach as paranoid_entry to handle SWAPGS, but -+ * without CR3 handling since we do that differently in NMIs. No -+ * need to use paranoid_exit as we should not be calling schedule -+ * in NMI context. Even with normal interrupts enabled. An NMI -+ * should not be setting NEED_RESCHED or anything that normal -+ * interrupts and exceptions might do. - */ -- call paranoid_entry -+ cld -+ SAVE_C_REGS -+ SAVE_EXTRA_REGS -+ movl $1, %ebx -+ movl $MSR_GS_BASE, %ecx -+ rdmsr -+ testl %edx, %edx -+ js 1f /* negative -> in kernel */ -+ SWAPGS -+ xorl %ebx, %ebx -+1: -+#ifdef CONFIG_KAISER -+ /* Unconditionally use kernel CR3 for do_nmi() */ -+ /* %rax is saved above, so OK to clobber here */ -+ movq %cr3, %rax -+ pushq %rax -+#ifdef CONFIG_KAISER_REAL_SWITCH -+ andq $(~0x1000), %rax -+#endif -+ movq %rax, %cr3 -+#endif - - /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */ - movq %rsp, %rdi -+ addq $8, %rdi /* point %rdi at ptregs, fixed up for CR3 */ - movq $-1, %rsi - call do_nmi -+ /* -+ * Unconditionally restore CR3. We might be returning to -+ * kernel code that needs user CR3, like just just before -+ * a sysret. -+ */ -+#ifdef CONFIG_KAISER -+ popq %rax -+ mov %rax, %cr3 -+#endif - - testl %ebx, %ebx /* swapgs needed? 
*/ - jnz nmi_restore - nmi_swapgs: -- SWITCH_USER_CR3_NO_STACK -+ /* We fixed up CR3 above, so no need to switch it here */ - SWAPGS_UNSAFE_STACK - nmi_restore: - RESTORE_EXTRA_REGS -diff --git a/arch/x86/include/asm/kaiser.h b/arch/x86/include/asm/kaiser.h -index 63ee830..0703f48 100644 ---- a/arch/x86/include/asm/kaiser.h -+++ b/arch/x86/include/asm/kaiser.h -@@ -16,13 +16,17 @@ - - .macro _SWITCH_TO_KERNEL_CR3 reg - movq %cr3, \reg -+#ifdef CONFIG_KAISER_REAL_SWITCH - andq $(~0x1000), \reg -+#endif - movq \reg, %cr3 - .endm - - .macro _SWITCH_TO_USER_CR3 reg - movq %cr3, \reg -+#ifdef CONFIG_KAISER_REAL_SWITCH - orq $(0x1000), \reg -+#endif - movq \reg, %cr3 - .endm - -@@ -65,48 +69,53 @@ movq PER_CPU_VAR(unsafe_stack_register_backup), %rax - .endm - - #endif /* CONFIG_KAISER */ -+ - #else /* __ASSEMBLY__ */ - - - #ifdef CONFIG_KAISER --// Upon kernel/user mode switch, it may happen that --// the address space has to be switched before the registers have been stored. --// To change the address space, another register is needed. --// A register therefore has to be stored/restored. --// --DECLARE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup); -+/* -+ * Upon kernel/user mode switch, it may happen that the address -+ * space has to be switched before the registers have been -+ * stored. To change the address space, another register is -+ * needed. A register therefore has to be stored/restored. -+*/ - --#endif /* CONFIG_KAISER */ -+DECLARE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup); - - /** -- * shadowmem_add_mapping - map a virtual memory part to the shadow mapping -+ * kaiser_add_mapping - map a virtual memory part to the shadow (user) mapping - * @addr: the start address of the range - * @size: the size of the range - * @flags: The mapping flags of the pages - * -- * the mapping is done on a global scope, so no bigger synchronization has to be done. -- * the pages have to be manually unmapped again when they are not needed any longer. -+ * The mapping is done on a global scope, so no bigger -+ * synchronization has to be done. the pages have to be -+ * manually unmapped again when they are not needed any longer. - */ --extern void kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags); -+extern int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags); - - - /** -- * shadowmem_remove_mapping - unmap a virtual memory part of the shadow mapping -+ * kaiser_remove_mapping - unmap a virtual memory part of the shadow mapping - * @addr: the start address of the range - * @size: the size of the range - */ - extern void kaiser_remove_mapping(unsigned long start, unsigned long size); - - /** -- * shadowmem_initialize_mapping - Initalize the shadow mapping -+ * kaiser_initialize_mapping - Initalize the shadow mapping - * -- * most parts of the shadow mapping can be mapped upon boot time. -- * only the thread stacks have to be mapped on runtime. -- * the mapped regions are not unmapped at all. -+ * Most parts of the shadow mapping can be mapped upon boot -+ * time. Only per-process things like the thread stacks -+ * or a new LDT have to be mapped at runtime. These boot- -+ * time mappings are permanent and nevertunmapped. 
- */ - extern void kaiser_init(void); - --#endif -+#endif /* CONFIG_KAISER */ -+ -+#endif /* __ASSEMBLY */ - - - -diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h -index 4b479c9..1cee98e 100644 ---- a/arch/x86/include/asm/pgtable.h -+++ b/arch/x86/include/asm/pgtable.h -@@ -690,7 +690,17 @@ static inline pud_t *pud_offset(pgd_t *pgd, unsigned long address) - - static inline int pgd_bad(pgd_t pgd) - { -- return (pgd_flags(pgd) & ~_PAGE_USER) != _KERNPG_TABLE; -+ pgdval_t ignore_flags = _PAGE_USER; -+ /* -+ * We set NX on KAISER pgds that map userspace memory so -+ * that userspace can not meaningfully use the kernel -+ * page table by accident; it will fault on the first -+ * instruction it tries to run. See native_set_pgd(). -+ */ -+ if (IS_ENABLED(CONFIG_KAISER)) -+ ignore_flags |= _PAGE_NX; -+ -+ return (pgd_flags(pgd) & ~ignore_flags) != _KERNPG_TABLE; - } - - static inline int pgd_none(pgd_t pgd) -@@ -905,8 +915,10 @@ static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count) - { - memcpy(dst, src, count * sizeof(pgd_t)); - #ifdef CONFIG_KAISER -- // clone the shadow pgd part as well -- memcpy(native_get_shadow_pgd(dst), native_get_shadow_pgd(src), count * sizeof(pgd_t)); -+ /* Clone the shadow pgd part as well */ -+ memcpy(native_get_shadow_pgd(dst), -+ native_get_shadow_pgd(src), -+ count * sizeof(pgd_t)); - #endif - } - -diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h -index e6ea39f..000265c 100644 ---- a/arch/x86/include/asm/pgtable_64.h -+++ b/arch/x86/include/asm/pgtable_64.h -@@ -107,26 +107,58 @@ static inline void native_pud_clear(pud_t *pud) - } - - #ifdef CONFIG_KAISER --static inline pgd_t * native_get_shadow_pgd(pgd_t *pgdp) { -+static inline pgd_t * native_get_shadow_pgd(pgd_t *pgdp) -+{ - return (pgd_t *)(void*)((unsigned long)(void*)pgdp | (unsigned long)PAGE_SIZE); - } - --static inline pgd_t * native_get_normal_pgd(pgd_t *pgdp) { -+static inline pgd_t * native_get_normal_pgd(pgd_t *pgdp) -+{ - return (pgd_t *)(void*)((unsigned long)(void*)pgdp & ~(unsigned long)PAGE_SIZE); - } -+#else -+static inline pgd_t * native_get_shadow_pgd(pgd_t *pgdp) -+{ -+ BUILD_BUG_ON(1); -+ return NULL; -+} -+static inline pgd_t * native_get_normal_pgd(pgd_t *pgdp) -+{ -+ return pgdp; -+} - #endif /* CONFIG_KAISER */ - -+/* -+ * Page table pages are page-aligned. The lower half of the top -+ * level is used for userspace and the top half for the kernel. -+ * This returns true for user pages that need to get copied into -+ * both the user and kernel copies of the page tables, and false -+ * for kernel pages that should only be in the kernel copy. -+ */ -+static inline bool is_userspace_pgd(void *__ptr) -+{ -+ unsigned long ptr = (unsigned long)__ptr; -+ -+ return ((ptr % PAGE_SIZE) < (PAGE_SIZE / 2)); -+} -+ - static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd) - { - #ifdef CONFIG_KAISER -- // We know that a pgd is page aligned. -- // Therefore the lower indices have to be mapped to user space. -- // These pages are mapped to the shadow mapping. -- if ((((unsigned long)pgdp) % PAGE_SIZE) < (PAGE_SIZE / 2)) { -+ pteval_t extra_kern_pgd_flags = 0; -+ /* Do we need to also populate the shadow pgd? */ -+ if (is_userspace_pgd(pgdp)) { - native_get_shadow_pgd(pgdp)->pgd = pgd.pgd; -+ /* -+ * Even if the entry is *mapping* userspace, ensure -+ * that userspace can not use it. This way, if we -+ * get out to userspace running on the kernel CR3, -+ * userspace will crash instead of running. 
-+ */ -+ extra_kern_pgd_flags = _PAGE_NX; - } -- -- pgdp->pgd = pgd.pgd & ~_PAGE_USER; -+ pgdp->pgd = pgd.pgd; -+ pgdp->pgd |= extra_kern_pgd_flags; - #else /* CONFIG_KAISER */ - *pgdp = pgd; - #endif -diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h -index 00fecbb..8bc8d02 100644 ---- a/arch/x86/include/asm/pgtable_types.h -+++ b/arch/x86/include/asm/pgtable_types.h -@@ -48,7 +48,7 @@ - #ifdef CONFIG_KAISER - #define _PAGE_GLOBAL (_AT(pteval_t, 0)) - #else --#define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL) -+#define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL) - #endif - #define _PAGE_SOFTW1 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW1) - #define _PAGE_SOFTW2 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW2) -@@ -123,11 +123,7 @@ - #define _PAGE_DEVMAP (_AT(pteval_t, 0)) - #endif - --#ifdef CONFIG_KAISER --#define _PAGE_PROTNONE (_AT(pteval_t, 0)) --#else - #define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE) --#endif - - #define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \ - _PAGE_ACCESSED | _PAGE_DIRTY) -diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c -index 9ff875a..560c2fd 100644 ---- a/arch/x86/kernel/espfix_64.c -+++ b/arch/x86/kernel/espfix_64.c -@@ -127,11 +127,14 @@ void __init init_espfix_bsp(void) - /* Install the espfix pud into the kernel page directory */ - pgd_p = &init_level4_pgt[pgd_index(ESPFIX_BASE_ADDR)]; - pgd_populate(&init_mm, pgd_p, (pud_t *)espfix_pud_page); --#ifdef CONFIG_KAISER -- // add the esp stack pud to the shadow mapping here. -- // This can be done directly, because the fixup stack has its own pud -- set_pgd(native_get_shadow_pgd(pgd_p), __pgd(_PAGE_TABLE | __pa((pud_t *)espfix_pud_page))); --#endif -+ /* -+ * Just copy the top-level PGD that is mapping the espfix -+ * area to ensure it is mapped into the shadow user page -+ * tables. -+ */ -+ if (IS_ENABLED(CONFIG_KAISER)) -+ set_pgd(native_get_shadow_pgd(pgd_p), -+ __pgd(_KERNPG_TABLE | __pa((pud_t *)espfix_pud_page))); - - /* Randomize the locations */ - init_espfix_random(); -diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S -index 9e849b5..5775379 100644 ---- a/arch/x86/kernel/head_64.S -+++ b/arch/x86/kernel/head_64.S -@@ -406,11 +406,24 @@ GLOBAL(early_recursion_flag) - GLOBAL(name) - - #ifdef CONFIG_KAISER -+/* -+ * Each PGD needs to be 8k long and 8k aligned. We do not -+ * ever go out to userspace with these, so we do not -+ * strictly *need* the second page, but this allows us to -+ * have a single set_pgd() implementation that does not -+ * need to worry about whether it has 4k or 8k to work -+ * with. 
-+ * -+ * This ensures PGDs are 8k long: -+ */ -+#define KAISER_USER_PGD_FILL 512 -+/* This ensures they are 8k-aligned: */ - #define NEXT_PGD_PAGE(name) \ - .balign 2 * PAGE_SIZE; \ - GLOBAL(name) - #else - #define NEXT_PGD_PAGE(name) NEXT_PAGE(name) -+#define KAISER_USER_PGD_FILL 0 - #endif - - /* Automate the creation of 1 to 1 mapping pmd entries */ -@@ -425,6 +438,7 @@ GLOBAL(name) - NEXT_PGD_PAGE(early_level4_pgt) - .fill 511,8,0 - .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE -+ .fill KAISER_USER_PGD_FILL,8,0 - - NEXT_PAGE(early_dynamic_pgts) - .fill 512*EARLY_DYNAMIC_PAGE_TABLES,8,0 -@@ -433,7 +447,8 @@ NEXT_PAGE(early_dynamic_pgts) - - #ifndef CONFIG_XEN - NEXT_PGD_PAGE(init_level4_pgt) -- .fill 2*512,8,0 -+ .fill 512,8,0 -+ .fill KAISER_USER_PGD_FILL,8,0 - #else - NEXT_PGD_PAGE(init_level4_pgt) - .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE -@@ -442,6 +457,7 @@ NEXT_PGD_PAGE(init_level4_pgt) - .org init_level4_pgt + L4_START_KERNEL*8, 0 - /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ - .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE -+ .fill KAISER_USER_PGD_FILL,8,0 - - NEXT_PAGE(level3_ident_pgt) - .quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE -@@ -452,6 +468,7 @@ NEXT_PAGE(level2_ident_pgt) - */ - PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD) - #endif -+ .fill KAISER_USER_PGD_FILL,8,0 - - NEXT_PAGE(level3_kernel_pgt) - .fill L3_START_KERNEL,8,0 -diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c -index 6707039..3c2d55b 100644 ---- a/arch/x86/kernel/ldt.c -+++ b/arch/x86/kernel/ldt.c -@@ -17,6 +17,7 @@ - #include <linux/uaccess.h> - - #include <asm/ldt.h> -+#include <asm/kaiser.h> - #include <asm/desc.h> - #include <asm/mmu_context.h> - #include <asm/syscalls.h> -@@ -33,11 +34,21 @@ static void flush_ldt(void *current_mm) - set_ldt(pc->ldt->entries, pc->ldt->size); - } - -+static void __free_ldt_struct(struct ldt_struct *ldt) -+{ -+ if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE) -+ vfree(ldt->entries); -+ else -+ free_page((unsigned long)ldt->entries); -+ kfree(ldt); -+} -+ - /* The caller must call finalize_ldt_struct on the result. LDT starts zeroed. 
*/ - static struct ldt_struct *alloc_ldt_struct(int size) - { - struct ldt_struct *new_ldt; - int alloc_size; -+ int ret = 0; - - if (size > LDT_ENTRIES) - return NULL; -@@ -65,6 +76,14 @@ static struct ldt_struct *alloc_ldt_struct(int size) - return NULL; - } - -+ // FIXME: make kaiser_add_mapping() return an error code -+ // when it fails -+ kaiser_add_mapping((unsigned long)new_ldt->entries, alloc_size, -+ __PAGE_KERNEL); -+ if (ret) { -+ __free_ldt_struct(new_ldt); -+ return NULL; -+ } - new_ldt->size = size; - return new_ldt; - } -@@ -91,12 +110,10 @@ static void free_ldt_struct(struct ldt_struct *ldt) - if (likely(!ldt)) - return; - -+ kaiser_remove_mapping((unsigned long)ldt->entries, -+ ldt->size * LDT_ENTRY_SIZE); - paravirt_free_ldt(ldt->entries, ldt->size); -- if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE) -- vfree(ldt->entries); -- else -- free_page((unsigned long)ldt->entries); -- kfree(ldt); -+ __free_ldt_struct(ldt); - } - - /* -diff --git a/arch/x86/kernel/tracepoint.c b/arch/x86/kernel/tracepoint.c -index 1c113db..2bb5ee4 100644 ---- a/arch/x86/kernel/tracepoint.c -+++ b/arch/x86/kernel/tracepoint.c -@@ -9,10 +9,12 @@ - #include <linux/atomic.h> - - atomic_t trace_idt_ctr = ATOMIC_INIT(0); -+__aligned(PAGE_SIZE) - struct desc_ptr trace_idt_descr = { NR_VECTORS * 16 - 1, - (unsigned long) trace_idt_table }; - - /* No need to be aligned, but done to keep all IDTs defined the same way. */ -+__aligned(PAGE_SIZE) - gate_desc trace_idt_table[NR_VECTORS] __page_aligned_bss; - - static int trace_irq_vector_refcount; -diff --git a/arch/x86/mm/kaiser.c b/arch/x86/mm/kaiser.c -index cf1bb92..7270a29 100644 ---- a/arch/x86/mm/kaiser.c -+++ b/arch/x86/mm/kaiser.c -@@ -1,160 +1,305 @@ -- -- -+#include <linux/bug.h> - #include <linux/kernel.h> - #include <linux/errno.h> - #include <linux/string.h> - #include <linux/types.h> - #include <linux/bug.h> - #include <linux/init.h> -+#include <linux/interrupt.h> - #include <linux/spinlock.h> - #include <linux/mm.h> -- - #include <linux/uaccess.h> -+ -+#include <asm/kaiser.h> - #include <asm/pgtable.h> - #include <asm/pgalloc.h> - #include <asm/desc.h> - #ifdef CONFIG_KAISER - - __visible DEFINE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup); -+/* -+ * At runtime, the only things we map are some things for CPU -+ * hotplug, and stacks for new processes. No two CPUs will ever -+ * be populating the same addresses, so we only need to ensure -+ * that we protect between two CPUs trying to allocate and -+ * populate the same page table page. -+ * -+ * Only take this lock when doing a set_p[4um]d(), but it is not -+ * needed for doing a set_pte(). We assume that only the *owner* -+ * of a given allocation will be doing this for _their_ -+ * allocation. -+ * -+ * This ensures that once a system has been running for a while -+ * and there have been stacks all over and these page tables -+ * are fully populated, there will be no further acquisitions of -+ * this lock. -+ */ -+static DEFINE_SPINLOCK(shadow_table_allocation_lock); - --/** -- * Get the real ppn from a address in kernel mapping. -- * @param address The virtual adrress -- * @return the physical address -+/* -+ * Returns -1 on error. 
- */ --static inline unsigned long get_pa_from_mapping (unsigned long address) -+static inline unsigned long get_pa_from_mapping(unsigned long vaddr) - { - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - -- pgd = pgd_offset_k(address); -- BUG_ON(pgd_none(*pgd) || pgd_large(*pgd)); -- -- pud = pud_offset(pgd, address); -- BUG_ON(pud_none(*pud)); -+ pgd = pgd_offset_k(vaddr); -+ /* -+ * We made all the kernel PGDs present in kaiser_init(). -+ * We expect them to stay that way. -+ */ -+ BUG_ON(pgd_none(*pgd)); -+ /* -+ * PGDs are either 512GB or 128TB on all x86_64 -+ * configurations. We don't handle these. -+ */ -+ BUG_ON(pgd_large(*pgd)); - -- if (pud_large(*pud)) { -- return (pud_pfn(*pud) << PAGE_SHIFT) | (address & ~PUD_PAGE_MASK); -+ pud = pud_offset(pgd, vaddr); -+ if (pud_none(*pud)) { -+ WARN_ON_ONCE(1); -+ return -1; - } - -- pmd = pmd_offset(pud, address); -- BUG_ON(pmd_none(*pmd)); -+ if (pud_large(*pud)) -+ return (pud_pfn(*pud) << PAGE_SHIFT) | (vaddr & ~PUD_PAGE_MASK); - -- if (pmd_large(*pmd)) { -- return (pmd_pfn(*pmd) << PAGE_SHIFT) | (address & ~PMD_PAGE_MASK); -+ pmd = pmd_offset(pud, vaddr); -+ if (pmd_none(*pmd)) { -+ WARN_ON_ONCE(1); -+ return -1; - } - -- pte = pte_offset_kernel(pmd, address); -- BUG_ON(pte_none(*pte)); -+ if (pmd_large(*pmd)) -+ return (pmd_pfn(*pmd) << PAGE_SHIFT) | (vaddr & ~PMD_PAGE_MASK); - -- return (pte_pfn(*pte) << PAGE_SHIFT) | (address & ~PAGE_MASK); -+ pte = pte_offset_kernel(pmd, vaddr); -+ if (pte_none(*pte)) { -+ WARN_ON_ONCE(1); -+ return -1; -+ } -+ -+ return (pte_pfn(*pte) << PAGE_SHIFT) | (vaddr & ~PAGE_MASK); - } - --void _kaiser_copy (unsigned long start_addr, unsigned long size, -- unsigned long flags) -+/* -+ * This is a relatively normal page table walk, except that it -+ * also tries to allocate page tables pages along the way. -+ * -+ * Returns a pointer to a PTE on success, or NULL on failure. 
-+ */ -+static pte_t *kaiser_pagetable_walk(unsigned long address, bool is_atomic) - { -- pgd_t *pgd; -- pud_t *pud; - pmd_t *pmd; -- pte_t *pte; -- unsigned long address; -- unsigned long end_addr = start_addr + size; -- unsigned long target_address; -+ pud_t *pud; -+ pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(address)); -+ gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO); - -- for (address = PAGE_ALIGN(start_addr - (PAGE_SIZE - 1)); -- address < PAGE_ALIGN(end_addr); address += PAGE_SIZE) { -- target_address = get_pa_from_mapping(address); -+ might_sleep(); -+ if (is_atomic) { -+ gfp &= ~GFP_KERNEL; -+ gfp |= __GFP_HIGH | __GFP_ATOMIC; -+ } - -- pgd = native_get_shadow_pgd(pgd_offset_k(address)); -+ if (pgd_none(*pgd)) { -+ WARN_ONCE(1, "All shadow pgds should have been populated"); -+ return NULL; -+ } -+ BUILD_BUG_ON(pgd_large(*pgd) != 0); - -- BUG_ON(pgd_none(*pgd) && "All shadow pgds should be mapped at this time\n"); -- BUG_ON(pgd_large(*pgd)); -+ pud = pud_offset(pgd, address); -+ /* The shadow page tables do not use large mappings: */ -+ if (pud_large(*pud)) { -+ WARN_ON(1); -+ return NULL; -+ } -+ if (pud_none(*pud)) { -+ unsigned long new_pmd_page = __get_free_page(gfp); -+ if (!new_pmd_page) -+ return NULL; -+ spin_lock(&shadow_table_allocation_lock); -+ if (pud_none(*pud)) -+ set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page))); -+ else -+ free_page(new_pmd_page); -+ spin_unlock(&shadow_table_allocation_lock); -+ } - -- pud = pud_offset(pgd, address); -- if (pud_none(*pud)) { -- set_pud(pud, __pud(_PAGE_TABLE | __pa(pmd_alloc_one(0, address)))); -- } -- BUG_ON(pud_large(*pud)); -+ pmd = pmd_offset(pud, address); -+ /* The shadow page tables do not use large mappings: */ -+ if (pmd_large(*pmd)) { -+ WARN_ON(1); -+ return NULL; -+ } -+ if (pmd_none(*pmd)) { -+ unsigned long new_pte_page = __get_free_page(gfp); -+ if (!new_pte_page) -+ return NULL; -+ spin_lock(&shadow_table_allocation_lock); -+ if (pmd_none(*pmd)) -+ set_pmd(pmd, __pmd(_KERNPG_TABLE | __pa(new_pte_page))); -+ else -+ free_page(new_pte_page); -+ spin_unlock(&shadow_table_allocation_lock); -+ } - -- pmd = pmd_offset(pud, address); -- if (pmd_none(*pmd)) { -- set_pmd(pmd, __pmd(_PAGE_TABLE | __pa(pte_alloc_one_kernel(0, address)))); -- } -- BUG_ON(pmd_large(*pmd)); -+ return pte_offset_kernel(pmd, address); -+} - -- pte = pte_offset_kernel(pmd, address); -+int kaiser_add_user_map(const void *__start_addr, unsigned long size, -+ unsigned long flags) -+{ -+ int ret = 0; -+ pte_t *pte; -+ unsigned long start_addr = (unsigned long )__start_addr; -+ unsigned long address = start_addr & PAGE_MASK; -+ unsigned long end_addr = PAGE_ALIGN(start_addr + size); -+ unsigned long target_address; -+ -+ for (;address < end_addr; address += PAGE_SIZE) { -+ target_address = get_pa_from_mapping(address); -+ if (target_address == -1) { -+ ret = -EIO; -+ break; -+ } -+ pte = kaiser_pagetable_walk(address, false); - if (pte_none(*pte)) { - set_pte(pte, __pte(flags | target_address)); - } else { -- BUG_ON(__pa(pte_page(*pte)) != target_address); -+ pte_t tmp; -+ set_pte(&tmp, __pte(flags | target_address)); -+ WARN_ON_ONCE(!pte_same(*pte, tmp)); - } - } -+ return ret; -+} -+ -+static int kaiser_add_user_map_ptrs(const void *start, const void *end, unsigned long flags) -+{ -+ unsigned long size = end - start; -+ -+ return kaiser_add_user_map(start, size, flags); - } - --// at first, add a pmd for every pgd entry in the shadowmem-kernel-part of the kernel mapping --static inline void __init _kaiser_init(void) -+/* -+ * Ensure 
that the top level of the (shadow) page tables are -+ * entirely populated. This ensures that all processes that get -+ * forked have the same entries. This way, we do not have to -+ * ever go set up new entries in older processes. -+ * -+ * Note: we never free these, so there are no updates to them -+ * after this. -+ */ -+static void __init kaiser_init_all_pgds(void) - { - pgd_t *pgd; - int i = 0; - - pgd = native_get_shadow_pgd(pgd_offset_k((unsigned long )0)); - for (i = PTRS_PER_PGD / 2; i < PTRS_PER_PGD; i++) { -- set_pgd(pgd + i, __pgd(_PAGE_TABLE |__pa(pud_alloc_one(0, 0)))); -+ pgd_t new_pgd; -+ pud_t *pud = pud_alloc_one(&init_mm, PAGE_OFFSET + i * PGDIR_SIZE); -+ if (!pud) { -+ WARN_ON(1); -+ break; -+ } -+ new_pgd = __pgd(_KERNPG_TABLE |__pa(pud)); -+ /* -+ * Make sure not to stomp on some other pgd entry. -+ */ -+ if (!pgd_none(pgd[i])) { -+ WARN_ON(1); -+ continue; -+ } -+ set_pgd(pgd + i, new_pgd); - } - } - -+#define kaiser_add_user_map_early(start, size, flags) do { \ -+ int __ret = kaiser_add_user_map(start, size, flags); \ -+ WARN_ON(__ret); \ -+} while (0) -+ -+#define kaiser_add_user_map_ptrs_early(start, end, flags) do { \ -+ int __ret = kaiser_add_user_map_ptrs(start, end, flags); \ -+ WARN_ON(__ret); \ -+} while (0) -+ - extern char __per_cpu_user_mapped_start[], __per_cpu_user_mapped_end[]; --spinlock_t shadow_table_lock; -+/* -+ * If anything in here fails, we will likely die on one of the -+ * first kernel->user transitions and init will die. But, we -+ * will have most of the kernel up by then and should be able to -+ * get a clean warning out of it. If we BUG_ON() here, we run -+ * the risk of being before we have good console output. -+ */ - void __init kaiser_init(void) - { - int cpu; -- spin_lock_init(&shadow_table_lock); -- -- spin_lock(&shadow_table_lock); - -- _kaiser_init(); -+ kaiser_init_all_pgds(); - - for_each_possible_cpu(cpu) { -- // map the per cpu user variables -- _kaiser_copy( -- (unsigned long) (__per_cpu_user_mapped_start + per_cpu_offset(cpu)), -- (unsigned long) __per_cpu_user_mapped_end - (unsigned long) __per_cpu_user_mapped_start, -- __PAGE_KERNEL); -+ void *percpu_vaddr = __per_cpu_user_mapped_start + -+ per_cpu_offset(cpu); -+ unsigned long percpu_sz = __per_cpu_user_mapped_end - -+ __per_cpu_user_mapped_start; -+ kaiser_add_user_map_early(percpu_vaddr, percpu_sz, -+ __PAGE_KERNEL); - } - -- // map the entry/exit text section, which is responsible to switch between user- and kernel mode -- _kaiser_copy( -- (unsigned long) __entry_text_start, -- (unsigned long) __entry_text_end - (unsigned long) __entry_text_start, -- __PAGE_KERNEL_RX); -+ /* -+ * Map the entry/exit text section, which is needed at -+ * switches from user to and from kernel. 
-+ */ -+ kaiser_add_user_map_ptrs_early(__entry_text_start, __entry_text_end, -+ __PAGE_KERNEL_RX); - -- // the fixed map address of the idt_table -- _kaiser_copy( -- (unsigned long) idt_descr.address, -- sizeof(gate_desc) * NR_VECTORS, -- __PAGE_KERNEL_RO); -- -- spin_unlock(&shadow_table_lock); -+#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN) -+ kaiser_add_user_map_ptrs_early(__irqentry_text_start, -+ __irqentry_text_end, -+ __PAGE_KERNEL_RX); -+#endif -+ kaiser_add_user_map_early((void *)idt_descr.address, -+ sizeof(gate_desc) * NR_VECTORS, -+ __PAGE_KERNEL_RO); -+#ifdef CONFIG_TRACING -+ kaiser_add_user_map_early(&trace_idt_descr, -+ sizeof(trace_idt_descr), -+ __PAGE_KERNEL); -+ kaiser_add_user_map_early(&trace_idt_table, -+ sizeof(gate_desc) * NR_VECTORS, -+ __PAGE_KERNEL); -+#endif -+ kaiser_add_user_map_early(&debug_idt_descr, sizeof(debug_idt_descr), -+ __PAGE_KERNEL); -+ kaiser_add_user_map_early(&debug_idt_table, -+ sizeof(gate_desc) * NR_VECTORS, -+ __PAGE_KERNEL); - } - -+extern void unmap_pud_range_nofree(pgd_t *pgd, unsigned long start, unsigned long end); - // add a mapping to the shadow-mapping, and synchronize the mappings --void kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags) -+int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags) - { -- spin_lock(&shadow_table_lock); -- _kaiser_copy(addr, size, flags); -- spin_unlock(&shadow_table_lock); -+ return kaiser_add_user_map((const void *)addr, size, flags); - } - --extern void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end); - void kaiser_remove_mapping(unsigned long start, unsigned long size) - { -- pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(start)); -- spin_lock(&shadow_table_lock); -- do { -- unmap_pud_range(pgd, start, start + size); -- } while (pgd++ != native_get_shadow_pgd(pgd_offset_k(start + size))); -- spin_unlock(&shadow_table_lock); -+ unsigned long end = start + size; -+ unsigned long addr; -+ -+ for (addr = start; addr < end; addr += PGDIR_SIZE) { -+ pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(addr)); -+ /* -+ * unmap_p4d_range() handles > P4D_SIZE unmaps, -+ * so no need to trim 'end'. 
-+ */ -+ unmap_pud_range_nofree(pgd, addr, end); -+ } - } - #endif /* CONFIG_KAISER */ -diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c -index c17412f..73dcb0e1 100644 ---- a/arch/x86/mm/pageattr.c -+++ b/arch/x86/mm/pageattr.c -@@ -52,6 +52,7 @@ static DEFINE_SPINLOCK(cpa_lock); - #define CPA_FLUSHTLB 1 - #define CPA_ARRAY 2 - #define CPA_PAGES_ARRAY 4 -+#define CPA_FREE_PAGETABLES 8 - - #ifdef CONFIG_PROC_FS - static unsigned long direct_pages_count[PG_LEVEL_NUM]; -@@ -729,10 +730,13 @@ static int split_large_page(struct cpa_data *cpa, pte_t *kpte, - return 0; - } - --static bool try_to_free_pte_page(pte_t *pte) -+static bool try_to_free_pte_page(struct cpa_data *cpa, pte_t *pte) - { - int i; - -+ if (!(cpa->flags & CPA_FREE_PAGETABLES)) -+ return false; -+ - for (i = 0; i < PTRS_PER_PTE; i++) - if (!pte_none(pte[i])) - return false; -@@ -741,10 +745,13 @@ static bool try_to_free_pte_page(pte_t *pte) - return true; - } - --static bool try_to_free_pmd_page(pmd_t *pmd) -+static bool try_to_free_pmd_page(struct cpa_data *cpa, pmd_t *pmd) - { - int i; - -+ if (!(cpa->flags & CPA_FREE_PAGETABLES)) -+ return false; -+ - for (i = 0; i < PTRS_PER_PMD; i++) - if (!pmd_none(pmd[i])) - return false; -@@ -753,7 +760,9 @@ static bool try_to_free_pmd_page(pmd_t *pmd) - return true; - } - --static bool unmap_pte_range(pmd_t *pmd, unsigned long start, unsigned long end) -+static bool unmap_pte_range(struct cpa_data *cpa, pmd_t *pmd, -+ unsigned long start, -+ unsigned long end) - { - pte_t *pte = pte_offset_kernel(pmd, start); - -@@ -764,22 +773,23 @@ static bool unmap_pte_range(pmd_t *pmd, unsigned long start, unsigned long end) - pte++; - } - -- if (try_to_free_pte_page((pte_t *)pmd_page_vaddr(*pmd))) { -+ if (try_to_free_pte_page(cpa, (pte_t *)pmd_page_vaddr(*pmd))) { - pmd_clear(pmd); - return true; - } - return false; - } - --static void __unmap_pmd_range(pud_t *pud, pmd_t *pmd, -+static void __unmap_pmd_range(struct cpa_data *cpa, pud_t *pud, pmd_t *pmd, - unsigned long start, unsigned long end) - { -- if (unmap_pte_range(pmd, start, end)) -- if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud))) -+ if (unmap_pte_range(cpa, pmd, start, end)) -+ if (try_to_free_pmd_page(cpa, (pmd_t *)pud_page_vaddr(*pud))) - pud_clear(pud); - } - --static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end) -+static void unmap_pmd_range(struct cpa_data *cpa, pud_t *pud, -+ unsigned long start, unsigned long end) - { - pmd_t *pmd = pmd_offset(pud, start); - -@@ -790,7 +800,7 @@ static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end) - unsigned long next_page = (start + PMD_SIZE) & PMD_MASK; - unsigned long pre_end = min_t(unsigned long, end, next_page); - -- __unmap_pmd_range(pud, pmd, start, pre_end); -+ __unmap_pmd_range(cpa, pud, pmd, start, pre_end); - - start = pre_end; - pmd++; -@@ -803,7 +813,8 @@ static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end) - if (pmd_large(*pmd)) - pmd_clear(pmd); - else -- __unmap_pmd_range(pud, pmd, start, start + PMD_SIZE); -+ __unmap_pmd_range(cpa, pud, pmd, -+ start, start + PMD_SIZE); - - start += PMD_SIZE; - pmd++; -@@ -813,17 +824,19 @@ static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end) - * 4K leftovers? - */ - if (start < end) -- return __unmap_pmd_range(pud, pmd, start, end); -+ return __unmap_pmd_range(cpa, pud, pmd, start, end); - - /* - * Try again to free the PMD page if haven't succeeded above. 
- */ - if (!pud_none(*pud)) -- if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud))) -+ if (try_to_free_pmd_page(cpa, (pmd_t *)pud_page_vaddr(*pud))) - pud_clear(pud); - } - --void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end) -+static void __unmap_pud_range(struct cpa_data *cpa, pgd_t *pgd, -+ unsigned long start, -+ unsigned long end) - { - pud_t *pud = pud_offset(pgd, start); - -@@ -834,7 +847,7 @@ void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end) - unsigned long next_page = (start + PUD_SIZE) & PUD_MASK; - unsigned long pre_end = min_t(unsigned long, end, next_page); - -- unmap_pmd_range(pud, start, pre_end); -+ unmap_pmd_range(cpa, pud, start, pre_end); - - start = pre_end; - pud++; -@@ -848,7 +861,7 @@ void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end) - if (pud_large(*pud)) - pud_clear(pud); - else -- unmap_pmd_range(pud, start, start + PUD_SIZE); -+ unmap_pmd_range(cpa, pud, start, start + PUD_SIZE); - - start += PUD_SIZE; - pud++; -@@ -858,7 +871,7 @@ void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end) - * 2M leftovers? - */ - if (start < end) -- unmap_pmd_range(pud, start, end); -+ unmap_pmd_range(cpa, pud, start, end); - - /* - * No need to try to free the PUD page because we'll free it in -@@ -866,6 +879,24 @@ void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end) - */ - } - -+static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end) -+{ -+ struct cpa_data cpa = { -+ .flags = CPA_FREE_PAGETABLES, -+ }; -+ -+ __unmap_pud_range(&cpa, pgd, start, end); -+} -+ -+void unmap_pud_range_nofree(pgd_t *pgd, unsigned long start, unsigned long end) -+{ -+ struct cpa_data cpa = { -+ .flags = 0, -+ }; -+ -+ __unmap_pud_range(&cpa, pgd, start, end); -+} -+ - static int alloc_pte_page(pmd_t *pmd) - { - pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK); -diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c -index 27d218b..352fd01 100644 ---- a/arch/x86/mm/pgtable.c -+++ b/arch/x86/mm/pgtable.c -@@ -344,40 +344,26 @@ static inline void _pgd_free(pgd_t *pgd) - kmem_cache_free(pgd_cache, pgd); - } - #else --static inline pgd_t *_pgd_alloc(void) --{ -+ - #ifdef CONFIG_KAISER -- // Instead of one PML4, we aquire two PML4s and, thus, an 8kb-aligned memory -- // block. Therefore, we have to allocate at least 3 pages. However, the -- // __get_free_pages returns us 4 pages. Hence, we store the base pointer at -- // the beginning of the page of our 8kb-aligned memory block in order to -- // correctly free it afterwars. -- -- unsigned long pages = __get_free_pages(PGALLOC_GFP, get_order(4*PAGE_SIZE)); -- -- if(native_get_normal_pgd((pgd_t*) pages) == (pgd_t*) pages) -- { -- *((unsigned long*)(pages + 2 * PAGE_SIZE)) = pages; -- return (pgd_t *) pages; -- } -- else -- { -- *((unsigned long*)(pages + 3 * PAGE_SIZE)) = pages; -- return (pgd_t *) (pages + PAGE_SIZE); -- } -+/* -+ * Instead of one pmd, we aquire two pmds. Being order-1, it is -+ * both 8k in size and 8k-aligned. That lets us just flip bit 12 -+ * in a pointer to swap between the two 4k halves. 
-+ */ -+#define PGD_ALLOCATION_ORDER 1 - #else -- return (pgd_t *)__get_free_page(PGALLOC_GFP); -+#define PGD_ALLOCATION_ORDER 0 - #endif -+ -+static inline pgd_t *_pgd_alloc(void) -+{ -+ return (pgd_t *)__get_free_pages(PGALLOC_GFP, PGD_ALLOCATION_ORDER); - } - - static inline void _pgd_free(pgd_t *pgd) - { --#ifdef CONFIG_KAISER -- unsigned long pages = *((unsigned long*) ((char*) pgd + 2 * PAGE_SIZE)); -- free_pages(pages, get_order(4*PAGE_SIZE)); --#else -- free_page((unsigned long)pgd); --#endif -+ free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER); - } - #endif /* CONFIG_X86_PAE */ - -diff --git a/include/linux/kaiser.h b/include/linux/kaiser.h -new file mode 100644 -index 0000000..9db5433 ---- /dev/null -+++ b/include/linux/kaiser.h -@@ -0,0 +1,26 @@ -+#ifndef _INCLUDE_KAISER_H -+#define _INCLUDE_KAISER_H -+ -+#ifdef CONFIG_KAISER -+#include <asm/kaiser.h> -+#else -+ -+/* -+ * These stubs are used whenever CONFIG_KAISER is off, which -+ * includes architectures that support KAISER, but have it -+ * disabled. -+ */ -+ -+static inline void kaiser_init(void) -+{ -+} -+static inline void kaiser_remove_mapping(unsigned long start, unsigned long size) -+{ -+} -+static inline int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags) -+{ -+ return 0; -+} -+ -+#endif /* !CONFIG_KAISER */ -+#endif /* _INCLUDE_KAISER_H */ -diff --git a/kernel/fork.c b/kernel/fork.c -index 61748d1..7ba50f1 100644 ---- a/kernel/fork.c -+++ b/kernel/fork.c -@@ -58,6 +58,7 @@ - #include <linux/tsacct_kern.h> - #include <linux/cn_proc.h> - #include <linux/freezer.h> -+#include <linux/kaiser.h> - #include <linux/delayacct.h> - #include <linux/taskstats_kern.h> - #include <linux/random.h> -@@ -472,7 +473,6 @@ void set_task_stack_end_magic(struct task_struct *tsk) - *stackend = STACK_END_MAGIC; /* for overflow detection */ - } - --extern void kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags); - static struct task_struct *dup_task_struct(struct task_struct *orig, int node) - { - struct task_struct *tsk; -@@ -500,9 +500,10 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) - * functions again. - */ - tsk->stack = stack; --#ifdef CONFIG_KAISER -- kaiser_add_mapping((unsigned long)tsk->stack, THREAD_SIZE, __PAGE_KERNEL); --#endif -+ -+ err= kaiser_add_mapping((unsigned long)tsk->stack, THREAD_SIZE, __PAGE_KERNEL); -+ if (err) -+ goto free_stack; - #ifdef CONFIG_VMAP_STACK - tsk->stack_vm_area = stack_vm_area; - #endif -diff --git a/security/Kconfig b/security/Kconfig -index f515ac3..334d2e8 100644 ---- a/security/Kconfig -+++ b/security/Kconfig -@@ -32,12 +32,17 @@ config SECURITY - If you are unsure how to answer this question, answer N. - config KAISER - bool "Remove the kernel mapping in user mode" -+ default y - depends on X86_64 - depends on !PARAVIRT - help - This enforces a strict kernel and user space isolation in order to close - hardware side channels on kernel address information. - -+config KAISER_REAL_SWITCH -+ bool "KAISER: actually switch page tables" -+ default y -+ - config SECURITYFS - bool "Enable the securityfs filesystem" - help --- -2.7.4 - |