Diffstat (limited to 'common/recipes-kernel/linux/linux-yocto-4.9.21/0005-kaiser-merged-update.patch')
-rw-r--r-- | common/recipes-kernel/linux/linux-yocto-4.9.21/0005-kaiser-merged-update.patch | 1327
1 file changed, 1327 insertions, 0 deletions
diff --git a/common/recipes-kernel/linux/linux-yocto-4.9.21/0005-kaiser-merged-update.patch b/common/recipes-kernel/linux/linux-yocto-4.9.21/0005-kaiser-merged-update.patch new file mode 100644 index 00000000..52bf5963 --- /dev/null +++ b/common/recipes-kernel/linux/linux-yocto-4.9.21/0005-kaiser-merged-update.patch @@ -0,0 +1,1327 @@ +From 48523e23d22e5a66009d404caca4721b84cde67a Mon Sep 17 00:00:00 2001 +From: Dave Hansen <dave.hansen@linux.intel.com> +Date: Wed, 30 Aug 2017 16:23:00 -0700 +Subject: [PATCH 005/102] kaiser: merged update + +Merged fixes and cleanups, rebased to 4.9.51 tree (no 5-level paging). + +Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com> +Signed-off-by: Hugh Dickins <hughd@google.com> +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + arch/x86/entry/entry_64.S | 105 ++++++++++-- + arch/x86/include/asm/kaiser.h | 43 +++-- + arch/x86/include/asm/pgtable.h | 18 +- + arch/x86/include/asm/pgtable_64.h | 48 +++++- + arch/x86/include/asm/pgtable_types.h | 6 +- + arch/x86/kernel/espfix_64.c | 13 +- + arch/x86/kernel/head_64.S | 19 ++- + arch/x86/kernel/ldt.c | 27 ++- + arch/x86/kernel/tracepoint.c | 2 + + arch/x86/mm/kaiser.c | 313 +++++++++++++++++++++++++---------- + arch/x86/mm/pageattr.c | 63 +++++-- + arch/x86/mm/pgtable.c | 40 ++--- + include/linux/kaiser.h | 26 +++ + kernel/fork.c | 9 +- + security/Kconfig | 5 + + 15 files changed, 549 insertions(+), 188 deletions(-) + create mode 100644 include/linux/kaiser.h + +diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S +index 6c880dc..d84e3a7 100644 +--- a/arch/x86/entry/entry_64.S ++++ b/arch/x86/entry/entry_64.S +@@ -230,6 +230,13 @@ entry_SYSCALL_64_fastpath: + movq RIP(%rsp), %rcx + movq EFLAGS(%rsp), %r11 + RESTORE_C_REGS_EXCEPT_RCX_R11 ++ /* ++ * This opens a window where we have a user CR3, but are ++ * running in the kernel. This makes using the CS ++ * register useless for telling whether or not we need to ++ * switch CR3 in NMIs. Normal interrupts are OK because ++ * they are off here. ++ */ + SWITCH_USER_CR3 + movq RSP(%rsp), %rsp + USERGS_SYSRET64 +@@ -326,11 +333,25 @@ return_from_SYSCALL_64: + syscall_return_via_sysret: + /* rcx and r11 are already restored (see code above) */ + RESTORE_C_REGS_EXCEPT_RCX_R11 ++ /* ++ * This opens a window where we have a user CR3, but are ++ * running in the kernel. This makes using the CS ++ * register useless for telling whether or not we need to ++ * switch CR3 in NMIs. Normal interrupts are OK because ++ * they are off here. ++ */ + SWITCH_USER_CR3 + movq RSP(%rsp), %rsp + USERGS_SYSRET64 + + opportunistic_sysret_failed: ++ /* ++ * This opens a window where we have a user CR3, but are ++ * running in the kernel. This makes using the CS ++ * register useless for telling whether or not we need to ++ * switch CR3 in NMIs. Normal interrupts are OK because ++ * they are off here. ++ */ + SWITCH_USER_CR3 + SWAPGS + jmp restore_c_regs_and_iret +@@ -1087,6 +1108,13 @@ ENTRY(error_entry) + cld + SAVE_C_REGS 8 + SAVE_EXTRA_REGS 8 ++ /* ++ * error_entry() always returns with a kernel gsbase and ++ * CR3. We must also have a kernel CR3/gsbase before ++ * calling TRACE_IRQS_*. Just unconditionally switch to ++ * the kernel CR3 here. ++ */ ++ SWITCH_KERNEL_CR3 + xorl %ebx, %ebx + testb $3, CS+8(%rsp) + jz .Lerror_kernelspace +@@ -1096,7 +1124,6 @@ ENTRY(error_entry) + * from user mode due to an IRET fault. 
+ */ + SWAPGS +- SWITCH_KERNEL_CR3 + + .Lerror_entry_from_usermode_after_swapgs: + /* +@@ -1148,7 +1175,6 @@ ENTRY(error_entry) + * Switch to kernel gsbase: + */ + SWAPGS +- SWITCH_KERNEL_CR3 + + /* + * Pretend that the exception came from user mode: set up pt_regs +@@ -1247,7 +1273,10 @@ ENTRY(nmi) + */ + + SWAPGS_UNSAFE_STACK +- SWITCH_KERNEL_CR3_NO_STACK ++ /* ++ * percpu variables are mapped with user CR3, so no need ++ * to switch CR3 here. ++ */ + cld + movq %rsp, %rdx + movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp +@@ -1281,14 +1310,33 @@ ENTRY(nmi) + + movq %rsp, %rdi + movq $-1, %rsi ++#ifdef CONFIG_KAISER ++ /* Unconditionally use kernel CR3 for do_nmi() */ ++ /* %rax is saved above, so OK to clobber here */ ++ movq %cr3, %rax ++ pushq %rax ++#ifdef CONFIG_KAISER_REAL_SWITCH ++ andq $(~0x1000), %rax ++#endif ++ movq %rax, %cr3 ++#endif + call do_nmi ++ /* ++ * Unconditionally restore CR3. I know we return to ++ * kernel code that needs user CR3, but do we ever return ++ * to "user mode" where we need the kernel CR3? ++ */ ++#ifdef CONFIG_KAISER ++ popq %rax ++ mov %rax, %cr3 ++#endif + + /* + * Return back to user mode. We must *not* do the normal exit +- * work, because we don't want to enable interrupts. Fortunately, +- * do_nmi doesn't modify pt_regs. ++ * work, because we don't want to enable interrupts. Do not ++ * switch to user CR3: we might be going back to kernel code ++ * that had a user CR3 set. + */ +- SWITCH_USER_CR3 + SWAPGS + jmp restore_c_regs_and_iret + +@@ -1484,23 +1532,54 @@ end_repeat_nmi: + ALLOC_PT_GPREGS_ON_STACK + + /* +- * Use paranoid_entry to handle SWAPGS, but no need to use paranoid_exit +- * as we should not be calling schedule in NMI context. +- * Even with normal interrupts enabled. An NMI should not be +- * setting NEED_RESCHED or anything that normal interrupts and +- * exceptions might do. ++ * Use the same approach as paranoid_entry to handle SWAPGS, but ++ * without CR3 handling since we do that differently in NMIs. No ++ * need to use paranoid_exit as we should not be calling schedule ++ * in NMI context. Even with normal interrupts enabled. An NMI ++ * should not be setting NEED_RESCHED or anything that normal ++ * interrupts and exceptions might do. + */ +- call paranoid_entry ++ cld ++ SAVE_C_REGS ++ SAVE_EXTRA_REGS ++ movl $1, %ebx ++ movl $MSR_GS_BASE, %ecx ++ rdmsr ++ testl %edx, %edx ++ js 1f /* negative -> in kernel */ ++ SWAPGS ++ xorl %ebx, %ebx ++1: ++#ifdef CONFIG_KAISER ++ /* Unconditionally use kernel CR3 for do_nmi() */ ++ /* %rax is saved above, so OK to clobber here */ ++ movq %cr3, %rax ++ pushq %rax ++#ifdef CONFIG_KAISER_REAL_SWITCH ++ andq $(~0x1000), %rax ++#endif ++ movq %rax, %cr3 ++#endif + + /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */ + movq %rsp, %rdi ++ addq $8, %rdi /* point %rdi at ptregs, fixed up for CR3 */ + movq $-1, %rsi + call do_nmi ++ /* ++ * Unconditionally restore CR3. We might be returning to ++ * kernel code that needs user CR3, like just just before ++ * a sysret. ++ */ ++#ifdef CONFIG_KAISER ++ popq %rax ++ mov %rax, %cr3 ++#endif + + testl %ebx, %ebx /* swapgs needed? 
*/ + jnz nmi_restore + nmi_swapgs: +- SWITCH_USER_CR3_NO_STACK ++ /* We fixed up CR3 above, so no need to switch it here */ + SWAPGS_UNSAFE_STACK + nmi_restore: + RESTORE_EXTRA_REGS +diff --git a/arch/x86/include/asm/kaiser.h b/arch/x86/include/asm/kaiser.h +index 63ee830..0703f48 100644 +--- a/arch/x86/include/asm/kaiser.h ++++ b/arch/x86/include/asm/kaiser.h +@@ -16,13 +16,17 @@ + + .macro _SWITCH_TO_KERNEL_CR3 reg + movq %cr3, \reg ++#ifdef CONFIG_KAISER_REAL_SWITCH + andq $(~0x1000), \reg ++#endif + movq \reg, %cr3 + .endm + + .macro _SWITCH_TO_USER_CR3 reg + movq %cr3, \reg ++#ifdef CONFIG_KAISER_REAL_SWITCH + orq $(0x1000), \reg ++#endif + movq \reg, %cr3 + .endm + +@@ -65,48 +69,53 @@ movq PER_CPU_VAR(unsafe_stack_register_backup), %rax + .endm + + #endif /* CONFIG_KAISER */ ++ + #else /* __ASSEMBLY__ */ + + + #ifdef CONFIG_KAISER +-// Upon kernel/user mode switch, it may happen that +-// the address space has to be switched before the registers have been stored. +-// To change the address space, another register is needed. +-// A register therefore has to be stored/restored. +-// +-DECLARE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup); ++/* ++ * Upon kernel/user mode switch, it may happen that the address ++ * space has to be switched before the registers have been ++ * stored. To change the address space, another register is ++ * needed. A register therefore has to be stored/restored. ++*/ + +-#endif /* CONFIG_KAISER */ ++DECLARE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup); + + /** +- * shadowmem_add_mapping - map a virtual memory part to the shadow mapping ++ * kaiser_add_mapping - map a virtual memory part to the shadow (user) mapping + * @addr: the start address of the range + * @size: the size of the range + * @flags: The mapping flags of the pages + * +- * the mapping is done on a global scope, so no bigger synchronization has to be done. +- * the pages have to be manually unmapped again when they are not needed any longer. ++ * The mapping is done on a global scope, so no bigger ++ * synchronization has to be done. the pages have to be ++ * manually unmapped again when they are not needed any longer. + */ +-extern void kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags); ++extern int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags); + + + /** +- * shadowmem_remove_mapping - unmap a virtual memory part of the shadow mapping ++ * kaiser_remove_mapping - unmap a virtual memory part of the shadow mapping + * @addr: the start address of the range + * @size: the size of the range + */ + extern void kaiser_remove_mapping(unsigned long start, unsigned long size); + + /** +- * shadowmem_initialize_mapping - Initalize the shadow mapping ++ * kaiser_initialize_mapping - Initalize the shadow mapping + * +- * most parts of the shadow mapping can be mapped upon boot time. +- * only the thread stacks have to be mapped on runtime. +- * the mapped regions are not unmapped at all. ++ * Most parts of the shadow mapping can be mapped upon boot ++ * time. Only per-process things like the thread stacks ++ * or a new LDT have to be mapped at runtime. These boot- ++ * time mappings are permanent and nevertunmapped. 
+ */ + extern void kaiser_init(void); + +-#endif ++#endif /* CONFIG_KAISER */ ++ ++#endif /* __ASSEMBLY */ + + + +diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h +index 4b479c9..1cee98e 100644 +--- a/arch/x86/include/asm/pgtable.h ++++ b/arch/x86/include/asm/pgtable.h +@@ -690,7 +690,17 @@ static inline pud_t *pud_offset(pgd_t *pgd, unsigned long address) + + static inline int pgd_bad(pgd_t pgd) + { +- return (pgd_flags(pgd) & ~_PAGE_USER) != _KERNPG_TABLE; ++ pgdval_t ignore_flags = _PAGE_USER; ++ /* ++ * We set NX on KAISER pgds that map userspace memory so ++ * that userspace can not meaningfully use the kernel ++ * page table by accident; it will fault on the first ++ * instruction it tries to run. See native_set_pgd(). ++ */ ++ if (IS_ENABLED(CONFIG_KAISER)) ++ ignore_flags |= _PAGE_NX; ++ ++ return (pgd_flags(pgd) & ~ignore_flags) != _KERNPG_TABLE; + } + + static inline int pgd_none(pgd_t pgd) +@@ -905,8 +915,10 @@ static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count) + { + memcpy(dst, src, count * sizeof(pgd_t)); + #ifdef CONFIG_KAISER +- // clone the shadow pgd part as well +- memcpy(native_get_shadow_pgd(dst), native_get_shadow_pgd(src), count * sizeof(pgd_t)); ++ /* Clone the shadow pgd part as well */ ++ memcpy(native_get_shadow_pgd(dst), ++ native_get_shadow_pgd(src), ++ count * sizeof(pgd_t)); + #endif + } + +diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h +index e6ea39f..000265c 100644 +--- a/arch/x86/include/asm/pgtable_64.h ++++ b/arch/x86/include/asm/pgtable_64.h +@@ -107,26 +107,58 @@ static inline void native_pud_clear(pud_t *pud) + } + + #ifdef CONFIG_KAISER +-static inline pgd_t * native_get_shadow_pgd(pgd_t *pgdp) { ++static inline pgd_t * native_get_shadow_pgd(pgd_t *pgdp) ++{ + return (pgd_t *)(void*)((unsigned long)(void*)pgdp | (unsigned long)PAGE_SIZE); + } + +-static inline pgd_t * native_get_normal_pgd(pgd_t *pgdp) { ++static inline pgd_t * native_get_normal_pgd(pgd_t *pgdp) ++{ + return (pgd_t *)(void*)((unsigned long)(void*)pgdp & ~(unsigned long)PAGE_SIZE); + } ++#else ++static inline pgd_t * native_get_shadow_pgd(pgd_t *pgdp) ++{ ++ BUILD_BUG_ON(1); ++ return NULL; ++} ++static inline pgd_t * native_get_normal_pgd(pgd_t *pgdp) ++{ ++ return pgdp; ++} + #endif /* CONFIG_KAISER */ + ++/* ++ * Page table pages are page-aligned. The lower half of the top ++ * level is used for userspace and the top half for the kernel. ++ * This returns true for user pages that need to get copied into ++ * both the user and kernel copies of the page tables, and false ++ * for kernel pages that should only be in the kernel copy. ++ */ ++static inline bool is_userspace_pgd(void *__ptr) ++{ ++ unsigned long ptr = (unsigned long)__ptr; ++ ++ return ((ptr % PAGE_SIZE) < (PAGE_SIZE / 2)); ++} ++ + static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd) + { + #ifdef CONFIG_KAISER +- // We know that a pgd is page aligned. +- // Therefore the lower indices have to be mapped to user space. +- // These pages are mapped to the shadow mapping. +- if ((((unsigned long)pgdp) % PAGE_SIZE) < (PAGE_SIZE / 2)) { ++ pteval_t extra_kern_pgd_flags = 0; ++ /* Do we need to also populate the shadow pgd? */ ++ if (is_userspace_pgd(pgdp)) { + native_get_shadow_pgd(pgdp)->pgd = pgd.pgd; ++ /* ++ * Even if the entry is *mapping* userspace, ensure ++ * that userspace can not use it. This way, if we ++ * get out to userspace running on the kernel CR3, ++ * userspace will crash instead of running. 
++ */ ++ extra_kern_pgd_flags = _PAGE_NX; + } +- +- pgdp->pgd = pgd.pgd & ~_PAGE_USER; ++ pgdp->pgd = pgd.pgd; ++ pgdp->pgd |= extra_kern_pgd_flags; + #else /* CONFIG_KAISER */ + *pgdp = pgd; + #endif +diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h +index 00fecbb..8bc8d02 100644 +--- a/arch/x86/include/asm/pgtable_types.h ++++ b/arch/x86/include/asm/pgtable_types.h +@@ -48,7 +48,7 @@ + #ifdef CONFIG_KAISER + #define _PAGE_GLOBAL (_AT(pteval_t, 0)) + #else +-#define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL) ++#define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL) + #endif + #define _PAGE_SOFTW1 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW1) + #define _PAGE_SOFTW2 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW2) +@@ -123,11 +123,7 @@ + #define _PAGE_DEVMAP (_AT(pteval_t, 0)) + #endif + +-#ifdef CONFIG_KAISER +-#define _PAGE_PROTNONE (_AT(pteval_t, 0)) +-#else + #define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE) +-#endif + + #define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \ + _PAGE_ACCESSED | _PAGE_DIRTY) +diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c +index 9ff875a..560c2fd 100644 +--- a/arch/x86/kernel/espfix_64.c ++++ b/arch/x86/kernel/espfix_64.c +@@ -127,11 +127,14 @@ void __init init_espfix_bsp(void) + /* Install the espfix pud into the kernel page directory */ + pgd_p = &init_level4_pgt[pgd_index(ESPFIX_BASE_ADDR)]; + pgd_populate(&init_mm, pgd_p, (pud_t *)espfix_pud_page); +-#ifdef CONFIG_KAISER +- // add the esp stack pud to the shadow mapping here. +- // This can be done directly, because the fixup stack has its own pud +- set_pgd(native_get_shadow_pgd(pgd_p), __pgd(_PAGE_TABLE | __pa((pud_t *)espfix_pud_page))); +-#endif ++ /* ++ * Just copy the top-level PGD that is mapping the espfix ++ * area to ensure it is mapped into the shadow user page ++ * tables. ++ */ ++ if (IS_ENABLED(CONFIG_KAISER)) ++ set_pgd(native_get_shadow_pgd(pgd_p), ++ __pgd(_KERNPG_TABLE | __pa((pud_t *)espfix_pud_page))); + + /* Randomize the locations */ + init_espfix_random(); +diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S +index 9e849b5..5775379 100644 +--- a/arch/x86/kernel/head_64.S ++++ b/arch/x86/kernel/head_64.S +@@ -406,11 +406,24 @@ GLOBAL(early_recursion_flag) + GLOBAL(name) + + #ifdef CONFIG_KAISER ++/* ++ * Each PGD needs to be 8k long and 8k aligned. We do not ++ * ever go out to userspace with these, so we do not ++ * strictly *need* the second page, but this allows us to ++ * have a single set_pgd() implementation that does not ++ * need to worry about whether it has 4k or 8k to work ++ * with. 
++ * ++ * This ensures PGDs are 8k long: ++ */ ++#define KAISER_USER_PGD_FILL 512 ++/* This ensures they are 8k-aligned: */ + #define NEXT_PGD_PAGE(name) \ + .balign 2 * PAGE_SIZE; \ + GLOBAL(name) + #else + #define NEXT_PGD_PAGE(name) NEXT_PAGE(name) ++#define KAISER_USER_PGD_FILL 0 + #endif + + /* Automate the creation of 1 to 1 mapping pmd entries */ +@@ -425,6 +438,7 @@ GLOBAL(name) + NEXT_PGD_PAGE(early_level4_pgt) + .fill 511,8,0 + .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE ++ .fill KAISER_USER_PGD_FILL,8,0 + + NEXT_PAGE(early_dynamic_pgts) + .fill 512*EARLY_DYNAMIC_PAGE_TABLES,8,0 +@@ -433,7 +447,8 @@ NEXT_PAGE(early_dynamic_pgts) + + #ifndef CONFIG_XEN + NEXT_PGD_PAGE(init_level4_pgt) +- .fill 2*512,8,0 ++ .fill 512,8,0 ++ .fill KAISER_USER_PGD_FILL,8,0 + #else + NEXT_PGD_PAGE(init_level4_pgt) + .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE +@@ -442,6 +457,7 @@ NEXT_PGD_PAGE(init_level4_pgt) + .org init_level4_pgt + L4_START_KERNEL*8, 0 + /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ + .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE ++ .fill KAISER_USER_PGD_FILL,8,0 + + NEXT_PAGE(level3_ident_pgt) + .quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE +@@ -452,6 +468,7 @@ NEXT_PAGE(level2_ident_pgt) + */ + PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD) + #endif ++ .fill KAISER_USER_PGD_FILL,8,0 + + NEXT_PAGE(level3_kernel_pgt) + .fill L3_START_KERNEL,8,0 +diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c +index 6707039..3c2d55b 100644 +--- a/arch/x86/kernel/ldt.c ++++ b/arch/x86/kernel/ldt.c +@@ -17,6 +17,7 @@ + #include <linux/uaccess.h> + + #include <asm/ldt.h> ++#include <asm/kaiser.h> + #include <asm/desc.h> + #include <asm/mmu_context.h> + #include <asm/syscalls.h> +@@ -33,11 +34,21 @@ static void flush_ldt(void *current_mm) + set_ldt(pc->ldt->entries, pc->ldt->size); + } + ++static void __free_ldt_struct(struct ldt_struct *ldt) ++{ ++ if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE) ++ vfree(ldt->entries); ++ else ++ free_page((unsigned long)ldt->entries); ++ kfree(ldt); ++} ++ + /* The caller must call finalize_ldt_struct on the result. LDT starts zeroed. 
*/ + static struct ldt_struct *alloc_ldt_struct(int size) + { + struct ldt_struct *new_ldt; + int alloc_size; ++ int ret = 0; + + if (size > LDT_ENTRIES) + return NULL; +@@ -65,6 +76,14 @@ static struct ldt_struct *alloc_ldt_struct(int size) + return NULL; + } + ++ // FIXME: make kaiser_add_mapping() return an error code ++ // when it fails ++ kaiser_add_mapping((unsigned long)new_ldt->entries, alloc_size, ++ __PAGE_KERNEL); ++ if (ret) { ++ __free_ldt_struct(new_ldt); ++ return NULL; ++ } + new_ldt->size = size; + return new_ldt; + } +@@ -91,12 +110,10 @@ static void free_ldt_struct(struct ldt_struct *ldt) + if (likely(!ldt)) + return; + ++ kaiser_remove_mapping((unsigned long)ldt->entries, ++ ldt->size * LDT_ENTRY_SIZE); + paravirt_free_ldt(ldt->entries, ldt->size); +- if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE) +- vfree(ldt->entries); +- else +- free_page((unsigned long)ldt->entries); +- kfree(ldt); ++ __free_ldt_struct(ldt); + } + + /* +diff --git a/arch/x86/kernel/tracepoint.c b/arch/x86/kernel/tracepoint.c +index 1c113db..2bb5ee4 100644 +--- a/arch/x86/kernel/tracepoint.c ++++ b/arch/x86/kernel/tracepoint.c +@@ -9,10 +9,12 @@ + #include <linux/atomic.h> + + atomic_t trace_idt_ctr = ATOMIC_INIT(0); ++__aligned(PAGE_SIZE) + struct desc_ptr trace_idt_descr = { NR_VECTORS * 16 - 1, + (unsigned long) trace_idt_table }; + + /* No need to be aligned, but done to keep all IDTs defined the same way. */ ++__aligned(PAGE_SIZE) + gate_desc trace_idt_table[NR_VECTORS] __page_aligned_bss; + + static int trace_irq_vector_refcount; +diff --git a/arch/x86/mm/kaiser.c b/arch/x86/mm/kaiser.c +index cf1bb92..7270a29 100644 +--- a/arch/x86/mm/kaiser.c ++++ b/arch/x86/mm/kaiser.c +@@ -1,160 +1,305 @@ +- +- ++#include <linux/bug.h> + #include <linux/kernel.h> + #include <linux/errno.h> + #include <linux/string.h> + #include <linux/types.h> + #include <linux/bug.h> + #include <linux/init.h> ++#include <linux/interrupt.h> + #include <linux/spinlock.h> + #include <linux/mm.h> +- + #include <linux/uaccess.h> ++ ++#include <asm/kaiser.h> + #include <asm/pgtable.h> + #include <asm/pgalloc.h> + #include <asm/desc.h> + #ifdef CONFIG_KAISER + + __visible DEFINE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup); ++/* ++ * At runtime, the only things we map are some things for CPU ++ * hotplug, and stacks for new processes. No two CPUs will ever ++ * be populating the same addresses, so we only need to ensure ++ * that we protect between two CPUs trying to allocate and ++ * populate the same page table page. ++ * ++ * Only take this lock when doing a set_p[4um]d(), but it is not ++ * needed for doing a set_pte(). We assume that only the *owner* ++ * of a given allocation will be doing this for _their_ ++ * allocation. ++ * ++ * This ensures that once a system has been running for a while ++ * and there have been stacks all over and these page tables ++ * are fully populated, there will be no further acquisitions of ++ * this lock. ++ */ ++static DEFINE_SPINLOCK(shadow_table_allocation_lock); + +-/** +- * Get the real ppn from a address in kernel mapping. +- * @param address The virtual adrress +- * @return the physical address ++/* ++ * Returns -1 on error. 
+ */ +-static inline unsigned long get_pa_from_mapping (unsigned long address) ++static inline unsigned long get_pa_from_mapping(unsigned long vaddr) + { + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + +- pgd = pgd_offset_k(address); +- BUG_ON(pgd_none(*pgd) || pgd_large(*pgd)); +- +- pud = pud_offset(pgd, address); +- BUG_ON(pud_none(*pud)); ++ pgd = pgd_offset_k(vaddr); ++ /* ++ * We made all the kernel PGDs present in kaiser_init(). ++ * We expect them to stay that way. ++ */ ++ BUG_ON(pgd_none(*pgd)); ++ /* ++ * PGDs are either 512GB or 128TB on all x86_64 ++ * configurations. We don't handle these. ++ */ ++ BUG_ON(pgd_large(*pgd)); + +- if (pud_large(*pud)) { +- return (pud_pfn(*pud) << PAGE_SHIFT) | (address & ~PUD_PAGE_MASK); ++ pud = pud_offset(pgd, vaddr); ++ if (pud_none(*pud)) { ++ WARN_ON_ONCE(1); ++ return -1; + } + +- pmd = pmd_offset(pud, address); +- BUG_ON(pmd_none(*pmd)); ++ if (pud_large(*pud)) ++ return (pud_pfn(*pud) << PAGE_SHIFT) | (vaddr & ~PUD_PAGE_MASK); + +- if (pmd_large(*pmd)) { +- return (pmd_pfn(*pmd) << PAGE_SHIFT) | (address & ~PMD_PAGE_MASK); ++ pmd = pmd_offset(pud, vaddr); ++ if (pmd_none(*pmd)) { ++ WARN_ON_ONCE(1); ++ return -1; + } + +- pte = pte_offset_kernel(pmd, address); +- BUG_ON(pte_none(*pte)); ++ if (pmd_large(*pmd)) ++ return (pmd_pfn(*pmd) << PAGE_SHIFT) | (vaddr & ~PMD_PAGE_MASK); + +- return (pte_pfn(*pte) << PAGE_SHIFT) | (address & ~PAGE_MASK); ++ pte = pte_offset_kernel(pmd, vaddr); ++ if (pte_none(*pte)) { ++ WARN_ON_ONCE(1); ++ return -1; ++ } ++ ++ return (pte_pfn(*pte) << PAGE_SHIFT) | (vaddr & ~PAGE_MASK); + } + +-void _kaiser_copy (unsigned long start_addr, unsigned long size, +- unsigned long flags) ++/* ++ * This is a relatively normal page table walk, except that it ++ * also tries to allocate page tables pages along the way. ++ * ++ * Returns a pointer to a PTE on success, or NULL on failure. 
++ */ ++static pte_t *kaiser_pagetable_walk(unsigned long address, bool is_atomic) + { +- pgd_t *pgd; +- pud_t *pud; + pmd_t *pmd; +- pte_t *pte; +- unsigned long address; +- unsigned long end_addr = start_addr + size; +- unsigned long target_address; ++ pud_t *pud; ++ pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(address)); ++ gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO); + +- for (address = PAGE_ALIGN(start_addr - (PAGE_SIZE - 1)); +- address < PAGE_ALIGN(end_addr); address += PAGE_SIZE) { +- target_address = get_pa_from_mapping(address); ++ might_sleep(); ++ if (is_atomic) { ++ gfp &= ~GFP_KERNEL; ++ gfp |= __GFP_HIGH | __GFP_ATOMIC; ++ } + +- pgd = native_get_shadow_pgd(pgd_offset_k(address)); ++ if (pgd_none(*pgd)) { ++ WARN_ONCE(1, "All shadow pgds should have been populated"); ++ return NULL; ++ } ++ BUILD_BUG_ON(pgd_large(*pgd) != 0); + +- BUG_ON(pgd_none(*pgd) && "All shadow pgds should be mapped at this time\n"); +- BUG_ON(pgd_large(*pgd)); ++ pud = pud_offset(pgd, address); ++ /* The shadow page tables do not use large mappings: */ ++ if (pud_large(*pud)) { ++ WARN_ON(1); ++ return NULL; ++ } ++ if (pud_none(*pud)) { ++ unsigned long new_pmd_page = __get_free_page(gfp); ++ if (!new_pmd_page) ++ return NULL; ++ spin_lock(&shadow_table_allocation_lock); ++ if (pud_none(*pud)) ++ set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page))); ++ else ++ free_page(new_pmd_page); ++ spin_unlock(&shadow_table_allocation_lock); ++ } + +- pud = pud_offset(pgd, address); +- if (pud_none(*pud)) { +- set_pud(pud, __pud(_PAGE_TABLE | __pa(pmd_alloc_one(0, address)))); +- } +- BUG_ON(pud_large(*pud)); ++ pmd = pmd_offset(pud, address); ++ /* The shadow page tables do not use large mappings: */ ++ if (pmd_large(*pmd)) { ++ WARN_ON(1); ++ return NULL; ++ } ++ if (pmd_none(*pmd)) { ++ unsigned long new_pte_page = __get_free_page(gfp); ++ if (!new_pte_page) ++ return NULL; ++ spin_lock(&shadow_table_allocation_lock); ++ if (pmd_none(*pmd)) ++ set_pmd(pmd, __pmd(_KERNPG_TABLE | __pa(new_pte_page))); ++ else ++ free_page(new_pte_page); ++ spin_unlock(&shadow_table_allocation_lock); ++ } + +- pmd = pmd_offset(pud, address); +- if (pmd_none(*pmd)) { +- set_pmd(pmd, __pmd(_PAGE_TABLE | __pa(pte_alloc_one_kernel(0, address)))); +- } +- BUG_ON(pmd_large(*pmd)); ++ return pte_offset_kernel(pmd, address); ++} + +- pte = pte_offset_kernel(pmd, address); ++int kaiser_add_user_map(const void *__start_addr, unsigned long size, ++ unsigned long flags) ++{ ++ int ret = 0; ++ pte_t *pte; ++ unsigned long start_addr = (unsigned long )__start_addr; ++ unsigned long address = start_addr & PAGE_MASK; ++ unsigned long end_addr = PAGE_ALIGN(start_addr + size); ++ unsigned long target_address; ++ ++ for (;address < end_addr; address += PAGE_SIZE) { ++ target_address = get_pa_from_mapping(address); ++ if (target_address == -1) { ++ ret = -EIO; ++ break; ++ } ++ pte = kaiser_pagetable_walk(address, false); + if (pte_none(*pte)) { + set_pte(pte, __pte(flags | target_address)); + } else { +- BUG_ON(__pa(pte_page(*pte)) != target_address); ++ pte_t tmp; ++ set_pte(&tmp, __pte(flags | target_address)); ++ WARN_ON_ONCE(!pte_same(*pte, tmp)); + } + } ++ return ret; ++} ++ ++static int kaiser_add_user_map_ptrs(const void *start, const void *end, unsigned long flags) ++{ ++ unsigned long size = end - start; ++ ++ return kaiser_add_user_map(start, size, flags); + } + +-// at first, add a pmd for every pgd entry in the shadowmem-kernel-part of the kernel mapping +-static inline void __init _kaiser_init(void) ++/* ++ * Ensure 
that the top level of the (shadow) page tables are ++ * entirely populated. This ensures that all processes that get ++ * forked have the same entries. This way, we do not have to ++ * ever go set up new entries in older processes. ++ * ++ * Note: we never free these, so there are no updates to them ++ * after this. ++ */ ++static void __init kaiser_init_all_pgds(void) + { + pgd_t *pgd; + int i = 0; + + pgd = native_get_shadow_pgd(pgd_offset_k((unsigned long )0)); + for (i = PTRS_PER_PGD / 2; i < PTRS_PER_PGD; i++) { +- set_pgd(pgd + i, __pgd(_PAGE_TABLE |__pa(pud_alloc_one(0, 0)))); ++ pgd_t new_pgd; ++ pud_t *pud = pud_alloc_one(&init_mm, PAGE_OFFSET + i * PGDIR_SIZE); ++ if (!pud) { ++ WARN_ON(1); ++ break; ++ } ++ new_pgd = __pgd(_KERNPG_TABLE |__pa(pud)); ++ /* ++ * Make sure not to stomp on some other pgd entry. ++ */ ++ if (!pgd_none(pgd[i])) { ++ WARN_ON(1); ++ continue; ++ } ++ set_pgd(pgd + i, new_pgd); + } + } + ++#define kaiser_add_user_map_early(start, size, flags) do { \ ++ int __ret = kaiser_add_user_map(start, size, flags); \ ++ WARN_ON(__ret); \ ++} while (0) ++ ++#define kaiser_add_user_map_ptrs_early(start, end, flags) do { \ ++ int __ret = kaiser_add_user_map_ptrs(start, end, flags); \ ++ WARN_ON(__ret); \ ++} while (0) ++ + extern char __per_cpu_user_mapped_start[], __per_cpu_user_mapped_end[]; +-spinlock_t shadow_table_lock; ++/* ++ * If anything in here fails, we will likely die on one of the ++ * first kernel->user transitions and init will die. But, we ++ * will have most of the kernel up by then and should be able to ++ * get a clean warning out of it. If we BUG_ON() here, we run ++ * the risk of being before we have good console output. ++ */ + void __init kaiser_init(void) + { + int cpu; +- spin_lock_init(&shadow_table_lock); +- +- spin_lock(&shadow_table_lock); + +- _kaiser_init(); ++ kaiser_init_all_pgds(); + + for_each_possible_cpu(cpu) { +- // map the per cpu user variables +- _kaiser_copy( +- (unsigned long) (__per_cpu_user_mapped_start + per_cpu_offset(cpu)), +- (unsigned long) __per_cpu_user_mapped_end - (unsigned long) __per_cpu_user_mapped_start, +- __PAGE_KERNEL); ++ void *percpu_vaddr = __per_cpu_user_mapped_start + ++ per_cpu_offset(cpu); ++ unsigned long percpu_sz = __per_cpu_user_mapped_end - ++ __per_cpu_user_mapped_start; ++ kaiser_add_user_map_early(percpu_vaddr, percpu_sz, ++ __PAGE_KERNEL); + } + +- // map the entry/exit text section, which is responsible to switch between user- and kernel mode +- _kaiser_copy( +- (unsigned long) __entry_text_start, +- (unsigned long) __entry_text_end - (unsigned long) __entry_text_start, +- __PAGE_KERNEL_RX); ++ /* ++ * Map the entry/exit text section, which is needed at ++ * switches from user to and from kernel. 
++ */ ++ kaiser_add_user_map_ptrs_early(__entry_text_start, __entry_text_end, ++ __PAGE_KERNEL_RX); + +- // the fixed map address of the idt_table +- _kaiser_copy( +- (unsigned long) idt_descr.address, +- sizeof(gate_desc) * NR_VECTORS, +- __PAGE_KERNEL_RO); +- +- spin_unlock(&shadow_table_lock); ++#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN) ++ kaiser_add_user_map_ptrs_early(__irqentry_text_start, ++ __irqentry_text_end, ++ __PAGE_KERNEL_RX); ++#endif ++ kaiser_add_user_map_early((void *)idt_descr.address, ++ sizeof(gate_desc) * NR_VECTORS, ++ __PAGE_KERNEL_RO); ++#ifdef CONFIG_TRACING ++ kaiser_add_user_map_early(&trace_idt_descr, ++ sizeof(trace_idt_descr), ++ __PAGE_KERNEL); ++ kaiser_add_user_map_early(&trace_idt_table, ++ sizeof(gate_desc) * NR_VECTORS, ++ __PAGE_KERNEL); ++#endif ++ kaiser_add_user_map_early(&debug_idt_descr, sizeof(debug_idt_descr), ++ __PAGE_KERNEL); ++ kaiser_add_user_map_early(&debug_idt_table, ++ sizeof(gate_desc) * NR_VECTORS, ++ __PAGE_KERNEL); + } + ++extern void unmap_pud_range_nofree(pgd_t *pgd, unsigned long start, unsigned long end); + // add a mapping to the shadow-mapping, and synchronize the mappings +-void kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags) ++int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags) + { +- spin_lock(&shadow_table_lock); +- _kaiser_copy(addr, size, flags); +- spin_unlock(&shadow_table_lock); ++ return kaiser_add_user_map((const void *)addr, size, flags); + } + +-extern void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end); + void kaiser_remove_mapping(unsigned long start, unsigned long size) + { +- pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(start)); +- spin_lock(&shadow_table_lock); +- do { +- unmap_pud_range(pgd, start, start + size); +- } while (pgd++ != native_get_shadow_pgd(pgd_offset_k(start + size))); +- spin_unlock(&shadow_table_lock); ++ unsigned long end = start + size; ++ unsigned long addr; ++ ++ for (addr = start; addr < end; addr += PGDIR_SIZE) { ++ pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(addr)); ++ /* ++ * unmap_p4d_range() handles > P4D_SIZE unmaps, ++ * so no need to trim 'end'. 
++ */ ++ unmap_pud_range_nofree(pgd, addr, end); ++ } + } + #endif /* CONFIG_KAISER */ +diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c +index c17412f..73dcb0e1 100644 +--- a/arch/x86/mm/pageattr.c ++++ b/arch/x86/mm/pageattr.c +@@ -52,6 +52,7 @@ static DEFINE_SPINLOCK(cpa_lock); + #define CPA_FLUSHTLB 1 + #define CPA_ARRAY 2 + #define CPA_PAGES_ARRAY 4 ++#define CPA_FREE_PAGETABLES 8 + + #ifdef CONFIG_PROC_FS + static unsigned long direct_pages_count[PG_LEVEL_NUM]; +@@ -729,10 +730,13 @@ static int split_large_page(struct cpa_data *cpa, pte_t *kpte, + return 0; + } + +-static bool try_to_free_pte_page(pte_t *pte) ++static bool try_to_free_pte_page(struct cpa_data *cpa, pte_t *pte) + { + int i; + ++ if (!(cpa->flags & CPA_FREE_PAGETABLES)) ++ return false; ++ + for (i = 0; i < PTRS_PER_PTE; i++) + if (!pte_none(pte[i])) + return false; +@@ -741,10 +745,13 @@ static bool try_to_free_pte_page(pte_t *pte) + return true; + } + +-static bool try_to_free_pmd_page(pmd_t *pmd) ++static bool try_to_free_pmd_page(struct cpa_data *cpa, pmd_t *pmd) + { + int i; + ++ if (!(cpa->flags & CPA_FREE_PAGETABLES)) ++ return false; ++ + for (i = 0; i < PTRS_PER_PMD; i++) + if (!pmd_none(pmd[i])) + return false; +@@ -753,7 +760,9 @@ static bool try_to_free_pmd_page(pmd_t *pmd) + return true; + } + +-static bool unmap_pte_range(pmd_t *pmd, unsigned long start, unsigned long end) ++static bool unmap_pte_range(struct cpa_data *cpa, pmd_t *pmd, ++ unsigned long start, ++ unsigned long end) + { + pte_t *pte = pte_offset_kernel(pmd, start); + +@@ -764,22 +773,23 @@ static bool unmap_pte_range(pmd_t *pmd, unsigned long start, unsigned long end) + pte++; + } + +- if (try_to_free_pte_page((pte_t *)pmd_page_vaddr(*pmd))) { ++ if (try_to_free_pte_page(cpa, (pte_t *)pmd_page_vaddr(*pmd))) { + pmd_clear(pmd); + return true; + } + return false; + } + +-static void __unmap_pmd_range(pud_t *pud, pmd_t *pmd, ++static void __unmap_pmd_range(struct cpa_data *cpa, pud_t *pud, pmd_t *pmd, + unsigned long start, unsigned long end) + { +- if (unmap_pte_range(pmd, start, end)) +- if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud))) ++ if (unmap_pte_range(cpa, pmd, start, end)) ++ if (try_to_free_pmd_page(cpa, (pmd_t *)pud_page_vaddr(*pud))) + pud_clear(pud); + } + +-static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end) ++static void unmap_pmd_range(struct cpa_data *cpa, pud_t *pud, ++ unsigned long start, unsigned long end) + { + pmd_t *pmd = pmd_offset(pud, start); + +@@ -790,7 +800,7 @@ static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end) + unsigned long next_page = (start + PMD_SIZE) & PMD_MASK; + unsigned long pre_end = min_t(unsigned long, end, next_page); + +- __unmap_pmd_range(pud, pmd, start, pre_end); ++ __unmap_pmd_range(cpa, pud, pmd, start, pre_end); + + start = pre_end; + pmd++; +@@ -803,7 +813,8 @@ static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end) + if (pmd_large(*pmd)) + pmd_clear(pmd); + else +- __unmap_pmd_range(pud, pmd, start, start + PMD_SIZE); ++ __unmap_pmd_range(cpa, pud, pmd, ++ start, start + PMD_SIZE); + + start += PMD_SIZE; + pmd++; +@@ -813,17 +824,19 @@ static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end) + * 4K leftovers? + */ + if (start < end) +- return __unmap_pmd_range(pud, pmd, start, end); ++ return __unmap_pmd_range(cpa, pud, pmd, start, end); + + /* + * Try again to free the PMD page if haven't succeeded above. 
+ */ + if (!pud_none(*pud)) +- if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud))) ++ if (try_to_free_pmd_page(cpa, (pmd_t *)pud_page_vaddr(*pud))) + pud_clear(pud); + } + +-void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end) ++static void __unmap_pud_range(struct cpa_data *cpa, pgd_t *pgd, ++ unsigned long start, ++ unsigned long end) + { + pud_t *pud = pud_offset(pgd, start); + +@@ -834,7 +847,7 @@ void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end) + unsigned long next_page = (start + PUD_SIZE) & PUD_MASK; + unsigned long pre_end = min_t(unsigned long, end, next_page); + +- unmap_pmd_range(pud, start, pre_end); ++ unmap_pmd_range(cpa, pud, start, pre_end); + + start = pre_end; + pud++; +@@ -848,7 +861,7 @@ void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end) + if (pud_large(*pud)) + pud_clear(pud); + else +- unmap_pmd_range(pud, start, start + PUD_SIZE); ++ unmap_pmd_range(cpa, pud, start, start + PUD_SIZE); + + start += PUD_SIZE; + pud++; +@@ -858,7 +871,7 @@ void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end) + * 2M leftovers? + */ + if (start < end) +- unmap_pmd_range(pud, start, end); ++ unmap_pmd_range(cpa, pud, start, end); + + /* + * No need to try to free the PUD page because we'll free it in +@@ -866,6 +879,24 @@ void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end) + */ + } + ++static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end) ++{ ++ struct cpa_data cpa = { ++ .flags = CPA_FREE_PAGETABLES, ++ }; ++ ++ __unmap_pud_range(&cpa, pgd, start, end); ++} ++ ++void unmap_pud_range_nofree(pgd_t *pgd, unsigned long start, unsigned long end) ++{ ++ struct cpa_data cpa = { ++ .flags = 0, ++ }; ++ ++ __unmap_pud_range(&cpa, pgd, start, end); ++} ++ + static int alloc_pte_page(pmd_t *pmd) + { + pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK); +diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c +index 27d218b..352fd01 100644 +--- a/arch/x86/mm/pgtable.c ++++ b/arch/x86/mm/pgtable.c +@@ -344,40 +344,26 @@ static inline void _pgd_free(pgd_t *pgd) + kmem_cache_free(pgd_cache, pgd); + } + #else +-static inline pgd_t *_pgd_alloc(void) +-{ ++ + #ifdef CONFIG_KAISER +- // Instead of one PML4, we aquire two PML4s and, thus, an 8kb-aligned memory +- // block. Therefore, we have to allocate at least 3 pages. However, the +- // __get_free_pages returns us 4 pages. Hence, we store the base pointer at +- // the beginning of the page of our 8kb-aligned memory block in order to +- // correctly free it afterwars. +- +- unsigned long pages = __get_free_pages(PGALLOC_GFP, get_order(4*PAGE_SIZE)); +- +- if(native_get_normal_pgd((pgd_t*) pages) == (pgd_t*) pages) +- { +- *((unsigned long*)(pages + 2 * PAGE_SIZE)) = pages; +- return (pgd_t *) pages; +- } +- else +- { +- *((unsigned long*)(pages + 3 * PAGE_SIZE)) = pages; +- return (pgd_t *) (pages + PAGE_SIZE); +- } ++/* ++ * Instead of one pmd, we aquire two pmds. Being order-1, it is ++ * both 8k in size and 8k-aligned. That lets us just flip bit 12 ++ * in a pointer to swap between the two 4k halves. 
++ */ ++#define PGD_ALLOCATION_ORDER 1 + #else +- return (pgd_t *)__get_free_page(PGALLOC_GFP); ++#define PGD_ALLOCATION_ORDER 0 + #endif ++ ++static inline pgd_t *_pgd_alloc(void) ++{ ++ return (pgd_t *)__get_free_pages(PGALLOC_GFP, PGD_ALLOCATION_ORDER); + } + + static inline void _pgd_free(pgd_t *pgd) + { +-#ifdef CONFIG_KAISER +- unsigned long pages = *((unsigned long*) ((char*) pgd + 2 * PAGE_SIZE)); +- free_pages(pages, get_order(4*PAGE_SIZE)); +-#else +- free_page((unsigned long)pgd); +-#endif ++ free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER); + } + #endif /* CONFIG_X86_PAE */ + +diff --git a/include/linux/kaiser.h b/include/linux/kaiser.h +new file mode 100644 +index 0000000..9db5433 +--- /dev/null ++++ b/include/linux/kaiser.h +@@ -0,0 +1,26 @@ ++#ifndef _INCLUDE_KAISER_H ++#define _INCLUDE_KAISER_H ++ ++#ifdef CONFIG_KAISER ++#include <asm/kaiser.h> ++#else ++ ++/* ++ * These stubs are used whenever CONFIG_KAISER is off, which ++ * includes architectures that support KAISER, but have it ++ * disabled. ++ */ ++ ++static inline void kaiser_init(void) ++{ ++} ++static inline void kaiser_remove_mapping(unsigned long start, unsigned long size) ++{ ++} ++static inline int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags) ++{ ++ return 0; ++} ++ ++#endif /* !CONFIG_KAISER */ ++#endif /* _INCLUDE_KAISER_H */ +diff --git a/kernel/fork.c b/kernel/fork.c +index d34394e..8013f22 100644 +--- a/kernel/fork.c ++++ b/kernel/fork.c +@@ -58,6 +58,7 @@ + #include <linux/tsacct_kern.h> + #include <linux/cn_proc.h> + #include <linux/freezer.h> ++#include <linux/kaiser.h> + #include <linux/delayacct.h> + #include <linux/taskstats_kern.h> + #include <linux/random.h> +@@ -472,7 +473,6 @@ void set_task_stack_end_magic(struct task_struct *tsk) + *stackend = STACK_END_MAGIC; /* for overflow detection */ + } + +-extern void kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags); + static struct task_struct *dup_task_struct(struct task_struct *orig, int node) + { + struct task_struct *tsk; +@@ -500,9 +500,10 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) + * functions again. + */ + tsk->stack = stack; +-#ifdef CONFIG_KAISER +- kaiser_add_mapping((unsigned long)tsk->stack, THREAD_SIZE, __PAGE_KERNEL); +-#endif ++ ++ err= kaiser_add_mapping((unsigned long)tsk->stack, THREAD_SIZE, __PAGE_KERNEL); ++ if (err) ++ goto free_stack; + #ifdef CONFIG_VMAP_STACK + tsk->stack_vm_area = stack_vm_area; + #endif +diff --git a/security/Kconfig b/security/Kconfig +index f515ac3..334d2e8 100644 +--- a/security/Kconfig ++++ b/security/Kconfig +@@ -32,12 +32,17 @@ config SECURITY + If you are unsure how to answer this question, answer N. + config KAISER + bool "Remove the kernel mapping in user mode" ++ default y + depends on X86_64 + depends on !PARAVIRT + help + This enforces a strict kernel and user space isolation in order to close + hardware side channels on kernel address information. + ++config KAISER_REAL_SWITCH ++ bool "KAISER: actually switch page tables" ++ default y ++ + config SECURITYFS + bool "Enable the securityfs filesystem" + help +-- +2.7.4 + |