diff --git a/common/recipes-kernel/linux/linux-yocto-4.9.21/0020-kaiser-enhanced-by-kernel-and-user-PCIDs.patch b/common/recipes-kernel/linux/linux-yocto-4.9.21/0020-kaiser-enhanced-by-kernel-and-user-PCIDs.patch
new file mode 100644
index 00000000..85bdc307
--- /dev/null
+++ b/common/recipes-kernel/linux/linux-yocto-4.9.21/0020-kaiser-enhanced-by-kernel-and-user-PCIDs.patch
@@ -0,0 +1,424 @@
+From 9bc1089baa5051f750a246af746e81bf1bb1fe09 Mon Sep 17 00:00:00 2001
+From: Hugh Dickins <hughd@google.com>
+Date: Wed, 30 Aug 2017 16:23:00 -0700
+Subject: [PATCH 020/102] kaiser: enhanced by kernel and user PCIDs
+
+Merged performance improvements to Kaiser, using distinct kernel
+and user Process Context Identifiers (PCIDs) to minimize TLB flushing.
+
+[This work is actually all from Dave Hansen, 2017-08-30: still
+omitting the trackswitch mods, and with KAISER_REAL_SWITCH deleted.]
+
+Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
+Signed-off-by: Hugh Dickins <hughd@google.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/entry/entry_64.S | 10 ++++--
+ arch/x86/entry/entry_64_compat.S | 1 +
+ arch/x86/include/asm/cpufeatures.h | 1 +
+ arch/x86/include/asm/kaiser.h | 15 +++++++--
+ arch/x86/include/asm/pgtable_types.h | 26 +++++++++++++++
+ arch/x86/include/asm/tlbflush.h | 52 ++++++++++++++++++++++++-----
+ arch/x86/include/uapi/asm/processor-flags.h | 3 +-
+ arch/x86/kernel/cpu/common.c | 34 +++++++++++++++++++
+ arch/x86/kvm/x86.c | 3 +-
+ arch/x86/mm/kaiser.c | 7 ++++
+ arch/x86/mm/tlb.c | 46 +++++++++++++++++++++++--
+ 11 files changed, 181 insertions(+), 17 deletions(-)
+
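Before the per-file changes, it may help to see how a CR3 value decomposes under this scheme. A minimal standalone sketch, not part of the patch: the constants mirror the definitions the series adds below, and the pgd address is hypothetical.

#include <stdio.h>

#define PCID_ASID_MASK		((1UL << 12) - 1)	/* X86_CR3_PCID_ASID_MASK */
#define PCID_NOFLUSH		(1UL << 63)		/* X86_CR3_PCID_NOFLUSH */
#define SHADOW_PGD_OFFSET	(1UL << 12)		/* KAISER_SHADOW_PGD_OFFSET */

int main(void)
{
	/* A user-mode CR3: shadow pgd, user ASID 0x6, "no flush" set. */
	unsigned long cr3 = 0x1d4a6000UL | SHADOW_PGD_OFFSET | PCID_NOFLUSH | 0x6;

	printf("pgd     = %#lx\n", cr3 & ~(PCID_NOFLUSH | PCID_ASID_MASK));
	printf("asid    = %#lx\n", cr3 & PCID_ASID_MASK);	/* 0x6 = user */
	printf("noflush = %lu\n", cr3 >> 63);			/* 1 */
	return 0;
}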
+diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
+index df33f10..4a0ebf4 100644
+--- a/arch/x86/entry/entry_64.S
++++ b/arch/x86/entry/entry_64.S
+@@ -1315,7 +1315,10 @@ ENTRY(nmi)
+ /* %rax is saved above, so OK to clobber here */
+ movq %cr3, %rax
+ pushq %rax
+- andq $(~KAISER_SHADOW_PGD_OFFSET), %rax
++ /* mask off "user" bit of pgd address and 12 PCID bits: */
++ andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax
++ /* Add back kernel PCID and "no flush" bit */
++ orq X86_CR3_PCID_KERN_VAR, %rax
+ movq %rax, %cr3
+ #endif
+ call do_nmi
+@@ -1556,7 +1559,10 @@ end_repeat_nmi:
+ /* %rax is saved above, so OK to clobber here */
+ movq %cr3, %rax
+ pushq %rax
+- andq $(~KAISER_SHADOW_PGD_OFFSET), %rax
++ /* mask off "user" bit of pgd address and 12 PCID bits: */
++ andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax
++ /* Add back kernel PCID and "no flush" bit */
++ orq X86_CR3_PCID_KERN_VAR, %rax
+ movq %rax, %cr3
+ #endif
+
+diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
+index f0e384e..0eb5801 100644
+--- a/arch/x86/entry/entry_64_compat.S
++++ b/arch/x86/entry/entry_64_compat.S
+@@ -13,6 +13,7 @@
+ #include <asm/irqflags.h>
+ #include <asm/asm.h>
+ #include <asm/smap.h>
++#include <asm/pgtable_types.h>
+ #include <asm/kaiser.h>
+ #include <linux/linkage.h>
+ #include <linux/err.h>
+diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
+index ed10b5b..dc50883 100644
+--- a/arch/x86/include/asm/cpufeatures.h
++++ b/arch/x86/include/asm/cpufeatures.h
+@@ -189,6 +189,7 @@
+
+ #define X86_FEATURE_CPB ( 7*32+ 2) /* AMD Core Performance Boost */
+ #define X86_FEATURE_EPB ( 7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */
++#define X86_FEATURE_INVPCID_SINGLE ( 7*32+ 4) /* Effectively INVPCID && CR4.PCIDE=1 */
+
+ #define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */
+ #define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */
+diff --git a/arch/x86/include/asm/kaiser.h b/arch/x86/include/asm/kaiser.h
+index e0fc45e..360ff3b 100644
+--- a/arch/x86/include/asm/kaiser.h
++++ b/arch/x86/include/asm/kaiser.h
+@@ -1,5 +1,8 @@
+ #ifndef _ASM_X86_KAISER_H
+ #define _ASM_X86_KAISER_H
++
++#include <uapi/asm/processor-flags.h> /* For PCID constants */
++
+ /*
+ * This file includes the definitions for the KAISER feature.
+ * KAISER is a counter measure against x86_64 side channel attacks on
+@@ -21,13 +24,21 @@
+
+ .macro _SWITCH_TO_KERNEL_CR3 reg
+ movq %cr3, \reg
+-andq $(~KAISER_SHADOW_PGD_OFFSET), \reg
++andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), \reg
++orq X86_CR3_PCID_KERN_VAR, \reg
+ movq \reg, %cr3
+ .endm
+
+ .macro _SWITCH_TO_USER_CR3 reg
+ movq %cr3, \reg
+-orq $(KAISER_SHADOW_PGD_OFFSET), \reg
++andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), \reg
++/*
++ * This can obviously be one instruction by putting the
++ * KAISER_SHADOW_PGD_OFFSET bit in the X86_CR3_PCID_USER_VAR.
++ * But, just leave it now for simplicity.
++ */
++orq X86_CR3_PCID_USER_VAR, \reg
++orq $(KAISER_SHADOW_PGD_OFFSET), \reg
+ movq \reg, %cr3
+ .endm
+
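In C terms, the two macros above now compute roughly the following. This is an illustrative sketch only; the helper function names do not exist in the tree, and the constants are as defined by this series in kaiser.h and pgtable_types.h.

#define KAISER_SHADOW_PGD_OFFSET	0x1000UL
#define X86_CR3_PCID_ASID_MASK		((1UL << 12) - 1)

extern unsigned long X86_CR3_PCID_KERN_VAR;	/* kernel PCID (+ NOFLUSH) */
extern unsigned long X86_CR3_PCID_USER_VAR;	/* user PCID (+ NOFLUSH) */

/* C rendering of _SWITCH_TO_KERNEL_CR3: clear the 12 ASID bits and the
 * shadow-pgd bit, then OR in the kernel PCID value from memory.
 */
static unsigned long switch_to_kernel_cr3(unsigned long cr3)
{
	cr3 &= ~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET);
	return cr3 | X86_CR3_PCID_KERN_VAR;
}

/* C rendering of _SWITCH_TO_USER_CR3: same mask, then the user PCID
 * value and the shadow-pgd bit.
 */
static unsigned long switch_to_user_cr3(unsigned long cr3)
{
	cr3 &= ~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET);
	return cr3 | X86_CR3_PCID_USER_VAR | KAISER_SHADOW_PGD_OFFSET;
}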
+diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
+index 8bc8d02..ada77fd 100644
+--- a/arch/x86/include/asm/pgtable_types.h
++++ b/arch/x86/include/asm/pgtable_types.h
+@@ -141,6 +141,32 @@
+ _PAGE_SOFT_DIRTY)
+ #define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE)
+
++/* The ASID is the lower 12 bits of CR3 */
++#define X86_CR3_PCID_ASID_MASK (_AC((1<<12)-1,UL))
++
++/* Mask for all the PCID-related bits in CR3: */
++#define X86_CR3_PCID_MASK (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_MASK)
++#if defined(CONFIG_KAISER) && defined(CONFIG_X86_64)
++#define X86_CR3_PCID_ASID_KERN (_AC(0x4,UL))
++#define X86_CR3_PCID_ASID_USER (_AC(0x6,UL))
++
++#define X86_CR3_PCID_KERN_FLUSH (X86_CR3_PCID_ASID_KERN)
++#define X86_CR3_PCID_USER_FLUSH (X86_CR3_PCID_ASID_USER)
++#define X86_CR3_PCID_KERN_NOFLUSH (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_KERN)
++#define X86_CR3_PCID_USER_NOFLUSH (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_USER)
++#else
++#define X86_CR3_PCID_ASID_KERN (_AC(0x0,UL))
++#define X86_CR3_PCID_ASID_USER (_AC(0x0,UL))
++/*
++ * PCIDs are unsupported on 32-bit and none of these bits can be
++ * set in CR3:
++ */
++#define X86_CR3_PCID_KERN_FLUSH (0)
++#define X86_CR3_PCID_USER_FLUSH (0)
++#define X86_CR3_PCID_KERN_NOFLUSH (0)
++#define X86_CR3_PCID_USER_NOFLUSH (0)
++#endif
++
+ /*
+ * The cache modes defined here are used to translate between pure SW usage
+ * and the HW defined cache mode bits and/or PAT entries.
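Plugging in these definitions, the composite CR3 PCID values work out as below — a standalone sketch; the printed constants follow directly from ASID 0x4/0x6 and NOFLUSH in bit 63.

#include <stdio.h>

int main(void)
{
	unsigned long noflush = 1UL << 63;		/* X86_CR3_PCID_NOFLUSH */
	unsigned long kern = 0x4UL, user = 0x6UL;	/* ASIDs with KAISER on */

	printf("KERN_FLUSH   = %#lx\n", kern);		/* 0x4 */
	printf("USER_FLUSH   = %#lx\n", user);		/* 0x6 */
	printf("KERN_NOFLUSH = %#lx\n", noflush | kern);/* 0x8000000000000004 */
	printf("USER_NOFLUSH = %#lx\n", noflush | user);/* 0x8000000000000006 */
	return 0;
}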
+diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
+index c13041e..28b4182 100644
+--- a/arch/x86/include/asm/tlbflush.h
++++ b/arch/x86/include/asm/tlbflush.h
+@@ -12,7 +12,6 @@ static inline void __invpcid(unsigned long pcid, unsigned long addr,
+ unsigned long type)
+ {
+ struct { u64 d[2]; } desc = { { pcid, addr } };
+-
+ /*
+ * The memory clobber is because the whole point is to invalidate
+ * stale TLB entries and, especially if we're flushing global
+@@ -135,14 +134,25 @@ static inline void cr4_set_bits_and_update_boot(unsigned long mask)
+
+ static inline void __native_flush_tlb(void)
+ {
++ if (!cpu_feature_enabled(X86_FEATURE_INVPCID)) {
++ /*
++ * If current->mm == NULL then we borrow a mm which may change during a
++ * task switch and therefore we must not be preempted while we write CR3
++ * back:
++ */
++ preempt_disable();
++ native_write_cr3(native_read_cr3());
++ preempt_enable();
++ return;
++ }
+ /*
+- * If current->mm == NULL then we borrow a mm which may change during a
+- * task switch and therefore we must not be preempted while we write CR3
+- * back:
++ * We are no longer using globals with KAISER, so a
++ * "nonglobals" flush would work too. But, this is more
++ * conservative.
++ *
++ * Note, this works with CR4.PCIDE=0 or 1.
+ */
+- preempt_disable();
+- native_write_cr3(native_read_cr3());
+- preempt_enable();
++ invpcid_flush_all();
+ }
+
+ static inline void __native_flush_tlb_global_irq_disabled(void)
+@@ -164,6 +174,8 @@ static inline void __native_flush_tlb_global(void)
+ /*
+ * Using INVPCID is considerably faster than a pair of writes
+ * to CR4 sandwiched inside an IRQ flag save/restore.
++ *
++ * Note, this works with CR4.PCIDE=0 or 1.
+ */
+ invpcid_flush_all();
+ return;
+@@ -183,7 +195,31 @@ static inline void __native_flush_tlb_global(void)
+
+ static inline void __native_flush_tlb_single(unsigned long addr)
+ {
+- asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
++ /*
++ * SIMICS #GP's if you run INVPCID with type 2/3
++ * and X86_CR4_PCIDE clear. Shame!
++ *
++ * The ASIDs used below are hard-coded. But, we must not
++ * call invpcid(type=1/2) before CR4.PCIDE=1. Just call
++ * invpcid in the case we are called early.
++ */
++ if (!this_cpu_has(X86_FEATURE_INVPCID_SINGLE)) {
++ asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
++ return;
++ }
++ /* Flush the address out of both PCIDs. */
++ /*
++ * An optimization here might be to determine addresses
++ * that are only kernel-mapped and only flush the kernel
++ * ASID. But, userspace flushes are probably much more
++ * important performance-wise.
++ *
++ * Make sure to do only a single invpcid when KAISER is
++ * disabled and we have only a single ASID.
++ */
++ if (X86_CR3_PCID_ASID_KERN != X86_CR3_PCID_ASID_USER)
++ invpcid_flush_one(X86_CR3_PCID_ASID_KERN, addr);
++ invpcid_flush_one(X86_CR3_PCID_ASID_USER, addr);
+ }
+
+ static inline void __flush_tlb_all(void)
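The dual flush above leans on the existing invpcid_flush_one() helper earlier in this header, which issues an INVPCID of type 0 — individual-address invalidation in the SDM's 0–3 numbering; the comment in common.c below counts the four types from 1 — through the __invpcid() wrapper shown at the top of the hunk, roughly:

#define INVPCID_TYPE_INDIV_ADDR	0

/* Flush one address in the context identified by pcid. */
static inline void invpcid_flush_one(unsigned long pcid, unsigned long addr)
{
	__invpcid(pcid, addr, INVPCID_TYPE_INDIV_ADDR);
}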
+diff --git a/arch/x86/include/uapi/asm/processor-flags.h b/arch/x86/include/uapi/asm/processor-flags.h
+index 567de50..6768d13 100644
+--- a/arch/x86/include/uapi/asm/processor-flags.h
++++ b/arch/x86/include/uapi/asm/processor-flags.h
+@@ -77,7 +77,8 @@
+ #define X86_CR3_PWT _BITUL(X86_CR3_PWT_BIT)
+ #define X86_CR3_PCD_BIT 4 /* Page Cache Disable */
+ #define X86_CR3_PCD _BITUL(X86_CR3_PCD_BIT)
+-#define X86_CR3_PCID_MASK _AC(0x00000fff,UL) /* PCID Mask */
++#define X86_CR3_PCID_NOFLUSH_BIT 63 /* Preserve old PCID */
++#define X86_CR3_PCID_NOFLUSH _BITULL(X86_CR3_PCID_NOFLUSH_BIT)
+
+ /*
+ * Intel CPU features in CR4
+diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
+index 3efde13..b4c0ae5 100644
+--- a/arch/x86/kernel/cpu/common.c
++++ b/arch/x86/kernel/cpu/common.c
+@@ -324,11 +324,45 @@ static __always_inline void setup_smap(struct cpuinfo_x86 *c)
+ }
+ }
+
++/*
++ * These can have bit 63 set, so we cannot just use a plain "or"
++ * instruction to get their value or'd into CR3. It would take
++ * another register. So, we use a memory reference to these
++ * instead.
++ *
++ * This is also handy because systems that do not support
++ * PCIDs just end up or'ing a 0 into their CR3, which does
++ * no harm.
++ */
++__aligned(PAGE_SIZE) unsigned long X86_CR3_PCID_KERN_VAR = 0;
++__aligned(PAGE_SIZE) unsigned long X86_CR3_PCID_USER_VAR = 0;
++
+ static void setup_pcid(struct cpuinfo_x86 *c)
+ {
+ if (cpu_has(c, X86_FEATURE_PCID)) {
+ if (cpu_has(c, X86_FEATURE_PGE)) {
+ cr4_set_bits(X86_CR4_PCIDE);
++ /*
++ * These variables are used by the entry/exit
++ * code to change PCIDs.
++ */
++#ifdef CONFIG_KAISER
++ X86_CR3_PCID_KERN_VAR = X86_CR3_PCID_KERN_NOFLUSH;
++ X86_CR3_PCID_USER_VAR = X86_CR3_PCID_USER_NOFLUSH;
++#endif
++ /*
++ * INVPCID has two "groups" of types:
++ * 1/2: Invalidate an individual address
++ * 3/4: Invalidate all contexts
++ *
++ * 1/2 take a PCID, but 3/4 do not. So, 3/4
++ * ignore the PCID argument in the descriptor.
++ * But, we have to be careful not to call 1/2
++ * with an actual non-zero PCID in them before
++ * we do the above cr4_set_bits().
++ */
++ if (cpu_has(c, X86_FEATURE_INVPCID))
++ set_cpu_cap(c, X86_FEATURE_INVPCID_SINGLE);
+ } else {
+ /*
+ * flush_tlb_all(), as currently implemented, won't
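The point about the "or" instruction is an encoding limit: a 64-bit OR with an immediate sign-extends a 32-bit value, so bit 63 (NOFLUSH) cannot appear in an immediate operand, while a memory operand carries the full 64 bits. A runnable sketch with GCC-style inline asm; the variable here is a plain global standing in for the kernel's page-aligned one.

#include <stdio.h>

unsigned long X86_CR3_PCID_KERN_VAR = (1UL << 63) | 0x4;

int main(void)
{
	unsigned long cr3 = 0x1d4a6000UL;	/* hypothetical pgd */

	/* Compiles to a single "orq X86_CR3_PCID_KERN_VAR(%rip), %reg":
	 * no scratch register, and bit 63 comes along for free.
	 */
	asm("orq %1, %0" : "+r" (cr3) : "m" (X86_CR3_PCID_KERN_VAR));
	printf("cr3 = %#lx\n", cr3);		/* 0x800000001d4a6004 */
	return 0;
}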
+diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
+index e5bc139..51a700a 100644
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -773,7 +773,8 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+ return 1;
+
+ /* PCID can not be enabled when cr3[11:0]!=000H or EFER.LMA=0 */
+- if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_MASK) || !is_long_mode(vcpu))
++ if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_ASID_MASK) ||
++ !is_long_mode(vcpu))
+ return 1;
+ }
+
+diff --git a/arch/x86/mm/kaiser.c b/arch/x86/mm/kaiser.c
+index bd22ef5..f5c75f7 100644
+--- a/arch/x86/mm/kaiser.c
++++ b/arch/x86/mm/kaiser.c
+@@ -239,6 +239,8 @@ static void __init kaiser_init_all_pgds(void)
+ } while (0)
+
+ extern char __per_cpu_user_mapped_start[], __per_cpu_user_mapped_end[];
++extern unsigned long X86_CR3_PCID_KERN_VAR;
++extern unsigned long X86_CR3_PCID_USER_VAR;
+ /*
+ * If anything in here fails, we will likely die on one of the
+ * first kernel->user transitions and init will die. But, we
+@@ -289,6 +291,11 @@ void __init kaiser_init(void)
+ kaiser_add_user_map_early(&debug_idt_table,
+ sizeof(gate_desc) * NR_VECTORS,
+ __PAGE_KERNEL);
++
++ kaiser_add_user_map_early(&X86_CR3_PCID_KERN_VAR, PAGE_SIZE,
++ __PAGE_KERNEL);
++ kaiser_add_user_map_early(&X86_CR3_PCID_USER_VAR, PAGE_SIZE,
++ __PAGE_KERNEL);
+ }
+
+ /* Add a mapping to the shadow mapping, and synchronize the mappings */
+diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
+index a7655f6..a376246 100644
+--- a/arch/x86/mm/tlb.c
++++ b/arch/x86/mm/tlb.c
+@@ -36,6 +36,46 @@ struct flush_tlb_info {
+ unsigned long flush_end;
+ };
+
++static void load_new_mm_cr3(pgd_t *pgdir)
++{
++ unsigned long new_mm_cr3 = __pa(pgdir);
++
++ /*
++ * KAISER plus PCIDs needs some extra work here. But, if
++ * either of the features is not present, we need no PCIDs
++ * here and can just do a normal, full TLB flush with the
++ * write_cr3().
++ */
++ if (!IS_ENABLED(CONFIG_KAISER) ||
++ !cpu_feature_enabled(X86_FEATURE_PCID))
++ goto out_set_cr3;
++ /*
++ * We reuse the same PCID for different tasks, so we must
++ * flush all the entries for the PCID out when we change
++ * tasks.
++ */
++ new_mm_cr3 = X86_CR3_PCID_KERN_FLUSH | __pa(pgdir);
++
++ /*
++ * The flush from load_cr3() may leave old TLB entries
++ * for userspace in place. We must flush that context
++ * separately. We can theoretically delay doing this
++ * until we actually load up the userspace CR3, but
++ * that's a bit tricky. We have to have the "need to
++ * flush userspace PCID" bit per-cpu and check it in the
++ * exit-to-userspace paths.
++ */
++ invpcid_flush_single_context(X86_CR3_PCID_ASID_USER);
++
++out_set_cr3:
++ /*
++ * Caution: many callers of this function expect
++ * that load_cr3() is serializing and orders TLB
++ * fills with respect to the mm_cpumask writes.
++ */
++ write_cr3(new_mm_cr3);
++}
++
+ /*
+ * We cannot call mmdrop() because we are in interrupt context,
+ * instead update mm->cpu_vm_mask.
+@@ -47,7 +87,7 @@ void leave_mm(int cpu)
+ BUG();
+ if (cpumask_test_cpu(cpu, mm_cpumask(active_mm))) {
+ cpumask_clear_cpu(cpu, mm_cpumask(active_mm));
+- load_cr3(swapper_pg_dir);
++ load_new_mm_cr3(swapper_pg_dir);
+ /*
+ * This gets called in the idle path where RCU
+ * functions differently. Tracing normally
+@@ -126,7 +166,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
+ * ordering guarantee we need.
+ *
+ */
+- load_cr3(next->pgd);
++ load_new_mm_cr3(next->pgd);
+
+ trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
+
+@@ -175,7 +215,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
+ * As above, load_cr3() is serializing and orders TLB
+ * fills with respect to the mm_cpumask write.
+ */
+- load_cr3(next->pgd);
++ load_new_mm_cr3(next->pgd);
+ trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
+ load_mm_cr4(next);
+ load_mm_ldt(next);
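Putting it together: on a PCID-capable KAISER kernel, a task switch now writes __pa(pgdir) | 0x4 — kernel ASID, NOFLUSH clear, so the kernel context is flushed — and separately invalidates the stale user context with a single-context INVPCID. A worked example, as a standalone sketch with a hypothetical pgd address:

#include <stdio.h>

int main(void)
{
	unsigned long pgd_pa = 0x1d4a6000UL;	/* stands in for __pa(pgdir) */
	unsigned long cr3 = 0x4UL | pgd_pa;	/* X86_CR3_PCID_KERN_FLUSH */

	/* ASID bits 0-11 are 0x4 and bit 63 is clear, so this write
	 * flushes the kernel ASID; the user ASID (0x6) is flushed via
	 * invpcid_flush_single_context(), as in load_new_mm_cr3() above.
	 */
	printf("cr3 = %#lx\n", cr3);		/* 0x1d4a6004 */
	return 0;
}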
+--
+2.7.4
+