Diffstat (limited to 'common/recipes-kernel/linux/linux-yocto-4.9.21/0100-bpf-avoid-false-sharing-of-map-refcount-with-max_ent.patch')
-rw-r--r--  common/recipes-kernel/linux/linux-yocto-4.9.21/0100-bpf-avoid-false-sharing-of-map-refcount-with-max_ent.patch | 128
1 file changed, 128 insertions, 0 deletions
diff --git a/common/recipes-kernel/linux/linux-yocto-4.9.21/0100-bpf-avoid-false-sharing-of-map-refcount-with-max_ent.patch b/common/recipes-kernel/linux/linux-yocto-4.9.21/0100-bpf-avoid-false-sharing-of-map-refcount-with-max_ent.patch
new file mode 100644
index 00000000..129c2475
--- /dev/null
+++ b/common/recipes-kernel/linux/linux-yocto-4.9.21/0100-bpf-avoid-false-sharing-of-map-refcount-with-max_ent.patch
@@ -0,0 +1,128 @@
+From 486eb3eb62604a5c691156b6a657b03fe67cb167 Mon Sep 17 00:00:00 2001
+From: Daniel Borkmann <daniel@iogearbox.net>
+Date: Mon, 29 Jan 2018 02:48:58 +0100
+Subject: [PATCH 100/103] bpf: avoid false sharing of map refcount with
+ max_entries
+
+[ upstream commit be95a845cc4402272994ce290e3ad928aff06cb9 ]
+
+In addition to commit b2157399cc98 ("bpf: prevent out-of-bounds
+speculation") also change the layout of struct bpf_map such that
+false sharing of fast-path members like max_entries is avoided
+when the map's reference counter is altered. Therefore, enforce
+that they are placed in separate cachelines.
+
+pahole dump after change:
+
+ struct bpf_map {
+ const struct bpf_map_ops * ops; /* 0 8 */
+ struct bpf_map * inner_map_meta; /* 8 8 */
+ void * security; /* 16 8 */
+ enum bpf_map_type map_type; /* 24 4 */
+ u32 key_size; /* 28 4 */
+ u32 value_size; /* 32 4 */
+ u32 max_entries; /* 36 4 */
+ u32 map_flags; /* 40 4 */
+ u32 pages; /* 44 4 */
+ u32 id; /* 48 4 */
+ int numa_node; /* 52 4 */
+ bool unpriv_array; /* 56 1 */
+
+ /* XXX 7 bytes hole, try to pack */
+
+ /* --- cacheline 1 boundary (64 bytes) --- */
+ struct user_struct * user; /* 64 8 */
+ atomic_t refcnt; /* 72 4 */
+ atomic_t usercnt; /* 76 4 */
+ struct work_struct work; /* 80 32 */
+ char name[16]; /* 112 16 */
+ /* --- cacheline 2 boundary (128 bytes) --- */
+
+ /* size: 128, cachelines: 2, members: 17 */
+ /* sum members: 121, holes: 1, sum holes: 7 */
+ };
+
+Now all entries in the first cacheline are read-only throughout
+the lifetime of the map, set up once during map creation. The
+overall struct size and number of cachelines don't change with the
+reordering. struct bpf_map is usually the first member embedded
+in the map structs of specific map implementations, so also avoid
+letting those members sit at the end where they could share a
+cacheline with the first map values, e.g. in the array, since
+remote CPUs could trigger map updates for those just as well
+(intentionally dirtying members like max_entries) while having
+subsequent values in cache.
+
+Quoting from Google's Project Zero blog [1]:
+
+ Additionally, at least on the Intel machine on which this was
+ tested, bouncing modified cache lines between cores is slow,
+ apparently because the MESI protocol is used for cache coherence
+ [8]. Changing the reference counter of an eBPF array on one
+ physical CPU core causes the cache line containing the reference
+ counter to be bounced over to that CPU core, making reads of the
+ reference counter on all other CPU cores slow until the changed
+ reference counter has been written back to memory. Because the
+ length and the reference counter of an eBPF array are stored in
+ the same cache line, this also means that changing the reference
+ counter on one physical CPU core causes reads of the eBPF array's
+ length to be slow on other physical CPU cores (intentional false
+ sharing).
+
+While this doesn't 'control' the out-of-bounds speculation through
+masking the index as in commit b2157399cc98, triggering a manipulation
+of the map's reference counter is trivial, so let's not allow it to
+easily affect max_entries.
+
+Splitting into separate cachelines also generally makes sense from
+a performance perspective in that the fast path won't take a cache
+miss when the map gets pinned, reused in other progs, etc. from the
+control path, and thus also avoids unintentional false sharing.
+
+ [1] https://googleprojectzero.blogspot.ch/2018/01/reading-privileged-memory-with-side.html
+
+Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
+Signed-off-by: Alexei Starovoitov <ast@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/linux/bpf.h | 16 ++++++++++++----
+ 1 file changed, 12 insertions(+), 4 deletions(-)
+
+diff --git a/include/linux/bpf.h b/include/linux/bpf.h
+index 0dbb21b..80064c8 100644
+--- a/include/linux/bpf.h
++++ b/include/linux/bpf.h
+@@ -36,7 +36,10 @@ struct bpf_map_ops {
+ };
+
+ struct bpf_map {
+- atomic_t refcnt;
++ /* 1st cacheline with read-mostly members of which some
++ * are also accessed in fast-path (e.g. ops, max_entries).
++ */
++ const struct bpf_map_ops *ops ____cacheline_aligned;
+ enum bpf_map_type map_type;
+ u32 key_size;
+ u32 value_size;
+@@ -44,10 +47,15 @@ struct bpf_map {
+ u32 map_flags;
+ u32 pages;
+ bool unpriv_array;
+- struct user_struct *user;
+- const struct bpf_map_ops *ops;
+- struct work_struct work;
++ /* 7 bytes hole */
++
++ /* 2nd cacheline with misc members to avoid false sharing
++ * particularly with refcounting.
++ */
++ struct user_struct *user ____cacheline_aligned;
++ atomic_t refcnt;
+ atomic_t usercnt;
++ struct work_struct work;
+ };
+
+ struct bpf_map_type_list {
+--
+2.7.4
+
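
To make the layout technique above concrete outside the kernel tree, the following is a minimal, hypothetical userspace C sketch (not part of the patch). It assumes a 64-byte cacheline and uses a plain aligned attribute in place of the kernel's ____cacheline_aligned macro, separating read-mostly fields from a frequently written reference counter so that refcount updates do not dirty the line holding max_entries.

/*
 * Minimal sketch (hypothetical, not from the kernel): keep read-mostly
 * fields and the refcount on separate 64-byte cachelines, analogous to
 * what ____cacheline_aligned does for struct bpf_map in the patch above.
 */
#include <stdatomic.h>
#include <stddef.h>
#include <stdio.h>

#define CACHELINE_SIZE 64
#define cacheline_aligned __attribute__((aligned(CACHELINE_SIZE)))

struct example_map {
	/* 1st cacheline: read-mostly fields used on the fast path. */
	unsigned int key_size;
	unsigned int value_size;
	unsigned int max_entries;

	/* 2nd cacheline: members written from the control path, so a
	 * refcount change never dirties the line holding max_entries.
	 */
	atomic_int refcnt cacheline_aligned;
	atomic_int usercnt;
};

int main(void)
{
	/* refcnt lands on a 64-byte boundary, mirroring the pahole dump
	 * in the commit message where user/refcnt start the 2nd cacheline.
	 */
	printf("max_entries at %zu, refcnt at %zu, sizeof %zu\n",
	       offsetof(struct example_map, max_entries),
	       offsetof(struct example_map, refcnt),
	       sizeof(struct example_map));
	return 0;
}

A layout dump like the one quoted in the commit message can be reproduced with pahole (e.g. pahole -C bpf_map vmlinux on a kernel built with debug info), which is the usual way to verify where the cacheline boundaries fall after such a reordering.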