Diffstat (limited to 'features/seccomp/seccomp-add-system-call-filtering-using-BPF.patch')
-rw-r--r--  features/seccomp/seccomp-add-system-call-filtering-using-BPF.patch  820
1 file changed, 820 insertions, 0 deletions
diff --git a/features/seccomp/seccomp-add-system-call-filtering-using-BPF.patch b/features/seccomp/seccomp-add-system-call-filtering-using-BPF.patch
new file mode 100644
index 00000000..908f3cfd
--- /dev/null
+++ b/features/seccomp/seccomp-add-system-call-filtering-using-BPF.patch
@@ -0,0 +1,820 @@
+From 01c9617a2eca38f68d917ae16bdf8c2c8d863c8e Mon Sep 17 00:00:00 2001
+From: Will Drewry <wad@chromium.org>
+Date: Thu, 12 Apr 2012 16:47:57 -0500
+Subject: [PATCH] seccomp: add system call filtering using BPF
+
+commit e2cfabdfd075648216f99c2c03821cf3f47c1727 upstream.
+
+[This patch depends on luto@mit.edu's no_new_privs patch:
+ https://lkml.org/lkml/2012/1/30/264
+ The whole series including Andrew's patches can be found here:
+ https://github.com/redpig/linux/tree/seccomp
+ Complete diff here:
+ https://github.com/redpig/linux/compare/1dc65fed...seccomp
+]
+
+This patch adds support for seccomp mode 2. Mode 2 introduces the
+ability for unprivileged processes to install a system call filtering
+policy expressed in terms of a Berkeley Packet Filter (BPF) program.
+This program is evaluated in the kernel for each system call the task
+makes and computes a result based on data in the format of struct
+seccomp_data.
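+
+As a purely illustrative sketch (not part of this patch), a filter is
+a BPF program that reads fields of struct seccomp_data by offset. The
+example below assumes an x86-64 caller, uses the BPF_STMT/BPF_JUMP
+helper macros from <linux/filter.h>, and allows only exit_group(2):
+
+ /* Also needs <stddef.h>, <linux/audit.h>, <linux/seccomp.h>, <sys/syscall.h>. */
+ struct sock_filter insns[] = {
+ 	/* Kill the task if it uses a foreign calling convention. */
+ 	BPF_STMT(BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, arch)),
+ 	BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, AUDIT_ARCH_X86_64, 1, 0),
+ 	BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_KILL),
+ 	/* Allow exit_group(2); kill the task on any other system call. */
+ 	BPF_STMT(BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, nr)),
+ 	BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_exit_group, 0, 1),
+ 	BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
+ 	BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_KILL),
+ };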
+
+A filter program may be installed by calling:
+ struct sock_fprog fprog = { ... };
+ ...
+ prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &fprog);
+
+The return value of the filter program determines whether the system
+call is allowed to proceed or is denied. If the first filter program
+installed allows prctl(2) calls, then the above call may be made
+repeatedly by a task to further reduce its access to the kernel. All
+attached programs must be evaluated before a system call will be
+allowed to proceed.
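+
+As an illustrative continuation of the sketch above (not part of this
+patch; needs <sys/prctl.h>), the instructions are wrapped in the
+sock_fprog and attached, and repeating the call with another program
+stacks an additional filter:
+
+ struct sock_fprog fprog = {
+ 	.len = (unsigned short)(sizeof(insns) / sizeof(insns[0])),
+ 	.filter = insns,
+ };
+
+ if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &fprog))
+ 	perror("prctl(PR_SET_SECCOMP)");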
+
+Filter programs will be inherited across fork/clone and execve.
+However, a task attaching a filter while unprivileged (!CAP_SYS_ADMIN)
+must already be running with the no_new_privs bit set. This ensures
+that unprivileged tasks cannot attach filters that affect privileged
+tasks (e.g., a setuid binary).
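+
+In the unprivileged case that means setting the bit before attaching,
+e.g. (sketch only; PR_SET_NO_NEW_PRIVS is provided by the no_new_privs
+patch referenced above):
+
+ prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+ prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &fprog);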
+
+There are a number of benefits to this approach, a few of which are
+as follows:
+- BPF has been exposed to userland for a long time
+- BPF optimization (and JIT'ing) is well understood
+- Userland already knows its ABI: system call numbers and desired
+  arguments
+- No time-of-check-to-time-of-use vulnerable data accesses are possible
+- System call arguments are loaded on access only to minimize the
+  copying required for system call policy decisions
+
+Mode 2 support is restricted to architectures that enable
+HAVE_ARCH_SECCOMP_FILTER. In this patch, the primary dependency is on
+syscall_get_arguments(). The full desired scope of this feature will
+add a few minor additional requirements expressed later in this series.
+Based on discussion, SECCOMP_RET_ERRNO and SECCOMP_RET_TRACE seem to be
+the desired additional functionality.
+
+No architectures are enabled in this patch.
+
+Signed-off-by: Will Drewry <wad@chromium.org>
+Acked-by: Serge Hallyn <serge.hallyn@canonical.com>
+Reviewed-by: Indan Zupancic <indan@nul.nu>
+Acked-by: Eric Paris <eparis@redhat.com>
+Reviewed-by: Kees Cook <keescook@chromium.org>
+
+v18: - rebase to v3.4-rc2
+ - s/chk/check/ (akpm@linux-foundation.org,jmorris@namei.org)
+ - allocate with GFP_KERNEL|__GFP_NOWARN (indan@nul.nu)
+ - add a comment for get_u32 regarding endianness (akpm@)
+ - fix other typos, style mistakes (akpm@)
+ - added acked-by
+v17: - properly guard seccomp filter needed headers (leann@ubuntu.com)
+ - tighten return mask to 0x7fff0000
+v16: - no change
+v15: - add a 4 instr penalty when counting a path to account for seccomp_filter
+ size (indan@nul.nu)
+ - drop the max insns to 256KB (indan@nul.nu)
+ - return ENOMEM if the max insns limit has been hit (indan@nul.nu)
+ - move IP checks after args (indan@nul.nu)
+ - drop !user_filter check (indan@nul.nu)
+ - only allow explicit bpf codes (indan@nul.nu)
+ - exit_code -> exit_sig
+v14: - put/get_seccomp_filter takes struct task_struct
+ (indan@nul.nu,keescook@chromium.org)
+ - adds seccomp_chk_filter and drops general bpf_run/chk_filter user
+ - add seccomp_bpf_load for use by net/core/filter.c
+ - lower max per-process/per-hierarchy: 1MB
+ - moved nnp/capability check prior to allocation
+ (all of the above: indan@nul.nu)
+v13: - rebase on to 88ebdda6159ffc15699f204c33feb3e431bf9bdc
+v12: - added a maximum instruction count per path (indan@nul.nu,oleg@redhat.com)
+ - removed copy_seccomp (keescook@chromium.org,indan@nul.nu)
+ - reworded the prctl_set_seccomp comment (indan@nul.nu)
+v11: - reorder struct seccomp_data to allow future args expansion (hpa@zytor.com)
+ - style clean up, @compat dropped, compat_sock_fprog32 (indan@nul.nu)
+ - do_exit(SIGSYS) (keescook@chromium.org, luto@mit.edu)
+ - pare down Kconfig doc reference.
+ - extra comment clean up
+v10: - seccomp_data has changed again to be more aesthetically pleasing
+ (hpa@zytor.com)
+ - calling convention is noted in a new u32 field using syscall_get_arch.
+ This allows for cross-calling convention tasks to use seccomp filters.
+ (hpa@zytor.com)
+ - lots of clean up (thanks, Indan!)
+ v9: - n/a
+ v8: - use bpf_chk_filter, bpf_run_filter. update load_fns
+ - Lots of fixes courtesy of indan@nul.nu:
+ -- fix up load behavior, compat fixups, and merge alloc code,
+ -- renamed pc and dropped __packed, use bool compat.
+ -- Added a hidden CONFIG_SECCOMP_FILTER to synthesize non-arch
+ dependencies
+ v7: (massive overhaul thanks to Indan, others)
+ - added CONFIG_HAVE_ARCH_SECCOMP_FILTER
+ - merged into seccomp.c
+ - minimal seccomp_filter.h
+ - no config option (part of seccomp)
+ - no new prctl
+ - doesn't break seccomp on systems without asm/syscall.h
+ (works but arg access always fails)
+ - dropped seccomp_init_task, extra free functions, ...
+ - dropped the no-asm/syscall.h code paths
+ - merges with network sk_run_filter and sk_chk_filter
+ v6: - fix memory leak on attach compat check failure
+ - require no_new_privs || CAP_SYS_ADMIN prior to filter
+ installation. (luto@mit.edu)
+ - s/seccomp_struct_/seccomp_/ for macros/functions (amwang@redhat.com)
+ - cleaned up Kconfig (amwang@redhat.com)
+ - on block, note if the call was compat (so the # means something)
+ v5: - uses syscall_get_arguments
+ (indan@nul.nu,oleg@redhat.com, mcgrathr@chromium.org)
+ - uses union-based arg storage with hi/lo struct to
+ handle endianness. Compromises between the two alternate
+ proposals to minimize extra arg shuffling and account for
+ endianness assuming userspace uses offsetof().
+ (mcgrathr@chromium.org, indan@nul.nu)
+ - update Kconfig description
+ - add include/seccomp_filter.h and add its installation
+ - (naive) on-demand syscall argument loading
+ - drop seccomp_t (eparis@redhat.com)
+ v4: - adjusted prctl to make room for PR_[SG]ET_NO_NEW_PRIVS
+ - now uses current->no_new_privs
+ (luto@mit.edu,torvalds@linux-foundation.com)
+ - assign names to seccomp modes (rdunlap@xenotime.net)
+ - fix style issues (rdunlap@xenotime.net)
+ - reworded Kconfig entry (rdunlap@xenotime.net)
+ v3: - macros to inline (oleg@redhat.com)
+ - init_task behavior fixed (oleg@redhat.com)
+ - drop creator entry and extra NULL check (oleg@redhat.com)
+ - alloc returns -EINVAL on bad sizing (serge.hallyn@canonical.com)
+ - adds tentative use of "always_unprivileged" as per
+ torvalds@linux-foundation.org and luto@mit.edu
+ v2: - (patch 2 only)
+Signed-off-by: James Morris <james.l.morris@oracle.com>
+Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
+---
+ arch/Kconfig | 17 ++
+ include/linux/Kbuild | 1 +
+ include/linux/seccomp.h | 76 +++++++++-
+ kernel/fork.c | 3 +
+ kernel/seccomp.c | 396 ++++++++++++++++++++++++++++++++++++++++++++--
+ kernel/sys.c | 2 +-
+ 6 files changed, 472 insertions(+), 23 deletions(-)
+
+diff --git a/arch/Kconfig b/arch/Kconfig
+index 684eb5a..91c2c73 100644
+--- a/arch/Kconfig
++++ b/arch/Kconfig
+@@ -216,4 +216,21 @@ config HAVE_CMPXCHG_DOUBLE
+ config ARCH_WANT_OLD_COMPAT_IPC
+ bool
+
++config HAVE_ARCH_SECCOMP_FILTER
++ bool
++ help
++ This symbol should be selected by an architecture if it provides
++ asm/syscall.h, specifically syscall_get_arguments() and
++ syscall_get_arch().
++
++config SECCOMP_FILTER
++ def_bool y
++ depends on HAVE_ARCH_SECCOMP_FILTER && SECCOMP && NET
++ help
++ Enable tasks to build secure computing environments defined
++ in terms of Berkeley Packet Filter programs which implement
++ task-defined system call filtering policies.
++
++ See Documentation/prctl/seccomp_filter.txt for details.
++
+ source "kernel/gcov/Kconfig"
+diff --git a/include/linux/Kbuild b/include/linux/Kbuild
+index 50f55c7..bc82495 100644
+--- a/include/linux/Kbuild
++++ b/include/linux/Kbuild
+@@ -333,6 +333,7 @@ header-y += scc.h
+ header-y += sched.h
+ header-y += screen_info.h
+ header-y += sdla.h
++header-y += seccomp.h
+ header-y += securebits.h
+ header-y += selinux_netlink.h
+ header-y += sem.h
+diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
+index d61f27f..86bb68f 100644
+--- a/include/linux/seccomp.h
++++ b/include/linux/seccomp.h
+@@ -1,14 +1,67 @@
+ #ifndef _LINUX_SECCOMP_H
+ #define _LINUX_SECCOMP_H
+
++#include <linux/compiler.h>
++#include <linux/types.h>
++
++
++/* Valid values for seccomp.mode and prctl(PR_SET_SECCOMP, <mode>) */
++#define SECCOMP_MODE_DISABLED 0 /* seccomp is not in use. */
++#define SECCOMP_MODE_STRICT 1 /* uses hard-coded filter. */
++#define SECCOMP_MODE_FILTER 2 /* uses user-supplied filter. */
++
++/*
++ * All BPF programs must return a 32-bit value.
++ * The bottom 16-bits are reserved for future use.
++ * The upper 16-bits are ordered from least permissive values to most.
++ *
++ * The ordering ensures that a min_t() over composed return values always
++ * selects the least permissive choice.
++ */
++#define SECCOMP_RET_KILL 0x00000000U /* kill the task immediately */
++#define SECCOMP_RET_ALLOW 0x7fff0000U /* allow */
++
++/* Masks for the return value sections. */
++#define SECCOMP_RET_ACTION 0x7fff0000U
++#define SECCOMP_RET_DATA 0x0000ffffU
++
++/**
++ * struct seccomp_data - the format the BPF program executes over.
++ * @nr: the system call number
++ * @arch: indicates system call convention as an AUDIT_ARCH_* value
++ * as defined in <linux/audit.h>.
++ * @instruction_pointer: at the time of the system call.
++ * @args: up to 6 system call arguments always stored as 64-bit values
++ * regardless of the architecture.
++ */
++struct seccomp_data {
++ int nr;
++ __u32 arch;
++ __u64 instruction_pointer;
++ __u64 args[6];
++};
+
++#ifdef __KERNEL__
+ #ifdef CONFIG_SECCOMP
+
+ #include <linux/thread_info.h>
+ #include <asm/seccomp.h>
+
++struct seccomp_filter;
++/**
++ * struct seccomp - the state of a seccomp'ed process
++ *
++ * @mode: indicates one of the valid values above for controlled
++ * system calls available to a process.
++ * @filter: The metadata and ruleset for determining what system calls
++ * are allowed for a task.
++ *
++ * @filter must only be accessed from the context of current as there
++ * is no locking.
++ */
+ struct seccomp {
+ int mode;
++ struct seccomp_filter *filter;
+ };
+
+ extern void __secure_computing(int);
+@@ -19,7 +72,7 @@ static inline void secure_computing(int this_syscall)
+ }
+
+ extern long prctl_get_seccomp(void);
+-extern long prctl_set_seccomp(unsigned long);
++extern long prctl_set_seccomp(unsigned long, char __user *);
+
+ static inline int seccomp_mode(struct seccomp *s)
+ {
+@@ -31,15 +84,16 @@ static inline int seccomp_mode(struct seccomp *s)
+ #include <linux/errno.h>
+
+ struct seccomp { };
++struct seccomp_filter { };
+
+-#define secure_computing(x) do { } while (0)
++#define secure_computing(x) 0
+
+ static inline long prctl_get_seccomp(void)
+ {
+ return -EINVAL;
+ }
+
+-static inline long prctl_set_seccomp(unsigned long arg2)
++static inline long prctl_set_seccomp(unsigned long arg2, char __user *arg3)
+ {
+ return -EINVAL;
+ }
+@@ -48,7 +102,21 @@ static inline int seccomp_mode(struct seccomp *s)
+ {
+ return 0;
+ }
+-
+ #endif /* CONFIG_SECCOMP */
+
++#ifdef CONFIG_SECCOMP_FILTER
++extern void put_seccomp_filter(struct task_struct *tsk);
++extern void get_seccomp_filter(struct task_struct *tsk);
++extern u32 seccomp_bpf_load(int off);
++#else /* CONFIG_SECCOMP_FILTER */
++static inline void put_seccomp_filter(struct task_struct *tsk)
++{
++ return;
++}
++static inline void get_seccomp_filter(struct task_struct *tsk)
++{
++ return;
++}
++#endif /* CONFIG_SECCOMP_FILTER */
++#endif /* __KERNEL__ */
+ #endif /* _LINUX_SECCOMP_H */
+diff --git a/kernel/fork.c b/kernel/fork.c
+index 8163333..9faa812 100644
+--- a/kernel/fork.c
++++ b/kernel/fork.c
+@@ -34,6 +34,7 @@
+ #include <linux/cgroup.h>
+ #include <linux/security.h>
+ #include <linux/hugetlb.h>
++#include <linux/seccomp.h>
+ #include <linux/swap.h>
+ #include <linux/syscalls.h>
+ #include <linux/jiffies.h>
+@@ -171,6 +172,7 @@ void free_task(struct task_struct *tsk)
+ free_thread_info(tsk->stack);
+ rt_mutex_debug_task_free(tsk);
+ ftrace_graph_exit_task(tsk);
++ put_seccomp_filter(tsk);
+ free_task_struct(tsk);
+ }
+ EXPORT_SYMBOL(free_task);
+@@ -1164,6 +1166,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
+ goto fork_out;
+
+ ftrace_graph_init_task(p);
++ get_seccomp_filter(p);
+
+ rt_mutex_init_task(p);
+
+diff --git a/kernel/seccomp.c b/kernel/seccomp.c
+index e8d76c5..0aeec19 100644
+--- a/kernel/seccomp.c
++++ b/kernel/seccomp.c
+@@ -3,16 +3,343 @@
+ *
+ * Copyright 2004-2005 Andrea Arcangeli <andrea@cpushare.com>
+ *
+- * This defines a simple but solid secure-computing mode.
++ * Copyright (C) 2012 Google, Inc.
++ * Will Drewry <wad@chromium.org>
++ *
++ * This defines a simple but solid secure-computing facility.
++ *
++ * Mode 1 uses a fixed list of allowed system calls.
++ * Mode 2 allows user-defined system call filters in the form
++ * of Berkeley Packet Filters/Linux Socket Filters.
+ */
+
++#include <linux/atomic.h>
+ #include <linux/audit.h>
+-#include <linux/seccomp.h>
+-#include <linux/sched.h>
+ #include <linux/compat.h>
++#include <linux/sched.h>
++#include <linux/seccomp.h>
+
+ /* #define SECCOMP_DEBUG 1 */
+-#define NR_SECCOMP_MODES 1
++
++#ifdef CONFIG_SECCOMP_FILTER
++#include <asm/syscall.h>
++#include <linux/filter.h>
++#include <linux/security.h>
++#include <linux/slab.h>
++#include <linux/tracehook.h>
++#include <linux/uaccess.h>
++
++/**
++ * struct seccomp_filter - container for seccomp BPF programs
++ *
++ * @usage: reference count to manage the object lifetime.
++ * get/put helpers should be used when accessing an instance
++ * outside of a lifetime-guarded section. In general, this
++ * is only needed for handling filters shared across tasks.
++ * @prev: points to a previously installed, or inherited, filter
++ * @len: the number of instructions in the program
++ * @insns: the BPF program instructions to evaluate
++ *
++ * seccomp_filter objects are organized in a tree linked via the @prev
++ * pointer. For any task, it appears to be a singly-linked list starting
++ * with current->seccomp.filter, the most recently attached or inherited filter.
++ * However, multiple filters may share a @prev node, by way of fork(), which
++ * results in a unidirectional tree existing in memory. This is similar to
++ * how namespaces work.
++ *
++ * seccomp_filter objects should never be modified after being attached
++ * to a task_struct (other than @usage).
++ */
++struct seccomp_filter {
++ atomic_t usage;
++ struct seccomp_filter *prev;
++ unsigned short len; /* Instruction count */
++ struct sock_filter insns[];
++};
++
++/* Limit any path through the tree to 256KB worth of instructions. */
++#define MAX_INSNS_PER_PATH ((1 << 18) / sizeof(struct sock_filter))
++
++static void seccomp_filter_log_failure(int syscall)
++{
++ int compat = 0;
++#ifdef CONFIG_COMPAT
++ compat = is_compat_task();
++#endif
++ pr_info("%s[%d]: %ssystem call %d blocked at 0x%lx\n",
++ current->comm, task_pid_nr(current),
++ (compat ? "compat " : ""),
++ syscall, KSTK_EIP(current));
++}
++
++/**
++ * get_u32 - returns a u32 offset into data
++ * @data: an unsigned 64-bit value
++ * @index: 0 or 1 to return the first or second 32-bits
++ *
++ * This inline exists to hide the length of unsigned long. If a 32-bit
++ * unsigned long is passed in, it will be extended and the top 32-bits will be
++ * 0. If it is a 64-bit unsigned long, then whatever data is resident will be
++ * properly returned.
++ *
++ * Endianness is explicitly ignored and left for BPF program authors to manage
++ * as per the specific architecture.
++ */
++static inline u32 get_u32(u64 data, int index)
++{
++ return ((u32 *)&data)[index];
++}
++
++/* Helper for bpf_load below. */
++#define BPF_DATA(_name) offsetof(struct seccomp_data, _name)
++/**
++ * bpf_load: checks and returns a pointer to the requested offset
++ * @off: offset into struct seccomp_data to load from
++ *
++ * Returns the requested 32-bits of data.
++ * seccomp_check_filter() should assure that @off is 32-bit aligned
++ * and not out of bounds. Failure to do so is a BUG.
++ */
++u32 seccomp_bpf_load(int off)
++{
++ struct pt_regs *regs = task_pt_regs(current);
++ if (off == BPF_DATA(nr))
++ return syscall_get_nr(current, regs);
++ if (off == BPF_DATA(arch))
++ return syscall_get_arch(current, regs);
++ if (off >= BPF_DATA(args[0]) && off < BPF_DATA(args[6])) {
++ unsigned long value;
++ int arg = (off - BPF_DATA(args[0])) / sizeof(u64);
++ int index = !!(off % sizeof(u64));
++ syscall_get_arguments(current, regs, arg, 1, &value);
++ return get_u32(value, index);
++ }
++ if (off == BPF_DATA(instruction_pointer))
++ return get_u32(KSTK_EIP(current), 0);
++ if (off == BPF_DATA(instruction_pointer) + sizeof(u32))
++ return get_u32(KSTK_EIP(current), 1);
++ /* seccomp_check_filter should make this impossible. */
++ BUG();
++}
++
++/**
++ * seccomp_check_filter - verify seccomp filter code
++ * @filter: filter to verify
++ * @flen: length of filter
++ *
++ * Takes a previously checked filter (by sk_chk_filter) and
++ * redirects all filter code that loads struct sk_buff data
++ * and related data through seccomp_bpf_load. It also
++ * enforces length and alignment checking of those loads.
++ *
++ * Returns 0 if the rule set is legal or -EINVAL if not.
++ */
++static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
++{
++ int pc;
++ for (pc = 0; pc < flen; pc++) {
++ struct sock_filter *ftest = &filter[pc];
++ u16 code = ftest->code;
++ u32 k = ftest->k;
++
++ switch (code) {
++ case BPF_S_LD_W_ABS:
++ ftest->code = BPF_S_ANC_SECCOMP_LD_W;
++ /* 32-bit aligned and not out of bounds. */
++ if (k >= sizeof(struct seccomp_data) || k & 3)
++ return -EINVAL;
++ continue;
++ case BPF_S_LD_W_LEN:
++ ftest->code = BPF_S_LD_IMM;
++ ftest->k = sizeof(struct seccomp_data);
++ continue;
++ case BPF_S_LDX_W_LEN:
++ ftest->code = BPF_S_LDX_IMM;
++ ftest->k = sizeof(struct seccomp_data);
++ continue;
++ /* Explicitly include allowed calls. */
++ case BPF_S_RET_K:
++ case BPF_S_RET_A:
++ case BPF_S_ALU_ADD_K:
++ case BPF_S_ALU_ADD_X:
++ case BPF_S_ALU_SUB_K:
++ case BPF_S_ALU_SUB_X:
++ case BPF_S_ALU_MUL_K:
++ case BPF_S_ALU_MUL_X:
++ case BPF_S_ALU_DIV_X:
++ case BPF_S_ALU_AND_K:
++ case BPF_S_ALU_AND_X:
++ case BPF_S_ALU_OR_K:
++ case BPF_S_ALU_OR_X:
++ case BPF_S_ALU_LSH_K:
++ case BPF_S_ALU_LSH_X:
++ case BPF_S_ALU_RSH_K:
++ case BPF_S_ALU_RSH_X:
++ case BPF_S_ALU_NEG:
++ case BPF_S_LD_IMM:
++ case BPF_S_LDX_IMM:
++ case BPF_S_MISC_TAX:
++ case BPF_S_MISC_TXA:
++ case BPF_S_ALU_DIV_K:
++ case BPF_S_LD_MEM:
++ case BPF_S_LDX_MEM:
++ case BPF_S_ST:
++ case BPF_S_STX:
++ case BPF_S_JMP_JA:
++ case BPF_S_JMP_JEQ_K:
++ case BPF_S_JMP_JEQ_X:
++ case BPF_S_JMP_JGE_K:
++ case BPF_S_JMP_JGE_X:
++ case BPF_S_JMP_JGT_K:
++ case BPF_S_JMP_JGT_X:
++ case BPF_S_JMP_JSET_K:
++ case BPF_S_JMP_JSET_X:
++ continue;
++ default:
++ return -EINVAL;
++ }
++ }
++ return 0;
++}
++
++/**
++ * seccomp_run_filters - evaluates all seccomp filters against @syscall
++ * @syscall: number of the current system call
++ *
++ * Returns valid seccomp BPF response codes.
++ */
++static u32 seccomp_run_filters(int syscall)
++{
++ struct seccomp_filter *f;
++ u32 ret = SECCOMP_RET_KILL;
++ /*
++ * All filters in the list are evaluated and the lowest BPF return
++ * value always takes priority.
++ */
++ for (f = current->seccomp.filter; f; f = f->prev) {
++ ret = sk_run_filter(NULL, f->insns);
++ if (ret != SECCOMP_RET_ALLOW)
++ break;
++ }
++ return ret;
++}
++
++/**
++ * seccomp_attach_filter: Attaches a seccomp filter to current.
++ * @fprog: BPF program to install
++ *
++ * Returns 0 on success or an errno on failure.
++ */
++static long seccomp_attach_filter(struct sock_fprog *fprog)
++{
++ struct seccomp_filter *filter;
++ unsigned long fp_size = fprog->len * sizeof(struct sock_filter);
++ unsigned long total_insns = fprog->len;
++ long ret;
++
++ if (fprog->len == 0 || fprog->len > BPF_MAXINSNS)
++ return -EINVAL;
++
++ for (filter = current->seccomp.filter; filter; filter = filter->prev)
++ total_insns += filter->len + 4; /* include a 4 instr penalty */
++ if (total_insns > MAX_INSNS_PER_PATH)
++ return -ENOMEM;
++
++ /*
++ * Installing a seccomp filter requires that the task have
++ * CAP_SYS_ADMIN in its namespace or be running with no_new_privs.
++ * This avoids scenarios where unprivileged tasks can affect the
++ * behavior of privileged children.
++ */
++ if (!current->no_new_privs &&
++ security_capable_noaudit(current_cred(), current_user_ns(),
++ CAP_SYS_ADMIN) != 0)
++ return -EACCES;
++
++ /* Allocate a new seccomp_filter */
++ filter = kzalloc(sizeof(struct seccomp_filter) + fp_size,
++ GFP_KERNEL|__GFP_NOWARN);
++ if (!filter)
++ return -ENOMEM;
++ atomic_set(&filter->usage, 1);
++ filter->len = fprog->len;
++
++ /* Copy the instructions from fprog. */
++ ret = -EFAULT;
++ if (copy_from_user(filter->insns, fprog->filter, fp_size))
++ goto fail;
++
++ /* Check and rewrite the fprog via the skb checker */
++ ret = sk_chk_filter(filter->insns, filter->len);
++ if (ret)
++ goto fail;
++
++ /* Check and rewrite the fprog for seccomp use */
++ ret = seccomp_check_filter(filter->insns, filter->len);
++ if (ret)
++ goto fail;
++
++ /*
++ * If there is an existing filter, make it the prev and don't drop its
++ * task reference.
++ */
++ filter->prev = current->seccomp.filter;
++ current->seccomp.filter = filter;
++ return 0;
++fail:
++ kfree(filter);
++ return ret;
++}
++
++/**
++ * seccomp_attach_user_filter - attaches a user-supplied sock_fprog
++ * @user_filter: pointer to the user data containing a sock_fprog.
++ *
++ * Returns 0 on success and non-zero otherwise.
++ */
++long seccomp_attach_user_filter(char __user *user_filter)
++{
++ struct sock_fprog fprog;
++ long ret = -EFAULT;
++
++#ifdef CONFIG_COMPAT
++ if (is_compat_task()) {
++ struct compat_sock_fprog fprog32;
++ if (copy_from_user(&fprog32, user_filter, sizeof(fprog32)))
++ goto out;
++ fprog.len = fprog32.len;
++ fprog.filter = compat_ptr(fprog32.filter);
++ } else /* falls through to the if below. */
++#endif
++ if (copy_from_user(&fprog, user_filter, sizeof(fprog)))
++ goto out;
++ ret = seccomp_attach_filter(&fprog);
++out:
++ return ret;
++}
++
++/* get_seccomp_filter - increments the reference count of the filter on @tsk */
++void get_seccomp_filter(struct task_struct *tsk)
++{
++ struct seccomp_filter *orig = tsk->seccomp.filter;
++ if (!orig)
++ return;
++ /* Reference count is bounded by the number of total processes. */
++ atomic_inc(&orig->usage);
++}
++
++/* put_seccomp_filter - decrements the ref count of tsk->seccomp.filter */
++void put_seccomp_filter(struct task_struct *tsk)
++{
++ struct seccomp_filter *orig = tsk->seccomp.filter;
++ /* Clean up single-reference branches iteratively. */
++ while (orig && atomic_dec_and_test(&orig->usage)) {
++ struct seccomp_filter *freeme = orig;
++ orig = orig->prev;
++ kfree(freeme);
++ }
++}
++#endif /* CONFIG_SECCOMP_FILTER */
+
+ /*
+ * Secure computing mode 1 allows only read/write/exit/sigreturn.
+@@ -34,10 +361,11 @@ static int mode1_syscalls_32[] = {
+ void __secure_computing(int this_syscall)
+ {
+ int mode = current->seccomp.mode;
+- int * syscall;
++ int exit_sig = 0;
++ int *syscall;
+
+ switch (mode) {
+- case 1:
++ case SECCOMP_MODE_STRICT:
+ syscall = mode1_syscalls;
+ #ifdef CONFIG_COMPAT
+ if (is_compat_task())
+@@ -47,7 +375,16 @@ void __secure_computing(int this_syscall)
+ if (*syscall == this_syscall)
+ return;
+ } while (*++syscall);
++ exit_sig = SIGKILL;
+ break;
++#ifdef CONFIG_SECCOMP_FILTER
++ case SECCOMP_MODE_FILTER:
++ if (seccomp_run_filters(this_syscall) == SECCOMP_RET_ALLOW)
++ return;
++ seccomp_filter_log_failure(this_syscall);
++ exit_sig = SIGSYS;
++ break;
++#endif
+ default:
+ BUG();
+ }
+@@ -56,7 +393,7 @@ void __secure_computing(int this_syscall)
+ dump_stack();
+ #endif
+ audit_seccomp(this_syscall);
+- do_exit(SIGKILL);
++ do_exit(exit_sig);
+ }
+
+ long prctl_get_seccomp(void)
+@@ -64,25 +401,48 @@ long prctl_get_seccomp(void)
+ return current->seccomp.mode;
+ }
+
+-long prctl_set_seccomp(unsigned long seccomp_mode)
++/**
++ * prctl_set_seccomp: configures current->seccomp.mode
++ * @seccomp_mode: requested mode to use
++ * @filter: optional struct sock_fprog for use with SECCOMP_MODE_FILTER
++ *
++ * This function may be called repeatedly with a @seccomp_mode of
++ * SECCOMP_MODE_FILTER to install additional filters. Every filter
++ * successfully installed will be evaluated (in reverse order) for each system
++ * call the task makes.
++ *
++ * Once current->seccomp.mode is non-zero, it may not be changed.
++ *
++ * Returns 0 on success or -EINVAL on failure.
++ */
++long prctl_set_seccomp(unsigned long seccomp_mode, char __user *filter)
+ {
+- long ret;
++ long ret = -EINVAL;
+
+- /* can set it only once to be even more secure */
+- ret = -EPERM;
+- if (unlikely(current->seccomp.mode))
++ if (current->seccomp.mode &&
++ current->seccomp.mode != seccomp_mode)
+ goto out;
+
+- ret = -EINVAL;
+- if (seccomp_mode && seccomp_mode <= NR_SECCOMP_MODES) {
+- current->seccomp.mode = seccomp_mode;
+- set_thread_flag(TIF_SECCOMP);
++ switch (seccomp_mode) {
++ case SECCOMP_MODE_STRICT:
++ ret = 0;
+ #ifdef TIF_NOTSC
+ disable_TSC();
+ #endif
+- ret = 0;
++ break;
++#ifdef CONFIG_SECCOMP_FILTER
++ case SECCOMP_MODE_FILTER:
++ ret = seccomp_attach_user_filter(filter);
++ if (ret)
++ goto out;
++ break;
++#endif
++ default:
++ goto out;
+ }
+
+- out:
++ current->seccomp.mode = seccomp_mode;
++ set_thread_flag(TIF_SECCOMP);
++out:
+ return ret;
+ }
+diff --git a/kernel/sys.c b/kernel/sys.c
+index b82568b..ba0ae8e 100644
+--- a/kernel/sys.c
++++ b/kernel/sys.c
+@@ -1908,7 +1908,7 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
+ error = prctl_get_seccomp();
+ break;
+ case PR_SET_SECCOMP:
+- error = prctl_set_seccomp(arg2);
++ error = prctl_set_seccomp(arg2, (char __user *)arg3);
+ break;
+ case PR_GET_TSC:
+ error = GET_TSC_CTL(arg2);
+--
+1.7.9.1
+