diff options
Diffstat (limited to 'features/seccomp/seccomp-add-system-call-filtering-using-BPF.patch')
-rw-r--r-- | features/seccomp/seccomp-add-system-call-filtering-using-BPF.patch | 820 |
1 files changed, 820 insertions, 0 deletions
diff --git a/features/seccomp/seccomp-add-system-call-filtering-using-BPF.patch b/features/seccomp/seccomp-add-system-call-filtering-using-BPF.patch new file mode 100644 index 00000000..908f3cfd --- /dev/null +++ b/features/seccomp/seccomp-add-system-call-filtering-using-BPF.patch @@ -0,0 +1,820 @@ +From 01c9617a2eca38f68d917ae16bdf8c2c8d863c8e Mon Sep 17 00:00:00 2001 +From: Will Drewry <wad@chromium.org> +Date: Thu, 12 Apr 2012 16:47:57 -0500 +Subject: [PATCH] seccomp: add system call filtering using BPF + +commit e2cfabdfd075648216f99c2c03821cf3f47c1727 upstream. + +[This patch depends on luto@mit.edu's no_new_privs patch: + https://lkml.org/lkml/2012/1/30/264 + The whole series including Andrew's patches can be found here: + https://github.com/redpig/linux/tree/seccomp + Complete diff here: + https://github.com/redpig/linux/compare/1dc65fed...seccomp +] + +This patch adds support for seccomp mode 2. Mode 2 introduces the +ability for unprivileged processes to install system call filtering +policy expressed in terms of a Berkeley Packet Filter (BPF) program. +This program will be evaluated in the kernel for each system call +the task makes and computes a result based on data in the format +of struct seccomp_data. + +A filter program may be installed by calling: + struct sock_fprog fprog = { ... }; + ... + prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &fprog); + +The return value of the filter program determines if the system call is +allowed to proceed or denied. If the first filter program installed +allows prctl(2) calls, then the above call may be made repeatedly +by a task to further reduce its access to the kernel. All attached +programs must be evaluated before a system call will be allowed to +proceed. + +Filter programs will be inherited across fork/clone and execve. +However, if the task attaching the filter is unprivileged +(!CAP_SYS_ADMIN) the no_new_privs bit will be set on the task. 
This +ensures that unprivileged tasks cannot attach filters that affect +privileged tasks (e.g., setuid binary). + +There are a number of benefits to this approach. A few of which are +as follows: +- BPF has been exposed to userland for a long time +- BPF optimization (and JIT'ing) are well understood +- Userland already knows its ABI: system call numbers and desired + arguments +- No time-of-check-time-of-use vulnerable data accesses are possible. +- system call arguments are loaded on access only to minimize copying + required for system call policy decisions. + +Mode 2 support is restricted to architectures that enable +HAVE_ARCH_SECCOMP_FILTER. In this patch, the primary dependency is on +syscall_get_arguments(). The full desired scope of this feature will +add a few minor additional requirements expressed later in this series. +Based on discussion, SECCOMP_RET_ERRNO and SECCOMP_RET_TRACE seem to be +the desired additional functionality. + +No architectures are enabled in this patch. 
+ +Signed-off-by: Will Drewry <wad@chromium.org> +Acked-by: Serge Hallyn <serge.hallyn@canonical.com> +Reviewed-by: Indan Zupancic <indan@nul.nu> +Acked-by: Eric Paris <eparis@redhat.com> +Reviewed-by: Kees Cook <keescook@chromium.org> + +v18: - rebase to v3.4-rc2 + - s/chk/check/ (akpm@linux-foundation.org,jmorris@namei.org) + - allocate with GFP_KERNEL|__GFP_NOWARN (indan@nul.nu) + - add a comment for get_u32 regarding endianness (akpm@) + - fix other typos, style mistakes (akpm@) + - added acked-by +v17: - properly guard seccomp filter needed headers (leann@ubuntu.com) + - tighten return mask to 0x7fff0000 +v16: - no change +v15: - add a 4 instr penalty when counting a path to account for seccomp_filter + size (indan@nul.nu) + - drop the max insns to 256KB (indan@nul.nu) + - return ENOMEM if the max insns limit has been hit (indan@nul.nu) + - move IP checks after args (indan@nul.nu) + - drop !user_filter check (indan@nul.nu) + - only allow explicit bpf codes (indan@nul.nu) + - exit_code -> exit_sig +v14: - put/get_seccomp_filter takes struct task_struct + (indan@nul.nu,keescook@chromium.org) + - adds seccomp_chk_filter and drops general bpf_run/chk_filter user + - add seccomp_bpf_load for use by net/core/filter.c + - lower max per-process/per-hierarchy: 1MB + - moved nnp/capability check prior to allocation + (all of the above: indan@nul.nu) +v13: - rebase on to 88ebdda6159ffc15699f204c33feb3e431bf9bdc +v12: - added a maximum instruction count per path (indan@nul.nu,oleg@redhat.com) + - removed copy_seccomp (keescook@chromium.org,indan@nul.nu) + - reworded the prctl_set_seccomp comment (indan@nul.nu) +v11: - reorder struct seccomp_data to allow future args expansion (hpa@zytor.com) + - style clean up, @compat dropped, compat_sock_fprog32 (indan@nul.nu) + - do_exit(SIGSYS) (keescook@chromium.org, luto@mit.edu) + - pare down Kconfig doc reference. 
+ - extra comment clean up +v10: - seccomp_data has changed again to be more aesthetically pleasing + (hpa@zytor.com) + - calling convention is noted in a new u32 field using syscall_get_arch. + This allows for cross-calling convention tasks to use seccomp filters. + (hpa@zytor.com) + - lots of clean up (thanks, Indan!) + v9: - n/a + v8: - use bpf_chk_filter, bpf_run_filter. update load_fns + - Lots of fixes courtesy of indan@nul.nu: + -- fix up load behavior, compat fixups, and merge alloc code, + -- renamed pc and dropped __packed, use bool compat. + -- Added a hidden CONFIG_SECCOMP_FILTER to synthesize non-arch + dependencies + v7: (massive overhaul thanks to Indan, others) + - added CONFIG_HAVE_ARCH_SECCOMP_FILTER + - merged into seccomp.c + - minimal seccomp_filter.h + - no config option (part of seccomp) + - no new prctl + - doesn't break seccomp on systems without asm/syscall.h + (works but arg access always fails) + - dropped seccomp_init_task, extra free functions, ... + - dropped the no-asm/syscall.h code paths + - merges with network sk_run_filter and sk_chk_filter + v6: - fix memory leak on attach compat check failure + - require no_new_privs || CAP_SYS_ADMIN prior to filter + installation. (luto@mit.edu) + - s/seccomp_struct_/seccomp_/ for macros/functions (amwang@redhat.com) + - cleaned up Kconfig (amwang@redhat.com) + - on block, note if the call was compat (so the # means something) + v5: - uses syscall_get_arguments + (indan@nul.nu,oleg@redhat.com, mcgrathr@chromium.org) + - uses union-based arg storage with hi/lo struct to + handle endianness. Compromises between the two alternate + proposals to minimize extra arg shuffling and account for + endianness assuming userspace uses offsetof(). 
+ (mcgrathr@chromium.org, indan@nul.nu) + - update Kconfig description + - add include/seccomp_filter.h and add its installation + - (naive) on-demand syscall argument loading + - drop seccomp_t (eparis@redhat.com) + v4: - adjusted prctl to make room for PR_[SG]ET_NO_NEW_PRIVS + - now uses current->no_new_privs + (luto@mit.edu,torvalds@linux-foundation.com) + - assign names to seccomp modes (rdunlap@xenotime.net) + - fix style issues (rdunlap@xenotime.net) + - reworded Kconfig entry (rdunlap@xenotime.net) + v3: - macros to inline (oleg@redhat.com) + - init_task behavior fixed (oleg@redhat.com) + - drop creator entry and extra NULL check (oleg@redhat.com) + - alloc returns -EINVAL on bad sizing (serge.hallyn@canonical.com) + - adds tentative use of "always_unprivileged" as per + torvalds@linux-foundation.org and luto@mit.edu + v2: - (patch 2 only) +Signed-off-by: James Morris <james.l.morris@oracle.com> +Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com> +--- + arch/Kconfig | 17 ++ + include/linux/Kbuild | 1 + + include/linux/seccomp.h | 76 +++++++++- + kernel/fork.c | 3 + + kernel/seccomp.c | 396 ++++++++++++++++++++++++++++++++++++++++++++-- + kernel/sys.c | 2 +- + 6 files changed, 472 insertions(+), 23 deletions(-) + +diff --git a/arch/Kconfig b/arch/Kconfig +index 684eb5a..91c2c73 100644 +--- a/arch/Kconfig ++++ b/arch/Kconfig +@@ -216,4 +216,21 @@ config HAVE_CMPXCHG_DOUBLE + config ARCH_WANT_OLD_COMPAT_IPC + bool + ++config HAVE_ARCH_SECCOMP_FILTER ++ bool ++ help ++ This symbol should be selected by an architecture if it provides ++ asm/syscall.h, specifically syscall_get_arguments() and ++ syscall_get_arch(). ++ ++config SECCOMP_FILTER ++ def_bool y ++ depends on HAVE_ARCH_SECCOMP_FILTER && SECCOMP && NET ++ help ++ Enable tasks to build secure computing environments defined ++ in terms of Berkeley Packet Filter programs which implement ++ task-defined system call filtering policies. ++ ++ See Documentation/prctl/seccomp_filter.txt for details. 
++ + source "kernel/gcov/Kconfig" +diff --git a/include/linux/Kbuild b/include/linux/Kbuild +index 50f55c7..bc82495 100644 +--- a/include/linux/Kbuild ++++ b/include/linux/Kbuild +@@ -333,6 +333,7 @@ header-y += scc.h + header-y += sched.h + header-y += screen_info.h + header-y += sdla.h ++header-y += seccomp.h + header-y += securebits.h + header-y += selinux_netlink.h + header-y += sem.h +diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h +index d61f27f..86bb68f 100644 +--- a/include/linux/seccomp.h ++++ b/include/linux/seccomp.h +@@ -1,14 +1,67 @@ + #ifndef _LINUX_SECCOMP_H + #define _LINUX_SECCOMP_H + ++#include <linux/compiler.h> ++#include <linux/types.h> ++ ++ ++/* Valid values for seccomp.mode and prctl(PR_SET_SECCOMP, <mode>) */ ++#define SECCOMP_MODE_DISABLED 0 /* seccomp is not in use. */ ++#define SECCOMP_MODE_STRICT 1 /* uses hard-coded filter. */ ++#define SECCOMP_MODE_FILTER 2 /* uses user-supplied filter. */ ++ ++/* ++ * All BPF programs must return a 32-bit value. ++ * The bottom 16-bits are reserved for future use. ++ * The upper 16-bits are ordered from least permissive values to most. ++ * ++ * The ordering ensures that a min_t() over composed return values always ++ * selects the least permissive choice. ++ */ ++#define SECCOMP_RET_KILL 0x00000000U /* kill the task immediately */ ++#define SECCOMP_RET_ALLOW 0x7fff0000U /* allow */ ++ ++/* Masks for the return value sections. */ ++#define SECCOMP_RET_ACTION 0x7fff0000U ++#define SECCOMP_RET_DATA 0x0000ffffU ++ ++/** ++ * struct seccomp_data - the format the BPF program executes over. ++ * @nr: the system call number ++ * @arch: indicates system call convention as an AUDIT_ARCH_* value ++ * as defined in <linux/audit.h>. ++ * @instruction_pointer: at the time of the system call. ++ * @args: up to 6 system call arguments always stored as 64-bit values ++ * regardless of the architecture. 
++ */ ++struct seccomp_data { ++ int nr; ++ __u32 arch; ++ __u64 instruction_pointer; ++ __u64 args[6]; ++}; + ++#ifdef __KERNEL__ + #ifdef CONFIG_SECCOMP + + #include <linux/thread_info.h> + #include <asm/seccomp.h> + ++struct seccomp_filter; ++/** ++ * struct seccomp - the state of a seccomp'ed process ++ * ++ * @mode: indicates one of the valid values above for controlled ++ * system calls available to a process. ++ * @filter: The metadata and ruleset for determining what system calls ++ * are allowed for a task. ++ * ++ * @filter must only be accessed from the context of current as there ++ * is no locking. ++ */ + struct seccomp { + int mode; ++ struct seccomp_filter *filter; + }; + + extern void __secure_computing(int); +@@ -19,7 +72,7 @@ static inline void secure_computing(int this_syscall) + } + + extern long prctl_get_seccomp(void); +-extern long prctl_set_seccomp(unsigned long); ++extern long prctl_set_seccomp(unsigned long, char __user *); + + static inline int seccomp_mode(struct seccomp *s) + { +@@ -31,15 +84,16 @@ static inline int seccomp_mode(struct seccomp *s) + #include <linux/errno.h> + + struct seccomp { }; ++struct seccomp_filter { }; + +-#define secure_computing(x) do { } while (0) ++#define secure_computing(x) 0 + + static inline long prctl_get_seccomp(void) + { + return -EINVAL; + } + +-static inline long prctl_set_seccomp(unsigned long arg2) ++static inline long prctl_set_seccomp(unsigned long arg2, char __user *arg3) + { + return -EINVAL; + } +@@ -48,7 +102,21 @@ static inline int seccomp_mode(struct seccomp *s) + { + return 0; + } +- + #endif /* CONFIG_SECCOMP */ + ++#ifdef CONFIG_SECCOMP_FILTER ++extern void put_seccomp_filter(struct task_struct *tsk); ++extern void get_seccomp_filter(struct task_struct *tsk); ++extern u32 seccomp_bpf_load(int off); ++#else /* CONFIG_SECCOMP_FILTER */ ++static inline void put_seccomp_filter(struct task_struct *tsk) ++{ ++ return; ++} ++static inline void get_seccomp_filter(struct task_struct *tsk) ++{ ++ 
return; ++} ++#endif /* CONFIG_SECCOMP_FILTER */ ++#endif /* __KERNEL__ */ + #endif /* _LINUX_SECCOMP_H */ +diff --git a/kernel/fork.c b/kernel/fork.c +index 8163333..9faa812 100644 +--- a/kernel/fork.c ++++ b/kernel/fork.c +@@ -34,6 +34,7 @@ + #include <linux/cgroup.h> + #include <linux/security.h> + #include <linux/hugetlb.h> ++#include <linux/seccomp.h> + #include <linux/swap.h> + #include <linux/syscalls.h> + #include <linux/jiffies.h> +@@ -171,6 +172,7 @@ void free_task(struct task_struct *tsk) + free_thread_info(tsk->stack); + rt_mutex_debug_task_free(tsk); + ftrace_graph_exit_task(tsk); ++ put_seccomp_filter(tsk); + free_task_struct(tsk); + } + EXPORT_SYMBOL(free_task); +@@ -1164,6 +1166,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, + goto fork_out; + + ftrace_graph_init_task(p); ++ get_seccomp_filter(p); + + rt_mutex_init_task(p); + +diff --git a/kernel/seccomp.c b/kernel/seccomp.c +index e8d76c5..0aeec19 100644 +--- a/kernel/seccomp.c ++++ b/kernel/seccomp.c +@@ -3,16 +3,343 @@ + * + * Copyright 2004-2005 Andrea Arcangeli <andrea@cpushare.com> + * +- * This defines a simple but solid secure-computing mode. ++ * Copyright (C) 2012 Google, Inc. ++ * Will Drewry <wad@chromium.org> ++ * ++ * This defines a simple but solid secure-computing facility. ++ * ++ * Mode 1 uses a fixed list of allowed system calls. ++ * Mode 2 allows user-defined system call filters in the form ++ * of Berkeley Packet Filters/Linux Socket Filters. 
+ */ + ++#include <linux/atomic.h> + #include <linux/audit.h> +-#include <linux/seccomp.h> +-#include <linux/sched.h> + #include <linux/compat.h> ++#include <linux/sched.h> ++#include <linux/seccomp.h> + + /* #define SECCOMP_DEBUG 1 */ +-#define NR_SECCOMP_MODES 1 ++ ++#ifdef CONFIG_SECCOMP_FILTER ++#include <asm/syscall.h> ++#include <linux/filter.h> ++#include <linux/security.h> ++#include <linux/slab.h> ++#include <linux/tracehook.h> ++#include <linux/uaccess.h> ++ ++/** ++ * struct seccomp_filter - container for seccomp BPF programs ++ * ++ * @usage: reference count to manage the object lifetime. ++ * get/put helpers should be used when accessing an instance ++ * outside of a lifetime-guarded section. In general, this ++ * is only needed for handling filters shared across tasks. ++ * @prev: points to a previously installed, or inherited, filter ++ * @len: the number of instructions in the program ++ * @insns: the BPF program instructions to evaluate ++ * ++ * seccomp_filter objects are organized in a tree linked via the @prev ++ * pointer. For any task, it appears to be a singly-linked list starting ++ * with current->seccomp.filter, the most recently attached or inherited filter. ++ * However, multiple filters may share a @prev node, by way of fork(), which ++ * results in a unidirectional tree existing in memory. This is similar to ++ * how namespaces work. ++ * ++ * seccomp_filter objects should never be modified after being attached ++ * to a task_struct (other than @usage). ++ */ ++struct seccomp_filter { ++ atomic_t usage; ++ struct seccomp_filter *prev; ++ unsigned short len; /* Instruction count */ ++ struct sock_filter insns[]; ++}; ++ ++/* Limit any path through the tree to 256KB worth of instructions. 
*/ ++#define MAX_INSNS_PER_PATH ((1 << 18) / sizeof(struct sock_filter)) ++ ++static void seccomp_filter_log_failure(int syscall) ++{ ++ int compat = 0; ++#ifdef CONFIG_COMPAT ++ compat = is_compat_task(); ++#endif ++ pr_info("%s[%d]: %ssystem call %d blocked at 0x%lx\n", ++ current->comm, task_pid_nr(current), ++ (compat ? "compat " : ""), ++ syscall, KSTK_EIP(current)); ++} ++ ++/** ++ * get_u32 - returns a u32 offset into data ++ * @data: an unsigned 64-bit value ++ * @index: 0 or 1 to return the first or second 32-bits ++ * ++ * This inline exists to hide the length of unsigned long. If a 32-bit ++ * unsigned long is passed in, it will be extended and the top 32-bits will be ++ * 0. If it is a 64-bit unsigned long, then whatever data is resident will be ++ * properly returned. ++ * ++ * Endianness is explicitly ignored and left for BPF program authors to manage ++ * as per the specific architecture. ++ */ ++static inline u32 get_u32(u64 data, int index) ++{ ++ return ((u32 *)&data)[index]; ++} ++ ++/* Helper for seccomp_bpf_load below. */ ++#define BPF_DATA(_name) offsetof(struct seccomp_data, _name) ++/** ++ * seccomp_bpf_load: returns the 32-bit value at the requested offset ++ * @off: offset into struct seccomp_data to load from ++ * ++ * Returns the requested 32-bits of data. ++ * seccomp_check_filter() should ensure that @off is 32-bit aligned ++ * and not out of bounds. Failure to do so is a BUG. 
++ */ ++u32 seccomp_bpf_load(int off) ++{ ++ struct pt_regs *regs = task_pt_regs(current); ++ if (off == BPF_DATA(nr)) ++ return syscall_get_nr(current, regs); ++ if (off == BPF_DATA(arch)) ++ return syscall_get_arch(current, regs); ++ if (off >= BPF_DATA(args[0]) && off < BPF_DATA(args[6])) { ++ unsigned long value; ++ int arg = (off - BPF_DATA(args[0])) / sizeof(u64); ++ int index = !!(off % sizeof(u64)); ++ syscall_get_arguments(current, regs, arg, 1, &value); ++ return get_u32(value, index); ++ } ++ if (off == BPF_DATA(instruction_pointer)) ++ return get_u32(KSTK_EIP(current), 0); ++ if (off == BPF_DATA(instruction_pointer) + sizeof(u32)) ++ return get_u32(KSTK_EIP(current), 1); ++ /* seccomp_check_filter should make this impossible. */ ++ BUG(); ++} ++ ++/** ++ * seccomp_check_filter - verify seccomp filter code ++ * @filter: filter to verify ++ * @flen: length of filter ++ * ++ * Takes a previously checked filter (by sk_chk_filter) and ++ * redirects all filter code that loads struct sk_buff data ++ * and related data through seccomp_bpf_load. It also ++ * enforces length and alignment checking of those loads. ++ * ++ * Returns 0 if the rule set is legal or -EINVAL if not. ++ */ ++static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen) ++{ ++ int pc; ++ for (pc = 0; pc < flen; pc++) { ++ struct sock_filter *ftest = &filter[pc]; ++ u16 code = ftest->code; ++ u32 k = ftest->k; ++ ++ switch (code) { ++ case BPF_S_LD_W_ABS: ++ ftest->code = BPF_S_ANC_SECCOMP_LD_W; ++ /* 32-bit aligned and not out of bounds. */ ++ if (k >= sizeof(struct seccomp_data) || k & 3) ++ return -EINVAL; ++ continue; ++ case BPF_S_LD_W_LEN: ++ ftest->code = BPF_S_LD_IMM; ++ ftest->k = sizeof(struct seccomp_data); ++ continue; ++ case BPF_S_LDX_W_LEN: ++ ftest->code = BPF_S_LDX_IMM; ++ ftest->k = sizeof(struct seccomp_data); ++ continue; ++ /* Explicitly include allowed calls. 
*/ ++ case BPF_S_RET_K: ++ case BPF_S_RET_A: ++ case BPF_S_ALU_ADD_K: ++ case BPF_S_ALU_ADD_X: ++ case BPF_S_ALU_SUB_K: ++ case BPF_S_ALU_SUB_X: ++ case BPF_S_ALU_MUL_K: ++ case BPF_S_ALU_MUL_X: ++ case BPF_S_ALU_DIV_X: ++ case BPF_S_ALU_AND_K: ++ case BPF_S_ALU_AND_X: ++ case BPF_S_ALU_OR_K: ++ case BPF_S_ALU_OR_X: ++ case BPF_S_ALU_LSH_K: ++ case BPF_S_ALU_LSH_X: ++ case BPF_S_ALU_RSH_K: ++ case BPF_S_ALU_RSH_X: ++ case BPF_S_ALU_NEG: ++ case BPF_S_LD_IMM: ++ case BPF_S_LDX_IMM: ++ case BPF_S_MISC_TAX: ++ case BPF_S_MISC_TXA: ++ case BPF_S_ALU_DIV_K: ++ case BPF_S_LD_MEM: ++ case BPF_S_LDX_MEM: ++ case BPF_S_ST: ++ case BPF_S_STX: ++ case BPF_S_JMP_JA: ++ case BPF_S_JMP_JEQ_K: ++ case BPF_S_JMP_JEQ_X: ++ case BPF_S_JMP_JGE_K: ++ case BPF_S_JMP_JGE_X: ++ case BPF_S_JMP_JGT_K: ++ case BPF_S_JMP_JGT_X: ++ case BPF_S_JMP_JSET_K: ++ case BPF_S_JMP_JSET_X: ++ continue; ++ default: ++ return -EINVAL; ++ } ++ } ++ return 0; ++} ++ ++/** ++ * seccomp_run_filters - evaluates all seccomp filters against @syscall ++ * @syscall: number of the current system call ++ * ++ * Returns valid seccomp BPF response codes. ++ */ ++static u32 seccomp_run_filters(int syscall) ++{ ++ struct seccomp_filter *f; ++ u32 ret = SECCOMP_RET_KILL; ++ /* ++ * All filters in the list are evaluated and the lowest BPF return ++ * value always takes priority. ++ */ ++ for (f = current->seccomp.filter; f; f = f->prev) { ++ ret = sk_run_filter(NULL, f->insns); ++ if (ret != SECCOMP_RET_ALLOW) ++ break; ++ } ++ return ret; ++} ++ ++/** ++ * seccomp_attach_filter: Attaches a seccomp filter to current. ++ * @fprog: BPF program to install ++ * ++ * Returns 0 on success or an errno on failure. 
++ */ ++static long seccomp_attach_filter(struct sock_fprog *fprog) ++{ ++ struct seccomp_filter *filter; ++ unsigned long fp_size = fprog->len * sizeof(struct sock_filter); ++ unsigned long total_insns = fprog->len; ++ long ret; ++ ++ if (fprog->len == 0 || fprog->len > BPF_MAXINSNS) ++ return -EINVAL; ++ ++ for (filter = current->seccomp.filter; filter; filter = filter->prev) ++ total_insns += filter->len + 4; /* include a 4 instr penalty */ ++ if (total_insns > MAX_INSNS_PER_PATH) ++ return -ENOMEM; ++ ++ /* ++ * Installing a seccomp filter requires that the task have ++ * CAP_SYS_ADMIN in its namespace or be running with no_new_privs. ++ * This avoids scenarios where unprivileged tasks can affect the ++ * behavior of privileged children. ++ */ ++ if (!current->no_new_privs && ++ security_capable_noaudit(current_cred(), current_user_ns(), ++ CAP_SYS_ADMIN) != 0) ++ return -EACCES; ++ ++ /* Allocate a new seccomp_filter */ ++ filter = kzalloc(sizeof(struct seccomp_filter) + fp_size, ++ GFP_KERNEL|__GFP_NOWARN); ++ if (!filter) ++ return -ENOMEM; ++ atomic_set(&filter->usage, 1); ++ filter->len = fprog->len; ++ ++ /* Copy the instructions from fprog. */ ++ ret = -EFAULT; ++ if (copy_from_user(filter->insns, fprog->filter, fp_size)) ++ goto fail; ++ ++ /* Check and rewrite the fprog via the skb checker */ ++ ret = sk_chk_filter(filter->insns, filter->len); ++ if (ret) ++ goto fail; ++ ++ /* Check and rewrite the fprog for seccomp use */ ++ ret = seccomp_check_filter(filter->insns, filter->len); ++ if (ret) ++ goto fail; ++ ++ /* ++ * If there is an existing filter, make it the prev and don't drop its ++ * task reference. ++ */ ++ filter->prev = current->seccomp.filter; ++ current->seccomp.filter = filter; ++ return 0; ++fail: ++ kfree(filter); ++ return ret; ++} ++ ++/** ++ * seccomp_attach_user_filter - attaches a user-supplied sock_fprog ++ * @user_filter: pointer to the user data containing a sock_fprog. ++ * ++ * Returns 0 on success and non-zero otherwise. 
++ */ ++long seccomp_attach_user_filter(char __user *user_filter) ++{ ++ struct sock_fprog fprog; ++ long ret = -EFAULT; ++ ++#ifdef CONFIG_COMPAT ++ if (is_compat_task()) { ++ struct compat_sock_fprog fprog32; ++ if (copy_from_user(&fprog32, user_filter, sizeof(fprog32))) ++ goto out; ++ fprog.len = fprog32.len; ++ fprog.filter = compat_ptr(fprog32.filter); ++ } else /* falls through to the if below. */ ++#endif ++ if (copy_from_user(&fprog, user_filter, sizeof(fprog))) ++ goto out; ++ ret = seccomp_attach_filter(&fprog); ++out: ++ return ret; ++} ++ ++/* get_seccomp_filter - increments the reference count of the filter on @tsk */ ++void get_seccomp_filter(struct task_struct *tsk) ++{ ++ struct seccomp_filter *orig = tsk->seccomp.filter; ++ if (!orig) ++ return; ++ /* Reference count is bounded by the number of total processes. */ ++ atomic_inc(&orig->usage); ++} ++ ++/* put_seccomp_filter - decrements the ref count of tsk->seccomp.filter */ ++void put_seccomp_filter(struct task_struct *tsk) ++{ ++ struct seccomp_filter *orig = tsk->seccomp.filter; ++ /* Clean up single-reference branches iteratively. */ ++ while (orig && atomic_dec_and_test(&orig->usage)) { ++ struct seccomp_filter *freeme = orig; ++ orig = orig->prev; ++ kfree(freeme); ++ } ++} ++#endif /* CONFIG_SECCOMP_FILTER */ + + /* + * Secure computing mode 1 allows only read/write/exit/sigreturn. 
+@@ -34,10 +361,11 @@ static int mode1_syscalls_32[] = { + void __secure_computing(int this_syscall) + { + int mode = current->seccomp.mode; +- int * syscall; ++ int exit_sig = 0; ++ int *syscall; + + switch (mode) { +- case 1: ++ case SECCOMP_MODE_STRICT: + syscall = mode1_syscalls; + #ifdef CONFIG_COMPAT + if (is_compat_task()) +@@ -47,7 +375,16 @@ void __secure_computing(int this_syscall) + if (*syscall == this_syscall) + return; + } while (*++syscall); ++ exit_sig = SIGKILL; + break; ++#ifdef CONFIG_SECCOMP_FILTER ++ case SECCOMP_MODE_FILTER: ++ if (seccomp_run_filters(this_syscall) == SECCOMP_RET_ALLOW) ++ return; ++ seccomp_filter_log_failure(this_syscall); ++ exit_sig = SIGSYS; ++ break; ++#endif + default: + BUG(); + } +@@ -56,7 +393,7 @@ void __secure_computing(int this_syscall) + dump_stack(); + #endif + audit_seccomp(this_syscall); +- do_exit(SIGKILL); ++ do_exit(exit_sig); + } + + long prctl_get_seccomp(void) +@@ -64,25 +401,48 @@ long prctl_get_seccomp(void) + return current->seccomp.mode; + } + +-long prctl_set_seccomp(unsigned long seccomp_mode) ++/** ++ * prctl_set_seccomp: configures current->seccomp.mode ++ * @seccomp_mode: requested mode to use ++ * @filter: optional struct sock_fprog for use with SECCOMP_MODE_FILTER ++ * ++ * This function may be called repeatedly with a @seccomp_mode of ++ * SECCOMP_MODE_FILTER to install additional filters. Every filter ++ * successfully installed will be evaluated (in reverse order) for each system ++ * call the task makes. ++ * ++ * Once current->seccomp.mode is non-zero, it may not be changed. ++ * ++ * Returns 0 on success or -EINVAL on failure. 
++ */ ++long prctl_set_seccomp(unsigned long seccomp_mode, char __user *filter) + { +- long ret; ++ long ret = -EINVAL; + +- /* can set it only once to be even more secure */ +- ret = -EPERM; +- if (unlikely(current->seccomp.mode)) ++ if (current->seccomp.mode && ++ current->seccomp.mode != seccomp_mode) + goto out; + +- ret = -EINVAL; +- if (seccomp_mode && seccomp_mode <= NR_SECCOMP_MODES) { +- current->seccomp.mode = seccomp_mode; +- set_thread_flag(TIF_SECCOMP); ++ switch (seccomp_mode) { ++ case SECCOMP_MODE_STRICT: ++ ret = 0; + #ifdef TIF_NOTSC + disable_TSC(); + #endif +- ret = 0; ++ break; ++#ifdef CONFIG_SECCOMP_FILTER ++ case SECCOMP_MODE_FILTER: ++ ret = seccomp_attach_user_filter(filter); ++ if (ret) ++ goto out; ++ break; ++#endif ++ default: ++ goto out; + } + +- out: ++ current->seccomp.mode = seccomp_mode; ++ set_thread_flag(TIF_SECCOMP); ++out: + return ret; + } +diff --git a/kernel/sys.c b/kernel/sys.c +index b82568b..ba0ae8e 100644 +--- a/kernel/sys.c ++++ b/kernel/sys.c +@@ -1908,7 +1908,7 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, + error = prctl_get_seccomp(); + break; + case PR_SET_SECCOMP: +- error = prctl_set_seccomp(arg2); ++ error = prctl_set_seccomp(arg2, (char __user *)arg3); + break; + case PR_GET_TSC: + error = GET_TSC_CTL(arg2); +-- +1.7.9.1 + |