summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorPaul Gortmaker <paul.gortmaker@windriver.com>2012-06-18 17:45:19 -0400
committerPaul Gortmaker <paul.gortmaker@windriver.com>2012-06-18 17:45:19 -0400
commitce99c8289a21f7f8cfeef6f5c05ce8c0e6ae938a (patch)
tree75086bad30f85a7abf70d214936ce0bd05136ab5
parentb7043d8cbab4dcce4448ee8bc3f3fe5f2a8b9857 (diff)
downloadyocto-kernel-cache-ce99c8289a21f7f8cfeef6f5c05ce8c0e6ae938a.tar.gz
yocto-kernel-cache-ce99c8289a21f7f8cfeef6f5c05ce8c0e6ae938a.tar.bz2
yocto-kernel-cache-ce99c8289a21f7f8cfeef6f5c05ce8c0e6ae938a.zip
seccomp: backport of BPF syscall filtering from v3.5
See 00-README file added in this commit for more details on what it is, where it is documented and how it can be tested. Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
-rw-r--r--features/seccomp/00-README114
-rw-r--r--features/seccomp/Add-PR_-GET-SET-_NO_NEW_PRIVS-to-prevent-execve-from.patch223
-rw-r--r--features/seccomp/Documentation-prctl-seccomp_filter.patch1005
-rw-r--r--features/seccomp/Fix-execve-behavior-apparmor-for-PR_-GET-SET-_NO_NEW.patch106
-rw-r--r--features/seccomp/arch-x86-add-syscall_get_arch-to-syscall.h.patch85
-rw-r--r--features/seccomp/asm-syscall.h-add-syscall_get_arch.patch59
-rw-r--r--features/seccomp/net-compat.c-linux-filter.h-share-compat_sock_fprog.patch80
-rw-r--r--features/seccomp/ptrace-seccomp-Add-PTRACE_SECCOMP-support.patch165
-rw-r--r--features/seccomp/seccomp-Add-SECCOMP_RET_TRAP.patch138
-rw-r--r--features/seccomp/seccomp-add-SECCOMP_RET_ERRNO.patch202
-rw-r--r--features/seccomp/seccomp-add-system-call-filtering-using-BPF.patch820
-rw-r--r--features/seccomp/seccomp-kill-the-seccomp_t-typedef.patch88
-rw-r--r--features/seccomp/seccomp-remove-duplicated-failure-logging.patch135
-rw-r--r--features/seccomp/seccomp.scc15
-rw-r--r--features/seccomp/signal-x86-add-SIGSYS-info-and-make-it-synchronous.patch174
-rw-r--r--features/seccomp/sk_run_filter-add-BPF_S_ANC_SECCOMP_LD_W.patch73
-rw-r--r--features/seccomp/x86-Enable-HAVE_ARCH_SECCOMP_FILTER.patch80
-rw-r--r--ktypes/standard/standard-nocfg.scc3
18 files changed, 3565 insertions, 0 deletions
diff --git a/features/seccomp/00-README b/features/seccomp/00-README
new file mode 100644
index 00000000..e14506a3
--- /dev/null
+++ b/features/seccomp/00-README
@@ -0,0 +1,114 @@
+
+This is a backport of the seccomp BPF syscall filtering from v3.5
+
+Quoting from: https://lkml.org/lkml/2012/1/11/260
+
+---------------
+[RFC,PATCH 0/2] dynamic seccomp policies (using BPF filters)
+
+The goal of the patchset is straightforward:
+
+ To provide a means of reducing the kernel attack surface.
+
+In practice, this is done at the primary kernel ABI: system calls.
+Achieving this goal will address the needs expressed by many systems
+projects:
+ qemu/kvm, openssh, vsftpd, lxc, and chromium and chromium os (me).
+
+While system call filtering has been attempted many times, I hope that
+this approach shows more promise. It works as described below and in
+the patch series.
+
+A userland task may call prctl(PR_ATTACH_SECCOMP_FILTER) to attach a
+BPF program to itself. Once attached, all system calls made by the
+task will be evaluated by the BPF program prior to being accepted.
+Evaluation is done by executing the BPF program over the struct
+user_regs_state for the process.
+--------------
+
+The content appears in v3.5 from:
+
+------------
+commit cb60e3e65c1b96a4d6444a7a13dc7dd48bc15a2b
+Merge: 99262a3 ff2bb04
+Author: Linus Torvalds <torvalds@linux-foundation.org>
+Date: Mon May 21 20:27:36 2012 -0700
+
+ Merge branch 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/jmorris/linux-security
+
+ Pull security subsystem updates from James Morris:
+ "New notable features:
+ - The seccomp work from Will Drewry
+ - PR_{GET,SET}_NO_NEW_PRIVS from Andy Lutomirski
+ - Longer security labels for Smack from Casey Schaufler
+ - Additional ptrace restriction modes for Yama by Kees Cook"
+-----------
+
+Here, we take Will's linear block of commits from the above merge, which
+are conveniently all marked with "v18" in the changelog, and the
+one PR_{GET,SET}_NO_NEW_PRIVS commit from Andy (req'd as a dependency).
+
+Documentation:
+==============
+
+See added file: Documentation/prctl/seccomp_filter.txt
+
+
+Testing:
+========
+
+Several samples are added in samples/seccomp -- building is as easy as:
+
+ mkdir ../test
+ make O=../test defconfig
+ make O=../test samples/seccomp/
+
+The bpf-direct is a sample which grabs writes to STDERR, and redirects
+them to STDOUT, with an "[ERR]" prefix. Consider the core of the program:
+
+---------------------
+ syscall(__NR_write, STDOUT_FILENO,
+ payload("OHAI! WHAT IS YOUR NAME? "));
+ bytes = syscall(__NR_read, STDIN_FILENO, buf, sizeof(buf));
+ syscall(__NR_write, STDOUT_FILENO, payload("HELLO, "));
+ syscall(__NR_write, STDOUT_FILENO, buf, bytes);
+ syscall(__NR_write, STDERR_FILENO,
+ payload("Error message going to STDERR\n"));
+---------------------
+
+Running this core on a non-seccomp kernel (i.e. by copying the above core
+to "foo.c") we can see with redirection, that the sample Error message
+goes to STDERR; i.e.
+
+-----------
+~$./foo
+OHAI! WHAT IS YOUR NAME? sdfsdf
+HELLO, sdfsdf
+Error message going to STDERR
+~$./foo 2> /dev/null
+OHAI! WHAT IS YOUR NAME? sdfs
+HELLO, sdfs
+~$
+------------
+
+Note that in the 2nd instance, the error message disappears into /dev/null.
+
+Now consider the seccomp enabled case, using the same redirect:
+
+------------
+$ ./bpf-direct
+OHAI! WHAT IS YOUR NAME? sdfsd
+HELLO, sdfsd
+[ERR] Error message going to STDERR
+$ ./bpf-direct 2>/dev/null
+OHAI! WHAT IS YOUR NAME? sdfsdf
+HELLO, sdfsdf
+[ERR] Error message going to STDERR
+$
+------------
+
+There are two things to see in the above.
+ 1) We see the [ERR] prefix that is clearly from the emulator()
+ function we've installed on the __NR_write syscall, and
+ 2) Even when we redirect STDERR to /dev/null, we still see the
+ message, which confirms it was put on STDOUT instead.
diff --git a/features/seccomp/Add-PR_-GET-SET-_NO_NEW_PRIVS-to-prevent-execve-from.patch b/features/seccomp/Add-PR_-GET-SET-_NO_NEW_PRIVS-to-prevent-execve-from.patch
new file mode 100644
index 00000000..a14c20d7
--- /dev/null
+++ b/features/seccomp/Add-PR_-GET-SET-_NO_NEW_PRIVS-to-prevent-execve-from.patch
@@ -0,0 +1,223 @@
+From 45fe2238b82776f1bef0a0eb1082ae8abc97e6a0 Mon Sep 17 00:00:00 2001
+From: Andy Lutomirski <luto@amacapital.net>
+Date: Thu, 12 Apr 2012 16:47:50 -0500
+Subject: [PATCH] Add PR_{GET,SET}_NO_NEW_PRIVS to prevent execve from
+ granting privs
+
+commit 259e5e6c75a910f3b5e656151dc602f53f9d7548 upstream.
+
+With this change, calling
+ prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)
+disables privilege granting operations at execve-time. For example, a
+process will not be able to execute a setuid binary to change their uid
+or gid if this bit is set. The same is true for file capabilities.
+
+Additionally, LSM_UNSAFE_NO_NEW_PRIVS is defined to ensure that
+LSMs respect the requested behavior.
+
+To determine if the NO_NEW_PRIVS bit is set, a task may call
+ prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0);
+It returns 1 if set and 0 if it is not set. If any of the arguments are
+non-zero, it will return -1 and set errno to EINVAL.
+(PR_SET_NO_NEW_PRIVS behaves similarly.)
+
+This functionality is desired for the proposed seccomp filter patch
+series. By using PR_SET_NO_NEW_PRIVS, it allows a task to modify the
+system call behavior for itself and its child tasks without being
+able to impact the behavior of a more privileged task.
+
+Another potential use is making certain privileged operations
+unprivileged. For example, chroot may be considered "safe" if it cannot
+affect privileged tasks.
+
+Note, this patch causes execve to fail when PR_SET_NO_NEW_PRIVS is
+set and AppArmor is in use. It is fixed in a subsequent patch.
+
+Signed-off-by: Andy Lutomirski <luto@amacapital.net>
+Signed-off-by: Will Drewry <wad@chromium.org>
+Acked-by: Eric Paris <eparis@redhat.com>
+Acked-by: Kees Cook <keescook@chromium.org>
+
+v18: updated change desc
+v17: using new define values as per 3.4
+Signed-off-by: James Morris <james.l.morris@oracle.com>
+Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
+---
+ fs/exec.c | 10 +++++++++-
+ include/linux/prctl.h | 15 +++++++++++++++
+ include/linux/sched.h | 2 ++
+ include/linux/security.h | 1 +
+ kernel/sys.c | 10 ++++++++++
+ security/apparmor/domain.c | 4 ++++
+ security/commoncap.c | 7 +++++--
+ security/selinux/hooks.c | 10 +++++++++-
+ 8 files changed, 55 insertions(+), 4 deletions(-)
+
+diff --git a/fs/exec.c b/fs/exec.c
+index b1fd202..d038968 100644
+--- a/fs/exec.c
++++ b/fs/exec.c
+@@ -1245,6 +1245,13 @@ static int check_unsafe_exec(struct linux_binprm *bprm)
+ bprm->unsafe |= LSM_UNSAFE_PTRACE;
+ }
+
++ /*
++ * This isn't strictly necessary, but it makes it harder for LSMs to
++ * mess up.
++ */
++ if (current->no_new_privs)
++ bprm->unsafe |= LSM_UNSAFE_NO_NEW_PRIVS;
++
+ n_fs = 1;
+ spin_lock(&p->fs->lock);
+ rcu_read_lock();
+@@ -1288,7 +1295,8 @@ int prepare_binprm(struct linux_binprm *bprm)
+ bprm->cred->euid = current_euid();
+ bprm->cred->egid = current_egid();
+
+- if (!(bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID)) {
++ if (!(bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID) &&
++ !current->no_new_privs) {
+ /* Set-uid? */
+ if (mode & S_ISUID) {
+ bprm->per_clear |= PER_CLEAR_ON_SETID;
+diff --git a/include/linux/prctl.h b/include/linux/prctl.h
+index e0cfec2..78b76e2 100644
+--- a/include/linux/prctl.h
++++ b/include/linux/prctl.h
+@@ -124,4 +124,19 @@
+ #define PR_SET_CHILD_SUBREAPER 36
+ #define PR_GET_CHILD_SUBREAPER 37
+
++/*
++ * If no_new_privs is set, then operations that grant new privileges (i.e.
++ * execve) will either fail or not grant them. This affects suid/sgid,
++ * file capabilities, and LSMs.
++ *
++ * Operations that merely manipulate or drop existing privileges (setresuid,
++ * capset, etc.) will still work. Drop those privileges if you want them gone.
++ *
++ * Changing LSM security domain is considered a new privilege. So, for example,
++ * asking selinux for a specific new context (e.g. with runcon) will result
++ * in execve returning -EPERM.
++ */
++#define PR_SET_NO_NEW_PRIVS 38
++#define PR_GET_NO_NEW_PRIVS 39
++
+ #endif /* _LINUX_PRCTL_H */
+diff --git a/include/linux/sched.h b/include/linux/sched.h
+index 81a173c..ba60897 100644
+--- a/include/linux/sched.h
++++ b/include/linux/sched.h
+@@ -1341,6 +1341,8 @@ struct task_struct {
+ * execve */
+ unsigned in_iowait:1;
+
++ /* task may not gain privileges */
++ unsigned no_new_privs:1;
+
+ /* Revert to default priority/policy when forking */
+ unsigned sched_reset_on_fork:1;
+diff --git a/include/linux/security.h b/include/linux/security.h
+index 673afbb..6e1dea9 100644
+--- a/include/linux/security.h
++++ b/include/linux/security.h
+@@ -144,6 +144,7 @@ struct request_sock;
+ #define LSM_UNSAFE_SHARE 1
+ #define LSM_UNSAFE_PTRACE 2
+ #define LSM_UNSAFE_PTRACE_CAP 4
++#define LSM_UNSAFE_NO_NEW_PRIVS 8
+
+ #ifdef CONFIG_MMU
+ extern int mmap_min_addr_handler(struct ctl_table *table, int write,
+diff --git a/kernel/sys.c b/kernel/sys.c
+index e7006eb..b82568b 100644
+--- a/kernel/sys.c
++++ b/kernel/sys.c
+@@ -1979,6 +1979,16 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
+ error = put_user(me->signal->is_child_subreaper,
+ (int __user *) arg2);
+ break;
++ case PR_SET_NO_NEW_PRIVS:
++ if (arg2 != 1 || arg3 || arg4 || arg5)
++ return -EINVAL;
++
++ current->no_new_privs = 1;
++ break;
++ case PR_GET_NO_NEW_PRIVS:
++ if (arg2 || arg3 || arg4 || arg5)
++ return -EINVAL;
++ return current->no_new_privs ? 1 : 0;
+ default:
+ error = -EINVAL;
+ break;
+diff --git a/security/apparmor/domain.c b/security/apparmor/domain.c
+index 6327685..18c88d0 100644
+--- a/security/apparmor/domain.c
++++ b/security/apparmor/domain.c
+@@ -360,6 +360,10 @@ int apparmor_bprm_set_creds(struct linux_binprm *bprm)
+ if (bprm->cred_prepared)
+ return 0;
+
++ /* XXX: no_new_privs is not usable with AppArmor yet */
++ if (bprm->unsafe & LSM_UNSAFE_NO_NEW_PRIVS)
++ return -EPERM;
++
+ cxt = bprm->cred->security;
+ BUG_ON(!cxt);
+
+diff --git a/security/commoncap.c b/security/commoncap.c
+index 71a166a..f80d116 100644
+--- a/security/commoncap.c
++++ b/security/commoncap.c
+@@ -512,14 +512,17 @@ skip:
+
+
+ /* Don't let someone trace a set[ug]id/setpcap binary with the revised
+- * credentials unless they have the appropriate permit
++ * credentials unless they have the appropriate permit.
++ *
++ * In addition, if NO_NEW_PRIVS, then ensure we get no new privs.
+ */
+ if ((new->euid != old->uid ||
+ new->egid != old->gid ||
+ !cap_issubset(new->cap_permitted, old->cap_permitted)) &&
+ bprm->unsafe & ~LSM_UNSAFE_PTRACE_CAP) {
+ /* downgrade; they get no more than they had, and maybe less */
+- if (!capable(CAP_SETUID)) {
++ if (!capable(CAP_SETUID) ||
++ (bprm->unsafe & LSM_UNSAFE_NO_NEW_PRIVS)) {
+ new->euid = new->uid;
+ new->egid = new->gid;
+ }
+diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
+index d85b793..0b06685 100644
+--- a/security/selinux/hooks.c
++++ b/security/selinux/hooks.c
+@@ -2016,6 +2016,13 @@ static int selinux_bprm_set_creds(struct linux_binprm *bprm)
+ new_tsec->sid = old_tsec->exec_sid;
+ /* Reset exec SID on execve. */
+ new_tsec->exec_sid = 0;
++
++ /*
++ * Minimize confusion: if no_new_privs and a transition is
++ * explicitly requested, then fail the exec.
++ */
++ if (bprm->unsafe & LSM_UNSAFE_NO_NEW_PRIVS)
++ return -EPERM;
+ } else {
+ /* Check for a default transition on this program. */
+ rc = security_transition_sid(old_tsec->sid, isec->sid,
+@@ -2029,7 +2036,8 @@ static int selinux_bprm_set_creds(struct linux_binprm *bprm)
+ ad.selinux_audit_data = &sad;
+ ad.u.path = bprm->file->f_path;
+
+- if (bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID)
++ if ((bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID) ||
++ (bprm->unsafe & LSM_UNSAFE_NO_NEW_PRIVS))
+ new_tsec->sid = old_tsec->sid;
+
+ if (new_tsec->sid == old_tsec->sid) {
+--
+1.7.9.1
+
diff --git a/features/seccomp/Documentation-prctl-seccomp_filter.patch b/features/seccomp/Documentation-prctl-seccomp_filter.patch
new file mode 100644
index 00000000..9f431fb6
--- /dev/null
+++ b/features/seccomp/Documentation-prctl-seccomp_filter.patch
@@ -0,0 +1,1005 @@
+From c837aebb90de91991e51e55cfddf43b6c16da61e Mon Sep 17 00:00:00 2001
+From: Will Drewry <wad@chromium.org>
+Date: Thu, 12 Apr 2012 16:48:04 -0500
+Subject: [PATCH] Documentation: prctl/seccomp_filter
+
+commit 8ac270d1e29f0428228ab2b9a8ae5e1ed4a5cd84 upstream.
+
+Documents how system call filtering using Berkeley Packet
+Filter programs works and how it may be used.
+Includes an example for x86 and a semi-generic
+example using a macro-based code generator.
+
+Acked-by: Eric Paris <eparis@redhat.com>
+Signed-off-by: Will Drewry <wad@chromium.org>
+Acked-by: Kees Cook <keescook@chromium.org>
+
+v18: - added acked by
+ - update no new privs numbers
+v17: - remove @compat note and add Pitfalls section for arch checking
+ (keescook@chromium.org)
+v16: -
+v15: -
+v14: - rebase/nochanges
+v13: - rebase on to 88ebdda6159ffc15699f204c33feb3e431bf9bdc
+v12: - comment on the ptrace_event use
+ - update arch support comment
+ - note the behavior of SECCOMP_RET_DATA when there are multiple filters
+ (keescook@chromium.org)
+ - lots of samples/ clean up incl 64-bit bpf-direct support
+ (markus@chromium.org)
+ - rebase to linux-next
+v11: - overhaul return value language, updates (keescook@chromium.org)
+ - comment on do_exit(SIGSYS)
+v10: - update for SIGSYS
+ - update for new seccomp_data layout
+ - update for ptrace option use
+v9: - updated bpf-direct.c for SIGILL
+v8: - add PR_SET_NO_NEW_PRIVS to the samples.
+v7: - updated for all the new stuff in v7: TRAP, TRACE
+ - only talk about PR_SET_SECCOMP now
+ - fixed bad JLE32 check (coreyb@linux.vnet.ibm.com)
+ - adds dropper.c: a simple system call disabler
+v6: - tweak the language to note the requirement of
+ PR_SET_NO_NEW_PRIVS being called prior to use. (luto@mit.edu)
+v5: - update sample to use system call arguments
+ - adds a "fancy" example using a macro-based generator
+ - cleaned up bpf in the sample
+ - update docs to mention arguments
+ - fix prctl value (eparis@redhat.com)
+ - language cleanup (rdunlap@xenotime.net)
+v4: - update for no_new_privs use
+ - minor tweaks
+v3: - call out BPF <-> Berkeley Packet Filter (rdunlap@xenotime.net)
+ - document use of tentative always-unprivileged
+ - guard sample compilation for i386 and x86_64
+v2: - move code to samples (corbet@lwn.net)
+Signed-off-by: James Morris <james.l.morris@oracle.com>
+Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
+---
+ Documentation/prctl/seccomp_filter.txt | 163 ++++++++++++++++++++++
+ samples/Makefile | 2 +-
+ samples/seccomp/Makefile | 38 +++++
+ samples/seccomp/bpf-direct.c | 176 +++++++++++++++++++++++
+ samples/seccomp/bpf-fancy.c | 102 ++++++++++++++
+ samples/seccomp/bpf-helper.c | 89 ++++++++++++
+ samples/seccomp/bpf-helper.h | 238 ++++++++++++++++++++++++++++++++
+ samples/seccomp/dropper.c | 68 +++++++++
+ 8 files changed, 875 insertions(+), 1 deletions(-)
+ create mode 100644 Documentation/prctl/seccomp_filter.txt
+ create mode 100644 samples/seccomp/Makefile
+ create mode 100644 samples/seccomp/bpf-direct.c
+ create mode 100644 samples/seccomp/bpf-fancy.c
+ create mode 100644 samples/seccomp/bpf-helper.c
+ create mode 100644 samples/seccomp/bpf-helper.h
+ create mode 100644 samples/seccomp/dropper.c
+
+diff --git a/Documentation/prctl/seccomp_filter.txt b/Documentation/prctl/seccomp_filter.txt
+new file mode 100644
+index 0000000..597c3c5
+--- /dev/null
++++ b/Documentation/prctl/seccomp_filter.txt
+@@ -0,0 +1,163 @@
++ SECure COMPuting with filters
++ =============================
++
++Introduction
++------------
++
++A large number of system calls are exposed to every userland process
++with many of them going unused for the entire lifetime of the process.
++As system calls change and mature, bugs are found and eradicated. A
++certain subset of userland applications benefit by having a reduced set
++of available system calls. The resulting set reduces the total kernel
++surface exposed to the application. System call filtering is meant for
++use with those applications.
++
++Seccomp filtering provides a means for a process to specify a filter for
++incoming system calls. The filter is expressed as a Berkeley Packet
++Filter (BPF) program, as with socket filters, except that the data
++operated on is related to the system call being made: system call
++number and the system call arguments. This allows for expressive
++filtering of system calls using a filter program language with a long
++history of being exposed to userland and a straightforward data set.
++
++Additionally, BPF makes it impossible for users of seccomp to fall prey
++to time-of-check-time-of-use (TOCTOU) attacks that are common in system
++call interposition frameworks. BPF programs may not dereference
++pointers which constrains all filters to solely evaluating the system
++call arguments directly.
++
++What it isn't
++-------------
++
++System call filtering isn't a sandbox. It provides a clearly defined
++mechanism for minimizing the exposed kernel surface. It is meant to be
++a tool for sandbox developers to use. Beyond that, policy for logical
++behavior and information flow should be managed with a combination of
++other system hardening techniques and, potentially, an LSM of your
++choosing. Expressive, dynamic filters provide further options down this
++path (avoiding pathological sizes or selecting which of the multiplexed
++system calls in socketcall() is allowed, for instance) which could be
++construed, incorrectly, as a more complete sandboxing solution.
++
++Usage
++-----
++
++An additional seccomp mode is added and is enabled using the same
++prctl(2) call as the strict seccomp. If the architecture has
++CONFIG_HAVE_ARCH_SECCOMP_FILTER, then filters may be added as below:
++
++PR_SET_SECCOMP:
++ Now takes an additional argument which specifies a new filter
++ using a BPF program.
++ The BPF program will be executed over struct seccomp_data
++ reflecting the system call number, arguments, and other
++ metadata. The BPF program must then return one of the
++ acceptable values to inform the kernel which action should be
++ taken.
++
++ Usage:
++ prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, prog);
++
++ The 'prog' argument is a pointer to a struct sock_fprog which
++ will contain the filter program. If the program is invalid, the
++ call will return -1 and set errno to EINVAL.
++
++ If fork/clone and execve are allowed by @prog, any child
++ processes will be constrained to the same filters and system
++ call ABI as the parent.
++
++ Prior to use, the task must call prctl(PR_SET_NO_NEW_PRIVS, 1) or
++ run with CAP_SYS_ADMIN privileges in its namespace. If these are not
++ true, -EACCES will be returned. This requirement ensures that filter
++ programs cannot be applied to child processes with greater privileges
++ than the task that installed them.
++
++ Additionally, if prctl(2) is allowed by the attached filter,
++ additional filters may be layered on which will increase evaluation
++ time, but allow for further decreasing the attack surface during
++ execution of a process.
++
++The above call returns 0 on success and non-zero on error.
++
++Return values
++-------------
++A seccomp filter may return any of the following values. If multiple
++filters exist, the return value for the evaluation of a given system
++call will always use the highest precedence value. (For example,
++SECCOMP_RET_KILL will always take precedence.)
++
++In precedence order, they are:
++
++SECCOMP_RET_KILL:
++ Results in the task exiting immediately without executing the
++ system call. The exit status of the task (status & 0x7f) will
++ be SIGSYS, not SIGKILL.
++
++SECCOMP_RET_TRAP:
++ Results in the kernel sending a SIGSYS signal to the triggering
++ task without executing the system call. The kernel will
++ rollback the register state to just before the system call
++ entry such that a signal handler in the task will be able to
++ inspect the ucontext_t->uc_mcontext registers and emulate
++ system call success or failure upon return from the signal
++ handler.
++
++ The SECCOMP_RET_DATA portion of the return value will be passed
++ as si_errno.
++
++ SIGSYS triggered by seccomp will have a si_code of SYS_SECCOMP.
++
++SECCOMP_RET_ERRNO:
++ Results in the lower 16-bits of the return value being passed
++ to userland as the errno without executing the system call.
++
++SECCOMP_RET_TRACE:
++ When returned, this value will cause the kernel to attempt to
++ notify a ptrace()-based tracer prior to executing the system
++ call. If there is no tracer present, -ENOSYS is returned to
++ userland and the system call is not executed.
++
++ A tracer will be notified if it requests PTRACE_O_TRACESECCOMP
++ using ptrace(PTRACE_SETOPTIONS). The tracer will be notified
++ of a PTRACE_EVENT_SECCOMP and the SECCOMP_RET_DATA portion of
++ the BPF program return value will be available to the tracer
++ via PTRACE_GETEVENTMSG.
++
++SECCOMP_RET_ALLOW:
++ Results in the system call being executed.
++
++If multiple filters exist, the return value for the evaluation of a
++given system call will always use the highest precedence value.
++
++Precedence is only determined using the SECCOMP_RET_ACTION mask. When
++multiple filters return values of the same precedence, only the
++SECCOMP_RET_DATA from the most recently installed filter will be
++returned.
++
++Pitfalls
++--------
++
++The biggest pitfall to avoid during use is filtering on system call
++number without checking the architecture value. Why? On any
++architecture that supports multiple system call invocation conventions,
++the system call numbers may vary based on the specific invocation. If
++the numbers in the different calling conventions overlap, then checks in
++the filters may be abused. Always check the arch value!
++
++Example
++-------
++
++The samples/seccomp/ directory contains both an x86-specific example
++and a more generic example of a higher level macro interface for BPF
++program generation.
++
++
++
++Adding architecture support
++---------------------------
++
++See arch/Kconfig for the authoritative requirements. In general, if an
++architecture supports both ptrace_event and seccomp, it will be able to
++support seccomp filter with minor fixup: SIGSYS support and seccomp return
++value checking. Then it must just add CONFIG_HAVE_ARCH_SECCOMP_FILTER
++to its arch-specific Kconfig.
+diff --git a/samples/Makefile b/samples/Makefile
+index 2f75851..5ef08bb 100644
+--- a/samples/Makefile
++++ b/samples/Makefile
+@@ -1,4 +1,4 @@
+ # Makefile for Linux samples code
+
+ obj-$(CONFIG_SAMPLES) += kobject/ kprobes/ tracepoints/ trace_events/ \
+- hw_breakpoint/ kfifo/ kdb/ hidraw/ rpmsg/
++ hw_breakpoint/ kfifo/ kdb/ hidraw/ rpmsg/ seccomp/
+diff --git a/samples/seccomp/Makefile b/samples/seccomp/Makefile
+new file mode 100644
+index 0000000..e8fe0f5
+--- /dev/null
++++ b/samples/seccomp/Makefile
+@@ -0,0 +1,38 @@
++# kbuild trick to avoid linker error. Can be omitted if a module is built.
++obj- := dummy.o
++
++hostprogs-$(CONFIG_SECCOMP) := bpf-fancy dropper
++bpf-fancy-objs := bpf-fancy.o bpf-helper.o
++
++HOSTCFLAGS_bpf-fancy.o += -I$(objtree)/usr/include
++HOSTCFLAGS_bpf-fancy.o += -idirafter $(objtree)/include
++HOSTCFLAGS_bpf-helper.o += -I$(objtree)/usr/include
++HOSTCFLAGS_bpf-helper.o += -idirafter $(objtree)/include
++
++HOSTCFLAGS_dropper.o += -I$(objtree)/usr/include
++HOSTCFLAGS_dropper.o += -idirafter $(objtree)/include
++dropper-objs := dropper.o
++
++# bpf-direct.c is x86-only.
++ifeq ($(SRCARCH),x86)
++# List of programs to build
++hostprogs-$(CONFIG_SECCOMP) += bpf-direct
++bpf-direct-objs := bpf-direct.o
++endif
++
++HOSTCFLAGS_bpf-direct.o += -I$(objtree)/usr/include
++HOSTCFLAGS_bpf-direct.o += -idirafter $(objtree)/include
++
++# Try to match the kernel target.
++ifeq ($(CONFIG_64BIT),)
++HOSTCFLAGS_bpf-direct.o += -m32
++HOSTCFLAGS_dropper.o += -m32
++HOSTCFLAGS_bpf-helper.o += -m32
++HOSTCFLAGS_bpf-fancy.o += -m32
++HOSTLOADLIBES_bpf-direct += -m32
++HOSTLOADLIBES_bpf-fancy += -m32
++HOSTLOADLIBES_dropper += -m32
++endif
++
++# Tell kbuild to always build the programs
++always := $(hostprogs-y)
+diff --git a/samples/seccomp/bpf-direct.c b/samples/seccomp/bpf-direct.c
+new file mode 100644
+index 0000000..26f523e
+--- /dev/null
++++ b/samples/seccomp/bpf-direct.c
+@@ -0,0 +1,176 @@
++/*
++ * Seccomp filter example for x86 (32-bit and 64-bit) with BPF macros
++ *
++ * Copyright (c) 2012 The Chromium OS Authors <chromium-os-dev@chromium.org>
++ * Author: Will Drewry <wad@chromium.org>
++ *
++ * The code may be used by anyone for any purpose,
++ * and can serve as a starting point for developing
++ * applications using prctl(PR_SET_SECCOMP, 2, ...).
++ */
++#define __USE_GNU 1
++#define _GNU_SOURCE 1
++
++#include <linux/types.h>
++#include <linux/filter.h>
++#include <linux/seccomp.h>
++#include <linux/unistd.h>
++#include <signal.h>
++#include <stdio.h>
++#include <stddef.h>
++#include <string.h>
++#include <sys/prctl.h>
++#include <unistd.h>
++
++#define syscall_arg(_n) (offsetof(struct seccomp_data, args[_n]))
++#define syscall_nr (offsetof(struct seccomp_data, nr))
++
++#if defined(__i386__)
++#define REG_RESULT REG_EAX
++#define REG_SYSCALL REG_EAX
++#define REG_ARG0 REG_EBX
++#define REG_ARG1 REG_ECX
++#define REG_ARG2 REG_EDX
++#define REG_ARG3 REG_ESI
++#define REG_ARG4 REG_EDI
++#define REG_ARG5 REG_EBP
++#elif defined(__x86_64__)
++#define REG_RESULT REG_RAX
++#define REG_SYSCALL REG_RAX
++#define REG_ARG0 REG_RDI
++#define REG_ARG1 REG_RSI
++#define REG_ARG2 REG_RDX
++#define REG_ARG3 REG_R10
++#define REG_ARG4 REG_R8
++#define REG_ARG5 REG_R9
++#else
++#error Unsupported platform
++#endif
++
++#ifndef PR_SET_NO_NEW_PRIVS
++#define PR_SET_NO_NEW_PRIVS 38
++#endif
++
++#ifndef SYS_SECCOMP
++#define SYS_SECCOMP 1
++#endif
++
++static void emulator(int nr, siginfo_t *info, void *void_context)
++{
++ ucontext_t *ctx = (ucontext_t *)(void_context);
++ int syscall;
++ char *buf;
++ ssize_t bytes;
++ size_t len;
++ if (info->si_code != SYS_SECCOMP)
++ return;
++ if (!ctx)
++ return;
++ syscall = ctx->uc_mcontext.gregs[REG_SYSCALL];
++ buf = (char *) ctx->uc_mcontext.gregs[REG_ARG1];
++ len = (size_t) ctx->uc_mcontext.gregs[REG_ARG2];
++
++ if (syscall != __NR_write)
++ return;
++ if (ctx->uc_mcontext.gregs[REG_ARG0] != STDERR_FILENO)
++ return;
++ /* Redirect stderr messages to stdout. Doesn't handle EINTR, etc */
++ ctx->uc_mcontext.gregs[REG_RESULT] = -1;
++ if (write(STDOUT_FILENO, "[ERR] ", 6) > 0) {
++ bytes = write(STDOUT_FILENO, buf, len);
++ ctx->uc_mcontext.gregs[REG_RESULT] = bytes;
++ }
++ return;
++}
++
++static int install_emulator(void)
++{
++ struct sigaction act;
++ sigset_t mask;
++ memset(&act, 0, sizeof(act));
++ sigemptyset(&mask);
++ sigaddset(&mask, SIGSYS);
++
++ act.sa_sigaction = &emulator;
++ act.sa_flags = SA_SIGINFO;
++ if (sigaction(SIGSYS, &act, NULL) < 0) {
++ perror("sigaction");
++ return -1;
++ }
++ if (sigprocmask(SIG_UNBLOCK, &mask, NULL)) {
++ perror("sigprocmask");
++ return -1;
++ }
++ return 0;
++}
++
++static int install_filter(void)
++{
++ struct sock_filter filter[] = {
++ /* Grab the system call number */
++ BPF_STMT(BPF_LD+BPF_W+BPF_ABS, syscall_nr),
++ /* Jump table for the allowed syscalls */
++ BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, __NR_rt_sigreturn, 0, 1),
++ BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW),
++#ifdef __NR_sigreturn
++ BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, __NR_sigreturn, 0, 1),
++ BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW),
++#endif
++ BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, __NR_exit_group, 0, 1),
++ BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW),
++ BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, __NR_exit, 0, 1),
++ BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW),
++ BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, __NR_read, 1, 0),
++ BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, __NR_write, 3, 2),
++
++ /* Check that read is only using stdin. */
++ BPF_STMT(BPF_LD+BPF_W+BPF_ABS, syscall_arg(0)),
++ BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, STDIN_FILENO, 4, 0),
++ BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_KILL),
++
++ /* Check that write is only using stdout */
++ BPF_STMT(BPF_LD+BPF_W+BPF_ABS, syscall_arg(0)),
++ BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, STDOUT_FILENO, 1, 0),
++ /* Trap attempts to write to stderr */
++ BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, STDERR_FILENO, 1, 2),
++
++ BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW),
++ BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_TRAP),
++ BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_KILL),
++ };
++ struct sock_fprog prog = {
++ .len = (unsigned short)(sizeof(filter)/sizeof(filter[0])),
++ .filter = filter,
++ };
++
++ if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
++ perror("prctl(NO_NEW_PRIVS)");
++ return 1;
++ }
++
++
++ if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog)) {
++ perror("prctl");
++ return 1;
++ }
++ return 0;
++}
++
++#define payload(_c) (_c), sizeof((_c))
++int main(int argc, char **argv)
++{
++ char buf[4096];
++ ssize_t bytes = 0;
++ if (install_emulator())
++ return 1;
++ if (install_filter())
++ return 1;
++ syscall(__NR_write, STDOUT_FILENO,
++ payload("OHAI! WHAT IS YOUR NAME? "));
++ bytes = syscall(__NR_read, STDIN_FILENO, buf, sizeof(buf));
++ syscall(__NR_write, STDOUT_FILENO, payload("HELLO, "));
++ syscall(__NR_write, STDOUT_FILENO, buf, bytes);
++ syscall(__NR_write, STDERR_FILENO,
++ payload("Error message going to STDERR\n"));
++ return 0;
++}
+diff --git a/samples/seccomp/bpf-fancy.c b/samples/seccomp/bpf-fancy.c
+new file mode 100644
+index 0000000..8eb483a
+--- /dev/null
++++ b/samples/seccomp/bpf-fancy.c
+@@ -0,0 +1,102 @@
++/*
++ * Seccomp BPF example using a macro-based generator.
++ *
++ * Copyright (c) 2012 The Chromium OS Authors <chromium-os-dev@chromium.org>
++ * Author: Will Drewry <wad@chromium.org>
++ *
++ * The code may be used by anyone for any purpose,
++ * and can serve as a starting point for developing
++ * applications using prctl(PR_SET_SECCOMP, 2, ...).
++ */
++
++#include <linux/filter.h>
++#include <linux/seccomp.h>
++#include <linux/unistd.h>
++#include <stdio.h>
++#include <string.h>
++#include <sys/prctl.h>
++#include <unistd.h>
++
++#include "bpf-helper.h"
++
++#ifndef PR_SET_NO_NEW_PRIVS
++#define PR_SET_NO_NEW_PRIVS 38
++#endif
++
++int main(int argc, char **argv)
++{
++ struct bpf_labels l = { .count = 0 }; /* label table must start empty */
++ static const char msg1[] = "Please type something: ";
++ static const char msg2[] = "You typed: ";
++ char buf[256];
++ struct sock_filter filter[] = {
++ /* TODO: LOAD_SYSCALL_NR(arch) and enforce an arch */
++ LOAD_SYSCALL_NR,
++ SYSCALL(__NR_exit, ALLOW),
++ SYSCALL(__NR_exit_group, ALLOW),
++ SYSCALL(__NR_write, JUMP(&l, write_fd)),
++ SYSCALL(__NR_read, JUMP(&l, read)),
++ DENY, /* Don't passthrough into a label */
++
++ LABEL(&l, read),
++ ARG(0),
++ JNE(STDIN_FILENO, DENY),
++ ARG(1),
++ JNE((unsigned long)buf, DENY),
++ ARG(2),
++ JGE(sizeof(buf), DENY),
++ ALLOW,
++
++ LABEL(&l, write_fd),
++ ARG(0),
++ JEQ(STDOUT_FILENO, JUMP(&l, write_buf)),
++ JEQ(STDERR_FILENO, JUMP(&l, write_buf)),
++ DENY,
++
++ LABEL(&l, write_buf),
++ ARG(1),
++ JEQ((unsigned long)msg1, JUMP(&l, msg1_len)),
++ JEQ((unsigned long)msg2, JUMP(&l, msg2_len)),
++ JEQ((unsigned long)buf, JUMP(&l, buf_len)),
++ DENY,
++
++ LABEL(&l, msg1_len),
++ ARG(2),
++ JLT(sizeof(msg1), ALLOW),
++ DENY,
++
++ LABEL(&l, msg2_len),
++ ARG(2),
++ JLT(sizeof(msg2), ALLOW),
++ DENY,
++
++ LABEL(&l, buf_len),
++ ARG(2),
++ JLT(sizeof(buf), ALLOW),
++ DENY,
++ };
++ struct sock_fprog prog = {
++ .filter = filter,
++ .len = (unsigned short)(sizeof(filter)/sizeof(filter[0])),
++ };
++ ssize_t bytes;
++ bpf_resolve_jumps(&l, filter, sizeof(filter)/sizeof(*filter));
++
++ if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
++ perror("prctl(NO_NEW_PRIVS)");
++ return 1;
++ }
++
++ if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog)) {
++ perror("prctl(SECCOMP)");
++ return 1;
++ }
++ syscall(__NR_write, STDOUT_FILENO, msg1, strlen(msg1));
++ bytes = syscall(__NR_read, STDIN_FILENO, buf, sizeof(buf)-1);
++ bytes = (bytes > 0 ? bytes : 0);
++ syscall(__NR_write, STDERR_FILENO, msg2, strlen(msg2));
++ syscall(__NR_write, STDERR_FILENO, buf, bytes);
++ /* Now get killed */
++ syscall(__NR_write, STDERR_FILENO, msg2, strlen(msg2)+2);
++ return 0;
++}
+diff --git a/samples/seccomp/bpf-helper.c b/samples/seccomp/bpf-helper.c
+new file mode 100644
+index 0000000..579cfe3
+--- /dev/null
++++ b/samples/seccomp/bpf-helper.c
+@@ -0,0 +1,89 @@
++/*
++ * Seccomp BPF helper functions
++ *
++ * Copyright (c) 2012 The Chromium OS Authors <chromium-os-dev@chromium.org>
++ * Author: Will Drewry <wad@chromium.org>
++ *
++ * The code may be used by anyone for any purpose,
++ * and can serve as a starting point for developing
++ * applications using prctl(PR_SET_SECCOMP, 2, ...).
++ */
++
++#include <stdio.h>
++#include <string.h>
++
++#include "bpf-helper.h"
++
++int bpf_resolve_jumps(struct bpf_labels *labels,
++ struct sock_filter *filter, size_t count)
++{
++ struct sock_filter *begin = filter;
++ size_t insn = count - 1; /* full-width index: count may exceed 256 */
++
++ if (count < 1)
++ return -1;
++ /*
++ * Walk it once, backwards, to build the label table and do fixups.
++ * Since backward jumps are disallowed by BPF, this is easy.
++ */
++ filter += insn;
++ for (; filter >= begin; --insn, --filter) {
++ if (filter->code != (BPF_JMP+BPF_JA))
++ continue;
++ switch ((filter->jt<<8)|filter->jf) {
++ case (JUMP_JT<<8)|JUMP_JF:
++ if (labels->labels[filter->k].location == 0xffffffff) {
++ fprintf(stderr, "Unresolved label: '%s'\n",
++ labels->labels[filter->k].label);
++ return 1;
++ }
++ filter->k = labels->labels[filter->k].location -
++ (insn + 1);
++ filter->jt = 0;
++ filter->jf = 0;
++ continue;
++ case (LABEL_JT<<8)|LABEL_JF:
++ if (labels->labels[filter->k].location != 0xffffffff) {
++ fprintf(stderr, "Duplicate label use: '%s'\n",
++ labels->labels[filter->k].label);
++ return 1;
++ }
++ labels->labels[filter->k].location = insn;
++ filter->k = 0; /* fall through */
++ filter->jt = 0;
++ filter->jf = 0;
++ continue;
++ }
++ }
++ return 0;
++}
++
++/* Simple lookup table for labels. */
++__u32 seccomp_bpf_label(struct bpf_labels *labels, const char *label)
++{
++ struct __bpf_label *begin = labels->labels, *end;
++ int id;
++ if (labels->count == 0) {
++ begin->label = label;
++ begin->location = 0xffffffff;
++ labels->count++;
++ return 0;
++ }
++ end = begin + labels->count;
++ for (id = 0; begin < end; ++begin, ++id) {
++ if (!strcmp(label, begin->label))
++ return id;
++ }
++ begin->label = label;
++ begin->location = 0xffffffff;
++ labels->count++;
++ return id;
++}
++
++void seccomp_bpf_print(struct sock_filter *filter, size_t count)
++{
++ struct sock_filter *end = filter + count;
++ for ( ; filter < end; ++filter)
++ printf("{ code=%u,jt=%u,jf=%u,k=%u },\n",
++ filter->code, filter->jt, filter->jf, filter->k);
++}
+diff --git a/samples/seccomp/bpf-helper.h b/samples/seccomp/bpf-helper.h
+new file mode 100644
+index 0000000..643279d
+--- /dev/null
++++ b/samples/seccomp/bpf-helper.h
+@@ -0,0 +1,238 @@
++/*
++ * Example wrapper around BPF macros.
++ *
++ * Copyright (c) 2012 The Chromium OS Authors <chromium-os-dev@chromium.org>
++ * Author: Will Drewry <wad@chromium.org>
++ *
++ * The code may be used by anyone for any purpose,
++ * and can serve as a starting point for developing
++ * applications using prctl(PR_SET_SECCOMP, 2, ...).
++ *
++ * No guarantees are provided with respect to the correctness
++ * or functionality of this code.
++ */
++#ifndef __BPF_HELPER_H__
++#define __BPF_HELPER_H__
++
++#include <asm/bitsperlong.h> /* for __BITS_PER_LONG */
++#include <endian.h>
++#include <linux/filter.h>
++#include <linux/seccomp.h> /* for seccomp_data */
++#include <linux/types.h>
++#include <linux/unistd.h>
++#include <stddef.h>
++
++#define BPF_LABELS_MAX 256
++struct bpf_labels {
++ int count;
++ struct __bpf_label {
++ const char *label;
++ __u32 location;
++ } labels[BPF_LABELS_MAX];
++};
++
++int bpf_resolve_jumps(struct bpf_labels *labels,
++ struct sock_filter *filter, size_t count);
++__u32 seccomp_bpf_label(struct bpf_labels *labels, const char *label);
++void seccomp_bpf_print(struct sock_filter *filter, size_t count);
++
++#define JUMP_JT 0xff
++#define JUMP_JF 0xff
++#define LABEL_JT 0xfe
++#define LABEL_JF 0xfe
++
++#define ALLOW \
++ BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW)
++#define DENY \
++ BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_KILL)
++#define JUMP(labels, label) \
++ BPF_JUMP(BPF_JMP+BPF_JA, FIND_LABEL((labels), (label)), \
++ JUMP_JT, JUMP_JF)
++#define LABEL(labels, label) \
++ BPF_JUMP(BPF_JMP+BPF_JA, FIND_LABEL((labels), (label)), \
++ LABEL_JT, LABEL_JF)
++#define SYSCALL(nr, jt) \
++ BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (nr), 0, 1), \
++ jt
++
++/* Lame, but just an example */
++#define FIND_LABEL(labels, label) seccomp_bpf_label((labels), #label)
++
++#define EXPAND(...) __VA_ARGS__
++/* Map all width-sensitive operations */
++#if __BITS_PER_LONG == 32
++
++#define JEQ(x, jt) JEQ32(x, EXPAND(jt))
++#define JNE(x, jt) JNE32(x, EXPAND(jt))
++#define JGT(x, jt) JGT32(x, EXPAND(jt))
++#define JLT(x, jt) JLT32(x, EXPAND(jt))
++#define JGE(x, jt) JGE32(x, EXPAND(jt))
++#define JLE(x, jt) JLE32(x, EXPAND(jt))
++#define JA(x, jt) JA32(x, EXPAND(jt))
++#define ARG(i) ARG_32(i)
++#define LO_ARG(idx) offsetof(struct seccomp_data, args[(idx)])
++
++#elif __BITS_PER_LONG == 64
++
++/* Ensure that we load the logically correct offset. */
++#if __BYTE_ORDER == __LITTLE_ENDIAN
++#define ENDIAN(_lo, _hi) _lo, _hi
++#define LO_ARG(idx) offsetof(struct seccomp_data, args[(idx)])
++#define HI_ARG(idx) offsetof(struct seccomp_data, args[(idx)]) + sizeof(__u32)
++#elif __BYTE_ORDER == __BIG_ENDIAN
++#define ENDIAN(_lo, _hi) _hi, _lo
++#define LO_ARG(idx) offsetof(struct seccomp_data, args[(idx)]) + sizeof(__u32)
++#define HI_ARG(idx) offsetof(struct seccomp_data, args[(idx)])
++#else
++#error "Unknown endianness"
++#endif
++
++union arg64 {
++ struct {
++ __u32 ENDIAN(lo32, hi32);
++ };
++ __u64 u64;
++};
++
++#define JEQ(x, jt) \
++ JEQ64(((union arg64){.u64 = (x)}).lo32, \
++ ((union arg64){.u64 = (x)}).hi32, \
++ EXPAND(jt))
++#define JGT(x, jt) \
++ JGT64(((union arg64){.u64 = (x)}).lo32, \
++ ((union arg64){.u64 = (x)}).hi32, \
++ EXPAND(jt))
++#define JGE(x, jt) \
++ JGE64(((union arg64){.u64 = (x)}).lo32, \
++ ((union arg64){.u64 = (x)}).hi32, \
++ EXPAND(jt))
++#define JNE(x, jt) \
++ JNE64(((union arg64){.u64 = (x)}).lo32, \
++ ((union arg64){.u64 = (x)}).hi32, \
++ EXPAND(jt))
++#define JLT(x, jt) \
++ JLT64(((union arg64){.u64 = (x)}).lo32, \
++ ((union arg64){.u64 = (x)}).hi32, \
++ EXPAND(jt))
++#define JLE(x, jt) \
++ JLE64(((union arg64){.u64 = (x)}).lo32, \
++ ((union arg64){.u64 = (x)}).hi32, \
++ EXPAND(jt))
++
++#define JA(x, jt) \
++ JA64(((union arg64){.u64 = (x)}).lo32, \
++ ((union arg64){.u64 = (x)}).hi32, \
++ EXPAND(jt))
++#define ARG(i) ARG_64(i)
++
++#else
++#error __BITS_PER_LONG value unusable.
++#endif
++
++/* Loads the arg into A */
++#define ARG_32(idx) \
++ BPF_STMT(BPF_LD+BPF_W+BPF_ABS, LO_ARG(idx))
++
++/* Loads lo into M[0] and hi into M[1] and A */
++#define ARG_64(idx) \
++ BPF_STMT(BPF_LD+BPF_W+BPF_ABS, LO_ARG(idx)), \
++ BPF_STMT(BPF_ST, 0), /* lo -> M[0] */ \
++ BPF_STMT(BPF_LD+BPF_W+BPF_ABS, HI_ARG(idx)), \
++ BPF_STMT(BPF_ST, 1) /* hi -> M[1] */
++
++#define JEQ32(value, jt) \
++ BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (value), 0, 1), \
++ jt
++
++#define JNE32(value, jt) \
++ BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (value), 1, 0), \
++ jt
++
++/* Checks the hi, then swaps in lo to check it. A=hi, M[0]=lo, M[1]=hi */
++#define JEQ64(lo, hi, jt) \
++ BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (hi), 0, 5), \
++ BPF_STMT(BPF_LD+BPF_MEM, 0), /* swap in lo */ \
++ BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (lo), 0, 2), \
++ BPF_STMT(BPF_LD+BPF_MEM, 1), /* passed: swap hi back in */ \
++ jt, \
++ BPF_STMT(BPF_LD+BPF_MEM, 1) /* failed: swap hi back in */
++
++#define JNE64(lo, hi, jt) \
++ BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (hi), 0, 3), /* hi differs: go to jt */ \
++ BPF_STMT(BPF_LD+BPF_MEM, 0), /* swap in lo */ \
++ BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (lo), 2, 0), \
++ BPF_STMT(BPF_LD+BPF_MEM, 1), /* passed: swap hi back in */ \
++ jt, \
++ BPF_STMT(BPF_LD+BPF_MEM, 1) /* failed: swap hi back in */
++
++#define JA32(value, jt) \
++ BPF_JUMP(BPF_JMP+BPF_JSET+BPF_K, (value), 0, 1), \
++ jt
++
++#define JA64(lo, hi, jt) \
++ BPF_JUMP(BPF_JMP+BPF_JSET+BPF_K, (hi), 3, 0), \
++ BPF_STMT(BPF_LD+BPF_MEM, 0), /* swap in lo */ \
++ BPF_JUMP(BPF_JMP+BPF_JSET+BPF_K, (lo), 0, 2), \
++ BPF_STMT(BPF_LD+BPF_MEM, 1), /* passed: swap hi back in */ \
++ jt, \
++ BPF_STMT(BPF_LD+BPF_MEM, 1) /* failed: swap hi back in */
++
++#define JGE32(value, jt) \
++ BPF_JUMP(BPF_JMP+BPF_JGE+BPF_K, (value), 0, 1), \
++ jt
++
++#define JLT32(value, jt) \
++ BPF_JUMP(BPF_JMP+BPF_JGE+BPF_K, (value), 1, 0), \
++ jt
++
++/* Shortcut: match immediately if arg.hi > hi. */
++#define JGE64(lo, hi, jt) \
++ BPF_JUMP(BPF_JMP+BPF_JGT+BPF_K, (hi), 4, 0), \
++ BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (hi), 0, 5), \
++ BPF_STMT(BPF_LD+BPF_MEM, 0), /* swap in lo */ \
++ BPF_JUMP(BPF_JMP+BPF_JGE+BPF_K, (lo), 0, 2), \
++ BPF_STMT(BPF_LD+BPF_MEM, 1), /* passed: swap hi back in */ \
++ jt, \
++ BPF_STMT(BPF_LD+BPF_MEM, 1) /* failed: swap hi back in */
++
++#define JLT64(lo, hi, jt) \
++ BPF_JUMP(BPF_JMP+BPF_JGE+BPF_K, (hi), 0, 4), \
++ BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (hi), 0, 5), \
++ BPF_STMT(BPF_LD+BPF_MEM, 0), /* swap in lo */ \
++ BPF_JUMP(BPF_JMP+BPF_JGE+BPF_K, (lo), 2, 0), /* arg.lo >= lo: fail */ \
++ BPF_STMT(BPF_LD+BPF_MEM, 1), /* passed: swap hi back in */ \
++ jt, \
++ BPF_STMT(BPF_LD+BPF_MEM, 1) /* failed: swap hi back in */
++
++#define JGT32(value, jt) \
++ BPF_JUMP(BPF_JMP+BPF_JGT+BPF_K, (value), 0, 1), \
++ jt
++
++#define JLE32(value, jt) \
++ BPF_JUMP(BPF_JMP+BPF_JGT+BPF_K, (value), 1, 0), \
++ jt
++
++/* Check arg.hi > hi first, then do the GT checking on lo */
++#define JGT64(lo, hi, jt) \
++ BPF_JUMP(BPF_JMP+BPF_JGT+BPF_K, (hi), 4, 0), \
++ BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (hi), 0, 5), \
++ BPF_STMT(BPF_LD+BPF_MEM, 0), /* swap in lo */ \
++ BPF_JUMP(BPF_JMP+BPF_JGT+BPF_K, (lo), 0, 2), \
++ BPF_STMT(BPF_LD+BPF_MEM, 1), /* passed: swap hi back in */ \
++ jt, \
++ BPF_STMT(BPF_LD+BPF_MEM, 1) /* failed: swap hi back in */
++
++#define JLE64(lo, hi, jt) \
++ BPF_JUMP(BPF_JMP+BPF_JGT+BPF_K, (hi), 6, 0), \
++ BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (hi), 0, 3), \
++ BPF_STMT(BPF_LD+BPF_MEM, 0), /* swap in lo */ \
++ BPF_JUMP(BPF_JMP+BPF_JGT+BPF_K, (lo), 2, 0), \
++ BPF_STMT(BPF_LD+BPF_MEM, 1), /* passed: swap hi back in */ \
++ jt, \
++ BPF_STMT(BPF_LD+BPF_MEM, 1) /* failed: swap hi back in */
++
++#define LOAD_SYSCALL_NR \
++ BPF_STMT(BPF_LD+BPF_W+BPF_ABS, \
++ offsetof(struct seccomp_data, nr))
++
++#endif /* __BPF_HELPER_H__ */
+diff --git a/samples/seccomp/dropper.c b/samples/seccomp/dropper.c
+new file mode 100644
+index 0000000..c69c347
+--- /dev/null
++++ b/samples/seccomp/dropper.c
+@@ -0,0 +1,68 @@
++/*
++ * Naive system call dropper built on seccomp_filter.
++ *
++ * Copyright (c) 2012 The Chromium OS Authors <chromium-os-dev@chromium.org>
++ * Author: Will Drewry <wad@chromium.org>
++ *
++ * The code may be used by anyone for any purpose,
++ * and can serve as a starting point for developing
++ * applications using prctl(PR_SET_SECCOMP, 2, ...).
++ *
++ * When run, returns the specified errno for the specified
++ * system call number against the given architecture.
++ *
++ * Run this one as root as PR_SET_NO_NEW_PRIVS is not called.
++ */
++
++#include <errno.h>
++#include <linux/audit.h>
++#include <linux/filter.h>
++#include <linux/seccomp.h>
++#include <linux/unistd.h>
++#include <stdio.h>
++#include <stddef.h>
++#include <stdlib.h>
++#include <sys/prctl.h>
++#include <unistd.h>
++
++static int install_filter(int nr, int arch, int error)
++{
++ struct sock_filter filter[] = {
++ BPF_STMT(BPF_LD+BPF_W+BPF_ABS,
++ (offsetof(struct seccomp_data, arch))),
++ BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, arch, 0, 3),
++ BPF_STMT(BPF_LD+BPF_W+BPF_ABS,
++ (offsetof(struct seccomp_data, nr))),
++ BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, nr, 0, 1),
++ BPF_STMT(BPF_RET+BPF_K,
++ SECCOMP_RET_ERRNO|(error & SECCOMP_RET_DATA)),
++ BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW),
++ };
++ struct sock_fprog prog = {
++ .len = (unsigned short)(sizeof(filter)/sizeof(filter[0])),
++ .filter = filter,
++ };
++ if (prctl(PR_SET_SECCOMP, 2, &prog)) {
++ perror("prctl");
++ return 1;
++ }
++ return 0;
++}
++
++int main(int argc, char **argv)
++{
++ if (argc < 5) {
++ fprintf(stderr, "Usage:\n"
++ "dropper <syscall_nr> <arch> <errno> <prog> [<args>]\n"
++ "Hint: AUDIT_ARCH_I386: 0x%X\n"
++ " AUDIT_ARCH_X86_64: 0x%X\n"
++ "\n", AUDIT_ARCH_I386, AUDIT_ARCH_X86_64);
++ return 1;
++ }
++ if (install_filter(strtol(argv[1], NULL, 0), strtol(argv[2], NULL, 0),
++ strtol(argv[3], NULL, 0)))
++ return 1;
++ execv(argv[4], &argv[4]);
++ printf("Failed to execv\n");
++ return 255;
++}
+--
+1.7.9.1
+
diff --git a/features/seccomp/Fix-execve-behavior-apparmor-for-PR_-GET-SET-_NO_NEW.patch b/features/seccomp/Fix-execve-behavior-apparmor-for-PR_-GET-SET-_NO_NEW.patch
new file mode 100644
index 00000000..5d0e2930
--- /dev/null
+++ b/features/seccomp/Fix-execve-behavior-apparmor-for-PR_-GET-SET-_NO_NEW.patch
@@ -0,0 +1,106 @@
+From a7c57bb9edacc420cc99d16852621a12d112cb0f Mon Sep 17 00:00:00 2001
+From: John Johansen <john.johansen@canonical.com>
+Date: Thu, 12 Apr 2012 16:47:51 -0500
+Subject: [PATCH] Fix execve behavior apparmor for PR_{GET,SET}_NO_NEW_PRIVS
+
+commit c29bceb3967398cf2ac8bf8edf9634fdb722df7d upstream.
+
+Add support for AppArmor to explicitly fail requested domain transitions
+if NO_NEW_PRIVS is set and the task is not unconfined.
+
+Transitions from unconfined are still allowed because this always results
+in a reduction of privileges.
+
+Acked-by: Eric Paris <eparis@redhat.com>
+Signed-off-by: Will Drewry <wad@chromium.org>
+Signed-off-by: John Johansen <john.johansen@canonical.com>
+Signed-off-by: Andy Lutomirski <luto@amacapital.net>
+
+v18: new acked-by, new description
+Signed-off-by: James Morris <james.l.morris@oracle.com>
+Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
+---
+ security/apparmor/domain.c | 39 +++++++++++++++++++++++++++++++++++----
+ 1 files changed, 35 insertions(+), 4 deletions(-)
+
+diff --git a/security/apparmor/domain.c b/security/apparmor/domain.c
+index 18c88d0..b81ea10 100644
+--- a/security/apparmor/domain.c
++++ b/security/apparmor/domain.c
+@@ -360,10 +360,6 @@ int apparmor_bprm_set_creds(struct linux_binprm *bprm)
+ if (bprm->cred_prepared)
+ return 0;
+
+- /* XXX: no_new_privs is not usable with AppArmor yet */
+- if (bprm->unsafe & LSM_UNSAFE_NO_NEW_PRIVS)
+- return -EPERM;
+-
+ cxt = bprm->cred->security;
+ BUG_ON(!cxt);
+
+@@ -398,6 +394,11 @@ int apparmor_bprm_set_creds(struct linux_binprm *bprm)
+ new_profile = find_attach(ns, &ns->base.profiles, name);
+ if (!new_profile)
+ goto cleanup;
++ /*
++ * NOTE: Domain transitions from unconfined are allowed
++ * even when no_new_privs is set because this always results
++ * in a further reduction of permissions.
++ */
+ goto apply;
+ }
+
+@@ -459,6 +460,16 @@ int apparmor_bprm_set_creds(struct linux_binprm *bprm)
+ /* fail exec */
+ error = -EACCES;
+
++ /*
++ * Policy has specified a domain transition, if no_new_privs then
++ * fail the exec.
++ */
++ if (bprm->unsafe & LSM_UNSAFE_NO_NEW_PRIVS) {
++ aa_put_profile(new_profile);
++ error = -EPERM;
++ goto cleanup;
++ }
++
+ if (!new_profile)
+ goto audit;
+
+@@ -613,6 +624,14 @@ int aa_change_hat(const char *hats[], int count, u64 token, bool permtest)
+ const char *target = NULL, *info = NULL;
+ int error = 0;
+
++ /*
++ * Fail explicitly requested domain transitions if no_new_privs.
++ * There is no exception for unconfined as change_hat is not
++ * available.
++ */
++ if (current->no_new_privs)
++ return -EPERM;
++
+ /* released below */
+ cred = get_current_cred();
+ cxt = cred->security;
+@@ -754,6 +773,18 @@ int aa_change_profile(const char *ns_name, const char *hname, bool onexec,
+ cxt = cred->security;
+ profile = aa_cred_profile(cred);
+
++ /*
++ * Fail explicitly requested domain transitions if no_new_privs
++ * and not unconfined.
++ * Domain transitions from unconfined are allowed even when
++ * no_new_privs is set because this always results in a reduction
++ * of permissions.
++ */
++ if (current->no_new_privs && !unconfined(profile)) {
++ put_cred(cred);
++ return -EPERM;
++ }
++
+ if (ns_name) {
+ /* released below */
+ ns = aa_find_namespace(profile->ns, ns_name);
+--
+1.7.9.1
+
diff --git a/features/seccomp/arch-x86-add-syscall_get_arch-to-syscall.h.patch b/features/seccomp/arch-x86-add-syscall_get_arch-to-syscall.h.patch
new file mode 100644
index 00000000..841ccffc
--- /dev/null
+++ b/features/seccomp/arch-x86-add-syscall_get_arch-to-syscall.h.patch
@@ -0,0 +1,85 @@
+From d581579e1974f5bd2ff3bb5b93240aa5ccf2f907 Mon Sep 17 00:00:00 2001
+From: Will Drewry <wad@chromium.org>
+Date: Thu, 12 Apr 2012 16:47:56 -0500
+Subject: [PATCH] arch/x86: add syscall_get_arch to syscall.h
+
+commit b7456536cf9466b402b540c5588d79a4177c723a upstream.
+
+Add syscall_get_arch() to export the current AUDIT_ARCH_* based on system call
+entry path.
+
+Signed-off-by: Will Drewry <wad@chromium.org>
+Acked-by: Serge Hallyn <serge.hallyn@canonical.com>
+Reviewed-by: H. Peter Anvin <hpa@zytor.com>
+Acked-by: Eric Paris <eparis@redhat.com>
+Reviewed-by: Kees Cook <keescook@chromium.org>
+
+v18: - update comment about x32 tasks
+ - rebase to v3.4-rc2
+v17: rebase and reviewed-by
+v14: rebase/nochanges
+v13: rebase on to 88ebdda6159ffc15699f204c33feb3e431bf9bdc
+Signed-off-by: James Morris <james.l.morris@oracle.com>
+Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
+---
+ arch/x86/include/asm/syscall.h | 27 +++++++++++++++++++++++++++
+ 1 files changed, 27 insertions(+), 0 deletions(-)
+
+diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h
+index 386b786..1ace47b 100644
+--- a/arch/x86/include/asm/syscall.h
++++ b/arch/x86/include/asm/syscall.h
+@@ -13,9 +13,11 @@
+ #ifndef _ASM_X86_SYSCALL_H
+ #define _ASM_X86_SYSCALL_H
+
++#include <linux/audit.h>
+ #include <linux/sched.h>
+ #include <linux/err.h>
+ #include <asm/asm-offsets.h> /* For NR_syscalls */
++#include <asm/thread_info.h> /* for TS_COMPAT */
+ #include <asm/unistd.h>
+
+ extern const unsigned long sys_call_table[];
+@@ -88,6 +90,12 @@ static inline void syscall_set_arguments(struct task_struct *task,
+ memcpy(&regs->bx + i, args, n * sizeof(args[0]));
+ }
+
++static inline int syscall_get_arch(struct task_struct *task,
++ struct pt_regs *regs)
++{
++ return AUDIT_ARCH_I386;
++}
++
+ #else /* CONFIG_X86_64 */
+
+ static inline void syscall_get_arguments(struct task_struct *task,
+@@ -212,6 +220,25 @@ static inline void syscall_set_arguments(struct task_struct *task,
+ }
+ }
+
++static inline int syscall_get_arch(struct task_struct *task,
++ struct pt_regs *regs)
++{
++#ifdef CONFIG_IA32_EMULATION
++ /*
++ * TS_COMPAT is set for 32-bit syscall entry and then
++ * remains set until we return to user mode.
++ *
++ * TIF_IA32 tasks should always have TS_COMPAT set at
++ * system call time.
++ *
++ * x32 tasks should be considered AUDIT_ARCH_X86_64.
++ */
++ if (task_thread_info(task)->status & TS_COMPAT)
++ return AUDIT_ARCH_I386;
++#endif
++ /* Both x32 and x86_64 are considered "64-bit". */
++ return AUDIT_ARCH_X86_64;
++}
+ #endif /* CONFIG_X86_32 */
+
+ #endif /* _ASM_X86_SYSCALL_H */
+--
+1.7.9.1
+
diff --git a/features/seccomp/asm-syscall.h-add-syscall_get_arch.patch b/features/seccomp/asm-syscall.h-add-syscall_get_arch.patch
new file mode 100644
index 00000000..d95c897d
--- /dev/null
+++ b/features/seccomp/asm-syscall.h-add-syscall_get_arch.patch
@@ -0,0 +1,59 @@
+From 2ca6c225eacea82fd7fdcd24312c817e1e8352e4 Mon Sep 17 00:00:00 2001
+From: Will Drewry <wad@chromium.org>
+Date: Thu, 12 Apr 2012 16:47:55 -0500
+Subject: [PATCH] asm/syscall.h: add syscall_get_arch
+
+commit 07bd18d00d5dcf84eb22f8120f47f09c3d8fe27d upstream.
+
+Adds a stub for a function that will return the AUDIT_ARCH_* value
+appropriate to the supplied task based on the system call convention.
+
+For audit's use, the value can generally be hard-coded at the
+audit-site. However, for other functionality not inlined into syscall
+entry/exit, this makes that information available. seccomp_filter is
+the first planned consumer and, as such, the comment indicates a tie to
+CONFIG_HAVE_ARCH_SECCOMP_FILTER.
+
+Suggested-by: Roland McGrath <mcgrathr@chromium.org>
+Signed-off-by: Will Drewry <wad@chromium.org>
+Acked-by: Serge Hallyn <serge.hallyn@canonical.com>
+Acked-by: Eric Paris <eparis@redhat.com>
+
+v18: comment and change reword and rebase.
+v14: rebase/nochanges
+v13: rebase on to 88ebdda6159ffc15699f204c33feb3e431bf9bdc
+v12: rebase on to linux-next
+v11: fixed improper return type
+v10: introduced
+Signed-off-by: James Morris <james.l.morris@oracle.com>
+Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
+---
+ include/asm-generic/syscall.h | 14 ++++++++++++++
+ 1 files changed, 14 insertions(+), 0 deletions(-)
+
+diff --git a/include/asm-generic/syscall.h b/include/asm-generic/syscall.h
+index 5c122ae..5b09392 100644
+--- a/include/asm-generic/syscall.h
++++ b/include/asm-generic/syscall.h
+@@ -142,4 +142,18 @@ void syscall_set_arguments(struct task_struct *task, struct pt_regs *regs,
+ unsigned int i, unsigned int n,
+ const unsigned long *args);
+
++/**
++ * syscall_get_arch - return the AUDIT_ARCH for the current system call
++ * @task: task of interest, must be in system call entry tracing
++ * @regs: task_pt_regs() of @task
++ *
++ * Returns the AUDIT_ARCH_* based on the system call convention in use.
++ *
++ * It's only valid to call this when @task is stopped on entry to a system
++ * call, due to %TIF_SYSCALL_TRACE, %TIF_SYSCALL_AUDIT, or %TIF_SECCOMP.
++ *
++ * Architectures which permit CONFIG_HAVE_ARCH_SECCOMP_FILTER must
++ * provide an implementation of this.
++ */
++int syscall_get_arch(struct task_struct *task, struct pt_regs *regs);
+ #endif /* _ASM_SYSCALL_H */
+--
+1.7.9.1
+
diff --git a/features/seccomp/net-compat.c-linux-filter.h-share-compat_sock_fprog.patch b/features/seccomp/net-compat.c-linux-filter.h-share-compat_sock_fprog.patch
new file mode 100644
index 00000000..f186f7c9
--- /dev/null
+++ b/features/seccomp/net-compat.c-linux-filter.h-share-compat_sock_fprog.patch
@@ -0,0 +1,80 @@
+From 01cef9b98077e652997585d35f765b4b69e33f51 Mon Sep 17 00:00:00 2001
+From: Will Drewry <wad@chromium.org>
+Date: Thu, 12 Apr 2012 16:47:53 -0500
+Subject: [PATCH] net/compat.c,linux/filter.h: share compat_sock_fprog
+
+commit 0c5fe1b4221c6701224c2601cf3c692e5721103e upstream.
+
+Any other users of bpf_*_filter that take a struct sock_fprog from
+userspace will need to be able to also accept a compat_sock_fprog
+if the arch supports compat calls. This change allows the existing
+compat_sock_fprog be shared.
+
+Signed-off-by: Will Drewry <wad@chromium.org>
+Acked-by: Serge Hallyn <serge.hallyn@canonical.com>
+Acked-by: Eric Dumazet <eric.dumazet@gmail.com>
+Acked-by: Eric Paris <eparis@redhat.com>
+
+v18: tasered by the apostrophe police
+v14: rebase/nochanges
+v13: rebase on to 88ebdda6159ffc15699f204c33feb3e431bf9bdc
+v12: rebase on to linux-next
+v11: introduction
+Signed-off-by: James Morris <james.l.morris@oracle.com>
+Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
+---
+ include/linux/filter.h | 11 +++++++++++
+ net/compat.c | 8 --------
+ 2 files changed, 11 insertions(+), 8 deletions(-)
+
+diff --git a/include/linux/filter.h b/include/linux/filter.h
+index aaa2e80..f2e5315 100644
+--- a/include/linux/filter.h
++++ b/include/linux/filter.h
+@@ -10,6 +10,7 @@
+
+ #ifdef __KERNEL__
+ #include <linux/atomic.h>
++#include <linux/compat.h>
+ #endif
+
+ /*
+@@ -132,6 +133,16 @@ struct sock_fprog { /* Required for SO_ATTACH_FILTER. */
+
+ #ifdef __KERNEL__
+
++#ifdef CONFIG_COMPAT
++/*
++ * A struct sock_filter is architecture independent.
++ */
++struct compat_sock_fprog {
++ u16 len;
++ compat_uptr_t filter; /* struct sock_filter * */
++};
++#endif
++
+ struct sk_buff;
+ struct sock;
+
+diff --git a/net/compat.c b/net/compat.c
+index e055708..242c828 100644
+--- a/net/compat.c
++++ b/net/compat.c
+@@ -328,14 +328,6 @@ void scm_detach_fds_compat(struct msghdr *kmsg, struct scm_cookie *scm)
+ __scm_destroy(scm);
+ }
+
+-/*
+- * A struct sock_filter is architecture independent.
+- */
+-struct compat_sock_fprog {
+- u16 len;
+- compat_uptr_t filter; /* struct sock_filter * */
+-};
+-
+ static int do_set_attach_filter(struct socket *sock, int level, int optname,
+ char __user *optval, unsigned int optlen)
+ {
+--
+1.7.9.1
+
diff --git a/features/seccomp/ptrace-seccomp-Add-PTRACE_SECCOMP-support.patch b/features/seccomp/ptrace-seccomp-Add-PTRACE_SECCOMP-support.patch
new file mode 100644
index 00000000..6c0194a8
--- /dev/null
+++ b/features/seccomp/ptrace-seccomp-Add-PTRACE_SECCOMP-support.patch
@@ -0,0 +1,165 @@
+From 02fa56dd47cf648e30198b2dd836a45b08354db0 Mon Sep 17 00:00:00 2001
+From: Will Drewry <wad@chromium.org>
+Date: Thu, 12 Apr 2012 16:48:02 -0500
+Subject: [PATCH] ptrace,seccomp: Add PTRACE_SECCOMP support
+
+commit fb0fadf9b213f55ca9368f3edafe51101d5d2deb upstream.
+
+This change adds support for a new ptrace option, PTRACE_O_TRACESECCOMP,
+and a new return value for seccomp BPF programs, SECCOMP_RET_TRACE.
+
+When a tracer specifies the PTRACE_O_TRACESECCOMP ptrace option, the
+tracer will be notified, via PTRACE_EVENT_SECCOMP, for any syscall that
+results in a BPF program returning SECCOMP_RET_TRACE. The 16-bit
+SECCOMP_RET_DATA mask of the BPF program return value will be passed as
+the ptrace_message and may be retrieved using PTRACE_GETEVENTMSG.
+
+If the subordinate process is not using seccomp filter, then no
+system call notifications will occur even if the option is specified.
+
+If there is no tracer with PTRACE_O_TRACESECCOMP when SECCOMP_RET_TRACE
+is returned, the system call will not be executed and an -ENOSYS errno
+will be returned to userspace.
+
+This change adds a dependency on the system call slow path. Any future
+efforts to use the system call fast path for seccomp filter will need to
+address this restriction.
+
+Signed-off-by: Will Drewry <wad@chromium.org>
+Acked-by: Eric Paris <eparis@redhat.com>
+
+v18: - rebase
+ - comment fatal_signal check
+ - acked-by
+ - drop secure_computing_int comment
+v17: - ...
+v16: - update PT_TRACE_MASK to 0xbf4 so that STOP isn't clear on SETOPTIONS call (indan@nul.nu)
+ [note PT_TRACE_MASK disappears in linux-next]
+v15: - add audit support for non-zero return codes
+ - clean up style (indan@nul.nu)
+v14: - rebase/nochanges
+v13: - rebase on to 88ebdda6159ffc15699f204c33feb3e431bf9bdc
+ (Brings back a change to ptrace.c and the masks.)
+v12: - rebase to linux-next
+ - use ptrace_event and update arch/Kconfig to mention slow-path dependency
+ - drop all tracehook changes and inclusion (oleg@redhat.com)
+v11: - invert the logic to just make it a PTRACE_SYSCALL accelerator
+ (indan@nul.nu)
+v10: - moved to PTRACE_O_SECCOMP / PT_TRACE_SECCOMP
+v9: - n/a
+v8: - guarded PTRACE_SECCOMP use with an ifdef
+v7: - introduced
+Signed-off-by: James Morris <james.l.morris@oracle.com>
+Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
+---
+ arch/Kconfig | 10 +++++-----
+ include/linux/ptrace.h | 5 ++++-
+ include/linux/seccomp.h | 1 +
+ kernel/seccomp.c | 16 ++++++++++++++++
+ 4 files changed, 26 insertions(+), 6 deletions(-)
+
+diff --git a/arch/Kconfig b/arch/Kconfig
+index 66aef13..c024b3e 100644
+--- a/arch/Kconfig
++++ b/arch/Kconfig
+@@ -219,15 +219,15 @@ config ARCH_WANT_OLD_COMPAT_IPC
+ config HAVE_ARCH_SECCOMP_FILTER
+ bool
+ help
+- This symbol should be selected by an architecure if it provides:
+- asm/syscall.h:
++ An arch should select this symbol if it provides all of these things:
+ - syscall_get_arch()
+ - syscall_get_arguments()
+ - syscall_rollback()
+ - syscall_set_return_value()
+- SIGSYS siginfo_t support must be implemented.
+- __secure_computing()/secure_computing()'s return value must be
+- checked, with -1 resulting in the syscall being skipped.
++ - SIGSYS siginfo_t support
++ - secure_computing is called from a ptrace_event()-safe context
++ - secure_computing return value is checked and a return value of -1
++ results in the system call being skipped immediately.
+
+ config SECCOMP_FILTER
+ def_bool y
+diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h
+index 5c71962..597e4fd 100644
+--- a/include/linux/ptrace.h
++++ b/include/linux/ptrace.h
+@@ -58,6 +58,7 @@
+ #define PTRACE_EVENT_EXEC 4
+ #define PTRACE_EVENT_VFORK_DONE 5
+ #define PTRACE_EVENT_EXIT 6
++#define PTRACE_EVENT_SECCOMP 7
+ /* Extended result codes which enabled by means other than options. */
+ #define PTRACE_EVENT_STOP 128
+
+@@ -69,8 +70,9 @@
+ #define PTRACE_O_TRACEEXEC (1 << PTRACE_EVENT_EXEC)
+ #define PTRACE_O_TRACEVFORKDONE (1 << PTRACE_EVENT_VFORK_DONE)
+ #define PTRACE_O_TRACEEXIT (1 << PTRACE_EVENT_EXIT)
++#define PTRACE_O_TRACESECCOMP (1 << PTRACE_EVENT_SECCOMP)
+
+-#define PTRACE_O_MASK 0x0000007f
++#define PTRACE_O_MASK 0x000000ff
+
+ #include <asm/ptrace.h>
+
+@@ -98,6 +100,7 @@
+ #define PT_TRACE_EXEC PT_EVENT_FLAG(PTRACE_EVENT_EXEC)
+ #define PT_TRACE_VFORK_DONE PT_EVENT_FLAG(PTRACE_EVENT_VFORK_DONE)
+ #define PT_TRACE_EXIT PT_EVENT_FLAG(PTRACE_EVENT_EXIT)
++#define PT_TRACE_SECCOMP PT_EVENT_FLAG(PTRACE_EVENT_SECCOMP)
+
+ /* single stepping state bits (used on ARM and PA-RISC) */
+ #define PT_SINGLESTEP_BIT 31
+diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
+index 317ccb7..5818e86 100644
+--- a/include/linux/seccomp.h
++++ b/include/linux/seccomp.h
+@@ -21,6 +21,7 @@
+ #define SECCOMP_RET_KILL 0x00000000U /* kill the task immediately */
+ #define SECCOMP_RET_TRAP 0x00030000U /* disallow and force a SIGSYS */
+ #define SECCOMP_RET_ERRNO 0x00050000U /* returns an errno */
++#define SECCOMP_RET_TRACE 0x7ff00000U /* pass to a tracer or disallow */
+ #define SECCOMP_RET_ALLOW 0x7fff0000U /* allow */
+
+ /* Masks for the return value sections. */
+diff --git a/kernel/seccomp.c b/kernel/seccomp.c
+index 9c38306..d9db6ec 100644
+--- a/kernel/seccomp.c
++++ b/kernel/seccomp.c
+@@ -24,6 +24,7 @@
+ #ifdef CONFIG_SECCOMP_FILTER
+ #include <asm/syscall.h>
+ #include <linux/filter.h>
++#include <linux/ptrace.h>
+ #include <linux/security.h>
+ #include <linux/slab.h>
+ #include <linux/tracehook.h>
+@@ -408,6 +409,21 @@ int __secure_computing(int this_syscall)
+ /* Let the filter pass back 16 bits of data. */
+ seccomp_send_sigsys(this_syscall, data);
+ goto skip;
++ case SECCOMP_RET_TRACE:
++ /* Skip these calls if there is no tracer. */
++ if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP))
++ goto skip;
++ /* Allow the BPF to provide the event message */
++ ptrace_event(PTRACE_EVENT_SECCOMP, data);
++ /*
++ * The delivery of a fatal signal during event
++ * notification may silently skip tracer notification.
++ * Terminating the task now avoids executing a system
++ * call that may not be intended.
++ */
++ if (fatal_signal_pending(current))
++ break;
++ return 0;
+ case SECCOMP_RET_ALLOW:
+ return 0;
+ case SECCOMP_RET_KILL:
+--
+1.7.9.1
+
diff --git a/features/seccomp/seccomp-Add-SECCOMP_RET_TRAP.patch b/features/seccomp/seccomp-Add-SECCOMP_RET_TRAP.patch
new file mode 100644
index 00000000..31466638
--- /dev/null
+++ b/features/seccomp/seccomp-Add-SECCOMP_RET_TRAP.patch
@@ -0,0 +1,138 @@
+From 365829a1caa9148a289fe895280a1d2ed0e56e37 Mon Sep 17 00:00:00 2001
+From: Will Drewry <wad@chromium.org>
+Date: Thu, 12 Apr 2012 16:48:01 -0500
+Subject: [PATCH] seccomp: Add SECCOMP_RET_TRAP
+
+commit bb6ea4301a1109afdacaee576fedbfcd7152fc86 upstream.
+
+Adds a new return value to seccomp filters that triggers a SIGSYS to be
+delivered with the new SYS_SECCOMP si_code.
+
+This allows in-process system call emulation, including just specifying
+an errno or cleanly dumping core, rather than just dying.
+
+Suggested-by: Markus Gutschke <markus@chromium.org>
+Suggested-by: Julien Tinnes <jln@chromium.org>
+Signed-off-by: Will Drewry <wad@chromium.org>
+Acked-by: Eric Paris <eparis@redhat.com>
+
+v18: - acked-by, rebase
+ - don't mention secure_computing_int() anymore
+v15: - use audit_seccomp/skip
+ - pad out error spacing; clean up switch (indan@nul.nu)
+v14: - n/a
+v13: - rebase on to 88ebdda6159ffc15699f204c33feb3e431bf9bdc
+v12: - rebase on to linux-next
+v11: - clarify the comment (indan@nul.nu)
+ - s/sigtrap/sigsys
+v10: - use SIGSYS, syscall_get_arch, updates arch/Kconfig
+ note suggested-by (though original suggestion had other behaviors)
+v9: - changes to SIGILL
+v8: - clean up based on changes to dependent patches
+v7: - introduction
+Signed-off-by: James Morris <james.l.morris@oracle.com>
+Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
+---
+ arch/Kconfig | 14 +++++++++-----
+ include/asm-generic/siginfo.h | 2 +-
+ include/linux/seccomp.h | 1 +
+ kernel/seccomp.c | 26 ++++++++++++++++++++++++++
+ 4 files changed, 37 insertions(+), 6 deletions(-)
+
+diff --git a/arch/Kconfig b/arch/Kconfig
+index beaab68..66aef13 100644
+--- a/arch/Kconfig
++++ b/arch/Kconfig
+@@ -219,11 +219,15 @@ config ARCH_WANT_OLD_COMPAT_IPC
+ config HAVE_ARCH_SECCOMP_FILTER
+ bool
+ help
+- This symbol should be selected by an architecure if it provides
+- asm/syscall.h, specifically syscall_get_arguments(),
+- syscall_get_arch(), and syscall_set_return_value(). Additionally,
+- its system call entry path must respect a return value of -1 from
+- __secure_computing() and/or secure_computing().
++ This symbol should be selected by an architecure if it provides:
++ asm/syscall.h:
++ - syscall_get_arch()
++ - syscall_get_arguments()
++ - syscall_rollback()
++ - syscall_set_return_value()
++ SIGSYS siginfo_t support must be implemented.
++ __secure_computing()/secure_computing()'s return value must be
++ checked, with -1 resulting in the syscall being skipped.
+
+ config SECCOMP_FILTER
+ def_bool y
+diff --git a/include/asm-generic/siginfo.h b/include/asm-generic/siginfo.h
+index d2c7f29..8ed6777 100644
+--- a/include/asm-generic/siginfo.h
++++ b/include/asm-generic/siginfo.h
+@@ -101,7 +101,7 @@ typedef struct siginfo {
+
+ /* SIGSYS */
+ struct {
+- void __user *_call_addr; /* calling insn */
++ void __user *_call_addr; /* calling user insn */
+ int _syscall; /* triggering system call number */
+ unsigned int _arch; /* AUDIT_ARCH_* of syscall */
+ } _sigsys;
+diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
+index b4ce2c8..317ccb7 100644
+--- a/include/linux/seccomp.h
++++ b/include/linux/seccomp.h
+@@ -19,6 +19,7 @@
+ * selects the least permissive choice.
+ */
+ #define SECCOMP_RET_KILL 0x00000000U /* kill the task immediately */
++#define SECCOMP_RET_TRAP 0x00030000U /* disallow and force a SIGSYS */
+ #define SECCOMP_RET_ERRNO 0x00050000U /* returns an errno */
+ #define SECCOMP_RET_ALLOW 0x7fff0000U /* allow */
+
+diff --git a/kernel/seccomp.c b/kernel/seccomp.c
+index 5f78fb6..9c38306 100644
+--- a/kernel/seccomp.c
++++ b/kernel/seccomp.c
+@@ -332,6 +332,26 @@ void put_seccomp_filter(struct task_struct *tsk)
+ kfree(freeme);
+ }
+ }
++
++/**
++ * seccomp_send_sigsys - signals the task to allow in-process syscall emulation
++ * @syscall: syscall number to send to userland
++ * @reason: filter-supplied reason code to send to userland (via si_errno)
++ *
++ * Forces a SIGSYS with a code of SYS_SECCOMP and related sigsys info.
++ */
++static void seccomp_send_sigsys(int syscall, int reason)
++{
++ struct siginfo info;
++ memset(&info, 0, sizeof(info));
++ info.si_signo = SIGSYS;
++ info.si_code = SYS_SECCOMP;
++ info.si_call_addr = (void __user *)KSTK_EIP(current);
++ info.si_errno = reason;
++ info.si_arch = syscall_get_arch(current, task_pt_regs(current));
++ info.si_syscall = syscall;
++ force_sig_info(SIGSYS, &info, current);
++}
+ #endif /* CONFIG_SECCOMP_FILTER */
+
+ /*
+@@ -382,6 +402,12 @@ int __secure_computing(int this_syscall)
+ syscall_set_return_value(current, task_pt_regs(current),
+ -data, 0);
+ goto skip;
++ case SECCOMP_RET_TRAP:
++ /* Show the handler the original registers. */
++ syscall_rollback(current, task_pt_regs(current));
++ /* Let the filter pass back 16 bits of data. */
++ seccomp_send_sigsys(this_syscall, data);
++ goto skip;
+ case SECCOMP_RET_ALLOW:
+ return 0;
+ case SECCOMP_RET_KILL:
+--
+1.7.9.1
+
diff --git a/features/seccomp/seccomp-add-SECCOMP_RET_ERRNO.patch b/features/seccomp/seccomp-add-SECCOMP_RET_ERRNO.patch
new file mode 100644
index 00000000..2bb5adcf
--- /dev/null
+++ b/features/seccomp/seccomp-add-SECCOMP_RET_ERRNO.patch
@@ -0,0 +1,202 @@
+From dbb9ea8331cefce3fe15499126a7a1d29beb5d70 Mon Sep 17 00:00:00 2001
+From: Will Drewry <wad@chromium.org>
+Date: Thu, 12 Apr 2012 16:47:59 -0500
+Subject: [PATCH] seccomp: add SECCOMP_RET_ERRNO
+
+commit acf3b2c71ed20c53dc69826683417703c2a88059 upstream.
+
+This change adds the SECCOMP_RET_ERRNO as a valid return value from a
+seccomp filter. Additionally, it makes the first use of the lower
+16-bits for storing a filter-supplied errno. 16-bits is more than
+enough for the errno-base.h calls.
+
+Returning errors instead of immediately terminating processes that
+violate seccomp policy allows for broader use of this functionality
+for kernel attack surface reduction. For example, a linux container
+could maintain a whitelist of pre-existing system calls but drop
+all new ones with errnos. This would keep a logically static attack
+surface while providing errnos that may allow for graceful failure
+without the downside of do_exit() on a bad call.
+
+This change also changes the signature of __secure_computing. It
+appears the only direct caller is the arm entry code and it clobbers
+any possible return value (register) immediately.
+
+Signed-off-by: Will Drewry <wad@chromium.org>
+Acked-by: Serge Hallyn <serge.hallyn@canonical.com>
+Reviewed-by: Kees Cook <keescook@chromium.org>
+Acked-by: Eric Paris <eparis@redhat.com>
+
+v18: - fix up comments and rebase
+ - fix bad var name which was fixed in later revs
+ - remove _int() and just change the __secure_computing signature
+v16-v17: ...
+v15: - use audit_seccomp and add a skip label. (eparis@redhat.com)
+ - clean up and pad out return codes (indan@nul.nu)
+v14: - no change/rebase
+v13: - rebase on to 88ebdda6159ffc15699f204c33feb3e431bf9bdc
+v12: - move to WARN_ON if filter is NULL
+ (oleg@redhat.com, luto@mit.edu, keescook@chromium.org)
+ - return immediately for filter==NULL (keescook@chromium.org)
+ - change evaluation to only compare the ACTION so that layered
+ errnos don't result in the lowest one being returned.
+ (keescook@chromium.org)
+v11: - check for NULL filter (keescook@chromium.org)
+v10: - change loaders to fn
+ v9: - n/a
+ v8: - update Kconfig to note new need for syscall_set_return_value.
+ - reordered such that TRAP behavior follows on later.
+ - made the for loop a little less indent-y
+ v7: - introduced
+Signed-off-by: James Morris <james.l.morris@oracle.com>
+Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
+---
+ arch/Kconfig | 6 ++++--
+ include/linux/seccomp.h | 10 ++++++----
+ kernel/seccomp.c | 42 ++++++++++++++++++++++++++++++++----------
+ 3 files changed, 42 insertions(+), 16 deletions(-)
+
+diff --git a/arch/Kconfig b/arch/Kconfig
+index 91c2c73..beaab68 100644
+--- a/arch/Kconfig
++++ b/arch/Kconfig
+@@ -220,8 +220,10 @@ config HAVE_ARCH_SECCOMP_FILTER
+ bool
+ help
+ This symbol should be selected by an architecure if it provides
+- asm/syscall.h, specifically syscall_get_arguments() and
+- syscall_get_arch().
++ asm/syscall.h, specifically syscall_get_arguments(),
++ syscall_get_arch(), and syscall_set_return_value(). Additionally,
++ its system call entry path must respect a return value of -1 from
++ __secure_computing() and/or secure_computing().
+
+ config SECCOMP_FILTER
+ def_bool y
+diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
+index 86bb68f..b4ce2c8 100644
+--- a/include/linux/seccomp.h
++++ b/include/linux/seccomp.h
+@@ -12,13 +12,14 @@
+
+ /*
+ * All BPF programs must return a 32-bit value.
+- * The bottom 16-bits are reserved for future use.
++ * The bottom 16-bits are for optional return data.
+ * The upper 16-bits are ordered from least permissive values to most.
+ *
+ * The ordering ensures that a min_t() over composed return values always
+ * selects the least permissive choice.
+ */
+ #define SECCOMP_RET_KILL 0x00000000U /* kill the task immediately */
++#define SECCOMP_RET_ERRNO 0x00050000U /* returns an errno */
+ #define SECCOMP_RET_ALLOW 0x7fff0000U /* allow */
+
+ /* Masks for the return value sections. */
+@@ -64,11 +65,12 @@ struct seccomp {
+ struct seccomp_filter *filter;
+ };
+
+-extern void __secure_computing(int);
+-static inline void secure_computing(int this_syscall)
++extern int __secure_computing(int);
++static inline int secure_computing(int this_syscall)
+ {
+ if (unlikely(test_thread_flag(TIF_SECCOMP)))
+- __secure_computing(this_syscall);
++ return __secure_computing(this_syscall);
++ return 0;
+ }
+
+ extern long prctl_get_seccomp(void);
+diff --git a/kernel/seccomp.c b/kernel/seccomp.c
+index 0f7c709..5f78fb6 100644
+--- a/kernel/seccomp.c
++++ b/kernel/seccomp.c
+@@ -199,15 +199,20 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
+ static u32 seccomp_run_filters(int syscall)
+ {
+ struct seccomp_filter *f;
+- u32 ret = SECCOMP_RET_KILL;
++ u32 ret = SECCOMP_RET_ALLOW;
++
++ /* Ensure unexpected behavior doesn't result in failing open. */
++ if (WARN_ON(current->seccomp.filter == NULL))
++ return SECCOMP_RET_KILL;
++
+ /*
+ * All filters in the list are evaluated and the lowest BPF return
+- * value always takes priority.
++ * value always takes priority (ignoring the DATA).
+ */
+ for (f = current->seccomp.filter; f; f = f->prev) {
+- ret = sk_run_filter(NULL, f->insns);
+- if (ret != SECCOMP_RET_ALLOW)
+- break;
++ u32 cur_ret = sk_run_filter(NULL, f->insns);
++ if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION))
++ ret = cur_ret;
+ }
+ return ret;
+ }
+@@ -346,11 +351,13 @@ static int mode1_syscalls_32[] = {
+ };
+ #endif
+
+-void __secure_computing(int this_syscall)
++int __secure_computing(int this_syscall)
+ {
+ int mode = current->seccomp.mode;
+ int exit_sig = 0;
+ int *syscall;
++ u32 ret = SECCOMP_RET_KILL;
++ int data;
+
+ switch (mode) {
+ case SECCOMP_MODE_STRICT:
+@@ -361,14 +368,26 @@ void __secure_computing(int this_syscall)
+ #endif
+ do {
+ if (*syscall == this_syscall)
+- return;
++ return 0;
+ } while (*++syscall);
+ exit_sig = SIGKILL;
+ break;
+ #ifdef CONFIG_SECCOMP_FILTER
+ case SECCOMP_MODE_FILTER:
+- if (seccomp_run_filters(this_syscall) == SECCOMP_RET_ALLOW)
+- return;
++ ret = seccomp_run_filters(this_syscall);
++ data = ret & SECCOMP_RET_DATA;
++ switch (ret & SECCOMP_RET_ACTION) {
++ case SECCOMP_RET_ERRNO:
++ /* Set the low-order 16-bits as a errno. */
++ syscall_set_return_value(current, task_pt_regs(current),
++ -data, 0);
++ goto skip;
++ case SECCOMP_RET_ALLOW:
++ return 0;
++ case SECCOMP_RET_KILL:
++ default:
++ break;
++ }
+ exit_sig = SIGSYS;
+ break;
+ #endif
+@@ -379,8 +398,11 @@ void __secure_computing(int this_syscall)
+ #ifdef SECCOMP_DEBUG
+ dump_stack();
+ #endif
+- audit_seccomp(this_syscall, exit_code, SECCOMP_RET_KILL);
++ audit_seccomp(this_syscall, exit_sig, ret);
+ do_exit(exit_sig);
++skip:
++ audit_seccomp(this_syscall, exit_sig, ret);
++ return -1;
+ }
+
+ long prctl_get_seccomp(void)
+--
+1.7.9.1
+
diff --git a/features/seccomp/seccomp-add-system-call-filtering-using-BPF.patch b/features/seccomp/seccomp-add-system-call-filtering-using-BPF.patch
new file mode 100644
index 00000000..908f3cfd
--- /dev/null
+++ b/features/seccomp/seccomp-add-system-call-filtering-using-BPF.patch
@@ -0,0 +1,820 @@
+From 01c9617a2eca38f68d917ae16bdf8c2c8d863c8e Mon Sep 17 00:00:00 2001
+From: Will Drewry <wad@chromium.org>
+Date: Thu, 12 Apr 2012 16:47:57 -0500
+Subject: [PATCH] seccomp: add system call filtering using BPF
+
+commit e2cfabdfd075648216f99c2c03821cf3f47c1727 upstream.
+
+[This patch depends on luto@mit.edu's no_new_privs patch:
+ https://lkml.org/lkml/2012/1/30/264
+ The whole series including Andrew's patches can be found here:
+ https://github.com/redpig/linux/tree/seccomp
+ Complete diff here:
+ https://github.com/redpig/linux/compare/1dc65fed...seccomp
+]
+
+This patch adds support for seccomp mode 2. Mode 2 introduces the
+ability for unprivileged processes to install system call filtering
+policy expressed in terms of a Berkeley Packet Filter (BPF) program.
+This program will be evaluated in the kernel for each system call
+the task makes and computes a result based on data in the format
+of struct seccomp_data.
+
+A filter program may be installed by calling:
+ struct sock_fprog fprog = { ... };
+ ...
+ prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &fprog);
+
+The return value of the filter program determines if the system call is
+allowed to proceed or denied. If the first filter program installed
+allows prctl(2) calls, then the above call may be made repeatedly
+by a task to further reduce its access to the kernel. All attached
+programs must be evaluated before a system call will be allowed to
+proceed.
+
+Filter programs will be inherited across fork/clone and execve.
+However, if the task attaching the filter is unprivileged
+(!CAP_SYS_ADMIN) the no_new_privs bit will be set on the task. This
+ensures that unprivileged tasks cannot attach filters that affect
+privileged tasks (e.g., setuid binary).
+
+There are a number of benefits to this approach, a few of which are
+as follows:
+- BPF has been exposed to userland for a long time
+- BPF optimization (and JIT'ing) are well understood
+- Userland already knows its ABI: system call numbers and desired
+ arguments
+- No time-of-check-time-of-use vulnerable data accesses are possible.
+- system call arguments are loaded on access only to minimize copying
+ required for system call policy decisions.
+
+Mode 2 support is restricted to architectures that enable
+HAVE_ARCH_SECCOMP_FILTER. In this patch, the primary dependency is on
+syscall_get_arguments(). The full desired scope of this feature will
+add a few minor additional requirements expressed later in this series.
+Based on discussion, SECCOMP_RET_ERRNO and SECCOMP_RET_TRACE seem to be
+the desired additional functionality.
+
+No architectures are enabled in this patch.
+
+Signed-off-by: Will Drewry <wad@chromium.org>
+Acked-by: Serge Hallyn <serge.hallyn@canonical.com>
+Reviewed-by: Indan Zupancic <indan@nul.nu>
+Acked-by: Eric Paris <eparis@redhat.com>
+Reviewed-by: Kees Cook <keescook@chromium.org>
+
+v18: - rebase to v3.4-rc2
+ - s/chk/check/ (akpm@linux-foundation.org,jmorris@namei.org)
+ - allocate with GFP_KERNEL|__GFP_NOWARN (indan@nul.nu)
+ - add a comment for get_u32 regarding endianness (akpm@)
+ - fix other typos, style mistakes (akpm@)
+ - added acked-by
+v17: - properly guard seccomp filter needed headers (leann@ubuntu.com)
+ - tighten return mask to 0x7fff0000
+v16: - no change
+v15: - add a 4 instr penalty when counting a path to account for seccomp_filter
+ size (indan@nul.nu)
+ - drop the max insns to 256KB (indan@nul.nu)
+ - return ENOMEM if the max insns limit has been hit (indan@nul.nu)
+ - move IP checks after args (indan@nul.nu)
+ - drop !user_filter check (indan@nul.nu)
+ - only allow explicit bpf codes (indan@nul.nu)
+ - exit_code -> exit_sig
+v14: - put/get_seccomp_filter takes struct task_struct
+ (indan@nul.nu,keescook@chromium.org)
+ - adds seccomp_chk_filter and drops general bpf_run/chk_filter user
+ - add seccomp_bpf_load for use by net/core/filter.c
+ - lower max per-process/per-hierarchy: 1MB
+ - moved nnp/capability check prior to allocation
+ (all of the above: indan@nul.nu)
+v13: - rebase on to 88ebdda6159ffc15699f204c33feb3e431bf9bdc
+v12: - added a maximum instruction count per path (indan@nul.nu,oleg@redhat.com)
+ - removed copy_seccomp (keescook@chromium.org,indan@nul.nu)
+ - reworded the prctl_set_seccomp comment (indan@nul.nu)
+v11: - reorder struct seccomp_data to allow future args expansion (hpa@zytor.com)
+ - style clean up, @compat dropped, compat_sock_fprog32 (indan@nul.nu)
+ - do_exit(SIGSYS) (keescook@chromium.org, luto@mit.edu)
+ - pare down Kconfig doc reference.
+ - extra comment clean up
+v10: - seccomp_data has changed again to be more aesthetically pleasing
+ (hpa@zytor.com)
+ - calling convention is noted in a new u32 field using syscall_get_arch.
+ This allows for cross-calling convention tasks to use seccomp filters.
+ (hpa@zytor.com)
+ - lots of clean up (thanks, Indan!)
+ v9: - n/a
+ v8: - use bpf_chk_filter, bpf_run_filter. update load_fns
+ - Lots of fixes courtesy of indan@nul.nu:
+ -- fix up load behavior, compat fixups, and merge alloc code,
+ -- renamed pc and dropped __packed, use bool compat.
+ -- Added a hidden CONFIG_SECCOMP_FILTER to synthesize non-arch
+ dependencies
+ v7: (massive overhaul thanks to Indan, others)
+ - added CONFIG_HAVE_ARCH_SECCOMP_FILTER
+ - merged into seccomp.c
+ - minimal seccomp_filter.h
+ - no config option (part of seccomp)
+ - no new prctl
+ - doesn't break seccomp on systems without asm/syscall.h
+ (works but arg access always fails)
+ - dropped seccomp_init_task, extra free functions, ...
+ - dropped the no-asm/syscall.h code paths
+ - merges with network sk_run_filter and sk_chk_filter
+ v6: - fix memory leak on attach compat check failure
+ - require no_new_privs || CAP_SYS_ADMIN prior to filter
+ installation. (luto@mit.edu)
+ - s/seccomp_struct_/seccomp_/ for macros/functions (amwang@redhat.com)
+ - cleaned up Kconfig (amwang@redhat.com)
+ - on block, note if the call was compat (so the # means something)
+ v5: - uses syscall_get_arguments
+ (indan@nul.nu,oleg@redhat.com, mcgrathr@chromium.org)
+ - uses union-based arg storage with hi/lo struct to
+ handle endianness. Compromises between the two alternate
+ proposals to minimize extra arg shuffling and account for
+ endianness assuming userspace uses offsetof().
+ (mcgrathr@chromium.org, indan@nul.nu)
+ - update Kconfig description
+ - add include/seccomp_filter.h and add its installation
+ - (naive) on-demand syscall argument loading
+ - drop seccomp_t (eparis@redhat.com)
+ v4: - adjusted prctl to make room for PR_[SG]ET_NO_NEW_PRIVS
+ - now uses current->no_new_privs
+ (luto@mit.edu,torvalds@linux-foundation.com)
+ - assign names to seccomp modes (rdunlap@xenotime.net)
+ - fix style issues (rdunlap@xenotime.net)
+ - reworded Kconfig entry (rdunlap@xenotime.net)
+ v3: - macros to inline (oleg@redhat.com)
+ - init_task behavior fixed (oleg@redhat.com)
+ - drop creator entry and extra NULL check (oleg@redhat.com)
+ - alloc returns -EINVAL on bad sizing (serge.hallyn@canonical.com)
+ - adds tentative use of "always_unprivileged" as per
+ torvalds@linux-foundation.org and luto@mit.edu
+ v2: - (patch 2 only)
+Signed-off-by: James Morris <james.l.morris@oracle.com>
+Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
+---
+ arch/Kconfig | 17 ++
+ include/linux/Kbuild | 1 +
+ include/linux/seccomp.h | 76 +++++++++-
+ kernel/fork.c | 3 +
+ kernel/seccomp.c | 396 ++++++++++++++++++++++++++++++++++++++++++++--
+ kernel/sys.c | 2 +-
+ 6 files changed, 472 insertions(+), 23 deletions(-)
+
+diff --git a/arch/Kconfig b/arch/Kconfig
+index 684eb5a..91c2c73 100644
+--- a/arch/Kconfig
++++ b/arch/Kconfig
+@@ -216,4 +216,21 @@ config HAVE_CMPXCHG_DOUBLE
+ config ARCH_WANT_OLD_COMPAT_IPC
+ bool
+
++config HAVE_ARCH_SECCOMP_FILTER
++ bool
++ help
++ This symbol should be selected by an architecure if it provides
++ asm/syscall.h, specifically syscall_get_arguments() and
++ syscall_get_arch().
++
++config SECCOMP_FILTER
++ def_bool y
++ depends on HAVE_ARCH_SECCOMP_FILTER && SECCOMP && NET
++ help
++ Enable tasks to build secure computing environments defined
++ in terms of Berkeley Packet Filter programs which implement
++ task-defined system call filtering polices.
++
++ See Documentation/prctl/seccomp_filter.txt for details.
++
+ source "kernel/gcov/Kconfig"
+diff --git a/include/linux/Kbuild b/include/linux/Kbuild
+index 50f55c7..bc82495 100644
+--- a/include/linux/Kbuild
++++ b/include/linux/Kbuild
+@@ -333,6 +333,7 @@ header-y += scc.h
+ header-y += sched.h
+ header-y += screen_info.h
+ header-y += sdla.h
++header-y += seccomp.h
+ header-y += securebits.h
+ header-y += selinux_netlink.h
+ header-y += sem.h
+diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
+index d61f27f..86bb68f 100644
+--- a/include/linux/seccomp.h
++++ b/include/linux/seccomp.h
+@@ -1,14 +1,67 @@
+ #ifndef _LINUX_SECCOMP_H
+ #define _LINUX_SECCOMP_H
+
++#include <linux/compiler.h>
++#include <linux/types.h>
++
++
++/* Valid values for seccomp.mode and prctl(PR_SET_SECCOMP, <mode>) */
++#define SECCOMP_MODE_DISABLED 0 /* seccomp is not in use. */
++#define SECCOMP_MODE_STRICT 1 /* uses hard-coded filter. */
++#define SECCOMP_MODE_FILTER 2 /* uses user-supplied filter. */
++
++/*
++ * All BPF programs must return a 32-bit value.
++ * The bottom 16-bits are reserved for future use.
++ * The upper 16-bits are ordered from least permissive values to most.
++ *
++ * The ordering ensures that a min_t() over composed return values always
++ * selects the least permissive choice.
++ */
++#define SECCOMP_RET_KILL 0x00000000U /* kill the task immediately */
++#define SECCOMP_RET_ALLOW 0x7fff0000U /* allow */
++
++/* Masks for the return value sections. */
++#define SECCOMP_RET_ACTION 0x7fff0000U
++#define SECCOMP_RET_DATA 0x0000ffffU
++
++/**
++ * struct seccomp_data - the format the BPF program executes over.
++ * @nr: the system call number
++ * @arch: indicates system call convention as an AUDIT_ARCH_* value
++ * as defined in <linux/audit.h>.
++ * @instruction_pointer: at the time of the system call.
++ * @args: up to 6 system call arguments always stored as 64-bit values
++ * regardless of the architecture.
++ */
++struct seccomp_data {
++ int nr;
++ __u32 arch;
++ __u64 instruction_pointer;
++ __u64 args[6];
++};
+
++#ifdef __KERNEL__
+ #ifdef CONFIG_SECCOMP
+
+ #include <linux/thread_info.h>
+ #include <asm/seccomp.h>
+
++struct seccomp_filter;
++/**
++ * struct seccomp - the state of a seccomp'ed process
++ *
++ * @mode: indicates one of the valid values above for controlled
++ * system calls available to a process.
++ * @filter: The metadata and ruleset for determining what system calls
++ * are allowed for a task.
++ *
++ * @filter must only be accessed from the context of current as there
++ * is no locking.
++ */
+ struct seccomp {
+ int mode;
++ struct seccomp_filter *filter;
+ };
+
+ extern void __secure_computing(int);
+@@ -19,7 +72,7 @@ static inline void secure_computing(int this_syscall)
+ }
+
+ extern long prctl_get_seccomp(void);
+-extern long prctl_set_seccomp(unsigned long);
++extern long prctl_set_seccomp(unsigned long, char __user *);
+
+ static inline int seccomp_mode(struct seccomp *s)
+ {
+@@ -31,15 +84,16 @@ static inline int seccomp_mode(struct seccomp *s)
+ #include <linux/errno.h>
+
+ struct seccomp { };
++struct seccomp_filter { };
+
+-#define secure_computing(x) do { } while (0)
++#define secure_computing(x) 0
+
+ static inline long prctl_get_seccomp(void)
+ {
+ return -EINVAL;
+ }
+
+-static inline long prctl_set_seccomp(unsigned long arg2)
++static inline long prctl_set_seccomp(unsigned long arg2, char __user *arg3)
+ {
+ return -EINVAL;
+ }
+@@ -48,7 +102,21 @@ static inline int seccomp_mode(struct seccomp *s)
+ {
+ return 0;
+ }
+-
+ #endif /* CONFIG_SECCOMP */
+
++#ifdef CONFIG_SECCOMP_FILTER
++extern void put_seccomp_filter(struct task_struct *tsk);
++extern void get_seccomp_filter(struct task_struct *tsk);
++extern u32 seccomp_bpf_load(int off);
++#else /* CONFIG_SECCOMP_FILTER */
++static inline void put_seccomp_filter(struct task_struct *tsk)
++{
++ return;
++}
++static inline void get_seccomp_filter(struct task_struct *tsk)
++{
++ return;
++}
++#endif /* CONFIG_SECCOMP_FILTER */
++#endif /* __KERNEL__ */
+ #endif /* _LINUX_SECCOMP_H */
+diff --git a/kernel/fork.c b/kernel/fork.c
+index 8163333..9faa812 100644
+--- a/kernel/fork.c
++++ b/kernel/fork.c
+@@ -34,6 +34,7 @@
+ #include <linux/cgroup.h>
+ #include <linux/security.h>
+ #include <linux/hugetlb.h>
++#include <linux/seccomp.h>
+ #include <linux/swap.h>
+ #include <linux/syscalls.h>
+ #include <linux/jiffies.h>
+@@ -171,6 +172,7 @@ void free_task(struct task_struct *tsk)
+ free_thread_info(tsk->stack);
+ rt_mutex_debug_task_free(tsk);
+ ftrace_graph_exit_task(tsk);
++ put_seccomp_filter(tsk);
+ free_task_struct(tsk);
+ }
+ EXPORT_SYMBOL(free_task);
+@@ -1164,6 +1166,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
+ goto fork_out;
+
+ ftrace_graph_init_task(p);
++ get_seccomp_filter(p);
+
+ rt_mutex_init_task(p);
+
+diff --git a/kernel/seccomp.c b/kernel/seccomp.c
+index e8d76c5..0aeec19 100644
+--- a/kernel/seccomp.c
++++ b/kernel/seccomp.c
+@@ -3,16 +3,343 @@
+ *
+ * Copyright 2004-2005 Andrea Arcangeli <andrea@cpushare.com>
+ *
+- * This defines a simple but solid secure-computing mode.
++ * Copyright (C) 2012 Google, Inc.
++ * Will Drewry <wad@chromium.org>
++ *
++ * This defines a simple but solid secure-computing facility.
++ *
++ * Mode 1 uses a fixed list of allowed system calls.
++ * Mode 2 allows user-defined system call filters in the form
++ * of Berkeley Packet Filters/Linux Socket Filters.
+ */
+
++#include <linux/atomic.h>
+ #include <linux/audit.h>
+-#include <linux/seccomp.h>
+-#include <linux/sched.h>
+ #include <linux/compat.h>
++#include <linux/sched.h>
++#include <linux/seccomp.h>
+
+ /* #define SECCOMP_DEBUG 1 */
+-#define NR_SECCOMP_MODES 1
++
++#ifdef CONFIG_SECCOMP_FILTER
++#include <asm/syscall.h>
++#include <linux/filter.h>
++#include <linux/security.h>
++#include <linux/slab.h>
++#include <linux/tracehook.h>
++#include <linux/uaccess.h>
++
++/**
++ * struct seccomp_filter - container for seccomp BPF programs
++ *
++ * @usage: reference count to manage the object lifetime.
++ * get/put helpers should be used when accessing an instance
++ * outside of a lifetime-guarded section. In general, this
++ * is only needed for handling filters shared across tasks.
++ * @prev: points to a previously installed, or inherited, filter
++ * @len: the number of instructions in the program
++ * @insns: the BPF program instructions to evaluate
++ *
++ * seccomp_filter objects are organized in a tree linked via the @prev
++ * pointer. For any task, it appears to be a singly-linked list starting
++ * with current->seccomp.filter, the most recently attached or inherited filter.
++ * However, multiple filters may share a @prev node, by way of fork(), which
++ * results in a unidirectional tree existing in memory. This is similar to
++ * how namespaces work.
++ *
++ * seccomp_filter objects should never be modified after being attached
++ * to a task_struct (other than @usage).
++ */
++struct seccomp_filter {
++ atomic_t usage;
++ struct seccomp_filter *prev;
++ unsigned short len; /* Instruction count */
++ struct sock_filter insns[];
++};
++
++/* Limit any path through the tree to 256KB worth of instructions. */
++#define MAX_INSNS_PER_PATH ((1 << 18) / sizeof(struct sock_filter))
++
++static void seccomp_filter_log_failure(int syscall)
++{
++ int compat = 0;
++#ifdef CONFIG_COMPAT
++ compat = is_compat_task();
++#endif
++ pr_info("%s[%d]: %ssystem call %d blocked at 0x%lx\n",
++ current->comm, task_pid_nr(current),
++ (compat ? "compat " : ""),
++ syscall, KSTK_EIP(current));
++}
++
++/**
++ * get_u32 - returns a u32 offset into data
++ * @data: a unsigned 64 bit value
++ * @index: 0 or 1 to return the first or second 32-bits
++ *
++ * This inline exists to hide the length of unsigned long. If a 32-bit
++ * unsigned long is passed in, it will be extended and the top 32-bits will be
++ * 0. If it is a 64-bit unsigned long, then whatever data is resident will be
++ * properly returned.
++ *
++ * Endianness is explicitly ignored and left for BPF program authors to manage
++ * as per the specific architecture.
++ */
++static inline u32 get_u32(u64 data, int index)
++{
++ return ((u32 *)&data)[index];
++}
++
++/* Helper for bpf_load below. */
++#define BPF_DATA(_name) offsetof(struct seccomp_data, _name)
++/**
++ * bpf_load: checks and returns a pointer to the requested offset
++ * @off: offset into struct seccomp_data to load from
++ *
++ * Returns the requested 32-bits of data.
++ * seccomp_check_filter() should assure that @off is 32-bit aligned
++ * and not out of bounds. Failure to do so is a BUG.
++ */
++u32 seccomp_bpf_load(int off)
++{
++ struct pt_regs *regs = task_pt_regs(current);
++ if (off == BPF_DATA(nr))
++ return syscall_get_nr(current, regs);
++ if (off == BPF_DATA(arch))
++ return syscall_get_arch(current, regs);
++ if (off >= BPF_DATA(args[0]) && off < BPF_DATA(args[6])) {
++ unsigned long value;
++ int arg = (off - BPF_DATA(args[0])) / sizeof(u64);
++ int index = !!(off % sizeof(u64));
++ syscall_get_arguments(current, regs, arg, 1, &value);
++ return get_u32(value, index);
++ }
++ if (off == BPF_DATA(instruction_pointer))
++ return get_u32(KSTK_EIP(current), 0);
++ if (off == BPF_DATA(instruction_pointer) + sizeof(u32))
++ return get_u32(KSTK_EIP(current), 1);
++ /* seccomp_check_filter should make this impossible. */
++ BUG();
++}
++
++/**
++ * seccomp_check_filter - verify seccomp filter code
++ * @filter: filter to verify
++ * @flen: length of filter
++ *
++ * Takes a previously checked filter (by sk_chk_filter) and
++ * redirects all filter code that loads struct sk_buff data
++ * and related data through seccomp_bpf_load. It also
++ * enforces length and alignment checking of those loads.
++ *
++ * Returns 0 if the rule set is legal or -EINVAL if not.
++ */
++static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
++{
++ int pc;
++ for (pc = 0; pc < flen; pc++) {
++ struct sock_filter *ftest = &filter[pc];
++ u16 code = ftest->code;
++ u32 k = ftest->k;
++
++ switch (code) {
++ case BPF_S_LD_W_ABS:
++ ftest->code = BPF_S_ANC_SECCOMP_LD_W;
++ /* 32-bit aligned and not out of bounds. */
++ if (k >= sizeof(struct seccomp_data) || k & 3)
++ return -EINVAL;
++ continue;
++ case BPF_S_LD_W_LEN:
++ ftest->code = BPF_S_LD_IMM;
++ ftest->k = sizeof(struct seccomp_data);
++ continue;
++ case BPF_S_LDX_W_LEN:
++ ftest->code = BPF_S_LDX_IMM;
++ ftest->k = sizeof(struct seccomp_data);
++ continue;
++ /* Explicitly include allowed calls. */
++ case BPF_S_RET_K:
++ case BPF_S_RET_A:
++ case BPF_S_ALU_ADD_K:
++ case BPF_S_ALU_ADD_X:
++ case BPF_S_ALU_SUB_K:
++ case BPF_S_ALU_SUB_X:
++ case BPF_S_ALU_MUL_K:
++ case BPF_S_ALU_MUL_X:
++ case BPF_S_ALU_DIV_X:
++ case BPF_S_ALU_AND_K:
++ case BPF_S_ALU_AND_X:
++ case BPF_S_ALU_OR_K:
++ case BPF_S_ALU_OR_X:
++ case BPF_S_ALU_LSH_K:
++ case BPF_S_ALU_LSH_X:
++ case BPF_S_ALU_RSH_K:
++ case BPF_S_ALU_RSH_X:
++ case BPF_S_ALU_NEG:
++ case BPF_S_LD_IMM:
++ case BPF_S_LDX_IMM:
++ case BPF_S_MISC_TAX:
++ case BPF_S_MISC_TXA:
++ case BPF_S_ALU_DIV_K:
++ case BPF_S_LD_MEM:
++ case BPF_S_LDX_MEM:
++ case BPF_S_ST:
++ case BPF_S_STX:
++ case BPF_S_JMP_JA:
++ case BPF_S_JMP_JEQ_K:
++ case BPF_S_JMP_JEQ_X:
++ case BPF_S_JMP_JGE_K:
++ case BPF_S_JMP_JGE_X:
++ case BPF_S_JMP_JGT_K:
++ case BPF_S_JMP_JGT_X:
++ case BPF_S_JMP_JSET_K:
++ case BPF_S_JMP_JSET_X:
++ continue;
++ default:
++ return -EINVAL;
++ }
++ }
++ return 0;
++}
++
++/**
++ * seccomp_run_filters - evaluates all seccomp filters against @syscall
++ * @syscall: number of the current system call
++ *
++ * Returns valid seccomp BPF response codes.
++ */
++static u32 seccomp_run_filters(int syscall)
++{
++ struct seccomp_filter *f;
++ u32 ret = SECCOMP_RET_KILL;
++ /*
++ * All filters in the list are evaluated and the lowest BPF return
++ * value always takes priority.
++ */
++ for (f = current->seccomp.filter; f; f = f->prev) {
++ ret = sk_run_filter(NULL, f->insns);
++ if (ret != SECCOMP_RET_ALLOW)
++ break;
++ }
++ return ret;
++}
++
++/**
++ * seccomp_attach_filter: Attaches a seccomp filter to current.
++ * @fprog: BPF program to install
++ *
++ * Returns 0 on success or an errno on failure.
++ */
++static long seccomp_attach_filter(struct sock_fprog *fprog)
++{
++ struct seccomp_filter *filter;
++ unsigned long fp_size = fprog->len * sizeof(struct sock_filter);
++ unsigned long total_insns = fprog->len;
++ long ret;
++
++ if (fprog->len == 0 || fprog->len > BPF_MAXINSNS)
++ return -EINVAL;
++
++ for (filter = current->seccomp.filter; filter; filter = filter->prev)
++ total_insns += filter->len + 4; /* include a 4 instr penalty */
++ if (total_insns > MAX_INSNS_PER_PATH)
++ return -ENOMEM;
++
++ /*
++ * Installing a seccomp filter requires that the task have
++ * CAP_SYS_ADMIN in its namespace or be running with no_new_privs.
++ * This avoids scenarios where unprivileged tasks can affect the
++ * behavior of privileged children.
++ */
++ if (!current->no_new_privs &&
++ security_capable_noaudit(current_cred(), current_user_ns(),
++ CAP_SYS_ADMIN) != 0)
++ return -EACCES;
++
++ /* Allocate a new seccomp_filter */
++ filter = kzalloc(sizeof(struct seccomp_filter) + fp_size,
++ GFP_KERNEL|__GFP_NOWARN);
++ if (!filter)
++ return -ENOMEM;
++ atomic_set(&filter->usage, 1);
++ filter->len = fprog->len;
++
++ /* Copy the instructions from fprog. */
++ ret = -EFAULT;
++ if (copy_from_user(filter->insns, fprog->filter, fp_size))
++ goto fail;
++
++ /* Check and rewrite the fprog via the skb checker */
++ ret = sk_chk_filter(filter->insns, filter->len);
++ if (ret)
++ goto fail;
++
++ /* Check and rewrite the fprog for seccomp use */
++ ret = seccomp_check_filter(filter->insns, filter->len);
++ if (ret)
++ goto fail;
++
++ /*
++ * If there is an existing filter, make it the prev and don't drop its
++ * task reference.
++ */
++ filter->prev = current->seccomp.filter;
++ current->seccomp.filter = filter;
++ return 0;
++fail:
++ kfree(filter);
++ return ret;
++}
++
++/**
++ * seccomp_attach_user_filter - attaches a user-supplied sock_fprog
++ * @user_filter: pointer to the user data containing a sock_fprog.
++ *
++ * Returns 0 on success and non-zero otherwise.
++ */
++long seccomp_attach_user_filter(char __user *user_filter)
++{
++ struct sock_fprog fprog;
++ long ret = -EFAULT;
++
++#ifdef CONFIG_COMPAT
++ if (is_compat_task()) {
++ struct compat_sock_fprog fprog32;
++ if (copy_from_user(&fprog32, user_filter, sizeof(fprog32)))
++ goto out;
++ fprog.len = fprog32.len;
++ fprog.filter = compat_ptr(fprog32.filter);
++ } else /* falls through to the if below. */
++#endif
++ if (copy_from_user(&fprog, user_filter, sizeof(fprog)))
++ goto out;
++ ret = seccomp_attach_filter(&fprog);
++out:
++ return ret;
++}
++
++/* get_seccomp_filter - increments the reference count of the filter on @tsk */
++void get_seccomp_filter(struct task_struct *tsk)
++{
++ struct seccomp_filter *orig = tsk->seccomp.filter;
++ if (!orig)
++ return;
++ /* Reference count is bounded by the number of total processes. */
++ atomic_inc(&orig->usage);
++}
++
++/* put_seccomp_filter - decrements the ref count of tsk->seccomp.filter */
++void put_seccomp_filter(struct task_struct *tsk)
++{
++ struct seccomp_filter *orig = tsk->seccomp.filter;
++ /* Clean up single-reference branches iteratively. */
++ while (orig && atomic_dec_and_test(&orig->usage)) {
++ struct seccomp_filter *freeme = orig;
++ orig = orig->prev;
++ kfree(freeme);
++ }
++}
++#endif /* CONFIG_SECCOMP_FILTER */
+
+ /*
+ * Secure computing mode 1 allows only read/write/exit/sigreturn.
+@@ -34,10 +361,11 @@ static int mode1_syscalls_32[] = {
+ void __secure_computing(int this_syscall)
+ {
+ int mode = current->seccomp.mode;
+- int * syscall;
++ int exit_sig = 0;
++ int *syscall;
+
+ switch (mode) {
+- case 1:
++ case SECCOMP_MODE_STRICT:
+ syscall = mode1_syscalls;
+ #ifdef CONFIG_COMPAT
+ if (is_compat_task())
+@@ -47,7 +375,16 @@ void __secure_computing(int this_syscall)
+ if (*syscall == this_syscall)
+ return;
+ } while (*++syscall);
++ exit_sig = SIGKILL;
+ break;
++#ifdef CONFIG_SECCOMP_FILTER
++ case SECCOMP_MODE_FILTER:
++ if (seccomp_run_filters(this_syscall) == SECCOMP_RET_ALLOW)
++ return;
++ seccomp_filter_log_failure(this_syscall);
++ exit_sig = SIGSYS;
++ break;
++#endif
+ default:
+ BUG();
+ }
+@@ -56,7 +393,7 @@ void __secure_computing(int this_syscall)
+ dump_stack();
+ #endif
+ audit_seccomp(this_syscall);
+- do_exit(SIGKILL);
++ do_exit(exit_sig);
+ }
+
+ long prctl_get_seccomp(void)
+@@ -64,25 +401,48 @@ long prctl_get_seccomp(void)
+ return current->seccomp.mode;
+ }
+
+-long prctl_set_seccomp(unsigned long seccomp_mode)
++/**
++ * prctl_set_seccomp: configures current->seccomp.mode
++ * @seccomp_mode: requested mode to use
++ * @filter: optional struct sock_fprog for use with SECCOMP_MODE_FILTER
++ *
++ * This function may be called repeatedly with a @seccomp_mode of
++ * SECCOMP_MODE_FILTER to install additional filters. Every filter
++ * successfully installed will be evaluated (in reverse order) for each system
++ * call the task makes.
++ *
++ * Once current->seccomp.mode is non-zero, it may not be changed.
++ *
++ * Returns 0 on success or -EINVAL on failure.
++ */
++long prctl_set_seccomp(unsigned long seccomp_mode, char __user *filter)
+ {
+- long ret;
++ long ret = -EINVAL;
+
+- /* can set it only once to be even more secure */
+- ret = -EPERM;
+- if (unlikely(current->seccomp.mode))
++ if (current->seccomp.mode &&
++ current->seccomp.mode != seccomp_mode)
+ goto out;
+
+- ret = -EINVAL;
+- if (seccomp_mode && seccomp_mode <= NR_SECCOMP_MODES) {
+- current->seccomp.mode = seccomp_mode;
+- set_thread_flag(TIF_SECCOMP);
++ switch (seccomp_mode) {
++ case SECCOMP_MODE_STRICT:
++ ret = 0;
+ #ifdef TIF_NOTSC
+ disable_TSC();
+ #endif
+- ret = 0;
++ break;
++#ifdef CONFIG_SECCOMP_FILTER
++ case SECCOMP_MODE_FILTER:
++ ret = seccomp_attach_user_filter(filter);
++ if (ret)
++ goto out;
++ break;
++#endif
++ default:
++ goto out;
+ }
+
+- out:
++ current->seccomp.mode = seccomp_mode;
++ set_thread_flag(TIF_SECCOMP);
++out:
+ return ret;
+ }
+diff --git a/kernel/sys.c b/kernel/sys.c
+index b82568b..ba0ae8e 100644
+--- a/kernel/sys.c
++++ b/kernel/sys.c
+@@ -1908,7 +1908,7 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
+ error = prctl_get_seccomp();
+ break;
+ case PR_SET_SECCOMP:
+- error = prctl_set_seccomp(arg2);
++ error = prctl_set_seccomp(arg2, (char __user *)arg3);
+ break;
+ case PR_GET_TSC:
+ error = GET_TSC_CTL(arg2);
+--
+1.7.9.1
+
diff --git a/features/seccomp/seccomp-kill-the-seccomp_t-typedef.patch b/features/seccomp/seccomp-kill-the-seccomp_t-typedef.patch
new file mode 100644
index 00000000..95ac5398
--- /dev/null
+++ b/features/seccomp/seccomp-kill-the-seccomp_t-typedef.patch
@@ -0,0 +1,88 @@
+From 1bed374c1210f2390a39e715243b3767f8958e3b Mon Sep 17 00:00:00 2001
+From: Will Drewry <wad@chromium.org>
+Date: Thu, 12 Apr 2012 16:47:54 -0500
+Subject: [PATCH] seccomp: kill the seccomp_t typedef
+
+commit 932ecebb0405b9a41cd18946e6cff8a17d434e23 upstream.
+
+Replaces the seccomp_t typedef with struct seccomp to match modern
+kernel style.
+
+Signed-off-by: Will Drewry <wad@chromium.org>
+Reviewed-by: James Morris <jmorris@namei.org>
+Acked-by: Serge Hallyn <serge.hallyn@canonical.com>
+Acked-by: Eric Paris <eparis@redhat.com>
+
+v18: rebase
+...
+v14: rebase/nochanges
+v13: rebase on to 88ebdda6159ffc15699f204c33feb3e431bf9bdc
+v12: rebase on to linux-next
+v8-v11: no changes
+v7: struct seccomp_struct -> struct seccomp
+v6: original inclusion in this series.
+Signed-off-by: James Morris <james.l.morris@oracle.com>
+Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
+---
+ include/linux/sched.h | 2 +-
+ include/linux/seccomp.h | 10 ++++++----
+ 2 files changed, 7 insertions(+), 5 deletions(-)
+
+diff --git a/include/linux/sched.h b/include/linux/sched.h
+index ba60897..cad1502 100644
+--- a/include/linux/sched.h
++++ b/include/linux/sched.h
+@@ -1452,7 +1452,7 @@ struct task_struct {
+ uid_t loginuid;
+ unsigned int sessionid;
+ #endif
+- seccomp_t seccomp;
++ struct seccomp seccomp;
+
+ /* Thread group tracking */
+ u32 parent_exec_id;
+diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
+index cc7a4e9..d61f27f 100644
+--- a/include/linux/seccomp.h
++++ b/include/linux/seccomp.h
+@@ -7,7 +7,9 @@
+ #include <linux/thread_info.h>
+ #include <asm/seccomp.h>
+
+-typedef struct { int mode; } seccomp_t;
++struct seccomp {
++ int mode;
++};
+
+ extern void __secure_computing(int);
+ static inline void secure_computing(int this_syscall)
+@@ -19,7 +21,7 @@ static inline void secure_computing(int this_syscall)
+ extern long prctl_get_seccomp(void);
+ extern long prctl_set_seccomp(unsigned long);
+
+-static inline int seccomp_mode(seccomp_t *s)
++static inline int seccomp_mode(struct seccomp *s)
+ {
+ return s->mode;
+ }
+@@ -28,7 +30,7 @@ static inline int seccomp_mode(seccomp_t *s)
+
+ #include <linux/errno.h>
+
+-typedef struct { } seccomp_t;
++struct seccomp { };
+
+ #define secure_computing(x) do { } while (0)
+
+@@ -42,7 +44,7 @@ static inline long prctl_set_seccomp(unsigned long arg2)
+ return -EINVAL;
+ }
+
+-static inline int seccomp_mode(seccomp_t *s)
++static inline int seccomp_mode(struct seccomp *s)
+ {
+ return 0;
+ }
+--
+1.7.9.1
+
diff --git a/features/seccomp/seccomp-remove-duplicated-failure-logging.patch b/features/seccomp/seccomp-remove-duplicated-failure-logging.patch
new file mode 100644
index 00000000..ed1e662b
--- /dev/null
+++ b/features/seccomp/seccomp-remove-duplicated-failure-logging.patch
@@ -0,0 +1,135 @@
+From 60ec12b5d1111e19e716ee5029296dc0550fad11 Mon Sep 17 00:00:00 2001
+From: Kees Cook <keescook@chromium.org>
+Date: Thu, 12 Apr 2012 16:47:58 -0500
+Subject: [PATCH] seccomp: remove duplicated failure logging
+
+commit 3dc1c1b2d2ed7507ce8a379814ad75745ff97ebe upstream.
+
+This consolidates the seccomp filter error logging path and adds more
+details to the audit log.
+
+Signed-off-by: Will Drewry <wad@chromium.org>
+Signed-off-by: Kees Cook <keescook@chromium.org>
+Acked-by: Eric Paris <eparis@redhat.com>
+
+v18: make compat= permanent in the record
+v15: added a return code to the audit_seccomp path by wad@chromium.org
+ (suggested by eparis@redhat.com)
+v*: original by keescook@chromium.org
+Signed-off-by: James Morris <james.l.morris@oracle.com>
+Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
+---
+ include/linux/audit.h | 8 ++++----
+ kernel/auditsc.c | 8 ++++++--
+ kernel/seccomp.c | 15 +--------------
+ 3 files changed, 11 insertions(+), 20 deletions(-)
+
+diff --git a/include/linux/audit.h b/include/linux/audit.h
+index ed3ef19..22f292a 100644
+--- a/include/linux/audit.h
++++ b/include/linux/audit.h
+@@ -463,7 +463,7 @@ extern void audit_putname(const char *name);
+ extern void __audit_inode(const char *name, const struct dentry *dentry);
+ extern void __audit_inode_child(const struct dentry *dentry,
+ const struct inode *parent);
+-extern void __audit_seccomp(unsigned long syscall);
++extern void __audit_seccomp(unsigned long syscall, long signr, int code);
+ extern void __audit_ptrace(struct task_struct *t);
+
+ static inline int audit_dummy_context(void)
+@@ -508,10 +508,10 @@ static inline void audit_inode_child(const struct dentry *dentry,
+ }
+ void audit_core_dumps(long signr);
+
+-static inline void audit_seccomp(unsigned long syscall)
++static inline void audit_seccomp(unsigned long syscall, long signr, int code)
+ {
+ if (unlikely(!audit_dummy_context()))
+- __audit_seccomp(syscall);
++ __audit_seccomp(syscall, signr, code);
+ }
+
+ static inline void audit_ptrace(struct task_struct *t)
+@@ -634,7 +634,7 @@ extern int audit_signals;
+ #define audit_inode(n,d) do { (void)(d); } while (0)
+ #define audit_inode_child(i,p) do { ; } while (0)
+ #define audit_core_dumps(i) do { ; } while (0)
+-#define audit_seccomp(i) do { ; } while (0)
++#define audit_seccomp(i,s,c) do { ; } while (0)
+ #define auditsc_get_stamp(c,t,s) (0)
+ #define audit_get_loginuid(t) (-1)
+ #define audit_get_sessionid(t) (-1)
+diff --git a/kernel/auditsc.c b/kernel/auditsc.c
+index af1de0f..4b96415 100644
+--- a/kernel/auditsc.c
++++ b/kernel/auditsc.c
+@@ -67,6 +67,7 @@
+ #include <linux/syscalls.h>
+ #include <linux/capability.h>
+ #include <linux/fs_struct.h>
++#include <linux/compat.h>
+
+ #include "audit.h"
+
+@@ -2710,13 +2711,16 @@ void audit_core_dumps(long signr)
+ audit_log_end(ab);
+ }
+
+-void __audit_seccomp(unsigned long syscall)
++void __audit_seccomp(unsigned long syscall, long signr, int code)
+ {
+ struct audit_buffer *ab;
+
+ ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND);
+- audit_log_abend(ab, "seccomp", SIGKILL);
++ audit_log_abend(ab, "seccomp", signr);
+ audit_log_format(ab, " syscall=%ld", syscall);
++ audit_log_format(ab, " compat=%d", is_compat_task());
++ audit_log_format(ab, " ip=0x%lx", KSTK_EIP(current));
++ audit_log_format(ab, " code=0x%x", code);
+ audit_log_end(ab);
+ }
+
+diff --git a/kernel/seccomp.c b/kernel/seccomp.c
+index 0aeec19..0f7c709 100644
+--- a/kernel/seccomp.c
++++ b/kernel/seccomp.c
+@@ -60,18 +60,6 @@ struct seccomp_filter {
+ /* Limit any path through the tree to 256KB worth of instructions. */
+ #define MAX_INSNS_PER_PATH ((1 << 18) / sizeof(struct sock_filter))
+
+-static void seccomp_filter_log_failure(int syscall)
+-{
+- int compat = 0;
+-#ifdef CONFIG_COMPAT
+- compat = is_compat_task();
+-#endif
+- pr_info("%s[%d]: %ssystem call %d blocked at 0x%lx\n",
+- current->comm, task_pid_nr(current),
+- (compat ? "compat " : ""),
+- syscall, KSTK_EIP(current));
+-}
+-
+ /**
+ * get_u32 - returns a u32 offset into data
+ * @data: a unsigned 64 bit value
+@@ -381,7 +369,6 @@ void __secure_computing(int this_syscall)
+ case SECCOMP_MODE_FILTER:
+ if (seccomp_run_filters(this_syscall) == SECCOMP_RET_ALLOW)
+ return;
+- seccomp_filter_log_failure(this_syscall);
+ exit_sig = SIGSYS;
+ break;
+ #endif
+@@ -392,7 +379,7 @@ void __secure_computing(int this_syscall)
+ #ifdef SECCOMP_DEBUG
+ dump_stack();
+ #endif
+- audit_seccomp(this_syscall);
++ audit_seccomp(this_syscall, exit_code, SECCOMP_RET_KILL);
+ do_exit(exit_sig);
+ }
+
+--
+1.7.9.1
+
diff --git a/features/seccomp/seccomp.scc b/features/seccomp/seccomp.scc
new file mode 100644
index 00000000..7ceac72a
--- /dev/null
+++ b/features/seccomp/seccomp.scc
@@ -0,0 +1,15 @@
+patch Add-PR_-GET-SET-_NO_NEW_PRIVS-to-prevent-execve-from.patch
+patch Fix-execve-behavior-apparmor-for-PR_-GET-SET-_NO_NEW.patch
+patch sk_run_filter-add-BPF_S_ANC_SECCOMP_LD_W.patch
+patch net-compat.c-linux-filter.h-share-compat_sock_fprog.patch
+patch seccomp-kill-the-seccomp_t-typedef.patch
+patch asm-syscall.h-add-syscall_get_arch.patch
+patch arch-x86-add-syscall_get_arch-to-syscall.h.patch
+patch seccomp-add-system-call-filtering-using-BPF.patch
+patch seccomp-remove-duplicated-failure-logging.patch
+patch seccomp-add-SECCOMP_RET_ERRNO.patch
+patch signal-x86-add-SIGSYS-info-and-make-it-synchronous.patch
+patch seccomp-Add-SECCOMP_RET_TRAP.patch
+patch ptrace-seccomp-Add-PTRACE_SECCOMP-support.patch
+patch x86-Enable-HAVE_ARCH_SECCOMP_FILTER.patch
+patch Documentation-prctl-seccomp_filter.patch
diff --git a/features/seccomp/signal-x86-add-SIGSYS-info-and-make-it-synchronous.patch b/features/seccomp/signal-x86-add-SIGSYS-info-and-make-it-synchronous.patch
new file mode 100644
index 00000000..735a9b94
--- /dev/null
+++ b/features/seccomp/signal-x86-add-SIGSYS-info-and-make-it-synchronous.patch
@@ -0,0 +1,174 @@
+From 5b84a784a5f5186e35aea6efad849d8898f527a2 Mon Sep 17 00:00:00 2001
+From: Will Drewry <wad@chromium.org>
+Date: Thu, 12 Apr 2012 16:48:00 -0500
+Subject: [PATCH] signal, x86: add SIGSYS info and make it synchronous.
+
+commit a0727e8ce513fe6890416da960181ceb10fbfae6 upstream.
+
+This change enables SIGSYS, defines _sifields._sigsys, and adds
+x86 (compat) arch support. _sigsys defines fields which allow
+a signal handler to receive the triggering system call number,
+the relevant AUDIT_ARCH_* value for that number, and the address
+of the callsite.
+
+SIGSYS is added to the SYNCHRONOUS_MASK because it is desirable for it
+to have setup_frame() called for it. The goal is to ensure that
+ucontext_t reflects the machine state from the time-of-syscall and not
+from another signal handler.
+
+The first consumer of SIGSYS would be seccomp filter. In particular,
+a filter program could specify a new return value, SECCOMP_RET_TRAP,
+which would result in the system call being denied and the calling
+thread signaled. This also means that implementing arch-specific
+support can be dependent upon HAVE_ARCH_SECCOMP_FILTER.
+
+Suggested-by: H. Peter Anvin <hpa@zytor.com>
+Signed-off-by: Will Drewry <wad@chromium.org>
+Acked-by: Serge Hallyn <serge.hallyn@canonical.com>
+Reviewed-by: H. Peter Anvin <hpa@zytor.com>
+Acked-by: Eric Paris <eparis@redhat.com>
+
+v18: - added acked by, rebase
+v17: - rebase and reviewed-by addition
+v14: - rebase/nochanges
+v13: - rebase on to 88ebdda6159ffc15699f204c33feb3e431bf9bdc
+v12: - reworded changelog (oleg@redhat.com)
+v11: - fix dropped words in the change description
+ - added fallback copy_siginfo support.
+ - added __ARCH_SIGSYS define to allow stepped arch support.
+v10: - first version based on suggestion
+Signed-off-by: James Morris <james.l.morris@oracle.com>
+Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
+---
+ arch/x86/ia32/ia32_signal.c | 4 ++++
+ arch/x86/include/asm/ia32.h | 6 ++++++
+ include/asm-generic/siginfo.h | 22 ++++++++++++++++++++++
+ kernel/signal.c | 9 ++++++++-
+ 4 files changed, 40 insertions(+), 1 deletions(-)
+
+diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c
+index a69245b..0b3f235 100644
+--- a/arch/x86/ia32/ia32_signal.c
++++ b/arch/x86/ia32/ia32_signal.c
+@@ -67,6 +67,10 @@ int copy_siginfo_to_user32(compat_siginfo_t __user *to, siginfo_t *from)
+ switch (from->si_code >> 16) {
+ case __SI_FAULT >> 16:
+ break;
++ case __SI_SYS >> 16:
++ put_user_ex(from->si_syscall, &to->si_syscall);
++ put_user_ex(from->si_arch, &to->si_arch);
++ break;
+ case __SI_CHLD >> 16:
+ if (ia32) {
+ put_user_ex(from->si_utime, &to->si_utime);
+diff --git a/arch/x86/include/asm/ia32.h b/arch/x86/include/asm/ia32.h
+index ee52760..b04cbdb 100644
+--- a/arch/x86/include/asm/ia32.h
++++ b/arch/x86/include/asm/ia32.h
+@@ -144,6 +144,12 @@ typedef struct compat_siginfo {
+ int _band; /* POLL_IN, POLL_OUT, POLL_MSG */
+ int _fd;
+ } _sigpoll;
++
++ struct {
++ unsigned int _call_addr; /* calling insn */
++ int _syscall; /* triggering system call number */
++ unsigned int _arch; /* AUDIT_ARCH_* of syscall */
++ } _sigsys;
+ } _sifields;
+ } compat_siginfo_t;
+
+diff --git a/include/asm-generic/siginfo.h b/include/asm-generic/siginfo.h
+index 5e5e386..d2c7f29 100644
+--- a/include/asm-generic/siginfo.h
++++ b/include/asm-generic/siginfo.h
+@@ -98,9 +98,18 @@ typedef struct siginfo {
+ __ARCH_SI_BAND_T _band; /* POLL_IN, POLL_OUT, POLL_MSG */
+ int _fd;
+ } _sigpoll;
++
++ /* SIGSYS */
++ struct {
++ void __user *_call_addr; /* calling insn */
++ int _syscall; /* triggering system call number */
++ unsigned int _arch; /* AUDIT_ARCH_* of syscall */
++ } _sigsys;
+ } _sifields;
+ } __ARCH_SI_ATTRIBUTES siginfo_t;
+
++/* If the arch shares siginfo, then it has SIGSYS. */
++#define __ARCH_SIGSYS
+ #endif
+
+ /*
+@@ -124,6 +133,11 @@ typedef struct siginfo {
+ #define si_addr_lsb _sifields._sigfault._addr_lsb
+ #define si_band _sifields._sigpoll._band
+ #define si_fd _sifields._sigpoll._fd
++#ifdef __ARCH_SIGSYS
++#define si_call_addr _sifields._sigsys._call_addr
++#define si_syscall _sifields._sigsys._syscall
++#define si_arch _sifields._sigsys._arch
++#endif
+
+ #ifdef __KERNEL__
+ #define __SI_MASK 0xffff0000u
+@@ -134,6 +148,7 @@ typedef struct siginfo {
+ #define __SI_CHLD (4 << 16)
+ #define __SI_RT (5 << 16)
+ #define __SI_MESGQ (6 << 16)
++#define __SI_SYS (7 << 16)
+ #define __SI_CODE(T,N) ((T) | ((N) & 0xffff))
+ #else
+ #define __SI_KILL 0
+@@ -143,6 +158,7 @@ typedef struct siginfo {
+ #define __SI_CHLD 0
+ #define __SI_RT 0
+ #define __SI_MESGQ 0
++#define __SI_SYS 0
+ #define __SI_CODE(T,N) (N)
+ #endif
+
+@@ -240,6 +256,12 @@ typedef struct siginfo {
+ #define NSIGPOLL 6
+
+ /*
++ * SIGSYS si_codes
++ */
++#define SYS_SECCOMP (__SI_SYS|1) /* seccomp triggered */
++#define NSIGSYS 1
++
++/*
+ * sigevent definitions
+ *
+ * It seems likely that SIGEV_THREAD will have to be handled from
+diff --git a/kernel/signal.c b/kernel/signal.c
+index 17afcaf..1a006b5 100644
+--- a/kernel/signal.c
++++ b/kernel/signal.c
+@@ -160,7 +160,7 @@ void recalc_sigpending(void)
+
+ #define SYNCHRONOUS_MASK \
+ (sigmask(SIGSEGV) | sigmask(SIGBUS) | sigmask(SIGILL) | \
+- sigmask(SIGTRAP) | sigmask(SIGFPE))
++ sigmask(SIGTRAP) | sigmask(SIGFPE) | sigmask(SIGSYS))
+
+ int next_signal(struct sigpending *pending, sigset_t *mask)
+ {
+@@ -2706,6 +2706,13 @@ int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from)
+ err |= __put_user(from->si_uid, &to->si_uid);
+ err |= __put_user(from->si_ptr, &to->si_ptr);
+ break;
++#ifdef __ARCH_SIGSYS
++ case __SI_SYS:
++ err |= __put_user(from->si_call_addr, &to->si_call_addr);
++ err |= __put_user(from->si_syscall, &to->si_syscall);
++ err |= __put_user(from->si_arch, &to->si_arch);
++ break;
++#endif
+ default: /* this is just in case for now ... */
+ err |= __put_user(from->si_pid, &to->si_pid);
+ err |= __put_user(from->si_uid, &to->si_uid);
+--
+1.7.9.1
+
diff --git a/features/seccomp/sk_run_filter-add-BPF_S_ANC_SECCOMP_LD_W.patch b/features/seccomp/sk_run_filter-add-BPF_S_ANC_SECCOMP_LD_W.patch
new file mode 100644
index 00000000..00a6038a
--- /dev/null
+++ b/features/seccomp/sk_run_filter-add-BPF_S_ANC_SECCOMP_LD_W.patch
@@ -0,0 +1,73 @@
+From 23be50acb6765e31a3c1c5b79421c81cce9dbbf9 Mon Sep 17 00:00:00 2001
+From: Will Drewry <wad@chromium.org>
+Date: Thu, 12 Apr 2012 16:47:52 -0500
+Subject: [PATCH] sk_run_filter: add BPF_S_ANC_SECCOMP_LD_W
+
+commit 46b325c7eb01482674406701825ff67f561ccdd4 upstream.
+
+Introduces a new BPF ancillary instruction that all LD calls will be
+mapped through when sk_run_filter() is being used for seccomp BPF. The
+rewriting will be done using a secondary chk_filter function that is run
+after sk_chk_filter.
+
+The code change is guarded by CONFIG_SECCOMP_FILTER which is added,
+along with the seccomp_bpf_load() function later in this series.
+
+This is based on http://lkml.org/lkml/2012/3/2/141
+
+Suggested-by: Indan Zupancic <indan@nul.nu>
+Signed-off-by: Will Drewry <wad@chromium.org>
+Acked-by: Eric Dumazet <eric.dumazet@gmail.com>
+Acked-by: Eric Paris <eparis@redhat.com>
+
+v18: rebase
+...
+v15: include seccomp.h explicitly for when seccomp_bpf_load exists.
+v14: First cut using a single additional instruction
+... v13: made bpf functions generic.
+Signed-off-by: James Morris <james.l.morris@oracle.com>
+Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
+---
+ include/linux/filter.h | 1 +
+ net/core/filter.c | 6 ++++++
+ 2 files changed, 7 insertions(+), 0 deletions(-)
+
+diff --git a/include/linux/filter.h b/include/linux/filter.h
+index 8eeb205..aaa2e80 100644
+--- a/include/linux/filter.h
++++ b/include/linux/filter.h
+@@ -228,6 +228,7 @@ enum {
+ BPF_S_ANC_HATYPE,
+ BPF_S_ANC_RXHASH,
+ BPF_S_ANC_CPU,
++ BPF_S_ANC_SECCOMP_LD_W,
+ };
+
+ #endif /* __KERNEL__ */
+diff --git a/net/core/filter.c b/net/core/filter.c
+index 6f755cc..491e2e1 100644
+--- a/net/core/filter.c
++++ b/net/core/filter.c
+@@ -38,6 +38,7 @@
+ #include <linux/filter.h>
+ #include <linux/reciprocal_div.h>
+ #include <linux/ratelimit.h>
++#include <linux/seccomp.h>
+
+ /* No hurry in this branch
+ *
+@@ -352,6 +353,11 @@ load_b:
+ A = 0;
+ continue;
+ }
++#ifdef CONFIG_SECCOMP_FILTER
++ case BPF_S_ANC_SECCOMP_LD_W:
++ A = seccomp_bpf_load(fentry->k);
++ continue;
++#endif
+ default:
+ WARN_RATELIMIT(1, "Unknown code:%u jt:%u tf:%u k:%u\n",
+ fentry->code, fentry->jt,
+--
+1.7.9.1
+
diff --git a/features/seccomp/x86-Enable-HAVE_ARCH_SECCOMP_FILTER.patch b/features/seccomp/x86-Enable-HAVE_ARCH_SECCOMP_FILTER.patch
new file mode 100644
index 00000000..9bf43d5b
--- /dev/null
+++ b/features/seccomp/x86-Enable-HAVE_ARCH_SECCOMP_FILTER.patch
@@ -0,0 +1,80 @@
+From 648b737bc10632617f45eff886dcc29398e717da Mon Sep 17 00:00:00 2001
+From: Will Drewry <wad@chromium.org>
+Date: Thu, 12 Apr 2012 16:48:03 -0500
+Subject: [PATCH] x86: Enable HAVE_ARCH_SECCOMP_FILTER
+
+commit c6cfbeb4029610c8c330c312dcf4d514cc067554 upstream.
+
+Enable support for seccomp filter on x86:
+- syscall_get_arch()
+- syscall_get_arguments()
+- syscall_rollback()
+- syscall_set_return_value()
+- SIGSYS siginfo_t support
+- secure_computing is called from a ptrace_event()-safe context
+- secure_computing return value is checked (see below).
+
+SECCOMP_RET_TRACE and SECCOMP_RET_TRAP may result in seccomp needing to
+skip a system call without killing the process. This is done by
+returning a non-zero (-1) value from secure_computing. This change
+makes x86 respect that return value.
+
+To ensure that minimal kernel code is exposed, a non-zero return value
+results in an immediate return to user space (with an invalid syscall
+number).
+
+Signed-off-by: Will Drewry <wad@chromium.org>
+Reviewed-by: H. Peter Anvin <hpa@zytor.com>
+Acked-by: Eric Paris <eparis@redhat.com>
+Reviewed-by: Kees Cook <keescook@chromium.org>
+
+v18: rebase and tweaked change description, acked-by
+v17: added reviewed by and rebased
+v..: all rebases since original introduction.
+Signed-off-by: James Morris <james.l.morris@oracle.com>
+Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
+---
+ arch/x86/Kconfig | 1 +
+ arch/x86/kernel/ptrace.c | 7 ++++++-
+ 2 files changed, 7 insertions(+), 1 deletions(-)
+
+diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
+index c9866b0..2b79d94 100644
+--- a/arch/x86/Kconfig
++++ b/arch/x86/Kconfig
+@@ -82,6 +82,7 @@ config X86
+ select ARCH_HAVE_NMI_SAFE_CMPXCHG
+ select GENERIC_IOMAP
+ select DCACHE_WORD_ACCESS
++ select HAVE_ARCH_SECCOMP_FILTER
+
+ config INSTRUCTION_DECODER
+ def_bool (KPROBES || PERF_EVENTS)
+diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
+index cf11783..c4c6a5c 100644
+--- a/arch/x86/kernel/ptrace.c
++++ b/arch/x86/kernel/ptrace.c
+@@ -1474,7 +1474,11 @@ long syscall_trace_enter(struct pt_regs *regs)
+ regs->flags |= X86_EFLAGS_TF;
+
+ /* do the secure computing check first */
+- secure_computing(regs->orig_ax);
++ if (secure_computing(regs->orig_ax)) {
++ /* seccomp failures shouldn't expose any additional code. */
++ ret = -1L;
++ goto out;
++ }
+
+ if (unlikely(test_thread_flag(TIF_SYSCALL_EMU)))
+ ret = -1L;
+@@ -1499,6 +1503,7 @@ long syscall_trace_enter(struct pt_regs *regs)
+ regs->dx, regs->r10);
+ #endif
+
++out:
+ return ret ?: regs->orig_ax;
+ }
+
+--
+1.7.9.1
+
diff --git a/ktypes/standard/standard-nocfg.scc b/ktypes/standard/standard-nocfg.scc
index e0b6eb66..6c9a5de5 100644
--- a/ktypes/standard/standard-nocfg.scc
+++ b/ktypes/standard/standard-nocfg.scc
@@ -24,6 +24,9 @@ tag systemtap
include features/utrace/utrace.scc
tag utrace
+include features/seccomp/seccomp.scc
+tag seccomp
+
include arch/arm/arm.scc
tag arm