From 0ae71c7720e3ae3aabd2e8a072d27f7bd173d25c Mon Sep 17 00:00:00 2001 From: Rodrigo Campos Date: Mon, 17 May 2021 12:39:07 -0700 Subject: [PATCH 1/5] seccomp: Support atomic "addfd + send reply" Alban Crequy reported a race condition userspace faces when we want to add some fds and make the syscall return them[1] using seccomp notify. The problem is that currently two different ioctl() calls are needed by the process handling the syscalls (agent) for another userspace process (target): SECCOMP_IOCTL_NOTIF_ADDFD to allocate the fd and SECCOMP_IOCTL_NOTIF_SEND to return that value. Therefore, it is possible for the agent to do the first ioctl to add a file descriptor but the target is interrupted (EINTR) before the agent does the second ioctl() call. This patch adds a flag to the ADDFD ioctl() so it adds the fd and returns that value atomically to the target program, as suggested by Kees Cook[2]. This is done by simply allowing seccomp_do_user_notification() to add the fd and return it in this case. Therefore, in this case the target wakes up from the wait in seccomp_do_user_notification() either to interrupt the syscall or to add the fd and return it. This "allocate an fd and return" functionality is useful for syscalls that return a file descriptor only, like connect(2). Other syscalls that return a file descriptor but not as return value (or return more than one fd), like socketpair(), pipe(), recvmsg with SCM_RIGHTs, will not work with this flag. This effectively combines SECCOMP_IOCTL_NOTIF_ADDFD and SECCOMP_IOCTL_NOTIF_SEND into an atomic opteration. The notification's return value, nor error can be set by the user. Upon successful invocation of the SECCOMP_IOCTL_NOTIF_ADDFD ioctl with the SECCOMP_ADDFD_FLAG_SEND flag, the notifying process's errno will be 0, and the return value will be the file descriptor number that was installed. [1]: https://lore.kernel.org/lkml/CADZs7q4sw71iNHmV8EOOXhUKJMORPzF7thraxZYddTZsxta-KQ@mail.gmail.com/ [2]: https://lore.kernel.org/lkml/202012011322.26DCBC64F2@keescook/ Signed-off-by: Rodrigo Campos Signed-off-by: Sargun Dhillon Acked-by: Tycho Andersen Acked-by: Christian Brauner Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20210517193908.3113-4-sargun@sargun.me --- .../userspace-api/seccomp_filter.rst | 12 +++++ include/uapi/linux/seccomp.h | 1 + kernel/seccomp.c | 51 ++++++++++++++++--- 3 files changed, 58 insertions(+), 6 deletions(-) diff --git a/Documentation/userspace-api/seccomp_filter.rst b/Documentation/userspace-api/seccomp_filter.rst index 6efb41cc8072..d61219889e49 100644 --- a/Documentation/userspace-api/seccomp_filter.rst +++ b/Documentation/userspace-api/seccomp_filter.rst @@ -259,6 +259,18 @@ and ``ioctl(SECCOMP_IOCTL_NOTIF_SEND)`` a response, indicating what should be returned to userspace. The ``id`` member of ``struct seccomp_notif_resp`` should be the same ``id`` as in ``struct seccomp_notif``. +Userspace can also add file descriptors to the notifying process via +``ioctl(SECCOMP_IOCTL_NOTIF_ADDFD)``. The ``id`` member of +``struct seccomp_notif_addfd`` should be the same ``id`` as in +``struct seccomp_notif``. The ``newfd_flags`` flag may be used to set flags +like O_EXEC on the file descriptor in the notifying process. If the supervisor +wants to inject the file descriptor with a specific number, the +``SECCOMP_ADDFD_FLAG_SETFD`` flag can be used, and set the ``newfd`` member to +the specific number to use. If that file descriptor is already open in the +notifying process it will be replaced. The supervisor can also add an FD, and +respond atomically by using the ``SECCOMP_ADDFD_FLAG_SEND`` flag and the return +value will be the injected file descriptor number. + It is worth noting that ``struct seccomp_data`` contains the values of register arguments to the syscall, but does not contain pointers to memory. The task's memory is accessible to suitably privileged traces via ``ptrace()`` or diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h index 6ba18b82a02e..78074254ab98 100644 --- a/include/uapi/linux/seccomp.h +++ b/include/uapi/linux/seccomp.h @@ -115,6 +115,7 @@ struct seccomp_notif_resp { /* valid flags for seccomp_notif_addfd */ #define SECCOMP_ADDFD_FLAG_SETFD (1UL << 0) /* Specify remote fd */ +#define SECCOMP_ADDFD_FLAG_SEND (1UL << 1) /* Addfd and return it, atomically */ /** * struct seccomp_notif_addfd diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 9f58049ac16d..057e17f3215d 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -107,6 +107,7 @@ struct seccomp_knotif { * installing process should allocate the fd as normal. * @flags: The flags for the new file descriptor. At the moment, only O_CLOEXEC * is allowed. + * @ioctl_flags: The flags used for the seccomp_addfd ioctl. * @ret: The return value of the installing process. It is set to the fd num * upon success (>= 0). * @completion: Indicates that the installing process has completed fd @@ -118,6 +119,7 @@ struct seccomp_kaddfd { struct file *file; int fd; unsigned int flags; + __u32 ioctl_flags; union { bool setfd; @@ -1065,18 +1067,37 @@ static u64 seccomp_next_notify_id(struct seccomp_filter *filter) return filter->notif->next_id++; } -static void seccomp_handle_addfd(struct seccomp_kaddfd *addfd) +static void seccomp_handle_addfd(struct seccomp_kaddfd *addfd, struct seccomp_knotif *n) { + int fd; + /* * Remove the notification, and reset the list pointers, indicating * that it has been handled. */ list_del_init(&addfd->list); if (!addfd->setfd) - addfd->ret = receive_fd(addfd->file, addfd->flags); + fd = receive_fd(addfd->file, addfd->flags); else - addfd->ret = receive_fd_replace(addfd->fd, addfd->file, - addfd->flags); + fd = receive_fd_replace(addfd->fd, addfd->file, addfd->flags); + addfd->ret = fd; + + if (addfd->ioctl_flags & SECCOMP_ADDFD_FLAG_SEND) { + /* If we fail reset and return an error to the notifier */ + if (fd < 0) { + n->state = SECCOMP_NOTIFY_SENT; + } else { + /* Return the FD we just added */ + n->flags = 0; + n->error = 0; + n->val = fd; + } + } + + /* + * Mark the notification as completed. From this point, addfd mem + * might be invalidated and we can't safely read it anymore. + */ complete(&addfd->completion); } @@ -1120,7 +1141,7 @@ static int seccomp_do_user_notification(int this_syscall, struct seccomp_kaddfd, list); /* Check if we were woken up by a addfd message */ if (addfd) - seccomp_handle_addfd(addfd); + seccomp_handle_addfd(addfd, &n); } while (n.state != SECCOMP_NOTIFY_REPLIED); @@ -1581,7 +1602,7 @@ static long seccomp_notify_addfd(struct seccomp_filter *filter, if (addfd.newfd_flags & ~O_CLOEXEC) return -EINVAL; - if (addfd.flags & ~SECCOMP_ADDFD_FLAG_SETFD) + if (addfd.flags & ~(SECCOMP_ADDFD_FLAG_SETFD | SECCOMP_ADDFD_FLAG_SEND)) return -EINVAL; if (addfd.newfd && !(addfd.flags & SECCOMP_ADDFD_FLAG_SETFD)) @@ -1591,6 +1612,7 @@ static long seccomp_notify_addfd(struct seccomp_filter *filter, if (!kaddfd.file) return -EBADF; + kaddfd.ioctl_flags = addfd.flags; kaddfd.flags = addfd.newfd_flags; kaddfd.setfd = addfd.flags & SECCOMP_ADDFD_FLAG_SETFD; kaddfd.fd = addfd.newfd; @@ -1616,6 +1638,23 @@ static long seccomp_notify_addfd(struct seccomp_filter *filter, goto out_unlock; } + if (addfd.flags & SECCOMP_ADDFD_FLAG_SEND) { + /* + * Disallow queuing an atomic addfd + send reply while there are + * some addfd requests still to process. + * + * There is no clear reason to support it and allows us to keep + * the loop on the other side straight-forward. + */ + if (!list_empty(&knotif->addfd)) { + ret = -EBUSY; + goto out_unlock; + } + + /* Allow exactly only one reply */ + knotif->state = SECCOMP_NOTIFY_REPLIED; + } + list_add(&kaddfd.list, &knotif->addfd); complete(&knotif->ready); mutex_unlock(&filter->notify_lock); From e540ad97e73cefb41e93d0c06d0fe6a8620a77e0 Mon Sep 17 00:00:00 2001 From: Rodrigo Campos Date: Mon, 17 May 2021 12:39:08 -0700 Subject: [PATCH 2/5] selftests/seccomp: Add test for atomic addfd+send This just adds a test to verify that when using the new introduced flag to ADDFD, a valid fd is added and returned as the syscall result. Signed-off-by: Rodrigo Campos Signed-off-by: Sargun Dhillon Acked-by: Tycho Andersen Acked-by: Christian Brauner Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20210517193908.3113-5-sargun@sargun.me --- tools/testing/selftests/seccomp/seccomp_bpf.c | 38 +++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c index 98c3b647f54d..e2ba7adc2694 100644 --- a/tools/testing/selftests/seccomp/seccomp_bpf.c +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c @@ -235,6 +235,10 @@ struct seccomp_notif_addfd { }; #endif +#ifndef SECCOMP_ADDFD_FLAG_SEND +#define SECCOMP_ADDFD_FLAG_SEND (1UL << 1) /* Addfd and return it, atomically */ +#endif + struct seccomp_notif_addfd_small { __u64 id; char weird[4]; @@ -3976,8 +3980,14 @@ TEST(user_notification_addfd) ASSERT_GE(pid, 0); if (pid == 0) { + /* fds will be added and this value is expected */ if (syscall(__NR_getppid) != USER_NOTIF_MAGIC) exit(1); + + /* Atomic addfd+send is received here. Check it is a valid fd */ + if (fcntl(syscall(__NR_getppid), F_GETFD) == -1) + exit(1); + exit(syscall(__NR_getppid) != USER_NOTIF_MAGIC); } @@ -4056,6 +4066,30 @@ TEST(user_notification_addfd) ASSERT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0); ASSERT_EQ(addfd.id, req.id); + /* Verify we can do an atomic addfd and send */ + addfd.newfd = 0; + addfd.flags = SECCOMP_ADDFD_FLAG_SEND; + fd = ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd); + + /* Child has fds 0-6 and 42 used, we expect the lower fd available: 7 */ + EXPECT_EQ(fd, 7); + EXPECT_EQ(filecmp(getpid(), pid, memfd, fd), 0); + + /* + * This sets the ID of the ADD FD to the last request plus 1. The + * notification ID increments 1 per notification. + */ + addfd.id = req.id + 1; + + /* This spins until the underlying notification is generated */ + while (ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd) != -1 && + errno != -EINPROGRESS) + nanosleep(&delay, NULL); + + memset(&req, 0, sizeof(req)); + ASSERT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0); + ASSERT_EQ(addfd.id, req.id); + resp.id = req.id; resp.error = 0; resp.val = USER_NOTIF_MAGIC; @@ -4116,6 +4150,10 @@ TEST(user_notification_addfd_rlimit) EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd), -1); EXPECT_EQ(errno, EMFILE); + addfd.flags = SECCOMP_ADDFD_FLAG_SEND; + EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd), -1); + EXPECT_EQ(errno, EMFILE); + addfd.newfd = 100; addfd.flags = SECCOMP_ADDFD_FLAG_SETFD; EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd), -1); From 93e720d710dfe689099c23bb91414303cf715d27 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 26 May 2021 19:49:15 -0700 Subject: [PATCH 3/5] selftests/seccomp: More closely track fds being assigned Since the open fds might not always start at "4" (especially when running under kselftest, etc), start counting from the first assigned fd, rather than using the more permissive EXPECT_GE(fd, 0). Signed-off-by: Kees Cook Link: https://lore.kernel.org/lkml/20210527032948.3730953-1-keescook@chromium.org Reviewed-by: Rodrigo Campos Acked-by: Christian Brauner --- tools/testing/selftests/seccomp/seccomp_bpf.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c index e2ba7adc2694..03b37e660965 100644 --- a/tools/testing/selftests/seccomp/seccomp_bpf.c +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c @@ -3954,7 +3954,7 @@ TEST(user_notification_addfd) { pid_t pid; long ret; - int status, listener, memfd, fd; + int status, listener, memfd, fd, nextfd; struct seccomp_notif_addfd addfd = {}; struct seccomp_notif_addfd_small small = {}; struct seccomp_notif_addfd_big big = {}; @@ -3963,18 +3963,21 @@ TEST(user_notification_addfd) /* 100 ms */ struct timespec delay = { .tv_nsec = 100000000 }; + /* There may be arbitrary already-open fds at test start. */ memfd = memfd_create("test", 0); ASSERT_GE(memfd, 0); + nextfd = memfd + 1; ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); ASSERT_EQ(0, ret) { TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!"); } + /* fd: 4 */ /* Check that the basic notification machinery works */ listener = user_notif_syscall(__NR_getppid, SECCOMP_FILTER_FLAG_NEW_LISTENER); - ASSERT_GE(listener, 0); + ASSERT_EQ(listener, nextfd++); pid = fork(); ASSERT_GE(pid, 0); @@ -4029,14 +4032,14 @@ TEST(user_notification_addfd) /* Verify we can set an arbitrary remote fd */ fd = ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd); - EXPECT_GE(fd, 0); + EXPECT_EQ(fd, nextfd++); EXPECT_EQ(filecmp(getpid(), pid, memfd, fd), 0); /* Verify we can set an arbitrary remote fd with large size */ memset(&big, 0x0, sizeof(big)); big.addfd = addfd; fd = ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD_BIG, &big); - EXPECT_GE(fd, 0); + EXPECT_EQ(fd, nextfd++); /* Verify we can set a specific remote fd */ addfd.newfd = 42; @@ -4070,9 +4073,11 @@ TEST(user_notification_addfd) addfd.newfd = 0; addfd.flags = SECCOMP_ADDFD_FLAG_SEND; fd = ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd); - - /* Child has fds 0-6 and 42 used, we expect the lower fd available: 7 */ - EXPECT_EQ(fd, 7); + /* + * Child has earlier "low" fds and now 42, so we expect the next + * lowest available fd to be assigned here. + */ + EXPECT_EQ(fd, nextfd++); EXPECT_EQ(filecmp(getpid(), pid, memfd, fd), 0); /* From 62ddb91b7771626658c382c2b849a058f1586123 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 26 May 2021 19:46:30 -0700 Subject: [PATCH 4/5] selftests/seccomp: Flush benchmark output When running the seccomp benchmark under a test runner, it wouldn't provide any feedback on progress. Set stdout unbuffered. Suggested-by: Will Drewry Signed-off-by: Kees Cook --- tools/testing/selftests/seccomp/seccomp_benchmark.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/testing/selftests/seccomp/seccomp_benchmark.c b/tools/testing/selftests/seccomp/seccomp_benchmark.c index fcc806585266..363cad755042 100644 --- a/tools/testing/selftests/seccomp/seccomp_benchmark.c +++ b/tools/testing/selftests/seccomp/seccomp_benchmark.c @@ -143,6 +143,8 @@ int main(int argc, char *argv[]) unsigned long long native, filter1, filter2, bitmap1, bitmap2; unsigned long long entry, per_filter1, per_filter2; + setbuf(stdout, NULL); + printf("Current BPF sysctl settings:\n"); system("sysctl net.core.bpf_jit_enable"); system("sysctl net.core.bpf_jit_harden"); From 9a03abc16c77062c73972df08206f1031862d9b4 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 17 Jun 2021 16:18:34 -0700 Subject: [PATCH 5/5] selftests/seccomp: Avoid using "sysctl" for report Instead of depending on "sysctl" being installed, just use "grep -H" for sysctl status reporting. Additionally report kernel version for easier comparisons. Signed-off-by: Kees Cook --- tools/testing/selftests/seccomp/seccomp_benchmark.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/seccomp/seccomp_benchmark.c b/tools/testing/selftests/seccomp/seccomp_benchmark.c index 363cad755042..6e5102a7d7c9 100644 --- a/tools/testing/selftests/seccomp/seccomp_benchmark.c +++ b/tools/testing/selftests/seccomp/seccomp_benchmark.c @@ -145,9 +145,13 @@ int main(int argc, char *argv[]) setbuf(stdout, NULL); + printf("Running on:\n"); + system("uname -a"); + printf("Current BPF sysctl settings:\n"); - system("sysctl net.core.bpf_jit_enable"); - system("sysctl net.core.bpf_jit_harden"); + /* Avoid using "sysctl" which may not be installed. */ + system("grep -H . /proc/sys/net/core/bpf_jit_enable"); + system("grep -H . /proc/sys/net/core/bpf_jit_harden"); if (argc > 1) samples = strtoull(argv[1], NULL, 0);