From b7f1d7253a8f44f31c2e1a8d9c8962ef30be83e9 Mon Sep 17 00:00:00 2001 From: Andrew Clayton Date: Fri, 18 Nov 2022 23:42:44 +0000 Subject: Isolation: Rename NXT_HAVE_CLONE -> NXT_HAVE_LINUX_NS. Due to the need to replace our use of clone/__NR_clone on Linux with fork(2)/unshare(2) for enabling Linux namespaces(7) to keep the pthreads(7) API working. Let's rename NXT_HAVE_CLONE to NXT_HAVE_LINUX_NS, i.e name it after the feature, not how it's implemented, then in future if we change how we do namespaces again we don't have to rename this. Reviewed-by: Alejandro Colomar Signed-off-by: Andrew Clayton --- src/nxt_process.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'src/nxt_process.c') diff --git a/src/nxt_process.c b/src/nxt_process.c index d8836ad2..b40eb8cf 100644 --- a/src/nxt_process.c +++ b/src/nxt_process.c @@ -7,7 +7,7 @@ #include #include -#if (NXT_HAVE_CLONE) +#if (NXT_HAVE_LINUX_NS) #include #endif @@ -18,7 +18,7 @@ #endif -#if (NXT_HAVE_CLONE) && (NXT_HAVE_CLONE_NEWPID) +#if (NXT_HAVE_LINUX_NS) && (NXT_HAVE_CLONE_NEWPID) #define nxt_is_pid_isolated(process) \ nxt_is_clone_flag_set(process->isolation.clone.flags, NEWPID) #else @@ -318,7 +318,7 @@ nxt_process_create(nxt_task_t *task, nxt_process_t *process) nxt_pid_t pid; nxt_runtime_t *rt; -#if (NXT_HAVE_CLONE) +#if (NXT_HAVE_LINUX_NS) pid = nxt_clone(SIGCHLD | process->isolation.clone.flags); if (nxt_slow_path(pid < 0)) { nxt_alert(task, "clone() failed for %s %E", process->name, nxt_errno); @@ -355,7 +355,7 @@ nxt_process_create(nxt_task_t *task, nxt_process_t *process) /* Parent. */ -#if (NXT_HAVE_CLONE) +#if (NXT_HAVE_LINUX_NS) nxt_debug(task, "clone(%s): %PI", process->name, pid); #else nxt_debug(task, "fork(%s): %PI", process->name, pid); @@ -781,7 +781,7 @@ nxt_process_apply_creds(nxt_task_t *task, nxt_process_t *process) cap_setid = rt->capabilities.setid; -#if (NXT_HAVE_CLONE && NXT_HAVE_CLONE_NEWUSER) +#if (NXT_HAVE_LINUX_NS && NXT_HAVE_CLONE_NEWUSER) if (!cap_setid && nxt_is_clone_flag_set(process->isolation.clone.flags, NEWUSER)) { -- cgit From c1299faa7d2990acbeb677dfc036ca5179d2bf54 Mon Sep 17 00:00:00 2001 From: Andrew Clayton Date: Fri, 18 Nov 2022 23:53:30 +0000 Subject: Isolation: Switch to fork(2) & unshare(2) on Linux. On GitHub, @razvanphp & @hbernaciak both reported issues running the APCu PHP module under Unit. When using this module they were seeing errors like 'apcu_fetch(): Failed to acquire read lock' However when running APCu under php-fpm, everything was fine. The issue turned out to be due to our use of SYS_clone breaking the pthreads(7) API used by APCu. Even if we had been using glibc's clone(2) wrapper we would still have run into problems due to a known issue there. Essentially the problem is when using clone, glibc doesn't update the TID cache, so the child ends up having the same TID as the parent and that is used in various parts of pthreads(7) such as in the various locking primitives, so when APCu was grabbing a lock it ended up using the TID of the main unit process (rather than that of the php application processes that was grabbing the lock). So due to the above what was happening was when one of the application processes went to grab either a read or write lock, the lock was actually being attributed to the main unit process. If a process had acquired the write lock, then if a process tried to acquire a read or write lock then glibc would return EDEADLK due to detecting a deadlock situation due to thinking the process already held the write lock when in fact it didn't. It seems the right way to do this is via fork(2) and unshare(2). We already use fork(2) on other platforms. This requires a few tricks to keep the essence of the processes the same as before when using clone 1) We use the prctl(2) PR_SET_CHILD_SUBREAPER option (if its available, since Linux 3.4) to make the main unit process inherit prototype processes after a double fork(2), rather than them being reparented to 'init'. This avoids needing to ^C twice to fully exit unit when running in the foreground. It's probably also better if they maintain their parent child relationship where possible. 2) We use a double fork(2) technique on the prototype processes to ensure they themselves end up in a new PID namespace as PID 1 (when CLONE_NEWPID is being used). When using unshare(CLONE_NEWPID), the calling process is _not_ placed in the namespace (as discussed in pid_namespaces(7)). It only sets things up so that subsequent children are placed in a PID namespace. Having the prototype processes as PID 1 in the new PID namespace is probably a good thing and matches the behaviour of clone(2). Also, some isolation tests break if the prototype process is not PID 1. 3) Due to the above double fork(2) the main unit process looses track of the prototype process ID, which it needs to know. To solve this, we employ a simple pipe(2) between the main unit and prototype processes and pass the prototype grandchild PID from the parent of the second fork(2) before exiting. This needs to be done from the parent and not the grandchild, as the grandchild will see itself having a PID of 1 while the main process needs its externally visible PID. Link: Link: Closes: Reviewed-by: Alejandro Colomar Signed-off-by: Andrew Clayton --- src/nxt_process.c | 256 ++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 247 insertions(+), 9 deletions(-) (limited to 'src/nxt_process.c') diff --git a/src/nxt_process.c b/src/nxt_process.c index b40eb8cf..a2980350 100644 --- a/src/nxt_process.c +++ b/src/nxt_process.c @@ -27,6 +27,19 @@ #endif +#if (NXT_HAVE_LINUX_NS) +static nxt_int_t nxt_process_pipe_timer(nxt_fd_t fd, short event); +static nxt_int_t nxt_process_check_pid_status(const nxt_fd_t *gc_pipe); +static nxt_pid_t nxt_process_recv_pid(const nxt_fd_t *pid_pipe, + const nxt_fd_t *gc_pipe); +static void nxt_process_send_pid(const nxt_fd_t *pid_pipe, nxt_pid_t pid); +static nxt_int_t nxt_process_unshare(nxt_task_t *task, nxt_process_t *process, + nxt_fd_t *pid_pipe, nxt_fd_t *gc_pipe, nxt_bool_t use_pidns); +static nxt_int_t nxt_process_init_pidns(nxt_task_t *task, + const nxt_process_t *process, nxt_fd_t *pid_pipe, nxt_fd_t *gc_pipe, + nxt_bool_t *use_pidns); +#endif + static nxt_pid_t nxt_process_create(nxt_task_t *task, nxt_process_t *process); static nxt_int_t nxt_process_do_start(nxt_task_t *task, nxt_process_t *process); static nxt_int_t nxt_process_whoami(nxt_task_t *task, nxt_process_t *process); @@ -311,6 +324,217 @@ nxt_process_child_fixup(nxt_task_t *task, nxt_process_t *process) } +#if (NXT_HAVE_LINUX_NS) + +static nxt_int_t +nxt_process_pipe_timer(nxt_fd_t fd, short event) +{ + int ret; + sigset_t mask; + struct pollfd pfd; + + static const struct timespec ts = { .tv_sec = 5 }; + + /* + * Temporarily block the signals we are handling, (except + * for SIGINT & SIGTERM) so that ppoll(2) doesn't get + * interrupted. After ppoll(2) returns, our old sigmask + * will be back in effect and any pending signals will be + * delivered. + * + * This is because while the kernel ppoll syscall updates + * the struct timespec with the time remaining if it got + * interrupted with EINTR, the glibc wrapper hides this + * from us so we have no way of knowing how long to retry + * the ppoll(2) for and if we just retry with the same + * timeout we could find ourselves in an infinite loop. + */ + pthread_sigmask(SIG_SETMASK, NULL, &mask); + sigdelset(&mask, SIGINT); + sigdelset(&mask, SIGTERM); + + pfd.fd = fd; + pfd.events = event; + + ret = ppoll(&pfd, 1, &ts, &mask); + if (ret <= 0 || (ret == 1 && pfd.revents & POLLERR)) { + return NXT_ERROR; + } + + return NXT_OK; +} + + +static nxt_int_t +nxt_process_check_pid_status(const nxt_fd_t *gc_pipe) +{ + int8_t status; + ssize_t ret; + + close(gc_pipe[1]); + + ret = nxt_process_pipe_timer(gc_pipe[0], POLLIN); + if (ret == NXT_OK) { + ret = read(gc_pipe[0], &status, sizeof(int8_t)); + } + + if (ret <= 0) { + status = -1; + } + + close(gc_pipe[0]); + + return status; +} + + +static nxt_pid_t +nxt_process_recv_pid(const nxt_fd_t *pid_pipe, const nxt_fd_t *gc_pipe) +{ + int8_t status; + ssize_t ret; + nxt_pid_t pid; + + close(pid_pipe[1]); + close(gc_pipe[0]); + + status = 0; + + ret = nxt_process_pipe_timer(pid_pipe[0], POLLIN); + if (ret == NXT_OK) { + ret = read(pid_pipe[0], &pid, sizeof(nxt_pid_t)); + } + + if (ret <= 0) { + status = -1; + pid = -1; + } + + write(gc_pipe[1], &status, sizeof(int8_t)); + + close(pid_pipe[0]); + close(gc_pipe[1]); + + return pid; +} + + +static void +nxt_process_send_pid(const nxt_fd_t *pid_pipe, nxt_pid_t pid) +{ + nxt_int_t ret; + + close(pid_pipe[0]); + + ret = nxt_process_pipe_timer(pid_pipe[1], POLLOUT); + if (ret == NXT_OK) { + write(pid_pipe[1], &pid, sizeof(nxt_pid_t)); + } + + close(pid_pipe[1]); +} + + +static nxt_int_t +nxt_process_unshare(nxt_task_t *task, nxt_process_t *process, + nxt_fd_t *pid_pipe, nxt_fd_t *gc_pipe, + nxt_bool_t use_pidns) +{ + int ret; + nxt_pid_t pid; + + if (process->isolation.clone.flags == 0) { + return NXT_OK; + } + + ret = unshare(process->isolation.clone.flags); + if (nxt_slow_path(ret == -1)) { + nxt_alert(task, "unshare() failed for %s %E", process->name, + nxt_errno); + + if (use_pidns) { + nxt_pipe_close(task, gc_pipe); + nxt_pipe_close(task, pid_pipe); + } + + return NXT_ERROR; + } + + if (!use_pidns) { + return NXT_OK; + } + + /* + * PID namespace requested. Employ a double fork(2) technique + * so that the prototype process will be placed into the new + * namespace and end up with PID 1 (as before with clone). + */ + pid = fork(); + if (nxt_slow_path(pid < 0)) { + nxt_alert(task, "fork() failed for %s %E", process->name, nxt_errno); + nxt_pipe_close(task, gc_pipe); + nxt_pipe_close(task, pid_pipe); + + return NXT_ERROR; + + } else if (pid > 0) { + nxt_pipe_close(task, gc_pipe); + nxt_process_send_pid(pid_pipe, pid); + + _exit(EXIT_SUCCESS); + } + + nxt_pipe_close(task, pid_pipe); + ret = nxt_process_check_pid_status(gc_pipe); + if (ret == -1) { + return NXT_ERROR; + } + + return NXT_OK; +} + + +static nxt_int_t +nxt_process_init_pidns(nxt_task_t *task, const nxt_process_t *process, + nxt_fd_t *pid_pipe, nxt_fd_t *gc_pipe, + nxt_bool_t *use_pidns) +{ + int ret; + + *use_pidns = 0; + +#if (NXT_HAVE_CLONE_NEWPID) + *use_pidns = nxt_is_pid_isolated(process); +#endif + + if (!*use_pidns) { + return NXT_OK; + } + + ret = nxt_pipe_create(task, pid_pipe, 0, 0); + if (nxt_slow_path(ret == NXT_ERROR)) { + return NXT_ERROR; + } + + ret = nxt_pipe_create(task, gc_pipe, 0, 0); + if (nxt_slow_path(ret == NXT_ERROR)) { + return NXT_ERROR; + } + +#if (NXT_HAVE_PR_SET_CHILD_SUBREAPER) + ret = prctl(PR_SET_CHILD_SUBREAPER, 1, 0, 0, 0); + if (nxt_slow_path(ret == -1)) { + nxt_alert(task, "prctl(PR_SET_CHILD_SUBREAPER) failed for %s %E", + process->name, nxt_errno); + } +#endif + + return NXT_OK; +} + +#endif /* NXT_HAVE_LINUX_NS */ + + static nxt_pid_t nxt_process_create(nxt_task_t *task, nxt_process_t *process) { @@ -319,22 +543,31 @@ nxt_process_create(nxt_task_t *task, nxt_process_t *process) nxt_runtime_t *rt; #if (NXT_HAVE_LINUX_NS) - pid = nxt_clone(SIGCHLD | process->isolation.clone.flags); - if (nxt_slow_path(pid < 0)) { - nxt_alert(task, "clone() failed for %s %E", process->name, nxt_errno); - return pid; + nxt_fd_t pid_pipe[2], gc_pipe[2]; + nxt_bool_t use_pidns; + + ret = nxt_process_init_pidns(task, process, pid_pipe, gc_pipe, &use_pidns); + if (ret == NXT_ERROR) { + return -1; } -#else +#endif + pid = fork(); if (nxt_slow_path(pid < 0)) { nxt_alert(task, "fork() failed for %s %E", process->name, nxt_errno); return pid; } -#endif if (pid == 0) { /* Child. */ +#if (NXT_HAVE_LINUX_NS) + ret = nxt_process_unshare(task, process, pid_pipe, gc_pipe, use_pidns); + if (ret == NXT_ERROR) { + _exit(EXIT_FAILURE); + } +#endif + ret = nxt_process_child_fixup(task, process); if (nxt_slow_path(ret != NXT_OK)) { nxt_process_quit(task, 1); @@ -355,10 +588,15 @@ nxt_process_create(nxt_task_t *task, nxt_process_t *process) /* Parent. */ -#if (NXT_HAVE_LINUX_NS) - nxt_debug(task, "clone(%s): %PI", process->name, pid); -#else nxt_debug(task, "fork(%s): %PI", process->name, pid); + +#if (NXT_HAVE_LINUX_NS) + if (use_pidns) { + pid = nxt_process_recv_pid(pid_pipe, gc_pipe); + if (pid == -1) { + return pid; + } + } #endif process->pid = pid; -- cgit From 7d7b5a977c4e461976422c59f4594280b05dde0a Mon Sep 17 00:00:00 2001 From: Andrew Clayton Date: Thu, 1 Dec 2022 21:05:39 +0000 Subject: Remove the nxt_getpid() alias. Since the previous commit, nxt_getpid() is only ever aliased to getpid(2). nxt_getpid() was only used once in the code, while there are multiple direct uses of getpid(2) $ grep -r "getpid()" src/ src/nxt_unit.c: nxt_unit_pid = getpid(); src/nxt_process.c: nxt_pid = nxt_getpid(); src/nxt_process.c: nxt_pid = getpid(); src/nxt_lib.c: nxt_pid = getpid(); src/nxt_process.h:#define nxt_getpid() \ src/nxt_process.h:#define nxt_getpid() \ src/nxt_process.h: getpid() Just remove it and convert the _single_ instance of nxt_getpid() to getpid(2). Reviewed-by: Alejandro Colomar Signed-off-by: Andrew Clayton --- src/nxt_process.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'src/nxt_process.c') diff --git a/src/nxt_process.c b/src/nxt_process.c index a2980350..fe9349e8 100644 --- a/src/nxt_process.c +++ b/src/nxt_process.c @@ -269,7 +269,7 @@ nxt_process_child_fixup(nxt_task_t *task, nxt_process_t *process) nxt_ppid = nxt_pid; - nxt_pid = nxt_getpid(); + nxt_pid = getpid(); process->pid = nxt_pid; process->isolated_pid = nxt_pid; -- cgit From 29471c8d32a640d6e2e460f65d5a319c60043733 Mon Sep 17 00:00:00 2001 From: Andrew Clayton Date: Thu, 23 Feb 2023 12:01:14 +0000 Subject: Set a safer umask(2) when running as a daemon. When running as a daemon. unit currently sets umask(0), i.e no umask. This is resulting in various directories being created with a mode of 0777, e.g rwxrwxrwx this is currently affecting cgroup and rootfs directories, which are being created with a mode of 0777, and when running as a daemon as there is no umask to restrict the permissions. This also affects the language modules (the umask is inherited over fork(2)) whereby unless something explicitly sets a umask, files and directories will be created with full permissions, 0666 (rw-rw-rw-)/ 0777 (rwxrwxrwx) respectively. This could be an unwitting security issue. My original idea was to just remove the umask(0) call and thus inherit the umask from the executing shell/program. However there was some concern about just inheriting whatever umask was in effect. Alex suggested that rather than simply removing the umask(0) call we change it to a value of 022 (which is a common default), which will result in directories and files with permissions at most of 0755 (rwxr-xr-x) & 0644 (rw-r--r--). If applications need some other umask set, they can (as they always have been able to) set their own umask(2). Suggested-by: Alejandro Colomar Reviewed-by: Liam Crilly Signed-off-by: Andrew Clayton --- src/nxt_process.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'src/nxt_process.c') diff --git a/src/nxt_process.c b/src/nxt_process.c index fe9349e8..025efe70 100644 --- a/src/nxt_process.c +++ b/src/nxt_process.c @@ -1156,10 +1156,10 @@ nxt_process_daemon(nxt_task_t *task) } /* - * Reset file mode creation mask: any access - * rights can be set on file creation. + * Set a sefe umask to give at most 755/644 permissions on + * directories/files. */ - umask(0); + umask(0022); /* Redirect STDIN and STDOUT to the "/dev/null". */ -- cgit