diff --git a/sbin/init/init.8 b/sbin/init/init.8 --- a/sbin/init/init.8 +++ b/sbin/init/init.8 @@ -28,7 +28,7 @@ .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE. .\" -.Dd July 22, 2021 +.Dd August 10, 2024 .Dt INIT 8 .Os .Sh NAME @@ -112,19 +112,6 @@ .Va kern.securelevel variable to the required security level. .Pp -If -.Nm -is run in a jail, the security level of the -.Dq host system -will not be affected. -Part of the information set up in the kernel to support a jail -is a per-jail security level. -This allows running a higher security level inside of a jail -than that of the host system. -See -.Xr jail 8 -for more information about jails. -.Pp In multi-user operation, .Nm maintains @@ -396,6 +383,66 @@ .Pa /etc/rc script is executed with the standard shell .Pa /bin/sh . +.El +.Sh JAIL SUPPORT +.Nm +can run in a +.Xr jail 8 +environment to emulate a bootstrap of another userland environment +with +.Xr rc 8 +scripts and the same directory structure as the root filesystem. +.Pp +While the behavior of +.Nm +is basically the same as in the host environment, +there are several differences. +.Nm +will have a process id other than +.Li 1 +and never enter the single-user mode even if it terminates abnormally +in a jail. +.Va STDOUT +and +.Va STDERR +will be used instead of opening the system console device for logging. +All of the +.Xr kenv 1 +variables described in the previous section will be ignored in a jail. +.Pp +When invoked, +.Nm +will fork and wait until the +.Xr rc 8 +script runs. +After the script ends successfully, +the parent process of +.Nm +terminates and the child process continues +to handle +.Xr ttys 5 +as a background process. +The +.Xr jail 8 +command blocks until the jailed environment runs the +.Xr rc 8 +script and enters the +.Dq emulated +multi-user mode. +.Pp +If +.Nm +runs in a jail, +the security level of the +.Dq host system +will not be affected. 
+Part of the information set up in the kernel to support a jail +is a per-jail security level. +This allows running a higher security level inside of a jail +than that of the host system. +See +.Xr jail 8 +for more information about jails. .Sh FILES .Bl -tag -width /var/log/init.log -compact .It Pa /dev/console @@ -451,6 +498,9 @@ .Nm utility appeared in .At v1 . +.Xr jail 8 +support first appeared in +.Fx 15.0 . .Sh CAVEATS Systems without .Xr sysctl 8 diff --git a/sbin/init/init.c b/sbin/init/init.c --- a/sbin/init/init.c +++ b/sbin/init/init.c @@ -38,6 +38,7 @@ #include #include #include +#include #include #include #include @@ -56,6 +57,7 @@ #include #include #include +#include #include #include #include @@ -183,6 +185,11 @@ static session_t *find_session(pid_t); static DB *session_db; +static int jailed; +static int sv[2] = {-1, -1}; +#define SV_RECV 0 +#define SV_SEND 1 + /* * The mother of all processes. */ @@ -201,8 +208,12 @@ BOOTTRACE("init(8) starting..."); + /* Check if init(8) is invoked in a jail. */ + (void) sysctlbyname("security.jail.jailed", &jailed, + &(size_t){ sizeof(jailed) }, NULL, 0); + /* System V users like to reexec init. */ - if (getpid() != 1) { + if (jailed == 0 && getpid() != 1) { #ifdef COMPAT_SYSV_INIT /* So give them what they want */ if (argc > 1) { @@ -250,7 +261,7 @@ * Note that this does NOT open a file... * Does 'init' deserve its own facility number? */ - openlog("init", LOG_CONS, LOG_AUTH); + openlog("init", (jailed) ? LOG_PERROR : LOG_CONS, LOG_AUTH); /* * Create an initial session. @@ -315,22 +326,31 @@ * Paranoia. */ close(0); - close(1); - close(2); + if (jailed == 0) { + /* + * Keep stdout and stderr open in jail + * in order to deliver console messages. 
+ */ + close(1); + close(2); + } - if (kenv(KENV_GET, "init_exec", kenv_value, sizeof(kenv_value)) > 0) { + if (jailed == 0 && + kenv(KENV_GET, "init_exec", kenv_value, sizeof(kenv_value)) > 0) { replace_init(kenv_value); _exit(0); /* reboot */ } - if (kenv(KENV_GET, "init_script", kenv_value, sizeof(kenv_value)) > 0) { + if (jailed == 0 && + kenv(KENV_GET, "init_script", kenv_value, sizeof(kenv_value)) > 0) { state_func_t next_transition; if ((next_transition = run_script(kenv_value)) != NULL) initial_transition = (state_t) next_transition; } - if (kenv(KENV_GET, "init_chroot", kenv_value, sizeof(kenv_value)) > 0) { + if (jailed == 0 && + kenv(KENV_GET, "init_chroot", kenv_value, sizeof(kenv_value)) > 0) { if (chdir(kenv_value) != 0 || chroot(".") != 0) warning("Can't chroot to %s: %m", kenv_value); } @@ -388,7 +408,8 @@ free(s); } - if (initial_transition != reroot_phase_two) { + if (jailed == 0 && + initial_transition != reroot_phase_two) { /* * Unmount reroot leftovers. This runs after init(8) * gets reexecuted after reroot_phase_two() is done. @@ -398,6 +419,77 @@ warning("Cannot unmount %s: %m", _PATH_REROOT); } + if (jailed) { + pid_t pid; + /* + * The init(8) process will never terminate on the host + * environment. Inside a jail, it needs to run as a + * background process because the jail(8) utility needs to + * finish. To do this, it calls fork() and the child + * process will handle the original functions of init(8). + * + * The child process runs the rc(8) script, reads + * /etc/ttys to watch the terminal ports, and stays in the + * background. However, the parent process does not terminate + * immediately to emulate the blocking behavior until + * entering the multi-user mode (starting the state machine) + * in the host environment. The below socketpair is used + * to send a notification from the child after running + * the rc(8) script successfully to block the jail(8) + * utility that invoked /sbin/init. 
+ * Data in sizeof(int) are used as the frame for this + * socketpair and the payload is a constant defined + * in sysexits.h. + * + */ + error = socketpair(AF_LOCAL, SOCK_SEQPACKET | SOCK_CLOEXEC, + 0, sv); + if (error) + err(1, "socketpair"); + + pid = fork(); + if (pid == -1) + err(1, "fork()"); + else if (pid > 0) { + fd_set readfds; + ssize_t len; + int nfds, buf; + + close(sv[SV_SEND]); + sv[SV_SEND] = -1; + + /* + * The parent process waits until init(8) enters + * multi-user mode successfully. + */ + FD_ZERO(&readfds); + nfds = sv[SV_RECV]; + for (;;) { + FD_SET(sv[SV_RECV], &readfds); + error = select(nfds, &readfds, NULL, NULL, + &(struct timeval){0}); + if (error && errno == EINTR) + continue; + if (error) + err(1, "select"); + do { + len = read(sv[SV_RECV], &buf, + sizeof(buf)); + } while (len < 0 && errno == EINTR); + if (len != sizeof(buf)) + err(1, "invalid length: %zd", len); + switch (buf) { + case EX_OK: + case EX_SOFTWARE: + return (buf); + default: + warn("invalid notification: %d", buf); + } + } + } + close(sv[SV_RECV]); + sv[SV_RECV] = -1; + } /* * Start the state machine. */ @@ -609,6 +701,23 @@ } dup2(STDOUT_FILENO, STDERR_FILENO); } +/* + * Close STDIN and use STDOUT and STDERR for console output for jail. + */ +static void +open_console_jail(void) +{ + int fd; + + if ((fd = open(_PATH_DEVNULL, O_RDWR)) == -1) { + stall("cannot open null device."); + _exit(1); + } + if (fd != STDIN_FILENO) { + dup2(fd, STDIN_FILENO); + close(fd); + } +} static const char * get_shell(void) @@ -862,6 +971,16 @@ char altshell[128]; #endif + /* Notify abnormal termination. */ + if (jailed && sv[SV_SEND] > 0) { + (void) send(sv[SV_SEND], &(int){EX_SOFTWARE}, sizeof(int), 0); + close(sv[SV_SEND]); + sv[SV_SEND] = -1; + } + /* In jail, init(8) terminates instead of entering single-user mode. 
*/ + if (jailed) + exit(1); + if (Reboot) { /* Instead of going single user, let's reboot the machine */ BOOTTRACE("shutting down the system"); @@ -1041,7 +1160,11 @@ sigaction(SIGTSTP, &sa, NULL); sigaction(SIGHUP, &sa, NULL); - open_console(); + /* Use stdout and stderr in jail. */ + if (jailed == 0) + open_console(); + else + open_console_jail(); sigprocmask(SIG_SETMASK, &sa.sa_mask, NULL); #ifdef LOGIN_CAP @@ -1708,6 +1831,13 @@ requested_transition = 0; + /* Notify entering multi-user mode successfully. */ + if (jailed && sv[SV_SEND] > 0) { + (void) send(sv[SV_SEND], &(int){EX_OK}, sizeof(int), 0); + close(sv[SV_SEND]); + sv[SV_SEND] = -1; + } + /* * If the administrator has not set the security level to -1 * to indicate that the kernel should not run multiuser in secure @@ -1901,6 +2031,10 @@ pid_t pid; static const int death_sigs[2] = { SIGTERM, SIGKILL }; + /* init(8) in jail terminates when receiving SIGTERM or SIGKILL. */ + if (jailed) + exit(EXIT_SUCCESS); + revoke(_PATH_CONSOLE); BOOTTRACE("start killing user processes");