diff --git a/lib/libjail/jail.c b/lib/libjail/jail.c --- a/lib/libjail/jail.c +++ b/lib/libjail/jail.c @@ -75,8 +75,9 @@ jail_setv(int flags, ...) { va_list ap, tap; - struct jailparam *jp; - const char *name, *value; + struct jailparam *jp, *jp_desc; + const char *name; + char *value, *desc_value; int njp, jid; /* Create the parameter list and import the parameters. */ @@ -86,15 +87,24 @@ (void)va_arg(tap, char *); va_end(tap); jp = alloca(njp * sizeof(struct jailparam)); - for (njp = 0; (name = va_arg(ap, char *)) != NULL;) { + jp_desc = NULL; + desc_value = NULL; + for (njp = 0; (name = va_arg(ap, char *)) != NULL; njp++) { value = va_arg(ap, char *); if (jailparam_init(jp + njp, name) < 0) goto error; - if (jailparam_import(jp + njp++, value) < 0) + if (jailparam_import(jp + njp, value) < 0) goto error; + if (!strcmp(name, "desc") + && (flags & (JAIL_GET_DESC | JAIL_OWN_DESC))) { + jp_desc = jp + njp; + desc_value = value; + } } va_end(ap); jid = jailparam_set(jp, njp, flags); + if (jid > 0 && jp_desc != NULL) + sprintf(desc_value, "%d", *(int *)jp_desc->jp_value); jailparam_free(jp, njp); return (jid); @@ -112,9 +122,10 @@ jail_getv(int flags, ...) { va_list ap, tap; - struct jailparam *jp, *jp_lastjid, *jp_jid, *jp_name, *jp_key; + struct jailparam *jp, *jp_desc, *jp_lastjid, *jp_jid, *jp_name, *jp_key; char *valarg, *value; - const char *name, *key_value, *lastjid_value, *jid_value, *name_value; + const char *name, *key_value, *desc_value, *lastjid_value, *jid_value; + const char *name_value; int njp, i, jid; /* Create the parameter list and find the key. 
*/ @@ -126,15 +137,19 @@ jp = alloca(njp * sizeof(struct jailparam)); va_copy(tap, ap); - jp_lastjid = jp_jid = jp_name = NULL; - lastjid_value = jid_value = name_value = NULL; + jp_desc = jp_lastjid = jp_jid = jp_name = NULL; + desc_value = lastjid_value = jid_value = name_value = NULL; for (njp = 0; (name = va_arg(tap, char *)) != NULL; njp++) { value = va_arg(tap, char *); if (jailparam_init(jp + njp, name) < 0) { va_end(tap); goto error; } - if (!strcmp(jp[njp].jp_name, "lastjid")) { + if (!strcmp(jp[njp].jp_name, "desc") + && (flags & (JAIL_USE_DESC | JAIL_AT_DESC))) { + jp_desc = jp + njp; + desc_value = value; + } else if (!strcmp(jp[njp].jp_name, "lastjid")) { jp_lastjid = jp + njp; lastjid_value = value; } else if (!strcmp(jp[njp].jp_name, "jid")) { @@ -147,7 +162,10 @@ } va_end(tap); /* Import the key parameter. */ - if (jp_lastjid != NULL) { + if (jp_desc != NULL && (flags & JAIL_USE_DESC)) { + jp_key = jp_desc; + key_value = desc_value; + } else if (jp_lastjid != NULL) { jp_key = jp_lastjid; key_value = lastjid_value; } else if (jp_jid != NULL && strtol(jid_value, NULL, 10) != 0) { @@ -163,6 +181,9 @@ } if (jailparam_import(jp_key, key_value) < 0) goto error; + if (jp_desc != NULL && jp_desc != jp_key + && jailparam_import(jp_desc, desc_value) < 0) + goto error; /* Get the jail and export the parameters. */ jid = jailparam_get(jp, njp, flags); if (jid < 0) @@ -571,7 +592,7 @@ jailparam_get(struct jailparam *jp, unsigned njp, int flags) { struct iovec *jiov; - struct jailparam *jp_lastjid, *jp_jid, *jp_name, *jp_key; + struct jailparam *jp_desc, *jp_lastjid, *jp_jid, *jp_name, *jp_key; int i, ai, ki, jid, arrays, sanity; unsigned j; @@ -580,10 +601,13 @@ * Find the key and any array parameters. 
*/ jiov = alloca(sizeof(struct iovec) * 2 * (njp + 1)); - jp_lastjid = jp_jid = jp_name = NULL; + jp_desc = jp_lastjid = jp_jid = jp_name = NULL; arrays = 0; for (ai = j = 0; j < njp; j++) { - if (!strcmp(jp[j].jp_name, "lastjid")) + if (!strcmp(jp[j].jp_name, "desc") + && (flags & (JAIL_USE_DESC | JAIL_AT_DESC))) + jp_desc = jp + j; + else if (!strcmp(jp[j].jp_name, "lastjid")) jp_lastjid = jp + j; else if (!strcmp(jp[j].jp_name, "jid")) jp_jid = jp + j; @@ -599,7 +623,9 @@ ai++; } } - jp_key = jp_lastjid ? jp_lastjid : + jp_key = jp_desc && jp_desc->jp_valuelen == sizeof(int) && + jp_desc->jp_value && (flags & JAIL_USE_DESC) ? jp_desc : + jp_lastjid ? jp_lastjid : jp_jid && jp_jid->jp_valuelen == sizeof(int) && jp_jid->jp_value && *(int *)jp_jid->jp_value ? jp_jid : jp_name; if (jp_key == NULL || jp_key->jp_value == NULL) { @@ -622,6 +648,14 @@ jiov[ki].iov_len = JAIL_ERRMSGLEN; ki++; jail_errmsg[0] = 0; + if (jp_desc != NULL && jp_desc != jp_key) { + jiov[ki].iov_base = jp_desc->jp_name; + jiov[ki].iov_len = strlen(jp_desc->jp_name) + 1; + ki++; + jiov[ki].iov_base = jp_desc->jp_value; + jiov[ki].iov_len = jp_desc->jp_valuelen; + ki++; + } if (arrays && jail_get(jiov, ki, flags) < 0) { if (!jail_errmsg[0]) snprintf(jail_errmsg, sizeof(jail_errmsg), @@ -649,7 +683,7 @@ jiov[ai].iov_base = jp[j].jp_value; memset(jiov[ai].iov_base, 0, jiov[ai].iov_len); ai++; - } else if (jp + j != jp_key) { + } else if (jp + j != jp_key && jp + j != jp_desc) { jiov[i].iov_base = jp[j].jp_name; jiov[i].iov_len = strlen(jp[j].jp_name) + 1; i++; diff --git a/lib/libsys/Symbol.sys.map b/lib/libsys/Symbol.sys.map --- a/lib/libsys/Symbol.sys.map +++ b/lib/libsys/Symbol.sys.map @@ -382,6 +382,8 @@ getrlimitusage; inotify_add_watch_at; inotify_rm_watch; + jail_attach_jd; + jail_remove_jd; kcmp; setcred; setgroups; diff --git a/lib/libsys/_libsys.h b/lib/libsys/_libsys.h --- a/lib/libsys/_libsys.h +++ b/lib/libsys/_libsys.h @@ -468,6 +468,8 @@ typedef int (__sys_inotify_rm_watch_t)(int, 
int); typedef int (__sys_getgroups_t)(int, gid_t *); typedef int (__sys_setgroups_t)(int, const gid_t *); +typedef int (__sys_jail_attach_jd_t)(int); +typedef int (__sys_jail_remove_jd_t)(int); _Noreturn void __sys__exit(int rval); int __sys_fork(void); @@ -872,6 +874,8 @@ int __sys_inotify_rm_watch(int fd, int wd); int __sys_getgroups(int gidsetsize, gid_t * gidset); int __sys_setgroups(int gidsetsize, const gid_t * gidset); +int __sys_jail_attach_jd(int fd); +int __sys_jail_remove_jd(int fd); __END_DECLS #endif /* __LIBSYS_H_ */ diff --git a/lib/libsys/jail.2 b/lib/libsys/jail.2 --- a/lib/libsys/jail.2 +++ b/lib/libsys/jail.2 @@ -23,7 +23,7 @@ .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE. .\" -.Dd November 29, 2023 +.Dd September 4, 2025 .Dt JAIL 2 .Os .Sh NAME @@ -31,7 +31,9 @@ .Nm jail_get , .Nm jail_set , .Nm jail_remove , -.Nm jail_attach +.Nm jail_attach , +.Nm jail_remove_jd , +.Nm jail_attach_jd .Nd create and manage system jails .Sh LIBRARY .Lb libc @@ -44,6 +46,10 @@ .Fn jail_attach "int jid" .Ft int .Fn jail_remove "int jid" +.Ft int +.Fn jail_attach_jd "int fd" +.Ft int +.Fn jail_remove_jd "int fd" .In sys/uio.h .Ft int .Fn jail_get "struct iovec *iov" "u_int niov" "int flags" @@ -188,6 +194,29 @@ This is deprecated in .Fn jail_set and has no effect. +.It Dv JAIL_USE_DESC +Identify the jail by a descriptor in the +.Va desc +parameter. +.It Dv JAIL_AT_DESC +Operate in the context of the jail described by the +.Va desc +parameter, instead of the current jail. +Only one of +.Dv JAIL_USE_DESC +or +.Dv JAIL_AT_DESC +may be specified. +.It Dv JAIL_GET_DESC +Return a new jail descriptor for the jail in the +.Va desc +parameter. +.It Dv JAIL_OWN_DESC +Return an +.Dq owning +jail descriptor in the +.Va desc +parameter. .El .Pp The @@ -221,6 +250,9 @@ .Bl -tag -width indent .It Dv JAIL_DYING Allow getting a jail that is in the process of being removed. 
+.It Dv JAIL_USE_DESC , Dv JAIL_AT_DESC , Dv JAIL_GET_DESC , Dv JAIL_OWN_DESC +These have the same meaning as they do in +.Fn jail_set . .El .Pp The @@ -238,6 +270,101 @@ .Fa jid . It will kill all processes belonging to the jail, and remove any children of that jail. +.Pp +The +.Fn jail_attach_jd +and +.Fn jail_remove_jd +system calls work the same as +.Fn jail_attach +and +.Fn jail_remove , +except that they operate on the jail identified by jail descriptor +.Fa fd . +.Ss Jail Descriptors +In addition to the jail ID, +jails can be referred to using a jail descriptor, +a type of file descriptor tied to a particular jail. +Jail descriptors are created by calling +.Fn jail_set +or +.Fn jail_get +with the special parameter +.Va desc , +and either the +.Dv JAIL_GET_DESC +or +.Dv JAIL_OWN_DESC +flags set. +The difference between the two flags is that descriptors created with +.Dv JAIL_OWN_DESC +.Po +called +.Dq owning +descriptors +.Pc +will automatically remove the jail when the descriptor is closed. +.Pp +Jail descriptors can be passed back to +.Fn jail_set +or +.Fn jail_get +with the +.Va desc +parameter, +and either the +.Dv JAIL_USE_DESC +or +.Dv JAIL_AT_DESC +flags set. +With +.Dv JAIL_USE_DESC , +the descriptor identifies the jail to operate on, +instead of the +.Va jid +or +.Va name +parameter. +With +.Dv JAIL_AT_DESC , +the descriptor is used in place of the current jail, +allowing accessing or creating jails that are children of the +descriptor jail. +.Pp +The system calls +.Fn jail_attach_jd +and +.Fn jail_remove_jd +work the same as +.Fn jail_attach +and +.Fn jail_remove , +except that they operate on the jail referred to by the passed descriptor. +.Pp +Jail operations via descriptors can be done by processes that do not +normally have permission to see or affect the jail, +as long as they are allowed by the file permissions of the jail +descriptor itself. +These permissions can be changed by the descriptor owner via +.Xr fchmod 2 +and +.Xr fchown 2 .
+.Fn jail_get +requires read permission, +.Fn jail_set +and +.Fn jail_remove +require write permission, +and +.Fn jail_attach +requires execute permission. +Also, use of a descriptor with the +.Dv JAIL_AT_DESC +flag requires execute permission. +An owning descriptor is identified by the +.Em sticky bit , +which may also be changed via +.Xr fchmod 2 . .Sh RETURN VALUES If successful, .Fn jail , @@ -249,7 +376,7 @@ .Va errno to indicate the error. .Pp -.Rv -std jail_attach jail_remove +.Rv -std jail_attach jail_remove jail_attach_jd jail_remove_jd .Sh ERRORS The .Fn jail @@ -275,12 +402,44 @@ system call will fail if: .Bl -tag -width Er +.It Bq Er EBADF +The +.Va desc +parameter does not refer to a valid jail descriptor, +and either the +.Dv JAIL_USE_DESC +or +.Dv JAIL_AT_DESC +flag was set. +.It Bq Er EACCES +Write permission is denied on the jail descriptor in the +.Va desc +parameter, +and the +.Dv JAIL_USE_DESC +flag was set. +.It Bq Er EACCES +Execute permission is denied on the jail descriptor in the +.Va desc +parameter, +and either the +.Dv JAIL_AT_DESC +or +.Dv JAIL_ATTACH +flag was set. .It Bq Er EPERM This process is not allowed to create a jail, either because it is not the super-user, or because it would exceed the jail's .Va children.max limit. .It Bq Er EPERM +The jail descriptor in the +.Va desc +parameter was created by a user other than the super-user, +and the +.Dv JAIL_USE_DESC +flag was set. +.It Bq Er EPERM A jail parameter was set to a less restrictive value then the current environment. .It Bq Er EFAULT @@ -298,8 +457,12 @@ .It Bq Er ENOENT The jail referred to by a .Va jid -is not accessible by the process, because the process is in a different -jail. +parameter is not accessible by the process, because the process is in a +different jail. +.It Bq Er ENOENT +The jail referred to by a +.Va desc +parameter has been removed. 
.It Bq Er EEXIST The jail referred to by a .Va jid @@ -326,6 +489,24 @@ A supplied string parameter is longer than allowed. .It Bq Er EAGAIN There are no jail IDs left. +.It Bq Er EMFILE +A jail descriptor could not be created for the +.Va desc +parameter with either the +.Dv JAIL_GET_DESC +or +.Dv JAIL_OWN_DESC +flag set, +because the process has already reached its limit for open file descriptors. +.It Bq Er ENFILE +A jail descriptor could not be created for the +.Va desc +parameter with either the +.Dv JAIL_GET_DESC +or +.Dv JAIL_OWN_DESC +flag set, +because the system file table is full. .El .Pp The @@ -333,6 +514,29 @@ system call will fail if: .Bl -tag -width Er +.It Bq Er EBADF +The +.Va desc +parameter does not refer to a valid jail descriptor, +and either the +.Dv JAIL_USE_DESC +or +.Dv JAIL_AT_DESC +flag was set. +.It Bq Er EACCES +Read permission is denied on the jail descriptor in the +.Va desc +parameter, +and the +.Dv JAIL_USE_DESC +flag was set. +.It Bq Er EACCES +Execute permission is denied on the jail descriptor in the +.Va desc +parameter, +and the +.Dv JAIL_AT_DESC +flag was set. .It Bq Er EFAULT .Fa Iov , or one of the addresses contained within it, @@ -352,10 +556,33 @@ The .Va lastjid parameter is greater than the highest current jail ID. +.It Bq Er ENOENT +The jail referred to by a +.Va desc +parameter has been removed +.Pq even if the Dv JAIL_CREATE flag has been set . .It Bq Er EINVAL A supplied parameter is the wrong size. .It Bq Er EINVAL A supplied parameter name does not match any known parameters. +.It Bq Er EMFILE +A jail descriptor could not be created for the +.Va desc +parameter with either the +.Dv JAIL_GET_DESC +or +.Dv JAIL_OWN_DESC +flag set, +because the process has already reached its limit for open file descriptors. +.It Bq Er ENFILE +A jail descriptor could not be created for the +.Va desc +parameter with either the +.Dv JAIL_GET_DESC +or +.Dv JAIL_OWN_DESC +flag set, +because the system file table is full. 
.El .Pp The @@ -373,11 +600,39 @@ does not exist. .El .Pp +The +.Fn jail_attach_jd +and +.Fn jail_remove_jd +system calls +will fail if: +.Bl -tag -width Er +.It Bq Er EBADF +The +.Fa fd +argument is not a valid jail descriptor. +.It Bq Er EACCES +Permission is denied on the jail descriptor +.Po +execute permission for +.Fn jail_attach_jd , +or write permission for +.Fn jail_remove_jd +.Pc . +.It Bq Er EPERM +The jail descriptor was created by a user other than the super-user. +.It Bq Er EINVAL +The jail specified by +.Fa fd +has been removed. +.El +.Pp Further .Fn jail , .Fn jail_set , +.Fn jail_attach , and -.Fn jail_attach +.Fn jail_attach_jd call .Xr chroot 2 internally, so they can fail for all the same reasons. diff --git a/lib/libsys/syscalls.map b/lib/libsys/syscalls.map --- a/lib/libsys/syscalls.map +++ b/lib/libsys/syscalls.map @@ -813,4 +813,8 @@ __sys_getgroups; _setgroups; __sys_setgroups; + _jail_attach_jd; + __sys_jail_attach_jd; + _jail_remove_jd; + __sys_jail_remove_jd; }; diff --git a/sys/compat/freebsd32/freebsd32_syscall.h b/sys/compat/freebsd32/freebsd32_syscall.h --- a/sys/compat/freebsd32/freebsd32_syscall.h +++ b/sys/compat/freebsd32/freebsd32_syscall.h @@ -515,4 +515,6 @@ #define FREEBSD32_SYS_inotify_rm_watch 594 #define FREEBSD32_SYS_getgroups 595 #define FREEBSD32_SYS_setgroups 596 -#define FREEBSD32_SYS_MAXSYSCALL 597 +#define FREEBSD32_SYS_jail_attach_jd 597 +#define FREEBSD32_SYS_jail_remove_jd 598 +#define FREEBSD32_SYS_MAXSYSCALL 599 diff --git a/sys/compat/freebsd32/freebsd32_syscalls.c b/sys/compat/freebsd32/freebsd32_syscalls.c --- a/sys/compat/freebsd32/freebsd32_syscalls.c +++ b/sys/compat/freebsd32/freebsd32_syscalls.c @@ -602,4 +602,6 @@ "inotify_rm_watch", /* 594 = inotify_rm_watch */ "getgroups", /* 595 = getgroups */ "setgroups", /* 596 = setgroups */ + "jail_attach_jd", /* 597 = jail_attach_jd */ + "jail_remove_jd", /* 598 = jail_remove_jd */ }; diff --git a/sys/compat/freebsd32/freebsd32_sysent.c
b/sys/compat/freebsd32/freebsd32_sysent.c --- a/sys/compat/freebsd32/freebsd32_sysent.c +++ b/sys/compat/freebsd32/freebsd32_sysent.c @@ -664,4 +664,6 @@ { .sy_narg = AS(inotify_rm_watch_args), .sy_call = (sy_call_t *)sys_inotify_rm_watch, .sy_auevent = AUE_INOTIFY, .sy_flags = SYF_CAPENABLED, .sy_thrcnt = SY_THR_STATIC }, /* 594 = inotify_rm_watch */ { .sy_narg = AS(getgroups_args), .sy_call = (sy_call_t *)sys_getgroups, .sy_auevent = AUE_GETGROUPS, .sy_flags = SYF_CAPENABLED, .sy_thrcnt = SY_THR_STATIC }, /* 595 = getgroups */ { .sy_narg = AS(setgroups_args), .sy_call = (sy_call_t *)sys_setgroups, .sy_auevent = AUE_SETGROUPS, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 596 = setgroups */ + { .sy_narg = AS(jail_attach_jd_args), .sy_call = (sy_call_t *)sys_jail_attach_jd, .sy_auevent = AUE_JAIL_ATTACH, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 597 = jail_attach_jd */ + { .sy_narg = AS(jail_remove_jd_args), .sy_call = (sy_call_t *)sys_jail_remove_jd, .sy_auevent = AUE_JAIL_REMOVE, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 598 = jail_remove_jd */ }; diff --git a/sys/compat/freebsd32/freebsd32_systrace_args.c b/sys/compat/freebsd32/freebsd32_systrace_args.c --- a/sys/compat/freebsd32/freebsd32_systrace_args.c +++ b/sys/compat/freebsd32/freebsd32_systrace_args.c @@ -3413,6 +3413,20 @@ *n_args = 2; break; } + /* jail_attach_jd */ + case 597: { + struct jail_attach_jd_args *p = params; + iarg[a++] = p->fd; /* int */ + *n_args = 1; + break; + } + /* jail_remove_jd */ + case 598: { + struct jail_remove_jd_args *p = params; + iarg[a++] = p->fd; /* int */ + *n_args = 1; + break; + } default: *n_args = 0; break; @@ -9222,6 +9236,26 @@ break; }; break; + /* jail_attach_jd */ + case 597: + switch (ndx) { + case 0: + p = "int"; + break; + default: + break; + }; + break; + /* jail_remove_jd */ + case 598: + switch (ndx) { + case 0: + p = "int"; + break; + default: + break; + }; + break; default: break; }; @@ -11130,6 +11164,16 @@ if (ndx == 0 || ndx == 1) p = 
"int"; break; + /* jail_attach_jd */ + case 597: + if (ndx == 0 || ndx == 1) + p = "int"; + break; + /* jail_remove_jd */ + case 598: + if (ndx == 0 || ndx == 1) + p = "int"; + break; default: break; }; diff --git a/sys/conf/files b/sys/conf/files --- a/sys/conf/files +++ b/sys/conf/files @@ -3808,6 +3808,7 @@ kern/kern_idle.c standard kern/kern_intr.c standard kern/kern_jail.c standard +kern/kern_jaildesc.c standard kern/kern_jailmeta.c standard kern/kern_kcov.c optional kcov \ compile-with "${NOSAN_C} ${MSAN_CFLAGS}" diff --git a/sys/kern/init_sysent.c b/sys/kern/init_sysent.c --- a/sys/kern/init_sysent.c +++ b/sys/kern/init_sysent.c @@ -663,4 +663,6 @@ { .sy_narg = AS(inotify_rm_watch_args), .sy_call = (sy_call_t *)sys_inotify_rm_watch, .sy_auevent = AUE_INOTIFY, .sy_flags = SYF_CAPENABLED, .sy_thrcnt = SY_THR_STATIC }, /* 594 = inotify_rm_watch */ { .sy_narg = AS(getgroups_args), .sy_call = (sy_call_t *)sys_getgroups, .sy_auevent = AUE_GETGROUPS, .sy_flags = SYF_CAPENABLED, .sy_thrcnt = SY_THR_STATIC }, /* 595 = getgroups */ { .sy_narg = AS(setgroups_args), .sy_call = (sy_call_t *)sys_setgroups, .sy_auevent = AUE_SETGROUPS, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 596 = setgroups */ + { .sy_narg = AS(jail_attach_jd_args), .sy_call = (sy_call_t *)sys_jail_attach_jd, .sy_auevent = AUE_JAIL_ATTACH, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 597 = jail_attach_jd */ + { .sy_narg = AS(jail_remove_jd_args), .sy_call = (sy_call_t *)sys_jail_remove_jd, .sy_auevent = AUE_JAIL_REMOVE, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 598 = jail_remove_jd */ }; diff --git a/sys/kern/kern_descrip.c b/sys/kern/kern_descrip.c --- a/sys/kern/kern_descrip.c +++ b/sys/kern/kern_descrip.c @@ -5250,6 +5250,8 @@ return ("eventfd"); case DTYPE_TIMERFD: return ("timerfd"); + case DTYPE_JAILDESC: + return ("jail"); default: return ("unkn"); } diff --git a/sys/kern/kern_jail.c b/sys/kern/kern_jail.c --- a/sys/kern/kern_jail.c +++ b/sys/kern/kern_jail.c @@ -39,6 +39,7 @@ 
#include #include #include +#include #include #include #include @@ -49,6 +50,7 @@ #include #include #include +#include #include #include #include @@ -988,6 +990,8 @@ int kern_jail_set(struct thread *td, struct uio *optuio, int flags) { + struct file *jfp_out; + struct jaildesc *desc_in; struct nameidata nd; #ifdef INET struct prison_ip *ip4; @@ -998,6 +1002,7 @@ struct vfsopt *opt; struct vfsoptlist *opts; struct prison *pr, *deadpr, *dinspr, *inspr, *mypr, *ppr, *tpr; + struct ucred *jdcred; struct vnode *root; char *domain, *errmsg, *host, *name, *namelc, *p, *path, *uuid; char *g_path, *osrelstr; @@ -1011,7 +1016,7 @@ int created, cuflags, descend, drflags, enforce; int error, errmsg_len, errmsg_pos; int gotchildmax, gotenforce, gothid, gotrsnum, gotslevel; - int deadid, jid, jsys, len, level; + int deadid, jfd_in, jfd_out, jfd_pos, jid, jsys, len, level; int childmax, osreldt, rsnum, slevel; #ifdef INET int ip4s; @@ -1027,17 +1032,26 @@ unsigned tallow; char numbuf[12]; - error = priv_check(td, PRIV_JAIL_SET); - if (!error && (flags & JAIL_ATTACH)) - error = priv_check(td, PRIV_JAIL_ATTACH); - if (error) - return (error); mypr = td->td_ucred->cr_prison; - if ((flags & JAIL_CREATE) && mypr->pr_childmax == 0) + if (((flags & (JAIL_CREATE | JAIL_AT_DESC)) == JAIL_CREATE) + && mypr->pr_childmax == 0) return (EPERM); if (flags & ~JAIL_SET_MASK) return (EINVAL); + if ((flags & (JAIL_USE_DESC | JAIL_AT_DESC)) + == (JAIL_USE_DESC | JAIL_AT_DESC)) + return (EINVAL); + prison_hold(mypr); +#ifdef INET + ip4 = NULL; +#endif +#ifdef INET6 + ip6 = NULL; +#endif + g_path = NULL; + jfp_out = NULL; + jfd_out = -1; /* * Check all the parameters before committing to anything. Not all * errors can be caught early, but we may as well try. 
Also, this @@ -1050,14 +1064,7 @@ */ error = vfs_buildopts(optuio, &opts); if (error) - return (error); -#ifdef INET - ip4 = NULL; -#endif -#ifdef INET6 - ip6 = NULL; -#endif - g_path = NULL; + goto done_free; cuflags = flags & (JAIL_CREATE | JAIL_UPDATE); if (!cuflags) { @@ -1066,6 +1073,72 @@ goto done_errmsg; } + error = vfs_copyopt(opts, "desc", &jfd_in, sizeof(jfd_in)); + if (error == ENOENT) { + if (flags & (JAIL_USE_DESC | JAIL_AT_DESC | JAIL_GET_DESC | + JAIL_OWN_DESC)) { + vfs_opterror(opts, "missing desc"); + goto done_errmsg; + } + jfd_in = -1; + } else if (error != 0) + goto done_free; + else { + if (!(flags & (JAIL_USE_DESC | JAIL_AT_DESC | JAIL_GET_DESC | + JAIL_OWN_DESC))) { + vfs_opterror(opts, "unexpected desc"); + goto done_errmsg; + } + if (flags & JAIL_AT_DESC) { + /* + * Look up and create jails based on the + * descriptor's prison. + */ + prison_free(mypr); + error = jaildesc_find(td, jfd_in, &desc_in, &mypr, + NULL); + if (error != 0) { + vfs_opterror(opts, error == ENOENT + ? "descriptor to dead jail" + : "not a jail descriptor"); + goto done_errmsg; + } + /* + * Check file permissions using the current + * credentials, and operation permissions + * using the descriptor's credentials. + */ + error = vaccess(VREG, desc_in->jd_mode, desc_in->jd_uid, + desc_in->jd_gid, VEXEC, td->td_ucred); + JAILDESC_UNLOCK(desc_in); + if (error != 0) + goto done_free; + if ((flags & JAIL_CREATE) && mypr->pr_childmax == 0) { + error = EPERM; + goto done_free; + } + } + if (flags & (JAIL_GET_DESC | JAIL_OWN_DESC)) { + /* Allocate a jail descriptor to return later. */ + error = jaildesc_alloc(td, &jfp_out, &jfd_out, + flags & JAIL_OWN_DESC); + if (error) + goto done_free; + } + } + + /* + * Delay the permission check if using a jail descriptor, + * until we get the descriptor's credentials. 
+ */ + if (!(flags & JAIL_USE_DESC)) { + error = priv_check(td, PRIV_JAIL_SET); + if (error == 0 && (flags & JAIL_ATTACH)) + error = priv_check(td, PRIV_JAIL_ATTACH); + if (error) + goto done_free; + } + error = vfs_copyopt(opts, "jid", &jid, sizeof(jid)); if (error == ENOENT) jid = 0; @@ -1441,7 +1514,57 @@ error = EAGAIN; goto done_deref; } - if (jid != 0) { + if (flags & JAIL_USE_DESC) { + /* Get the jail from its descriptor. */ + error = jaildesc_find(td, jfd_in, &desc_in, &pr, &jdcred); + if (error) { + vfs_opterror(opts, error == ENOENT + ? "descriptor to dead jail" + : "not a jail descriptor"); + goto done_deref; + } + drflags |= PD_DEREF; + /* + * Check file permissions using the current credentials, + * and operation permissions using the descriptor's + * credentials. + */ + error = vaccess(VREG, desc_in->jd_mode, desc_in->jd_uid, + desc_in->jd_gid, VWRITE, td->td_ucred); + if (error == 0 && (flags & JAIL_ATTACH)) + error = vaccess(VREG, desc_in->jd_mode, desc_in->jd_uid, + desc_in->jd_gid, VEXEC, td->td_ucred); + JAILDESC_UNLOCK(desc_in); + if (error == 0) + error = priv_check_cred(jdcred, PRIV_JAIL_SET); + if (error == 0 && (flags & JAIL_ATTACH)) + error = priv_check_cred(jdcred, PRIV_JAIL_ATTACH); + crfree(jdcred); + if (error) + goto done_deref; + mtx_lock(&pr->pr_mtx); + drflags |= PD_LOCKED; + if (cuflags == JAIL_CREATE) { + error = EEXIST; + vfs_opterror(opts, "jail %d already exists", + pr->pr_id); + goto done_deref; + } + if (!prison_isalive(pr)) { + /* While a jid can be resurrected, the prison + * itself cannot. + */ + error = ENOENT; + vfs_opterror(opts, "jail %d is dying", pr->pr_id); + goto done_deref; + } + if (jid != 0 && jid != pr->pr_id) { + error = EINVAL; + vfs_opterror(opts, "cannot change jid"); + goto done_deref; + } + jid = pr->pr_id; + } else if (jid != 0) { if (jid < 0) { error = EINVAL; vfs_opterror(opts, "negative jid"); @@ -1575,7 +1698,7 @@ } } } - /* Update: must provide a jid or name. 
*/ + /* Update: must provide a desc, jid, or name. */ else if (cuflags == JAIL_UPDATE && pr == NULL) { error = ENOENT; vfs_opterror(opts, "update specified no jail"); @@ -1728,8 +1851,10 @@ * Grab a reference for existing prisons, to ensure they * continue to exist for the duration of the call. */ - prison_hold(pr); - drflags |= PD_DEREF; + if (!(drflags & PD_DEREF)) { + prison_hold(pr); + drflags |= PD_DEREF; + } #if defined(VIMAGE) && (defined(INET) || defined(INET6)) if ((pr->pr_flags & PR_VNET) && (ch_flags & (PR_IP4_USER | PR_IP6_USER))) { @@ -2158,6 +2283,26 @@ printf("Warning jail jid=%d: mountd/nfsd requires a separate" " file system\n", pr->pr_id); + /* + * Now that the prison is fully created without error, set the + * jail descriptor if one was requested. This is the only + * parameter that is returned to the caller (except the error + * message). + */ + if (jfd_out >= 0) { + if (!(drflags & PD_LOCKED)) { + mtx_lock(&pr->pr_mtx); + drflags |= PD_LOCKED; + } + jfd_pos = 2 * vfs_getopt_pos(opts, "desc") + 1; + if (optuio->uio_segflg == UIO_SYSSPACE) + *(int*)optuio->uio_iov[jfd_pos].iov_base = jfd_out; + else + (void)copyout(&jfd_out, + optuio->uio_iov[jfd_pos].iov_base, sizeof(jfd_out)); + jaildesc_set_prison(jfp_out, pr); + } + drflags &= ~PD_KILL; td->td_retval[0] = pr->pr_id; @@ -2195,15 +2340,21 @@ } } done_free: + /* Clean up other resources. 
*/ #ifdef INET prison_ip_free(ip4); #endif #ifdef INET6 prison_ip_free(ip6); #endif + if (jfp_out != NULL) + fdrop(jfp_out, td); + if (error && jfd_out >= 0) + (void)kern_close(td, jfd_out); if (g_path != NULL) free(g_path, M_TEMP); vfs_freeopts(opts); + prison_free(mypr); return (error); } @@ -2348,16 +2499,22 @@ kern_jail_get(struct thread *td, struct uio *optuio, int flags) { struct bool_flags *bf; + struct file *jfp_out; + struct jaildesc *desc_in; struct jailsys_flags *jsf; struct prison *pr, *mypr; struct vfsopt *opt; struct vfsoptlist *opts; char *errmsg, *name; int drflags, error, errmsg_len, errmsg_pos, i, jid, len, pos; + int jfd_in, jfd_out; unsigned f; if (flags & ~JAIL_GET_MASK) return (EINVAL); + if ((flags & (JAIL_USE_DESC | JAIL_AT_DESC)) + == (JAIL_USE_DESC | JAIL_AT_DESC)) + return (EINVAL); /* Get the parameter list. */ error = vfs_buildopts(optuio, &opts); @@ -2365,13 +2522,81 @@ return (error); errmsg_pos = vfs_getopt_pos(opts, "errmsg"); mypr = td->td_ucred->cr_prison; + prison_hold(mypr); pr = NULL; + jfp_out = NULL; + jfd_out = -1; /* - * Find the prison specified by one of: lastjid, jid, name. + * Find the prison specified by one of: desc, lastjid, jid, name. */ sx_slock(&allprison_lock); drflags = PD_LIST_SLOCKED; + + error = vfs_copyopt(opts, "desc", &jfd_in, sizeof(jfd_in)); + if (error == ENOENT) { + if (flags & (JAIL_AT_DESC | JAIL_GET_DESC | JAIL_OWN_DESC)) { + vfs_opterror(opts, "missing desc"); + goto done; + } + } else if (error == 0) { + if (!(flags & (JAIL_USE_DESC | JAIL_AT_DESC | JAIL_GET_DESC | + JAIL_OWN_DESC))) { + vfs_opterror(opts, "unexpected desc"); + goto done; + } + if (flags & JAIL_USE_DESC) { + /* Get the jail from its descriptor. */ + error = jaildesc_find(td, jfd_in, &desc_in, &pr, NULL); + if (error) { + vfs_opterror(opts, error == ENOENT + ? 
"descriptor to dead jail" + : "not a jail descriptor"); + goto done; + } + drflags |= PD_DEREF; + error = vaccess(VREG, desc_in->jd_mode, desc_in->jd_uid, + desc_in->jd_gid, VREAD, td->td_ucred); + JAILDESC_UNLOCK(desc_in); + if (error != 0) + goto done; + mtx_lock(&pr->pr_mtx); + drflags |= PD_LOCKED; + if (!(prison_isalive(pr) || (flags & JAIL_DYING))) { + error = ENOENT; + vfs_opterror(opts, "jail %d is dying", + pr->pr_id); + goto done; + } + goto found_prison; + } + if (flags & JAIL_AT_DESC) { + /* Look up jails based on the descriptor's prison. */ + prison_free(mypr); + error = jaildesc_find(td, jfd_in, &desc_in, &mypr, + NULL); + if (error != 0) { + vfs_opterror(opts, error == ENOENT + ? "descriptor to dead jail" + : "not a jail descriptor"); + goto done; + } + error = vaccess(VREG, desc_in->jd_mode, desc_in->jd_uid, + desc_in->jd_gid, VEXEC, td->td_ucred); + JAILDESC_UNLOCK(desc_in); + if (error != 0) + goto done; + } + if (flags & (JAIL_GET_DESC | JAIL_OWN_DESC)) { + /* Allocate a jail descriptor to return later. */ + error = jaildesc_alloc(td, &jfp_out, &jfd_out, + flags & JAIL_OWN_DESC); + if (error) + goto done; + } + } else + goto done; + error = vfs_copyopt(opts, "lastjid", &jid, sizeof(jid)); if (error == 0) { TAILQ_FOREACH(pr, &allprison, pr_list) { @@ -2440,9 +2665,17 @@ found_prison: /* Get the parameters of the prison. 
*/ - prison_hold(pr); - drflags |= PD_DEREF; + if (!(drflags & PD_DEREF)) { + prison_hold(pr); + drflags |= PD_DEREF; + } td->td_retval[0] = pr->pr_id; + if (jfd_out >= 0) { + error = vfs_setopt(opts, "desc", &jfd_out, sizeof(jfd_out)); + if (error != 0 && error != ENOENT) + goto done; + jaildesc_set_prison(jfp_out, pr); + } error = vfs_setopt(opts, "jid", &pr->pr_id, sizeof(pr->pr_id)); if (error != 0 && error != ENOENT) goto done; @@ -2622,6 +2855,13 @@ prison_deref(pr, drflags); else if (drflags & PD_LIST_SLOCKED) sx_sunlock(&allprison_lock); + else if (drflags & PD_LIST_XLOCKED) + sx_xunlock(&allprison_lock); + /* Clean up other resources. */ + if (jfp_out != NULL) + (void)fdrop(jfp_out, td); + if (error && jfd_out >= 0) + (void)kern_close(td, jfd_out); if (error && errmsg_pos >= 0) { /* Write the error message back to userspace. */ vfs_getopt(opts, "errmsg", (void **)&errmsg, &errmsg_len); @@ -2638,6 +2878,7 @@ } } vfs_freeopts(opts); + prison_free(mypr); return (error); } @@ -2662,14 +2903,63 @@ sx_xunlock(&allprison_lock); return (EINVAL); } + prison_hold(pr); + prison_remove(pr); + return (0); +} + +/* + * struct jail_remove_jd_args { + * int fd; + * }; + */ +int +sys_jail_remove_jd(struct thread *td, struct jail_remove_jd_args *uap) +{ + struct jaildesc *jd; + struct prison *pr; + struct ucred *jdcred; + int error; + + error = jaildesc_find(td, uap->fd, &jd, &pr, &jdcred); + if (error) + return (error); + /* + * Check file permissions using the current credentials, and + * operation permissions using the descriptor's credentials. + */ + error = vaccess(VREG, jd->jd_mode, jd->jd_uid, jd->jd_gid, VWRITE, + td->td_ucred); + JAILDESC_UNLOCK(jd); + if (error == 0) + error = priv_check_cred(jdcred, PRIV_JAIL_REMOVE); + crfree(jdcred); + if (error) { + prison_free(pr); + return (error); + } + sx_xlock(&allprison_lock); + mtx_lock(&pr->pr_mtx); + prison_remove(pr); + return (0); +} + +/* + * Begin the removal process for a prison. 
The allprison lock should + * be held exclusively, and the prison should be both locked and held. + */ +void +prison_remove(struct prison *pr) +{ + sx_assert(&allprison_lock, SA_XLOCKED); + mtx_assert(&pr->pr_mtx, MA_OWNED); if (!prison_isalive(pr)) { /* Silently ignore already-dying prisons. */ mtx_unlock(&pr->pr_mtx); sx_xunlock(&allprison_lock); - return (0); + return; } - prison_deref(pr, PD_KILL | PD_LOCKED | PD_LIST_XLOCKED); - return (0); + prison_deref(pr, PD_KILL | PD_DEREF | PD_LOCKED | PD_LIST_XLOCKED); } /* @@ -2704,6 +2994,53 @@ return (do_jail_attach(td, pr, PD_LOCKED | PD_LIST_SLOCKED)); } +/* + * struct jail_attach_jd_args { + * int fd; + * }; + */ +int +sys_jail_attach_jd(struct thread *td, struct jail_attach_jd_args *uap) +{ + struct jaildesc *jd; + struct prison *pr; + struct ucred *jdcred; + int drflags, error; + + sx_slock(&allprison_lock); + drflags = PD_LIST_SLOCKED; + error = jaildesc_find(td, uap->fd, &jd, &pr, &jdcred); + if (error) + goto fail; + drflags |= PD_DEREF; + /* + * Check file permissions using the current credentials, and + * operation permissions using the descriptor's credentials. + */ + error = vaccess(VREG, jd->jd_mode, jd->jd_uid, jd->jd_gid, VEXEC, + td->td_ucred); + JAILDESC_UNLOCK(jd); + if (error == 0) + error = priv_check_cred(jdcred, PRIV_JAIL_ATTACH); + crfree(jdcred); + if (error) + goto fail; + mtx_lock(&pr->pr_mtx); + drflags |= PD_LOCKED; + + /* Do not allow a process to attach to a prison that is not alive. */ + if (!prison_isalive(pr)) { + error = EINVAL; + goto fail; + } + + return (do_jail_attach(td, pr, drflags)); + + fail: + prison_deref(pr, drflags); + return (error); +} + static int do_jail_attach(struct thread *td, struct prison *pr, int drflags) { @@ -2722,9 +3059,12 @@ * a process root from one prison, but attached to the jail * of another. 
*/ - prison_hold(pr); + if (!(drflags & PD_DEREF)) { + prison_hold(pr); + drflags |= PD_DEREF; + } refcount_acquire(&pr->pr_uref); - drflags |= PD_DEREF | PD_DEUREF; + drflags |= PD_DEUREF; mtx_unlock(&pr->pr_mtx); drflags &= ~PD_LOCKED; @@ -3444,6 +3784,7 @@ mtx_assert(&pr->pr_mtx, MA_OWNED); prison_knote(pr, NOTE_JAIL_REMOVE); knlist_detach(pr->pr_klist); + jaildesc_prison_cleanup(pr); pr->pr_klist = NULL; } @@ -4650,6 +4991,7 @@ * jail creation time but cannot be changed in an existing jail. */ SYSCTL_JAIL_PARAM(, jid, CTLTYPE_INT | CTLFLAG_RDTUN, "I", "Jail ID"); +SYSCTL_JAIL_PARAM(, desc, CTLTYPE_INT | CTLFLAG_RW, "I", "Jail descriptor"); SYSCTL_JAIL_PARAM(, parent, CTLTYPE_INT | CTLFLAG_RD, "I", "Jail parent ID"); SYSCTL_JAIL_PARAM_STRING(, name, CTLFLAG_RW, MAXHOSTNAMELEN, "Jail name"); SYSCTL_JAIL_PARAM_STRING(, path, CTLFLAG_RDTUN, MAXPATHLEN, "Jail root path"); diff --git a/sys/kern/kern_jaildesc.c b/sys/kern/kern_jaildesc.c new file mode 100644 --- /dev/null +++ b/sys/kern/kern_jaildesc.c @@ -0,0 +1,337 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2025 James Gritton. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +MALLOC_DEFINE(M_JAILDESC, "jaildesc", "jail descriptors"); + +static fo_stat_t jaildesc_stat; +static fo_close_t jaildesc_close; +static fo_chmod_t jaildesc_chmod; +static fo_chown_t jaildesc_chown; +static fo_fill_kinfo_t jaildesc_fill_kinfo; +static fo_cmp_t jaildesc_cmp; + +static struct fileops jaildesc_ops = { + .fo_read = invfo_rdwr, + .fo_write = invfo_rdwr, + .fo_truncate = invfo_truncate, + .fo_ioctl = invfo_ioctl, + .fo_poll = invfo_poll, + .fo_kqfilter = invfo_kqfilter, + .fo_stat = jaildesc_stat, + .fo_close = jaildesc_close, + .fo_chmod = jaildesc_chmod, + .fo_chown = jaildesc_chown, + .fo_sendfile = invfo_sendfile, + .fo_fill_kinfo = jaildesc_fill_kinfo, + .fo_cmp = jaildesc_cmp, + .fo_flags = DFLAG_PASSABLE, +}; + +/* + * Given a jail descriptor number, return the jaildesc, its prison, + * and its credential. The jaildesc will be returned locked, and + * prison and the credential will be returned held. 
+ */ +int +jaildesc_find(struct thread *td, int fd, struct jaildesc **jdp, + struct prison **prp, struct ucred **ucredp) +{ + struct file *fp; + struct jaildesc *jd; + struct prison *pr; + int error; + + error = fget(td, fd, &cap_no_rights, &fp); + if (error != 0) + return (error); + if (fp->f_type != DTYPE_JAILDESC) { + error = EBADF; + goto out; + } + jd = fp->f_data; + JAILDESC_LOCK(jd); + pr = jd->jd_prison; + if (pr == NULL || !prison_isvalid(pr)) { + error = ENOENT; + JAILDESC_UNLOCK(jd); + goto out; + } + prison_hold(pr); + *prp = pr; + if (jdp != NULL) + *jdp = jd; + else + JAILDESC_UNLOCK(jd); + if (ucredp != NULL) + *ucredp = crhold(fp->f_cred); + out: + fdrop(fp, td); + return (error); +} + +/* + * Allocate a new jail decriptor, not yet associated with a prison. + * Return the file pointer (with a reference held) and the descriptor + * number. + */ +int +jaildesc_alloc(struct thread *td, struct file **fpp, int *fdp, int owning) +{ + struct file *fp; + struct jaildesc *jd; + int error; + mode_t mode; + + if (owning) { + error = priv_check(td, PRIV_JAIL_REMOVE); + if (error != 0) + return (error); + mode = S_ISTXT; + } else + mode = 0; + jd = malloc(sizeof(*jd), M_JAILDESC, M_WAITOK | M_ZERO); + error = falloc_caps(td, &fp, fdp, 0, NULL); + finit(fp, priv_check_cred(fp->f_cred, PRIV_JAIL_SET) == 0 + ? FREAD | FWRITE : FREAD, DTYPE_JAILDESC, jd, &jaildesc_ops); + if (error != 0) { + free(jd, M_JAILDESC); + return (error); + } + JAILDESC_LOCK_INIT(jd); + jd->jd_uid = fp->f_cred->cr_uid; + jd->jd_gid = fp->f_cred->cr_gid; + jd->jd_mode = S_IFREG | S_IRUSR | S_IRGRP | S_IROTH | mode + | (priv_check(td, PRIV_JAIL_SET) == 0 ? S_IWUSR | S_IXUSR : 0) + | (priv_check(td, PRIV_JAIL_ATTACH) == 0 ? S_IXUSR : 0); + *fpp = fp; + return (0); +} + +/* + * Assocate a jail descriptor with its prison. 
+ */ +void +jaildesc_set_prison(struct file *fp, struct prison *pr) +{ + struct jaildesc *jd; + + mtx_assert(&pr->pr_mtx, MA_OWNED); + jd = fp->f_data; + JAILDESC_LOCK(jd); + jd->jd_prison = pr; + LIST_INSERT_HEAD(&pr->pr_descs, jd, jd_list); + prison_hold(pr); + JAILDESC_UNLOCK(jd); +} + +/* + * Detach the all jail descriptors from a prison. + */ +void +jaildesc_prison_cleanup(struct prison *pr) +{ + struct jaildesc *jd; + + mtx_assert(&pr->pr_mtx, MA_OWNED); + while ((jd = LIST_FIRST(&pr->pr_descs))) { + JAILDESC_LOCK(jd); + LIST_REMOVE(jd, jd_list); + jd->jd_prison = NULL; + JAILDESC_UNLOCK(jd); + prison_free(pr); + } +} + +static int +jaildesc_close(struct file *fp, struct thread *td) +{ + struct jaildesc *jd; + struct prison *pr; + + jd = fp->f_data; + fp->f_data = NULL; + if (jd != NULL) { + JAILDESC_LOCK(jd); + pr = jd->jd_prison; + if (pr != NULL) { + /* + * Free or remove the associated prison. + * This requires a second check after re- + * ordering locks. This jaildesc can remain + * unlocked once we have a prison reference, + * because that prison is the only place that + * still points back to it. + */ + prison_hold(pr); + JAILDESC_UNLOCK(jd); + if (jd->jd_mode & S_ISTXT) { + sx_xlock(&allprison_lock); + prison_lock(pr); + if (jd->jd_prison != NULL) { + /* + * Unlink the prison, but don't free + * it; that will be done as part of + * of prison_remove. 
+ */ + LIST_REMOVE(jd, jd_list); + prison_remove(pr); + } else { + prison_unlock(pr); + sx_xunlock(&allprison_lock); + } + } else { + prison_lock(pr); + if (jd->jd_prison != NULL) { + LIST_REMOVE(jd, jd_list); + prison_free(pr); + } + prison_unlock(pr); + } + prison_free(pr); + } + JAILDESC_LOCK_DESTROY(jd); + free(jd, M_JAILDESC); + } + finit(fp, 0, DTYPE_NONE, NULL, &badfileops); + return (0); +} + +static int +jaildesc_stat(struct file *fp, struct stat *sb, struct ucred *active_cred) +{ + struct jaildesc *jd; + + bzero(sb, sizeof(struct stat)); + jd = fp->f_data; + JAILDESC_LOCK(jd); + if (jd->jd_prison != NULL) { + sb->st_ino = jd->jd_prison ? jd->jd_prison->pr_id : 0; + sb->st_uid = jd->jd_uid; + sb->st_gid = jd->jd_gid; + sb->st_mode = jd->jd_mode; + } else + sb->st_mode = S_IFREG; + JAILDESC_UNLOCK(jd); + return (0); +} + +static int +jaildesc_chmod(struct file *fp, mode_t mode, struct ucred *active_cred, + struct thread *td) +{ + struct jaildesc *jd; + int error; + + /* Reject permissions that the creator doesn't have. 
*/ + if (((mode & (S_IWUSR | S_IWGRP | S_IWOTH)) + && priv_check_cred(fp->f_cred, PRIV_JAIL_SET) != 0) + || ((mode & (S_IXUSR | S_IXGRP | S_IXOTH)) + && priv_check_cred(fp->f_cred, PRIV_JAIL_ATTACH) != 0 + && priv_check_cred(fp->f_cred, PRIV_JAIL_SET) != 0) + || ((mode & S_ISTXT) + && priv_check_cred(fp->f_cred, PRIV_JAIL_REMOVE) != 0)) + return (EPERM); + if (mode & (S_ISUID | S_ISGID)) + return (EINVAL); + jd = fp->f_data; + JAILDESC_LOCK(jd); + error = vaccess(VREG, jd->jd_mode, jd->jd_uid, jd->jd_gid, VADMIN, + active_cred); + if (error == 0) + jd->jd_mode = S_IFREG | (mode & ALLPERMS); + JAILDESC_UNLOCK(jd); + return (error); +} + +static int +jaildesc_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred, + struct thread *td) +{ + struct jaildesc *jd; + int error; + + error = 0; + jd = fp->f_data; + JAILDESC_LOCK(jd); + if (uid == (uid_t)-1) + uid = jd->jd_uid; + if (gid == (gid_t)-1) + gid = jd->jd_gid; + if ((uid != jd->jd_uid && uid != active_cred->cr_uid) || + (gid != jd->jd_gid && !groupmember(gid, active_cred))) + error = priv_check_cred(active_cred, PRIV_VFS_CHOWN); + if (error == 0) { + jd->jd_uid = uid; + jd->jd_gid = gid; + } + JAILDESC_UNLOCK(jd); + return (error); +} + +static int +jaildesc_fill_kinfo(struct file *fp, struct kinfo_file *kif, + struct filedesc *fdp) +{ + return (EINVAL); +} + +static int +jaildesc_cmp(struct file *fp1, struct file *fp2, struct thread *td) +{ + struct jaildesc *jd1, *jd2; + int jid1, jid2; + + if (fp2->f_type != DTYPE_JAILDESC) + return (3); + jd1 = fp1->f_data; + JAILDESC_LOCK(jd1); + jid1 = jd1->jd_prison ? (uintptr_t)jd1->jd_prison->pr_id : 0; + JAILDESC_UNLOCK(jd1); + jd2 = fp2->f_data; + JAILDESC_LOCK(jd2); + jid2 = jd2->jd_prison ? 
(uintptr_t)jd2->jd_prison->pr_id : 0; + JAILDESC_UNLOCK(jd2); + return (kcmp_cmp(jid1, jid2)); +} diff --git a/sys/kern/syscalls.c b/sys/kern/syscalls.c --- a/sys/kern/syscalls.c +++ b/sys/kern/syscalls.c @@ -602,4 +602,6 @@ "inotify_rm_watch", /* 594 = inotify_rm_watch */ "getgroups", /* 595 = getgroups */ "setgroups", /* 596 = setgroups */ + "jail_attach_jd", /* 597 = jail_attach_jd */ + "jail_remove_jd", /* 598 = jail_remove_jd */ }; diff --git a/sys/kern/syscalls.master b/sys/kern/syscalls.master --- a/sys/kern/syscalls.master +++ b/sys/kern/syscalls.master @@ -3383,5 +3383,15 @@ _In_reads_(gidsetsize) const gid_t *gidset ); } +597 AUE_JAIL_ATTACH STD { + int jail_attach_jd( + int fd + ); + } +598 AUE_JAIL_REMOVE STD { + int jail_remove_jd( + int fd + ); + } ; vim: syntax=off diff --git a/sys/kern/systrace_args.c b/sys/kern/systrace_args.c --- a/sys/kern/systrace_args.c +++ b/sys/kern/systrace_args.c @@ -3500,6 +3500,20 @@ *n_args = 2; break; } + /* jail_attach_jd */ + case 597: { + struct jail_attach_jd_args *p = params; + iarg[a++] = p->fd; /* int */ + *n_args = 1; + break; + } + /* jail_remove_jd */ + case 598: { + struct jail_remove_jd_args *p = params; + iarg[a++] = p->fd; /* int */ + *n_args = 1; + break; + } default: *n_args = 0; break; @@ -9367,6 +9381,26 @@ break; }; break; + /* jail_attach_jd */ + case 597: + switch (ndx) { + case 0: + p = "int"; + break; + default: + break; + }; + break; + /* jail_remove_jd */ + case 598: + switch (ndx) { + case 0: + p = "int"; + break; + default: + break; + }; + break; default: break; }; @@ -11365,6 +11399,16 @@ if (ndx == 0 || ndx == 1) p = "int"; break; + /* jail_attach_jd */ + case 597: + if (ndx == 0 || ndx == 1) + p = "int"; + break; + /* jail_remove_jd */ + case 598: + if (ndx == 0 || ndx == 1) + p = "int"; + break; default: break; }; diff --git a/sys/sys/file.h b/sys/sys/file.h --- a/sys/sys/file.h +++ b/sys/sys/file.h @@ -72,6 +72,7 @@ #define DTYPE_EVENTFD 13 /* eventfd */ #define DTYPE_TIMERFD 14 /* 
timerfd */ #define DTYPE_INOTIFY 15 /* inotify descriptor */ +#define DTYPE_JAILDESC 16 /* jail descriptor */ #ifdef _KERNEL diff --git a/sys/sys/jail.h b/sys/sys/jail.h --- a/sys/sys/jail.h +++ b/sys/sys/jail.h @@ -99,8 +99,12 @@ #define JAIL_UPDATE 0x02 /* Update parameters of existing jail */ #define JAIL_ATTACH 0x04 /* Attach to jail upon creation */ #define JAIL_DYING 0x08 /* Allow getting a dying jail */ -#define JAIL_SET_MASK 0x0f /* JAIL_DYING is deprecated/ignored here */ -#define JAIL_GET_MASK 0x08 +#define JAIL_USE_DESC 0x10 /* Get/set jail in descriptor */ +#define JAIL_AT_DESC 0x20 /* Find/add jail under descriptor */ +#define JAIL_GET_DESC 0x40 /* Return a new jail descriptor */ +#define JAIL_OWN_DESC 0x80 /* Return a new owning jail descriptor */ +#define JAIL_SET_MASK 0xff /* JAIL_DYING is deprecated/ignored here */ +#define JAIL_GET_MASK 0xf8 #define JAIL_SYS_DISABLE 0 #define JAIL_SYS_NEW 1 @@ -115,7 +119,9 @@ int jail_set(struct iovec *, unsigned int, int); int jail_get(struct iovec *, unsigned int, int); int jail_attach(int); +int jail_attach_jd(int); int jail_remove(int); +int jail_remove_jd(int); __END_DECLS #else /* _KERNEL */ @@ -144,6 +150,7 @@ #define JAIL_META_PRIVATE "meta" #define JAIL_META_SHARED "env" +struct jaildesc; struct knlist; struct racct; struct prison_racct; @@ -191,7 +198,8 @@ struct prison_ip *pr_addrs[PR_FAMILY_MAX]; /* (p,n) IPs of jail */ struct prison_racct *pr_prison_racct; /* (c) racct jail proxy */ struct knlist *pr_klist; /* (m) attached knotes */ - void *pr_sparep[2]; + LIST_HEAD(, jaildesc) pr_descs; /* (a) attached descriptors */ + void *pr_sparep; int pr_childcount; /* (a) number of child jails */ int pr_childmax; /* (p) maximum child jails */ unsigned pr_allow; /* (p) PR_ALLOW_* flags */ @@ -466,6 +474,7 @@ void prison_proc_link(struct prison *, struct proc *); void prison_proc_unlink(struct prison *, struct proc *); void prison_proc_iterate(struct prison *, void (*)(struct proc *, void *), void *); +void 
prison_remove(struct prison *); void prison_set_allow(struct ucred *cred, unsigned flag, int enable); bool prison_ischild(struct prison *, struct prison *); bool prison_isalive(const struct prison *); diff --git a/sys/sys/jaildesc.h b/sys/sys/jaildesc.h new file mode 100644 --- /dev/null +++ b/sys/sys/jaildesc.h @@ -0,0 +1,85 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2025 James Gritton. + * All rights reserved. + * + * This software was developed at the University of Cambridge Computer + * Laboratory with support from a grant from Google, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#ifndef _SYS_JAILDESC_H_ +#define _SYS_JAILDESC_H_ + +#ifdef _KERNEL + +#include +#include +#include +#include + +struct prison; + +/*- + * struct jaildesc describes a jail descriptor, which points to a struct + * prison. struct prison in turn has a linked list of struct jaildesc. + * + * Locking key: + * (c) set on creation, remains unchanged + * (d) jd_lock + * (p) jd_prison->pr_mtx + */ +struct jaildesc { + LIST_ENTRY(jaildesc) jd_list; /* (d,p) this prison's descs */ + struct prison *jd_prison; /* (d) the prison */ + struct mtx jd_lock; + uid_t jd_uid; /* (d) nominal file owner */ + gid_t jd_gid; /* (d) nominal file group */ + mode_t jd_mode; /* (d) descriptor permissions */ + unsigned jd_flags; /* (d) JDF_* flags */ +}; + +/* + * Locking macros for the jaildesc. + */ +#define JAILDESC_LOCK_DESTROY(jd) mtx_destroy(&(jd)->jd_lock) +#define JAILDESC_LOCK_INIT(jd) mtx_init(&(jd)->jd_lock, "jaildesc", \ + NULL, MTX_DEF) +#define JAILDESC_LOCK(jd) mtx_lock(&(jd)->jd_lock) +#define JAILDESC_UNLOCK(jd) mtx_unlock(&(jd)->jd_lock) + +/* + * Flags for the jd_flags field + */ +#define JDF_REMOVED 0x00000002 /* jail was removed */ + +int jaildesc_find(struct thread *td, int fd, struct jaildesc **jdp, + struct prison **prp, struct ucred **ucredp); +int jaildesc_alloc(struct thread *td, struct file **fpp, int *fdp, int owning); +void jaildesc_set_prison(struct file *jd, struct prison *pr); +void jaildesc_prison_cleanup(struct prison *pr); + +#endif /* _KERNEL */ + +#endif /* !_SYS_JAILDESC_H_ */ diff --git a/sys/sys/syscall.h b/sys/sys/syscall.h --- a/sys/sys/syscall.h +++ b/sys/sys/syscall.h @@ -535,4 +535,6 @@ #define SYS_inotify_rm_watch 594 #define SYS_getgroups 595 #define SYS_setgroups 596 -#define SYS_MAXSYSCALL 597 +#define SYS_jail_attach_jd 597 +#define SYS_jail_remove_jd 598 +#define SYS_MAXSYSCALL 599 diff --git a/sys/sys/syscall.mk b/sys/sys/syscall.mk --- a/sys/sys/syscall.mk +++ b/sys/sys/syscall.mk @@ -438,4 +438,6 @@ inotify_add_watch_at.o \ 
inotify_rm_watch.o \ getgroups.o \ - setgroups.o + setgroups.o \ + jail_attach_jd.o \ + jail_remove_jd.o diff --git a/sys/sys/sysproto.h b/sys/sys/sysproto.h --- a/sys/sys/sysproto.h +++ b/sys/sys/sysproto.h @@ -1901,6 +1901,12 @@ char gidsetsize_l_[PADL_(int)]; int gidsetsize; char gidsetsize_r_[PADR_(int)]; char gidset_l_[PADL_(const gid_t *)]; const gid_t * gidset; char gidset_r_[PADR_(const gid_t *)]; }; +struct jail_attach_jd_args { + char fd_l_[PADL_(int)]; int fd; char fd_r_[PADR_(int)]; +}; +struct jail_remove_jd_args { + char fd_l_[PADL_(int)]; int fd; char fd_r_[PADR_(int)]; +}; int sys__exit(struct thread *, struct _exit_args *); int sys_fork(struct thread *, struct fork_args *); int sys_read(struct thread *, struct read_args *); @@ -2305,6 +2311,8 @@ int sys_inotify_rm_watch(struct thread *, struct inotify_rm_watch_args *); int sys_getgroups(struct thread *, struct getgroups_args *); int sys_setgroups(struct thread *, struct setgroups_args *); +int sys_jail_attach_jd(struct thread *, struct jail_attach_jd_args *); +int sys_jail_remove_jd(struct thread *, struct jail_remove_jd_args *); #ifdef COMPAT_43 @@ -3301,6 +3309,8 @@ #define SYS_AUE_inotify_rm_watch AUE_INOTIFY #define SYS_AUE_getgroups AUE_GETGROUPS #define SYS_AUE_setgroups AUE_SETGROUPS +#define SYS_AUE_jail_attach_jd AUE_JAIL_ATTACH +#define SYS_AUE_jail_remove_jd AUE_JAIL_REMOVE #undef PAD_ #undef PADL_ diff --git a/sys/sys/user.h b/sys/sys/user.h --- a/sys/sys/user.h +++ b/sys/sys/user.h @@ -266,6 +266,7 @@ #define KF_TYPE_EVENTFD 13 #define KF_TYPE_TIMERFD 14 #define KF_TYPE_INOTIFY 15 +#define KF_TYPE_JAILDESC 16 #define KF_TYPE_UNKNOWN 255 #define KF_VTYPE_VNON 0 @@ -452,6 +453,9 @@ uint32_t kf_timerfd_flags; uint64_t kf_timerfd_addr; } kf_timerfd; + struct { + int32_t kf_jid; + } kf_jail; struct { uint64_t kf_kqueue_addr; int32_t kf_kqueue_count;