diff --git a/sys/kern/kern_fork.c b/sys/kern/kern_fork.c --- a/sys/kern/kern_fork.c +++ b/sys/kern/kern_fork.c @@ -368,6 +368,7 @@ struct filedesc_to_leader *fdtol; struct pwddesc *pd; struct sigacts *newsigacts; + enum prison_state pr_state; p1 = td->td_proc; @@ -710,6 +711,19 @@ */ knote_fork(p1->p_klist, p2->p_pid); + /* + * See if the containing prison died while the process was still new. + */ + prison_lock(p2->p_ucred->cr_prison); + pr_state = p2->p_ucred->cr_prison->pr_state; + prison_unlock(p2->p_ucred->cr_prison); + if (pr_state == PRISON_STATE_DYING) { + /* Follow the prison into death. */ + PROC_LOCK(p2); + kern_psignal(p2, SIGKILL); + PROC_UNLOCK(p2); + } + /* * Now can be swapped. */ diff --git a/sys/kern/kern_jail.c b/sys/kern/kern_jail.c --- a/sys/kern/kern_jail.c +++ b/sys/kern/kern_jail.c @@ -141,10 +141,10 @@ static int do_jail_attach(struct thread *td, struct prison *pr); static void prison_complete(void *context, int pending); static void prison_deref(struct prison *pr, int flags); +static int prison_deref_kill(struct prison *pr, struct prisonlist *freeprison); static void prison_set_allow_locked(struct prison *pr, unsigned flag, int enable); static char *prison_path(struct prison *pr1, struct prison *pr2); -static void prison_remove_one(struct prison *pr); #ifdef RACCT static void prison_racct_attach(struct prison *pr); static void prison_racct_modify(struct prison *pr); @@ -154,9 +154,10 @@ /* Flags for prison_deref */ #define PD_DEREF 0x01 /* Decrement pr_ref */ #define PD_DEUREF 0x02 /* Decrement pr_uref */ -#define PD_LOCKED 0x04 /* pr_mtx is held */ -#define PD_LIST_SLOCKED 0x08 /* allprison_lock is held shared */ -#define PD_LIST_XLOCKED 0x10 /* allprison_lock is held exclusive */ +#define PD_KILL 0x04 /* Remove jail, kill processes, etc */ +#define PD_LOCKED 0x08 /* pr_mtx is held */ +#define PD_LIST_SLOCKED 0x10 /* allprison_lock is held shared */ +#define PD_LIST_XLOCKED 0x20 /* allprison_lock is held exclusive */ /* * Parameter names 
corresponding to PR_* flag values. Size values are for kvm @@ -527,10 +528,10 @@ #endif unsigned long hid; size_t namelen, onamelen, pnamelen; - int born, created, cuflags, descend, drflags, enforce; + int created, cuflags, descend, drflags, enforce; int error, errmsg_len, errmsg_pos; int gotchildmax, gotenforce, gothid, gotrsnum, gotslevel; - int jid, jsys, len, level; + int jid, jsys, len, level, tjid; int childmax, osreldt, rsnum, slevel; #if defined(INET) || defined(INET6) int ii, ij; @@ -541,9 +542,8 @@ #ifdef INET6 int ip6s, redo_ip6; #endif - uint64_t pr_allow, ch_allow, pr_flags, ch_flags; + uint64_t pr_allow, ch_allow, pr_flags, ch_flags, tallow; uint64_t pr_allow_diff; - unsigned tallow; char numbuf[12]; error = priv_check(td, PRIV_JAIL_SET); @@ -551,9 +551,6 @@ error = priv_check(td, PRIV_JAIL_ATTACH); if (error) return (error); - mypr = td->td_ucred->cr_prison; - if ((flags & JAIL_CREATE) && mypr->pr_childmax == 0) - return (EPERM); if (flags & ~JAIL_SET_MASK) return (EINVAL); @@ -662,12 +659,6 @@ } ch_flags |= jsf->new | jsf->disable; } - if ((flags & (JAIL_CREATE | JAIL_UPDATE | JAIL_ATTACH)) == JAIL_CREATE - && !(pr_flags & PR_PERSIST)) { - error = EINVAL; - vfs_opterror(opts, "new jail must persist or attach"); - goto done_errmsg; - } #ifdef VIMAGE if ((flags & JAIL_UPDATE) && (ch_flags & PR_VNET)) { error = EINVAL; @@ -985,9 +976,10 @@ * Find the specified jail, or at least its parent. * This abuses the file error codes ENOENT and EEXIST. */ + ppr = mypr = td->td_ucred->cr_prison; pr = NULL; - ppr = mypr; inspr = NULL; + deadpr = NULL; if (cuflags == JAIL_CREATE && jid == 0 && name != NULL) { namelc = strrchr(name, '.'); jid = strtoul(namelc != NULL ? namelc + 1 : name, &p, 10); @@ -1010,8 +1002,20 @@ if (inspr->pr_id == jid) { mtx_lock(&inspr->pr_mtx); if (inspr->pr_ref > 0) { - pr = inspr; - drflags |= PD_LOCKED; + if (inspr->pr_state != + PRISON_STATE_DYING) { + /* The jail exists. 
*/ + pr = inspr; + ppr = pr->pr_parent; + drflags |= PD_LOCKED; + } else { + /* + * Note a dying jail to handle + * later. + */ + deadpr = inspr; + mtx_unlock(&deadpr->pr_mtx); + } inspr = NULL; } else mtx_unlock(&inspr->pr_mtx); @@ -1020,57 +1024,30 @@ if (inspr->pr_id > jid) break; } - if (pr != NULL) { - ppr = pr->pr_parent; - if (cuflags == JAIL_CREATE) { - /* - * Even creators that cannot see the jail will - * get EEXIST. - */ - error = EEXIST; - vfs_opterror(opts, "jail %d already exists", - jid); - goto done_deref; - } - if (pr->pr_state == PRISON_STATE_INVALID || - !prison_ischild(mypr, pr)) { - /* - * Updaters get ENOENT if they cannot see the - * jail, or if it is not yet fully created. - * This is true even for CREATE | UPDATE, - * which normally cannot give this error. - */ - error = ENOENT; - vfs_opterror(opts, "jail %d not found", jid); - goto done_deref; - } else if (pr->pr_state == PRISON_STATE_DYING) { - if (!(flags & JAIL_DYING)) { - error = ENOENT; - vfs_opterror(opts, "jail %d is dying", - jid); - goto done_deref; - } else if ((flags & JAIL_ATTACH) || - (pr_flags & PR_PERSIST)) { - /* - * A dying jail might be resurrected - * (via attach or persist), but first - * it must determine if another jail - * has claimed its name. Accomplish - * this by implicitly re-setting the - * name. - */ - if (name == NULL) - name = prison_name(mypr, pr); - } - } - } else { - /* Update: jid must exist. */ - if (cuflags == JAIL_UPDATE) { - error = ENOENT; - vfs_opterror(opts, "jail %d not found", jid); - goto done_deref; - } + + if (cuflags == JAIL_CREATE && pr != NULL) { + /* + * Creators get EEXIST if the jail already exists, + * even if they cannot see it. + */ + error = EEXIST; + vfs_opterror(opts, "jail %d already exists", jid); + goto done_deref; } + if ((pr == NULL) + ? 
cuflags == JAIL_UPDATE + : pr->pr_state == PRISON_STATE_INVALID || + !prison_ischild(mypr, pr)) { + /* + * Updaters get ENOENT for nonexistent jails, + * or if the jail exists but they cannot see it. + * The latter case is true even for CREATE | UPDATE, + * which normally cannot give this error. + */ + error = ENOENT; + vfs_opterror(opts, "jail %d not found", jid); + goto done_deref; + } } /* * If the caller provided a name, look for a jail by that name. @@ -1101,7 +1078,10 @@ } else { *namelc = '\0'; ppr = prison_find_name(mypr, name); - if (ppr == NULL) { + if (ppr == NULL || + ppr->pr_state != PRISON_STATE_ALIVE) { + if (ppr != NULL) + mtx_unlock(&ppr->pr_mtx); error = ENOENT; vfs_opterror(opts, "jail \"%s\" not found", name); @@ -1115,67 +1095,40 @@ if (namelc[0] != '\0') { pnamelen = (ppr == &prison0) ? 0 : strlen(ppr->pr_name) + 1; - name_again: - deadpr = NULL; FOREACH_PRISON_CHILD(ppr, tpr) { - if (tpr != pr && tpr->pr_ref > 0 && - tpr->pr_state != PRISON_STATE_INVALID && - !strcmp(tpr->pr_name + pnamelen, namelc)) { - if (pr == NULL && - cuflags != JAIL_CREATE) { - mtx_lock(&tpr->pr_mtx); - if (tpr->pr_ref > 0) { - if (tpr->pr_state == - PRISON_STATE_ALIVE) - { - /* - * Use this jail - * for updates. - */ - pr = tpr; - drflags |= - PD_LOCKED; - break; - } - if (tpr->pr_state == - PRISON_STATE_DYING) - deadpr = tpr; - } - mtx_unlock(&tpr->pr_mtx); - } else if (tpr->pr_state == - PRISON_STATE_ALIVE) { - /* - * Create, or update(jid): - * name must not exist in an - * active sibling jail. - */ - error = EEXIST; - vfs_opterror(opts, - "jail \"%s\" already exists", - name); - goto done_deref; - } + if (tpr == pr || + strcmp(tpr->pr_name + pnamelen, namelc)) + continue; + mtx_lock(&tpr->pr_mtx); + if (tpr->pr_ref == 0 || + tpr->pr_state == PRISON_STATE_DYING) { + mtx_unlock(&tpr->pr_mtx); + continue; } - } - /* If no active jail is found, use a dying one. 
*/ - if (deadpr != NULL && pr == NULL) { - if (flags & JAIL_DYING) { - mtx_lock(&deadpr->pr_mtx); - if (deadpr->pr_ref == 0) { - mtx_unlock(&deadpr->pr_mtx); - goto name_again; - } - pr = deadpr; - drflags |= PD_LOCKED; - } else if (cuflags == JAIL_UPDATE) { - error = ENOENT; + if (cuflags == JAIL_CREATE || pr != NULL) { + /* + * Create, or update(jid): name must + * not exist in an active sibling jail. + */ + error = EEXIST; + mtx_unlock(&tpr->pr_mtx); vfs_opterror(opts, - "jail \"%s\" is dying", name); + "jail \"%s\" already exists", name); goto done_deref; } + /* Use this jail for updates. */ + pr = tpr; + drflags |= PD_LOCKED; + break; } - /* Update: name must exist if no jid. */ - else if (cuflags == JAIL_UPDATE && pr == NULL) { + /* + * Update: name must exist if no jid. As with the jid + * case, the jail must be currently visible, or else + * even CREATE | UPDATE will get an error. + */ + if ((pr == NULL) + ? cuflags == JAIL_UPDATE + : pr->pr_state == PRISON_STATE_INVALID) { error = ENOENT; vfs_opterror(opts, "jail \"%s\" not found", name); @@ -1200,7 +1153,7 @@ goto done_deref; } mtx_lock(&ppr->pr_mtx); - if (ppr->pr_ref == 0 || ppr->pr_state == PRISON_STATE_INVALID) { + if (ppr->pr_ref == 0 || ppr->pr_state != PRISON_STATE_ALIVE) { mtx_unlock(&ppr->pr_mtx); error = ENOENT; vfs_opterror(opts, "jail \"%s\" not found", @@ -1209,15 +1162,23 @@ } ppr->pr_ref++; ppr->pr_uref++; - ppr->pr_state = PRISON_STATE_ALIVE; mtx_unlock(&ppr->pr_mtx); - if (jid == 0 && (jid = get_next_prid(&inspr)) == 0) { - error = EAGAIN; - vfs_opterror(opts, "no available jail IDs"); - pr = ppr; - drflags |= PD_DEREF | PD_DEUREF; - goto done_deref; + /* + * If no jid was explicitly given, or if a dying jail is being + * replaced, find free ID. 
+ */ + if (jid > 0 && deadpr == NULL) + tjid = jid; + else { + tjid = get_next_prid(&inspr); + if (tjid == 0) { + error = EAGAIN; + vfs_opterror(opts, "no available jail IDs"); + pr = ppr; + drflags |= PD_DEREF | PD_DEUREF; + goto done_deref; + } } /* @@ -1227,20 +1188,42 @@ pr = malloc(sizeof(*pr), M_PRISON, M_WAITOK | M_ZERO); pr->pr_state = PRISON_STATE_INVALID; pr->pr_ref = 1; - drflags |= PD_DEREF; + pr->pr_uref = 1; + drflags |= PD_DEREF | PD_DEUREF; LIST_INIT(&pr->pr_children); mtx_init(&pr->pr_mtx, "jail mutex", NULL, MTX_DEF | MTX_DUPOK); TASK_INIT(&pr->pr_task, 0, prison_complete, pr); + if (deadpr == NULL) { + pr->pr_id = jid = tjid; + tpr = pr; + } else { + pr->pr_id = jid; + /* + * The prison being created has the same ID as a dying + * one. Handle this by swapping the new prison with + * the dying one, and then re-adding the dying jail + * with the new ID. This may cause some confusion to + * user space, but only to those listing dying jails. + */ + TAILQ_INSERT_BEFORE(deadpr, pr, pr_list); + TAILQ_REMOVE(&allprison, deadpr, pr_list); + if (inspr == deadpr) + inspr = pr; + mtx_lock(&deadpr->pr_mtx); + deadpr->pr_id = tjid; + mtx_unlock(&deadpr->pr_mtx); + tpr = deadpr; + } + /* * Link the prison into the allprison list in ID order, * and into its parent's child list in no particular order. */ - pr->pr_id = jid; if (inspr != NULL) - TAILQ_INSERT_BEFORE(inspr, pr, pr_list); + TAILQ_INSERT_BEFORE(inspr, tpr, pr_list); else - TAILQ_INSERT_TAIL(&allprison, pr, pr_list); + TAILQ_INSERT_TAIL(&allprison, tpr, pr_list); pr->pr_parent = ppr; LIST_INSERT_HEAD(&ppr->pr_children, pr, pr_sibling); @@ -1331,7 +1314,8 @@ * continue to exist for the duration of the call. 
*/ pr->pr_ref++; - drflags |= PD_DEREF; + pr->pr_uref++; + drflags |= PD_DEREF | PD_DEUREF; #if defined(VIMAGE) && (defined(INET) || defined(INET6)) if ((pr->pr_flags & PR_VNET) && (ch_flags & (PR_IP4_USER | PR_IP6_USER))) { @@ -1449,7 +1433,7 @@ #ifdef VIMAGE (tpr != tppr && (tpr->pr_flags & PR_VNET)) || #endif - tpr->pr_state == PRISON_STATE_DYIG) { + tpr->pr_state == PRISON_STATE_DYING) { descend = 0; continue; } @@ -1760,6 +1744,12 @@ pr->pr_flags = (pr->pr_flags & ~ch_flags) | pr_flags; mtx_unlock(&pr->pr_mtx); drflags &= ~PD_LOCKED; + /* + * Any errors past this point will need to de-persist newly created + * prisons, as well as call OSD remove. + */ + if (created) + drflags |= PD_KILL; #ifdef RACCT if (racct_enable && created) @@ -1819,18 +1809,14 @@ /* Let the modules do their work. */ sx_downgrade(&allprison_lock); drflags = (drflags & ~PD_LIST_XLOCKED) | PD_LIST_SLOCKED; - born = pr->pr_state != PRISON_STATE_ALIVE; - if (born) { + if (created) { error = osd_jail_call(pr, PR_METHOD_CREATE, opts); if (error) - goto done_remove; + goto done_deref; } error = osd_jail_call(pr, PR_METHOD_SET, opts); - if (error) { - if (born) - goto done_remove; + if (error) goto done_deref; - } /* Attach this process to the prison if requested. */ if (flags & JAIL_ATTACH) { @@ -1839,8 +1825,6 @@ drflags &= ~PD_LIST_SLOCKED; if (error) { vfs_opterror(opts, "attach failed"); - if (born) - goto done_remove; goto done_deref; } } @@ -1856,30 +1840,42 @@ #endif /* - * Now that everything is done, the prison is usually alive, though - * it might have been either new (invalid) or dying before. + * Now that everything is done, a newly created prison should be alive, + * either from persistence, attaching, or perhaps a module parameter. */ - mtx_lock(&pr->pr_mtx); - drflags |= PD_LOCKED; - if (pr->pr_uref > 0) + if (created) { + mtx_lock(&ppr->pr_mtx); + if (ppr->pr_state != PRISON_STATE_ALIVE) { + /* + * The parent prison died while this one was being + * created. 
+ */ + error = ENOENT; + vfs_opterror(opts, "jail \"%s\" not found", + prison_name(mypr, ppr)); + mtx_unlock(&ppr->pr_mtx); + goto done_deref; + } + mtx_lock(&pr->pr_mtx); + mtx_unlock(&ppr->pr_mtx); + drflags |= PD_LOCKED; + /* + * We are holding one temporary reference, so there must be + * more than that for the prison to continue to exist. That + * usually comes from persistence or attaching, though + * modules may also add a reference. + */ + if (pr->pr_uref <= 1) { + error = EINVAL; + vfs_opterror(opts, "new jail must persist or attach"); + goto done_deref; + } pr->pr_state = PRISON_STATE_ALIVE; + drflags &= ~PD_KILL; + } td->td_retval[0] = pr->pr_id; - goto done_deref; - done_remove: - /* - * Remove the persist flag from new (or resurrected) prisons, - * and call OSD remove methods. - */ - (void)osd_jail_call(pr, PR_METHOD_REMOVE, NULL); - mtx_lock(&pr->pr_mtx); - drflags |= PD_LOCKED; - if (pr->pr_flags & PR_PERSIST) { - pr->pr_flags &= ~PR_PERSIST; - pr->pr_ref--; - drflags |= PD_DEUREF; - } done_deref: /* Release any temporary prison holds and/or locks. */ if (pr != NULL) @@ -2115,7 +2111,8 @@ found_prison: /* Get the parameters of the prison. */ pr->pr_ref++; - drflags |= PD_DEREF; + pr->pr_uref++; + drflags |= PD_DEREF | PD_DEUREF; td->td_retval[0] = pr->pr_id; error = vfs_setopt(opts, "jid", &pr->pr_id, sizeof(pr->pr_id)); if (error != 0 && error != ENOENT) @@ -2312,8 +2309,8 @@ int sys_jail_remove(struct thread *td, struct jail_remove_args *uap) { - struct prison *pr, *cpr, *lpr, *tpr; - int descend, error; + struct prison *pr; + int error; error = priv_check(td, PRIV_JAIL_REMOVE); if (error) @@ -2325,86 +2322,16 @@ sx_xunlock(&allprison_lock); return (EINVAL); } - - /* Remove all descendants of this prison, then remove this prison. */ - pr->pr_ref++; - if (!LIST_EMPTY(&pr->pr_children)) { + if (pr->pr_state == PRISON_STATE_DYING) { + /* Silently ignore already-dying prisons. 
*/ mtx_unlock(&pr->pr_mtx); - lpr = NULL; - FOREACH_PRISON_DESCENDANT(pr, cpr, descend) { - mtx_lock(&cpr->pr_mtx); - if (cpr->pr_ref > 0 && - cpr->pr_state != PRISON_STATE_INVALID) { - tpr = cpr; - cpr->pr_ref++; - } else { - /* Already removed - do not do it again. */ - tpr = NULL; - } - mtx_unlock(&cpr->pr_mtx); - if (lpr != NULL) { - mtx_lock(&lpr->pr_mtx); - prison_remove_one(lpr); - sx_xlock(&allprison_lock); - } - lpr = tpr; - } - if (lpr != NULL) { - mtx_lock(&lpr->pr_mtx); - prison_remove_one(lpr); - sx_xlock(&allprison_lock); - } - mtx_lock(&pr->pr_mtx); + sx_xunlock(&allprison_lock); + return (0); } - prison_remove_one(pr); + prison_deref(pr, PD_KILL | PD_LOCKED | PD_LIST_XLOCKED); return (0); } -static void -prison_remove_one(struct prison *pr) -{ - struct proc *p; - int drflags; - - drflags = PD_DEREF | PD_LOCKED | PD_LIST_XLOCKED; - - /* If the prison was persistent, it is not anymore. */ - if (pr->pr_flags & PR_PERSIST) { - pr->pr_ref--; - drflags |= PD_DEUREF; - pr->pr_flags &= ~PR_PERSIST; - } - - /* - * jail_remove added a reference. If that's the only one, remove - * the prison now. - */ - KASSERT(pr->pr_ref > 0, - ("prison_remove_one removing a dead prison (jid=%d)", pr->pr_id)); - if (pr->pr_ref == 1) { - prison_deref(pr, drflags); - return; - } - - mtx_unlock(&pr->pr_mtx); - sx_xunlock(&allprison_lock); - drflags &= ~(PD_LOCKED | PD_LIST_XLOCKED); - /* - * Kill all processes unfortunate enough to be attached to this prison. - */ - sx_slock(&allproc_lock); - FOREACH_PROC_IN_SYSTEM(p) { - PROC_LOCK(p); - if (p->p_state != PRS_NEW && p->p_ucred && - p->p_ucred->cr_prison == pr) - kern_psignal(p, SIGKILL); - PROC_UNLOCK(p); - } - sx_sunlock(&allproc_lock); - /* Remove the temporary reference added by jail_remove. 
*/ - prison_deref(pr, drflags); -} - /* * struct jail_attach_args { * int jid; @@ -2420,14 +2347,7 @@ if (error) return (error); - /* - * Start with exclusive hold on allprison_lock to ensure that a possible - * PR_METHOD_REMOVE call isn't concurrent with jail_set or jail_remove. - * But then immediately downgrade it since we don't need to stop - * readers. - */ - sx_xlock(&allprison_lock); - sx_downgrade(&allprison_lock); + sx_slock(&allprison_lock); pr = prison_find_child(td->td_ucred->cr_prison, uap->jid); if (pr == NULL) { sx_sunlock(&allprison_lock); @@ -2450,6 +2370,7 @@ struct proc *p; struct ucred *newcred, *oldcred; int error; + enum prison_state state; /* * XXX: Note that there is a slight race here if two threads @@ -2507,6 +2428,21 @@ #endif prison_deref(oldcred->cr_prison, PD_DEREF | PD_DEUREF); crfree(oldcred); + + /* + * See if the target prison died between unlocking the prison + * and changing the credentials. + */ + mtx_lock(&pr->pr_mtx); + state = pr->pr_state; + mtx_unlock(&pr->pr_mtx); + if (state == PRISON_STATE_DYING) { + /* Follow the prison into death. */ + PROC_LOCK(p); + kern_psignal(p, SIGKILL); + PROC_UNLOCK(p); + } + return (0); e_unlock: @@ -2637,6 +2573,8 @@ mtx_assert(&pr->pr_mtx, MA_OWNED); ref = --pr->pr_ref; + KASSERT(ref > 0 || pr->pr_uref == 0, + ("prison_free: last ref, but still has %d urefs", pr->pr_uref)); mtx_unlock(&pr->pr_mtx); if (ref == 0) taskqueue_enqueue(taskqueue_thread, &pr->pr_task); @@ -2666,88 +2604,188 @@ } /* - * Remove a prison reference (usually). This internal version assumes no - * mutexes are held, except perhaps the prison itself. If there are no more - * references, release and delist the prison. On completion, the prison lock - * and the allprison lock are both unlocked. + * Remove a prison reference (usually), which may or may not end up removing + * the prison itself, or putting into a "dying" state while it cleans itself + * up. 
Optionally forcibly remove a prison and its descendents, including + * killing all associated processes. + * + * This can be called in a variety of situations, with the prison mutex or + * allprison_lock optionally held. It returns with all locks released. */ static void prison_deref(struct prison *pr, int flags) { - struct prison *ppr, *tpr; - int ref, lasturef; + struct prisonlist freeprison; + struct prison *killpr, *tpr; + struct proc *p; - if (!(flags & PD_LOCKED)) + /* + * Acquire necessary locks: at least the prison mutex, and perhaps an + * exclusive hold on allprison_lock. + */ + for (;;) { + if (!(flags & PD_LIST_XLOCKED) && + (pr->pr_ref == ((flags & PD_DEREF) ? 1 : 0) || + (pr->pr_uref == ((flags & PD_DEUREF) ? 1 : 0) && + (pr->pr_state != PRISON_STATE_DYING)) || + (flags & PD_KILL))) { + /* + * Grab allprison_lock. This may require unlocking + * the prison mutex and/or attempting an upgrade. + */ + if (flags & PD_LOCKED) + mtx_unlock(&pr->pr_mtx); + if (flags & PD_LIST_SLOCKED) { + if (!sx_try_upgrade(&allprison_lock)) { + sx_sunlock(&allprison_lock); + sx_xlock(&allprison_lock); + } + flags &= ~PD_LIST_SLOCKED; + } else + sx_xlock(&allprison_lock); + mtx_lock(&pr->pr_mtx); + flags |= PD_LOCKED | PD_LIST_XLOCKED; + break; + } + if (flags & PD_LOCKED) { + /* All the necessary locks are held. */ + break; + } + /* Lock the mutex, and the check again for allprison_lock. */ mtx_lock(&pr->pr_mtx); + flags |= PD_LOCKED; + } + + killpr = NULL; + TAILQ_INIT(&freeprison); + /* + * Release this prison as requested, which may cause its parent to be + * released, and then maybe its grandparent, etc. + */ for (;;) { + mtx_assert(&pr->pr_mtx, MA_OWNED); + if (flags & PD_DEREF) { + /* + * Drop a reference, which may make the prison go away. 
+ */ + KASSERT(pr->pr_ref > 0, + ("prison_deref PD_DEREF on a dead prison (jid=%d)", + pr->pr_id)); + pr->pr_ref--; + KASSERT(prison0.pr_ref != 0, ("prison0 pr_ref=0")); + flags &= ~PD_DEREF; + } if (flags & PD_DEUREF) { + /* + * Drop a user reference, which may make the prison die. + */ KASSERT(pr->pr_uref > 0, ("prison_deref PD_DEUREF on a dead prison (jid=%d)", pr->pr_id)); pr->pr_uref--; KASSERT(prison0.pr_uref != 0, ("prison0 pr_uref=0")); + flags &= ~PD_DEUREF; } - lasturef = pr->pr_uref == 0 && - pr->pr_state == PRISON_STATE_ALIVE; - if (lasturef) { - pr->pr_ref++; - pr->pr_state = PRISON_STATE_DYING; + if (flags & PD_KILL) { + /* + * Kill the prison and its descendents, regardless + * of current references. But don't kill a prison + * that is already dying. + */ + if (pr->pr_state != PRISON_STATE_DYING) { + /* + * Any user references remaining are probably + * attached processes, which are handled later. + */ + if (prison_deref_kill(pr, &freeprison) > 0) + killpr = pr; + flags |= PD_DEUREF; + } + flags &= ~PD_KILL; } - if (flags & PD_DEREF) { - KASSERT(pr->pr_ref > (lasturef ? 1 : 0), - ("prison_deref PD_DEREF on a dead prison (jid=%d)", - pr->pr_id)); + if (pr->pr_uref == 0 && pr->pr_state != PRISON_STATE_DYING) { + /* + * Note that the prison has died, at least from a user + * perspective. + */ + sx_assert(&allprison_lock, SA_XLOCKED); + pr->pr_state = PRISON_STATE_DYING; + pr->pr_ref++; + pr->pr_uref++; + mtx_unlock(&pr->pr_mtx); + (void)osd_jail_call(pr, PR_METHOD_REMOVE, NULL); + mtx_lock(&pr->pr_mtx); pr->pr_ref--; + pr->pr_uref--; + for (tpr = pr->pr_parent; + tpr != NULL; + tpr = tpr->pr_parent) + tpr->pr_childcount--; + flags |= PD_DEUREF; } - ref = pr->pr_ref; - mtx_unlock(&pr->pr_mtx); + if (pr->pr_ref == 0) { + /* + * The prison is completely unreferenced, so prepare + * to remove it. 
+ */ + KASSERT(pr->pr_uref == 0, + ("prison_deref: " + "last ref, but still has %d urefs (jid=%d)", + pr->pr_uref, pr->pr_id)); + TAILQ_REMOVE(&allprison, pr, pr_list); + TAILQ_INSERT_TAIL(&freeprison, pr, pr_list); + LIST_REMOVE(pr, pr_sibling); + flags |= PD_DEREF; + } + if (flags & (PD_DEUREF | PD_DEREF)) { + /* + * A prison was marked as dying or removed, which + * means its parent now drops a reference. + */ + mtx_unlock(&pr->pr_mtx); + pr = pr->pr_parent; + mtx_lock(&pr->pr_mtx); + continue; + } + break; + } + /* Release all the prison locks. */ + mtx_unlock(&pr->pr_mtx); + if (flags & PD_LIST_SLOCKED) + sx_sunlock(&allprison_lock); + else if (flags & PD_LIST_XLOCKED) + sx_xunlock(&allprison_lock); + + if (killpr != NULL) { /* - * Tell the modules if the last user reference was removed - * (even it sticks around in dying state). + * Kill all processes unfortunate enough to be attached + * to the killed prison. */ - if (lasturef) { - if (!(flags & (PD_LIST_SLOCKED | PD_LIST_XLOCKED))) { - if (ref > 1) { - sx_slock(&allprison_lock); - flags |= PD_LIST_SLOCKED; - } else { - sx_xlock(&allprison_lock); - flags |= PD_LIST_XLOCKED; - } + sx_slock(&allproc_lock); + FOREACH_PROC_IN_SYSTEM(p) { + PROC_LOCK(p); + if (p->p_state != PRS_NEW && p->p_ucred != NULL) { + for (tpr = p->p_ucred->cr_prison; + tpr != &prison0; + tpr = tpr->pr_parent) + if (tpr == killpr) { + kern_psignal(p, SIGKILL); + break; + } } - (void)osd_jail_call(pr, PR_METHOD_REMOVE, NULL); - mtx_lock(&pr->pr_mtx); - ref = --pr->pr_ref; - mtx_unlock(&pr->pr_mtx); + PROC_UNLOCK(p); } + sx_sunlock(&allproc_lock); + } - /* If the prison still has references, nothing else to do. 
*/ - if (ref > 0) { - if (flags & PD_LIST_SLOCKED) - sx_sunlock(&allprison_lock); - else if (flags & PD_LIST_XLOCKED) - sx_xunlock(&allprison_lock); - return; - } - - if (flags & PD_LIST_SLOCKED) { - if (!sx_try_upgrade(&allprison_lock)) { - sx_sunlock(&allprison_lock); - sx_xlock(&allprison_lock); - } - } else if (!(flags & PD_LIST_XLOCKED)) - sx_xlock(&allprison_lock); - - TAILQ_REMOVE(&allprison, pr, pr_list); - LIST_REMOVE(pr, pr_sibling); - ppr = pr->pr_parent; - for (tpr = ppr; tpr != NULL; tpr = tpr->pr_parent) - tpr->pr_childcount--; - sx_xunlock(&allprison_lock); - + TAILQ_FOREACH_SAFE(pr, &freeprison, pr_list, tpr) { + /* + * Finish removing unreferenced prisons, which couldn't happen + * while allprison_lock was held (to avoid a LOR on vrele). + */ #ifdef VIMAGE - if (pr->pr_vnet != ppr->pr_vnet) + if (pr->pr_vnet != pr->pr_parent->pr_vnet) vnet_destroy(pr->pr_vnet); #endif if (pr->pr_root != NULL) @@ -2766,13 +2804,109 @@ if (racct_enable) prison_racct_detach(pr); #endif + TAILQ_REMOVE(&freeprison, pr, pr_list); free(pr, M_PRISON); + } +} - /* Removing a prison frees a reference on its parent. */ - pr = ppr; - mtx_lock(&pr->pr_mtx); - flags = PD_DEREF | PD_DEUREF | PD_LOCKED; +/* + * Kill the prison and its descendants. Mark them as dying, clear any persist + * flag, and call any module remove methods. Return the total of all prisons' + * remaining user references. + */ +static int +prison_deref_kill(struct prison *pr, struct prisonlist *freeprison) +{ + struct prison *cpr, *tpr; + int descend, killed, urefs; + + /* + * The operation on the prison and each descendant is similar to what + * prison_deref() does when losing the last user or system reference, + * plus extra work to clear PR_PERSIST. 
+ */ + sx_assert(&allprison_lock, SA_XLOCKED); + killed = 1; + urefs = 0; + pr->pr_state = PRISON_STATE_DYING; + pr->pr_flags |= PR_REMOVE; + pr->pr_ref++; + pr->pr_uref++; + mtx_unlock(&pr->pr_mtx); + (void)osd_jail_call(pr, PR_METHOD_REMOVE, NULL); + + FOREACH_PRISON_DESCENDANT_PRE_POST(pr, cpr, descend) { + if (descend) { + mtx_lock(&cpr->pr_mtx); + if (cpr->pr_state != PRISON_STATE_ALIVE) + descend = 0; + else { + killed++; + cpr->pr_state = PRISON_STATE_DYING; + cpr->pr_flags |= PR_REMOVE; + cpr->pr_ref++; + cpr->pr_uref++; + cpr->pr_parent->pr_uref--; + } + mtx_unlock(&cpr->pr_mtx); + (void)osd_jail_call(pr, PR_METHOD_REMOVE, NULL); + } else { + mtx_lock(&cpr->pr_mtx); + if (cpr->pr_flags & PR_REMOVE) { + cpr->pr_flags &= ~PR_REMOVE; + if (cpr->pr_flags & PR_PERSIST) { + cpr->pr_flags &= ~PR_PERSIST; + cpr->pr_ref--; + cpr->pr_uref--; + } + cpr->pr_childcount = 0; + KASSERT(cpr->pr_ref > 0 && cpr->pr_uref > 0, + ("prison_deref_kill: " + "temporary refs gone (jid=%d)", + cpr->pr_id)); + cpr->pr_ref--; + cpr->pr_uref--; + urefs += cpr->pr_uref; + } + if (cpr->pr_ref == 0) { + KASSERT(cpr->pr_uref == 0, + ("prison_deref_kill: lasr ref, " + "but still has %d urefs (jid=%d)", + cpr->pr_uref, cpr->pr_id)); + TAILQ_REMOVE(&allprison, cpr, pr_list); + TAILQ_INSERT_TAIL(freeprison, cpr, pr_list); + cpr->pr_parent->pr_ref--; + } + mtx_unlock(&cpr->pr_mtx); + } } + + mtx_lock(&pr->pr_mtx); + pr->pr_flags &= ~PR_REMOVE; + if (pr->pr_flags & PR_PERSIST) { + pr->pr_flags &= ~PR_PERSIST; + pr->pr_ref--; + pr->pr_uref--; + } + pr->pr_childcount = 0; + KASSERT(pr->pr_ref > 0 && pr->pr_uref > 0, + ("prison_deref_kill: temporary refs gone (jid=%d)", + pr->pr_id)); + pr->pr_ref--; + pr->pr_uref--; + urefs += pr->pr_uref; + for (tpr = pr->pr_parent; + tpr != NULL; + tpr = tpr->pr_parent) + tpr->pr_childcount -= killed; + + /* + * Disconnect unreferenced descendant prisons from their parents, + * which couldn't easily be done mid-loop. 
+ */ + TAILQ_FOREACH(cpr, freeprison, pr_list) + LIST_REMOVE(cpr, pr_sibling); + return urefs; } void @@ -3619,6 +3753,10 @@ again: #endif mtx_lock(&cpr->pr_mtx); + if (cpr->pr_ref == 0 || cpr->pr_state == PRISON_STATE_INVALID) { + mtx_unlock(&cpr->pr_mtx); + continue; + } #ifdef INET if (cpr->pr_ip4s > 0) { if (ip4s < cpr->pr_ip4s) { @@ -3645,10 +3783,6 @@ cpr->pr_ip6s * sizeof(struct in6_addr)); } #endif - if (cpr->pr_ref == 0 || cpr->pr_state == PRISON_STATE_INVALID) { - mtx_unlock(&cpr->pr_mtx); - continue; - } bzero(xp, sizeof(*xp)); xp->pr_version = XPRISON_VERSION; xp->pr_id = cpr->pr_id; diff --git a/sys/sys/jail.h b/sys/sys/jail.h --- a/sys/sys/jail.h +++ b/sys/sys/jail.h @@ -101,7 +101,7 @@ #define JAIL_UPDATE 0x02 /* Update parameters of existing jail */ #define JAIL_ATTACH 0x04 /* Attach to jail upon creation */ #define JAIL_DYING 0x08 /* Allow getting a dying jail */ -#define JAIL_SET_MASK 0x0f +#define JAIL_SET_MASK 0x0f /* JAIL_DYING is deprecated/ignored here */ #define JAIL_GET_MASK 0x08 #define JAIL_SYS_DISABLE 0 @@ -180,7 +180,7 @@ struct prison_racct *pr_prison_racct; /* (c) racct jail proxy */ void *pr_sparep[3]; int pr_childcount; /* (a) number of child jails */ - int pr_childmax; /* (p) maximum child jails */ + int pr_childmax; /* (a) maximum child jails */ unsigned pr_allow; /* (p) PR_ALLOW_* flags */ int pr_securelevel; /* (p) securelevel */ int pr_enforce_statfs; /* (p) statfs permission */ @@ -218,6 +218,7 @@ /* primary jail address. */ /* Internal flag bits */ +#define PR_REMOVE 0x01000000 /* In process of being removed */ #define PR_IP4 0x02000000 /* IPv4 restricted or disabled */ /* by this jail or an ancestor */ #define PR_IP6 0x04000000 /* IPv6 restricted or disabled */ @@ -334,6 +335,19 @@ if ((descend) ? (prison_lock(cpr), 0) : 1) \ ; \ else + +/* + * As FOREACH_PRISON_DESCENDANT, but visit both preorder and postorder. 
+ */ +#define FOREACH_PRISON_DESCENDANT_PRE_POST(ppr, cpr, descend) \ + for ((cpr) = (ppr), (descend) = 1; \ + ((cpr) = (descend) \ + ? ((descend) = !LIST_EMPTY(&(cpr)->pr_children)) \ + ? LIST_FIRST(&(cpr)->pr_children) \ + : (cpr) \ + : ((descend) = LIST_NEXT(cpr, pr_sibling) != NULL) \ + ? LIST_NEXT(cpr, pr_sibling) \ + : cpr->pr_parent) != (ppr);) /* * Attributes of the physical system, and the root of the jail tree. diff --git a/usr.sbin/jail/jail.8 b/usr.sbin/jail/jail.8 --- a/usr.sbin/jail/jail.8 +++ b/usr.sbin/jail/jail.8 @@ -25,7 +25,7 @@ .\" .\" $FreeBSD$ .\" -.Dd November 18, 2020 +.Dd January 11, 2021 .Dt JAIL 8 .Os .Sh NAME @@ -136,10 +136,6 @@ .Pp Other available options are: .Bl -tag -width indent -.It Fl d -Allow making changes to a dying jail, equivalent to the -.Va allow.dying -parameter. .It Fl f Ar conf_file Use configuration file .Ar conf_file @@ -207,6 +203,10 @@ .It Fl v Print a message on every operation, such as running commands and mounting filesystems. +.It Fl d +This is deprecated, and equivalent to the (deprecated) +.Va allow.dying +parameter. .El .Pp If no arguments are given after the options, the operation (except @@ -903,9 +903,14 @@ .Pa /proc directory. .It Va allow.dying -Allow making changes to a +This is deprecated and has no effect. +It used to allow making changes to a .Va dying jail. +Now such jails are always replaced when a new jail is created with the same +.Va jid +or +.Va name . .It Va depend Specify a jail (or jails) that this jail depends on. When this jail is to be created, any jail(s) it depends on must already exist. 
diff --git a/usr.sbin/jail/jail.c b/usr.sbin/jail/jail.c --- a/usr.sbin/jail/jail.c +++ b/usr.sbin/jail/jail.c @@ -65,7 +65,7 @@ static void clear_persist(struct cfjail *j); static int update_jail(struct cfjail *j); static int rdtun_params(struct cfjail *j, int dofail); -static void running_jid(struct cfjail *j, int dflag); +static void running_jid(struct cfjail *j); static void jail_quoted_warnx(const struct cfjail *j, const char *name_msg, const char *noname_msg); static int jailparam_set_note(const struct cfjail *j, struct jailparam *jp, @@ -140,7 +140,7 @@ char *JidFile; size_t sysvallen; unsigned op, pi; - int ch, docf, error, i, oldcl, sysval; + int ch, docf, error, i, oldcl, sysval, dying_warned; int dflag, Rflag; #if defined(INET) || defined(INET6) char *cs, *ncs; @@ -377,6 +377,7 @@ * operation on it. When that is done, the jail may be finished, * or it may go back for the next step. */ + dying_warned = 0; while ((j = next_jail())) { if (j->flags & JF_FAILED) { @@ -397,11 +398,13 @@ import_params(j) < 0) continue; } + if (j->intparams[IP_ALLOW_DYING] && !dying_warned) { + warnx("%s", "the 'allow.dying' parameter and '-d' flag " + "are deprecated and have no effect."); + dying_warned = 1; + } if (!j->jid) - running_jid(j, - (j->flags & (JF_SET | JF_DEPEND)) == JF_SET - ? 
dflag || bool_param(j->intparams[IP_ALLOW_DYING]) - : 0); + running_jid(j); if (finish_command(j)) continue; @@ -613,11 +616,10 @@ int create_jail(struct cfjail *j) { - struct iovec jiov[4]; struct stat st; - struct jailparam *jp, *setparams, *setparams2, *sjp; + struct jailparam *jp, *setparams, *sjp; const char *path; - int dopersist, ns, jid, dying, didfail; + int dopersist, ns; /* * Check the jail's path, with a better error message than jail_set @@ -657,57 +659,8 @@ *sjp++ = *jp; ns = sjp - setparams; - didfail = 0; j->jid = jailparam_set_note(j, setparams, ns, JAIL_CREATE); - if (j->jid < 0 && errno == EEXIST && - bool_param(j->intparams[IP_ALLOW_DYING]) && - int_param(j->intparams[KP_JID], &jid) && jid != 0) { - /* - * The jail already exists, but may be dying. - * Make sure it is, in which case an update is appropriate. - */ - jiov[0].iov_base = __DECONST(char *, "jid"); - jiov[0].iov_len = sizeof("jid"); - jiov[1].iov_base = &jid; - jiov[1].iov_len = sizeof(jid); - jiov[2].iov_base = __DECONST(char *, "dying"); - jiov[2].iov_len = sizeof("dying"); - jiov[3].iov_base = &dying; - jiov[3].iov_len = sizeof(dying); - if (jail_get(jiov, 4, JAIL_DYING) < 0) { - /* - * It could be that the jail just barely finished - * dying, or it could be that the jid never existed - * but the name does. In either case, another try - * at creating the jail should do the right thing. - */ - if (errno == ENOENT) - j->jid = jailparam_set_note(j, setparams, ns, - JAIL_CREATE); - } else if (dying) { - j->jid = jid; - if (rdtun_params(j, 1) < 0) { - j->jid = -1; - didfail = 1; - } else { - sjp = setparams2 = alloca((j->njp + dopersist) * - sizeof(struct jailparam)); - for (jp = setparams; jp < setparams + ns; jp++) - if (!JP_RDTUN(jp) || - !strcmp(jp->jp_name, "jid")) - *sjp++ = *jp; - j->jid = jailparam_set_note(j, setparams2, - sjp - setparams2, JAIL_UPDATE | JAIL_DYING); - /* - * Again, perhaps the jail just finished dying. 
- */ - if (j->jid < 0 && errno == ENOENT) - j->jid = jailparam_set_note(j, - setparams, ns, JAIL_CREATE); - } - } - } - if (j->jid < 0 && !didfail) { + if (j->jid < 0) { jail_warnx(j, "%s", jail_errmsg); failed(j); } @@ -772,9 +725,7 @@ if (!JP_RDTUN(jp)) *++sjp = *jp; - jid = jailparam_set_note(j, setparams, ns, - bool_param(j->intparams[IP_ALLOW_DYING]) - ? JAIL_UPDATE | JAIL_DYING : JAIL_UPDATE); + jid = jailparam_set_note(j, setparams, ns, JAIL_UPDATE); if (jid < 0) { jail_warnx(j, "%s", jail_errmsg); failed(j); @@ -813,8 +764,7 @@ rtjp->jp_value = NULL; } rval = 0; - if (jailparam_get(rtparams, nrt, - bool_param(j->intparams[IP_ALLOW_DYING]) ? JAIL_DYING : 0) > 0) { + if (jailparam_get(rtparams, nrt, 0) > 0) { rtjp = rtparams + 1; for (jp = j->jp; rtjp < rtparams + nrt; jp++) { if (JP_RDTUN(jp) && strcmp(jp->jp_name, "jid")) { @@ -851,7 +801,7 @@ * Get the jail's jid if it is running. */ static void -running_jid(struct cfjail *j, int dflag) +running_jid(struct cfjail *j) { struct iovec jiov[2]; const char *pval; @@ -877,7 +827,7 @@ j->jid = -1; return; } - j->jid = jail_get(jiov, 2, dflag ? JAIL_DYING : 0); + j->jid = jail_get(jiov, 2, 0); } static void @@ -906,10 +856,9 @@ jid = jailparam_set(jp, njp, flags); if (verbose > 0) { - jail_note(j, "jail_set(%s%s)", + jail_note(j, "jail_set(%s)", (flags & (JAIL_CREATE | JAIL_UPDATE)) == JAIL_CREATE - ? "JAIL_CREATE" : "JAIL_UPDATE", - (flags & JAIL_DYING) ? " | JAIL_DYING" : ""); + ? "JAIL_CREATE" : "JAIL_UPDATE"); for (i = 0; i < njp; i++) { printf(" %s", jp[i].jp_name); if (jp[i].jp_value == NULL)