Index: kern/kern_jail.c =================================================================== --- kern/kern_jail.c +++ kern/kern_jail.c @@ -138,7 +138,7 @@ int lastprid = 0; static int get_next_prid(struct prison **insprp); -static int do_jail_attach(struct thread *td, struct prison *pr); +static int do_jail_attach(struct thread *td, struct prison *pr, int drflags); static void prison_complete(void *context, int pending); static void prison_deref(struct prison *pr, int flags); static int prison_lock_xlock(struct prison *pr, int flags); @@ -1014,6 +1014,8 @@ if (inspr->pr_id > jid) break; pr = inspr; + KASSERT(prison_isvalid(pr), + ("Found invalid prison %p", pr)); mtx_lock(&pr->pr_mtx); drflags |= PD_LOCKED; inspr = NULL; @@ -1032,7 +1034,7 @@ jid); goto done_deref; } - if (!prison_isvalid(pr) || !prison_ischild(mypr, pr)) { + if (!prison_ischild(mypr, pr)) { /* * Updaters get ENOENT if they cannot see the * jail. This is true even for CREATE | UPDATE, @@ -1123,7 +1125,6 @@ if (namelc[0] != '\0') { pnamelen = (ppr == &prison0) ? 0 : strlen(ppr->pr_name) + 1; - name_again: deadpr = NULL; FOREACH_PRISON_CHILD(ppr, tpr) { if (tpr != pr && @@ -1152,16 +1153,17 @@ goto done_deref; } if (pr == NULL && - cuflags != JAIL_CREATE && - prison_isvalid(tpr)) + cuflags != JAIL_CREATE) { deadpr = tpr; + KASSERT(prison_isvalid(deadpr), + ("Found invalid prison %p", + deadpr)); + } } } /* If no active jail is found, use a dying one. */ if (deadpr != NULL && pr == NULL) { if (flags & JAIL_DYING) { - if (!prison_isvalid(deadpr)) - goto name_again; pr = deadpr; mtx_lock(&pr->pr_mtx); drflags |= PD_LOCKED; @@ -1202,8 +1204,7 @@ /* This brings the parent back to life. */ mtx_lock(&ppr->pr_mtx); refcount_acquire(&ppr->pr_uref); - if (ppr->pr_state == PRISON_STATE_DYING) - ppr->pr_state = PRISON_STATE_ALIVE; + ppr->pr_state = PRISON_STATE_ALIVE; mtx_unlock(&ppr->pr_mtx); error = osd_jail_call(ppr, PR_METHOD_CREATE, opts); if (error) { @@ -1814,8 +1815,6 @@ #endif /* Let the modules do their work. */ - sx_downgrade(&allprison_lock); - drflags = (drflags & ~PD_LIST_XLOCKED) | PD_LIST_SLOCKED; if (born) { error = osd_jail_call(pr, PR_METHOD_CREATE, opts); if (error) @@ -1828,11 +1827,18 @@ goto done_deref; } + /* New prisons are now ready to be seen. */ + if (created) { + mtx_lock(&pr->pr_mtx); + drflags |= PD_LOCKED; + pr->pr_state = refcount_load(&pr->pr_uref) > 0 + ? PRISON_STATE_ALIVE : PRISON_STATE_DYING; + } + /* Attach this process to the prison if requested. */ if (flags & JAIL_ATTACH) { - mtx_lock(&pr->pr_mtx); - error = do_jail_attach(td, pr); - drflags &= ~PD_LIST_SLOCKED; + error = do_jail_attach(td, pr, prison_lock_xlock(pr, drflags)); + drflags &= ~(PD_LOCKED | PD_LIST_XLOCKED); if (error) { vfs_opterror(opts, "attach failed"); if (born) @@ -1843,27 +1849,20 @@ #ifdef RACCT if (racct_enable && !created) { - if (drflags & PD_LIST_SLOCKED) { - sx_sunlock(&allprison_lock); - drflags &= ~PD_LIST_SLOCKED; + if (drflags & PD_LIST_XLOCKED) { + sx_xunlock(&allprison_lock); + drflags &= ~PD_LIST_XLOCKED; } prison_racct_modify(pr); } #endif - if (created) { - /* The new prison is now ready to be seen. */ - drflags = prison_lock_xlock(pr, drflags); - pr->pr_state = refcount_load(&pr->pr_uref) > 0 - ? PRISON_STATE_ALIVE : PRISON_STATE_DYING; - } - td->td_retval[0] = pr->pr_id; goto done_deref; done_remove: /* - * prison_deref will call the remove methods when alive prisons die; + * prison_deref will call the remove method when alive prisons die; * otherwise it needs to be called now. */ if (!(drflags & (PD_LIST_SLOCKED | PD_LIST_XLOCKED))) { @@ -2038,9 +2037,11 @@ error = vfs_copyopt(opts, "lastjid", &jid, sizeof(jid)); if (error == 0) { TAILQ_FOREACH(pr, &allprison, pr_list) { - if (pr->pr_id > jid && ((flags & JAIL_DYING) - ? prison_isvalid(pr) : prison_isalive(pr)) && + if (pr->pr_id > jid && + ((flags & JAIL_DYING) || prison_isalive(pr)) && prison_ischild(mypr, pr)) { + KASSERT(prison_isvalid(pr), + ("Found invalid prison %p", pr)); mtx_lock(&pr->pr_mtx); drflags |= PD_LOCKED; goto found_prison; @@ -2301,7 +2302,7 @@ int sys_jail_remove(struct thread *td, struct jail_remove_args *uap) { - struct prison *pr, *cpr, *lpr, *tpr; + struct prison *pr, *cpr, *lpr; int descend, error; error = priv_check(td, PRIV_JAIL_REMOVE); @@ -2321,17 +2322,15 @@ mtx_unlock(&pr->pr_mtx); lpr = NULL; FOREACH_PRISON_DESCENDANT(pr, cpr, descend) { - if (prison_isvalid(cpr)) { - tpr = cpr; - prison_hold(tpr); - } else - tpr = NULL; + KASSERT(prison_isvalid(cpr), + ("Found invalid prison %p", cpr)); + prison_hold(cpr); if (lpr != NULL) { mtx_lock(&lpr->pr_mtx); prison_remove_one(lpr); sx_xlock(&allprison_lock); } - lpr = tpr; + lpr = cpr; } if (lpr != NULL) { mtx_lock(&lpr->pr_mtx); @@ -2419,16 +2418,19 @@ return (EINVAL); } - return (do_jail_attach(td, pr)); + return (do_jail_attach(td, pr, PD_LOCKED | PD_LIST_SLOCKED)); } static int -do_jail_attach(struct thread *td, struct prison *pr) +do_jail_attach(struct thread *td, struct prison *pr, int drflags) { struct proc *p; struct ucred *newcred, *oldcred; - int drflags, error; + int error; + mtx_assert(&pr->pr_mtx, MA_OWNED); + sx_assert(&allprison_lock, SX_LOCKED); + KASSERT(prison_isvalid(pr), ("Attaching to invalid prison %p", pr)); /* * XXX: Note that there is a slight race here if two threads * in the same privileged process attempt to attach to two @@ -2437,18 +2439,15 @@ * a process root from one prison, but attached to the jail * of another. */ - drflags = PD_DEREF | PD_DEUREF | PD_LOCKED | PD_LIST_SLOCKED; - if (!prison_isalive(pr)) { + prison_hold(pr); + if (refcount_acquire(&pr->pr_uref) == 0) { /* - * This will be the first reference, so allprison_lock - * needs an exclusive hold. + * A new prison is now alive (it was likely just set to dying). + * Don't call modules, because that has already been done. */ - drflags = prison_lock_xlock(pr, drflags); - } - prison_hold(pr); - if (refcount_acquire(&pr->pr_uref) == 0 && - pr->pr_state == PRISON_STATE_DYING) pr->pr_state = PRISON_STATE_ALIVE; + } + drflags |= PD_DEREF | PD_DEUREF; mtx_unlock(&pr->pr_mtx); drflags &= ~PD_LOCKED; @@ -2520,19 +2519,13 @@ sx_assert(&allprison_lock, SX_LOCKED); TAILQ_FOREACH(pr, &allprison, pr_list) { - if (pr->pr_id == prid) { - if (prison_isvalid(pr)) { - mtx_lock(&pr->pr_mtx); - return (pr); - } - /* - * Any active prison with the same ID would have - * been inserted before a dead one. - */ - break; - } + if (pr->pr_id < prid) + continue; if (pr->pr_id > prid) break; + KASSERT(prison_isvalid(pr), ("Found invalid prison %p", pr)); + mtx_lock(&pr->pr_mtx); + return (pr); } return (NULL); } @@ -2549,10 +2542,10 @@ sx_assert(&allprison_lock, SX_LOCKED); FOREACH_PRISON_DESCENDANT(mypr, pr, descend) { if (pr->pr_id == prid) { - if (prison_isvalid(pr)) { - mtx_lock(&pr->pr_mtx); - return (pr); - } + KASSERT(prison_isvalid(pr), + ("Found invalid prison %p", pr)); + mtx_lock(&pr->pr_mtx); + return (pr); } } return (NULL); @@ -2572,7 +2565,9 @@ mylen = (mypr == &prison0) ? 0 : strlen(mypr->pr_name) + 1; deadpr = NULL; FOREACH_PRISON_DESCENDANT(mypr, pr, descend) { - if (prison_isvalid(pr) && !strcmp(pr->pr_name + mylen, name)) { + if (!strcmp(pr->pr_name + mylen, name)) { + KASSERT(prison_isvalid(pr), + ("Found invalid prison %p", pr)); if (prison_isalive(pr)) { mtx_lock(&pr->pr_mtx); return (pr); @@ -2774,8 +2769,14 @@ static void prison_deref(struct prison *pr, int flags) { - struct prison *ppr, *tpr; + struct prisonlist freeprison; + struct prison *ppr, *rpr, *tpr; + TAILQ_INIT(&freeprison); + /* + * Release this prison as requested, which may cause its parent to be + * released, and then maybe its grandparent, etc. + */ for (;;) { if (flags & PD_DEUREF) { /* Drop a user reference. */ @@ -2783,8 +2784,10 @@ ("prison_deref PD_DEUREF on a dead prison (jid=%d)", pr->pr_id)); if (!refcount_release_if_not_last(&pr->pr_uref)) { - /* This is the last user reference. */ - prison_hold(pr); + if (!(flags & PD_DEREF)) { + prison_hold(pr); + flags |= PD_DEREF; + } flags = prison_lock_xlock(pr, flags); if (refcount_release(&pr->pr_uref) && pr->pr_state == PRISON_STATE_ALIVE) { @@ -2793,18 +2796,15 @@ * this becomes a dying prison (unless * it was one already). */ + KASSERT( + refcount_load(&prison0.pr_uref) > 0, + ("prison0 pr_uref=0")); pr->pr_state = PRISON_STATE_DYING; mtx_unlock(&pr->pr_mtx); flags &= ~PD_LOCKED; (void)osd_jail_call(pr, PR_METHOD_REMOVE, NULL); } - KASSERT(refcount_load(&prison0.pr_uref) > 0, - ("prison0 pr_uref=0")); - if (!(flags & PD_DEREF)) - flags |= PD_DEREF; - else - prison_free_not_last(pr); } } if (flags & PD_DEREF) { @@ -2812,58 +2812,81 @@ KASSERT(refcount_load(&pr->pr_ref) > 0, ("prison_deref PD_DEREF on a dead prison (jid=%d)", pr->pr_id)); - if (refcount_release_if_not_last(&pr->pr_ref)) - break; - /* This is the last reference. */ - flags = prison_lock_xlock(pr, flags); - if (!refcount_release(&pr->pr_ref)) - break; - } else - break; + if (!refcount_release_if_not_last(&pr->pr_ref)) { + flags = prison_lock_xlock(pr, flags); + if (refcount_release(&pr->pr_ref)) { + /* + * When the last reference goes, + * prepare to remove the prison. + */ + KASSERT( + refcount_load(&pr->pr_uref) == 0, + ("prison_deref: last ref, " + "but still has %d urefs (jid=%d)", + pr->pr_uref, pr->pr_id)); + KASSERT( + refcount_load(&prison0.pr_ref) != 0, + ("prison0 pr_ref=0")); + pr->pr_state = PRISON_STATE_INVALID; + TAILQ_REMOVE(&allprison, pr, pr_list); + LIST_REMOVE(pr, pr_sibling); + TAILQ_INSERT_TAIL(&freeprison, pr, + pr_list); + for (ppr = pr->pr_parent; + ppr != NULL; + ppr = ppr->pr_parent) + ppr->pr_childcount--; + /* + * Removing a prison frees references + * from its parent. + */ + mtx_unlock(&pr->pr_mtx); + flags &= ~PD_LOCKED; + pr = pr->pr_parent; + flags |= PD_DEREF | PD_DEUREF; + continue; + } + } + } + break; + } - /* Do the work of actually freeing the prison. */ - pr->pr_state = PRISON_STATE_INVALID; - TAILQ_REMOVE(&allprison, pr, pr_list); - LIST_REMOVE(pr, pr_sibling); - ppr = pr->pr_parent; - for (tpr = ppr; tpr != NULL; tpr = tpr->pr_parent) - tpr->pr_childcount--; + /* Release all the prison locks. */ + if (flags & PD_LOCKED) + mtx_unlock(&pr->pr_mtx); + if (flags & PD_LIST_SLOCKED) + sx_sunlock(&allprison_lock); + else if (flags & PD_LIST_XLOCKED) sx_xunlock(&allprison_lock); + /* + * Finish removing any unreferenced prisons, which couldn't happen + * while allprison_lock was held (to avoid a LOR on vrele). + */ + TAILQ_FOREACH_SAFE(rpr, &freeprison, pr_list, tpr) { #ifdef VIMAGE - if (pr->pr_vnet != ppr->pr_vnet) - vnet_destroy(pr->pr_vnet); + if (rpr->pr_vnet != rpr->pr_parent->pr_vnet) + vnet_destroy(rpr->pr_vnet); #endif - if (pr->pr_root != NULL) - vrele(pr->pr_root); - mtx_destroy(&pr->pr_mtx); + if (rpr->pr_root != NULL) + vrele(rpr->pr_root); + mtx_destroy(&rpr->pr_mtx); #ifdef INET - free(pr->pr_ip4, M_PRISON); + free(rpr->pr_ip4, M_PRISON); #endif #ifdef INET6 - free(pr->pr_ip6, M_PRISON); + free(rpr->pr_ip6, M_PRISON); #endif - if (pr->pr_cpuset != NULL) - cpuset_rel(pr->pr_cpuset); - osd_jail_exit(pr); + if (rpr->pr_cpuset != NULL) + cpuset_rel(rpr->pr_cpuset); + osd_jail_exit(rpr); #ifdef RACCT if (racct_enable) - prison_racct_detach(pr); + prison_racct_detach(rpr); #endif - free(pr, M_PRISON); - - /* Removing a prison frees a reference on its parent. */ - pr = ppr; - flags = PD_DEREF | PD_DEUREF; + TAILQ_REMOVE(&freeprison, rpr, pr_list); + free(rpr, M_PRISON); } - - /* Nothing more to do: release locks and return. */ - if (flags & PD_LOCKED) - mtx_unlock(&pr->pr_mtx); - if (flags & PD_LIST_SLOCKED) - sx_sunlock(&allprison_lock); - else if (flags & PD_LIST_XLOCKED) - sx_xunlock(&allprison_lock); } /* @@ -3078,7 +3101,10 @@ /* * Return true if the prison is currently valid, i.e. it has been fully - * created. Note that dying prisons are still considered valid. + * been fully created, and is not being destroyed. Note that dying prisons + * are still considered valid. Invalid prisons won't be found under normal + * circumstances, as they're only put in that state by functions that have + * an exclusive hold on allprison_locki. */ bool prison_isvalid(struct prison *pr) @@ -3718,8 +3744,7 @@ #if defined(INET) || defined(INET6) again: #endif - if (!prison_isvalid(cpr)) - continue; + KASSERT(prison_isvalid(cpr), ("Found invalid prison %p", cpr)); mtx_lock(&cpr->pr_mtx); #ifdef INET if (cpr->pr_ip4s > 0) {