diff --git a/sys/kern/kern_event.c b/sys/kern/kern_event.c --- a/sys/kern/kern_event.c +++ b/sys/kern/kern_event.c @@ -376,6 +376,7 @@ [~EVFILT_SENDFILE] = { &null_filtops }, [~EVFILT_EMPTY] = { &file_filtops, 1 }, [~EVFILT_JAIL] = { &jail_filtops, 1 }, + [~EVFILT_JAILDESC] = { &file_filtops, 1 }, }; /* diff --git a/sys/kern/kern_jail.c b/sys/kern/kern_jail.c --- a/sys/kern/kern_jail.c +++ b/sys/kern/kern_jail.c @@ -171,7 +171,7 @@ static void prison_racct_modify(struct prison *pr); static void prison_racct_detach(struct prison *pr); #endif -static void prison_knote(struct prison *pr, long hint); +static void prison_knote(struct prison *pr, long hint, int64_t data); /* Flags for prison_deref */ #define PD_DEREF 0x01 /* Decrement pr_ref */ @@ -2245,9 +2245,8 @@ */ if (created) { sx_assert(&allprison_lock, SX_XLOCKED); - mtx_lock(&ppr->pr_mtx); - knote_fork(ppr->pr_klist, pr->pr_id); - mtx_unlock(&ppr->pr_mtx); + jaildesc_add_child(pr); + prison_knote(ppr, NOTE_JAIL_CHILD, pr->pr_id); mtx_lock(&pr->pr_mtx); drflags |= PD_LOCKED; pr->pr_state = PRISON_STATE_ALIVE; @@ -2313,7 +2312,7 @@ * the failure. */ if (maybe_changed && !created) - prison_knote(pr, NOTE_JAIL_SET); + prison_knote(pr, NOTE_JAIL_SET, 0); /* Release any temporary prison holds and/or locks. */ if (pr != NULL) prison_deref(pr, drflags); @@ -3114,7 +3113,7 @@ prison_proc_relink(oldcred->cr_prison, pr, p); prison_deref(oldcred->cr_prison, drflags); crfree(oldcred); - prison_knote(pr, NOTE_JAIL_ATTACH | td->td_proc->p_pid); + prison_knote(pr, NOTE_JAIL_ATTACH, td->td_proc->p_pid); /* * If the prison was killed while changing credentials, die along @@ -3782,7 +3781,7 @@ { sx_assert(&allprison_lock, SA_XLOCKED); mtx_assert(&pr->pr_mtx, MA_OWNED); - prison_knote(pr, NOTE_JAIL_REMOVE); + prison_knote(pr, NOTE_JAIL_REMOVE, 0); knlist_detach(pr->pr_klist); jaildesc_prison_cleanup(pr); pr->pr_klist = NULL; @@ -5419,14 +5418,18 @@ * Submit a knote for a prison, locking if necessary. 
*/ static void -prison_knote(struct prison *pr, long hint) +prison_knote(struct prison *pr, long hint, int64_t data) { int locked; locked = mtx_owned(&pr->pr_mtx); if (!locked) mtx_lock(&pr->pr_mtx); - KNOTE_LOCKED(pr->pr_klist, hint); + if (hint == NOTE_JAIL_CHILD) + knote_fork(pr->pr_klist, data); + else + KNOTE_LOCKED(pr->pr_klist, hint | data); + jaildesc_knote(pr, hint, data); if (!locked) mtx_unlock(&pr->pr_mtx); } diff --git a/sys/kern/kern_jaildesc.c b/sys/kern/kern_jaildesc.c --- a/sys/kern/kern_jaildesc.c +++ b/sys/kern/kern_jaildesc.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #include #include @@ -45,6 +46,9 @@ MALLOC_DEFINE(M_JAILDESC, "jaildesc", "jail descriptors"); +static fo_ioctl_t jaildesc_ioctl; +static fo_poll_t jaildesc_poll; +static fo_kqfilter_t jaildesc_kqfilter; static fo_stat_t jaildesc_stat; static fo_close_t jaildesc_close; static fo_chmod_t jaildesc_chmod; @@ -56,9 +60,9 @@ .fo_read = invfo_rdwr, .fo_write = invfo_rdwr, .fo_truncate = invfo_truncate, - .fo_ioctl = invfo_ioctl, - .fo_poll = invfo_poll, - .fo_kqfilter = invfo_kqfilter, + .fo_ioctl = jaildesc_ioctl, + .fo_poll = jaildesc_poll, + .fo_kqfilter = jaildesc_kqfilter, .fo_stat = jaildesc_stat, .fo_close = jaildesc_close, .fo_chmod = jaildesc_chmod, @@ -69,6 +73,8 @@ .fo_flags = DFLAG_PASSABLE, }; +static void jaildesc_clear_holding(struct jaildesc *jd); + /* * Given a jail descriptor number, return the jaildesc, its prison, * and its credential. The jaildesc will be returned locked, and @@ -140,6 +146,7 @@ finit(fp, priv_check_cred(fp->f_cred, PRIV_JAIL_SET) == 0 ? 
FREAD | FWRITE : FREAD, DTYPE_JAILDESC, jd, &jaildesc_ops); JAILDESC_LOCK_INIT(jd); + knlist_init_mtx(&jd->jd_selinfo.si_note, &jd->jd_lock); jd->jd_uid = fp->f_cred->cr_uid; jd->jd_gid = fp->f_cred->cr_gid; jd->jd_mode = S_IFREG | S_IRUSR | S_IRGRP | S_IROTH | mode @@ -157,6 +164,7 @@ { struct jaildesc *jd; + sx_assert(&allprison_lock, SA_LOCKED); mtx_assert(&pr->pr_mtx, MA_OWNED); jd = fp->f_data; JAILDESC_LOCK(jd); @@ -174,25 +182,177 @@ { struct jaildesc *jd; + sx_assert(&allprison_lock, SA_XLOCKED); mtx_assert(&pr->pr_mtx, MA_OWNED); while ((jd = LIST_FIRST(&pr->pr_descs))) { JAILDESC_LOCK(jd); LIST_REMOVE(jd, jd_list); jd->jd_prison = NULL; - JAILDESC_UNLOCK(jd); + if (jd->jd_held_by != NULL) { + /* + * There's no file descriptor to wait on, so + * remove the disassociated jaildesc immediately. + */ + LIST_REMOVE(jd, jd_nexthold); + knlist_destroy(&jd->jd_selinfo.si_note); + JAILDESC_LOCK_DESTROY(jd); + free(jd, M_JAILDESC); + } else + JAILDESC_UNLOCK(jd); prison_free(pr); } } +/* + * Pass a note to all listening kqueues. + */ +void +jaildesc_knote(struct prison *pr, long hint, int64_t data) +{ + struct jaildesc *jd; + int prison_locked; + + if (!LIST_EMPTY(&pr->pr_descs)) { + prison_locked = mtx_owned(&pr->pr_mtx); + if (!prison_locked) + prison_lock(pr); + LIST_FOREACH(jd, &pr->pr_descs, jd_list) { + JAILDESC_LOCK(jd); + if (hint == NOTE_JAIL_CHILD) + knote_fork(&jd->jd_selinfo.si_note, data); + else { + if (hint == NOTE_JAIL_REMOVE) { + jd->jd_flags |= JDF_REMOVED; + if (jd->jd_flags & JDF_SELECTED) { + jd->jd_flags &= ~JDF_SELECTED; + selwakeup(&jd->jd_selinfo); + } + } + KNOTE_LOCKED(&jd->jd_selinfo.si_note, + hint | data); + } + JAILDESC_UNLOCK(jd); + } + if (!prison_locked) + prison_unlock(pr); + } +} + +/* + * Create jaildescs for a new child, not yet tied to any file descriptor. 
+ */ +void +jaildesc_add_child(struct prison *pr) +{ + struct jaildesc *jd, *hjd; + bool prison_locked; + + sx_assert(&allprison_lock, SX_XLOCKED); + mtx_assert(&pr->pr_mtx, MA_NOTOWNED); + prison_locked = false; + LIST_FOREACH(jd, &pr->pr_parent->pr_descs, jd_list) { + /* Check for recursion before locking, and then again after. */ + if (!(jd->jd_flags & JDF_RECURSE)) + continue; + if (prison_locked) { + hjd = malloc(sizeof(*jd), M_JAILDESC, + M_NOWAIT | M_ZERO); + if (hjd == NULL) { + prison_unlock(pr); + hjd = malloc(sizeof(*jd), M_JAILDESC, + M_WAITOK | M_ZERO); + prison_lock(pr); + } + } else { + hjd = malloc(sizeof(*jd), M_JAILDESC, + M_WAITOK | M_ZERO); + prison_lock(pr); + prison_locked = true; + } + JAILDESC_LOCK(jd); + if (!(jd->jd_flags & JDF_RECURSE)) { + free(hjd, M_JAILDESC); + JAILDESC_UNLOCK(jd); + continue; + } + /* Partially initialize the holding jaildesc. */ + JAILDESC_LOCK_INIT(hjd); + knlist_init_mtx(&hjd->jd_selinfo.si_note, &hjd->jd_lock); + hjd->jd_prison = pr; + LIST_INSERT_HEAD(&pr->pr_descs, hjd, jd_list); + prison_hold(pr); + LIST_INSERT_HEAD(&jd->jd_holding, hjd, jd_nexthold); + hjd->jd_held_by = jd; + hjd->jd_flags |= JDF_RECURSE; + if (jd->jd_flags & JDF_SELECTED) { + jd->jd_flags &= ~JDF_SELECTED; + selwakeup(&jd->jd_selinfo); + } + JAILDESC_UNLOCK(jd); + } + if (prison_locked) + prison_unlock(pr); +} + +/* + * Recursively remove the contents of the jaildesc's holding list. + */ +static void +jaildesc_clear_holding(struct jaildesc *jd) +{ + struct jaildesc *hjd, *rjd; + struct prison *rpr; + bool descend; + + sx_assert(&allprison_lock, SA_XLOCKED); + rjd = NULL; + for (hjd = jd, descend = true; + (hjd = descend ? + (descend = !LIST_EMPTY(&hjd->jd_holding)) ? + LIST_FIRST(&hjd->jd_holding) : hjd : + (descend = LIST_NEXT(hjd, jd_nexthold) != NULL) ? 
+ LIST_NEXT(hjd, jd_nexthold) : + hjd->jd_held_by) != jd;) + if (!descend) { + if (rjd != NULL) { + rpr = rjd->jd_prison; + prison_lock(rpr); + LIST_REMOVE(rjd, jd_list); + prison_unlock(rpr); + prison_free(rpr); + LIST_REMOVE(rjd, jd_nexthold); + knlist_destroy(&rjd->jd_selinfo.si_note); + JAILDESC_LOCK_DESTROY(rjd); + free(rjd, M_JAILDESC); + } + rjd = hjd; + } + if (rjd != NULL) { + rpr = rjd->jd_prison; + prison_lock(rpr); + LIST_REMOVE(rjd, jd_list); + prison_unlock(rpr); + prison_free(rpr); + LIST_REMOVE(rjd, jd_nexthold); + knlist_destroy(&rjd->jd_selinfo.si_note); + JAILDESC_LOCK_DESTROY(rjd); + free(rjd, M_JAILDESC); + } +} + static int jaildesc_close(struct file *fp, struct thread *td) { struct jaildesc *jd; struct prison *pr; + bool allprison_locked; jd = fp->f_data; fp->f_data = NULL; if (jd != NULL) { + sx_xlock(&allprison_lock); + allprison_locked = true; + jaildesc_clear_holding(jd); JAILDESC_LOCK(jd); pr = jd->jd_prison; if (pr != NULL) { @@ -207,7 +367,6 @@ prison_hold(pr); JAILDESC_UNLOCK(jd); if (jd->jd_mode & S_ISTXT) { - sx_xlock(&allprison_lock); prison_lock(pr); if (jd->jd_prison != NULL) { /* @@ -217,10 +376,9 @@ */ LIST_REMOVE(jd, jd_list); prison_remove(pr); - } else { + allprison_locked = false; + } else prison_unlock(pr); - sx_xunlock(&allprison_lock); - } } else { prison_lock(pr); if (jd->jd_prison != NULL) { @@ -231,12 +389,197 @@ } prison_free(pr); } + if (allprison_locked) + sx_xunlock(&allprison_lock); + knlist_destroy(&jd->jd_selinfo.si_note); JAILDESC_LOCK_DESTROY(jd); free(jd, M_JAILDESC); } return (0); } +static int +jaildesc_ioctl(struct file *fp, u_long com, void *data, + struct ucred *active_cred, struct thread *td) +{ + struct file *hfp; + struct jaildesc *jd, *hjd; + int error, hfd; + + jd = fp->f_data; + error = 0; + switch (com) { + case JIORECURSE: + /* + * Create jaildescs for child jails created under this + * jail, and any under them, etc. 
+ */ + JAILDESC_LOCK(jd); + jd->jd_flags |= JDF_RECURSE; + JAILDESC_UNLOCK(jd); + break; + + case JIONORECURSE: + /* + * Stop tracking child jail creation, and remove any + * jaildescs that have been created but not fully opened. + */ + if (!(jd->jd_flags & JDF_RECURSE)) + break; + sx_xlock(&allprison_lock); + JAILDESC_LOCK(jd); + jd->jd_flags &= ~JDF_RECURSE; + JAILDESC_UNLOCK(jd); + jaildesc_clear_holding(jd); + sx_xunlock(&allprison_lock); + break; + + case JIOGETCHILD: + /* + * Allocate and return a file descriptor for a child + * jaildesc, if there is one. + */ + if (LIST_EMPTY(&jd->jd_holding)) { + error = jd->jd_flags & JDF_RECURSE ? EAGAIN : EINVAL; + break; + } + error = falloc_caps(td, &hfp, &hfd, 0, NULL); + if (error != 0) + break; + sx_xlock(&allprison_lock); + hjd = LIST_FIRST(&jd->jd_holding); + if (hjd == NULL) { + /* The jaildesc disappeared during falloc. */ + error = EAGAIN; + sx_xunlock(&allprison_lock); + fdclose(td, hfp, hfd); + fdrop(hfp, td); + break; + } + LIST_REMOVE(hjd, jd_nexthold); + JAILDESC_LOCK(hjd); + hjd->jd_held_by = NULL; + /* Finish initializing the jaildesc, now that there's a file. */ + hjd->jd_uid = hfp->f_cred->cr_uid; + hjd->jd_gid = hfp->f_cred->cr_gid; + hjd->jd_mode = S_IFREG | S_IRUSR | S_IRGRP | S_IROTH | + (priv_check(td, PRIV_JAIL_SET) == 0 ? S_IWUSR | S_IXUSR : + 0) | (priv_check(td, PRIV_JAIL_ATTACH) == 0 ? S_IXUSR : 0); + finit(hfp, priv_check_cred(hfp->f_cred, PRIV_JAIL_SET) == 0 ? 
+ FREAD | FWRITE : FREAD, DTYPE_JAILDESC, hjd, &jaildesc_ops); + if (!LIST_EMPTY(&hjd->jd_holding) && + (hjd->jd_flags & JDF_SELECTED)) { + hjd->jd_flags &= ~JDF_SELECTED; + selwakeup(&hjd->jd_selinfo); + } + JAILDESC_UNLOCK(hjd); + sx_xunlock(&allprison_lock); + fdrop(hfp, td); + *(int*)data = hfd; + break; + + default: + error = ENOTTY; + break; + } + return (error); +} + +static int +jaildesc_poll(struct file *fp, int events, struct ucred *active_cred, + struct thread *td) +{ + struct jaildesc *jd; + int revents; + + revents = 0; + jd = fp->f_data; + JAILDESC_LOCK(jd); + if (!LIST_EMPTY(&jd->jd_holding)) + revents |= events & (POLLIN | POLLRDNORM); + if (jd->jd_flags & JDF_REMOVED) + revents |= POLLHUP; + if (revents == 0) { + selrecord(td, &jd->jd_selinfo); + jd->jd_flags |= JDF_SELECTED; + } + JAILDESC_UNLOCK(jd); + return (revents); +} + +static void +jaildesc_kqops_detach(struct knote *kn) +{ + struct jaildesc *jd; + + jd = kn->kn_fp->f_data; + knlist_remove(&jd->jd_selinfo.si_note, kn, 0); +} + +static int +jaildesc_kqops_event(struct knote *kn, long hint) +{ + struct jaildesc *jd; + u_int event; + + jd = kn->kn_fp->f_data; + if (hint == 0) { + /* + * Initial test after registration. Generate a + * NOTE_JAIL_REMOVE in case the prison already died + * before registration. + */ + event = jd->jd_flags & JDF_REMOVED ? NOTE_JAIL_REMOVE : 0; + } else { + /* Mask off extra data. */ + event = (u_int)hint & NOTE_JAIL_CTRLMASK; + } + + /* If the user is interested in this event, record it. */ + if (kn->kn_sfflags & event) + kn->kn_fflags |= event; + + /* Report the attached process id. */ + if (event == NOTE_JAIL_ATTACH) { + if (kn->kn_data != 0) + kn->kn_fflags |= NOTE_JAIL_ATTACH_MULTI; + kn->kn_data = hint & NOTE_JAIL_DATAMASK; + } + + /* Prison is gone, so flag the event as finished. 
*/ + if (event == NOTE_JAIL_REMOVE) { + kn->kn_flags |= EV_EOF | EV_ONESHOT; + if (kn->kn_fflags == 0) + kn->kn_flags |= EV_DROP; + return (1); + } + + return (kn->kn_fflags != 0); +} + +static const struct filterops jaildesc_kqops = { + .f_isfd = 1, + .f_detach = jaildesc_kqops_detach, + .f_event = jaildesc_kqops_event, +}; + +static int +jaildesc_kqfilter(struct file *fp, struct knote *kn) +{ + struct jaildesc *jd; + + jd = fp->f_data; + switch (kn->kn_filter) { + case EVFILT_JAILDESC: + kn->kn_fop = &jaildesc_kqops; + kn->kn_flags |= EV_CLEAR; + knlist_add(&jd->jd_selinfo.si_note, kn, 0); + return (0); + default: + return (EINVAL); + } +} + static int jaildesc_stat(struct file *fp, struct stat *sb, struct ucred *active_cred) { diff --git a/sys/sys/event.h b/sys/sys/event.h --- a/sys/sys/event.h +++ b/sys/sys/event.h @@ -46,7 +46,8 @@ #define EVFILT_SENDFILE (-12) /* attached to sendfile requests */ #define EVFILT_EMPTY (-13) /* empty send socket buf */ #define EVFILT_JAIL (-14) /* attached to struct prison */ -#define EVFILT_SYSCOUNT 14 +#define EVFILT_JAILDESC (-15) /* attached to jail descriptors */ +#define EVFILT_SYSCOUNT 15 #if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L #define EV_SET(kevp_, a, b, c, d, e, f) do { \ @@ -205,7 +206,7 @@ #define NOTE_PCTRLMASK 0xf0000000 /* mask for hint bits */ #define NOTE_PDATAMASK 0x000fffff /* mask for pid */ -/* data/hint flags for EVFILT_JAIL */ +/* data/hint flags for EVFILT_JAIL and EVFILT_JAILDESC */ #define NOTE_JAIL_SET 0x80000000 /* jail was modified */ #define NOTE_JAIL_CHILD 0x40000000 /* child jail was created */ #define NOTE_JAIL_ATTACH 0x20000000 /* jail was attached to */ diff --git a/sys/sys/jail.h b/sys/sys/jail.h --- a/sys/sys/jail.h +++ b/sys/sys/jail.h @@ -198,7 +198,7 @@ struct prison_ip *pr_addrs[PR_FAMILY_MAX]; /* (p,n) IPs of jail */ struct prison_racct *pr_prison_racct; /* (c) racct jail proxy */ struct knlist *pr_klist; /* (m) attached knotes */ - LIST_HEAD(, jaildesc) pr_descs; /* 
(a) attached descriptors */ + LIST_HEAD(, jaildesc) pr_descs; /* (q) attached descriptors */ void *pr_sparep; int pr_childcount; /* (a) number of child jails */ int pr_childmax; /* (p) maximum child jails */ diff --git a/sys/sys/jaildesc.h b/sys/sys/jaildesc.h --- a/sys/sys/jaildesc.h +++ b/sys/sys/jaildesc.h @@ -34,7 +34,9 @@ #ifdef _KERNEL +#include #include +#include #include #include #include @@ -46,14 +48,18 @@ * prison. struct prison in turn has a linked list of struct jaildesc. * * Locking key: - * (c) set on creation, remains unchanged + * (a) allprison_lock * (d) jd_lock * (p) jd_prison->pr_mtx */ struct jaildesc { - LIST_ENTRY(jaildesc) jd_list; /* (d,p) this prison's descs */ - struct prison *jd_prison; /* (d) the prison */ + LIST_ENTRY(jaildesc) jd_list; /* (p,a) this prison's descs */ + LIST_HEAD(, jaildesc) jd_holding; /* (a) child jaildescs w/o fd */ + LIST_ENTRY(jaildesc) jd_nexthold; /* (a) next in parent's list */ + struct jaildesc *jd_held_by; /* (a) jaildesc that is holding this */ + struct prison *jd_prison; /* (d,a) the prison */ struct mtx jd_lock; + struct selinfo jd_selinfo; /* (d) event notification */ uid_t jd_uid; /* (d) nominal file owner */ gid_t jd_gid; /* (d) nominal file group */ mode_t jd_mode; /* (d) descriptor permissions */ @@ -72,14 +78,23 @@ /* * Flags for the jd_flags field */ +#define JDF_SELECTED 0x00000001 /* issue selwakeup() */ #define JDF_REMOVED 0x00000002 /* jail was removed */ +#define JDF_RECURSE 0x00000004 /* add jaildesc for child jails */ int jaildesc_find(struct thread *td, int fd, struct jaildesc **jdp, struct prison **prp, struct ucred **ucredp); int jaildesc_alloc(struct thread *td, struct file **fpp, int *fdp, int owning); void jaildesc_set_prison(struct file *jd, struct prison *pr); void jaildesc_prison_cleanup(struct prison *pr); +void jaildesc_knote(struct prison *pr, long hint, int64_t data); +void jaildesc_add_child(struct prison *pr); #endif /* _KERNEL */ +/* Jail descriptor ioctl's. 
*/ +#define JIORECURSE _IO('j', 1) /* create recursive desc's */ +#define JIONORECURSE _IO('j', 2) /* stop recursive desc's */ +#define JIOGETCHILD _IOR('j', 3, int) /* fetch queued child desc */ + #endif /* !_SYS_JAILDESC_H_ */