diff --git a/sys/kern/kern_event.c b/sys/kern/kern_event.c
--- a/sys/kern/kern_event.c
+++ b/sys/kern/kern_event.c
@@ -51,6 +51,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -376,6 +377,7 @@
 	[~EVFILT_SENDFILE] = { &null_filtops },
 	[~EVFILT_EMPTY] = { &file_filtops, 1 },
 	[~EVFILT_JAIL] = { &jail_filtops, 1 },
+	[~EVFILT_JAILDESC] = { &file_filtops, 1 },
 };
 
 /*
@@ -557,6 +559,8 @@
 
 	memset(&kev, 0, sizeof(kev));
 	SLIST_FOREACH(kn, &list->kl_list, kn_selnext) {
+		int64_t parent_id;
+
 		kq = kn->kn_kq;
 		KQ_LOCK(kq);
 		if (kn_in_flux(kn) && (kn->kn_status & KN_SCAN) == 0) {
@@ -595,12 +599,24 @@
 		 * notification since both NOTE_CHILD and NOTE_EXIT are defined
 		 * to use the data field (in conflicting ways).
 		 */
+		if (kn->kn_filter == EVFILT_JAILDESC) {
+			/*
+			 * The new note must be EVFILT_JAIL, because
+			 * EVFILT_JAILDESC would require access to
+			 * every process that gets this note.
+			 */
+			kev.filter = EVFILT_JAIL;
+			parent_id = ((struct jaildesc *)kn->kn_fp->f_data)->
+			    jd_prison->pr_id;	/* parent's prison */
+		} else {
+			kev.filter = kn->kn_filter;
+			parent_id = kn->kn_id;		/* parent */
+		}
 		kev.ident = pid;
-		kev.filter = kn->kn_filter;
 		kev.flags = kn->kn_flags | EV_ADD | EV_ENABLE | EV_ONESHOT |
 		    EV_FLAG2;
 		kev.fflags = kn->kn_sfflags;
-		kev.data = kn->kn_id;			/* parent */
+		kev.data = parent_id;
 		kev.udata = kn->kn_kevent.udata;/* preserve udata */
 		error = kqueue_register(kq, &kev, NULL, M_NOWAIT);
 		if (error)
@@ -610,11 +626,12 @@
 		 * Then register another knote to track other potential events
 		 * from the new process.
 		 */
+		kev.filter = kn->kn_filter == EVFILT_JAILDESC ? EVFILT_JAIL :
+		    kn->kn_filter;
 		kev.ident = pid;
-		kev.filter = kn->kn_filter;
 		kev.flags = kn->kn_flags | EV_ADD | EV_ENABLE | EV_FLAG1;
 		kev.fflags = kn->kn_sfflags;
-		kev.data = kn->kn_id;			/* parent */
+		kev.data = parent_id;
 		kev.udata = kn->kn_kevent.udata;/* preserve udata */
 		error = kqueue_register(kq, &kev, NULL, M_NOWAIT);
 		if (error)
diff --git a/sys/kern/kern_jail.c b/sys/kern/kern_jail.c
--- a/sys/kern/kern_jail.c
+++ b/sys/kern/kern_jail.c
@@ -171,7 +171,7 @@
 static void prison_racct_modify(struct prison *pr);
 static void prison_racct_detach(struct prison *pr);
 #endif
-static void prison_knote(struct prison *pr, long hint);
+static void prison_knote(struct prison *pr, long hint, int64_t data);
 
 /* Flags for prison_deref */
 #define	PD_DEREF	0x01	/* Decrement pr_ref */
@@ -2246,7 +2246,7 @@
 	if (created) {
 		sx_assert(&allprison_lock, SX_XLOCKED);
 		mtx_lock(&ppr->pr_mtx);
-		knote_fork(ppr->pr_klist, pr->pr_id);
+		prison_knote(ppr, NOTE_JAIL_CHILD, pr->pr_id);
 		mtx_unlock(&ppr->pr_mtx);
 		mtx_lock(&pr->pr_mtx);
 		drflags |= PD_LOCKED;
@@ -2313,7 +2313,7 @@
 	 * the failure.
 	 */
 	if (maybe_changed && !created)
-		prison_knote(pr, NOTE_JAIL_SET);
+		prison_knote(pr, NOTE_JAIL_SET, 0);
 	/* Release any temporary prison holds and/or locks. */
 	if (pr != NULL)
 		prison_deref(pr, drflags);
@@ -3114,7 +3114,7 @@
 	prison_proc_relink(oldcred->cr_prison, pr, p);
 	prison_deref(oldcred->cr_prison, drflags);
 	crfree(oldcred);
-	prison_knote(pr, NOTE_JAIL_ATTACH | td->td_proc->p_pid);
+	prison_knote(pr, NOTE_JAIL_ATTACH, td->td_proc->p_pid);
 
 	/*
 	 * If the prison was killed while changing credentials, die along
@@ -3782,7 +3782,7 @@
 {
 	sx_assert(&allprison_lock, SA_XLOCKED);
 	mtx_assert(&pr->pr_mtx, MA_OWNED);
-	prison_knote(pr, NOTE_JAIL_REMOVE);
+	prison_knote(pr, NOTE_JAIL_REMOVE, 0);
 	knlist_detach(pr->pr_klist);
 	jaildesc_prison_cleanup(pr);
 	pr->pr_klist = NULL;
@@ -5419,14 +5419,18 @@
  * Submit a knote for a prison, locking if necessary.
  */
 static void
-prison_knote(struct prison *pr, long hint)
+prison_knote(struct prison *pr, long hint, int64_t data)
 {
 	int locked;
 
 	locked = mtx_owned(&pr->pr_mtx);
 	if (!locked)
 		mtx_lock(&pr->pr_mtx);
-	KNOTE_LOCKED(pr->pr_klist, hint);
+	if (hint == NOTE_JAIL_CHILD)
+		knote_fork(pr->pr_klist, data);
+	else
+		KNOTE_LOCKED(pr->pr_klist, hint | data);
+	jaildesc_knote(pr, hint, data);
 	if (!locked)
 		mtx_unlock(&pr->pr_mtx);
 }
diff --git a/sys/kern/kern_jaildesc.c b/sys/kern/kern_jaildesc.c
--- a/sys/kern/kern_jaildesc.c
+++ b/sys/kern/kern_jaildesc.c
@@ -36,6 +36,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -45,6 +46,8 @@
 
 MALLOC_DEFINE(M_JAILDESC, "jaildesc", "jail descriptors");
 
+static fo_poll_t	jaildesc_poll;
+static fo_kqfilter_t	jaildesc_kqfilter;
 static fo_stat_t	jaildesc_stat;
 static fo_close_t	jaildesc_close;
 static fo_chmod_t	jaildesc_chmod;
@@ -57,8 +60,8 @@
 	.fo_write = invfo_rdwr,
 	.fo_truncate = invfo_truncate,
 	.fo_ioctl = invfo_ioctl,
-	.fo_poll = invfo_poll,
-	.fo_kqfilter = invfo_kqfilter,
+	.fo_poll = jaildesc_poll,
+	.fo_kqfilter = jaildesc_kqfilter,
 	.fo_stat = jaildesc_stat,
 	.fo_close = jaildesc_close,
 	.fo_chmod = jaildesc_chmod,
@@ -140,6 +143,7 @@
 	finit(fp, priv_check_cred(fp->f_cred, PRIV_JAIL_SET) == 0 ?
 	    FREAD | FWRITE : FREAD, DTYPE_JAILDESC, jd, &jaildesc_ops);
 	JAILDESC_LOCK_INIT(jd);
+	knlist_init_mtx(&jd->jd_selinfo.si_note, &jd->jd_lock);
 	jd->jd_uid = fp->f_cred->cr_uid;
 	jd->jd_gid = fp->f_cred->cr_gid;
 	jd->jd_mode = S_IFREG | S_IRUSR | S_IRGRP | S_IROTH | mode
@@ -184,6 +188,45 @@
 	}
 }
 
+/*
+ * Pass a note to all kqueues listening on this prison's descriptors.
+ * NOTE_JAIL_CHILD goes through knote_fork() with the new jail's id;
+ * other hints are combined with their data and posted as one value.
+ * NOTE_JAIL_REMOVE also marks each descriptor removed and wakes up
+ * any thread that polled it.  Takes the prison lock if not yet owned.
+ */
+void
+jaildesc_knote(struct prison *pr, long hint, int64_t data)
+{
+	struct jaildesc *jd;
+	int prison_locked;
+
+	if (!LIST_EMPTY(&pr->pr_descs)) {
+		prison_locked = mtx_owned(&pr->pr_mtx);
+		if (!prison_locked)
+			prison_lock(pr);
+		LIST_FOREACH(jd, &pr->pr_descs, jd_list) {
+			JAILDESC_LOCK(jd);
+			if (hint == NOTE_JAIL_CHILD)
+				knote_fork(&jd->jd_selinfo.si_note, data);
+			else {
+				if (hint == NOTE_JAIL_REMOVE) {
+					jd->jd_flags |= JDF_REMOVED;
+					if (jd->jd_flags & JDF_SELECTED) {
+						jd->jd_flags &= ~JDF_SELECTED;
+						selwakeup(&jd->jd_selinfo);
+					}
+				}
+				KNOTE_LOCKED(&jd->jd_selinfo.si_note,
+				    hint | data);
+			}
+			JAILDESC_UNLOCK(jd);
+		}
+		if (!prison_locked)
+			prison_unlock(pr);
+	}
+}
+
 static int
 jaildesc_close(struct file *fp, struct thread *td)
 {
@@ -231,12 +274,128 @@
 			}
 			prison_free(pr);
 		}
+		knlist_destroy(&jd->jd_selinfo.si_note);
 		JAILDESC_LOCK_DESTROY(jd);
 		free(jd, M_JAILDESC);
 	}
 	return (0);
 }
 
+/*
+ * Poll a jail descriptor.  The only condition reported is POLLHUP,
+ * once the jail has been removed; until then, record the thread so
+ * that jaildesc_knote() can wake it up on removal.
+ */
+static int
+jaildesc_poll(struct file *fp, int events, struct ucred *active_cred,
+    struct thread *td)
+{
+	struct jaildesc *jd;
+	int revents;
+
+	revents = 0;
+	jd = fp->f_data;
+	JAILDESC_LOCK(jd);
+	if (jd->jd_flags & JDF_REMOVED)
+		revents |= POLLHUP;
+	if (revents == 0) {
+		selrecord(td, &jd->jd_selinfo);
+		jd->jd_flags |= JDF_SELECTED;
+	}
+	JAILDESC_UNLOCK(jd);
+	return (revents);
+}
+
+/*
+ * Detach a knote from its jail descriptor's note list.
+ */
+static void
+jaildesc_kqops_detach(struct knote *kn)
+{
+	struct jaildesc *jd;
+
+	jd = kn->kn_fp->f_data;
+	knlist_remove(&jd->jd_selinfo.si_note, kn, 0);
+}
+
+/*
+ * Filter an event for a jail descriptor knote.  A zero hint is the
+ * initial test after registration; otherwise the hint holds a
+ * NOTE_JAIL_* control bit, possibly combined with a process id.
+ * Returns nonzero when there is something to report to the user.
+ */
+static int
+jaildesc_kqops_event(struct knote *kn, long hint)
+{
+	struct jaildesc *jd;
+	u_int event;
+
+	jd = kn->kn_fp->f_data;
+	if (hint == 0) {
+		/*
+		 * Initial test after registration. Generate a
+		 * NOTE_JAIL_REMOVE in case the prison already died
+		 * before registration.
+		 */
+		event = jd->jd_flags & JDF_REMOVED ? NOTE_JAIL_REMOVE : 0;
+	} else {
+		/* Mask off extra data. */
+		event = (u_int)hint & NOTE_JAIL_CTRLMASK;
+	}
+
+	/*
+	 * If the user is interested in this event, record it.  Events
+	 * that were not requested must not touch kn_fflags or kn_data,
+	 * or an unrequested attach would activate the knote.
+	 */
+	if (kn->kn_sfflags & event) {
+		kn->kn_fflags |= event;
+		/* Report the attached process id. */
+		if (event == NOTE_JAIL_ATTACH) {
+			if (kn->kn_data != 0)
+				kn->kn_fflags |= NOTE_JAIL_ATTACH_MULTI;
+			kn->kn_data = hint & NOTE_JAIL_DATAMASK;
+		}
+	}
+
+	/* Prison is gone, so flag the event as finished. */
+	if (event == NOTE_JAIL_REMOVE) {
+		kn->kn_flags |= EV_EOF | EV_ONESHOT;
+		if (kn->kn_fflags == 0)
+			kn->kn_flags |= EV_DROP;
+		return (1);
+	}
+
+	return (kn->kn_fflags != 0);
+}
+
+static const struct filterops jaildesc_kqops = {
+	.f_isfd = 1,
+	.f_detach = jaildesc_kqops_detach,
+	.f_event = jaildesc_kqops_event,
+};
+
+/*
+ * Attach a knote to a jail descriptor.  Only EVFILT_JAILDESC is
+ * supported, and its events are always reported with EV_CLEAR.
+ */
+static int
+jaildesc_kqfilter(struct file *fp, struct knote *kn)
+{
+	struct jaildesc *jd;
+
+	jd = fp->f_data;
+	switch (kn->kn_filter) {
+	case EVFILT_JAILDESC:
+		kn->kn_fop = &jaildesc_kqops;
+		kn->kn_flags |= EV_CLEAR;
+		knlist_add(&jd->jd_selinfo.si_note, kn, 0);
+		return (0);
+	default:
+		return (EINVAL);
+	}
+}
+
 static int
 jaildesc_stat(struct file *fp, struct stat *sb, struct ucred *active_cred)
 {
diff --git a/sys/sys/event.h b/sys/sys/event.h
--- a/sys/sys/event.h
+++ b/sys/sys/event.h
@@ -46,7 +46,8 @@
 #define EVFILT_SENDFILE		(-12)	/* attached to sendfile requests */
 #define EVFILT_EMPTY		(-13)	/* empty send socket buf */
 #define EVFILT_JAIL		(-14)	/* attached to struct prison */
-#define EVFILT_SYSCOUNT		14
+#define EVFILT_JAILDESC		(-15)	/* attached to jail descriptors */
+#define EVFILT_SYSCOUNT		15
 
 #if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
 #define	EV_SET(kevp_, a, b, c, d, e, f) do {	\
@@ -205,7 +206,7 @@
 #define	NOTE_PCTRLMASK	0xf0000000		/* mask for hint bits */
 #define	NOTE_PDATAMASK	0x000fffff		/* mask for pid */
 
-/* data/hint flags for EVFILT_JAIL */
+/* data/hint flags for EVFILT_JAIL and EVFILT_JAILDESC */
 #define	NOTE_JAIL_SET	0x80000000		/* jail was modified */
 #define	NOTE_JAIL_CHILD	0x40000000		/* child jail was created */
 #define	NOTE_JAIL_ATTACH 0x20000000		/* jail was attached to */
@@ -214,7 +215,7 @@
 #define	NOTE_JAIL_CTRLMASK 0xf0000000		/* mask for hint bits */
 #define	NOTE_JAIL_DATAMASK 0x000fffff		/* mask for pid */
 
-/* additional flags for EVFILT_PROC and EVFILT_JAIL */
+/* additional flags for EVFILT_PROC, EVFILT_JAIL, and EVFILT_JAILDESC */
 #define	NOTE_TRACK	0x00000001		/* follow across fork/create */
 #define	NOTE_TRACKERR	0x00000002		/* could not track child */
 #define	NOTE_CHILD	0x00000004		/* am a child process/jail */
diff --git a/sys/sys/jaildesc.h b/sys/sys/jaildesc.h
--- a/sys/sys/jaildesc.h
+++ b/sys/sys/jaildesc.h
@@ -35,6 +35,7 @@
 #ifdef _KERNEL
 
 #include
+#include
 #include
 #include
 #include
@@ -54,6 +55,7 @@
 	LIST_ENTRY(jaildesc) jd_list;	/* (d,p) this prison's descs */
 	struct prison	*jd_prison;	/* (d) the prison */
 	struct mtx	 jd_lock;
+	struct selinfo	 jd_selinfo;	/* (d) event notification */
 	uid_t		 jd_uid;	/* (d) nominal file owner */
 	gid_t		 jd_gid;	/* (d) nominal file group */
 	mode_t		 jd_mode;	/* (d) descriptor permissions */
@@ -72,6 +74,7 @@
 /*
  * Flags for the jd_flags field
  */
+#define	JDF_SELECTED	0x00000001	/* issue selwakeup() */
 #define	JDF_REMOVED	0x00000002	/* jail was removed */
 
 int jaildesc_find(struct thread *td, int fd, struct jaildesc **jdp,
@@ -79,6 +82,7 @@
 int jaildesc_alloc(struct thread *td, struct file **fpp, int *fdp, int owning);
 void jaildesc_set_prison(struct file *jd, struct prison *pr);
 void jaildesc_prison_cleanup(struct prison *pr);
+void jaildesc_knote(struct prison *pr, long hint, int64_t data);
 
 #endif /* _KERNEL */
 