Index: head/sys/compat/svr4/svr4_stream.c =================================================================== --- head/sys/compat/svr4/svr4_stream.c (revision 174987) +++ head/sys/compat/svr4/svr4_stream.c (revision 174988) @@ -1,2033 +1,2029 @@ /*- * Copyright (c) 1998 Mark Newton. All rights reserved. * Copyright (c) 1994, 1996 Christos Zoulas. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by Christos Zoulas. * 4. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * Pretend that we have streams... * Yes, this is gross. 
* * ToDo: The state machine for getmsg needs re-thinking */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_ktrace.h" #include #include #include #include #include #include #include #include /* Must come after sys/malloc.h */ #include #include #include #include #include #include #include #include #include #include #include #include #include /* Must come after sys/uio.h */ #include #include #include #include #include #include #include #include #include #include #include #include /* Utils */ static int clean_pipe(struct thread *, char *); static void getparm(struct file *, struct svr4_si_sockparms *); static int svr4_do_putmsg(struct thread *, struct svr4_sys_putmsg_args *, struct file *); static int svr4_do_getmsg(struct thread *, struct svr4_sys_getmsg_args *, struct file *); /* Address Conversions */ static void sockaddr_to_netaddr_in(struct svr4_strmcmd *, const struct sockaddr_in *); static void sockaddr_to_netaddr_un(struct svr4_strmcmd *, const struct sockaddr_un *); static void netaddr_to_sockaddr_in(struct sockaddr_in *, const struct svr4_strmcmd *); static void netaddr_to_sockaddr_un(struct sockaddr_un *, const struct svr4_strmcmd *); /* stream ioctls */ static int i_nread(struct file *, struct thread *, register_t *, int, u_long, caddr_t); static int i_fdinsert(struct file *, struct thread *, register_t *, int, u_long, caddr_t); static int i_str(struct file *, struct thread *, register_t *, int, u_long, caddr_t); static int i_setsig(struct file *, struct thread *, register_t *, int, u_long, caddr_t); static int i_getsig(struct file *, struct thread *, register_t *, int, u_long, caddr_t); static int _i_bind_rsvd(struct file *, struct thread *, register_t *, int, u_long, caddr_t); static int _i_rele_rsvd(struct file *, struct thread *, register_t *, int, u_long, caddr_t); /* i_str sockmod calls */ static int sockmod(struct file *, int, struct svr4_strioctl *, struct thread *); static int si_listen(struct file *, int, struct svr4_strioctl *, struct thread *); static int si_ogetudata(struct file *, int, struct svr4_strioctl *, struct thread *); static int si_sockparams(struct file *, int, struct svr4_strioctl *, struct thread *); static int si_shutdown (struct file *, int, struct svr4_strioctl *, struct thread *); static int si_getudata(struct file *, int, struct svr4_strioctl *, struct thread *); /* i_str timod calls */ static int timod(struct file *, int, struct svr4_strioctl *, struct thread *); static int ti_getinfo(struct file *, int, struct svr4_strioctl *, struct thread *); static int ti_bind(struct file *, int, struct svr4_strioctl *, struct thread *); #ifdef DEBUG_SVR4 static void bufprint(u_char *, size_t); static int show_ioc(const char *, struct svr4_strioctl *); static int show_strbuf(struct svr4_strbuf *); static void show_msg(const char *, int, struct svr4_strbuf *, struct svr4_strbuf *, int); static void bufprint(buf, len) u_char *buf; size_t len; { size_t i; uprintf("\n\t"); for (i = 0; i < len; i++) { uprintf("%x ", buf[i]); if (i && (i % 16) == 0) uprintf("\n\t"); } } static int show_ioc(str, ioc) const char *str; struct svr4_strioctl *ioc; { u_char *ptr = NULL; int len; int error; len = ioc->len; if (len > 1024) len = 1024; if (len > 0) { ptr = (u_char *) malloc(len, M_TEMP, M_WAITOK); if ((error = copyin(ioc->buf, ptr, len)) != 0) { free((char *) ptr, M_TEMP); return error; } } uprintf("%s cmd = %ld, timeout = %d, len = %d, buf = %p { ", str, ioc->cmd, ioc->timeout, ioc->len, ioc->buf); if (ptr != NULL) bufprint(ptr, len); uprintf("}\n"); if 
(ptr != NULL) free((char *) ptr, M_TEMP); return 0; } static int show_strbuf(str) struct svr4_strbuf *str; { int error; u_char *ptr = NULL; int maxlen = str->maxlen; int len = str->len; if (maxlen > 8192) maxlen = 8192; if (maxlen < 0) maxlen = 0; if (len >= maxlen) len = maxlen; if (len > 0) { ptr = (u_char *) malloc(len, M_TEMP, M_WAITOK); if ((error = copyin(str->buf, ptr, len)) != 0) { free((char *) ptr, M_TEMP); return error; } } uprintf(", { %d, %d, %p=[ ", str->maxlen, str->len, str->buf); if (ptr) bufprint(ptr, len); uprintf("]}"); if (ptr) free((char *) ptr, M_TEMP); return 0; } static void show_msg(str, fd, ctl, dat, flags) const char *str; int fd; struct svr4_strbuf *ctl; struct svr4_strbuf *dat; int flags; { struct svr4_strbuf buf; int error; uprintf("%s(%d", str, fd); if (ctl != NULL) { if ((error = copyin(ctl, &buf, sizeof(buf))) != 0) return; show_strbuf(&buf); } else uprintf(", NULL"); if (dat != NULL) { if ((error = copyin(dat, &buf, sizeof(buf))) != 0) return; show_strbuf(&buf); } else uprintf(", NULL"); uprintf(", %x);\n", flags); } #endif /* DEBUG_SVR4 */ /* * We are faced with an interesting situation. On svr4 unix sockets * are really pipes. But we really have sockets, and we might as * well use them. At the point where svr4 calls TI_BIND, it has * already created a named pipe for the socket using mknod(2). * We need to create a socket with the same name when we bind, * so we need to remove the pipe before, otherwise we'll get address * already in use. So we *carefully* remove the pipe, to avoid * using this as a random file removal tool. We use system calls * to avoid code duplication. */ static int clean_pipe(td, path) struct thread *td; char *path; { struct stat st; int error; error = kern_lstat(td, path, UIO_SYSSPACE, &st); /* * Make sure we are dealing with a mode 0 named pipe. 
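	 * A "mode 0 named pipe" here means a FIFO with no permission bits
	 * set at all, i.e. the sort of placeholder node the emulated code
	 * creates for the socket beforehand (via mknod(2), or via our
	 * _I_BIND_RSVD ioctl, which does a mkfifo() with mode S_IFIFO and
	 * hence no access bits).  Anything that does not match exactly is
	 * left untouched.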
*/ if ((st.st_mode & S_IFMT) != S_IFIFO) return (0); if ((st.st_mode & ALLPERMS) != 0) return (0); error = kern_unlink(td, path, UIO_SYSSPACE); if (error) DPRINTF(("clean_pipe: unlink failed %d\n", error)); return (error); } static void sockaddr_to_netaddr_in(sc, sain) struct svr4_strmcmd *sc; const struct sockaddr_in *sain; { struct svr4_netaddr_in *na; na = SVR4_ADDROF(sc); na->family = sain->sin_family; na->port = sain->sin_port; na->addr = sain->sin_addr.s_addr; DPRINTF(("sockaddr_in -> netaddr %d %d %lx\n", na->family, na->port, na->addr)); } static void sockaddr_to_netaddr_un(sc, saun) struct svr4_strmcmd *sc; const struct sockaddr_un *saun; { struct svr4_netaddr_un *na; char *dst, *edst = ((char *) sc) + sc->offs + sizeof(na->family) + 1 - sizeof(*sc); const char *src; na = SVR4_ADDROF(sc); na->family = saun->sun_family; for (src = saun->sun_path, dst = na->path; (*dst++ = *src++) != '\0'; ) if (dst == edst) break; DPRINTF(("sockaddr_un -> netaddr %d %s\n", na->family, na->path)); } static void netaddr_to_sockaddr_in(sain, sc) struct sockaddr_in *sain; const struct svr4_strmcmd *sc; { const struct svr4_netaddr_in *na; na = SVR4_C_ADDROF(sc); memset(sain, 0, sizeof(*sain)); sain->sin_len = sizeof(*sain); sain->sin_family = na->family; sain->sin_port = na->port; sain->sin_addr.s_addr = na->addr; DPRINTF(("netaddr -> sockaddr_in %d %d %x\n", sain->sin_family, sain->sin_port, sain->sin_addr.s_addr)); } static void netaddr_to_sockaddr_un(saun, sc) struct sockaddr_un *saun; const struct svr4_strmcmd *sc; { const struct svr4_netaddr_un *na; char *dst, *edst = &saun->sun_path[sizeof(saun->sun_path) - 1]; const char *src; na = SVR4_C_ADDROF(sc); memset(saun, 0, sizeof(*saun)); saun->sun_family = na->family; for (src = na->path, dst = saun->sun_path; (*dst++ = *src++) != '\0'; ) if (dst == edst) break; saun->sun_len = dst - saun->sun_path; DPRINTF(("netaddr -> sockaddr_un %d %s\n", saun->sun_family, saun->sun_path)); } static void getparm(fp, pa) struct file *fp; struct svr4_si_sockparms *pa; { struct svr4_strm *st; struct socket *so; st = svr4_stream_get(fp); if (st == NULL) return; so = fp->f_data; pa->family = st->s_family; switch (so->so_type) { case SOCK_DGRAM: pa->type = SVR4_T_CLTS; pa->protocol = IPPROTO_UDP; DPRINTF(("getparm(dgram)\n")); return; case SOCK_STREAM: pa->type = SVR4_T_COTS; /* What about T_COTS_ORD? XXX */ pa->protocol = IPPROTO_IP; DPRINTF(("getparm(stream)\n")); return; case SOCK_RAW: pa->type = SVR4_T_CLTS; pa->protocol = IPPROTO_RAW; DPRINTF(("getparm(raw)\n")); return; default: pa->type = 0; pa->protocol = 0; DPRINTF(("getparm(type %d?)\n", so->so_type)); return; } } static int si_ogetudata(fp, fd, ioc, td) struct file *fp; int fd; struct svr4_strioctl *ioc; struct thread *td; { int error; struct svr4_si_oudata ud; struct svr4_si_sockparms pa; if (ioc->len != sizeof(ud) && ioc->len != sizeof(ud) - sizeof(int)) { DPRINTF(("SI_OGETUDATA: Wrong size %d != %d\n", sizeof(ud), ioc->len)); return EINVAL; } if ((error = copyin(ioc->buf, &ud, sizeof(ud))) != 0) return error; getparm(fp, &pa); switch (pa.family) { case AF_INET: ud.tidusize = 16384; ud.addrsize = sizeof(struct svr4_sockaddr_in); if (pa.type == SVR4_SOCK_STREAM) ud.etsdusize = 1; else ud.etsdusize = 0; break; case AF_LOCAL: ud.tidusize = 65536; ud.addrsize = 128; ud.etsdusize = 128; break; default: DPRINTF(("SI_OGETUDATA: Unsupported address family %d\n", pa.family)); return ENOSYS; } /* I have no idea what these should be! 
*/ ud.optsize = 128; ud.tsdusize = 128; ud.servtype = pa.type; /* XXX: Fixme */ ud.so_state = 0; ud.so_options = 0; return copyout(&ud, ioc->buf, ioc->len); } static int si_sockparams(fp, fd, ioc, td) struct file *fp; int fd; struct svr4_strioctl *ioc; struct thread *td; { struct svr4_si_sockparms pa; getparm(fp, &pa); return copyout(&pa, ioc->buf, sizeof(pa)); } static int si_listen(fp, fd, ioc, td) struct file *fp; int fd; struct svr4_strioctl *ioc; struct thread *td; { int error; struct svr4_strm *st = svr4_stream_get(fp); struct svr4_strmcmd lst; struct listen_args la; if (st == NULL) return EINVAL; if (ioc->len < 0 || ioc->len > sizeof(lst)) return EINVAL; if ((error = copyin(ioc->buf, &lst, ioc->len)) != 0) return error; if (lst.cmd != SVR4_TI_OLD_BIND_REQUEST) { DPRINTF(("si_listen: bad request %ld\n", lst.cmd)); return EINVAL; } /* * We are making assumptions again... */ la.s = fd; DPRINTF(("SI_LISTEN: fileno %d backlog = %d\n", fd, 5)); la.backlog = 5; if ((error = listen(td, &la)) != 0) { DPRINTF(("SI_LISTEN: listen failed %d\n", error)); return error; } st->s_cmd = SVR4_TI__ACCEPT_WAIT; lst.cmd = SVR4_TI_BIND_REPLY; switch (st->s_family) { case AF_INET: /* XXX: Fill the length here */ break; case AF_LOCAL: lst.len = 140; lst.pad[28] = 0x00000000; /* magic again */ lst.pad[29] = 0x00000800; /* magic again */ lst.pad[30] = 0x80001400; /* magic again */ break; default: DPRINTF(("SI_LISTEN: Unsupported address family %d\n", st->s_family)); return ENOSYS; } if ((error = copyout(&lst, ioc->buf, ioc->len)) != 0) return error; return 0; } static int si_getudata(fp, fd, ioc, td) struct file *fp; int fd; struct svr4_strioctl *ioc; struct thread *td; { int error; struct svr4_si_udata ud; if (sizeof(ud) != ioc->len) { DPRINTF(("SI_GETUDATA: Wrong size %d != %d\n", sizeof(ud), ioc->len)); return EINVAL; } if ((error = copyin(ioc->buf, &ud, sizeof(ud))) != 0) return error; getparm(fp, &ud.sockparms); switch (ud.sockparms.family) { case AF_INET: DPRINTF(("getudata_inet\n")); ud.tidusize = 16384; ud.tsdusize = 16384; ud.addrsize = sizeof(struct svr4_sockaddr_in); if (ud.sockparms.type == SVR4_SOCK_STREAM) ud.etsdusize = 1; else ud.etsdusize = 0; ud.optsize = 0; break; case AF_LOCAL: DPRINTF(("getudata_local\n")); ud.tidusize = 65536; ud.tsdusize = 128; ud.addrsize = 128; ud.etsdusize = 128; ud.optsize = 128; break; default: DPRINTF(("SI_GETUDATA: Unsupported address family %d\n", ud.sockparms.family)); return ENOSYS; } ud.servtype = ud.sockparms.type; DPRINTF(("ud.servtype = %d\n", ud.servtype)); /* XXX: Fixme */ ud.so_state = 0; ud.so_options = 0; return copyout(&ud, ioc->buf, sizeof(ud)); } static int si_shutdown(fp, fd, ioc, td) struct file *fp; int fd; struct svr4_strioctl *ioc; struct thread *td; { int error; struct shutdown_args ap; if (ioc->len != sizeof(ap.how)) { DPRINTF(("SI_SHUTDOWN: Wrong size %d != %d\n", sizeof(ap.how), ioc->len)); return EINVAL; } if ((error = copyin(ioc->buf, &ap.how, ioc->len)) != 0) return error; ap.s = fd; return shutdown(td, &ap); } static int sockmod(fp, fd, ioc, td) struct file *fp; int fd; struct svr4_strioctl *ioc; struct thread *td; { switch (ioc->cmd) { case SVR4_SI_OGETUDATA: DPRINTF(("SI_OGETUDATA\n")); return si_ogetudata(fp, fd, ioc, td); case SVR4_SI_SHUTDOWN: DPRINTF(("SI_SHUTDOWN\n")); return si_shutdown(fp, fd, ioc, td); case SVR4_SI_LISTEN: DPRINTF(("SI_LISTEN\n")); return si_listen(fp, fd, ioc, td); case SVR4_SI_SETMYNAME: DPRINTF(("SI_SETMYNAME\n")); return 0; case SVR4_SI_SETPEERNAME: DPRINTF(("SI_SETPEERNAME\n")); return 0; case 
SVR4_SI_GETINTRANSIT: DPRINTF(("SI_GETINTRANSIT\n")); return 0; case SVR4_SI_TCL_LINK: DPRINTF(("SI_TCL_LINK\n")); return 0; case SVR4_SI_TCL_UNLINK: DPRINTF(("SI_TCL_UNLINK\n")); return 0; case SVR4_SI_SOCKPARAMS: DPRINTF(("SI_SOCKPARAMS\n")); return si_sockparams(fp, fd, ioc, td); case SVR4_SI_GETUDATA: DPRINTF(("SI_GETUDATA\n")); return si_getudata(fp, fd, ioc, td); default: DPRINTF(("Unknown sockmod ioctl %lx\n", ioc->cmd)); return 0; } } static int ti_getinfo(fp, fd, ioc, td) struct file *fp; int fd; struct svr4_strioctl *ioc; struct thread *td; { int error; struct svr4_infocmd info; memset(&info, 0, sizeof(info)); if (ioc->len < 0 || ioc->len > sizeof(info)) return EINVAL; if ((error = copyin(ioc->buf, &info, ioc->len)) != 0) return error; if (info.cmd != SVR4_TI_INFO_REQUEST) return EINVAL; info.cmd = SVR4_TI_INFO_REPLY; info.tsdu = 0; info.etsdu = 1; info.cdata = -2; info.ddata = -2; info.addr = 16; info.opt = -1; info.tidu = 16384; info.serv = 2; info.current = 0; info.provider = 2; ioc->len = sizeof(info); if ((error = copyout(&info, ioc->buf, ioc->len)) != 0) return error; return 0; } static int ti_bind(fp, fd, ioc, td) struct file *fp; int fd; struct svr4_strioctl *ioc; struct thread *td; { int error; struct svr4_strm *st = svr4_stream_get(fp); struct sockaddr_in sain; struct sockaddr_un saun; struct sockaddr *skp; int sasize; struct svr4_strmcmd bnd; if (st == NULL) { DPRINTF(("ti_bind: bad file descriptor\n")); return EINVAL; } if (ioc->len < 0 || ioc->len > sizeof(bnd)) return EINVAL; if ((error = copyin(ioc->buf, &bnd, ioc->len)) != 0) return error; if (bnd.cmd != SVR4_TI_OLD_BIND_REQUEST) { DPRINTF(("ti_bind: bad request %ld\n", bnd.cmd)); return EINVAL; } switch (st->s_family) { case AF_INET: skp = (struct sockaddr *)&sain; sasize = sizeof(sain); if (bnd.offs == 0) goto error; netaddr_to_sockaddr_in(&sain, &bnd); DPRINTF(("TI_BIND: fam %d, port %d, addr %x\n", sain.sin_family, sain.sin_port, sain.sin_addr.s_addr)); break; case AF_LOCAL: skp = (struct sockaddr *)&saun; sasize = sizeof(saun); if (bnd.offs == 0) goto error; netaddr_to_sockaddr_un(&saun, &bnd); if (saun.sun_path[0] == '\0') goto error; DPRINTF(("TI_BIND: fam %d, path %s\n", saun.sun_family, saun.sun_path)); if ((error = clean_pipe(td, saun.sun_path)) != 0) return error; bnd.pad[28] = 0x00001000; /* magic again */ break; default: DPRINTF(("TI_BIND: Unsupported address family %d\n", st->s_family)); return ENOSYS; } DPRINTF(("TI_BIND: fileno %d\n", fd)); if ((error = kern_bind(td, fd, skp)) != 0) { DPRINTF(("TI_BIND: bind failed %d\n", error)); return error; } goto reply; error: memset(&bnd, 0, sizeof(bnd)); bnd.len = sasize + 4; bnd.offs = 0x10; /* XXX */ reply: bnd.cmd = SVR4_TI_BIND_REPLY; if ((error = copyout(&bnd, ioc->buf, ioc->len)) != 0) return error; return 0; } static int timod(fp, fd, ioc, td) struct file *fp; int fd; struct svr4_strioctl *ioc; struct thread *td; { switch (ioc->cmd) { case SVR4_TI_GETINFO: DPRINTF(("TI_GETINFO\n")); return ti_getinfo(fp, fd, ioc, td); case SVR4_TI_OPTMGMT: DPRINTF(("TI_OPTMGMT\n")); return 0; case SVR4_TI_BIND: DPRINTF(("TI_BIND\n")); return ti_bind(fp, fd, ioc, td); case SVR4_TI_UNBIND: DPRINTF(("TI_UNBIND\n")); return 0; default: DPRINTF(("Unknown timod ioctl %lx\n", ioc->cmd)); return 0; } } int svr4_stream_ti_ioctl(fp, td, retval, fd, cmd, dat) struct file *fp; struct thread *td; register_t *retval; int fd; u_long cmd; caddr_t dat; { struct svr4_strbuf skb, *sub = (struct svr4_strbuf *) dat; struct svr4_strm *st = svr4_stream_get(fp); int error; struct sockaddr 
*sa; socklen_t sasize, oldsasize; struct svr4_strmcmd sc; DPRINTF(("svr4_stream_ti_ioctl\n")); if (st == NULL) return EINVAL; sc.offs = 0x10; if ((error = copyin(sub, &skb, sizeof(skb))) != 0) { DPRINTF(("ti_ioctl: error copying in strbuf\n")); return error; } switch (st->s_family) { case AF_INET: sasize = sizeof(struct sockaddr_in); break; case AF_LOCAL: sasize = sizeof(struct sockaddr_un); break; default: DPRINTF(("ti_ioctl: Unsupported address family %d\n", st->s_family)); return ENOSYS; } oldsasize = sasize; switch (cmd) { case SVR4_TI_GETMYNAME: DPRINTF(("TI_GETMYNAME\n")); { error = kern_getsockname(td, fd, &sa, &sasize); if (error) { DPRINTF(("ti_ioctl: getsockname error\n")); return error; } } break; case SVR4_TI_GETPEERNAME: DPRINTF(("TI_GETPEERNAME\n")); { error = kern_getpeername(td, fd, &sa, &sasize); if (error) { DPRINTF(("ti_ioctl: getpeername error\n")); return error; } } break; case SVR4_TI_SETMYNAME: DPRINTF(("TI_SETMYNAME\n")); return 0; case SVR4_TI_SETPEERNAME: DPRINTF(("TI_SETPEERNAME\n")); return 0; default: DPRINTF(("ti_ioctl: Unknown ioctl %lx\n", cmd)); return ENOSYS; } if (sasize < 0 || sasize > oldsasize) { free(sa, M_SONAME); return EINVAL; } switch (st->s_family) { case AF_INET: sockaddr_to_netaddr_in(&sc, (struct sockaddr_in *)sa); skb.len = sasize; break; case AF_LOCAL: sockaddr_to_netaddr_un(&sc, (struct sockaddr_un *)sa); skb.len = sasize + 4; break; default: free(sa, M_SONAME); return ENOSYS; } free(sa, M_SONAME); if ((error = copyout(SVR4_ADDROF(&sc), skb.buf, sasize)) != 0) { DPRINTF(("ti_ioctl: error copying out socket data\n")); return error; } if ((error = copyout(&skb, sub, sizeof(skb))) != 0) { DPRINTF(("ti_ioctl: error copying out strbuf\n")); return error; } return error; } static int i_nread(fp, td, retval, fd, cmd, dat) struct file *fp; struct thread *td; register_t *retval; int fd; u_long cmd; caddr_t dat; { int error; int nread = 0; /* * We are supposed to return the message length in nread, and the * number of messages in retval. We don't have the notion of number * of stream messages, so we just find out if we have any bytes waiting * for us, and if we do, then we assume that we have at least one * message waiting for us. */ if ((error = fo_ioctl(fp, FIONREAD, (caddr_t) &nread, td->td_ucred, td)) != 0) return error; if (nread != 0) *retval = 1; else *retval = 0; return copyout(&nread, dat, sizeof(nread)); } static int i_fdinsert(fp, td, retval, fd, cmd, dat) struct file *fp; struct thread *td; register_t *retval; int fd; u_long cmd; caddr_t dat; { /* * Major hack again here. We assume that we are using this to * implement accept(2). If that is the case, we have already * called accept, and we have stored the file descriptor in * afd. We find the file descriptor that the code wants to use * in fd insert, and then we dup2() our accepted file descriptor * to it. 
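	 *
	 * In effect the sequence below amounts to (fd numbers are whatever
	 * the emulated program handed us, shown only as illustration):
	 *
	 *	afd = st->s_afd;	accepted earlier by getmsg()
	 *	dup2(afd, fdi.fd);	fdi.fd names the fd from I_FDINSERT
	 *	close(afd);
	 *	st->s_afd = -1;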
*/ int error; struct svr4_strm *st = svr4_stream_get(fp); struct svr4_strfdinsert fdi; struct dup2_args d2p; if (st == NULL) { DPRINTF(("fdinsert: bad file type\n")); return EINVAL; } mtx_lock(&Giant); if (st->s_afd == -1) { DPRINTF(("fdinsert: accept fd not found\n")); mtx_unlock(&Giant); return ENOENT; } if ((error = copyin(dat, &fdi, sizeof(fdi))) != 0) { DPRINTF(("fdinsert: copyin failed %d\n", error)); mtx_unlock(&Giant); return error; } d2p.from = st->s_afd; d2p.to = fdi.fd; if ((error = dup2(td, &d2p)) != 0) { DPRINTF(("fdinsert: dup2(%d, %d) failed %d\n", st->s_afd, fdi.fd, error)); mtx_unlock(&Giant); return error; } if ((error = kern_close(td, st->s_afd)) != 0) { DPRINTF(("fdinsert: close(%d) failed %d\n", st->s_afd, error)); mtx_unlock(&Giant); return error; } st->s_afd = -1; mtx_unlock(&Giant); *retval = 0; return 0; } static int _i_bind_rsvd(fp, td, retval, fd, cmd, dat) struct file *fp; struct thread *td; register_t *retval; int fd; u_long cmd; caddr_t dat; { struct mkfifo_args ap; /* * This is a supposed to be a kernel and library only ioctl. * It gets called before ti_bind, when we have a unix * socket, to physically create the socket transport and * ``reserve'' it. I don't know how this get reserved inside * the kernel, but we are going to create it nevertheless. */ ap.path = dat; ap.mode = S_IFIFO; return mkfifo(td, &ap); } static int _i_rele_rsvd(fp, td, retval, fd, cmd, dat) struct file *fp; struct thread *td; register_t *retval; int fd; u_long cmd; caddr_t dat; { struct unlink_args ap; /* * This is a supposed to be a kernel and library only ioctl. * I guess it is supposed to release the socket. */ ap.path = dat; return unlink(td, &ap); } static int i_str(fp, td, retval, fd, cmd, dat) struct file *fp; struct thread *td; register_t *retval; int fd; u_long cmd; caddr_t dat; { int error; struct svr4_strioctl ioc; if ((error = copyin(dat, &ioc, sizeof(ioc))) != 0) return error; #ifdef DEBUG_SVR4 if ((error = show_ioc(">", &ioc)) != 0) return error; #endif /* DEBUG_SVR4 */ switch (ioc.cmd & 0xff00) { case SVR4_SIMOD: if ((error = sockmod(fp, fd, &ioc, td)) != 0) return error; break; case SVR4_TIMOD: if ((error = timod(fp, fd, &ioc, td)) != 0) return error; break; default: DPRINTF(("Unimplemented module %c %ld\n", (char) (cmd >> 8), cmd & 0xff)); return 0; } #ifdef DEBUG_SVR4 if ((error = show_ioc("<", &ioc)) != 0) return error; #endif /* DEBUG_SVR4 */ return copyout(&ioc, dat, sizeof(ioc)); } static int i_setsig(fp, td, retval, fd, cmd, dat) struct file *fp; struct thread *td; register_t *retval; int fd; u_long cmd; caddr_t dat; { /* * This is the best we can do for now; we cannot generate * signals only for specific events so the signal mask gets * ignored; we save it just to pass it to a possible I_GETSIG... * * We alse have to fix the O_ASYNC fcntl bit, so the * process will get SIGPOLLs. 
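	 *
	 * From the descriptor's point of view this is just the usual
	 * async I/O setup, roughly:
	 *
	 *	flags = fcntl(fd, F_GETFL);
	 *	fcntl(fd, F_SETFL, flags | O_ASYNC);	(cleared again when
	 *						 dat == NULL)
	 *	fcntl(fd, F_SETOWN, getpid());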
*/ int error; register_t oflags, flags; struct svr4_strm *st = svr4_stream_get(fp); if (st == NULL) { DPRINTF(("i_setsig: bad file descriptor\n")); return EINVAL; } /* get old status flags */ error = kern_fcntl(td, fd, F_GETFL, 0); if (error) return (error); oflags = td->td_retval[0]; /* update the flags */ mtx_lock(&Giant); if (dat != NULL) { int mask; flags = oflags | O_ASYNC; if ((error = copyin(dat, &mask, sizeof(mask))) != 0) { DPRINTF(("i_setsig: bad eventmask pointer\n")); return error; } if (mask & SVR4_S_ALLMASK) { DPRINTF(("i_setsig: bad eventmask data %x\n", mask)); return EINVAL; } st->s_eventmask = mask; } else { flags = oflags & ~O_ASYNC; st->s_eventmask = 0; } mtx_unlock(&Giant); /* set the new flags, if changed */ if (flags != oflags) { error = kern_fcntl(td, fd, F_SETFL, flags); if (error) return (error); flags = td->td_retval[0]; } /* set up SIGIO receiver if needed */ if (dat != NULL) return (kern_fcntl(td, fd, F_SETOWN, td->td_proc->p_pid)); return 0; } static int i_getsig(fp, td, retval, fd, cmd, dat) struct file *fp; struct thread *td; register_t *retval; int fd; u_long cmd; caddr_t dat; { int error, eventmask; if (dat != NULL) { struct svr4_strm *st = svr4_stream_get(fp); if (st == NULL) { DPRINTF(("i_getsig: bad file descriptor\n")); return EINVAL; } mtx_lock(&Giant); eventmask = st->s_eventmask; mtx_unlock(&Giant); if ((error = copyout(&eventmask, dat, sizeof(eventmask))) != 0) { DPRINTF(("i_getsig: bad eventmask pointer\n")); return error; } } return 0; } int svr4_stream_ioctl(fp, td, retval, fd, cmd, dat) struct file *fp; struct thread *td; register_t *retval; int fd; u_long cmd; caddr_t dat; { *retval = 0; /* * All the following stuff assumes "sockmod" is pushed... */ switch (cmd) { case SVR4_I_NREAD: DPRINTF(("I_NREAD\n")); return i_nread(fp, td, retval, fd, cmd, dat); case SVR4_I_PUSH: DPRINTF(("I_PUSH %p\n", dat)); #if defined(DEBUG_SVR4) show_strbuf((struct svr4_strbuf *)dat); #endif return 0; case SVR4_I_POP: DPRINTF(("I_POP\n")); return 0; case SVR4_I_LOOK: DPRINTF(("I_LOOK\n")); return 0; case SVR4_I_FLUSH: DPRINTF(("I_FLUSH\n")); return 0; case SVR4_I_SRDOPT: DPRINTF(("I_SRDOPT\n")); return 0; case SVR4_I_GRDOPT: DPRINTF(("I_GRDOPT\n")); return 0; case SVR4_I_STR: DPRINTF(("I_STR\n")); return i_str(fp, td, retval, fd, cmd, dat); case SVR4_I_SETSIG: DPRINTF(("I_SETSIG\n")); return i_setsig(fp, td, retval, fd, cmd, dat); case SVR4_I_GETSIG: DPRINTF(("I_GETSIG\n")); return i_getsig(fp, td, retval, fd, cmd, dat); case SVR4_I_FIND: DPRINTF(("I_FIND\n")); /* * Here we are not pushing modules really, we just * pretend all are present */ *retval = 0; return 0; case SVR4_I_LINK: DPRINTF(("I_LINK\n")); return 0; case SVR4_I_UNLINK: DPRINTF(("I_UNLINK\n")); return 0; case SVR4_I_ERECVFD: DPRINTF(("I_ERECVFD\n")); return 0; case SVR4_I_PEEK: DPRINTF(("I_PEEK\n")); return 0; case SVR4_I_FDINSERT: DPRINTF(("I_FDINSERT\n")); return i_fdinsert(fp, td, retval, fd, cmd, dat); case SVR4_I_SENDFD: DPRINTF(("I_SENDFD\n")); return 0; case SVR4_I_RECVFD: DPRINTF(("I_RECVFD\n")); return 0; case SVR4_I_SWROPT: DPRINTF(("I_SWROPT\n")); return 0; case SVR4_I_GWROPT: DPRINTF(("I_GWROPT\n")); return 0; case SVR4_I_LIST: DPRINTF(("I_LIST\n")); return 0; case SVR4_I_PLINK: DPRINTF(("I_PLINK\n")); return 0; case SVR4_I_PUNLINK: DPRINTF(("I_PUNLINK\n")); return 0; case SVR4_I_SETEV: DPRINTF(("I_SETEV\n")); return 0; case SVR4_I_GETEV: DPRINTF(("I_GETEV\n")); return 0; case SVR4_I_STREV: DPRINTF(("I_STREV\n")); return 0; case SVR4_I_UNSTREV: DPRINTF(("I_UNSTREV\n")); return 0; case 
SVR4_I_FLUSHBAND: DPRINTF(("I_FLUSHBAND\n")); return 0; case SVR4_I_CKBAND: DPRINTF(("I_CKBAND\n")); return 0; case SVR4_I_GETBAND: DPRINTF(("I_GETBANK\n")); return 0; case SVR4_I_ATMARK: DPRINTF(("I_ATMARK\n")); return 0; case SVR4_I_SETCLTIME: DPRINTF(("I_SETCLTIME\n")); return 0; case SVR4_I_GETCLTIME: DPRINTF(("I_GETCLTIME\n")); return 0; case SVR4_I_CANPUT: DPRINTF(("I_CANPUT\n")); return 0; case SVR4__I_BIND_RSVD: DPRINTF(("_I_BIND_RSVD\n")); return _i_bind_rsvd(fp, td, retval, fd, cmd, dat); case SVR4__I_RELE_RSVD: DPRINTF(("_I_RELE_RSVD\n")); return _i_rele_rsvd(fp, td, retval, fd, cmd, dat); default: DPRINTF(("unimpl cmd = %lx\n", cmd)); break; } return 0; } int svr4_sys_putmsg(td, uap) register struct thread *td; struct svr4_sys_putmsg_args *uap; { struct file *fp; int error; if ((error = fget(td, uap->fd, &fp)) != 0) { #ifdef DEBUG_SVR4 uprintf("putmsg: bad fp\n"); #endif return EBADF; } error = svr4_do_putmsg(td, uap, fp); fdrop(fp, td); return (error); } static int svr4_do_putmsg(td, uap, fp) struct thread *td; struct svr4_sys_putmsg_args *uap; struct file *fp; { struct svr4_strbuf dat, ctl; struct svr4_strmcmd sc; struct sockaddr_in sain; struct sockaddr_un saun; struct sockaddr *sa; int sasize, *retval; struct svr4_strm *st; int error; retval = td->td_retval; #ifdef DEBUG_SVR4 show_msg(">putmsg", uap->fd, uap->ctl, uap->dat, uap->flags); #endif /* DEBUG_SVR4 */ - FILE_LOCK_ASSERT(fp, MA_NOTOWNED); - if (uap->ctl != NULL) { if ((error = copyin(uap->ctl, &ctl, sizeof(ctl))) != 0) { #ifdef DEBUG_SVR4 uprintf("putmsg: copyin(): %d\n", error); #endif return error; } } else ctl.len = -1; if (uap->dat != NULL) { if ((error = copyin(uap->dat, &dat, sizeof(dat))) != 0) { #ifdef DEBUG_SVR4 uprintf("putmsg: copyin(): %d (2)\n", error); #endif return error; } } else dat.len = -1; /* * Only for sockets for now. */ if ((st = svr4_stream_get(fp)) == NULL) { DPRINTF(("putmsg: bad file type\n")); return EINVAL; } if (ctl.len < 0 || ctl.len > sizeof(sc)) { DPRINTF(("putmsg: Bad control size %d != %d\n", ctl.len, sizeof(struct svr4_strmcmd))); return EINVAL; } if ((error = copyin(ctl.buf, &sc, ctl.len)) != 0) return error; switch (st->s_family) { case AF_INET: if (sc.len != sizeof(sain)) { if (sc.cmd == SVR4_TI_DATA_REQUEST) { struct write_args wa; /* Solaris seems to use sc.cmd = 3 to * send "expedited" data. telnet uses * this for options processing, sending EOF, * etc. I'm sure other things use it too. * I don't have any documentation * on it, so I'm making a guess that this * is how it works. 
newton@atdot.dotat.org XXX */ DPRINTF(("sending expedited data ??\n")); wa.fd = uap->fd; wa.buf = dat.buf; wa.nbyte = dat.len; return write(td, &wa); } DPRINTF(("putmsg: Invalid inet length %ld\n", sc.len)); return EINVAL; } netaddr_to_sockaddr_in(&sain, &sc); sa = (struct sockaddr *)&sain; sasize = sizeof(sain); if (sain.sin_family != st->s_family) error = EINVAL; break; case AF_LOCAL: if (ctl.len == 8) { /* We are doing an accept; succeed */ DPRINTF(("putmsg: Do nothing\n")); *retval = 0; return 0; } else { /* Maybe we've been given a device/inode pair */ dev_t *dev = SVR4_ADDROF(&sc); ino_t *ino = (ino_t *) &dev[1]; if (svr4_find_socket(td, fp, *dev, *ino, &saun) != 0) { /* I guess we have it by name */ netaddr_to_sockaddr_un(&saun, &sc); } sa = (struct sockaddr *)&saun; sasize = sizeof(saun); } break; default: DPRINTF(("putmsg: Unsupported address family %d\n", st->s_family)); return ENOSYS; } mtx_lock(&Giant); st->s_cmd = sc.cmd; mtx_unlock(&Giant); switch (sc.cmd) { case SVR4_TI_CONNECT_REQUEST: /* connect */ { return (kern_connect(td, uap->fd, sa)); } case SVR4_TI_SENDTO_REQUEST: /* sendto */ { struct msghdr msg; struct iovec aiov; msg.msg_name = sa; msg.msg_namelen = sasize; msg.msg_iov = &aiov; msg.msg_iovlen = 1; msg.msg_control = 0; msg.msg_flags = 0; aiov.iov_base = dat.buf; aiov.iov_len = dat.len; error = kern_sendit(td, uap->fd, &msg, uap->flags, NULL, UIO_USERSPACE); DPRINTF(("sendto_request error: %d\n", error)); *retval = 0; return error; } default: DPRINTF(("putmsg: Unimplemented command %lx\n", sc.cmd)); return ENOSYS; } } int svr4_sys_getmsg(td, uap) struct thread *td; struct svr4_sys_getmsg_args *uap; { struct file *fp; int error; if ((error = fget(td, uap->fd, &fp)) != 0) { #ifdef DEBUG_SVR4 uprintf("getmsg: bad fp\n"); #endif return EBADF; } error = svr4_do_getmsg(td, uap, fp); fdrop(fp, td); return (error); } int svr4_do_getmsg(td, uap, fp) register struct thread *td; struct svr4_sys_getmsg_args *uap; struct file *fp; { struct svr4_strbuf dat, ctl; struct svr4_strmcmd sc; int error, *retval; struct msghdr msg; struct iovec aiov; struct sockaddr_in sain; struct sockaddr_un saun; struct sockaddr *sa; socklen_t sasize; struct svr4_strm *st; struct file *afp; int fl; retval = td->td_retval; error = 0; afp = NULL; - - FILE_LOCK_ASSERT(fp, MA_NOTOWNED); memset(&sc, 0, sizeof(sc)); #ifdef DEBUG_SVR4 show_msg(">getmsg", uap->fd, uap->ctl, uap->dat, 0); #endif /* DEBUG_SVR4 */ if (uap->ctl != NULL) { if ((error = copyin(uap->ctl, &ctl, sizeof(ctl))) != 0) return error; if (ctl.len < 0) return EINVAL; } else { ctl.len = -1; ctl.maxlen = 0; } if (uap->dat != NULL) { if ((error = copyin(uap->dat, &dat, sizeof(dat))) != 0) return error; } else { dat.len = -1; dat.maxlen = 0; } /* * Only for sockets for now. */ if ((st = svr4_stream_get(fp)) == NULL) { DPRINTF(("getmsg: bad file type\n")); return EINVAL; } if (ctl.maxlen == -1 || dat.maxlen == -1) { DPRINTF(("getmsg: Cannot handle -1 maxlen (yet)\n")); return ENOSYS; } switch (st->s_family) { case AF_INET: sasize = sizeof(sain); break; case AF_LOCAL: sasize = sizeof(saun); break; default: DPRINTF(("getmsg: Unsupported address family %d\n", st->s_family)); return ENOSYS; } mtx_lock(&Giant); switch (st->s_cmd) { case SVR4_TI_CONNECT_REQUEST: DPRINTF(("getmsg: TI_CONNECT_REQUEST\n")); /* * We do the connect in one step, so the putmsg should * have gotten the error. 
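		 * All that is left to do here is synthesize the 8 byte
		 * TI_OK_REPLY control message (with no data part) that the
		 * emulated program expects to read back after its connect
		 * request.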
*/ sc.cmd = SVR4_TI_OK_REPLY; sc.len = 0; ctl.len = 8; dat.len = -1; fl = 1; st->s_cmd = sc.cmd; break; case SVR4_TI_OK_REPLY: DPRINTF(("getmsg: TI_OK_REPLY\n")); /* * We are immediately after a connect reply, so we send * a connect verification. */ error = kern_getpeername(td, uap->fd, &sa, &sasize); if (error) { mtx_unlock(&Giant); DPRINTF(("getmsg: getpeername failed %d\n", error)); return error; } sc.cmd = SVR4_TI_CONNECT_REPLY; sc.pad[0] = 0x4; sc.offs = 0x18; sc.pad[1] = 0x14; sc.pad[2] = 0x04000402; switch (st->s_family) { case AF_INET: sc.len = sasize; sockaddr_to_netaddr_in(&sc, (struct sockaddr_in *)sa); break; case AF_LOCAL: sc.len = sasize + 4; sockaddr_to_netaddr_un(&sc, (struct sockaddr_un *)sa); break; default: mtx_unlock(&Giant); free(sa, M_SONAME); return ENOSYS; } free(sa, M_SONAME); ctl.len = 40; dat.len = -1; fl = 0; st->s_cmd = sc.cmd; break; case SVR4_TI__ACCEPT_OK: DPRINTF(("getmsg: TI__ACCEPT_OK\n")); /* * We do the connect in one step, so the putmsg should * have gotten the error. */ sc.cmd = SVR4_TI_OK_REPLY; sc.len = 1; ctl.len = 8; dat.len = -1; fl = 1; st->s_cmd = SVR4_TI__ACCEPT_WAIT; break; case SVR4_TI__ACCEPT_WAIT: DPRINTF(("getmsg: TI__ACCEPT_WAIT\n")); /* * We are after a listen, so we try to accept... */ error = kern_accept(td, uap->fd, &sa, &sasize, &afp); if (error) { mtx_unlock(&Giant); DPRINTF(("getmsg: accept failed %d\n", error)); return error; } st->s_afd = *retval; DPRINTF(("getmsg: Accept fd = %d\n", st->s_afd)); sc.cmd = SVR4_TI_ACCEPT_REPLY; sc.offs = 0x18; sc.pad[0] = 0x0; switch (st->s_family) { case AF_INET: sc.pad[1] = 0x28; sockaddr_to_netaddr_in(&sc, (struct sockaddr_in *)&sa); ctl.len = 40; sc.len = sasize; break; case AF_LOCAL: sc.pad[1] = 0x00010000; sc.pad[2] = 0xf6bcdaa0; /* I don't know what that is */ sc.pad[3] = 0x00010000; ctl.len = 134; sc.len = sasize + 4; break; default: fdclose(td->td_proc->p_fd, afp, st->s_afd, td); fdrop(afp, td); st->s_afd = -1; mtx_unlock(&Giant); free(sa, M_SONAME); return ENOSYS; } free(sa, M_SONAME); dat.len = -1; fl = 0; st->s_cmd = SVR4_TI__ACCEPT_OK; break; case SVR4_TI_SENDTO_REQUEST: DPRINTF(("getmsg: TI_SENDTO_REQUEST\n")); if (ctl.maxlen > 36 && ctl.len < 36) ctl.len = 36; if (ctl.len > sizeof(sc)) ctl.len = sizeof(sc); if ((error = copyin(ctl.buf, &sc, ctl.len)) != 0) { mtx_unlock(&Giant); return error; } switch (st->s_family) { case AF_INET: sa = (struct sockaddr *)&sain; sockaddr_to_netaddr_in(&sc, &sain); break; case AF_LOCAL: sa = (struct sockaddr *)&saun; sockaddr_to_netaddr_un(&sc, &saun); break; default: mtx_unlock(&Giant); return ENOSYS; } msg.msg_name = sa; msg.msg_namelen = sasize; msg.msg_iov = &aiov; msg.msg_iovlen = 1; msg.msg_control = 0; aiov.iov_base = dat.buf; aiov.iov_len = dat.maxlen; msg.msg_flags = 0; error = kern_recvit(td, uap->fd, &msg, UIO_SYSSPACE, NULL); if (error) { mtx_unlock(&Giant); DPRINTF(("getmsg: recvit failed %d\n", error)); return error; } sc.cmd = SVR4_TI_RECVFROM_IND; switch (st->s_family) { case AF_INET: sc.len = sasize; sockaddr_to_netaddr_in(&sc, &sain); break; case AF_LOCAL: sc.len = sasize + 4; sockaddr_to_netaddr_un(&sc, &saun); break; default: mtx_unlock(&Giant); return ENOSYS; } dat.len = *retval; fl = 0; st->s_cmd = sc.cmd; break; default: st->s_cmd = sc.cmd; if (st->s_cmd == SVR4_TI_CONNECT_REQUEST) { struct read_args ra; /* More weirdness: Again, I can't find documentation * to back this up, but when a process does a generic * "getmsg()" call it seems that the command field is * zero and the length of the data area is zero. 
I * think processes expect getmsg() to fill in dat.len * after reading at most dat.maxlen octets from the * stream. Since we're using sockets I can let * read() look after it and frob return values * appropriately (or inappropriately :-) * -- newton@atdot.dotat.org XXX */ ra.fd = uap->fd; ra.buf = dat.buf; ra.nbyte = dat.maxlen; if ((error = read(td, &ra)) != 0) { mtx_unlock(&Giant); return error; } dat.len = *retval; *retval = 0; st->s_cmd = SVR4_TI_SENDTO_REQUEST; break; } mtx_unlock(&Giant); DPRINTF(("getmsg: Unknown state %x\n", st->s_cmd)); return EINVAL; } if (uap->ctl) { if (ctl.len > sizeof(sc)) ctl.len = sizeof(sc); if (ctl.len != -1) error = copyout(&sc, ctl.buf, ctl.len); if (error == 0) error = copyout(&ctl, uap->ctl, sizeof(ctl)); } if (uap->dat) { if (error == 0) error = copyout(&dat, uap->dat, sizeof(dat)); } if (uap->flags) { /* XXX: Need translation */ if (error == 0) error = copyout(&fl, uap->flags, sizeof(fl)); } if (error) { if (afp) { fdclose(td->td_proc->p_fd, afp, st->s_afd, td); fdrop(afp, td); st->s_afd = -1; } mtx_unlock(&Giant); return (error); } mtx_unlock(&Giant); if (afp) fdrop(afp, td); *retval = 0; #ifdef DEBUG_SVR4 show_msg("fd, uap->ctl, uap->dat, fl); #endif /* DEBUG_SVR4 */ return error; } int svr4_sys_send(td, uap) struct thread *td; struct svr4_sys_send_args *uap; { struct osend_args osa; osa.s = uap->s; osa.buf = uap->buf; osa.len = uap->len; osa.flags = uap->flags; return osend(td, &osa); } int svr4_sys_recv(td, uap) struct thread *td; struct svr4_sys_recv_args *uap; { struct orecv_args ora; ora.s = uap->s; ora.buf = uap->buf; ora.len = uap->len; ora.flags = uap->flags; return orecv(td, &ora); } /* * XXX This isn't necessary, but it's handy for inserting debug code into * sendto(). Let's leave it here for now... */ int svr4_sys_sendto(td, uap) struct thread *td; struct svr4_sys_sendto_args *uap; { struct sendto_args sa; sa.s = uap->s; sa.buf = uap->buf; sa.len = uap->len; sa.flags = uap->flags; sa.to = (caddr_t)uap->to; sa.tolen = uap->tolen; DPRINTF(("calling sendto()\n")); return sendto(td, &sa); } Index: head/sys/dev/streams/streams.c =================================================================== --- head/sys/dev/streams/streams.c (revision 174987) +++ head/sys/dev/streams/streams.c (revision 174988) @@ -1,354 +1,349 @@ /*- * Copyright (c) 1998 Mark Newton * Copyright (c) 1994 Christos Zoulas * Copyright (c) 1997 Todd Vierling * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The names of the authors may not be used to endorse or promote products * derived from this software without specific prior written permission * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * Stolen from NetBSD /sys/compat/svr4/svr4_net.c. Pseudo-device driver * skeleton produced from /usr/share/examples/drivers/make_pseudo_driver.sh * in 3.0-980524-SNAP then hacked a bit (but probably not enough :-). * */ #include __FBSDID("$FreeBSD$"); #include #include #include /* SYSINIT stuff */ #include /* cdevsw stuff */ #include /* malloc region definitions */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int svr4_soo_close(struct file *, struct thread *); static int svr4_ptm_alloc(struct thread *); static d_open_t streamsopen; /* * Device minor numbers */ enum { dev_ptm = 10, dev_arp = 26, dev_icmp = 27, dev_ip = 28, dev_tcp = 35, dev_udp = 36, dev_rawip = 37, dev_unix_dgram = 38, dev_unix_stream = 39, dev_unix_ord_stream = 40 }; static struct cdev *dt_ptm, *dt_arp, *dt_icmp, *dt_ip, *dt_tcp, *dt_udp, *dt_rawip, *dt_unix_dgram, *dt_unix_stream, *dt_unix_ord_stream; static struct fileops svr4_netops = { .fo_read = soo_read, .fo_write = soo_write, .fo_ioctl = soo_ioctl, .fo_poll = soo_poll, .fo_kqfilter = soo_kqfilter, .fo_stat = soo_stat, .fo_close = svr4_soo_close }; static struct cdevsw streams_cdevsw = { .d_version = D_VERSION, .d_open = streamsopen, .d_name = "streams", }; struct streams_softc { struct isa_device *dev; } ; #define UNIT(dev) minor(dev) /* assume one minor number per unit */ typedef struct streams_softc *sc_p; static int streams_modevent(module_t mod, int type, void *unused) { switch (type) { case MOD_LOAD: dt_ptm = make_dev(&streams_cdevsw, dev_ptm, 0, 0, 0666, "ptm"); dt_arp = make_dev(&streams_cdevsw, dev_arp, 0, 0, 0666, "arp"); dt_icmp = make_dev(&streams_cdevsw, dev_icmp, 0, 0, 0666, "icmp"); dt_ip = make_dev(&streams_cdevsw, dev_ip, 0, 0, 0666, "ip"); dt_tcp = make_dev(&streams_cdevsw, dev_tcp, 0, 0, 0666, "tcp"); dt_udp = make_dev(&streams_cdevsw, dev_udp, 0, 0, 0666, "udp"); dt_rawip = make_dev(&streams_cdevsw, dev_rawip, 0, 0, 0666, "rawip"); dt_unix_dgram = make_dev(&streams_cdevsw, dev_unix_dgram, 0, 0, 0666, "ticlts"); dt_unix_stream = make_dev(&streams_cdevsw, dev_unix_stream, 0, 0, 0666, "ticots"); dt_unix_ord_stream = make_dev(&streams_cdevsw, dev_unix_ord_stream, 0, 0, 0666, "ticotsord"); if (! 
(dt_ptm && dt_arp && dt_icmp && dt_ip && dt_tcp && dt_udp && dt_rawip && dt_unix_dgram && dt_unix_stream && dt_unix_ord_stream)) { printf("WARNING: device config for STREAMS failed\n"); printf("Suggest unloading streams KLD\n"); } return 0; case MOD_UNLOAD: /* XXX should check to see if it's busy first */ destroy_dev(dt_ptm); destroy_dev(dt_arp); destroy_dev(dt_icmp); destroy_dev(dt_ip); destroy_dev(dt_tcp); destroy_dev(dt_udp); destroy_dev(dt_rawip); destroy_dev(dt_unix_dgram); destroy_dev(dt_unix_stream); destroy_dev(dt_unix_ord_stream); return 0; default: return EOPNOTSUPP; break; } return 0; } static moduledata_t streams_mod = { "streams", streams_modevent, 0 }; DECLARE_MODULE(streams, streams_mod, SI_SUB_DRIVERS, SI_ORDER_ANY); MODULE_VERSION(streams, 1); /* * We only need open() and close() routines. open() calls socreate() * to allocate a "real" object behind the stream and mallocs some state * info for use by the svr4 emulator; close() deallocates the state * information and passes the underlying object to the normal socket close * routine. */ static int streamsopen(struct cdev *dev, int oflags, int devtype, struct thread *td) { struct filedesc *fdp; struct svr4_strm *st; struct socket *so; struct file *fp; int family, type, protocol; int error, fd; if (td->td_dupfd >= 0) return ENODEV; switch (minor(dev)) { case dev_udp: family = AF_INET; type = SOCK_DGRAM; protocol = IPPROTO_UDP; break; case dev_tcp: family = AF_INET; type = SOCK_STREAM; protocol = IPPROTO_TCP; break; case dev_ip: case dev_rawip: family = AF_INET; type = SOCK_RAW; protocol = IPPROTO_IP; break; case dev_icmp: family = AF_INET; type = SOCK_RAW; protocol = IPPROTO_ICMP; break; case dev_unix_dgram: family = AF_LOCAL; type = SOCK_DGRAM; protocol = 0; break; case dev_unix_stream: case dev_unix_ord_stream: family = AF_LOCAL; type = SOCK_STREAM; protocol = 0; break; case dev_ptm: return svr4_ptm_alloc(td); default: return EOPNOTSUPP; } fdp = td->td_proc->p_fd; if ((error = falloc(td, &fp, &fd)) != 0) return error; /* An extra reference on `fp' has been held for us by falloc(). */ error = socreate(family, &so, type, protocol, td->td_ucred, td); if (error) { fdclose(fdp, fp, fd, td); fdrop(fp, td); return error; } - FILE_LOCK(fp); - fp->f_data = so; - fp->f_flag = FREAD|FWRITE; - fp->f_ops = &svr4_netops; - fp->f_type = DTYPE_SOCKET; - FILE_UNLOCK(fp); + finit(fp, FREAD | FWRITE, DTYPE_SOCKET, so, &svr4_netops); /* * Allocate a stream structure and attach it to this socket. * We don't bother locking so_emuldata for SVR4 stream sockets as * its value is constant for the lifetime of the stream once it * is initialized here. */ st = malloc(sizeof(struct svr4_strm), M_TEMP, M_WAITOK); st->s_family = so->so_proto->pr_domain->dom_family; st->s_cmd = ~0; st->s_afd = -1; st->s_eventmask = 0; so->so_emuldata = st; fdrop(fp, td); td->td_dupfd = fd; return ENXIO; } static int svr4_ptm_alloc(td) struct thread *td; { /* * XXX this is very, very ugly. But I can't find a better * way that won't duplicate a big amount of code from * sys_open(). Ho hum... * * Fortunately for us, Solaris (at least 2.5.1) makes the * /dev/ptmx open automatically just open a pty, that (after * STREAMS I_PUSHes), is just a plain pty. fstat() is used * to get the minor device number to map to a tty. * * Cycle through the names. If sys_open() returns ENOENT (or * ENXIO), short circuit the cycle and exit. 
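	 *
	 * So we walk /dev/ptyp0, /dev/ptyp1, ... and so on through the
	 * letter/number tables until kern_open() succeeds; the new
	 * descriptor is then parked in td->td_dupfd and we return ENXIO,
	 * which (as in streamsopen() above) I believe makes the generic
	 * open code hand that already-open pty descriptor back to the
	 * caller.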
*/ static char ptyname[] = "/dev/ptyXX"; static char ttyletters[] = "pqrstuwxyzPQRST"; static char ttynumbers[] = "0123456789abcdef"; struct proc *p; register_t fd; int error, l, n; fd = -1; n = 0; l = 0; p = td->td_proc; while (fd == -1) { ptyname[8] = ttyletters[l]; ptyname[9] = ttynumbers[n]; error = kern_open(td, ptyname, UIO_SYSSPACE, O_RDWR, 0); switch (error) { case ENOENT: case ENXIO: return error; case 0: td->td_dupfd = td->td_retval[0]; return ENXIO; default: if (ttynumbers[++n] == '\0') { if (ttyletters[++l] == '\0') break; n = 0; } } } return ENOENT; } struct svr4_strm * svr4_stream_get(fp) struct file *fp; { struct socket *so; if (fp == NULL || fp->f_type != DTYPE_SOCKET) return NULL; so = fp->f_data; return so->so_emuldata; } static int svr4_soo_close(struct file *fp, struct thread *td) { struct socket *so = fp->f_data; /* CHECKUNIT_DIAG(ENXIO);*/ svr4_delete_socket(td->td_proc, fp); free(so->so_emuldata, M_TEMP); return soo_close(fp, td); } Index: head/sys/fs/devfs/devfs_vnops.c =================================================================== --- head/sys/fs/devfs/devfs_vnops.c (revision 174987) +++ head/sys/fs/devfs/devfs_vnops.c (revision 174988) @@ -1,1404 +1,1401 @@ /*- * Copyright (c) 2000-2004 * Poul-Henning Kamp. All rights reserved. * Copyright (c) 1989, 1992-1993, 1995 * The Regents of the University of California. All rights reserved. * * This code is derived from software donated to Berkeley by * Jan-Simon Pendry. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kernfs_vnops.c 8.15 (Berkeley) 5/21/95 * From: FreeBSD: src/sys/miscfs/kernfs/kernfs_vnops.c 1.43 * * $FreeBSD$ */ /* * TODO: * remove empty directories * mkdir: want it ? 
*/ #include "opt_mac.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static struct vop_vector devfs_vnodeops; static struct vop_vector devfs_specops; static struct fileops devfs_ops_f; #include #include #include struct mtx devfs_de_interlock; MTX_SYSINIT(devfs_de_interlock, &devfs_de_interlock, "devfs interlock", MTX_DEF); struct sx clone_drain_lock; SX_SYSINIT(clone_drain_lock, &clone_drain_lock, "clone events drain lock"); static int devfs_fp_check(struct file *fp, struct cdev **devp, struct cdevsw **dswp) { *dswp = devvn_refthread(fp->f_vnode, devp); if (*devp != fp->f_data) { if (*dswp != NULL) dev_relthread(*devp); return (ENXIO); } KASSERT((*devp)->si_refcount > 0, ("devfs: un-referenced struct cdev *(%s)", devtoname(*devp))); if (*dswp == NULL) return (ENXIO); return (0); } /* * Construct the fully qualified path name relative to the mountpoint */ static char * devfs_fqpn(char *buf, struct vnode *dvp, struct componentname *cnp) { int i; struct devfs_dirent *de, *dd; struct devfs_mount *dmp; dmp = VFSTODEVFS(dvp->v_mount); dd = dvp->v_data; i = SPECNAMELEN; buf[i] = '\0'; i -= cnp->cn_namelen; if (i < 0) return (NULL); bcopy(cnp->cn_nameptr, buf + i, cnp->cn_namelen); de = dd; while (de != dmp->dm_rootdir) { i--; if (i < 0) return (NULL); buf[i] = '/'; i -= de->de_dirent->d_namlen; if (i < 0) return (NULL); bcopy(de->de_dirent->d_name, buf + i, de->de_dirent->d_namlen); de = TAILQ_FIRST(&de->de_dlist); /* "." */ de = TAILQ_NEXT(de, de_list); /* ".." */ de = de->de_dir; } return (buf + i); } static int devfs_allocv_drop_refs(int drop_dm_lock, struct devfs_mount *dmp, struct devfs_dirent *de) { int not_found; not_found = 0; if (de->de_flags & DE_DOOMED) not_found = 1; if (DEVFS_DE_DROP(de)) { KASSERT(not_found == 1, ("DEVFS de dropped but not doomed")); devfs_dirent_free(de); } if (DEVFS_DMP_DROP(dmp)) { KASSERT(not_found == 1, ("DEVFS mount struct freed before dirent")); not_found = 2; sx_xunlock(&dmp->dm_lock); devfs_unmount_final(dmp); } if (not_found == 1 || (drop_dm_lock && not_found != 2)) sx_unlock(&dmp->dm_lock); return (not_found); } static void devfs_insmntque_dtr(struct vnode *vp, void *arg) { struct devfs_dirent *de; de = (struct devfs_dirent *)arg; mtx_lock(&devfs_de_interlock); vp->v_data = NULL; de->de_vnode = NULL; mtx_unlock(&devfs_de_interlock); vgone(vp); vput(vp); } /* * devfs_allocv shall be entered with dmp->dm_lock held, and it drops * it on return. 
*/ int devfs_allocv(struct devfs_dirent *de, struct mount *mp, struct vnode **vpp, struct thread *td) { int error; struct vnode *vp; struct cdev *dev; struct devfs_mount *dmp; KASSERT(td == curthread, ("devfs_allocv: td != curthread")); dmp = VFSTODEVFS(mp); if (de->de_flags & DE_DOOMED) { sx_xunlock(&dmp->dm_lock); return (ENOENT); } loop: DEVFS_DE_HOLD(de); DEVFS_DMP_HOLD(dmp); mtx_lock(&devfs_de_interlock); vp = de->de_vnode; if (vp != NULL) { VI_LOCK(vp); mtx_unlock(&devfs_de_interlock); sx_xunlock(&dmp->dm_lock); error = vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, td); sx_xlock(&dmp->dm_lock); if (devfs_allocv_drop_refs(0, dmp, de)) { if (error == 0) vput(vp); return (ENOENT); } else if (error) goto loop; sx_xunlock(&dmp->dm_lock); *vpp = vp; return (0); } mtx_unlock(&devfs_de_interlock); if (de->de_dirent->d_type == DT_CHR) { if (!(de->de_cdp->cdp_flags & CDP_ACTIVE)) { devfs_allocv_drop_refs(1, dmp, de); return (ENOENT); } dev = &de->de_cdp->cdp_c; } else { dev = NULL; } error = getnewvnode("devfs", mp, &devfs_vnodeops, &vp); if (error != 0) { devfs_allocv_drop_refs(1, dmp, de); printf("devfs_allocv: failed to allocate new vnode\n"); return (error); } if (de->de_dirent->d_type == DT_CHR) { vp->v_type = VCHR; VI_LOCK(vp); dev_lock(); dev_refl(dev); /* XXX: v_rdev should be protect by vnode lock */ vp->v_rdev = dev; KASSERT(vp->v_usecount == 1, ("%s %d (%d)\n", __func__, __LINE__, vp->v_usecount)); dev->si_usecount += vp->v_usecount; dev_unlock(); VI_UNLOCK(vp); vp->v_op = &devfs_specops; } else if (de->de_dirent->d_type == DT_DIR) { vp->v_type = VDIR; } else if (de->de_dirent->d_type == DT_LNK) { vp->v_type = VLNK; } else { vp->v_type = VBAD; } vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); mtx_lock(&devfs_de_interlock); vp->v_data = de; de->de_vnode = vp; mtx_unlock(&devfs_de_interlock); error = insmntque1(vp, mp, devfs_insmntque_dtr, de); if (error != 0) { (void) devfs_allocv_drop_refs(1, dmp, de); return (error); } if (devfs_allocv_drop_refs(0, dmp, de)) { vput(vp); return (ENOENT); } #ifdef MAC mac_devfs_vnode_associate(mp, de, vp); #endif sx_xunlock(&dmp->dm_lock); *vpp = vp; return (0); } static int devfs_access(struct vop_access_args *ap) { struct vnode *vp = ap->a_vp; struct devfs_dirent *de; int error; de = vp->v_data; if (vp->v_type == VDIR) de = de->de_dir; error = vaccess(vp->v_type, de->de_mode, de->de_uid, de->de_gid, ap->a_mode, ap->a_cred, NULL); if (!error) return (error); if (error != EACCES) return (error); /* We do, however, allow access to the controlling terminal */ if (!(ap->a_td->td_proc->p_flag & P_CONTROLT)) return (error); if (ap->a_td->td_proc->p_session->s_ttyvp == de->de_vnode) return (0); return (error); } /* ARGSUSED */ static int devfs_advlock(struct vop_advlock_args *ap) { return (ap->a_flags & F_FLOCK ? EOPNOTSUPP : EINVAL); } /* ARGSUSED */ static int devfs_close(struct vop_close_args *ap) { struct vnode *vp = ap->a_vp, *oldvp; struct thread *td = ap->a_td; struct cdev *dev = vp->v_rdev; struct cdevsw *dsw; int vp_locked, error; /* * Hack: a tty device that is a controlling terminal * has a reference from the session structure. * We cannot easily tell that a character device is * a controlling terminal, unless it is the closing * process' controlling terminal. In that case, * if the reference count is 2 (this last descriptor * plus the session), release the reference from the session. 
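	 * That is what the count_dev(dev) == 2 check below implements:
	 * the session's s_ttyvp pointer is cleared under the session lock
	 * and the vnode reference is dropped with vrele() once the locks
	 * have been released.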
*/ oldvp = NULL; sx_xlock(&proctree_lock); if (td && vp == td->td_proc->p_session->s_ttyvp) { SESS_LOCK(td->td_proc->p_session); VI_LOCK(vp); if (count_dev(dev) == 2 && (vp->v_iflag & VI_DOOMED) == 0) { td->td_proc->p_session->s_ttyvp = NULL; oldvp = vp; } VI_UNLOCK(vp); SESS_UNLOCK(td->td_proc->p_session); } sx_xunlock(&proctree_lock); if (oldvp != NULL) vrele(oldvp); /* * We do not want to really close the device if it * is still in use unless we are trying to close it * forcibly. Since every use (buffer, vnode, swap, cmap) * holds a reference to the vnode, and because we mark * any other vnodes that alias this device, when the * sum of the reference counts on all the aliased * vnodes descends to one, we are on last close. */ dsw = dev_refthread(dev); if (dsw == NULL) return (ENXIO); VI_LOCK(vp); if (vp->v_iflag & VI_DOOMED) { /* Forced close. */ } else if (dsw->d_flags & D_TRACKCLOSE) { /* Keep device updated on status. */ } else if (count_dev(dev) > 1) { VI_UNLOCK(vp); dev_relthread(dev); return (0); } vholdl(vp); VI_UNLOCK(vp); vp_locked = VOP_ISLOCKED(vp, td); VOP_UNLOCK(vp, 0, td); KASSERT(dev->si_refcount > 0, ("devfs_close() on un-referenced struct cdev *(%s)", devtoname(dev))); if (!(dsw->d_flags & D_NEEDGIANT)) { DROP_GIANT(); error = dsw->d_close(dev, ap->a_fflag, S_IFCHR, td); PICKUP_GIANT(); } else { error = dsw->d_close(dev, ap->a_fflag, S_IFCHR, td); } dev_relthread(dev); vn_lock(vp, vp_locked | LK_RETRY, td); vdrop(vp); return (error); } static int devfs_close_f(struct file *fp, struct thread *td) { return (vnops.fo_close(fp, td)); } /* ARGSUSED */ static int devfs_fsync(struct vop_fsync_args *ap) { if (!vn_isdisk(ap->a_vp, NULL)) return (0); return (vop_stdfsync(ap)); } static int devfs_getattr(struct vop_getattr_args *ap) { struct vnode *vp = ap->a_vp; struct vattr *vap = ap->a_vap; int error = 0; struct devfs_dirent *de; struct cdev *dev; de = vp->v_data; KASSERT(de != NULL, ("Null dirent in devfs_getattr vp=%p", vp)); if (vp->v_type == VDIR) { de = de->de_dir; KASSERT(de != NULL, ("Null dir dirent in devfs_getattr vp=%p", vp)); } bzero((caddr_t) vap, sizeof(*vap)); vattr_null(vap); vap->va_uid = de->de_uid; vap->va_gid = de->de_gid; vap->va_mode = de->de_mode; if (vp->v_type == VLNK) vap->va_size = strlen(de->de_symlink); else if (vp->v_type == VDIR) vap->va_size = vap->va_bytes = DEV_BSIZE; else vap->va_size = 0; if (vp->v_type != VDIR) vap->va_bytes = 0; vap->va_blocksize = DEV_BSIZE; vap->va_type = vp->v_type; #define fix(aa) \ do { \ if ((aa).tv_sec <= 3600) { \ (aa).tv_sec = boottime.tv_sec; \ (aa).tv_nsec = boottime.tv_usec * 1000; \ } \ } while (0) if (vp->v_type != VCHR) { fix(de->de_atime); vap->va_atime = de->de_atime; fix(de->de_mtime); vap->va_mtime = de->de_mtime; fix(de->de_ctime); vap->va_ctime = de->de_ctime; } else { dev = vp->v_rdev; fix(dev->si_atime); vap->va_atime = dev->si_atime; fix(dev->si_mtime); vap->va_mtime = dev->si_mtime; fix(dev->si_ctime); vap->va_ctime = dev->si_ctime; vap->va_rdev = dev->si_priv->cdp_inode; } vap->va_gen = 0; vap->va_flags = 0; vap->va_nlink = de->de_links; vap->va_fileid = de->de_inode; return (error); } /* ARGSUSED */ static int devfs_ioctl_f(struct file *fp, u_long com, void *data, struct ucred *cred, struct thread *td) { struct cdev *dev; struct cdevsw *dsw; struct vnode *vp; struct vnode *vpold; int error, i; const char *p; struct fiodgname_arg *fgn; error = devfs_fp_check(fp, &dev, &dsw); if (error) return (error); if (com == FIODTYPE) { *(int *)data = dsw->d_flags & D_TYPEMASK; dev_relthread(dev); return (0); } 
else if (com == FIODGNAME) { fgn = data; p = devtoname(dev); i = strlen(p) + 1; if (i > fgn->len) error = EINVAL; else error = copyout(p, fgn->buf, i); dev_relthread(dev); return (error); } error = dsw->d_ioctl(dev, com, data, fp->f_flag, td); dev_relthread(dev); if (error == ENOIOCTL) error = ENOTTY; if (error == 0 && com == TIOCSCTTY) { vp = fp->f_vnode; /* Do nothing if reassigning same control tty */ sx_slock(&proctree_lock); if (td->td_proc->p_session->s_ttyvp == vp) { sx_sunlock(&proctree_lock); return (0); } mtx_lock(&Giant); /* XXX TTY */ vpold = td->td_proc->p_session->s_ttyvp; VREF(vp); SESS_LOCK(td->td_proc->p_session); td->td_proc->p_session->s_ttyvp = vp; SESS_UNLOCK(td->td_proc->p_session); sx_sunlock(&proctree_lock); /* Get rid of reference to old control tty */ if (vpold) vrele(vpold); mtx_unlock(&Giant); /* XXX TTY */ } return (error); } /* ARGSUSED */ static int devfs_kqfilter_f(struct file *fp, struct knote *kn) { struct cdev *dev; struct cdevsw *dsw; int error; error = devfs_fp_check(fp, &dev, &dsw); if (error) return (error); error = dsw->d_kqfilter(dev, kn); dev_relthread(dev); return (error); } static int devfs_lookupx(struct vop_lookup_args *ap, int *dm_unlock) { struct componentname *cnp; struct vnode *dvp, **vpp; struct thread *td; struct devfs_dirent *de, *dd; struct devfs_dirent **dde; struct devfs_mount *dmp; struct cdev *cdev; int error, flags, nameiop; char specname[SPECNAMELEN + 1], *pname; cnp = ap->a_cnp; vpp = ap->a_vpp; dvp = ap->a_dvp; pname = cnp->cn_nameptr; td = cnp->cn_thread; flags = cnp->cn_flags; nameiop = cnp->cn_nameiop; dmp = VFSTODEVFS(dvp->v_mount); dd = dvp->v_data; *vpp = NULLVP; if ((flags & ISLASTCN) && nameiop == RENAME) return (EOPNOTSUPP); if (dvp->v_type != VDIR) return (ENOTDIR); if ((flags & ISDOTDOT) && (dvp->v_vflag & VV_ROOT)) return (EIO); error = VOP_ACCESS(dvp, VEXEC, cnp->cn_cred, td); if (error) return (error); if (cnp->cn_namelen == 1 && *pname == '.') { if ((flags & ISLASTCN) && nameiop != LOOKUP) return (EINVAL); *vpp = dvp; VREF(dvp); return (0); } if (flags & ISDOTDOT) { if ((flags & ISLASTCN) && nameiop != LOOKUP) return (EINVAL); VOP_UNLOCK(dvp, 0, td); de = TAILQ_FIRST(&dd->de_dlist); /* "." */ de = TAILQ_NEXT(de, de_list); /* ".." */ de = de->de_dir; error = devfs_allocv(de, dvp->v_mount, vpp, td); *dm_unlock = 0; vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY, td); return (error); } DEVFS_DMP_HOLD(dmp); devfs_populate(dmp); if (DEVFS_DMP_DROP(dmp)) { *dm_unlock = 0; sx_xunlock(&dmp->dm_lock); devfs_unmount_final(dmp); return (ENOENT); } dd = dvp->v_data; de = devfs_find(dd, cnp->cn_nameptr, cnp->cn_namelen); while (de == NULL) { /* While(...) so we can use break */ if (nameiop == DELETE) return (ENOENT); /* * OK, we didn't have an entry for the name we were asked for * so we try to see if anybody can create it on demand. 
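/*
 * When devfs_lookupx() finds no entry it fires the dev_clone event handler
 * (invoked just below) so a driver can create the device on demand.  A
 * minimal sketch of such a handler follows; the "mydev" names, the SYSINIT
 * hook and the priority value are hypothetical, only the handler signature
 * and the dev_stdclone()/make_dev() calls follow the existing KPI.
 */
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/conf.h>
#include <sys/eventhandler.h>

static struct cdevsw mydev_cdevsw = {
	.d_version =	D_VERSION,
	.d_name =	"mydev",
};

static void
mydev_clone(void *arg, struct ucred *cred, char *name, int namelen,
    struct cdev **dev)
{
	int unit;

	if (*dev != NULL)
		return;			/* another handler already matched */
	if (dev_stdclone(name, NULL, "mydev", &unit) != 1)
		return;			/* not a name this driver owns */
	*dev = make_dev(&mydev_cdevsw, unit, UID_ROOT, GID_WHEEL, 0600,
	    "mydev%d", unit);
}

static void
mydev_clone_init(void *arg __unused)
{

	EVENTHANDLER_REGISTER(dev_clone, mydev_clone, 0, 1000);
}
SYSINIT(mydev_clone, SI_SUB_DRIVERS, SI_ORDER_MIDDLE, mydev_clone_init, NULL);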
*/ pname = devfs_fqpn(specname, dvp, cnp); if (pname == NULL) break; cdev = NULL; DEVFS_DMP_HOLD(dmp); sx_xunlock(&dmp->dm_lock); sx_slock(&clone_drain_lock); EVENTHANDLER_INVOKE(dev_clone, td->td_ucred, pname, strlen(pname), &cdev); sx_sunlock(&clone_drain_lock); sx_xlock(&dmp->dm_lock); if (DEVFS_DMP_DROP(dmp)) { *dm_unlock = 0; sx_xunlock(&dmp->dm_lock); devfs_unmount_final(dmp); return (ENOENT); } if (cdev == NULL) break; DEVFS_DMP_HOLD(dmp); devfs_populate(dmp); if (DEVFS_DMP_DROP(dmp)) { *dm_unlock = 0; sx_xunlock(&dmp->dm_lock); devfs_unmount_final(dmp); return (ENOENT); } dev_lock(); dde = &cdev->si_priv->cdp_dirents[dmp->dm_idx]; if (dde != NULL && *dde != NULL) de = *dde; dev_unlock(); dev_rel(cdev); break; } if (de == NULL || de->de_flags & DE_WHITEOUT) { if ((nameiop == CREATE || nameiop == RENAME) && (flags & (LOCKPARENT | WANTPARENT)) && (flags & ISLASTCN)) { cnp->cn_flags |= SAVENAME; return (EJUSTRETURN); } return (ENOENT); } if ((cnp->cn_nameiop == DELETE) && (flags & ISLASTCN)) { error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred, td); if (error) return (error); if (*vpp == dvp) { VREF(dvp); *vpp = dvp; return (0); } } error = devfs_allocv(de, dvp->v_mount, vpp, td); *dm_unlock = 0; return (error); } static int devfs_lookup(struct vop_lookup_args *ap) { int j; struct devfs_mount *dmp; int dm_unlock; dmp = VFSTODEVFS(ap->a_dvp->v_mount); dm_unlock = 1; sx_xlock(&dmp->dm_lock); j = devfs_lookupx(ap, &dm_unlock); if (dm_unlock == 1) sx_xunlock(&dmp->dm_lock); return (j); } static int devfs_mknod(struct vop_mknod_args *ap) { struct componentname *cnp; struct vnode *dvp, **vpp; struct thread *td; struct devfs_dirent *dd, *de; struct devfs_mount *dmp; int error; /* * The only type of node we should be creating here is a * character device, for anything else return EOPNOTSUPP. */ if (ap->a_vap->va_type != VCHR) return (EOPNOTSUPP); dvp = ap->a_dvp; dmp = VFSTODEVFS(dvp->v_mount); cnp = ap->a_cnp; vpp = ap->a_vpp; td = cnp->cn_thread; dd = dvp->v_data; error = ENOENT; sx_xlock(&dmp->dm_lock); TAILQ_FOREACH(de, &dd->de_dlist, de_list) { if (cnp->cn_namelen != de->de_dirent->d_namlen) continue; if (bcmp(cnp->cn_nameptr, de->de_dirent->d_name, de->de_dirent->d_namlen) != 0) continue; if (de->de_flags & DE_WHITEOUT) break; goto notfound; } if (de == NULL) goto notfound; de->de_flags &= ~DE_WHITEOUT; error = devfs_allocv(de, dvp->v_mount, vpp, td); return (error); notfound: sx_xunlock(&dmp->dm_lock); return (error); } /* ARGSUSED */ static int devfs_open(struct vop_open_args *ap) { struct thread *td = ap->a_td; struct vnode *vp = ap->a_vp; struct cdev *dev = vp->v_rdev; struct file *fp = ap->a_fp; int error; struct cdevsw *dsw; if (vp->v_type == VBLK) return (ENXIO); if (dev == NULL) return (ENXIO); /* Make this field valid before any I/O in d_open. */ if (dev->si_iosize_max == 0) dev->si_iosize_max = DFLTPHYS; dsw = dev_refthread(dev); if (dsw == NULL) return (ENXIO); /* XXX: Special casing of ttys for deadfs. Probably redundant. 
*/ if (dsw->d_flags & D_TTY) vp->v_vflag |= VV_ISTTY; VOP_UNLOCK(vp, 0, td); if(!(dsw->d_flags & D_NEEDGIANT)) { DROP_GIANT(); if (dsw->d_fdopen != NULL) error = dsw->d_fdopen(dev, ap->a_mode, td, fp); else error = dsw->d_open(dev, ap->a_mode, S_IFCHR, td); PICKUP_GIANT(); } else { if (dsw->d_fdopen != NULL) error = dsw->d_fdopen(dev, ap->a_mode, td, fp); else error = dsw->d_open(dev, ap->a_mode, S_IFCHR, td); } vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); dev_relthread(dev); if (error) return (error); #if 0 /* /dev/console */ KASSERT(fp != NULL, ("Could not vnode bypass device on NULL fp")); #else if(fp == NULL) return (error); #endif - FILE_LOCK(fp); KASSERT(fp->f_ops == &badfileops, ("Could not vnode bypass device on fdops %p", fp->f_ops)); - fp->f_data = dev; - fp->f_ops = &devfs_ops_f; - FILE_UNLOCK(fp); + finit(fp, fp->f_flag, DTYPE_VNODE, dev, &devfs_ops_f); return (error); } static int devfs_pathconf(struct vop_pathconf_args *ap) { switch (ap->a_name) { case _PC_MAC_PRESENT: #ifdef MAC /* * If MAC is enabled, devfs automatically supports * trivial non-persistant label storage. */ *ap->a_retval = 1; #else *ap->a_retval = 0; #endif return (0); default: return (vop_stdpathconf(ap)); } /* NOTREACHED */ } /* ARGSUSED */ static int devfs_poll_f(struct file *fp, int events, struct ucred *cred, struct thread *td) { struct cdev *dev; struct cdevsw *dsw; int error; error = devfs_fp_check(fp, &dev, &dsw); if (error) return (error); error = dsw->d_poll(dev, events, td); dev_relthread(dev); return(error); } /* * Print out the contents of a special device vnode. */ static int devfs_print(struct vop_print_args *ap) { printf("\tdev %s\n", devtoname(ap->a_vp->v_rdev)); return (0); } /* ARGSUSED */ static int devfs_read_f(struct file *fp, struct uio *uio, struct ucred *cred, int flags, struct thread *td) { struct cdev *dev; int ioflag, error, resid; struct cdevsw *dsw; error = devfs_fp_check(fp, &dev, &dsw); if (error) return (error); resid = uio->uio_resid; ioflag = fp->f_flag & (O_NONBLOCK | O_DIRECT); if (ioflag & O_DIRECT) ioflag |= IO_DIRECT; if ((flags & FOF_OFFSET) == 0) uio->uio_offset = fp->f_offset; error = dsw->d_read(dev, uio, ioflag); if (uio->uio_resid != resid || (error == 0 && resid != 0)) vfs_timestamp(&dev->si_atime); dev_relthread(dev); if ((flags & FOF_OFFSET) == 0) fp->f_offset = uio->uio_offset; fp->f_nextoff = uio->uio_offset; return (error); } static int devfs_readdir(struct vop_readdir_args *ap) { int error; struct uio *uio; struct dirent *dp; struct devfs_dirent *dd; struct devfs_dirent *de; struct devfs_mount *dmp; off_t off, oldoff; int *tmp_ncookies = NULL; if (ap->a_vp->v_type != VDIR) return (ENOTDIR); uio = ap->a_uio; if (uio->uio_offset < 0) return (EINVAL); /* * XXX: This is a temporary hack to get around this filesystem not * supporting cookies. We store the location of the ncookies pointer * in a temporary variable before calling vfs_subr.c:vfs_read_dirent() * and set the number of cookies to 0. We then set the pointer to * NULL so that vfs_read_dirent doesn't try to call realloc() on * ap->a_cookies. Later in this function, we restore the ap->a_ncookies * pointer to its original location before returning to the caller. 
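/*
 * devfs_open() above now hands the file-structure setup that used to be
 * open-coded under FILE_LOCK() to the new finit() helper.  The sketch
 * below only illustrates the intent of that helper as inferred from its
 * callers (finit(fp, fp->f_flag, DTYPE_VNODE, dev, &devfs_ops_f) here and
 * finit(fp, fp->f_flag, DTYPE_FIFO, fip, &fifo_ops_f) in fifo_open());
 * the real implementation lives in kern_descrip.c and may differ in detail.
 */
static void
finit_sketch(struct file *fp, u_int flag, short type, void *data,
    struct fileops *ops)
{

	fp->f_data = data;
	fp->f_flag = flag;
	fp->f_type = type;
	fp->f_ops = ops;	/* set last: a file still on &badfileops is inert */
}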
*/ if (ap->a_ncookies != NULL) { tmp_ncookies = ap->a_ncookies; *ap->a_ncookies = 0; ap->a_ncookies = NULL; } dmp = VFSTODEVFS(ap->a_vp->v_mount); sx_xlock(&dmp->dm_lock); DEVFS_DMP_HOLD(dmp); devfs_populate(dmp); if (DEVFS_DMP_DROP(dmp)) { sx_xunlock(&dmp->dm_lock); devfs_unmount_final(dmp); if (tmp_ncookies != NULL) ap->a_ncookies = tmp_ncookies; return (EIO); } error = 0; de = ap->a_vp->v_data; off = 0; oldoff = uio->uio_offset; TAILQ_FOREACH(dd, &de->de_dlist, de_list) { KASSERT(dd->de_cdp != (void *)0xdeadc0de, ("%s %d\n", __func__, __LINE__)); if (dd->de_flags & DE_WHITEOUT) continue; if (dd->de_dirent->d_type == DT_DIR) de = dd->de_dir; else de = dd; dp = dd->de_dirent; if (dp->d_reclen > uio->uio_resid) break; dp->d_fileno = de->de_inode; if (off >= uio->uio_offset) { error = vfs_read_dirent(ap, dp, off); if (error) break; } off += dp->d_reclen; } sx_xunlock(&dmp->dm_lock); uio->uio_offset = off; /* * Restore ap->a_ncookies if it wasn't originally NULL in the first * place. */ if (tmp_ncookies != NULL) ap->a_ncookies = tmp_ncookies; return (error); } static int devfs_readlink(struct vop_readlink_args *ap) { struct devfs_dirent *de; de = ap->a_vp->v_data; return (uiomove(de->de_symlink, strlen(de->de_symlink), ap->a_uio)); } static int devfs_reclaim(struct vop_reclaim_args *ap) { struct vnode *vp = ap->a_vp; struct devfs_dirent *de; struct cdev *dev; mtx_lock(&devfs_de_interlock); de = vp->v_data; if (de != NULL) { de->de_vnode = NULL; vp->v_data = NULL; } mtx_unlock(&devfs_de_interlock); vnode_destroy_vobject(vp); dev_lock(); dev = vp->v_rdev; vp->v_rdev = NULL; if (dev == NULL) { dev_unlock(); return (0); } dev->si_usecount -= vp->v_usecount; dev_unlock(); dev_rel(dev); return (0); } static int devfs_remove(struct vop_remove_args *ap) { struct vnode *vp = ap->a_vp; struct devfs_dirent *dd; struct devfs_dirent *de; struct devfs_mount *dmp = VFSTODEVFS(vp->v_mount); sx_xlock(&dmp->dm_lock); dd = ap->a_dvp->v_data; de = vp->v_data; if (de->de_cdp == NULL) { TAILQ_REMOVE(&dd->de_dlist, de, de_list); devfs_delete(dmp, de, 1); } else { de->de_flags |= DE_WHITEOUT; } sx_xunlock(&dmp->dm_lock); return (0); } /* * Revoke is called on a tty when a terminal session ends. The vnode * is orphaned by setting v_op to deadfs so we need to let go of it * as well so that we create a new one next time around. 
* */ static int devfs_revoke(struct vop_revoke_args *ap) { struct vnode *vp = ap->a_vp, *vp2; struct cdev *dev; struct cdev_priv *cdp; struct devfs_dirent *de; int i; KASSERT((ap->a_flags & REVOKEALL) != 0, ("devfs_revoke !REVOKEALL")); dev = vp->v_rdev; cdp = dev->si_priv; dev_lock(); cdp->cdp_inuse++; dev_unlock(); vhold(vp); vgone(vp); vdrop(vp); VOP_UNLOCK(vp,0,curthread); loop: for (;;) { mtx_lock(&devfs_de_interlock); dev_lock(); vp2 = NULL; for (i = 0; i <= cdp->cdp_maxdirent; i++) { de = cdp->cdp_dirents[i]; if (de == NULL) continue; vp2 = de->de_vnode; if (vp2 != NULL) { dev_unlock(); VI_LOCK(vp2); mtx_unlock(&devfs_de_interlock); if (vget(vp2, LK_EXCLUSIVE | LK_INTERLOCK, curthread)) goto loop; vhold(vp2); vgone(vp2); vdrop(vp2); vput(vp2); break; } } if (vp2 != NULL) { continue; } dev_unlock(); mtx_unlock(&devfs_de_interlock); break; } dev_lock(); cdp->cdp_inuse--; if (!(cdp->cdp_flags & CDP_ACTIVE) && cdp->cdp_inuse == 0) { TAILQ_REMOVE(&cdevp_list, cdp, cdp_list); dev_unlock(); dev_rel(&cdp->cdp_c); } else dev_unlock(); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, curthread); return (0); } static int devfs_rioctl(struct vop_ioctl_args *ap) { int error; struct devfs_mount *dmp; dmp = VFSTODEVFS(ap->a_vp->v_mount); sx_xlock(&dmp->dm_lock); DEVFS_DMP_HOLD(dmp); devfs_populate(dmp); if (DEVFS_DMP_DROP(dmp)) { sx_xunlock(&dmp->dm_lock); devfs_unmount_final(dmp); return (ENOENT); } error = devfs_rules_ioctl(dmp, ap->a_command, ap->a_data, ap->a_td); sx_xunlock(&dmp->dm_lock); return (error); } static int devfs_rread(struct vop_read_args *ap) { if (ap->a_vp->v_type != VDIR) return (EINVAL); return (VOP_READDIR(ap->a_vp, ap->a_uio, ap->a_cred, NULL, NULL, NULL)); } static int devfs_setattr(struct vop_setattr_args *ap) { struct devfs_dirent *de; struct vattr *vap; struct vnode *vp; int c, error; uid_t uid; gid_t gid; vap = ap->a_vap; vp = ap->a_vp; if ((vap->va_type != VNON) || (vap->va_nlink != VNOVAL) || (vap->va_fsid != VNOVAL) || (vap->va_fileid != VNOVAL) || (vap->va_blocksize != VNOVAL) || (vap->va_flags != VNOVAL && vap->va_flags != 0) || (vap->va_rdev != VNOVAL) || ((int)vap->va_bytes != VNOVAL) || (vap->va_gen != VNOVAL)) { return (EINVAL); } de = vp->v_data; if (vp->v_type == VDIR) de = de->de_dir; error = c = 0; if (vap->va_uid == (uid_t)VNOVAL) uid = de->de_uid; else uid = vap->va_uid; if (vap->va_gid == (gid_t)VNOVAL) gid = de->de_gid; else gid = vap->va_gid; if (uid != de->de_uid || gid != de->de_gid) { if ((ap->a_cred->cr_uid != de->de_uid) || uid != de->de_uid || (gid != de->de_gid && !groupmember(gid, ap->a_cred))) { error = priv_check(ap->a_td, PRIV_VFS_CHOWN); if (error) return (error); } de->de_uid = uid; de->de_gid = gid; c = 1; } if (vap->va_mode != (mode_t)VNOVAL) { if (ap->a_cred->cr_uid != de->de_uid) { error = priv_check(ap->a_td, PRIV_VFS_ADMIN); if (error) return (error); } de->de_mode = vap->va_mode; c = 1; } if (vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL) { /* See the comment in ufs_vnops::ufs_setattr(). 
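/*
 * Userland view of the timestamp check referenced above (illustrative
 * only): setting the times to "now" needs no more than write access,
 * while explicit timestamps must pass the ownership (VADMIN) test.
 */
#include <sys/time.h>

static int
touch_descriptor(int fd, const struct timeval explicit_tv[2])
{

	if (explicit_tv == NULL)
		return (futimes(fd, NULL));	/* VA_UTIMES_NULL: VWRITE suffices */
	return (futimes(fd, explicit_tv));	/* must own the file */
}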
*/ if ((error = VOP_ACCESS(vp, VADMIN, ap->a_cred, ap->a_td)) && ((vap->va_vaflags & VA_UTIMES_NULL) == 0 || (error = VOP_ACCESS(vp, VWRITE, ap->a_cred, ap->a_td)))) return (error); if (vap->va_atime.tv_sec != VNOVAL) { if (vp->v_type == VCHR) vp->v_rdev->si_atime = vap->va_atime; else de->de_atime = vap->va_atime; } if (vap->va_mtime.tv_sec != VNOVAL) { if (vp->v_type == VCHR) vp->v_rdev->si_mtime = vap->va_mtime; else de->de_mtime = vap->va_mtime; } c = 1; } if (c) { if (vp->v_type == VCHR) vfs_timestamp(&vp->v_rdev->si_ctime); else vfs_timestamp(&de->de_mtime); } return (0); } #ifdef MAC static int devfs_setlabel(struct vop_setlabel_args *ap) { struct vnode *vp; struct devfs_dirent *de; vp = ap->a_vp; de = vp->v_data; mac_vnode_relabel(ap->a_cred, vp, ap->a_label); mac_devfs_update(vp->v_mount, de, vp); return (0); } #endif static int devfs_stat_f(struct file *fp, struct stat *sb, struct ucred *cred, struct thread *td) { return (vnops.fo_stat(fp, sb, cred, td)); } static int devfs_symlink(struct vop_symlink_args *ap) { int i, error; struct devfs_dirent *dd; struct devfs_dirent *de; struct devfs_mount *dmp; struct thread *td; td = ap->a_cnp->cn_thread; KASSERT(td == curthread, ("devfs_symlink: td != curthread")); error = priv_check(td, PRIV_DEVFS_SYMLINK); if (error) return(error); dmp = VFSTODEVFS(ap->a_dvp->v_mount); dd = ap->a_dvp->v_data; de = devfs_newdirent(ap->a_cnp->cn_nameptr, ap->a_cnp->cn_namelen); de->de_uid = 0; de->de_gid = 0; de->de_mode = 0755; de->de_inode = alloc_unr(devfs_inos); de->de_dirent->d_type = DT_LNK; i = strlen(ap->a_target) + 1; de->de_symlink = malloc(i, M_DEVFS, M_WAITOK); bcopy(ap->a_target, de->de_symlink, i); sx_xlock(&dmp->dm_lock); #ifdef MAC mac_devfs_create_symlink(ap->a_cnp->cn_cred, dmp->dm_mount, dd, de); #endif TAILQ_INSERT_TAIL(&dd->de_dlist, de, de_list); return (devfs_allocv(de, ap->a_dvp->v_mount, ap->a_vpp, td)); } /* ARGSUSED */ static int devfs_write_f(struct file *fp, struct uio *uio, struct ucred *cred, int flags, struct thread *td) { struct cdev *dev; int error, ioflag, resid; struct cdevsw *dsw; error = devfs_fp_check(fp, &dev, &dsw); if (error) return (error); KASSERT(uio->uio_td == td, ("uio_td %p is not td %p", uio->uio_td, td)); ioflag = fp->f_flag & (O_NONBLOCK | O_DIRECT | O_FSYNC); if (ioflag & O_DIRECT) ioflag |= IO_DIRECT; if ((flags & FOF_OFFSET) == 0) uio->uio_offset = fp->f_offset; resid = uio->uio_resid; error = dsw->d_write(dev, uio, ioflag); if (uio->uio_resid != resid || (error == 0 && resid != 0)) { vfs_timestamp(&dev->si_ctime); dev->si_mtime = dev->si_ctime; } dev_relthread(dev); if ((flags & FOF_OFFSET) == 0) fp->f_offset = uio->uio_offset; fp->f_nextoff = uio->uio_offset; return (error); } dev_t dev2udev(struct cdev *x) { if (x == NULL) return (NODEV); return (x->si_priv->cdp_inode); } static struct fileops devfs_ops_f = { .fo_read = devfs_read_f, .fo_write = devfs_write_f, .fo_ioctl = devfs_ioctl_f, .fo_poll = devfs_poll_f, .fo_kqfilter = devfs_kqfilter_f, .fo_stat = devfs_stat_f, .fo_close = devfs_close_f, .fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE }; static struct vop_vector devfs_vnodeops = { .vop_default = &default_vnodeops, .vop_access = devfs_access, .vop_getattr = devfs_getattr, .vop_ioctl = devfs_rioctl, .vop_lookup = devfs_lookup, .vop_mknod = devfs_mknod, .vop_pathconf = devfs_pathconf, .vop_read = devfs_rread, .vop_readdir = devfs_readdir, .vop_readlink = devfs_readlink, .vop_reclaim = devfs_reclaim, .vop_remove = devfs_remove, .vop_revoke = devfs_revoke, .vop_setattr = devfs_setattr, #ifdef MAC 
.vop_setlabel = devfs_setlabel, #endif .vop_symlink = devfs_symlink, }; static struct vop_vector devfs_specops = { .vop_default = &default_vnodeops, .vop_access = devfs_access, .vop_advlock = devfs_advlock, .vop_bmap = VOP_PANIC, .vop_close = devfs_close, .vop_create = VOP_PANIC, .vop_fsync = devfs_fsync, .vop_getattr = devfs_getattr, .vop_lease = VOP_NULL, .vop_link = VOP_PANIC, .vop_mkdir = VOP_PANIC, .vop_mknod = VOP_PANIC, .vop_open = devfs_open, .vop_pathconf = devfs_pathconf, .vop_print = devfs_print, .vop_read = VOP_PANIC, .vop_readdir = VOP_PANIC, .vop_readlink = VOP_PANIC, .vop_reallocblks = VOP_PANIC, .vop_reclaim = devfs_reclaim, .vop_remove = devfs_remove, .vop_rename = VOP_PANIC, .vop_revoke = devfs_revoke, .vop_rmdir = VOP_PANIC, .vop_setattr = devfs_setattr, #ifdef MAC .vop_setlabel = devfs_setlabel, #endif .vop_strategy = VOP_PANIC, .vop_symlink = VOP_PANIC, .vop_write = VOP_PANIC, }; /* * Our calling convention to the device drivers used to be that we passed * vnode.h IO_* flags to read()/write(), but we're moving to fcntl.h O_ * flags instead since that's what open(), close() and ioctl() takes and * we don't really want vnode.h in device drivers. * We solved the source compatibility by redefining some vnode flags to * be the same as the fcntl ones and by sending down the bitwise OR of * the respective fcntl/vnode flags. These CTASSERTS make sure nobody * pulls the rug out under this. */ CTASSERT(O_NONBLOCK == IO_NDELAY); CTASSERT(O_FSYNC == IO_SYNC); Index: head/sys/fs/fifofs/fifo_vnops.c =================================================================== --- head/sys/fs/fifofs/fifo_vnops.c (revision 174987) +++ head/sys/fs/fifofs/fifo_vnops.c (revision 174988) @@ -1,742 +1,739 @@ /*- * Copyright (c) 1990, 1993, 1995 * The Regents of the University of California. * Copyright (c) 2005 Robert N. M. Watson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)fifo_vnops.c 8.10 (Berkeley) 5/27/95 * $FreeBSD$ */ #include #include #include #include #include #include #include #include #include #include #include #include /* XXXKSE */ #include #include #include #include #include #include #include #include #include static fo_rdwr_t fifo_read_f; static fo_rdwr_t fifo_write_f; static fo_ioctl_t fifo_ioctl_f; static fo_poll_t fifo_poll_f; static fo_kqfilter_t fifo_kqfilter_f; static fo_stat_t fifo_stat_f; static fo_close_t fifo_close_f; struct fileops fifo_ops_f = { .fo_read = fifo_read_f, .fo_write = fifo_write_f, .fo_ioctl = fifo_ioctl_f, .fo_poll = fifo_poll_f, .fo_kqfilter = fifo_kqfilter_f, .fo_stat = fifo_stat_f, .fo_close = fifo_close_f, .fo_flags = DFLAG_PASSABLE }; /* * This structure is associated with the FIFO vnode and stores * the state associated with the FIFO. */ struct fifoinfo { struct socket *fi_readsock; struct socket *fi_writesock; long fi_readers; long fi_writers; }; static vop_print_t fifo_print; static vop_open_t fifo_open; static vop_close_t fifo_close; static vop_ioctl_t fifo_ioctl; static vop_kqfilter_t fifo_kqfilter; static vop_pathconf_t fifo_pathconf; static vop_advlock_t fifo_advlock; static void filt_fifordetach(struct knote *kn); static int filt_fiforead(struct knote *kn, long hint); static void filt_fifowdetach(struct knote *kn); static int filt_fifowrite(struct knote *kn, long hint); static void filt_fifodetach_notsup(struct knote *kn); static int filt_fifo_notsup(struct knote *kn, long hint); static struct filterops fiforead_filtops = { 1, NULL, filt_fifordetach, filt_fiforead }; static struct filterops fifowrite_filtops = { 1, NULL, filt_fifowdetach, filt_fifowrite }; static struct filterops fifo_notsup_filtops = { 1, NULL, filt_fifodetach_notsup, filt_fifo_notsup }; struct vop_vector fifo_specops = { .vop_default = &default_vnodeops, .vop_access = VOP_EBADF, .vop_advlock = fifo_advlock, .vop_close = fifo_close, .vop_create = VOP_PANIC, .vop_getattr = VOP_EBADF, .vop_ioctl = fifo_ioctl, .vop_kqfilter = fifo_kqfilter, .vop_lease = VOP_NULL, .vop_link = VOP_PANIC, .vop_mkdir = VOP_PANIC, .vop_mknod = VOP_PANIC, .vop_open = fifo_open, .vop_pathconf = fifo_pathconf, .vop_print = fifo_print, .vop_read = VOP_PANIC, .vop_readdir = VOP_PANIC, .vop_readlink = VOP_PANIC, .vop_reallocblks = VOP_PANIC, .vop_reclaim = VOP_NULL, .vop_remove = VOP_PANIC, .vop_rename = VOP_PANIC, .vop_rmdir = VOP_PANIC, .vop_setattr = VOP_EBADF, .vop_symlink = VOP_PANIC, .vop_write = VOP_PANIC, }; struct mtx fifo_mtx; MTX_SYSINIT(fifo, &fifo_mtx, "fifo mutex", MTX_DEF); /* * Dispose of fifo resources. */ static void fifo_cleanup(struct vnode *vp) { struct fifoinfo *fip = vp->v_fifoinfo; ASSERT_VOP_LOCKED(vp, "fifo_cleanup"); if (fip->fi_readers == 0 && fip->fi_writers == 0) { vp->v_fifoinfo = NULL; (void)soclose(fip->fi_readsock); (void)soclose(fip->fi_writesock); FREE(fip, M_VNODE); } } /* * Open called to set up a new instance of a fifo or * to find an active instance of a fifo. 
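/*
 * struct fifoinfo above shows that a fifo is backed by two local sockets,
 * one kept for reading and one for writing, joined by soconnect2() in
 * fifo_open() below.  The closest userland analogue (illustrative only)
 * is a stream socketpair:
 */
#include <sys/types.h>
#include <sys/socket.h>

static int
fifo_like_pair(int sv[2])
{

	/* sv[0] plays the role of fi_readsock, sv[1] that of fi_writesock. */
	return (socketpair(AF_LOCAL, SOCK_STREAM, 0, sv));
}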
*/ /* ARGSUSED */ static int fifo_open(ap) struct vop_open_args /* { struct vnode *a_vp; int a_mode; struct ucred *a_cred; struct thread *a_td; int a_fdidx; } */ *ap; { struct vnode *vp = ap->a_vp; struct fifoinfo *fip; struct thread *td = ap->a_td; struct ucred *cred = ap->a_cred; struct file *fp = ap->a_fp; struct socket *rso, *wso; int error; ASSERT_VOP_ELOCKED(vp, "fifo_open"); if (fp == NULL) return (EINVAL); if ((fip = vp->v_fifoinfo) == NULL) { MALLOC(fip, struct fifoinfo *, sizeof(*fip), M_VNODE, M_WAITOK); error = socreate(AF_LOCAL, &rso, SOCK_STREAM, 0, cred, td); if (error) goto fail1; fip->fi_readsock = rso; error = socreate(AF_LOCAL, &wso, SOCK_STREAM, 0, cred, td); if (error) goto fail2; fip->fi_writesock = wso; error = soconnect2(wso, rso); if (error) { (void)soclose(wso); fail2: (void)soclose(rso); fail1: free(fip, M_VNODE); return (error); } fip->fi_readers = fip->fi_writers = 0; wso->so_snd.sb_lowat = PIPE_BUF; SOCKBUF_LOCK(&rso->so_rcv); rso->so_rcv.sb_state |= SBS_CANTRCVMORE; SOCKBUF_UNLOCK(&rso->so_rcv); KASSERT(vp->v_fifoinfo == NULL, ("fifo_open: v_fifoinfo race")); vp->v_fifoinfo = fip; } /* * General access to fi_readers and fi_writers is protected using * the vnode lock. * * Protect the increment of fi_readers and fi_writers and the * associated calls to wakeup() with the fifo mutex in addition * to the vnode lock. This allows the vnode lock to be dropped * for the msleep() calls below, and using the fifo mutex with * msleep() prevents the wakeup from being missed. */ mtx_lock(&fifo_mtx); if (ap->a_mode & FREAD) { fip->fi_readers++; if (fip->fi_readers == 1) { SOCKBUF_LOCK(&fip->fi_writesock->so_snd); fip->fi_writesock->so_snd.sb_state &= ~SBS_CANTSENDMORE; SOCKBUF_UNLOCK(&fip->fi_writesock->so_snd); if (fip->fi_writers > 0) { wakeup(&fip->fi_writers); sowwakeup(fip->fi_writesock); } } } if (ap->a_mode & FWRITE) { if ((ap->a_mode & O_NONBLOCK) && fip->fi_readers == 0) { mtx_unlock(&fifo_mtx); return (ENXIO); } fip->fi_writers++; if (fip->fi_writers == 1) { SOCKBUF_LOCK(&fip->fi_readsock->so_rcv); fip->fi_readsock->so_rcv.sb_state &= ~SBS_CANTRCVMORE; SOCKBUF_UNLOCK(&fip->fi_readsock->so_rcv); if (fip->fi_readers > 0) { wakeup(&fip->fi_readers); sorwakeup(fip->fi_readsock); } } } if ((ap->a_mode & O_NONBLOCK) == 0) { if ((ap->a_mode & FREAD) && fip->fi_writers == 0) { VOP_UNLOCK(vp, 0, td); error = msleep(&fip->fi_readers, &fifo_mtx, PDROP | PCATCH | PSOCK, "fifoor", 0); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); if (error) { fip->fi_readers--; if (fip->fi_readers == 0) { socantsendmore(fip->fi_writesock); fifo_cleanup(vp); } return (error); } mtx_lock(&fifo_mtx); /* * We must have got woken up because we had a writer. * That (and not still having one) is the condition * that we must wait for. */ } if ((ap->a_mode & FWRITE) && fip->fi_readers == 0) { VOP_UNLOCK(vp, 0, td); error = msleep(&fip->fi_writers, &fifo_mtx, PDROP | PCATCH | PSOCK, "fifoow", 0); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); if (error) { fip->fi_writers--; if (fip->fi_writers == 0) { socantrcvmore(fip->fi_readsock); fifo_cleanup(vp); } return (error); } /* * We must have got woken up because we had * a reader. That (and not still having one) * is the condition that we must wait for. 
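/*
 * Illustrative userland counterpart of the fifo_open() logic above: a
 * write-only open with O_NONBLOCK fails with ENXIO while there is no
 * reader, whereas a blocking open msleep()s in "fifoow" until one
 * arrives.  The path name is supplied by the caller and hypothetical.
 */
#include <errno.h>
#include <fcntl.h>

static int
open_fifo_writer(const char *path)
{
	int fd;

	fd = open(path, O_WRONLY | O_NONBLOCK);
	if (fd == -1 && errno == ENXIO)
		fd = open(path, O_WRONLY);	/* blocks until a reader opens */
	return (fd);
}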
*/ mtx_lock(&fifo_mtx); } } mtx_unlock(&fifo_mtx); KASSERT(fp != NULL, ("can't fifo/vnode bypass")); - FILE_LOCK(fp); KASSERT(fp->f_ops == &badfileops, ("not badfileops in fifo_open")); - fp->f_data = fip; - fp->f_ops = &fifo_ops_f; - FILE_UNLOCK(fp); + finit(fp, fp->f_flag, DTYPE_FIFO, fip, &fifo_ops_f); return (0); } /* * Now unused vnode ioctl routine. */ /* ARGSUSED */ static int fifo_ioctl(ap) struct vop_ioctl_args /* { struct vnode *a_vp; u_long a_command; caddr_t a_data; int a_fflag; struct ucred *a_cred; struct thread *a_td; } */ *ap; { printf("WARNING: fifo_ioctl called unexpectedly\n"); return (ENOTTY); } /* * Now unused vnode kqfilter routine. */ /* ARGSUSED */ static int fifo_kqfilter(ap) struct vop_kqfilter_args /* { struct vnode *a_vp; struct knote *a_kn; } */ *ap; { printf("WARNING: fifo_kqfilter called unexpectedly\n"); return (EINVAL); } static void filt_fifordetach(struct knote *kn) { struct socket *so = (struct socket *)kn->kn_hook; SOCKBUF_LOCK(&so->so_rcv); knlist_remove(&so->so_rcv.sb_sel.si_note, kn, 1); if (knlist_empty(&so->so_rcv.sb_sel.si_note)) so->so_rcv.sb_flags &= ~SB_KNOTE; SOCKBUF_UNLOCK(&so->so_rcv); } static int filt_fiforead(struct knote *kn, long hint) { struct socket *so = (struct socket *)kn->kn_hook; SOCKBUF_LOCK_ASSERT(&so->so_rcv); kn->kn_data = so->so_rcv.sb_cc; if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { kn->kn_flags |= EV_EOF; return (1); } else { kn->kn_flags &= ~EV_EOF; return (kn->kn_data > 0); } } static void filt_fifowdetach(struct knote *kn) { struct socket *so = (struct socket *)kn->kn_hook; SOCKBUF_LOCK(&so->so_snd); knlist_remove(&so->so_snd.sb_sel.si_note, kn, 1); if (knlist_empty(&so->so_snd.sb_sel.si_note)) so->so_snd.sb_flags &= ~SB_KNOTE; SOCKBUF_UNLOCK(&so->so_snd); } static int filt_fifowrite(struct knote *kn, long hint) { struct socket *so = (struct socket *)kn->kn_hook; SOCKBUF_LOCK_ASSERT(&so->so_snd); kn->kn_data = sbspace(&so->so_snd); if (so->so_snd.sb_state & SBS_CANTSENDMORE) { kn->kn_flags |= EV_EOF; return (1); } else { kn->kn_flags &= ~EV_EOF; return (kn->kn_data >= so->so_snd.sb_lowat); } } static void filt_fifodetach_notsup(struct knote *kn) { } static int filt_fifo_notsup(struct knote *kn, long hint) { return (0); } /* * Device close routine */ /* ARGSUSED */ static int fifo_close(ap) struct vop_close_args /* { struct vnode *a_vp; int a_fflag; struct ucred *a_cred; struct thread *a_td; } */ *ap; { struct vnode *vp = ap->a_vp; struct fifoinfo *fip = vp->v_fifoinfo; ASSERT_VOP_LOCKED(vp, "fifo_close"); KASSERT(fip != NULL, ("fifo_close: no v_fifoinfo")); if (ap->a_fflag & FREAD) { fip->fi_readers--; if (fip->fi_readers == 0) socantsendmore(fip->fi_writesock); } if (ap->a_fflag & FWRITE) { fip->fi_writers--; if (fip->fi_writers == 0) socantrcvmore(fip->fi_readsock); } fifo_cleanup(vp); return (0); } /* * Print out internal contents of a fifo vnode. */ int fifo_printinfo(vp) struct vnode *vp; { register struct fifoinfo *fip = vp->v_fifoinfo; if (fip == NULL){ printf(", NULL v_fifoinfo"); return (0); } printf(", fifo with %ld readers and %ld writers", fip->fi_readers, fip->fi_writers); return (0); } /* * Print out the contents of a fifo vnode. */ static int fifo_print(ap) struct vop_print_args /* { struct vnode *a_vp; } */ *ap; { fifo_printinfo(ap->a_vp); printf("\n"); return (0); } /* * Return POSIX pathconf information applicable to fifo's. 
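/*
 * Illustrative: how the socket-buffer knote filters above are reached from
 * userland.  EVFILT_READ on a readable fifo descriptor is served by
 * filt_fiforead(); kn_data reports the buffered byte count and EV_EOF is
 * set once the last writer has gone away.
 */
#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>

static ssize_t
fifo_wait_readable(int kq, int fifo_fd)
{
	struct kevent ev;

	EV_SET(&ev, fifo_fd, EVFILT_READ, EV_ADD, 0, 0, NULL);
	if (kevent(kq, &ev, 1, NULL, 0, NULL) == -1)
		return (-1);
	if (kevent(kq, NULL, 0, &ev, 1, NULL) == -1)
		return (-1);
	return ((ev.flags & EV_EOF) && ev.data == 0 ? 0 : (ssize_t)ev.data);
}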
*/ static int fifo_pathconf(ap) struct vop_pathconf_args /* { struct vnode *a_vp; int a_name; int *a_retval; } */ *ap; { switch (ap->a_name) { case _PC_LINK_MAX: *ap->a_retval = LINK_MAX; return (0); case _PC_PIPE_BUF: *ap->a_retval = PIPE_BUF; return (0); case _PC_CHOWN_RESTRICTED: *ap->a_retval = 1; return (0); default: return (EINVAL); } /* NOTREACHED */ } /* * Fifo advisory byte-level locks. */ /* ARGSUSED */ static int fifo_advlock(ap) struct vop_advlock_args /* { struct vnode *a_vp; caddr_t a_id; int a_op; struct flock *a_fl; int a_flags; } */ *ap; { return (ap->a_flags & F_FLOCK ? EOPNOTSUPP : EINVAL); } static int fifo_close_f(struct file *fp, struct thread *td) { return (vnops.fo_close(fp, td)); } /* * The implementation of ioctl() for named fifos is complicated by the fact * that we permit O_RDWR fifo file descriptors, meaning that the actions of * ioctls may have to be applied to both the underlying sockets rather than * just one. The original implementation simply forward the ioctl to one * or both sockets based on fp->f_flag. We now consider each ioctl * separately, as the composition effect requires careful ordering. * * We do not blindly pass all ioctls through to the socket in order to avoid * providing unnecessary ioctls that might be improperly depended on by * applications (such as socket-specific, routing, and interface ioctls). * * Unlike sys_pipe.c, fifos do not implement the deprecated TIOCSPGRP and * TIOCGPGRP ioctls. Earlier implementations of fifos did forward SIOCSPGRP * and SIOCGPGRP ioctls, so we might need to re-add those here. */ static int fifo_ioctl_f(struct file *fp, u_long com, void *data, struct ucred *cred, struct thread *td) { struct fifoinfo *fi; struct file filetmp; /* Local, so need not be locked. */ int error; error = ENOTTY; fi = fp->f_data; switch (com) { case FIONBIO: /* * Non-blocking I/O is implemented at the fifo layer using * MSG_NBIO, so does not need to be forwarded down the stack. */ return (0); case FIOASYNC: case FIOSETOWN: case FIOGETOWN: /* * These socket ioctls don't have any ordering requirements, * so are called in an arbitrary order, and only on the * sockets indicated by the file descriptor rights. * * XXXRW: If O_RDWR and the read socket accepts an ioctl but * the write socket doesn't, the socketpair is left in an * inconsistent state. */ if (fp->f_flag & FREAD) { filetmp.f_data = fi->fi_readsock; filetmp.f_cred = cred; error = soo_ioctl(&filetmp, com, data, cred, td); if (error) return (error); } if (fp->f_flag & FWRITE) { filetmp.f_data = fi->fi_writesock; filetmp.f_cred = cred; error = soo_ioctl(&filetmp, com, data, cred, td); } return (error); case FIONREAD: /* * FIONREAD will return 0 for non-readable descriptors, and * the results of FIONREAD on the read socket for readable * descriptors. */ if (!(fp->f_flag & FREAD)) { *(int *)data = 0; return (0); } filetmp.f_data = fi->fi_readsock; filetmp.f_cred = cred; return (soo_ioctl(&filetmp, com, data, cred, td)); default: return (ENOTTY); } } /* * Because fifos are now a file descriptor layer object, EVFILT_VNODE is not * implemented. Likely, fifo_kqfilter() should be removed, and * fifo_kqfilter_f() should know how to forward the request to the underling * vnode using f_vnode in the file descriptor here. 
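/*
 * Illustrative: FIONREAD as composed by fifo_ioctl_f() above -- a
 * write-only descriptor reports 0, a readable one reports the bytes
 * pending on the read-side socket.
 */
#include <sys/ioctl.h>

static int
fifo_bytes_pending(int fd)
{
	int n;

	if (ioctl(fd, FIONREAD, &n) == -1)
		return (-1);
	return (n);
}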
*/ static int fifo_kqfilter_f(struct file *fp, struct knote *kn) { struct fifoinfo *fi; struct socket *so; struct sockbuf *sb; fi = fp->f_data; /* * If a filter is requested that is not supported by this file * descriptor, don't return an error, but also don't ever generate an * event. */ if ((kn->kn_filter == EVFILT_READ) && !(fp->f_flag & FREAD)) { kn->kn_fop = &fifo_notsup_filtops; return (0); } if ((kn->kn_filter == EVFILT_WRITE) && !(fp->f_flag & FWRITE)) { kn->kn_fop = &fifo_notsup_filtops; return (0); } switch (kn->kn_filter) { case EVFILT_READ: kn->kn_fop = &fiforead_filtops; so = fi->fi_readsock; sb = &so->so_rcv; break; case EVFILT_WRITE: kn->kn_fop = &fifowrite_filtops; so = fi->fi_writesock; sb = &so->so_snd; break; default: return (EINVAL); } kn->kn_hook = (caddr_t)so; SOCKBUF_LOCK(sb); knlist_add(&sb->sb_sel.si_note, kn, 1); sb->sb_flags |= SB_KNOTE; SOCKBUF_UNLOCK(sb); return (0); } static int fifo_poll_f(struct file *fp, int events, struct ucred *cred, struct thread *td) { struct fifoinfo *fip; struct file filetmp; int levents, revents = 0; fip = fp->f_data; levents = events & (POLLIN | POLLINIGNEOF | POLLPRI | POLLRDNORM | POLLRDBAND); if ((fp->f_flag & FREAD) && levents) { /* * If POLLIN or POLLRDNORM is requested and POLLINIGNEOF is * not, then convert the first two to the last one. This * tells the socket poll function to ignore EOF so that we * block if there is no writer (and no data). Callers can * set POLLINIGNEOF to get non-blocking behavior. */ if (levents & (POLLIN | POLLRDNORM) && !(levents & POLLINIGNEOF)) { levents &= ~(POLLIN | POLLRDNORM); levents |= POLLINIGNEOF; } filetmp.f_data = fip->fi_readsock; filetmp.f_cred = cred; revents |= soo_poll(&filetmp, levents, cred, td); /* Reverse the above conversion. */ if ((revents & POLLINIGNEOF) && !(events & POLLINIGNEOF)) { revents |= (events & (POLLIN | POLLRDNORM)); revents &= ~POLLINIGNEOF; } } levents = events & (POLLOUT | POLLWRNORM | POLLWRBAND); if ((fp->f_flag & FWRITE) && levents) { filetmp.f_data = fip->fi_writesock; filetmp.f_cred = cred; revents |= soo_poll(&filetmp, levents, cred, td); } return (revents); } static int fifo_read_f(struct file *fp, struct uio *uio, struct ucred *cred, int flags, struct thread *td) { struct fifoinfo *fip; int error, sflags; fip = fp->f_data; KASSERT(uio->uio_rw == UIO_READ,("fifo_read mode")); if (uio->uio_resid == 0) return (0); sflags = (fp->f_flag & FNONBLOCK) ? MSG_NBIO : 0; mtx_lock(&Giant); error = soreceive(fip->fi_readsock, NULL, uio, NULL, NULL, &sflags); mtx_unlock(&Giant); return (error); } static int fifo_stat_f(struct file *fp, struct stat *sb, struct ucred *cred, struct thread *td) { return (vnops.fo_stat(fp, sb, cred, td)); } static int fifo_write_f(struct file *fp, struct uio *uio, struct ucred *cred, int flags, struct thread *td) { struct fifoinfo *fip; int error, sflags; fip = fp->f_data; KASSERT(uio->uio_rw == UIO_WRITE,("fifo_write mode")); sflags = (fp->f_flag & FNONBLOCK) ? MSG_NBIO : 0; mtx_lock(&Giant); error = sosend(fip->fi_writesock, NULL, uio, 0, NULL, sflags, td); mtx_unlock(&Giant); return (error); } Index: head/sys/kern/kern_descrip.c =================================================================== --- head/sys/kern/kern_descrip.c (revision 174987) +++ head/sys/kern/kern_descrip.c (revision 174988) @@ -1,2895 +1,2857 @@ /*- * Copyright (c) 1982, 1986, 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. 
* All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_descrip.c 8.6 (Berkeley) 4/19/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_ddb.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_FILEDESC, "filedesc", "Open file descriptor table"); static MALLOC_DEFINE(M_FILEDESC_TO_LEADER, "filedesc_to_leader", "file desc to leader structures"); static MALLOC_DEFINE(M_SIGIO, "sigio", "sigio structures"); static uma_zone_t file_zone; /* How to treat 'new' parameter when allocating a fd for do_dup(). */ enum dup_type { DUP_VARIABLE, DUP_FIXED }; static int do_dup(struct thread *td, enum dup_type type, int old, int new, register_t *retval); static int fd_first_free(struct filedesc *, int, int); static int fd_last_used(struct filedesc *, int, int); static void fdgrowtable(struct filedesc *, int); -static int fdrop_locked(struct file *fp, struct thread *td); static void fdunused(struct filedesc *fdp, int fd); static void fdused(struct filedesc *fdp, int fd); /* * A process is initially started out with NDFILE descriptors stored within * this structure, selected to be enough for typical applications based on * the historical limit of 20 open files (and the usage of descriptors by * shells). If these descriptors are exhausted, a larger descriptor table * may be allocated, up to a process' resource limit; the internal arrays * are then unused. 
*/ #define NDFILE 20 #define NDSLOTSIZE sizeof(NDSLOTTYPE) #define NDENTRIES (NDSLOTSIZE * __CHAR_BIT) #define NDSLOT(x) ((x) / NDENTRIES) #define NDBIT(x) ((NDSLOTTYPE)1 << ((x) % NDENTRIES)) #define NDSLOTS(x) (((x) + NDENTRIES - 1) / NDENTRIES) /* * Storage required per open file descriptor. */ #define OFILESIZE (sizeof(struct file *) + sizeof(char)) /* * Basic allocation of descriptors: * one of the above, plus arrays for NDFILE descriptors. */ struct filedesc0 { struct filedesc fd_fd; /* * These arrays are used when the number of open files is * <= NDFILE, and are then pointed to by the pointers above. */ struct file *fd_dfiles[NDFILE]; char fd_dfileflags[NDFILE]; NDSLOTTYPE fd_dmap[NDSLOTS(NDFILE)]; }; /* * Descriptor management. */ -struct filelist filehead; /* head of list of open files */ -int openfiles; /* actual number of open files */ -struct sx filelist_lock; /* sx to protect filelist */ +volatile int openfiles; /* actual number of open files */ struct mtx sigio_lock; /* mtx to protect pointers to sigio */ void (*mq_fdclose)(struct thread *td, int fd, struct file *fp); /* A mutex to protect the association between a proc and filedesc. */ static struct mtx fdesc_mtx; /* * Find the first zero bit in the given bitmap, starting at low and not * exceeding size - 1. */ static int fd_first_free(struct filedesc *fdp, int low, int size) { NDSLOTTYPE *map = fdp->fd_map; NDSLOTTYPE mask; int off, maxoff; if (low >= size) return (low); off = NDSLOT(low); if (low % NDENTRIES) { mask = ~(~(NDSLOTTYPE)0 >> (NDENTRIES - (low % NDENTRIES))); if ((mask &= ~map[off]) != 0UL) return (off * NDENTRIES + ffsl(mask) - 1); ++off; } for (maxoff = NDSLOTS(size); off < maxoff; ++off) if (map[off] != ~0UL) return (off * NDENTRIES + ffsl(~map[off]) - 1); return (size); } /* * Find the highest non-zero bit in the given bitmap, starting at low and * not exceeding size - 1. */ static int fd_last_used(struct filedesc *fdp, int low, int size) { NDSLOTTYPE *map = fdp->fd_map; NDSLOTTYPE mask; int off, minoff; if (low >= size) return (-1); off = NDSLOT(size); if (size % NDENTRIES) { mask = ~(~(NDSLOTTYPE)0 << (size % NDENTRIES)); if ((mask &= map[off]) != 0) return (off * NDENTRIES + flsl(mask) - 1); --off; } for (minoff = NDSLOT(low); off >= minoff; --off) if (map[off] != 0) return (off * NDENTRIES + flsl(map[off]) - 1); return (low - 1); } static int fdisused(struct filedesc *fdp, int fd) { KASSERT(fd >= 0 && fd < fdp->fd_nfiles, ("file descriptor %d out of range (0, %d)", fd, fdp->fd_nfiles)); return ((fdp->fd_map[NDSLOT(fd)] & NDBIT(fd)) != 0); } /* * Mark a file descriptor as used. */ static void fdused(struct filedesc *fdp, int fd) { FILEDESC_XLOCK_ASSERT(fdp); KASSERT(!fdisused(fdp, fd), ("fd already used")); fdp->fd_map[NDSLOT(fd)] |= NDBIT(fd); if (fd > fdp->fd_lastfile) fdp->fd_lastfile = fd; if (fd == fdp->fd_freefile) fdp->fd_freefile = fd_first_free(fdp, fd, fdp->fd_nfiles); } /* * Mark a file descriptor as unused. */ static void fdunused(struct filedesc *fdp, int fd) { FILEDESC_XLOCK_ASSERT(fdp); KASSERT(fdisused(fdp, fd), ("fd is already unused")); KASSERT(fdp->fd_ofiles[fd] == NULL, ("fd is still in use")); fdp->fd_map[NDSLOT(fd)] &= ~NDBIT(fd); if (fd < fdp->fd_freefile) fdp->fd_freefile = fd; if (fd == fdp->fd_lastfile) fdp->fd_lastfile = fd_last_used(fdp, 0, fd); } /* * System calls on descriptors. 
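/*
 * Worked example of the descriptor bitmap above, assuming a 64-bit
 * NDSLOTTYPE (so NDENTRIES == 64): fd 70 lives in map word NDSLOT(70) == 1
 * as bit NDBIT(70) == 1UL << 6.  fd_first_free() scans those words with
 * ffsl(); the stand-alone helper below (hypothetical) shows the same
 * lowest-clear-bit trick on a single word.
 */
#include <strings.h>

static int
first_clear_bit(unsigned long word)
{

	if (word == ~0UL)
		return (-1);			/* every slot in this word is used */
	return (ffsl((long)~word) - 1);		/* ffsl() is 1-based */
}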
*/ #ifndef _SYS_SYSPROTO_H_ struct getdtablesize_args { int dummy; }; #endif /* ARGSUSED */ int getdtablesize(struct thread *td, struct getdtablesize_args *uap) { struct proc *p = td->td_proc; PROC_LOCK(p); td->td_retval[0] = min((int)lim_cur(p, RLIMIT_NOFILE), maxfilesperproc); PROC_UNLOCK(p); return (0); } /* * Duplicate a file descriptor to a particular value. * * Note: keep in mind that a potential race condition exists when closing * descriptors from a shared descriptor table (via rfork). */ #ifndef _SYS_SYSPROTO_H_ struct dup2_args { u_int from; u_int to; }; #endif /* ARGSUSED */ int dup2(struct thread *td, struct dup2_args *uap) { return (do_dup(td, DUP_FIXED, (int)uap->from, (int)uap->to, td->td_retval)); } /* * Duplicate a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct dup_args { u_int fd; }; #endif /* ARGSUSED */ int dup(struct thread *td, struct dup_args *uap) { return (do_dup(td, DUP_VARIABLE, (int)uap->fd, 0, td->td_retval)); } /* * The file control system call. */ #ifndef _SYS_SYSPROTO_H_ struct fcntl_args { int fd; int cmd; long arg; }; #endif /* ARGSUSED */ int fcntl(struct thread *td, struct fcntl_args *uap) { struct flock fl; intptr_t arg; int error; error = 0; switch (uap->cmd) { case F_GETLK: case F_SETLK: case F_SETLKW: error = copyin((void *)(intptr_t)uap->arg, &fl, sizeof(fl)); arg = (intptr_t)&fl; break; default: arg = uap->arg; break; } if (error) return (error); error = kern_fcntl(td, uap->fd, uap->cmd, arg); if (error) return (error); if (uap->cmd == F_GETLK) error = copyout(&fl, (void *)(intptr_t)uap->arg, sizeof(fl)); return (error); } static inline struct file * fdtofp(int fd, struct filedesc *fdp) { struct file *fp; FILEDESC_LOCK_ASSERT(fdp); if ((unsigned)fd >= fdp->fd_nfiles || (fp = fdp->fd_ofiles[fd]) == NULL) return (NULL); return (fp); } int kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg) { struct filedesc *fdp; struct flock *flp; struct file *fp; struct proc *p; char *pop; struct vnode *vp; u_int newmin; int error, flg, tmp; int vfslocked; vfslocked = 0; error = 0; flg = F_POSIX; p = td->td_proc; fdp = p->p_fd; switch (cmd) { case F_DUPFD: FILEDESC_SLOCK(fdp); if ((fp = fdtofp(fd, fdp)) == NULL) { FILEDESC_SUNLOCK(fdp); error = EBADF; break; } FILEDESC_SUNLOCK(fdp); newmin = arg; PROC_LOCK(p); if (newmin >= lim_cur(p, RLIMIT_NOFILE) || newmin >= maxfilesperproc) { PROC_UNLOCK(p); error = EINVAL; break; } PROC_UNLOCK(p); error = do_dup(td, DUP_VARIABLE, fd, newmin, td->td_retval); break; case F_GETFD: FILEDESC_SLOCK(fdp); if ((fp = fdtofp(fd, fdp)) == NULL) { FILEDESC_SUNLOCK(fdp); error = EBADF; break; } pop = &fdp->fd_ofileflags[fd]; td->td_retval[0] = (*pop & UF_EXCLOSE) ? FD_CLOEXEC : 0; FILEDESC_SUNLOCK(fdp); break; case F_SETFD: FILEDESC_XLOCK(fdp); if ((fp = fdtofp(fd, fdp)) == NULL) { FILEDESC_XUNLOCK(fdp); error = EBADF; break; } pop = &fdp->fd_ofileflags[fd]; *pop = (*pop &~ UF_EXCLOSE) | (arg & FD_CLOEXEC ? 
UF_EXCLOSE : 0); FILEDESC_XUNLOCK(fdp); break; case F_GETFL: FILEDESC_SLOCK(fdp); if ((fp = fdtofp(fd, fdp)) == NULL) { FILEDESC_SUNLOCK(fdp); error = EBADF; break; } - FILE_LOCK(fp); td->td_retval[0] = OFLAGS(fp->f_flag); - FILE_UNLOCK(fp); FILEDESC_SUNLOCK(fdp); break; case F_SETFL: FILEDESC_SLOCK(fdp); if ((fp = fdtofp(fd, fdp)) == NULL) { FILEDESC_SUNLOCK(fdp); error = EBADF; break; } - FILE_LOCK(fp); - fhold_locked(fp); - fp->f_flag &= ~FCNTLFLAGS; - fp->f_flag |= FFLAGS(arg & ~O_ACCMODE) & FCNTLFLAGS; - FILE_UNLOCK(fp); + fhold(fp); FILEDESC_SUNLOCK(fdp); + do { + tmp = flg = fp->f_flag; + tmp &= ~FCNTLFLAGS; + tmp |= FFLAGS(arg & ~O_ACCMODE) & FCNTLFLAGS; + } while(atomic_cmpset_int(&fp->f_flag, flg, tmp) == 0); tmp = fp->f_flag & FNONBLOCK; error = fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td); if (error) { fdrop(fp, td); break; } tmp = fp->f_flag & FASYNC; error = fo_ioctl(fp, FIOASYNC, &tmp, td->td_ucred, td); if (error == 0) { fdrop(fp, td); break; } - FILE_LOCK(fp); - fp->f_flag &= ~FNONBLOCK; - FILE_UNLOCK(fp); + atomic_clear_int(&fp->f_flag, FNONBLOCK); tmp = 0; (void)fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td); fdrop(fp, td); break; case F_GETOWN: FILEDESC_SLOCK(fdp); if ((fp = fdtofp(fd, fdp)) == NULL) { FILEDESC_SUNLOCK(fdp); error = EBADF; break; } fhold(fp); FILEDESC_SUNLOCK(fdp); error = fo_ioctl(fp, FIOGETOWN, &tmp, td->td_ucred, td); if (error == 0) td->td_retval[0] = tmp; fdrop(fp, td); break; case F_SETOWN: FILEDESC_SLOCK(fdp); if ((fp = fdtofp(fd, fdp)) == NULL) { FILEDESC_SUNLOCK(fdp); error = EBADF; break; } fhold(fp); FILEDESC_SUNLOCK(fdp); tmp = arg; error = fo_ioctl(fp, FIOSETOWN, &tmp, td->td_ucred, td); fdrop(fp, td); break; case F_SETLKW: flg |= F_WAIT; /* FALLTHROUGH F_SETLK */ case F_SETLK: FILEDESC_SLOCK(fdp); if ((fp = fdtofp(fd, fdp)) == NULL) { FILEDESC_SUNLOCK(fdp); error = EBADF; break; } if (fp->f_type != DTYPE_VNODE) { FILEDESC_SUNLOCK(fdp); error = EBADF; break; } flp = (struct flock *)arg; if (flp->l_whence == SEEK_CUR) { if (fp->f_offset < 0 || (flp->l_start > 0 && fp->f_offset > OFF_MAX - flp->l_start)) { FILEDESC_SUNLOCK(fdp); error = EOVERFLOW; break; } flp->l_start += fp->f_offset; } /* * VOP_ADVLOCK() may block. 
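/*
 * Illustrative userland read-modify-write of the status flags handled by
 * the F_SETFL case above; in the kernel the update is now done with the
 * lock-free atomic_cmpset_int() loop instead of FILE_LOCK(), and the
 * access-mode bits (O_ACCMODE) are always masked out.
 */
#include <fcntl.h>

static int
set_nonblocking(int fd)
{
	int flags;

	flags = fcntl(fd, F_GETFL);
	if (flags == -1)
		return (-1);
	return (fcntl(fd, F_SETFL, flags | O_NONBLOCK));
}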
*/ fhold(fp); FILEDESC_SUNLOCK(fdp); vp = fp->f_vnode; vfslocked = VFS_LOCK_GIANT(vp->v_mount); switch (flp->l_type) { case F_RDLCK: if ((fp->f_flag & FREAD) == 0) { error = EBADF; break; } PROC_LOCK(p->p_leader); p->p_leader->p_flag |= P_ADVLOCK; PROC_UNLOCK(p->p_leader); error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK, flp, flg); break; case F_WRLCK: if ((fp->f_flag & FWRITE) == 0) { error = EBADF; break; } PROC_LOCK(p->p_leader); p->p_leader->p_flag |= P_ADVLOCK; PROC_UNLOCK(p->p_leader); error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK, flp, flg); break; case F_UNLCK: error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_UNLCK, flp, F_POSIX); break; default: error = EINVAL; break; } VFS_UNLOCK_GIANT(vfslocked); vfslocked = 0; /* Check for race with close */ FILEDESC_SLOCK(fdp); if ((unsigned) fd >= fdp->fd_nfiles || fp != fdp->fd_ofiles[fd]) { FILEDESC_SUNLOCK(fdp); flp->l_whence = SEEK_SET; flp->l_start = 0; flp->l_len = 0; flp->l_type = F_UNLCK; vfslocked = VFS_LOCK_GIANT(vp->v_mount); (void) VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_UNLCK, flp, F_POSIX); VFS_UNLOCK_GIANT(vfslocked); vfslocked = 0; } else FILEDESC_SUNLOCK(fdp); fdrop(fp, td); break; case F_GETLK: FILEDESC_SLOCK(fdp); if ((fp = fdtofp(fd, fdp)) == NULL) { FILEDESC_SUNLOCK(fdp); error = EBADF; break; } if (fp->f_type != DTYPE_VNODE) { FILEDESC_SUNLOCK(fdp); error = EBADF; break; } flp = (struct flock *)arg; if (flp->l_type != F_RDLCK && flp->l_type != F_WRLCK && flp->l_type != F_UNLCK) { FILEDESC_SUNLOCK(fdp); error = EINVAL; break; } if (flp->l_whence == SEEK_CUR) { if ((flp->l_start > 0 && fp->f_offset > OFF_MAX - flp->l_start) || (flp->l_start < 0 && fp->f_offset < OFF_MIN - flp->l_start)) { FILEDESC_SUNLOCK(fdp); error = EOVERFLOW; break; } flp->l_start += fp->f_offset; } /* * VOP_ADVLOCK() may block. */ fhold(fp); FILEDESC_SUNLOCK(fdp); vp = fp->f_vnode; vfslocked = VFS_LOCK_GIANT(vp->v_mount); error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_GETLK, flp, F_POSIX); VFS_UNLOCK_GIANT(vfslocked); vfslocked = 0; fdrop(fp, td); break; default: error = EINVAL; break; } VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * Common code for dup, dup2, and fcntl(F_DUPFD). */ static int do_dup(struct thread *td, enum dup_type type, int old, int new, register_t *retval) { struct filedesc *fdp; struct proc *p; struct file *fp; struct file *delfp; int error, holdleaders, maxfd; KASSERT((type == DUP_VARIABLE || type == DUP_FIXED), ("invalid dup type %d", type)); p = td->td_proc; fdp = p->p_fd; /* * Verify we have a valid descriptor to dup from and possibly to * dup to. */ if (old < 0 || new < 0) return (EBADF); PROC_LOCK(p); maxfd = min((int)lim_cur(p, RLIMIT_NOFILE), maxfilesperproc); PROC_UNLOCK(p); if (new >= maxfd) return (EMFILE); FILEDESC_XLOCK(fdp); if (old >= fdp->fd_nfiles || fdp->fd_ofiles[old] == NULL) { FILEDESC_XUNLOCK(fdp); return (EBADF); } if (type == DUP_FIXED && old == new) { *retval = new; FILEDESC_XUNLOCK(fdp); return (0); } fp = fdp->fd_ofiles[old]; fhold(fp); /* * If the caller specified a file descriptor, make sure the file * table is large enough to hold it, and grab it. Otherwise, just * allocate a new descriptor the usual way. Since the filedesc * lock may be temporarily dropped in the process, we have to look * out for a race. 
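/*
 * Illustrative: a whole-file write lock request of the kind dispatched by
 * the F_SETLK/F_SETLKW cases above (F_SETLKW adds F_WAIT, so the request
 * may sleep inside VOP_ADVLOCK()).
 */
#include <fcntl.h>
#include <unistd.h>

static int
lock_whole_file(int fd, int wait)
{
	struct flock fl = {
		.l_type = F_WRLCK,	/* requires FWRITE on the descriptor */
		.l_whence = SEEK_SET,
		.l_start = 0,
		.l_len = 0,		/* zero length: lock to end of file */
	};

	return (fcntl(fd, wait ? F_SETLKW : F_SETLK, &fl));
}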
*/ if (type == DUP_FIXED) { if (new >= fdp->fd_nfiles) fdgrowtable(fdp, new + 1); if (fdp->fd_ofiles[new] == NULL) fdused(fdp, new); } else { if ((error = fdalloc(td, new, &new)) != 0) { FILEDESC_XUNLOCK(fdp); fdrop(fp, td); return (error); } } /* * If the old file changed out from under us then treat it as a * bad file descriptor. Userland should do its own locking to * avoid this case. */ if (fdp->fd_ofiles[old] != fp) { /* we've allocated a descriptor which we won't use */ if (fdp->fd_ofiles[new] == NULL) fdunused(fdp, new); FILEDESC_XUNLOCK(fdp); fdrop(fp, td); return (EBADF); } KASSERT(old != new, ("new fd is same as old")); /* * Save info on the descriptor being overwritten. We cannot close * it without introducing an ownership race for the slot, since we * need to drop the filedesc lock to call closef(). * * XXX this duplicates parts of close(). */ delfp = fdp->fd_ofiles[new]; holdleaders = 0; if (delfp != NULL) { if (td->td_proc->p_fdtol != NULL) { /* * Ask fdfree() to sleep to ensure that all relevant * process leaders can be traversed in closef(). */ fdp->fd_holdleaderscount++; holdleaders = 1; } } /* * Duplicate the source descriptor */ fdp->fd_ofiles[new] = fp; fdp->fd_ofileflags[new] = fdp->fd_ofileflags[old] &~ UF_EXCLOSE; if (new > fdp->fd_lastfile) fdp->fd_lastfile = new; *retval = new; /* * If we dup'd over a valid file, we now own the reference to it * and must dispose of it using closef() semantics (as if a * close() were performed on it). * * XXX this duplicates parts of close(). */ if (delfp != NULL) { knote_fdclose(td, new); if (delfp->f_type == DTYPE_MQUEUE) mq_fdclose(td, new, delfp); FILEDESC_XUNLOCK(fdp); (void) closef(delfp, td); if (holdleaders) { FILEDESC_XLOCK(fdp); fdp->fd_holdleaderscount--; if (fdp->fd_holdleaderscount == 0 && fdp->fd_holdleaderswakeup != 0) { fdp->fd_holdleaderswakeup = 0; wakeup(&fdp->fd_holdleaderscount); } FILEDESC_XUNLOCK(fdp); } } else { FILEDESC_XUNLOCK(fdp); } return (0); } /* * If sigio is on the list associated with a process or process group, * disable signalling from the device, remove sigio from the list and * free sigio. */ void funsetown(struct sigio **sigiop) { struct sigio *sigio; SIGIO_LOCK(); sigio = *sigiop; if (sigio == NULL) { SIGIO_UNLOCK(); return; } *(sigio->sio_myref) = NULL; if ((sigio)->sio_pgid < 0) { struct pgrp *pg = (sigio)->sio_pgrp; PGRP_LOCK(pg); SLIST_REMOVE(&sigio->sio_pgrp->pg_sigiolst, sigio, sigio, sio_pgsigio); PGRP_UNLOCK(pg); } else { struct proc *p = (sigio)->sio_proc; PROC_LOCK(p); SLIST_REMOVE(&sigio->sio_proc->p_sigiolst, sigio, sigio, sio_pgsigio); PROC_UNLOCK(p); } SIGIO_UNLOCK(); crfree(sigio->sio_ucred); FREE(sigio, M_SIGIO); } /* * Free a list of sigio structures. * We only need to lock the SIGIO_LOCK because we have made ourselves * inaccessible to callers of fsetown and therefore do not need to lock * the proc or pgrp struct for the list manipulation. */ void funsetownlst(struct sigiolst *sigiolst) { struct proc *p; struct pgrp *pg; struct sigio *sigio; sigio = SLIST_FIRST(sigiolst); if (sigio == NULL) return; p = NULL; pg = NULL; /* * Every entry of the list should belong * to a single proc or pgrp. 
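/*
 * Illustrative: the classic dup2() use that runs through do_dup(DUP_FIXED)
 * above; any file already installed in the target slot is closed for the
 * caller with full close() semantics.
 */
#include <fcntl.h>
#include <unistd.h>

static int
redirect_stdout(const char *path)
{
	int fd;

	fd = open(path, O_WRONLY | O_CREAT | O_TRUNC, 0644);
	if (fd == -1)
		return (-1);
	if (dup2(fd, STDOUT_FILENO) == -1) {
		(void)close(fd);
		return (-1);
	}
	return (close(fd));
}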
*/ if (sigio->sio_pgid < 0) { pg = sigio->sio_pgrp; PGRP_LOCK_ASSERT(pg, MA_NOTOWNED); } else /* if (sigio->sio_pgid > 0) */ { p = sigio->sio_proc; PROC_LOCK_ASSERT(p, MA_NOTOWNED); } SIGIO_LOCK(); while ((sigio = SLIST_FIRST(sigiolst)) != NULL) { *(sigio->sio_myref) = NULL; if (pg != NULL) { KASSERT(sigio->sio_pgid < 0, ("Proc sigio in pgrp sigio list")); KASSERT(sigio->sio_pgrp == pg, ("Bogus pgrp in sigio list")); PGRP_LOCK(pg); SLIST_REMOVE(&pg->pg_sigiolst, sigio, sigio, sio_pgsigio); PGRP_UNLOCK(pg); } else /* if (p != NULL) */ { KASSERT(sigio->sio_pgid > 0, ("Pgrp sigio in proc sigio list")); KASSERT(sigio->sio_proc == p, ("Bogus proc in sigio list")); PROC_LOCK(p); SLIST_REMOVE(&p->p_sigiolst, sigio, sigio, sio_pgsigio); PROC_UNLOCK(p); } SIGIO_UNLOCK(); crfree(sigio->sio_ucred); FREE(sigio, M_SIGIO); SIGIO_LOCK(); } SIGIO_UNLOCK(); } /* * This is common code for FIOSETOWN ioctl called by fcntl(fd, F_SETOWN, arg). * * After permission checking, add a sigio structure to the sigio list for * the process or process group. */ int fsetown(pid_t pgid, struct sigio **sigiop) { struct proc *proc; struct pgrp *pgrp; struct sigio *sigio; int ret; if (pgid == 0) { funsetown(sigiop); return (0); } ret = 0; /* Allocate and fill in the new sigio out of locks. */ MALLOC(sigio, struct sigio *, sizeof(struct sigio), M_SIGIO, M_WAITOK); sigio->sio_pgid = pgid; sigio->sio_ucred = crhold(curthread->td_ucred); sigio->sio_myref = sigiop; sx_slock(&proctree_lock); if (pgid > 0) { proc = pfind(pgid); if (proc == NULL) { ret = ESRCH; goto fail; } /* * Policy - Don't allow a process to FSETOWN a process * in another session. * * Remove this test to allow maximum flexibility or * restrict FSETOWN to the current process or process * group for maximum safety. */ PROC_UNLOCK(proc); if (proc->p_session != curthread->td_proc->p_session) { ret = EPERM; goto fail; } pgrp = NULL; } else /* if (pgid < 0) */ { pgrp = pgfind(-pgid); if (pgrp == NULL) { ret = ESRCH; goto fail; } PGRP_UNLOCK(pgrp); /* * Policy - Don't allow a process to FSETOWN a process * in another session. * * Remove this test to allow maximum flexibility or * restrict FSETOWN to the current process or process * group for maximum safety. */ if (pgrp->pg_session != curthread->td_proc->p_session) { ret = EPERM; goto fail; } proc = NULL; } funsetown(sigiop); if (pgid > 0) { PROC_LOCK(proc); /* * Since funsetownlst() is called without the proctree * locked, we need to check for P_WEXIT. * XXX: is ESRCH correct? */ if ((proc->p_flag & P_WEXIT) != 0) { PROC_UNLOCK(proc); ret = ESRCH; goto fail; } SLIST_INSERT_HEAD(&proc->p_sigiolst, sigio, sio_pgsigio); sigio->sio_proc = proc; PROC_UNLOCK(proc); } else { PGRP_LOCK(pgrp); SLIST_INSERT_HEAD(&pgrp->pg_sigiolst, sigio, sio_pgsigio); sigio->sio_pgrp = pgrp; PGRP_UNLOCK(pgrp); } sx_sunlock(&proctree_lock); SIGIO_LOCK(); *sigiop = sigio; SIGIO_UNLOCK(); return (0); fail: sx_sunlock(&proctree_lock); crfree(sigio->sio_ucred); FREE(sigio, M_SIGIO); return (ret); } /* * This is common code for FIOGETOWN ioctl called by fcntl(fd, F_GETOWN, arg). */ pid_t fgetown(sigiop) struct sigio **sigiop; { pid_t pgid; SIGIO_LOCK(); pgid = (*sigiop != NULL) ? (*sigiop)->sio_pgid : 0; SIGIO_UNLOCK(); return (pgid); } /* * Close a file descriptor. 
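/*
 * Illustrative: arming SIGIO delivery from userland, which reaches
 * fsetown() above through kern_fcntl(F_SETOWN) and the FIOSETOWN ioctl.
 * Note the policy check above: a descriptor may not be owned by a process
 * or process group in another session.
 */
#include <fcntl.h>
#include <signal.h>
#include <unistd.h>

static int
arm_sigio(int fd, void (*handler)(int))
{

	if (signal(SIGIO, handler) == SIG_ERR)
		return (-1);
	if (fcntl(fd, F_SETOWN, getpid()) == -1)
		return (-1);
	return (fcntl(fd, F_SETFL, fcntl(fd, F_GETFL) | O_ASYNC));
}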
*/ #ifndef _SYS_SYSPROTO_H_ struct close_args { int fd; }; #endif /* ARGSUSED */ int close(td, uap) struct thread *td; struct close_args *uap; { return (kern_close(td, uap->fd)); } int kern_close(td, fd) struct thread *td; int fd; { struct filedesc *fdp; struct file *fp; int error; int holdleaders; error = 0; holdleaders = 0; fdp = td->td_proc->p_fd; AUDIT_SYSCLOSE(td, fd); FILEDESC_XLOCK(fdp); if ((unsigned)fd >= fdp->fd_nfiles || (fp = fdp->fd_ofiles[fd]) == NULL) { FILEDESC_XUNLOCK(fdp); return (EBADF); } fdp->fd_ofiles[fd] = NULL; fdp->fd_ofileflags[fd] = 0; fdunused(fdp, fd); if (td->td_proc->p_fdtol != NULL) { /* * Ask fdfree() to sleep to ensure that all relevant * process leaders can be traversed in closef(). */ fdp->fd_holdleaderscount++; holdleaders = 1; } /* * We now hold the fp reference that used to be owned by the * descriptor array. We have to unlock the FILEDESC *AFTER* * knote_fdclose to prevent a race of the fd getting opened, a knote * added, and deleting a knote for the new fd. */ knote_fdclose(td, fd); if (fp->f_type == DTYPE_MQUEUE) mq_fdclose(td, fd, fp); FILEDESC_XUNLOCK(fdp); error = closef(fp, td); if (holdleaders) { FILEDESC_XLOCK(fdp); fdp->fd_holdleaderscount--; if (fdp->fd_holdleaderscount == 0 && fdp->fd_holdleaderswakeup != 0) { fdp->fd_holdleaderswakeup = 0; wakeup(&fdp->fd_holdleaderscount); } FILEDESC_XUNLOCK(fdp); } return (error); } #if defined(COMPAT_43) /* * Return status information about a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct ofstat_args { int fd; struct ostat *sb; }; #endif /* ARGSUSED */ int ofstat(struct thread *td, struct ofstat_args *uap) { struct ostat oub; struct stat ub; int error; error = kern_fstat(td, uap->fd, &ub); if (error == 0) { cvtstat(&ub, &oub); error = copyout(&oub, uap->sb, sizeof(oub)); } return (error); } #endif /* COMPAT_43 */ /* * Return status information about a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct fstat_args { int fd; struct stat *sb; }; #endif /* ARGSUSED */ int fstat(struct thread *td, struct fstat_args *uap) { struct stat ub; int error; error = kern_fstat(td, uap->fd, &ub); if (error == 0) error = copyout(&ub, uap->sb, sizeof(ub)); return (error); } int kern_fstat(struct thread *td, int fd, struct stat *sbp) { struct file *fp; int error; AUDIT_ARG(fd, fd); if ((error = fget(td, fd, &fp)) != 0) return (error); AUDIT_ARG(file, td->td_proc, fp); error = fo_stat(fp, sbp, td->td_ucred, td); fdrop(fp, td); return (error); } /* * Return status information about a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct nfstat_args { int fd; struct nstat *sb; }; #endif /* ARGSUSED */ int nfstat(struct thread *td, struct nfstat_args *uap) { struct nstat nub; struct stat ub; int error; error = kern_fstat(td, uap->fd, &ub); if (error == 0) { cvtnstat(&ub, &nub); error = copyout(&nub, uap->sb, sizeof(nub)); } return (error); } /* * Return pathconf information about a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct fpathconf_args { int fd; int name; }; #endif /* ARGSUSED */ int fpathconf(struct thread *td, struct fpathconf_args *uap) { struct file *fp; struct vnode *vp; int error; if ((error = fget(td, uap->fd, &fp)) != 0) return (error); /* If asynchronous I/O is available, it works for all descriptors.
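 * For reference, the userland view of this syscall is simply (a
 * hypothetical sketch, fd is illustrative):
 *
 *	long pipe_buf = fpathconf(fd, _PC_PIPE_BUF);
 *
 * which for pipes and sockets is answered by the DTYPE_PIPE and
 * DTYPE_SOCKET branch below.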
*/ if (uap->name == _PC_ASYNC_IO) { td->td_retval[0] = async_io_version; goto out; } vp = fp->f_vnode; if (vp != NULL) { int vfslocked; vfslocked = VFS_LOCK_GIANT(vp->v_mount); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); error = VOP_PATHCONF(vp, uap->name, td->td_retval); VOP_UNLOCK(vp, 0, td); VFS_UNLOCK_GIANT(vfslocked); } else if (fp->f_type == DTYPE_PIPE || fp->f_type == DTYPE_SOCKET) { if (uap->name != _PC_PIPE_BUF) { error = EINVAL; } else { td->td_retval[0] = PIPE_BUF; error = 0; } } else { error = EOPNOTSUPP; } out: fdrop(fp, td); return (error); } /* * Grow the file table to accommodate (at least) nfd descriptors. This may * block and drop the filedesc lock, but it will reacquire it before * returning. */ static void fdgrowtable(struct filedesc *fdp, int nfd) { struct file **ntable; char *nfileflags; int nnfiles, onfiles; NDSLOTTYPE *nmap; FILEDESC_XLOCK_ASSERT(fdp); KASSERT(fdp->fd_nfiles > 0, ("zero-length file table")); /* compute the size of the new table */ onfiles = fdp->fd_nfiles; nnfiles = NDSLOTS(nfd) * NDENTRIES; /* round up */ if (nnfiles <= onfiles) /* the table is already large enough */ return; /* allocate a new table and (if required) new bitmaps */ FILEDESC_XUNLOCK(fdp); MALLOC(ntable, struct file **, nnfiles * OFILESIZE, M_FILEDESC, M_ZERO | M_WAITOK); nfileflags = (char *)&ntable[nnfiles]; if (NDSLOTS(nnfiles) > NDSLOTS(onfiles)) MALLOC(nmap, NDSLOTTYPE *, NDSLOTS(nnfiles) * NDSLOTSIZE, M_FILEDESC, M_ZERO | M_WAITOK); else nmap = NULL; FILEDESC_XLOCK(fdp); /* * We now have new tables ready to go. Since we dropped the * filedesc lock to call malloc(), watch out for a race. */ onfiles = fdp->fd_nfiles; if (onfiles >= nnfiles) { /* we lost the race, but that's OK */ free(ntable, M_FILEDESC); if (nmap != NULL) free(nmap, M_FILEDESC); return; } bcopy(fdp->fd_ofiles, ntable, onfiles * sizeof(*ntable)); bcopy(fdp->fd_ofileflags, nfileflags, onfiles); if (onfiles > NDFILE) free(fdp->fd_ofiles, M_FILEDESC); fdp->fd_ofiles = ntable; fdp->fd_ofileflags = nfileflags; if (NDSLOTS(nnfiles) > NDSLOTS(onfiles)) { bcopy(fdp->fd_map, nmap, NDSLOTS(onfiles) * sizeof(*nmap)); if (NDSLOTS(onfiles) > NDSLOTS(NDFILE)) free(fdp->fd_map, M_FILEDESC); fdp->fd_map = nmap; } fdp->fd_nfiles = nnfiles; } /* * Allocate a file descriptor for the process. */ int fdalloc(struct thread *td, int minfd, int *result) { struct proc *p = td->td_proc; struct filedesc *fdp = p->p_fd; int fd = -1, maxfd; FILEDESC_XLOCK_ASSERT(fdp); if (fdp->fd_freefile > minfd) minfd = fdp->fd_freefile; PROC_LOCK(p); maxfd = min((int)lim_cur(p, RLIMIT_NOFILE), maxfilesperproc); PROC_UNLOCK(p); /* * Search the bitmap for a free descriptor. If none is found, try * to grow the file table. Keep at it until we either get a file * descriptor or run into process or system limits; fdgrowtable() * may drop the filedesc lock, so we're in a race. */ for (;;) { fd = fd_first_free(fdp, minfd, fdp->fd_nfiles); if (fd >= maxfd) return (EMFILE); if (fd < fdp->fd_nfiles) break; fdgrowtable(fdp, min(fdp->fd_nfiles * 2, maxfd)); } /* * Perform some sanity checks, then mark the file descriptor as * used and return it to the caller. */ KASSERT(!fdisused(fdp, fd), ("fd_first_free() returned non-free descriptor")); KASSERT(fdp->fd_ofiles[fd] == NULL, ("free descriptor isn't")); fdp->fd_ofileflags[fd] = 0; /* XXX needed? */ fdused(fdp, fd); *result = fd; return (0); } /* * Check to see whether n user file descriptors are available to the process * p.
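 * A sketch of the intended calling pattern (the caller shown is an
 * assumption, not something this change modifies): code that is about to
 * install several descriptors at once, such as the code that receives
 * descriptors passed over a unix(4) socket, does
 *
 *	if (!fdavail(td, nfds))
 *		return (EMSGSIZE);
 *
 * before it starts allocating slots, so it can fail cleanly instead of
 * installing only part of the set.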
*/ int fdavail(struct thread *td, int n) { struct proc *p = td->td_proc; struct filedesc *fdp = td->td_proc->p_fd; struct file **fpp; int i, lim, last; FILEDESC_LOCK_ASSERT(fdp); PROC_LOCK(p); lim = min((int)lim_cur(p, RLIMIT_NOFILE), maxfilesperproc); PROC_UNLOCK(p); if ((i = lim - fdp->fd_nfiles) > 0 && (n -= i) <= 0) return (1); last = min(fdp->fd_nfiles, lim); fpp = &fdp->fd_ofiles[fdp->fd_freefile]; for (i = last - fdp->fd_freefile; --i >= 0; fpp++) { if (*fpp == NULL && --n <= 0) return (1); } return (0); } /* * Create a new open file structure and allocate a file descriptor for the * process that refers to it. We add one reference to the file for the * descriptor table and one reference for resultfp. This is to prevent us * being preempted and the entry in the descriptor table closed after we * release the FILEDESC lock. */ int falloc(struct thread *td, struct file **resultfp, int *resultfd) { struct proc *p = td->td_proc; - struct file *fp, *fq; + struct file *fp; int error, i; int maxuserfiles = maxfiles - (maxfiles / 20); static struct timeval lastfail; static int curfail; fp = uma_zalloc(file_zone, M_WAITOK | M_ZERO); - sx_xlock(&filelist_lock); - if ((openfiles >= maxuserfiles && priv_check(td, PRIV_MAXFILES) != 0) || openfiles >= maxfiles) { if (ppsratecheck(&lastfail, &curfail, 1)) { printf("kern.maxfiles limit exceeded by uid %i, please see tuning(7).\n", td->td_ucred->cr_ruid); } - sx_xunlock(&filelist_lock); uma_zfree(file_zone, fp); return (ENFILE); } - openfiles++; + atomic_add_int(&openfiles, 1); /* * If the process has file descriptor zero open, add the new file * descriptor to the list of open files at that point, otherwise * put it at the front of the list of open files. */ - fp->f_mtxp = mtx_pool_alloc(mtxpool_sleep); fp->f_count = 1; if (resultfp) fp->f_count++; fp->f_cred = crhold(td->td_ucred); fp->f_ops = &badfileops; fp->f_data = NULL; fp->f_vnode = NULL; FILEDESC_XLOCK(p->p_fd); - if ((fq = p->p_fd->fd_ofiles[0])) { - LIST_INSERT_AFTER(fq, fp, f_list); - } else { - LIST_INSERT_HEAD(&filehead, fp, f_list); - } - sx_xunlock(&filelist_lock); if ((error = fdalloc(td, 0, &i))) { FILEDESC_XUNLOCK(p->p_fd); fdrop(fp, td); if (resultfp) fdrop(fp, td); return (error); } p->p_fd->fd_ofiles[i] = fp; FILEDESC_XUNLOCK(p->p_fd); if (resultfp) *resultfp = fp; if (resultfd) *resultfd = i; return (0); } /* * Build a new filedesc structure from another. * Copy the current, root, and jail root vnode references. */ struct filedesc * fdinit(struct filedesc *fdp) { struct filedesc0 *newfdp; newfdp = malloc(sizeof *newfdp, M_FILEDESC, M_WAITOK | M_ZERO); FILEDESC_LOCK_INIT(&newfdp->fd_fd); if (fdp != NULL) { FILEDESC_XLOCK(fdp); newfdp->fd_fd.fd_cdir = fdp->fd_cdir; if (newfdp->fd_fd.fd_cdir) VREF(newfdp->fd_fd.fd_cdir); newfdp->fd_fd.fd_rdir = fdp->fd_rdir; if (newfdp->fd_fd.fd_rdir) VREF(newfdp->fd_fd.fd_rdir); newfdp->fd_fd.fd_jdir = fdp->fd_jdir; if (newfdp->fd_fd.fd_jdir) VREF(newfdp->fd_fd.fd_jdir); FILEDESC_XUNLOCK(fdp); } /* Create the file descriptor table.
*/ newfdp->fd_fd.fd_refcnt = 1; newfdp->fd_fd.fd_holdcnt = 1; newfdp->fd_fd.fd_cmask = CMASK; newfdp->fd_fd.fd_ofiles = newfdp->fd_dfiles; newfdp->fd_fd.fd_ofileflags = newfdp->fd_dfileflags; newfdp->fd_fd.fd_nfiles = NDFILE; newfdp->fd_fd.fd_map = newfdp->fd_dmap; newfdp->fd_fd.fd_lastfile = -1; return (&newfdp->fd_fd); } static struct filedesc * fdhold(struct proc *p) { struct filedesc *fdp; mtx_lock(&fdesc_mtx); fdp = p->p_fd; if (fdp != NULL) fdp->fd_holdcnt++; mtx_unlock(&fdesc_mtx); return (fdp); } static void fddrop(struct filedesc *fdp) { int i; mtx_lock(&fdesc_mtx); i = --fdp->fd_holdcnt; mtx_unlock(&fdesc_mtx); if (i > 0) return; FILEDESC_LOCK_DESTROY(fdp); FREE(fdp, M_FILEDESC); } /* * Share a filedesc structure. */ struct filedesc * fdshare(struct filedesc *fdp) { FILEDESC_XLOCK(fdp); fdp->fd_refcnt++; FILEDESC_XUNLOCK(fdp); return (fdp); } /* * Unshare a filedesc structure, if necessary by making a copy */ void fdunshare(struct proc *p, struct thread *td) { FILEDESC_XLOCK(p->p_fd); if (p->p_fd->fd_refcnt > 1) { struct filedesc *tmp; FILEDESC_XUNLOCK(p->p_fd); tmp = fdcopy(p->p_fd); fdfree(td); p->p_fd = tmp; } else FILEDESC_XUNLOCK(p->p_fd); } /* * Copy a filedesc structure. A NULL pointer in returns a NULL reference, * this is to ease callers, not catch errors. */ struct filedesc * fdcopy(struct filedesc *fdp) { struct filedesc *newfdp; int i; /* Certain daemons might not have file descriptors. */ if (fdp == NULL) return (NULL); newfdp = fdinit(fdp); FILEDESC_SLOCK(fdp); while (fdp->fd_lastfile >= newfdp->fd_nfiles) { FILEDESC_SUNLOCK(fdp); FILEDESC_XLOCK(newfdp); fdgrowtable(newfdp, fdp->fd_lastfile + 1); FILEDESC_XUNLOCK(newfdp); FILEDESC_SLOCK(fdp); } /* copy everything except kqueue descriptors */ newfdp->fd_freefile = -1; for (i = 0; i <= fdp->fd_lastfile; ++i) { if (fdisused(fdp, i) && fdp->fd_ofiles[i]->f_type != DTYPE_KQUEUE) { newfdp->fd_ofiles[i] = fdp->fd_ofiles[i]; newfdp->fd_ofileflags[i] = fdp->fd_ofileflags[i]; fhold(newfdp->fd_ofiles[i]); newfdp->fd_lastfile = i; } else { if (newfdp->fd_freefile == -1) newfdp->fd_freefile = i; } } FILEDESC_SUNLOCK(fdp); FILEDESC_XLOCK(newfdp); for (i = 0; i <= newfdp->fd_lastfile; ++i) if (newfdp->fd_ofiles[i] != NULL) fdused(newfdp, i); FILEDESC_XUNLOCK(newfdp); FILEDESC_SLOCK(fdp); if (newfdp->fd_freefile == -1) newfdp->fd_freefile = i; newfdp->fd_cmask = fdp->fd_cmask; FILEDESC_SUNLOCK(fdp); return (newfdp); } /* * Release a filedesc structure. */ void fdfree(struct thread *td) { struct filedesc *fdp; struct file **fpp; int i, locked; struct filedesc_to_leader *fdtol; struct file *fp; struct vnode *cdir, *jdir, *rdir, *vp; struct flock lf; /* Certain daemons might not have file descriptors. 
*/ fdp = td->td_proc->p_fd; if (fdp == NULL) return; /* Check for special need to clear POSIX style locks */ fdtol = td->td_proc->p_fdtol; if (fdtol != NULL) { FILEDESC_XLOCK(fdp); KASSERT(fdtol->fdl_refcount > 0, ("filedesc_to_refcount botch: fdl_refcount=%d", fdtol->fdl_refcount)); if (fdtol->fdl_refcount == 1 && (td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) { for (i = 0, fpp = fdp->fd_ofiles; i <= fdp->fd_lastfile; i++, fpp++) { if (*fpp == NULL || (*fpp)->f_type != DTYPE_VNODE) continue; fp = *fpp; fhold(fp); FILEDESC_XUNLOCK(fdp); lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; lf.l_type = F_UNLCK; vp = fp->f_vnode; locked = VFS_LOCK_GIANT(vp->v_mount); (void) VOP_ADVLOCK(vp, (caddr_t)td->td_proc-> p_leader, F_UNLCK, &lf, F_POSIX); VFS_UNLOCK_GIANT(locked); FILEDESC_XLOCK(fdp); fdrop(fp, td); fpp = fdp->fd_ofiles + i; } } retry: if (fdtol->fdl_refcount == 1) { if (fdp->fd_holdleaderscount > 0 && (td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) { /* * close() or do_dup() has cleared a reference * in a shared file descriptor table. */ fdp->fd_holdleaderswakeup = 1; sx_sleep(&fdp->fd_holdleaderscount, FILEDESC_LOCK(fdp), PLOCK, "fdlhold", 0); goto retry; } if (fdtol->fdl_holdcount > 0) { /* * Ensure that fdtol->fdl_leader remains * valid in closef(). */ fdtol->fdl_wakeup = 1; sx_sleep(fdtol, FILEDESC_LOCK(fdp), PLOCK, "fdlhold", 0); goto retry; } } fdtol->fdl_refcount--; if (fdtol->fdl_refcount == 0 && fdtol->fdl_holdcount == 0) { fdtol->fdl_next->fdl_prev = fdtol->fdl_prev; fdtol->fdl_prev->fdl_next = fdtol->fdl_next; } else fdtol = NULL; td->td_proc->p_fdtol = NULL; FILEDESC_XUNLOCK(fdp); if (fdtol != NULL) FREE(fdtol, M_FILEDESC_TO_LEADER); } FILEDESC_XLOCK(fdp); i = --fdp->fd_refcnt; FILEDESC_XUNLOCK(fdp); if (i > 0) return; /* * We are the last reference to the structure, so we can * safely assume it will not change out from under us. */ fpp = fdp->fd_ofiles; for (i = fdp->fd_lastfile; i-- >= 0; fpp++) { if (*fpp) (void) closef(*fpp, td); } FILEDESC_XLOCK(fdp); /* XXX This should happen earlier. */ mtx_lock(&fdesc_mtx); td->td_proc->p_fd = NULL; mtx_unlock(&fdesc_mtx); if (fdp->fd_nfiles > NDFILE) FREE(fdp->fd_ofiles, M_FILEDESC); if (NDSLOTS(fdp->fd_nfiles) > NDSLOTS(NDFILE)) FREE(fdp->fd_map, M_FILEDESC); fdp->fd_nfiles = 0; cdir = fdp->fd_cdir; fdp->fd_cdir = NULL; rdir = fdp->fd_rdir; fdp->fd_rdir = NULL; jdir = fdp->fd_jdir; fdp->fd_jdir = NULL; FILEDESC_XUNLOCK(fdp); if (cdir) { locked = VFS_LOCK_GIANT(cdir->v_mount); vrele(cdir); VFS_UNLOCK_GIANT(locked); } if (rdir) { locked = VFS_LOCK_GIANT(rdir->v_mount); vrele(rdir); VFS_UNLOCK_GIANT(locked); } if (jdir) { locked = VFS_LOCK_GIANT(jdir->v_mount); vrele(jdir); VFS_UNLOCK_GIANT(locked); } fddrop(fdp); } /* * For setugid programs, we don't want people to use that setugidness * to generate error messages which write to a file which would * otherwise be off-limits to the process. We check for filesystems where * the vnode can change out from under us after execve (like [lin]procfs). * * Since setugidsafety calls this only for fd 0, 1 and 2, this check is * sufficient. We also don't check for setugidness since we know we are. */ static int is_unsafe(struct file *fp) { if (fp->f_type == DTYPE_VNODE) { struct vnode *vp = fp->f_vnode; if ((vp->v_vflag & VV_PROCDEP) != 0) return (1); } return (0); } /* * Make this setugid thing safe, if at all possible. */ void setugidsafety(struct thread *td) { struct filedesc *fdp; int i; /* Certain daemons might not have file descriptors.
*/ fdp = td->td_proc->p_fd; if (fdp == NULL) return; /* * Note: fdp->fd_ofiles may be reallocated out from under us while * we are blocked in a close. Be careful! */ FILEDESC_XLOCK(fdp); for (i = 0; i <= fdp->fd_lastfile; i++) { if (i > 2) break; if (fdp->fd_ofiles[i] && is_unsafe(fdp->fd_ofiles[i])) { struct file *fp; knote_fdclose(td, i); /* * NULL-out descriptor prior to close to avoid * a race while close blocks. */ fp = fdp->fd_ofiles[i]; fdp->fd_ofiles[i] = NULL; fdp->fd_ofileflags[i] = 0; fdunused(fdp, i); FILEDESC_XUNLOCK(fdp); (void) closef(fp, td); FILEDESC_XLOCK(fdp); } } FILEDESC_XUNLOCK(fdp); } /* * If a specific file object occupies a specific file descriptor, close the * file descriptor entry and drop a reference on the file object. This is a * convenience function to handle a subsequent error in a function that calls * falloc() that handles the race that another thread might have closed the * file descriptor out from under the thread creating the file object. */ void fdclose(struct filedesc *fdp, struct file *fp, int idx, struct thread *td) { FILEDESC_XLOCK(fdp); if (fdp->fd_ofiles[idx] == fp) { fdp->fd_ofiles[idx] = NULL; fdunused(fdp, idx); FILEDESC_XUNLOCK(fdp); fdrop(fp, td); } else FILEDESC_XUNLOCK(fdp); } /* * Close any files on exec? */ void fdcloseexec(struct thread *td) { struct filedesc *fdp; int i; /* Certain daemons might not have file descriptors. */ fdp = td->td_proc->p_fd; if (fdp == NULL) return; FILEDESC_XLOCK(fdp); /* * We cannot cache fd_ofiles or fd_ofileflags since operations * may block and rip them out from under us. */ for (i = 0; i <= fdp->fd_lastfile; i++) { if (fdp->fd_ofiles[i] != NULL && (fdp->fd_ofiles[i]->f_type == DTYPE_MQUEUE || (fdp->fd_ofileflags[i] & UF_EXCLOSE))) { struct file *fp; knote_fdclose(td, i); /* * NULL-out descriptor prior to close to avoid * a race while close blocks. */ fp = fdp->fd_ofiles[i]; fdp->fd_ofiles[i] = NULL; fdp->fd_ofileflags[i] = 0; fdunused(fdp, i); if (fp->f_type == DTYPE_MQUEUE) mq_fdclose(td, i, fp); FILEDESC_XUNLOCK(fdp); (void) closef(fp, td); FILEDESC_XLOCK(fdp); } } FILEDESC_XUNLOCK(fdp); } /* * It is unsafe for set[ug]id processes to be started with file * descriptors 0..2 closed, as these descriptors are given implicit * significance in the Standard C library. fdcheckstd() will create a * descriptor referencing /dev/null for each of stdin, stdout, and * stderr that is not already open. */ int fdcheckstd(struct thread *td) { struct filedesc *fdp; register_t retval, save; int i, error, devnull; fdp = td->td_proc->p_fd; if (fdp == NULL) return (0); KASSERT(fdp->fd_refcnt == 1, ("the fdtable should not be shared")); devnull = -1; error = 0; for (i = 0; i < 3; i++) { if (fdp->fd_ofiles[i] != NULL) continue; if (devnull < 0) { save = td->td_retval[0]; error = kern_open(td, "/dev/null", UIO_SYSSPACE, O_RDWR, 0); devnull = td->td_retval[0]; KASSERT(devnull == i, ("oof, we didn't get our fd")); td->td_retval[0] = save; if (error) break; } else { error = do_dup(td, DUP_FIXED, devnull, i, &retval); if (error != 0) break; } } return (error); } /* * Internal form of close. Decrement reference count on file structure. * Note: td may be NULL when closing a file that was being passed in a * message. * * XXXRW: Giant is not required for the caller, but often will be held; this * makes it moderately likely the Giant will be recursed in the VFS case. 
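 *
 * The POSIX "close releases all record locks" behaviour handled here can
 * be surprising; a hypothetical userland illustration (path, fd1, fd2 and
 * fl are made up for the example):
 *
 *	fd1 = open(path, O_RDWR);
 *	fd2 = open(path, O_RDWR);
 *	fcntl(fd1, F_SETLK, &fl);	(POSIX lock taken through fd1)
 *	close(fd2);			(drops the lock taken through fd1)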
*/ int closef(struct file *fp, struct thread *td) { struct vnode *vp; struct flock lf; struct filedesc_to_leader *fdtol; struct filedesc *fdp; /* * POSIX record locking dictates that any close releases ALL * locks owned by this process. This is handled by setting * a flag in the unlock to free ONLY locks obeying POSIX * semantics, and not to free BSD-style file locks. * If the descriptor was in a message, POSIX-style locks * aren't passed with the descriptor, and the thread pointer * will be NULL. Callers should be careful only to pass a * NULL thread pointer when there really is no owning * context that might have locks, or the locks will be * leaked. */ if (fp->f_type == DTYPE_VNODE && td != NULL) { int vfslocked; vp = fp->f_vnode; vfslocked = VFS_LOCK_GIANT(vp->v_mount); if ((td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) { lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; lf.l_type = F_UNLCK; (void) VOP_ADVLOCK(vp, (caddr_t)td->td_proc->p_leader, F_UNLCK, &lf, F_POSIX); } fdtol = td->td_proc->p_fdtol; if (fdtol != NULL) { /* * Handle special case where file descriptor table is * shared between multiple process leaders. */ fdp = td->td_proc->p_fd; FILEDESC_XLOCK(fdp); for (fdtol = fdtol->fdl_next; fdtol != td->td_proc->p_fdtol; fdtol = fdtol->fdl_next) { if ((fdtol->fdl_leader->p_flag & P_ADVLOCK) == 0) continue; fdtol->fdl_holdcount++; FILEDESC_XUNLOCK(fdp); lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; lf.l_type = F_UNLCK; vp = fp->f_vnode; (void) VOP_ADVLOCK(vp, (caddr_t)fdtol->fdl_leader, F_UNLCK, &lf, F_POSIX); FILEDESC_XLOCK(fdp); fdtol->fdl_holdcount--; if (fdtol->fdl_holdcount == 0 && fdtol->fdl_wakeup != 0) { fdtol->fdl_wakeup = 0; wakeup(fdtol); } } FILEDESC_XUNLOCK(fdp); } VFS_UNLOCK_GIANT(vfslocked); } return (fdrop(fp, td)); } /* + * Initialize the file pointer with the specified properties. + * + * The ops are set with release semantics to be certain that the flags, type, + * and data are visible when ops is. This is to prevent ops methods from being + * called with bad data. + */ +void +finit(struct file *fp, u_int flag, short type, void *data, struct fileops *ops) +{ + fp->f_data = data; + fp->f_flag = flag; + fp->f_type = type; + atomic_store_rel_ptr((volatile uintptr_t *)&fp->f_ops, (uintptr_t)ops); +} + + +/* * Extract the file pointer associated with the specified descriptor for the * current user process. * * If the descriptor doesn't exist, EBADF is returned. * * If the descriptor exists but doesn't match 'flags' then return EBADF for * both read and write attempts. * * If 'hold' is set (non-zero) the file's refcount will be bumped on return. * It should be dropped with fdrop(). If it is not set, then the refcount * will not be bumped; however, the thread's filedesc struct will be returned * locked (for fgetsock). * * If an error occurred, the non-zero error is returned and *fpp is set to * NULL. Otherwise *fpp is set and zero is returned. */ static __inline int _fget(struct thread *td, int fd, struct file **fpp, int flags, int hold) { struct filedesc *fdp; struct file *fp; *fpp = NULL; if (td == NULL || (fdp = td->td_proc->p_fd) == NULL) return (EBADF); FILEDESC_SLOCK(fdp); if ((fp = fget_locked(fdp, fd)) == NULL || fp->f_ops == &badfileops) { FILEDESC_SUNLOCK(fdp); return (EBADF); } /* * FREAD and FWRITE failure return EBADF as per POSIX. * * Only one flag, or 0, may be specified.
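 *
 * For example, the fget_read() wrapper below is _fget(td, fd, &fp, FREAD, 1);
 * a typical caller (a sketch only, the surrounding code is illustrative) does
 *
 *	if ((error = fget_read(td, fd, &fp)) != 0)
 *		return (error);
 *	...
 *	fdrop(fp, td);
 *
 * and sees EBADF if the descriptor was opened write-only.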
*/ if (flags == FREAD && (fp->f_flag & FREAD) == 0) { FILEDESC_SUNLOCK(fdp); return (EBADF); } if (flags == FWRITE && (fp->f_flag & FWRITE) == 0) { FILEDESC_SUNLOCK(fdp); return (EBADF); } if (hold) { fhold(fp); FILEDESC_SUNLOCK(fdp); } *fpp = fp; return (0); } int fget(struct thread *td, int fd, struct file **fpp) { return(_fget(td, fd, fpp, 0, 1)); } int fget_read(struct thread *td, int fd, struct file **fpp) { return(_fget(td, fd, fpp, FREAD, 1)); } int fget_write(struct thread *td, int fd, struct file **fpp) { return(_fget(td, fd, fpp, FWRITE, 1)); } /* * Like fget() but loads the underlying vnode, or returns an error if the * descriptor does not represent a vnode. Note that pipes use vnodes but * never have VM objects. The returned vnode will be vref()'d. * * XXX: what about the unused flags ? */ static __inline int _fgetvp(struct thread *td, int fd, struct vnode **vpp, int flags) { struct file *fp; int error; *vpp = NULL; if ((error = _fget(td, fd, &fp, 0, 0)) != 0) return (error); if (fp->f_vnode == NULL) { error = EINVAL; } else { *vpp = fp->f_vnode; vref(*vpp); } FILEDESC_SUNLOCK(td->td_proc->p_fd); return (error); } int fgetvp(struct thread *td, int fd, struct vnode **vpp) { return (_fgetvp(td, fd, vpp, 0)); } int fgetvp_read(struct thread *td, int fd, struct vnode **vpp) { return (_fgetvp(td, fd, vpp, FREAD)); } #ifdef notyet int fgetvp_write(struct thread *td, int fd, struct vnode **vpp) { return (_fgetvp(td, fd, vpp, FWRITE)); } #endif /* * Like fget() but loads the underlying socket, or returns an error if the * descriptor does not represent a socket. * * We bump the ref count on the returned socket. XXX Also obtain the SX lock * in the future. * * XXXRW: fgetsock() and fputsock() are deprecated, as consumers should rely * on their file descriptor reference to prevent the socket from being free'd * during use. */ int fgetsock(struct thread *td, int fd, struct socket **spp, u_int *fflagp) { struct file *fp; int error; *spp = NULL; if (fflagp != NULL) *fflagp = 0; if ((error = _fget(td, fd, &fp, 0, 0)) != 0) return (error); if (fp->f_type != DTYPE_SOCKET) { error = ENOTSOCK; } else { *spp = fp->f_data; if (fflagp) *fflagp = fp->f_flag; SOCK_LOCK(*spp); soref(*spp); SOCK_UNLOCK(*spp); } FILEDESC_SUNLOCK(td->td_proc->p_fd); return (error); } /* * Drop the reference count on the socket and XXX release the SX lock in the * future. The last reference closes the socket. * * XXXRW: fputsock() is deprecated, see comment for fgetsock(). */ void fputsock(struct socket *so) { ACCEPT_LOCK(); SOCK_LOCK(so); sorele(so); } -int -fdrop(struct file *fp, struct thread *td) -{ - - FILE_LOCK(fp); - return (fdrop_locked(fp, td)); -} - /* - * Drop reference on struct file passed in, may call closef if the - * reference hits zero. - * Expects struct file locked, and will unlock it. + * Handle the last reference to a file being closed. */ -static int -fdrop_locked(struct file *fp, struct thread *td) +int +_fdrop(struct file *fp, struct thread *td) { int error; - FILE_LOCK_ASSERT(fp, MA_OWNED); - - if (--fp->f_count > 0) { - FILE_UNLOCK(fp); - return (0); - } - - /* - * We might have just dropped the last reference to a file - * object that is for a UNIX domain socket whose message - * buffers are being examined in unp_gc(). If that is the - * case, FWAIT will be set in f_gcflag and we need to wait for - * unp_gc() to finish its scan. - */ - while (fp->f_gcflag & FWAIT) - msleep(&fp->f_gcflag, fp->f_mtxp, 0, "fpdrop", 0); - - /* We have the last ref so we can proceed without the file lock. 
*/ - FILE_UNLOCK(fp); - if (fp->f_count < 0) - panic("fdrop: count < 0"); + error = 0; + if (fp->f_count != 0) + panic("fdrop: count %d", fp->f_count); if (fp->f_ops != &badfileops) error = fo_close(fp, td); - else - error = 0; - - sx_xlock(&filelist_lock); - LIST_REMOVE(fp, f_list); - openfiles--; - sx_xunlock(&filelist_lock); + atomic_subtract_int(&openfiles, 1); crfree(fp->f_cred); uma_zfree(file_zone, fp); return (error); } /* * Apply an advisory lock on a file descriptor. * * Just attempt to get a record lock of the requested type on the entire file * (l_whence = SEEK_SET, l_start = 0, l_len = 0). */ #ifndef _SYS_SYSPROTO_H_ struct flock_args { int fd; int how; }; #endif /* ARGSUSED */ int flock(struct thread *td, struct flock_args *uap) { struct file *fp; struct vnode *vp; struct flock lf; int vfslocked; int error; if ((error = fget(td, uap->fd, &fp)) != 0) return (error); if (fp->f_type != DTYPE_VNODE) { fdrop(fp, td); return (EOPNOTSUPP); } vp = fp->f_vnode; vfslocked = VFS_LOCK_GIANT(vp->v_mount); lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; if (uap->how & LOCK_UN) { lf.l_type = F_UNLCK; - FILE_LOCK(fp); - fp->f_flag &= ~FHASLOCK; - FILE_UNLOCK(fp); + atomic_clear_int(&fp->f_flag, FHASLOCK); error = VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, F_FLOCK); goto done2; } if (uap->how & LOCK_EX) lf.l_type = F_WRLCK; else if (uap->how & LOCK_SH) lf.l_type = F_RDLCK; else { error = EBADF; goto done2; } - FILE_LOCK(fp); - fp->f_flag |= FHASLOCK; - FILE_UNLOCK(fp); + atomic_set_int(&fp->f_flag, FHASLOCK); error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, (uap->how & LOCK_NB) ? F_FLOCK : F_FLOCK | F_WAIT); done2: fdrop(fp, td); VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * Duplicate the specified descriptor to a free descriptor. */ int dupfdopen(struct thread *td, struct filedesc *fdp, int indx, int dfd, int mode, int error) { struct file *wfp; struct file *fp; /* * If the to-be-dup'd fd number is greater than the allowed number * of file descriptors, or the fd to be dup'd has already been * closed, then reject. */ FILEDESC_XLOCK(fdp); if (dfd < 0 || dfd >= fdp->fd_nfiles || (wfp = fdp->fd_ofiles[dfd]) == NULL) { FILEDESC_XUNLOCK(fdp); return (EBADF); } /* * There are two cases of interest here. * * For ENODEV simply dup (dfd) to file descriptor (indx) and return. * * For ENXIO steal away the file structure from (dfd) and store it in * (indx). (dfd) is effectively closed by this operation. * * Any other error code is just returned. */ switch (error) { case ENODEV: /* * Check that the mode the file is being opened for is a * subset of the mode of the existing descriptor. */ - FILE_LOCK(wfp); if (((mode & (FREAD|FWRITE)) | wfp->f_flag) != wfp->f_flag) { - FILE_UNLOCK(wfp); FILEDESC_XUNLOCK(fdp); return (EACCES); } fp = fdp->fd_ofiles[indx]; fdp->fd_ofiles[indx] = wfp; fdp->fd_ofileflags[indx] = fdp->fd_ofileflags[dfd]; if (fp == NULL) fdused(fdp, indx); - fhold_locked(wfp); - FILE_UNLOCK(wfp); + fhold(wfp); FILEDESC_XUNLOCK(fdp); if (fp != NULL) /* * We now own the reference to fp that the ofiles[] * array used to own. Release it. */ fdrop(fp, td); return (0); case ENXIO: /* * Steal away the file pointer from dfd and stuff it into indx. */ fp = fdp->fd_ofiles[indx]; fdp->fd_ofiles[indx] = fdp->fd_ofiles[dfd]; fdp->fd_ofiles[dfd] = NULL; fdp->fd_ofileflags[indx] = fdp->fd_ofileflags[dfd]; fdp->fd_ofileflags[dfd] = 0; fdunused(fdp, dfd); if (fp == NULL) fdused(fdp, indx); FILEDESC_XUNLOCK(fdp); /* * We now own the reference to fp that the ofiles[] array * used to own. Release it. 
*/ if (fp != NULL) fdrop(fp, td); return (0); default: FILEDESC_XUNLOCK(fdp); return (error); } /* NOTREACHED */ } /* * Scan all active processes to see if any of them have a current or root * directory of `olddp'. If so, replace them with the new mount point. */ void mountcheckdirs(struct vnode *olddp, struct vnode *newdp) { struct filedesc *fdp; struct proc *p; int nrele; if (vrefcnt(olddp) == 1) return; sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { fdp = fdhold(p); if (fdp == NULL) continue; nrele = 0; FILEDESC_XLOCK(fdp); if (fdp->fd_cdir == olddp) { vref(newdp); fdp->fd_cdir = newdp; nrele++; } if (fdp->fd_rdir == olddp) { vref(newdp); fdp->fd_rdir = newdp; nrele++; } FILEDESC_XUNLOCK(fdp); fddrop(fdp); while (nrele--) vrele(olddp); } sx_sunlock(&allproc_lock); if (rootvnode == olddp) { vrele(rootvnode); vref(newdp); rootvnode = newdp; } } struct filedesc_to_leader * filedesc_to_leader_alloc(struct filedesc_to_leader *old, struct filedesc *fdp, struct proc *leader) { struct filedesc_to_leader *fdtol; MALLOC(fdtol, struct filedesc_to_leader *, sizeof(struct filedesc_to_leader), M_FILEDESC_TO_LEADER, M_WAITOK); fdtol->fdl_refcount = 1; fdtol->fdl_holdcount = 0; fdtol->fdl_wakeup = 0; fdtol->fdl_leader = leader; if (old != NULL) { FILEDESC_XLOCK(fdp); fdtol->fdl_next = old->fdl_next; fdtol->fdl_prev = old; old->fdl_next = fdtol; fdtol->fdl_next->fdl_prev = fdtol; FILEDESC_XUNLOCK(fdp); } else { fdtol->fdl_next = fdtol; fdtol->fdl_prev = fdtol; } return (fdtol); } /* * Get file structures globally. */ static int sysctl_kern_file(SYSCTL_HANDLER_ARGS) { struct xfile xf; struct filedesc *fdp; struct file *fp; struct proc *p; int error, n; - /* - * Note: because the number of file descriptors is calculated - * in different ways for sizing vs returning the data, - * there is information leakage from the first loop. However, - * it is of a similar order of magnitude to the leakage from - * global system statistics such as kern.openfiles. - */ error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); if (req->oldptr == NULL) { - n = 16; /* A slight overestimate. */ - sx_slock(&filelist_lock); - LIST_FOREACH(fp, &filehead, f_list) { - /* - * We should grab the lock, but this is an - * estimate, so does it really matter? - */ - /* mtx_lock(fp->f_mtxp); */ - n += fp->f_count; - /* mtx_unlock(f->f_mtxp); */ + n = 0; + sx_slock(&allproc_lock); + FOREACH_PROC_IN_SYSTEM(p) { + if (p->p_state == PRS_NEW) + continue; + fdp = fdhold(p); + if (fdp == NULL) + continue; + /* overestimates sparse tables. 
*/ + n += fdp->fd_lastfile; + fddrop(fdp); } - sx_sunlock(&filelist_lock); + sx_sunlock(&allproc_lock); return (SYSCTL_OUT(req, 0, n * sizeof(xf))); } error = 0; bzero(&xf, sizeof(xf)); xf.xf_size = sizeof(xf); sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { if (p->p_state == PRS_NEW) continue; PROC_LOCK(p); if (p_cansee(req->td, p) != 0) { PROC_UNLOCK(p); continue; } xf.xf_pid = p->p_pid; xf.xf_uid = p->p_ucred->cr_uid; PROC_UNLOCK(p); fdp = fdhold(p); if (fdp == NULL) continue; FILEDESC_SLOCK(fdp); for (n = 0; fdp->fd_refcnt > 0 && n < fdp->fd_nfiles; ++n) { if ((fp = fdp->fd_ofiles[n]) == NULL) continue; xf.xf_fd = n; xf.xf_file = fp; xf.xf_data = fp->f_data; xf.xf_vnode = fp->f_vnode; xf.xf_type = fp->f_type; xf.xf_count = fp->f_count; - xf.xf_msgcount = fp->f_msgcount; + xf.xf_msgcount = 0; xf.xf_offset = fp->f_offset; xf.xf_flag = fp->f_flag; error = SYSCTL_OUT(req, &xf, sizeof(xf)); if (error) break; } FILEDESC_SUNLOCK(fdp); fddrop(fdp); if (error) break; } sx_sunlock(&allproc_lock); return (error); } SYSCTL_PROC(_kern, KERN_FILE, file, CTLTYPE_OPAQUE|CTLFLAG_RD, 0, 0, sysctl_kern_file, "S,xfile", "Entire file table"); /* * Get per-process file descriptors for use by procstat(1), et al. */ static int sysctl_kern_proc_filedesc(SYSCTL_HANDLER_ARGS) { char *fullpath, *freepath; struct kinfo_file *kif; struct filedesc *fdp; int error, i, *name; struct socket *so; struct vnode *vp; struct file *fp; struct proc *p; int vfslocked; name = (int *)arg1; if ((p = pfind((pid_t)name[0])) == NULL) return (ESRCH); if ((error = p_candebug(curthread, p))) { PROC_UNLOCK(p); return (error); } fdp = fdhold(p); PROC_UNLOCK(p); kif = malloc(sizeof(*kif), M_TEMP, M_WAITOK); FILEDESC_SLOCK(fdp); for (i = 0; i < fdp->fd_nfiles; i++) { if ((fp = fdp->fd_ofiles[i]) == NULL) continue; bzero(kif, sizeof(*kif)); kif->kf_structsize = sizeof(*kif); - FILE_LOCK(fp); vp = NULL; so = NULL; kif->kf_fd = i; switch (fp->f_type) { case DTYPE_VNODE: kif->kf_type = KF_TYPE_VNODE; vp = fp->f_vnode; - vref(vp); break; case DTYPE_SOCKET: kif->kf_type = KF_TYPE_SOCKET; so = fp->f_data; break; case DTYPE_PIPE: kif->kf_type = KF_TYPE_PIPE; break; case DTYPE_FIFO: kif->kf_type = KF_TYPE_FIFO; vp = fp->f_vnode; vref(vp); break; case DTYPE_KQUEUE: kif->kf_type = KF_TYPE_KQUEUE; break; case DTYPE_CRYPTO: kif->kf_type = KF_TYPE_CRYPTO; break; case DTYPE_MQUEUE: kif->kf_type = KF_TYPE_MQUEUE; break; default: kif->kf_type = KF_TYPE_UNKNOWN; break; } kif->kf_ref_count = fp->f_count; if (fp->f_flag & FREAD) kif->kf_flags |= KF_FLAG_READ; if (fp->f_flag & FWRITE) kif->kf_flags |= KF_FLAG_WRITE; if (fp->f_flag & FAPPEND) kif->kf_flags |= KF_FLAG_APPEND; if (fp->f_flag & FASYNC) kif->kf_flags |= KF_FLAG_ASYNC; if (fp->f_flag & FFSYNC) kif->kf_flags |= KF_FLAG_FSYNC; if (fp->f_flag & FNONBLOCK) kif->kf_flags |= KF_FLAG_NONBLOCK; if (fp->f_flag & O_DIRECT) kif->kf_flags |= KF_FLAG_DIRECT; if (fp->f_flag & FHASLOCK) kif->kf_flags |= KF_FLAG_HASLOCK; kif->kf_offset = fp->f_offset; - FILE_UNLOCK(fp); if (vp != NULL) { + vref(vp); switch (vp->v_type) { case VNON: kif->kf_vnode_type = KF_VTYPE_VNON; break; case VREG: kif->kf_vnode_type = KF_VTYPE_VREG; break; case VDIR: kif->kf_vnode_type = KF_VTYPE_VDIR; break; case VBLK: kif->kf_vnode_type = KF_VTYPE_VBLK; break; case VCHR: kif->kf_vnode_type = KF_VTYPE_VCHR; break; case VLNK: kif->kf_vnode_type = KF_VTYPE_VLNK; break; case VSOCK: kif->kf_vnode_type = KF_VTYPE_VSOCK; break; case VFIFO: kif->kf_vnode_type = KF_VTYPE_VFIFO; break; case VBAD: kif->kf_vnode_type = KF_VTYPE_VBAD; break; default: 
kif->kf_vnode_type = KF_VTYPE_UNKNOWN; break; } /* * It is OK to drop the filedesc lock here as we will * re-validate and re-evaluate its properties when * the loop continues. */ freepath = NULL; fullpath = "-"; FILEDESC_SUNLOCK(fdp); vfslocked = VFS_LOCK_GIANT(vp->v_mount); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, curthread); vn_fullpath(curthread, vp, &fullpath, &freepath); vput(vp); VFS_UNLOCK_GIANT(vfslocked); strlcpy(kif->kf_path, fullpath, sizeof(kif->kf_path)); if (freepath != NULL) free(freepath, M_TEMP); FILEDESC_SLOCK(fdp); } if (so != NULL) { struct sockaddr *sa; if (so->so_proto->pr_usrreqs->pru_sockaddr(so, &sa) == 0 && sa->sa_len <= sizeof(kif->kf_sa_local)) { bcopy(sa, &kif->kf_sa_local, sa->sa_len); free(sa, M_SONAME); } if (so->so_proto->pr_usrreqs->pru_peeraddr(so, &sa) == 00 && sa->sa_len <= sizeof(kif->kf_sa_peer)) { bcopy(sa, &kif->kf_sa_peer, sa->sa_len); free(sa, M_SONAME); } kif->kf_sock_domain = so->so_proto->pr_domain->dom_family; kif->kf_sock_type = so->so_type; kif->kf_sock_protocol = so->so_proto->pr_protocol; } error = SYSCTL_OUT(req, kif, sizeof(*kif)); if (error) break; } FILEDESC_SUNLOCK(fdp); fddrop(fdp); free(kif, M_TEMP); return (0); } static SYSCTL_NODE(_kern_proc, KERN_PROC_FILEDESC, filedesc, CTLFLAG_RD, sysctl_kern_proc_filedesc, "Process filedesc entries"); #ifdef DDB /* * For the purposes of debugging, generate a human-readable string for the * file type. */ static const char * file_type_to_name(short type) { switch (type) { case 0: return ("zero"); case DTYPE_VNODE: return ("vnod"); case DTYPE_SOCKET: return ("sock"); case DTYPE_PIPE: return ("pipe"); case DTYPE_FIFO: return ("fifo"); case DTYPE_KQUEUE: return ("kque"); case DTYPE_CRYPTO: return ("crpt"); case DTYPE_MQUEUE: return ("mque"); default: return ("unkn"); } } /* * For the purposes of debugging, identify a process (if any, perhaps one of * many) that references the passed file in its file descriptor array. Return * NULL if none. */ static struct proc * file_to_first_proc(struct file *fp) { struct filedesc *fdp; struct proc *p; int n; FOREACH_PROC_IN_SYSTEM(p) { if (p->p_state == PRS_NEW) continue; fdp = p->p_fd; if (fdp == NULL) continue; for (n = 0; n < fdp->fd_nfiles; n++) { if (fp == fdp->fd_ofiles[n]) return (p); } } return (NULL); } static void db_print_file(struct file *fp, int header) { struct proc *p; if (header) db_printf("%8s %4s %8s %8s %4s %5s %6s %8s %5s %12s\n", "File", "Type", "Data", "Flag", "GCFl", "Count", "MCount", "Vnode", "FPID", "FCmd"); p = file_to_first_proc(fp); db_printf("%8p %4s %8p %08x %04x %5d %6d %8p %5d %12s\n", fp, file_type_to_name(fp->f_type), fp->f_data, fp->f_flag, - fp->f_gcflag, fp->f_count, fp->f_msgcount, fp->f_vnode, + 0, fp->f_count, 0, fp->f_vnode, p != NULL ? p->p_pid : -1, p != NULL ? 
p->p_comm : "-"); } DB_SHOW_COMMAND(file, db_show_file) { struct file *fp; if (!have_addr) { db_printf("usage: show file \n"); return; } fp = (struct file *)addr; db_print_file(fp, 1); } DB_SHOW_COMMAND(files, db_show_files) { + struct filedesc *fdp; struct file *fp; + struct proc *p; int header; + int n; header = 1; - LIST_FOREACH(fp, &filehead, f_list) { - db_print_file(fp, header); - header = 0; + FOREACH_PROC_IN_SYSTEM(p) { + if (p->p_state == PRS_NEW) + continue; + if ((fdp = p->p_fd) == NULL) + continue; + for (n = 0; n < fdp->fd_nfiles; ++n) { + if ((fp = fdp->fd_ofiles[n]) == NULL) + continue; + db_print_file(fp, header); + header = 0; + } } } #endif SYSCTL_INT(_kern, KERN_MAXFILESPERPROC, maxfilesperproc, CTLFLAG_RW, &maxfilesperproc, 0, "Maximum files allowed open per process"); SYSCTL_INT(_kern, KERN_MAXFILES, maxfiles, CTLFLAG_RW, &maxfiles, 0, "Maximum number of files"); SYSCTL_INT(_kern, OID_AUTO, openfiles, CTLFLAG_RD, - &openfiles, 0, "System-wide number of open files"); + __DEVOLATILE(int *, &openfiles), 0, "System-wide number of open files"); /* ARGSUSED*/ static void filelistinit(void *dummy) { file_zone = uma_zcreate("Files", sizeof(struct file), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); - sx_init(&filelist_lock, "filelist lock"); mtx_init(&sigio_lock, "sigio lock", NULL, MTX_DEF); mtx_init(&fdesc_mtx, "fdesc", NULL, MTX_DEF); } SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, filelistinit, NULL) /*-------------------------------------------------------------------*/ static int badfo_readwrite(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { return (EBADF); } static int badfo_ioctl(struct file *fp, u_long com, void *data, struct ucred *active_cred, struct thread *td) { return (EBADF); } static int badfo_poll(struct file *fp, int events, struct ucred *active_cred, struct thread *td) { return (0); } static int badfo_kqfilter(struct file *fp, struct knote *kn) { return (EBADF); } static int badfo_stat(struct file *fp, struct stat *sb, struct ucred *active_cred, struct thread *td) { return (EBADF); } static int badfo_close(struct file *fp, struct thread *td) { return (EBADF); } struct fileops badfileops = { .fo_read = badfo_readwrite, .fo_write = badfo_readwrite, .fo_ioctl = badfo_ioctl, .fo_poll = badfo_poll, .fo_kqfilter = badfo_kqfilter, .fo_stat = badfo_stat, .fo_close = badfo_close, }; /*-------------------------------------------------------------------*/ /* * File Descriptor pseudo-device driver (/dev/fd/). * * Opening minor device N dup()s the file (if any) connected to file * descriptor N belonging to the calling process. Note that this driver * consists of only the ``open()'' routine, because all subsequent * references to this file will be direct to the other driver. * * XXX: we could give this one a cloning event handler if necessary. */ /* ARGSUSED */ static int fdopen(struct cdev *dev, int mode, int type, struct thread *td) { /* * XXX Kludge: set curthread->td_dupfd to contain the value of the * the file descriptor being sought for duplication. The error * return ensures that the vnode for this device will be released * by vn_open. Open will detect this special error and take the * actions in dupfdopen below. Other callers of vn_open or VOP_OPEN * will simply report the error. 
*/ td->td_dupfd = dev2unit(dev); return (ENODEV); } static struct cdevsw fildesc_cdevsw = { .d_version = D_VERSION, .d_flags = D_NEEDGIANT, .d_open = fdopen, .d_name = "FD", }; static void fildesc_drvinit(void *unused) { struct cdev *dev; dev = make_dev(&fildesc_cdevsw, 0, UID_ROOT, GID_WHEEL, 0666, "fd/0"); make_dev_alias(dev, "stdin"); dev = make_dev(&fildesc_cdevsw, 1, UID_ROOT, GID_WHEEL, 0666, "fd/1"); make_dev_alias(dev, "stdout"); dev = make_dev(&fildesc_cdevsw, 2, UID_ROOT, GID_WHEEL, 0666, "fd/2"); make_dev_alias(dev, "stderr"); } SYSINIT(fildescdev, SI_SUB_DRIVERS, SI_ORDER_MIDDLE, fildesc_drvinit, NULL) Index: head/sys/kern/kern_event.c =================================================================== --- head/sys/kern/kern_event.c (revision 174987) +++ head/sys/kern/kern_event.c (revision 174988) @@ -1,1965 +1,1953 @@ /*- * Copyright (c) 1999,2000,2001 Jonathan Lemon * Copyright 2004 John-Mark Gurney * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #include static MALLOC_DEFINE(M_KQUEUE, "kqueue", "memory for kqueue system"); /* * This lock is used if multiple kq locks are required. This possibly * should be made into a per proc lock. 
*/ static struct mtx kq_global; MTX_SYSINIT(kq_global, &kq_global, "kqueue order", MTX_DEF); #define KQ_GLOBAL_LOCK(lck, haslck) do { \ if (!haslck) \ mtx_lock(lck); \ haslck = 1; \ } while (0) #define KQ_GLOBAL_UNLOCK(lck, haslck) do { \ if (haslck) \ mtx_unlock(lck); \ haslck = 0; \ } while (0) TASKQUEUE_DEFINE_THREAD(kqueue); static int kevent_copyout(void *arg, struct kevent *kevp, int count); static int kevent_copyin(void *arg, struct kevent *kevp, int count); static int kqueue_register(struct kqueue *kq, struct kevent *kev, struct thread *td, int waitok); static int kqueue_acquire(struct file *fp, struct kqueue **kqp); static void kqueue_release(struct kqueue *kq, int locked); static int kqueue_expand(struct kqueue *kq, struct filterops *fops, uintptr_t ident, int waitok); static void kqueue_task(void *arg, int pending); static int kqueue_scan(struct kqueue *kq, int maxevents, struct kevent_copyops *k_ops, const struct timespec *timeout, struct kevent *keva, struct thread *td); static void kqueue_wakeup(struct kqueue *kq); static struct filterops *kqueue_fo_find(int filt); static void kqueue_fo_release(int filt); static fo_rdwr_t kqueue_read; static fo_rdwr_t kqueue_write; static fo_ioctl_t kqueue_ioctl; static fo_poll_t kqueue_poll; static fo_kqfilter_t kqueue_kqfilter; static fo_stat_t kqueue_stat; static fo_close_t kqueue_close; static struct fileops kqueueops = { .fo_read = kqueue_read, .fo_write = kqueue_write, .fo_ioctl = kqueue_ioctl, .fo_poll = kqueue_poll, .fo_kqfilter = kqueue_kqfilter, .fo_stat = kqueue_stat, .fo_close = kqueue_close, }; static int knote_attach(struct knote *kn, struct kqueue *kq); static void knote_drop(struct knote *kn, struct thread *td); static void knote_enqueue(struct knote *kn); static void knote_dequeue(struct knote *kn); static void knote_init(void); static struct knote *knote_alloc(int waitok); static void knote_free(struct knote *kn); static void filt_kqdetach(struct knote *kn); static int filt_kqueue(struct knote *kn, long hint); static int filt_procattach(struct knote *kn); static void filt_procdetach(struct knote *kn); static int filt_proc(struct knote *kn, long hint); static int filt_fileattach(struct knote *kn); static void filt_timerexpire(void *knx); static int filt_timerattach(struct knote *kn); static void filt_timerdetach(struct knote *kn); static int filt_timer(struct knote *kn, long hint); static struct filterops file_filtops = { 1, filt_fileattach, NULL, NULL }; static struct filterops kqread_filtops = { 1, NULL, filt_kqdetach, filt_kqueue }; /* XXX - move to kern_proc.c? */ static struct filterops proc_filtops = { 0, filt_procattach, filt_procdetach, filt_proc }; static struct filterops timer_filtops = { 0, filt_timerattach, filt_timerdetach, filt_timer }; static uma_zone_t knote_zone; static int kq_ncallouts = 0; static int kq_calloutmax = (4 * 1024); SYSCTL_INT(_kern, OID_AUTO, kq_calloutmax, CTLFLAG_RW, &kq_calloutmax, 0, "Maximum number of callouts allocated for kqueue"); /* XXX - ensure not KN_INFLUX?? 
*/ #define KNOTE_ACTIVATE(kn, islock) do { \ if ((islock)) \ mtx_assert(&(kn)->kn_kq->kq_lock, MA_OWNED); \ else \ KQ_LOCK((kn)->kn_kq); \ (kn)->kn_status |= KN_ACTIVE; \ if (((kn)->kn_status & (KN_QUEUED | KN_DISABLED)) == 0) \ knote_enqueue((kn)); \ if (!(islock)) \ KQ_UNLOCK((kn)->kn_kq); \ } while(0) #define KQ_LOCK(kq) do { \ mtx_lock(&(kq)->kq_lock); \ } while (0) #define KQ_FLUX_WAKEUP(kq) do { \ if (((kq)->kq_state & KQ_FLUXWAIT) == KQ_FLUXWAIT) { \ (kq)->kq_state &= ~KQ_FLUXWAIT; \ wakeup((kq)); \ } \ } while (0) #define KQ_UNLOCK_FLUX(kq) do { \ KQ_FLUX_WAKEUP(kq); \ mtx_unlock(&(kq)->kq_lock); \ } while (0) #define KQ_UNLOCK(kq) do { \ mtx_unlock(&(kq)->kq_lock); \ } while (0) #define KQ_OWNED(kq) do { \ mtx_assert(&(kq)->kq_lock, MA_OWNED); \ } while (0) #define KQ_NOTOWNED(kq) do { \ mtx_assert(&(kq)->kq_lock, MA_NOTOWNED); \ } while (0) #define KN_LIST_LOCK(kn) do { \ if (kn->kn_knlist != NULL) \ kn->kn_knlist->kl_lock(kn->kn_knlist->kl_lockarg); \ } while (0) #define KN_LIST_UNLOCK(kn) do { \ if (kn->kn_knlist != NULL) \ kn->kn_knlist->kl_unlock(kn->kn_knlist->kl_lockarg); \ } while (0) #define KNL_ASSERT_LOCK(knl, islocked) do { \ if (islocked) \ KNL_ASSERT_LOCKED(knl); \ else \ KNL_ASSERT_UNLOCKED(knl); \ } while (0) #ifdef INVARIANTS #define KNL_ASSERT_LOCKED(knl) do { \ if (!knl->kl_locked((knl)->kl_lockarg)) \ panic("knlist not locked, but should be"); \ } while (0) #define KNL_ASSERT_UNLOCKED(knl) do { \ if (knl->kl_locked((knl)->kl_lockarg)) \ panic("knlist locked, but should not be"); \ } while (0) #else /* !INVARIANTS */ #define KNL_ASSERT_LOCKED(knl) do {} while(0) #define KNL_ASSERT_UNLOCKED(knl) do {} while (0) #endif /* INVARIANTS */ #define KN_HASHSIZE 64 /* XXX should be tunable */ #define KN_HASH(val, mask) (((val) ^ (val >> 8)) & (mask)) static int filt_nullattach(struct knote *kn) { return (ENXIO); }; struct filterops null_filtops = { 0, filt_nullattach, NULL, NULL }; /* XXX - make SYSINIT to add these, and move into respective modules. */ extern struct filterops sig_filtops; extern struct filterops fs_filtops; /* * Table for for all system-defined filters. */ static struct mtx filterops_lock; MTX_SYSINIT(kqueue_filterops, &filterops_lock, "protect sysfilt_ops", MTX_DEF); static struct { struct filterops *for_fop; int for_refcnt; } sysfilt_ops[EVFILT_SYSCOUNT] = { { &file_filtops }, /* EVFILT_READ */ { &file_filtops }, /* EVFILT_WRITE */ { &null_filtops }, /* EVFILT_AIO */ { &file_filtops }, /* EVFILT_VNODE */ { &proc_filtops }, /* EVFILT_PROC */ { &sig_filtops }, /* EVFILT_SIGNAL */ { &timer_filtops }, /* EVFILT_TIMER */ { &file_filtops }, /* EVFILT_NETDEV */ { &fs_filtops }, /* EVFILT_FS */ { &null_filtops }, /* EVFILT_LIO */ }; /* * Simple redirection for all cdevsw style objects to call their fo_kqfilter * method. */ static int filt_fileattach(struct knote *kn) { return (fo_kqfilter(kn->kn_fp, kn)); } /*ARGSUSED*/ static int kqueue_kqfilter(struct file *fp, struct knote *kn) { struct kqueue *kq = kn->kn_fp->f_data; if (kn->kn_filter != EVFILT_READ) return (EINVAL); kn->kn_status |= KN_KQUEUE; kn->kn_fop = &kqread_filtops; knlist_add(&kq->kq_sel.si_note, kn, 0); return (0); } static void filt_kqdetach(struct knote *kn) { struct kqueue *kq = kn->kn_fp->f_data; knlist_remove(&kq->kq_sel.si_note, kn, 0); } /*ARGSUSED*/ static int filt_kqueue(struct knote *kn, long hint) { struct kqueue *kq = kn->kn_fp->f_data; kn->kn_data = kq->kq_count; return (kn->kn_data > 0); } /* XXX - move to kern_proc.c? 
*/ static int filt_procattach(struct knote *kn) { struct proc *p; int immediate; int error; immediate = 0; p = pfind(kn->kn_id); if (p == NULL && (kn->kn_sfflags & NOTE_EXIT)) { p = zpfind(kn->kn_id); immediate = 1; } else if (p != NULL && (p->p_flag & P_WEXIT)) { immediate = 1; } if (p == NULL) return (ESRCH); if ((error = p_cansee(curthread, p))) return (error); kn->kn_ptr.p_proc = p; kn->kn_flags |= EV_CLEAR; /* automatically set */ /* * internal flag indicating registration done by kernel */ if (kn->kn_flags & EV_FLAG1) { kn->kn_data = kn->kn_sdata; /* ppid */ kn->kn_fflags = NOTE_CHILD; kn->kn_flags &= ~EV_FLAG1; } if (immediate == 0) knlist_add(&p->p_klist, kn, 1); /* * Immediately activate any exit notes if the target process is a * zombie. This is necessary to handle the case where the target * process, e.g. a child, dies before the kevent is registered. */ if (immediate && filt_proc(kn, NOTE_EXIT)) KNOTE_ACTIVATE(kn, 0); PROC_UNLOCK(p); return (0); } /* * The knote may be attached to a different process, which may exit, * leaving nothing for the knote to be attached to. So when the process * exits, the knote is marked as DETACHED and also flagged as ONESHOT so * it will be deleted when read out. However, as part of the knote deletion, * this routine is called, so a check is needed to avoid actually performing * a detach, because the original process does not exist any more. */ /* XXX - move to kern_proc.c? */ static void filt_procdetach(struct knote *kn) { struct proc *p; p = kn->kn_ptr.p_proc; knlist_remove(&p->p_klist, kn, 0); kn->kn_ptr.p_proc = NULL; } /* XXX - move to kern_proc.c? */ static int filt_proc(struct knote *kn, long hint) { struct proc *p = kn->kn_ptr.p_proc; u_int event; /* * mask off extra data */ event = (u_int)hint & NOTE_PCTRLMASK; /* * if the user is interested in this event, record it. */ if (kn->kn_sfflags & event) kn->kn_fflags |= event; /* * process is gone, so flag the event as finished. */ if (event == NOTE_EXIT) { if (!(kn->kn_status & KN_DETACHED)) knlist_remove_inevent(&p->p_klist, kn); kn->kn_flags |= (EV_EOF | EV_ONESHOT); kn->kn_data = p->p_xstat; kn->kn_ptr.p_proc = NULL; return (1); } /* * process forked, and user wants to track the new process, * so attach a new knote to it, and immediately report an * event with the parent's pid. */ if ((event == NOTE_FORK) && (kn->kn_sfflags & NOTE_TRACK)) { struct kevent kev; int error; /* * register knote with new process. */ kev.ident = hint & NOTE_PDATAMASK; /* pid */ kev.filter = kn->kn_filter; kev.flags = kn->kn_flags | EV_ADD | EV_ENABLE | EV_FLAG1; kev.fflags = kn->kn_sfflags; kev.data = kn->kn_id; /* parent */ kev.udata = kn->kn_kevent.udata; /* preserve udata */ error = kqueue_register(kn->kn_kq, &kev, NULL, 0); if (error) kn->kn_fflags |= NOTE_TRACKERR; } return (kn->kn_fflags != 0); } static int timertoticks(intptr_t data) { struct timeval tv; int tticks; tv.tv_sec = data / 1000; tv.tv_usec = (data % 1000) * 1000; tticks = tvtohz(&tv); return tticks; } /* XXX - move to kern_timeout.c? */ static void filt_timerexpire(void *knx) { struct knote *kn = knx; struct callout *calloutp; kn->kn_data++; KNOTE_ACTIVATE(kn, 0); /* XXX - handle locking */ if ((kn->kn_flags & EV_ONESHOT) != EV_ONESHOT) { calloutp = (struct callout *)kn->kn_hook; callout_reset(calloutp, timertoticks(kn->kn_sdata), filt_timerexpire, kn); } } /* * data contains amount of time to sleep, in milliseconds */ /* XXX - move to kern_timeout.c? 
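 *
 * For reference, a hypothetical userland registration of such a timer
 * (kq and kev are illustrative); data is the period in milliseconds, as
 * noted above:
 *
 *	EV_SET(&kev, 1, EVFILT_TIMER, EV_ADD, 0, 500, NULL);
 *	kevent(kq, &kev, 1, NULL, 0, NULL);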
*/ static int filt_timerattach(struct knote *kn) { struct callout *calloutp; atomic_add_int(&kq_ncallouts, 1); if (kq_ncallouts >= kq_calloutmax) { atomic_add_int(&kq_ncallouts, -1); return (ENOMEM); } kn->kn_flags |= EV_CLEAR; /* automatically set */ kn->kn_status &= ~KN_DETACHED; /* knlist_add usually sets it */ MALLOC(calloutp, struct callout *, sizeof(*calloutp), M_KQUEUE, M_WAITOK); callout_init(calloutp, CALLOUT_MPSAFE); kn->kn_hook = calloutp; callout_reset(calloutp, timertoticks(kn->kn_sdata), filt_timerexpire, kn); return (0); } /* XXX - move to kern_timeout.c? */ static void filt_timerdetach(struct knote *kn) { struct callout *calloutp; calloutp = (struct callout *)kn->kn_hook; callout_drain(calloutp); FREE(calloutp, M_KQUEUE); atomic_add_int(&kq_ncallouts, -1); kn->kn_status |= KN_DETACHED; /* knlist_remove usually clears it */ } /* XXX - move to kern_timeout.c? */ static int filt_timer(struct knote *kn, long hint) { return (kn->kn_data != 0); } int kqueue(struct thread *td, struct kqueue_args *uap) { struct filedesc *fdp; struct kqueue *kq; struct file *fp; int fd, error; fdp = td->td_proc->p_fd; error = falloc(td, &fp, &fd); if (error) goto done2; /* An extra reference on `nfp' has been held for us by falloc(). */ kq = malloc(sizeof *kq, M_KQUEUE, M_WAITOK | M_ZERO); mtx_init(&kq->kq_lock, "kqueue", NULL, MTX_DEF|MTX_DUPOK); TAILQ_INIT(&kq->kq_head); kq->kq_fdp = fdp; knlist_init(&kq->kq_sel.si_note, &kq->kq_lock, NULL, NULL, NULL); TASK_INIT(&kq->kq_task, 0, kqueue_task, kq); FILEDESC_XLOCK(fdp); SLIST_INSERT_HEAD(&fdp->fd_kqlist, kq, kq_list); FILEDESC_XUNLOCK(fdp); - FILE_LOCK(fp); - fp->f_flag = FREAD | FWRITE; - fp->f_type = DTYPE_KQUEUE; - fp->f_data = kq; - fp->f_ops = &kqueueops; - FILE_UNLOCK(fp); + finit(fp, FREAD | FWRITE, DTYPE_KQUEUE, kq, &kqueueops); fdrop(fp, td); td->td_retval[0] = fd; done2: return (error); } #ifndef _SYS_SYSPROTO_H_ struct kevent_args { int fd; const struct kevent *changelist; int nchanges; struct kevent *eventlist; int nevents; const struct timespec *timeout; }; #endif int kevent(struct thread *td, struct kevent_args *uap) { struct timespec ts, *tsp; struct kevent_copyops k_ops = { uap, kevent_copyout, kevent_copyin}; int error; #ifdef KTRACE struct uio ktruio; struct iovec ktriov; struct uio *ktruioin = NULL; struct uio *ktruioout = NULL; #endif if (uap->timeout != NULL) { error = copyin(uap->timeout, &ts, sizeof(ts)); if (error) return (error); tsp = &ts; } else tsp = NULL; #ifdef KTRACE if (KTRPOINT(td, KTR_GENIO)) { ktriov.iov_base = uap->changelist; ktriov.iov_len = uap->nchanges * sizeof(struct kevent); ktruio = (struct uio){ .uio_iov = &ktriov, .uio_iovcnt = 1, .uio_segflg = UIO_USERSPACE, .uio_rw = UIO_READ, .uio_td = td }; ktruioin = cloneuio(&ktruio); ktriov.iov_base = uap->eventlist; ktriov.iov_len = uap->nevents * sizeof(struct kevent); ktruioout = cloneuio(&ktruio); } #endif error = kern_kevent(td, uap->fd, uap->nchanges, uap->nevents, &k_ops, tsp); #ifdef KTRACE if (ktruioin != NULL) { ktruioin->uio_resid = uap->nchanges * sizeof(struct kevent); ktrgenio(uap->fd, UIO_WRITE, ktruioin, 0); ktruioout->uio_resid = td->td_retval[0] * sizeof(struct kevent); ktrgenio(uap->fd, UIO_READ, ktruioout, error); } #endif return (error); } /* * Copy 'count' items into the destination list pointed to by uap->eventlist. 
*/ static int kevent_copyout(void *arg, struct kevent *kevp, int count) { struct kevent_args *uap; int error; KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count)); uap = (struct kevent_args *)arg; error = copyout(kevp, uap->eventlist, count * sizeof *kevp); if (error == 0) uap->eventlist += count; return (error); } /* * Copy 'count' items from the list pointed to by uap->changelist. */ static int kevent_copyin(void *arg, struct kevent *kevp, int count) { struct kevent_args *uap; int error; KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count)); uap = (struct kevent_args *)arg; error = copyin(uap->changelist, kevp, count * sizeof *kevp); if (error == 0) uap->changelist += count; return (error); } int kern_kevent(struct thread *td, int fd, int nchanges, int nevents, struct kevent_copyops *k_ops, const struct timespec *timeout) { struct kevent keva[KQ_NEVENTS]; struct kevent *kevp, *changes; struct kqueue *kq; struct file *fp; int i, n, nerrors, error; if ((error = fget(td, fd, &fp)) != 0) return (error); if ((error = kqueue_acquire(fp, &kq)) != 0) goto done_norel; nerrors = 0; while (nchanges > 0) { n = nchanges > KQ_NEVENTS ? KQ_NEVENTS : nchanges; error = k_ops->k_copyin(k_ops->arg, keva, n); if (error) goto done; changes = keva; for (i = 0; i < n; i++) { kevp = &changes[i]; if (!kevp->filter) continue; kevp->flags &= ~EV_SYSFLAGS; error = kqueue_register(kq, kevp, td, 1); if (error) { if (nevents != 0) { kevp->flags = EV_ERROR; kevp->data = error; (void) k_ops->k_copyout(k_ops->arg, kevp, 1); nevents--; nerrors++; } else { goto done; } } } nchanges -= n; } if (nerrors) { td->td_retval[0] = nerrors; error = 0; goto done; } error = kqueue_scan(kq, nevents, k_ops, timeout, keva, td); done: kqueue_release(kq, 0); done_norel: fdrop(fp, td); return (error); } int kqueue_add_filteropts(int filt, struct filterops *filtops) { int error; if (filt > 0 || filt + EVFILT_SYSCOUNT < 0) { printf( "trying to add a filterop that is out of range: %d is beyond %d\n", ~filt, EVFILT_SYSCOUNT); return EINVAL; } mtx_lock(&filterops_lock); if (sysfilt_ops[~filt].for_fop != &null_filtops && sysfilt_ops[~filt].for_fop != NULL) error = EEXIST; else { sysfilt_ops[~filt].for_fop = filtops; sysfilt_ops[~filt].for_refcnt = 0; } mtx_unlock(&filterops_lock); return (0); } int kqueue_del_filteropts(int filt) { int error; error = 0; if (filt > 0 || filt + EVFILT_SYSCOUNT < 0) return EINVAL; mtx_lock(&filterops_lock); if (sysfilt_ops[~filt].for_fop == &null_filtops || sysfilt_ops[~filt].for_fop == NULL) error = EINVAL; else if (sysfilt_ops[~filt].for_refcnt != 0) error = EBUSY; else { sysfilt_ops[~filt].for_fop = &null_filtops; sysfilt_ops[~filt].for_refcnt = 0; } mtx_unlock(&filterops_lock); return error; } static struct filterops * kqueue_fo_find(int filt) { if (filt > 0 || filt + EVFILT_SYSCOUNT < 0) return NULL; mtx_lock(&filterops_lock); sysfilt_ops[~filt].for_refcnt++; if (sysfilt_ops[~filt].for_fop == NULL) sysfilt_ops[~filt].for_fop = &null_filtops; mtx_unlock(&filterops_lock); return sysfilt_ops[~filt].for_fop; } static void kqueue_fo_release(int filt) { if (filt > 0 || filt + EVFILT_SYSCOUNT < 0) return; mtx_lock(&filterops_lock); KASSERT(sysfilt_ops[~filt].for_refcnt > 0, ("filter object refcount not valid on release")); sysfilt_ops[~filt].for_refcnt--; mtx_unlock(&filterops_lock); } /* * A ref to kq (obtained via kqueue_acquire) must be held. waitok will * influence if memory allocation should wait. Make sure it is 0 if you * hold any mutexes. 
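Note how kern_kevent() above reports a failed registration: as long as room remains in the eventlist, the offending change is copied back out with EV_ERROR set and the errno in data, and the syscall itself still succeeds. A small sketch of checking for that from userland (the stale descriptor number is illustrative):

#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>
#include <err.h>
#include <stdio.h>
#include <string.h>

int
main(void)
{
        struct kevent change, ev;
        struct timespec zero = { 0, 0 };
        int kq, n;

        if ((kq = kqueue()) == -1)
                err(1, "kqueue");
        /* Deliberately register a descriptor that is not open. */
        EV_SET(&change, 12345, EVFILT_READ, EV_ADD, 0, 0, NULL);
        n = kevent(kq, &change, 1, &ev, 1, &zero);
        if (n == -1)
                err(1, "kevent");
        if (n > 0 && (ev.flags & EV_ERROR) && ev.data != 0)
                printf("registration failed: %s\n", strerror((int)ev.data));
        return (0);
}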
*/ static int kqueue_register(struct kqueue *kq, struct kevent *kev, struct thread *td, int waitok) { struct filterops *fops; struct file *fp; struct knote *kn, *tkn; int error, filt, event; int haskqglobal; fp = NULL; kn = NULL; error = 0; haskqglobal = 0; filt = kev->filter; fops = kqueue_fo_find(filt); if (fops == NULL) return EINVAL; tkn = knote_alloc(waitok); /* prevent waiting with locks */ findkn: if (fops->f_isfd) { KASSERT(td != NULL, ("td is NULL")); error = fget(td, kev->ident, &fp); if (error) goto done; if ((kev->flags & EV_ADD) == EV_ADD && kqueue_expand(kq, fops, kev->ident, 0) != 0) { /* try again */ fdrop(fp, td); fp = NULL; error = kqueue_expand(kq, fops, kev->ident, waitok); if (error) goto done; goto findkn; } if (fp->f_type == DTYPE_KQUEUE) { /* * if we add some inteligence about what we are doing, * we should be able to support events on ourselves. * We need to know when we are doing this to prevent * getting both the knlist lock and the kq lock since * they are the same thing. */ if (fp->f_data == kq) { error = EINVAL; goto done; } KQ_GLOBAL_LOCK(&kq_global, haskqglobal); } KQ_LOCK(kq); if (kev->ident < kq->kq_knlistsize) { SLIST_FOREACH(kn, &kq->kq_knlist[kev->ident], kn_link) if (kev->filter == kn->kn_filter) break; } } else { if ((kev->flags & EV_ADD) == EV_ADD) kqueue_expand(kq, fops, kev->ident, waitok); KQ_LOCK(kq); if (kq->kq_knhashmask != 0) { struct klist *list; list = &kq->kq_knhash[ KN_HASH((u_long)kev->ident, kq->kq_knhashmask)]; SLIST_FOREACH(kn, list, kn_link) if (kev->ident == kn->kn_id && kev->filter == kn->kn_filter) break; } } /* knote is in the process of changing, wait for it to stablize. */ if (kn != NULL && (kn->kn_status & KN_INFLUX) == KN_INFLUX) { if (fp != NULL) { fdrop(fp, td); fp = NULL; } KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal); kq->kq_state |= KQ_FLUXWAIT; msleep(kq, &kq->kq_lock, PSOCK | PDROP, "kqflxwt", 0); goto findkn; } if (kn == NULL && ((kev->flags & EV_ADD) == 0)) { KQ_UNLOCK(kq); error = ENOENT; goto done; } /* * kn now contains the matching knote, or NULL if no match */ if (kev->flags & EV_ADD) { if (kn == NULL) { kn = tkn; tkn = NULL; if (kn == NULL) { KQ_UNLOCK(kq); error = ENOMEM; goto done; } kn->kn_fp = fp; kn->kn_kq = kq; kn->kn_fop = fops; /* * apply reference counts to knote structure, and * do not release it at the end of this routine. */ fops = NULL; fp = NULL; kn->kn_sfflags = kev->fflags; kn->kn_sdata = kev->data; kev->fflags = 0; kev->data = 0; kn->kn_kevent = *kev; kn->kn_kevent.flags &= ~(EV_ADD | EV_DELETE | EV_ENABLE | EV_DISABLE); kn->kn_status = KN_INFLUX|KN_DETACHED; error = knote_attach(kn, kq); KQ_UNLOCK(kq); if (error != 0) { tkn = kn; goto done; } if ((error = kn->kn_fop->f_attach(kn)) != 0) { knote_drop(kn, td); goto done; } KN_LIST_LOCK(kn); } else { /* * The user may change some filter values after the * initial EV_ADD, but doing so will not reset any * filter which has already been triggered. */ kn->kn_status |= KN_INFLUX; KQ_UNLOCK(kq); KN_LIST_LOCK(kn); kn->kn_sfflags = kev->fflags; kn->kn_sdata = kev->data; kn->kn_kevent.udata = kev->udata; } /* * We can get here with kn->kn_knlist == NULL. * This can happen when the initial attach event decides that * the event is "completed" already. i.e. filt_procattach * is called on a zombie process. It will call filt_proc * which will remove it from the list, and NULL kn_knlist. 
*/ event = kn->kn_fop->f_event(kn, 0); KQ_LOCK(kq); if (event) KNOTE_ACTIVATE(kn, 1); kn->kn_status &= ~KN_INFLUX; KN_LIST_UNLOCK(kn); } else if (kev->flags & EV_DELETE) { kn->kn_status |= KN_INFLUX; KQ_UNLOCK(kq); if (!(kn->kn_status & KN_DETACHED)) kn->kn_fop->f_detach(kn); knote_drop(kn, td); goto done; } if ((kev->flags & EV_DISABLE) && ((kn->kn_status & KN_DISABLED) == 0)) { kn->kn_status |= KN_DISABLED; } if ((kev->flags & EV_ENABLE) && (kn->kn_status & KN_DISABLED)) { kn->kn_status &= ~KN_DISABLED; if ((kn->kn_status & KN_ACTIVE) && ((kn->kn_status & KN_QUEUED) == 0)) knote_enqueue(kn); } KQ_UNLOCK_FLUX(kq); done: KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal); if (fp != NULL) fdrop(fp, td); if (tkn != NULL) knote_free(tkn); if (fops != NULL) kqueue_fo_release(filt); return (error); } static int kqueue_acquire(struct file *fp, struct kqueue **kqp) { int error; struct kqueue *kq; error = 0; - FILE_LOCK(fp); - do { - kq = fp->f_data; - if (fp->f_type != DTYPE_KQUEUE || kq == NULL) { - error = EBADF; - break; - } - *kqp = kq; - KQ_LOCK(kq); - if ((kq->kq_state & KQ_CLOSING) == KQ_CLOSING) { - KQ_UNLOCK(kq); - error = EBADF; - break; - } - kq->kq_refcnt++; + kq = fp->f_data; + if (fp->f_type != DTYPE_KQUEUE || kq == NULL) + return (EBADF); + *kqp = kq; + KQ_LOCK(kq); + if ((kq->kq_state & KQ_CLOSING) == KQ_CLOSING) { KQ_UNLOCK(kq); - } while (0); - FILE_UNLOCK(fp); + return (EBADF); + } + kq->kq_refcnt++; + KQ_UNLOCK(kq); return error; } static void kqueue_release(struct kqueue *kq, int locked) { if (locked) KQ_OWNED(kq); else KQ_LOCK(kq); kq->kq_refcnt--; if (kq->kq_refcnt == 1) wakeup(&kq->kq_refcnt); if (!locked) KQ_UNLOCK(kq); } static void kqueue_schedtask(struct kqueue *kq) { KQ_OWNED(kq); KASSERT(((kq->kq_state & KQ_TASKDRAIN) != KQ_TASKDRAIN), ("scheduling kqueue task while draining")); if ((kq->kq_state & KQ_TASKSCHED) != KQ_TASKSCHED) { taskqueue_enqueue(taskqueue_kqueue, &kq->kq_task); kq->kq_state |= KQ_TASKSCHED; } } /* * Expand the kq to make sure we have storage for fops/ident pair. * * Return 0 on success (or no work necessary), return errno on failure. * * Not calling hashinit w/ waitok (proper malloc flag) should be safe. * If kqueue_register is called from a non-fd context, there usually/should * be no locks held. */ static int kqueue_expand(struct kqueue *kq, struct filterops *fops, uintptr_t ident, int waitok) { struct klist *list, *tmp_knhash; u_long tmp_knhashmask; int size; int fd; int mflag = waitok ? 
M_WAITOK : M_NOWAIT; KQ_NOTOWNED(kq); if (fops->f_isfd) { fd = ident; if (kq->kq_knlistsize <= fd) { size = kq->kq_knlistsize; while (size <= fd) size += KQEXTENT; MALLOC(list, struct klist *, size * sizeof list, M_KQUEUE, mflag); if (list == NULL) return ENOMEM; KQ_LOCK(kq); if (kq->kq_knlistsize > fd) { FREE(list, M_KQUEUE); list = NULL; } else { if (kq->kq_knlist != NULL) { bcopy(kq->kq_knlist, list, kq->kq_knlistsize * sizeof list); FREE(kq->kq_knlist, M_KQUEUE); kq->kq_knlist = NULL; } bzero((caddr_t)list + kq->kq_knlistsize * sizeof list, (size - kq->kq_knlistsize) * sizeof list); kq->kq_knlistsize = size; kq->kq_knlist = list; } KQ_UNLOCK(kq); } } else { if (kq->kq_knhashmask == 0) { tmp_knhash = hashinit(KN_HASHSIZE, M_KQUEUE, &tmp_knhashmask); if (tmp_knhash == NULL) return ENOMEM; KQ_LOCK(kq); if (kq->kq_knhashmask == 0) { kq->kq_knhash = tmp_knhash; kq->kq_knhashmask = tmp_knhashmask; } else { free(tmp_knhash, M_KQUEUE); } KQ_UNLOCK(kq); } } KQ_NOTOWNED(kq); return 0; } static void kqueue_task(void *arg, int pending) { struct kqueue *kq; int haskqglobal; haskqglobal = 0; kq = arg; KQ_GLOBAL_LOCK(&kq_global, haskqglobal); KQ_LOCK(kq); KNOTE_LOCKED(&kq->kq_sel.si_note, 0); kq->kq_state &= ~KQ_TASKSCHED; if ((kq->kq_state & KQ_TASKDRAIN) == KQ_TASKDRAIN) { wakeup(&kq->kq_state); } KQ_UNLOCK(kq); KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal); } /* * Scan, update kn_data (if not ONESHOT), and copyout triggered events. * We treat KN_MARKER knotes as if they are INFLUX. */ static int kqueue_scan(struct kqueue *kq, int maxevents, struct kevent_copyops *k_ops, const struct timespec *tsp, struct kevent *keva, struct thread *td) { struct kevent *kevp; struct timeval atv, rtv, ttv; struct knote *kn, *marker; int count, timeout, nkev, error; int haskqglobal; count = maxevents; nkev = 0; error = 0; haskqglobal = 0; if (maxevents == 0) goto done_nl; if (tsp != NULL) { TIMESPEC_TO_TIMEVAL(&atv, tsp); if (itimerfix(&atv)) { error = EINVAL; goto done_nl; } if (tsp->tv_sec == 0 && tsp->tv_nsec == 0) timeout = -1; else timeout = atv.tv_sec > 24 * 60 * 60 ? 24 * 60 * 60 * hz : tvtohz(&atv); getmicrouptime(&rtv); timevaladd(&atv, &rtv); } else { atv.tv_sec = 0; atv.tv_usec = 0; timeout = 0; } marker = knote_alloc(1); if (marker == NULL) { error = ENOMEM; goto done_nl; } marker->kn_status = KN_MARKER; KQ_LOCK(kq); goto start; retry: if (atv.tv_sec || atv.tv_usec) { getmicrouptime(&rtv); if (timevalcmp(&rtv, &atv, >=)) goto done; ttv = atv; timevalsub(&ttv, &rtv); timeout = ttv.tv_sec > 24 * 60 * 60 ? 24 * 60 * 60 * hz : tvtohz(&ttv); } start: kevp = keva; if (kq->kq_count == 0) { if (timeout < 0) { error = EWOULDBLOCK; } else { kq->kq_state |= KQ_SLEEP; error = msleep(kq, &kq->kq_lock, PSOCK | PCATCH, "kqread", timeout); } if (error == 0) goto retry; /* don't restart after signals... 
*/ if (error == ERESTART) error = EINTR; else if (error == EWOULDBLOCK) error = 0; goto done; } TAILQ_INSERT_TAIL(&kq->kq_head, marker, kn_tqe); while (count) { KQ_OWNED(kq); kn = TAILQ_FIRST(&kq->kq_head); if ((kn->kn_status == KN_MARKER && kn != marker) || (kn->kn_status & KN_INFLUX) == KN_INFLUX) { kq->kq_state |= KQ_FLUXWAIT; error = msleep(kq, &kq->kq_lock, PSOCK, "kqflxwt", 0); continue; } TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe); if ((kn->kn_status & KN_DISABLED) == KN_DISABLED) { kn->kn_status &= ~KN_QUEUED; kq->kq_count--; continue; } if (kn == marker) { KQ_FLUX_WAKEUP(kq); if (count == maxevents) goto retry; goto done; } KASSERT((kn->kn_status & KN_INFLUX) == 0, ("KN_INFLUX set when not suppose to be")); if ((kn->kn_flags & EV_ONESHOT) == EV_ONESHOT) { kn->kn_status &= ~KN_QUEUED; kn->kn_status |= KN_INFLUX; kq->kq_count--; KQ_UNLOCK(kq); /* * We don't need to lock the list since we've marked * it _INFLUX. */ *kevp = kn->kn_kevent; if (!(kn->kn_status & KN_DETACHED)) kn->kn_fop->f_detach(kn); knote_drop(kn, td); KQ_LOCK(kq); kn = NULL; } else { kn->kn_status |= KN_INFLUX; KQ_UNLOCK(kq); if ((kn->kn_status & KN_KQUEUE) == KN_KQUEUE) KQ_GLOBAL_LOCK(&kq_global, haskqglobal); KN_LIST_LOCK(kn); if (kn->kn_fop->f_event(kn, 0) == 0) { KQ_LOCK(kq); KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal); kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE | KN_INFLUX); kq->kq_count--; KN_LIST_UNLOCK(kn); continue; } *kevp = kn->kn_kevent; KQ_LOCK(kq); KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal); if (kn->kn_flags & EV_CLEAR) { kn->kn_data = 0; kn->kn_fflags = 0; kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE); kq->kq_count--; } else TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe); kn->kn_status &= ~(KN_INFLUX); KN_LIST_UNLOCK(kn); } /* we are returning a copy to the user */ kevp++; nkev++; count--; if (nkev == KQ_NEVENTS) { KQ_UNLOCK_FLUX(kq); error = k_ops->k_copyout(k_ops->arg, keva, nkev); nkev = 0; kevp = keva; KQ_LOCK(kq); if (error) break; } } TAILQ_REMOVE(&kq->kq_head, marker, kn_tqe); done: KQ_OWNED(kq); KQ_UNLOCK_FLUX(kq); knote_free(marker); done_nl: KQ_NOTOWNED(kq); if (nkev != 0) error = k_ops->k_copyout(k_ops->arg, keva, nkev); td->td_retval[0] = maxevents - count; return (error); } /* * XXX * This could be expanded to call kqueue_scan, if desired. */ /*ARGSUSED*/ static int kqueue_read(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { return (ENXIO); } /*ARGSUSED*/ static int kqueue_write(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { return (ENXIO); } /*ARGSUSED*/ static int kqueue_ioctl(struct file *fp, u_long cmd, void *data, struct ucred *active_cred, struct thread *td) { /* * Enabling sigio causes two major problems: * 1) infinite recursion: * Synopsys: kevent is being used to track signals and have FIOASYNC * set. On receipt of a signal this will cause a kqueue to recurse * into itself over and over. Sending the sigio causes the kqueue * to become ready, which in turn posts sigio again, forever. * Solution: this can be solved by setting a flag in the kqueue that * we have a SIGIO in progress. * 2) locking problems: * Synopsys: Kqueue is a leaf subsystem, but adding signalling puts * us above the proc and pgrp locks. * Solution: Post a signal using an async mechanism, being sure to * record a generation count in the delivery so that we do not deliver * a signal to the wrong process. * * Note, these two mechanisms are somewhat mutually exclusive! 
*/ #if 0 struct kqueue *kq; kq = fp->f_data; switch (cmd) { case FIOASYNC: if (*(int *)data) { kq->kq_state |= KQ_ASYNC; } else { kq->kq_state &= ~KQ_ASYNC; } return (0); case FIOSETOWN: return (fsetown(*(int *)data, &kq->kq_sigio)); case FIOGETOWN: *(int *)data = fgetown(&kq->kq_sigio); return (0); } #endif return (ENOTTY); } /*ARGSUSED*/ static int kqueue_poll(struct file *fp, int events, struct ucred *active_cred, struct thread *td) { struct kqueue *kq; int revents = 0; int error; if ((error = kqueue_acquire(fp, &kq))) return POLLERR; KQ_LOCK(kq); if (events & (POLLIN | POLLRDNORM)) { if (kq->kq_count) { revents |= events & (POLLIN | POLLRDNORM); } else { selrecord(td, &kq->kq_sel); if (SEL_WAITING(&kq->kq_sel)) kq->kq_state |= KQ_SEL; } } kqueue_release(kq, 1); KQ_UNLOCK(kq); return (revents); } /*ARGSUSED*/ static int kqueue_stat(struct file *fp, struct stat *st, struct ucred *active_cred, struct thread *td) { bzero((void *)st, sizeof *st); /* * We no longer return kq_count because the unlocked value is useless. * If you spent all this time getting the count, why not spend your * syscall better by calling kevent? * * XXX - This is needed for libc_r. */ st->st_mode = S_IFIFO; return (0); } /*ARGSUSED*/ static int kqueue_close(struct file *fp, struct thread *td) { struct kqueue *kq = fp->f_data; struct filedesc *fdp; struct knote *kn; int i; int error; if ((error = kqueue_acquire(fp, &kq))) return error; KQ_LOCK(kq); KASSERT((kq->kq_state & KQ_CLOSING) != KQ_CLOSING, ("kqueue already closing")); kq->kq_state |= KQ_CLOSING; if (kq->kq_refcnt > 1) msleep(&kq->kq_refcnt, &kq->kq_lock, PSOCK, "kqclose", 0); KASSERT(kq->kq_refcnt == 1, ("other refs are out there!")); fdp = kq->kq_fdp; KASSERT(knlist_empty(&kq->kq_sel.si_note), ("kqueue's knlist not empty")); for (i = 0; i < kq->kq_knlistsize; i++) { while ((kn = SLIST_FIRST(&kq->kq_knlist[i])) != NULL) { KASSERT((kn->kn_status & KN_INFLUX) == 0, ("KN_INFLUX set when not suppose to be")); kn->kn_status |= KN_INFLUX; KQ_UNLOCK(kq); if (!(kn->kn_status & KN_DETACHED)) kn->kn_fop->f_detach(kn); knote_drop(kn, td); KQ_LOCK(kq); } } if (kq->kq_knhashmask != 0) { for (i = 0; i <= kq->kq_knhashmask; i++) { while ((kn = SLIST_FIRST(&kq->kq_knhash[i])) != NULL) { KASSERT((kn->kn_status & KN_INFLUX) == 0, ("KN_INFLUX set when not suppose to be")); kn->kn_status |= KN_INFLUX; KQ_UNLOCK(kq); if (!(kn->kn_status & KN_DETACHED)) kn->kn_fop->f_detach(kn); knote_drop(kn, td); KQ_LOCK(kq); } } } if ((kq->kq_state & KQ_TASKSCHED) == KQ_TASKSCHED) { kq->kq_state |= KQ_TASKDRAIN; msleep(&kq->kq_state, &kq->kq_lock, PSOCK, "kqtqdr", 0); } if ((kq->kq_state & KQ_SEL) == KQ_SEL) { selwakeuppri(&kq->kq_sel, PSOCK); if (!SEL_WAITING(&kq->kq_sel)) kq->kq_state &= ~KQ_SEL; } KQ_UNLOCK(kq); FILEDESC_XLOCK(fdp); SLIST_REMOVE(&fdp->fd_kqlist, kq, kqueue, kq_list); FILEDESC_XUNLOCK(fdp); knlist_destroy(&kq->kq_sel.si_note); mtx_destroy(&kq->kq_lock); kq->kq_fdp = NULL; if (kq->kq_knhash != NULL) free(kq->kq_knhash, M_KQUEUE); if (kq->kq_knlist != NULL) free(kq->kq_knlist, M_KQUEUE); funsetown(&kq->kq_sigio); free(kq, M_KQUEUE); fp->f_data = NULL; return (0); } static void kqueue_wakeup(struct kqueue *kq) { KQ_OWNED(kq); if ((kq->kq_state & KQ_SLEEP) == KQ_SLEEP) { kq->kq_state &= ~KQ_SLEEP; wakeup(kq); } if ((kq->kq_state & KQ_SEL) == KQ_SEL) { selwakeuppri(&kq->kq_sel, PSOCK); if (!SEL_WAITING(&kq->kq_sel)) kq->kq_state &= ~KQ_SEL; } if (!knlist_empty(&kq->kq_sel.si_note)) kqueue_schedtask(kq); if ((kq->kq_state & KQ_ASYNC) == KQ_ASYNC) { pgsigio(&kq->kq_sigio, SIGIO, 0); 
} } /* * Walk down a list of knotes, activating them if their event has triggered. * * There is a possibility to optimize in the case of one kq watching another. * Instead of scheduling a task to wake it up, you could pass enough state * down the chain to make up the parent kqueue. Make this code functional * first. */ void knote(struct knlist *list, long hint, int islocked) { struct kqueue *kq; struct knote *kn; if (list == NULL) return; KNL_ASSERT_LOCK(list, islocked); if (!islocked) list->kl_lock(list->kl_lockarg); /* * If we unlock the list lock (and set KN_INFLUX), we can eliminate * the kqueue scheduling, but this will introduce four * lock/unlock's for each knote to test. If we do, continue to use * SLIST_FOREACH, SLIST_FOREACH_SAFE is not safe in our case, it is * only safe if you want to remove the current item, which we are * not doing. */ SLIST_FOREACH(kn, &list->kl_list, kn_selnext) { kq = kn->kn_kq; if ((kn->kn_status & KN_INFLUX) != KN_INFLUX) { KQ_LOCK(kq); if ((kn->kn_status & KN_INFLUX) != KN_INFLUX) { kn->kn_status |= KN_HASKQLOCK; if (kn->kn_fop->f_event(kn, hint)) KNOTE_ACTIVATE(kn, 1); kn->kn_status &= ~KN_HASKQLOCK; } KQ_UNLOCK(kq); } kq = NULL; } if (!islocked) list->kl_unlock(list->kl_lockarg); } /* * add a knote to a knlist */ void knlist_add(struct knlist *knl, struct knote *kn, int islocked) { KNL_ASSERT_LOCK(knl, islocked); KQ_NOTOWNED(kn->kn_kq); KASSERT((kn->kn_status & (KN_INFLUX|KN_DETACHED)) == (KN_INFLUX|KN_DETACHED), ("knote not KN_INFLUX and KN_DETACHED")); if (!islocked) knl->kl_lock(knl->kl_lockarg); SLIST_INSERT_HEAD(&knl->kl_list, kn, kn_selnext); if (!islocked) knl->kl_unlock(knl->kl_lockarg); KQ_LOCK(kn->kn_kq); kn->kn_knlist = knl; kn->kn_status &= ~KN_DETACHED; KQ_UNLOCK(kn->kn_kq); } static void knlist_remove_kq(struct knlist *knl, struct knote *kn, int knlislocked, int kqislocked) { KASSERT(!(!!kqislocked && !knlislocked), ("kq locked w/o knl locked")); KNL_ASSERT_LOCK(knl, knlislocked); mtx_assert(&kn->kn_kq->kq_lock, kqislocked ? MA_OWNED : MA_NOTOWNED); if (!kqislocked) KASSERT((kn->kn_status & (KN_INFLUX|KN_DETACHED)) == KN_INFLUX, ("knlist_remove called w/o knote being KN_INFLUX or already removed")); if (!knlislocked) knl->kl_lock(knl->kl_lockarg); SLIST_REMOVE(&knl->kl_list, kn, knote, kn_selnext); kn->kn_knlist = NULL; if (!knlislocked) knl->kl_unlock(knl->kl_lockarg); if (!kqislocked) KQ_LOCK(kn->kn_kq); kn->kn_status |= KN_DETACHED; if (!kqislocked) KQ_UNLOCK(kn->kn_kq); } /* * remove all knotes from a specified klist */ void knlist_remove(struct knlist *knl, struct knote *kn, int islocked) { knlist_remove_kq(knl, kn, islocked, 0); } /* * remove knote from a specified klist while in f_event handler. 
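The knlist_add()/knote() pair above is the interface the rest of the kernel uses to publish events. The following is a schematic driver-style read filter, not taken from this change, modeled on the pipe filterops later in this diff; the mydev_softc layout and names are hypothetical:

#include <sys/param.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/event.h>

/* Hypothetical per-device state. */
struct mydev_softc {
        struct mtx      sc_mtx;
        struct knlist   sc_rsel;        /* knotes interested in reads */
        size_t          sc_avail;       /* bytes ready to read */
};

static void
mydev_filt_detach(struct knote *kn)
{
        struct mydev_softc *sc = kn->kn_hook;

        knlist_remove(&sc->sc_rsel, kn, 0);
}

static int
mydev_filt_read(struct knote *kn, long hint)
{
        struct mydev_softc *sc = kn->kn_hook;

        /* Called with the knlist lock (sc_mtx) held. */
        kn->kn_data = sc->sc_avail;
        return (kn->kn_data > 0);
}

static struct filterops mydev_rfiltops =
        { 1, NULL, mydev_filt_detach, mydev_filt_read };

In such a scheme the driver would call knlist_init(&sc->sc_rsel, &sc->sc_mtx, NULL, NULL, NULL) at attach time, knlist_add(&sc->sc_rsel, kn, 0) from its kqfilter method after setting kn_fop and kn_hook, and knote(&sc->sc_rsel, 0, 1) with sc_mtx held whenever sc_avail grows, which is exactly the walk that knote() above performs.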
*/ void knlist_remove_inevent(struct knlist *knl, struct knote *kn) { knlist_remove_kq(knl, kn, 1, (kn->kn_status & KN_HASKQLOCK) == KN_HASKQLOCK); } int knlist_empty(struct knlist *knl) { KNL_ASSERT_LOCKED(knl); return SLIST_EMPTY(&knl->kl_list); } static struct mtx knlist_lock; MTX_SYSINIT(knlist_lock, &knlist_lock, "knlist lock for lockless objects", MTX_DEF); static void knlist_mtx_lock(void *arg); static void knlist_mtx_unlock(void *arg); static int knlist_mtx_locked(void *arg); static void knlist_mtx_lock(void *arg) { mtx_lock((struct mtx *)arg); } static void knlist_mtx_unlock(void *arg) { mtx_unlock((struct mtx *)arg); } static int knlist_mtx_locked(void *arg) { return (mtx_owned((struct mtx *)arg)); } void knlist_init(struct knlist *knl, void *lock, void (*kl_lock)(void *), void (*kl_unlock)(void *), int (*kl_locked)(void *)) { if (lock == NULL) knl->kl_lockarg = &knlist_lock; else knl->kl_lockarg = lock; if (kl_lock == NULL) knl->kl_lock = knlist_mtx_lock; else knl->kl_lock = kl_lock; if (kl_unlock == NULL) knl->kl_unlock = knlist_mtx_unlock; else knl->kl_unlock = kl_unlock; if (kl_locked == NULL) knl->kl_locked = knlist_mtx_locked; else knl->kl_locked = kl_locked; SLIST_INIT(&knl->kl_list); } void knlist_destroy(struct knlist *knl) { #ifdef INVARIANTS /* * if we run across this error, we need to find the offending * driver and have it call knlist_clear. */ if (!SLIST_EMPTY(&knl->kl_list)) printf("WARNING: destroying knlist w/ knotes on it!\n"); #endif knl->kl_lockarg = knl->kl_lock = knl->kl_unlock = NULL; SLIST_INIT(&knl->kl_list); } /* * Even if we are locked, we may need to drop the lock to allow any influx * knotes time to "settle". */ void knlist_cleardel(struct knlist *knl, struct thread *td, int islocked, int killkn) { struct knote *kn, *kn2; struct kqueue *kq; if (islocked) KNL_ASSERT_LOCKED(knl); else { KNL_ASSERT_UNLOCKED(knl); again: /* need to reacquire lock since we have dropped it */ knl->kl_lock(knl->kl_lockarg); } SLIST_FOREACH_SAFE(kn, &knl->kl_list, kn_selnext, kn2) { kq = kn->kn_kq; KQ_LOCK(kq); if ((kn->kn_status & KN_INFLUX)) { KQ_UNLOCK(kq); continue; } knlist_remove_kq(knl, kn, 1, 1); if (killkn) { kn->kn_status |= KN_INFLUX | KN_DETACHED; KQ_UNLOCK(kq); knote_drop(kn, td); } else { /* Make sure cleared knotes disappear soon */ kn->kn_flags |= (EV_EOF | EV_ONESHOT); KQ_UNLOCK(kq); } kq = NULL; } if (!SLIST_EMPTY(&knl->kl_list)) { /* there are still KN_INFLUX remaining */ kn = SLIST_FIRST(&knl->kl_list); kq = kn->kn_kq; KQ_LOCK(kq); KASSERT(kn->kn_status & KN_INFLUX, ("knote removed w/o list lock")); knl->kl_unlock(knl->kl_lockarg); kq->kq_state |= KQ_FLUXWAIT; msleep(kq, &kq->kq_lock, PSOCK | PDROP, "kqkclr", 0); kq = NULL; goto again; } if (islocked) KNL_ASSERT_LOCKED(knl); else { knl->kl_unlock(knl->kl_lockarg); KNL_ASSERT_UNLOCKED(knl); } } /* * Remove all knotes referencing a specified fd must be called with FILEDESC * lock. This prevents a race where a new fd comes along and occupies the * entry and we attach a knote to the fd. */ void knote_fdclose(struct thread *td, int fd) { struct filedesc *fdp = td->td_proc->p_fd; struct kqueue *kq; struct knote *kn; int influx; FILEDESC_XLOCK_ASSERT(fdp); /* * We shouldn't have to worry about new kevents appearing on fd * since filedesc is locked. 
*/ SLIST_FOREACH(kq, &fdp->fd_kqlist, kq_list) { KQ_LOCK(kq); again: influx = 0; while (kq->kq_knlistsize > fd && (kn = SLIST_FIRST(&kq->kq_knlist[fd])) != NULL) { if (kn->kn_status & KN_INFLUX) { /* someone else might be waiting on our knote */ if (influx) wakeup(kq); kq->kq_state |= KQ_FLUXWAIT; msleep(kq, &kq->kq_lock, PSOCK, "kqflxwt", 0); goto again; } kn->kn_status |= KN_INFLUX; KQ_UNLOCK(kq); if (!(kn->kn_status & KN_DETACHED)) kn->kn_fop->f_detach(kn); knote_drop(kn, td); influx = 1; KQ_LOCK(kq); } KQ_UNLOCK_FLUX(kq); } } static int knote_attach(struct knote *kn, struct kqueue *kq) { struct klist *list; KASSERT(kn->kn_status & KN_INFLUX, ("knote not marked INFLUX")); KQ_OWNED(kq); if (kn->kn_fop->f_isfd) { if (kn->kn_id >= kq->kq_knlistsize) return ENOMEM; list = &kq->kq_knlist[kn->kn_id]; } else { if (kq->kq_knhash == NULL) return ENOMEM; list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)]; } SLIST_INSERT_HEAD(list, kn, kn_link); return 0; } /* * knote must already have been detached using the f_detach method. * no lock need to be held, it is assumed that the KN_INFLUX flag is set * to prevent other removal. */ static void knote_drop(struct knote *kn, struct thread *td) { struct kqueue *kq; struct klist *list; kq = kn->kn_kq; KQ_NOTOWNED(kq); KASSERT((kn->kn_status & KN_INFLUX) == KN_INFLUX, ("knote_drop called without KN_INFLUX set in kn_status")); KQ_LOCK(kq); if (kn->kn_fop->f_isfd) list = &kq->kq_knlist[kn->kn_id]; else list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)]; if (!SLIST_EMPTY(list)) SLIST_REMOVE(list, kn, knote, kn_link); if (kn->kn_status & KN_QUEUED) knote_dequeue(kn); KQ_UNLOCK_FLUX(kq); if (kn->kn_fop->f_isfd) { fdrop(kn->kn_fp, td); kn->kn_fp = NULL; } kqueue_fo_release(kn->kn_kevent.filter); kn->kn_fop = NULL; knote_free(kn); } static void knote_enqueue(struct knote *kn) { struct kqueue *kq = kn->kn_kq; KQ_OWNED(kn->kn_kq); KASSERT((kn->kn_status & KN_QUEUED) == 0, ("knote already queued")); TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe); kn->kn_status |= KN_QUEUED; kq->kq_count++; kqueue_wakeup(kq); } static void knote_dequeue(struct knote *kn) { struct kqueue *kq = kn->kn_kq; KQ_OWNED(kn->kn_kq); KASSERT(kn->kn_status & KN_QUEUED, ("knote not queued")); TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe); kn->kn_status &= ~KN_QUEUED; kq->kq_count--; } static void knote_init(void) { knote_zone = uma_zcreate("KNOTE", sizeof(struct knote), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); } SYSINIT(knote, SI_SUB_PSEUDO, SI_ORDER_ANY, knote_init, NULL) static struct knote * knote_alloc(int waitok) { return ((struct knote *)uma_zalloc(knote_zone, (waitok ? M_WAITOK : M_NOWAIT)|M_ZERO)); } static void knote_free(struct knote *kn) { if (kn != NULL) uma_zfree(knote_zone, kn); } /* * Register the kev w/ the kq specified by fd. */ int kqfd_register(int fd, struct kevent *kev, struct thread *td, int waitok) { struct kqueue *kq; struct file *fp; int error; if ((error = fget(td, fd, &fp)) != 0) return (error); if ((error = kqueue_acquire(fp, &kq)) != 0) goto noacquire; error = kqueue_register(kq, kev, td, waitok); kqueue_release(kq, 0); noacquire: fdrop(fp, td); return error; } Index: head/sys/kern/sys_generic.c =================================================================== --- head/sys/kern/sys_generic.c (revision 174987) +++ head/sys/kern/sys_generic.c (revision 174988) @@ -1,1403 +1,1399 @@ /*- * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. 
* All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)sys_generic.c 8.5 (Berkeley) 1/21/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #include static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer"); static MALLOC_DEFINE(M_SELECT, "select", "select() buffer"); MALLOC_DEFINE(M_IOV, "iov", "large iov's"); static int pollscan(struct thread *, struct pollfd *, u_int); static int pollrescan(struct thread *); static int selscan(struct thread *, fd_mask **, fd_mask **, int); static int selrescan(struct thread *, fd_mask **, fd_mask **); static void selfdalloc(struct thread *, void *); static void selfdfree(struct seltd *, struct selfd *); static int dofileread(struct thread *, int, struct file *, struct uio *, off_t, int); static int dofilewrite(struct thread *, int, struct file *, struct uio *, off_t, int); static void doselwakeup(struct selinfo *, int); static void seltdinit(struct thread *); static int seltdwait(struct thread *, int); static void seltdclear(struct thread *); /* * One seltd per-thread allocated on demand as needed. * * t - protected by st_mtx * k - Only accessed by curthread or read-only */ struct seltd { STAILQ_HEAD(, selfd) st_selq; /* (k) List of selfds. */ struct selfd *st_free1; /* (k) free fd for read set. */ struct selfd *st_free2; /* (k) free fd for write set. */ struct mtx st_mtx; /* Protects struct seltd */ struct cv st_wait; /* (t) Wait channel. */ int st_flags; /* (t) SELTD_ flags. */ }; #define SELTD_PENDING 0x0001 /* We have pending events. */ #define SELTD_RESCAN 0x0002 /* Doing a rescan. 
*/ /* * One selfd allocated per-thread per-file-descriptor. * f - protected by sf_mtx */ struct selfd { STAILQ_ENTRY(selfd) sf_link; /* (k) fds owned by this td. */ TAILQ_ENTRY(selfd) sf_threads; /* (f) fds on this selinfo. */ struct selinfo *sf_si; /* (f) selinfo when linked. */ struct mtx *sf_mtx; /* Pointer to selinfo mtx. */ struct seltd *sf_td; /* (k) owning seltd. */ void *sf_cookie; /* (k) fd or pollfd. */ }; static uma_zone_t selfd_zone; #ifndef _SYS_SYSPROTO_H_ struct read_args { int fd; void *buf; size_t nbyte; }; #endif int read(td, uap) struct thread *td; struct read_args *uap; { struct uio auio; struct iovec aiov; int error; if (uap->nbyte > INT_MAX) return (EINVAL); aiov.iov_base = uap->buf; aiov.iov_len = uap->nbyte; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_resid = uap->nbyte; auio.uio_segflg = UIO_USERSPACE; error = kern_readv(td, uap->fd, &auio); return(error); } /* * Positioned read system call */ #ifndef _SYS_SYSPROTO_H_ struct pread_args { int fd; void *buf; size_t nbyte; int pad; off_t offset; }; #endif int pread(td, uap) struct thread *td; struct pread_args *uap; { struct uio auio; struct iovec aiov; int error; if (uap->nbyte > INT_MAX) return (EINVAL); aiov.iov_base = uap->buf; aiov.iov_len = uap->nbyte; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_resid = uap->nbyte; auio.uio_segflg = UIO_USERSPACE; error = kern_preadv(td, uap->fd, &auio, uap->offset); return(error); } int freebsd6_pread(td, uap) struct thread *td; struct freebsd6_pread_args *uap; { struct pread_args oargs; oargs.fd = uap->fd; oargs.buf = uap->buf; oargs.nbyte = uap->nbyte; oargs.offset = uap->offset; return (pread(td, &oargs)); } /* * Scatter read system call. */ #ifndef _SYS_SYSPROTO_H_ struct readv_args { int fd; struct iovec *iovp; u_int iovcnt; }; #endif int readv(struct thread *td, struct readv_args *uap) { struct uio *auio; int error; error = copyinuio(uap->iovp, uap->iovcnt, &auio); if (error) return (error); error = kern_readv(td, uap->fd, auio); free(auio, M_IOV); return (error); } int kern_readv(struct thread *td, int fd, struct uio *auio) { struct file *fp; int error; error = fget_read(td, fd, &fp); if (error) return (error); error = dofileread(td, fd, fp, auio, (off_t)-1, 0); fdrop(fp, td); return (error); } /* * Scatter positioned read system call. */ #ifndef _SYS_SYSPROTO_H_ struct preadv_args { int fd; struct iovec *iovp; u_int iovcnt; off_t offset; }; #endif int preadv(struct thread *td, struct preadv_args *uap) { struct uio *auio; int error; error = copyinuio(uap->iovp, uap->iovcnt, &auio); if (error) return (error); error = kern_preadv(td, uap->fd, auio, uap->offset); free(auio, M_IOV); return (error); } int kern_preadv(td, fd, auio, offset) struct thread *td; int fd; struct uio *auio; off_t offset; { struct file *fp; int error; error = fget_read(td, fd, &fp); if (error) return (error); if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE)) error = ESPIPE; else if (offset < 0 && fp->f_vnode->v_type != VCHR) error = EINVAL; else error = dofileread(td, fd, fp, auio, offset, FOF_OFFSET); fdrop(fp, td); return (error); } /* * Common code for readv and preadv that reads data in * from a file using the passed in uio, offset, and flags. 
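The readv()/kern_readv()/dofileread() chain above is the kernel side of the ordinary scatter-read pattern; for reference, the userland counterpart (the file name is illustrative):

#include <sys/types.h>
#include <sys/uio.h>
#include <err.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
        char hdr[16], body[4096];
        struct iovec iov[2];
        ssize_t n;
        int fd;

        if ((fd = open("/etc/motd", O_RDONLY)) == -1)
                err(1, "open");
        iov[0].iov_base = hdr;
        iov[0].iov_len = sizeof(hdr);
        iov[1].iov_base = body;
        iov[1].iov_len = sizeof(body);
        /* One system call fills both buffers in order. */
        if ((n = readv(fd, iov, 2)) == -1)
                err(1, "readv");
        printf("read %zd bytes across 2 buffers\n", n);
        close(fd);
        return (0);
}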
*/ static int dofileread(td, fd, fp, auio, offset, flags) struct thread *td; int fd; struct file *fp; struct uio *auio; off_t offset; int flags; { ssize_t cnt; int error; #ifdef KTRACE struct uio *ktruio = NULL; #endif /* Finish zero length reads right here */ if (auio->uio_resid == 0) { td->td_retval[0] = 0; return(0); } auio->uio_rw = UIO_READ; auio->uio_offset = offset; auio->uio_td = td; #ifdef KTRACE if (KTRPOINT(td, KTR_GENIO)) ktruio = cloneuio(auio); #endif cnt = auio->uio_resid; if ((error = fo_read(fp, auio, td->td_ucred, flags, td))) { if (auio->uio_resid != cnt && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) error = 0; } cnt -= auio->uio_resid; #ifdef KTRACE if (ktruio != NULL) { ktruio->uio_resid = cnt; ktrgenio(fd, UIO_READ, ktruio, error); } #endif td->td_retval[0] = cnt; return (error); } #ifndef _SYS_SYSPROTO_H_ struct write_args { int fd; const void *buf; size_t nbyte; }; #endif int write(td, uap) struct thread *td; struct write_args *uap; { struct uio auio; struct iovec aiov; int error; if (uap->nbyte > INT_MAX) return (EINVAL); aiov.iov_base = (void *)(uintptr_t)uap->buf; aiov.iov_len = uap->nbyte; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_resid = uap->nbyte; auio.uio_segflg = UIO_USERSPACE; error = kern_writev(td, uap->fd, &auio); return(error); } /* * Positioned write system call. */ #ifndef _SYS_SYSPROTO_H_ struct pwrite_args { int fd; const void *buf; size_t nbyte; int pad; off_t offset; }; #endif int pwrite(td, uap) struct thread *td; struct pwrite_args *uap; { struct uio auio; struct iovec aiov; int error; if (uap->nbyte > INT_MAX) return (EINVAL); aiov.iov_base = (void *)(uintptr_t)uap->buf; aiov.iov_len = uap->nbyte; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_resid = uap->nbyte; auio.uio_segflg = UIO_USERSPACE; error = kern_pwritev(td, uap->fd, &auio, uap->offset); return(error); } int freebsd6_pwrite(td, uap) struct thread *td; struct freebsd6_pwrite_args *uap; { struct pwrite_args oargs; oargs.fd = uap->fd; oargs.buf = uap->buf; oargs.nbyte = uap->nbyte; oargs.offset = uap->offset; return (pwrite(td, &oargs)); } /* * Gather write system call. */ #ifndef _SYS_SYSPROTO_H_ struct writev_args { int fd; struct iovec *iovp; u_int iovcnt; }; #endif int writev(struct thread *td, struct writev_args *uap) { struct uio *auio; int error; error = copyinuio(uap->iovp, uap->iovcnt, &auio); if (error) return (error); error = kern_writev(td, uap->fd, auio); free(auio, M_IOV); return (error); } int kern_writev(struct thread *td, int fd, struct uio *auio) { struct file *fp; int error; error = fget_write(td, fd, &fp); if (error) return (error); error = dofilewrite(td, fd, fp, auio, (off_t)-1, 0); fdrop(fp, td); return (error); } /* * Gather positioned write system call. 
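The gather-write path (writev() through kern_writev() and dofilewrite()) is symmetric; a minimal sketch:

#include <sys/types.h>
#include <sys/uio.h>
#include <err.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
        struct iovec iov[2];

        iov[0].iov_base = "hello, ";
        iov[0].iov_len = strlen("hello, ");
        iov[1].iov_base = "world\n";
        iov[1].iov_len = strlen("world\n");
        /* Both pieces are written with a single system call. */
        if (writev(STDOUT_FILENO, iov, 2) == -1)
                err(1, "writev");
        return (0);
}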
*/ #ifndef _SYS_SYSPROTO_H_ struct pwritev_args { int fd; struct iovec *iovp; u_int iovcnt; off_t offset; }; #endif int pwritev(struct thread *td, struct pwritev_args *uap) { struct uio *auio; int error; error = copyinuio(uap->iovp, uap->iovcnt, &auio); if (error) return (error); error = kern_pwritev(td, uap->fd, auio, uap->offset); free(auio, M_IOV); return (error); } int kern_pwritev(td, fd, auio, offset) struct thread *td; struct uio *auio; int fd; off_t offset; { struct file *fp; int error; error = fget_write(td, fd, &fp); if (error) return (error); if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE)) error = ESPIPE; else if (offset < 0 && fp->f_vnode->v_type != VCHR) error = EINVAL; else error = dofilewrite(td, fd, fp, auio, offset, FOF_OFFSET); fdrop(fp, td); return (error); } /* * Common code for writev and pwritev that writes data to * a file using the passed in uio, offset, and flags. */ static int dofilewrite(td, fd, fp, auio, offset, flags) struct thread *td; int fd; struct file *fp; struct uio *auio; off_t offset; int flags; { ssize_t cnt; int error; #ifdef KTRACE struct uio *ktruio = NULL; #endif auio->uio_rw = UIO_WRITE; auio->uio_td = td; auio->uio_offset = offset; #ifdef KTRACE if (KTRPOINT(td, KTR_GENIO)) ktruio = cloneuio(auio); #endif cnt = auio->uio_resid; if (fp->f_type == DTYPE_VNODE) bwillwrite(); if ((error = fo_write(fp, auio, td->td_ucred, flags, td))) { if (auio->uio_resid != cnt && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) error = 0; /* Socket layer is responsible for issuing SIGPIPE. */ if (fp->f_type != DTYPE_SOCKET && error == EPIPE) { PROC_LOCK(td->td_proc); psignal(td->td_proc, SIGPIPE); PROC_UNLOCK(td->td_proc); } } cnt -= auio->uio_resid; #ifdef KTRACE if (ktruio != NULL) { ktruio->uio_resid = cnt; ktrgenio(fd, UIO_WRITE, ktruio, error); } #endif td->td_retval[0] = cnt; return (error); } #ifndef _SYS_SYSPROTO_H_ struct ioctl_args { int fd; u_long com; caddr_t data; }; #endif /* ARGSUSED */ int ioctl(struct thread *td, struct ioctl_args *uap) { u_long com; int arg, error; u_int size; caddr_t data; if (uap->com > 0xffffffff) { printf( "WARNING pid %d (%s): ioctl sign-extension ioctl %lx\n", td->td_proc->p_pid, td->td_name, uap->com); uap->com &= 0xffffffff; } com = uap->com; /* * Interpret high order word to find amount of data to be * copied to/from the user's address space. */ size = IOCPARM_LEN(com); if ((size > IOCPARM_MAX) || ((com & (IOC_VOID | IOC_IN | IOC_OUT)) == 0) || #if defined(COMPAT_FREEBSD5) || defined(COMPAT_FREEBSD4) || defined(COMPAT_43) ((com & IOC_OUT) && size == 0) || #else ((com & (IOC_IN | IOC_OUT)) && size == 0) || #endif ((com & IOC_VOID) && size > 0 && size != sizeof(int))) return (ENOTTY); if (size > 0) { if (!(com & IOC_VOID)) data = malloc((u_long)size, M_IOCTLOPS, M_WAITOK); else { /* Integer argument. */ arg = (intptr_t)uap->data; data = (void *)&arg; size = 0; } } else data = (void *)&uap->data; if (com & IOC_IN) { error = copyin(uap->data, data, (u_int)size); if (error) { if (size > 0) free(data, M_IOCTLOPS); return (error); } } else if (com & IOC_OUT) { /* * Zero the buffer so the user always * gets back something deterministic. 
*/ bzero(data, size); } error = kern_ioctl(td, uap->fd, com, data); if (error == 0 && (com & IOC_OUT)) error = copyout(data, uap->data, (u_int)size); if (size > 0) free(data, M_IOCTLOPS); return (error); } int kern_ioctl(struct thread *td, int fd, u_long com, caddr_t data) { struct file *fp; struct filedesc *fdp; int error; int tmp; if ((error = fget(td, fd, &fp)) != 0) return (error); if ((fp->f_flag & (FREAD | FWRITE)) == 0) { fdrop(fp, td); return (EBADF); } fdp = td->td_proc->p_fd; switch (com) { case FIONCLEX: FILEDESC_XLOCK(fdp); fdp->fd_ofileflags[fd] &= ~UF_EXCLOSE; FILEDESC_XUNLOCK(fdp); goto out; case FIOCLEX: FILEDESC_XLOCK(fdp); fdp->fd_ofileflags[fd] |= UF_EXCLOSE; FILEDESC_XUNLOCK(fdp); goto out; case FIONBIO: - FILE_LOCK(fp); if ((tmp = *(int *)data)) - fp->f_flag |= FNONBLOCK; + atomic_set_int(&fp->f_flag, FNONBLOCK); else - fp->f_flag &= ~FNONBLOCK; - FILE_UNLOCK(fp); + atomic_clear_int(&fp->f_flag, FNONBLOCK); data = (void *)&tmp; break; case FIOASYNC: - FILE_LOCK(fp); if ((tmp = *(int *)data)) - fp->f_flag |= FASYNC; + atomic_set_int(&fp->f_flag, FASYNC); else - fp->f_flag &= ~FASYNC; - FILE_UNLOCK(fp); + atomic_clear_int(&fp->f_flag, FASYNC); data = (void *)&tmp; break; } error = fo_ioctl(fp, com, data, td->td_ucred, td); out: fdrop(fp, td); return (error); } #ifndef _SYS_SYSPROTO_H_ struct select_args { int nd; fd_set *in, *ou, *ex; struct timeval *tv; }; #endif int select(td, uap) register struct thread *td; register struct select_args *uap; { struct timeval tv, *tvp; int error; if (uap->tv != NULL) { error = copyin(uap->tv, &tv, sizeof(tv)); if (error) return (error); tvp = &tv; } else tvp = NULL; return (kern_select(td, uap->nd, uap->in, uap->ou, uap->ex, tvp)); } int kern_select(struct thread *td, int nd, fd_set *fd_in, fd_set *fd_ou, fd_set *fd_ex, struct timeval *tvp) { struct filedesc *fdp; /* * The magic 2048 here is chosen to be just enough for FD_SETSIZE * infds with the new FD_SETSIZE of 1024, and more than enough for * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE * of 256. */ fd_mask s_selbits[howmany(2048, NFDBITS)]; fd_mask *ibits[3], *obits[3], *selbits, *sbp; struct timeval atv, rtv, ttv; int error, timo; u_int nbufbytes, ncpbytes, nfdbits; if (nd < 0) return (EINVAL); fdp = td->td_proc->p_fd; FILEDESC_SLOCK(fdp); if (nd > td->td_proc->p_fd->fd_nfiles) nd = td->td_proc->p_fd->fd_nfiles; /* forgiving; slightly wrong */ FILEDESC_SUNLOCK(fdp); /* * Allocate just enough bits for the non-null fd_sets. Use the * preallocated auto buffer if possible. */ nfdbits = roundup(nd, NFDBITS); ncpbytes = nfdbits / NBBY; nbufbytes = 0; if (fd_in != NULL) nbufbytes += 2 * ncpbytes; if (fd_ou != NULL) nbufbytes += 2 * ncpbytes; if (fd_ex != NULL) nbufbytes += 2 * ncpbytes; if (nbufbytes <= sizeof s_selbits) selbits = &s_selbits[0]; else selbits = malloc(nbufbytes, M_SELECT, M_WAITOK); /* * Assign pointers into the bit buffers and fetch the input bits. * Put the output buffers together so that they can be bzeroed * together. 
*/ sbp = selbits; #define getbits(name, x) \ do { \ if (name == NULL) \ ibits[x] = NULL; \ else { \ ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp; \ obits[x] = sbp; \ sbp += ncpbytes / sizeof *sbp; \ error = copyin(name, ibits[x], ncpbytes); \ if (error != 0) \ goto done; \ } \ } while (0) getbits(fd_in, 0); getbits(fd_ou, 1); getbits(fd_ex, 2); #undef getbits if (nbufbytes != 0) bzero(selbits, nbufbytes / 2); if (tvp != NULL) { atv = *tvp; if (itimerfix(&atv)) { error = EINVAL; goto done; } getmicrouptime(&rtv); timevaladd(&atv, &rtv); } else { atv.tv_sec = 0; atv.tv_usec = 0; } timo = 0; seltdinit(td); /* Iterate until the timeout expires or descriptors become ready. */ for (;;) { error = selscan(td, ibits, obits, nd); if (error || td->td_retval[0] != 0) break; if (atv.tv_sec || atv.tv_usec) { getmicrouptime(&rtv); if (timevalcmp(&rtv, &atv, >=)) break; ttv = atv; timevalsub(&ttv, &rtv); timo = ttv.tv_sec > 24 * 60 * 60 ? 24 * 60 * 60 * hz : tvtohz(&ttv); } error = seltdwait(td, timo); if (error) break; error = selrescan(td, ibits, obits); if (error || td->td_retval[0] != 0) break; } seltdclear(td); done: /* select is not restarted after signals... */ if (error == ERESTART) error = EINTR; if (error == EWOULDBLOCK) error = 0; #define putbits(name, x) \ if (name && (error2 = copyout(obits[x], name, ncpbytes))) \ error = error2; if (error == 0) { int error2; putbits(fd_in, 0); putbits(fd_ou, 1); putbits(fd_ex, 2); #undef putbits } if (selbits != &s_selbits[0]) free(selbits, M_SELECT); return (error); } /* * Traverse the list of fds attached to this thread's seltd and check for * completion. */ static int selrescan(struct thread *td, fd_mask **ibits, fd_mask **obits) { struct seltd *stp; struct selfd *sfp; struct selfd *sfn; struct selinfo *si; struct file *fp; int msk, fd; int n = 0; /* Note: backend also returns POLLHUP/POLLERR if appropriate. */ static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND }; struct filedesc *fdp = td->td_proc->p_fd; stp = td->td_sel; FILEDESC_SLOCK(fdp); STAILQ_FOREACH_SAFE(sfp, &stp->st_selq, sf_link, sfn) { fd = (int)(uintptr_t)sfp->sf_cookie; si = sfp->sf_si; selfdfree(stp, sfp); /* If the selinfo wasn't cleared the event didn't fire. */ if (si != NULL) continue; if ((fp = fget_locked(fdp, fd)) == NULL) { FILEDESC_SUNLOCK(fdp); return (EBADF); } for (msk = 0; msk < 3; msk++) { if (ibits[msk] == NULL) continue; if ((ibits[msk][fd/NFDBITS] & ((fd_mask) 1 << (fd % NFDBITS))) == 0) continue; if (fo_poll(fp, flag[msk], td->td_ucred, td)) { obits[msk][(fd)/NFDBITS] |= ((fd_mask)1 << ((fd) % NFDBITS)); n++; } } } FILEDESC_SUNLOCK(fdp); stp->st_flags = 0; td->td_retval[0] = n; return (0); } /* * Perform the initial filedescriptor scan and register ourselves with * each selinfo. */ static int selscan(td, ibits, obits, nfd) struct thread *td; fd_mask **ibits, **obits; int nfd; { int msk, i, fd; fd_mask bits; struct file *fp; int n = 0; /* Note: backend also returns POLLHUP/POLLERR if appropriate. 
*/ static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND }; struct filedesc *fdp = td->td_proc->p_fd; FILEDESC_SLOCK(fdp); for (msk = 0; msk < 3; msk++) { if (ibits[msk] == NULL) continue; for (i = 0; i < nfd; i += NFDBITS) { bits = ibits[msk][i/NFDBITS]; /* ffs(int mask) not portable, fd_mask is long */ for (fd = i; bits && fd < nfd; fd++, bits >>= 1) { if (!(bits & 1)) continue; if ((fp = fget_locked(fdp, fd)) == NULL) { FILEDESC_SUNLOCK(fdp); return (EBADF); } selfdalloc(td, (void *)(uintptr_t)fd); if (fo_poll(fp, flag[msk], td->td_ucred, td)) { obits[msk][(fd)/NFDBITS] |= ((fd_mask)1 << ((fd) % NFDBITS)); n++; } } } } FILEDESC_SUNLOCK(fdp); td->td_retval[0] = n; return (0); } #ifndef _SYS_SYSPROTO_H_ struct poll_args { struct pollfd *fds; u_int nfds; int timeout; }; #endif int poll(td, uap) struct thread *td; struct poll_args *uap; { struct pollfd *bits; struct pollfd smallbits[32]; struct timeval atv, rtv, ttv; int error = 0, timo; u_int nfds; size_t ni; nfds = uap->nfds; /* * This is kinda bogus. We have fd limits, but that is not * really related to the size of the pollfd array. Make sure * we let the process use at least FD_SETSIZE entries and at * least enough for the current limits. We want to be reasonably * safe, but not overly restrictive. */ PROC_LOCK(td->td_proc); if ((nfds > lim_cur(td->td_proc, RLIMIT_NOFILE)) && (nfds > FD_SETSIZE)) { PROC_UNLOCK(td->td_proc); return (EINVAL); } PROC_UNLOCK(td->td_proc); ni = nfds * sizeof(struct pollfd); if (ni > sizeof(smallbits)) bits = malloc(ni, M_TEMP, M_WAITOK); else bits = smallbits; error = copyin(uap->fds, bits, ni); if (error) goto done; if (uap->timeout != INFTIM) { atv.tv_sec = uap->timeout / 1000; atv.tv_usec = (uap->timeout % 1000) * 1000; if (itimerfix(&atv)) { error = EINVAL; goto done; } getmicrouptime(&rtv); timevaladd(&atv, &rtv); } else { atv.tv_sec = 0; atv.tv_usec = 0; } timo = 0; seltdinit(td); /* Iterate until the timeout expires or descriptors become ready. */ for (;;) { error = pollscan(td, bits, nfds); if (error || td->td_retval[0] != 0) break; if (atv.tv_sec || atv.tv_usec) { getmicrouptime(&rtv); if (timevalcmp(&rtv, &atv, >=)) break; ttv = atv; timevalsub(&ttv, &rtv); timo = ttv.tv_sec > 24 * 60 * 60 ? 24 * 60 * 60 * hz : tvtohz(&ttv); } error = seltdwait(td, timo); if (error) break; error = pollrescan(td); if (error || td->td_retval[0] != 0) break; } seltdclear(td); done: /* poll is not restarted after signals... */ if (error == ERESTART) error = EINTR; if (error == EWOULDBLOCK) error = 0; if (error == 0) { error = copyout(bits, uap->fds, ni); if (error) goto out; } out: if (ni > sizeof(smallbits)) free(bits, M_TEMP); return (error); } static int pollrescan(struct thread *td) { struct seltd *stp; struct selfd *sfp; struct selfd *sfn; struct selinfo *si; struct filedesc *fdp; struct file *fp; struct pollfd *fd; int n; n = 0; fdp = td->td_proc->p_fd; stp = td->td_sel; FILEDESC_SLOCK(fdp); STAILQ_FOREACH_SAFE(sfp, &stp->st_selq, sf_link, sfn) { fd = (struct pollfd *)sfp->sf_cookie; si = sfp->sf_si; selfdfree(stp, sfp); /* If the selinfo wasn't cleared the event didn't fire. */ if (si != NULL) continue; fp = fdp->fd_ofiles[fd->fd]; if (fp == NULL) { fd->revents = POLLNVAL; n++; continue; } /* * Note: backend also returns POLLHUP and * POLLERR if appropriate. 
*/ fd->revents = fo_poll(fp, fd->events, td->td_ucred, td); if (fd->revents != 0) n++; } FILEDESC_SUNLOCK(fdp); stp->st_flags = 0; td->td_retval[0] = n; return (0); } static int pollscan(td, fds, nfd) struct thread *td; struct pollfd *fds; u_int nfd; { struct filedesc *fdp = td->td_proc->p_fd; int i; struct file *fp; int n = 0; FILEDESC_SLOCK(fdp); for (i = 0; i < nfd; i++, fds++) { if (fds->fd >= fdp->fd_nfiles) { fds->revents = POLLNVAL; n++; } else if (fds->fd < 0) { fds->revents = 0; } else { fp = fdp->fd_ofiles[fds->fd]; if (fp == NULL) { fds->revents = POLLNVAL; n++; } else { /* * Note: backend also returns POLLHUP and * POLLERR if appropriate. */ selfdalloc(td, fds); fds->revents = fo_poll(fp, fds->events, td->td_ucred, td); if (fds->revents != 0) n++; } } } FILEDESC_SUNLOCK(fdp); td->td_retval[0] = n; return (0); } /* * OpenBSD poll system call. * * XXX this isn't quite a true representation.. OpenBSD uses select ops. */ #ifndef _SYS_SYSPROTO_H_ struct openbsd_poll_args { struct pollfd *fds; u_int nfds; int timeout; }; #endif int openbsd_poll(td, uap) register struct thread *td; register struct openbsd_poll_args *uap; { return (poll(td, (struct poll_args *)uap)); } /* * XXX This was created specifically to support netncp and netsmb. This * allows the caller to specify a socket to wait for events on. It returns * 0 if any events matched and an error otherwise. There is no way to * determine which events fired. */ int selsocket(struct socket *so, int events, struct timeval *tvp, struct thread *td) { struct timeval atv, rtv, ttv; int error, timo; if (tvp != NULL) { atv = *tvp; if (itimerfix(&atv)) return (EINVAL); getmicrouptime(&rtv); timevaladd(&atv, &rtv); } else { atv.tv_sec = 0; atv.tv_usec = 0; } timo = 0; seltdinit(td); /* * Iterate until the timeout expires or the socket becomes ready. */ for (;;) { selfdalloc(td, NULL); error = sopoll(so, events, NULL, td); /* error here is actually the ready events. */ if (error) return (0); if (atv.tv_sec || atv.tv_usec) { getmicrouptime(&rtv); if (timevalcmp(&rtv, &atv, >=)) { seltdclear(td); return (EWOULDBLOCK); } ttv = atv; timevalsub(&ttv, &rtv); timo = ttv.tv_sec > 24 * 60 * 60 ? 24 * 60 * 60 * hz : tvtohz(&ttv); } error = seltdwait(td, timo); seltdclear(td); if (error) break; } /* XXX Duplicates ncp/smb behavior. */ if (error == ERESTART) error = 0; return (error); } /* * Preallocate two selfds associated with 'cookie'. Some fo_poll routines * have two select sets, one for read and another for write. */ static void selfdalloc(struct thread *td, void *cookie) { struct seltd *stp; stp = td->td_sel; if (stp->st_free1 == NULL) stp->st_free1 = uma_zalloc(selfd_zone, M_WAITOK|M_ZERO); stp->st_free1->sf_td = stp; stp->st_free1->sf_cookie = cookie; if (stp->st_free2 == NULL) stp->st_free2 = uma_zalloc(selfd_zone, M_WAITOK|M_ZERO); stp->st_free2->sf_td = stp; stp->st_free2->sf_cookie = cookie; } static void selfdfree(struct seltd *stp, struct selfd *sfp) { STAILQ_REMOVE(&stp->st_selq, sfp, selfd, sf_link); mtx_lock(sfp->sf_mtx); if (sfp->sf_si) TAILQ_REMOVE(&sfp->sf_si->si_tdlist, sfp, sf_threads); mtx_unlock(sfp->sf_mtx); uma_zfree(selfd_zone, sfp); } /* * Record a select request. */ void selrecord(selector, sip) struct thread *selector; struct selinfo *sip; { struct selfd *sfp; struct seltd *stp; struct mtx *mtxp; stp = selector->td_sel; /* * Don't record when doing a rescan. */ if (stp->st_flags & SELTD_RESCAN) return; /* * Grab one of the preallocated descriptors. 
*/ sfp = NULL; if ((sfp = stp->st_free1) != NULL) stp->st_free1 = NULL; else if ((sfp = stp->st_free2) != NULL) stp->st_free2 = NULL; else panic("selrecord: No free selfd on selq"); mtxp = mtx_pool_find(mtxpool_sleep, sip); /* * Initialize the sfp and queue it in the thread. */ sfp->sf_si = sip; sfp->sf_mtx = mtxp; STAILQ_INSERT_TAIL(&stp->st_selq, sfp, sf_link); /* * Now that we've locked the sip, check for initialization. */ mtx_lock(mtxp); if (sip->si_mtx == NULL) { sip->si_mtx = mtxp; TAILQ_INIT(&sip->si_tdlist); } /* * Add this thread to the list of selfds listening on this selinfo. */ TAILQ_INSERT_TAIL(&sip->si_tdlist, sfp, sf_threads); mtx_unlock(sip->si_mtx); } /* Wake up a selecting thread. */ void selwakeup(sip) struct selinfo *sip; { doselwakeup(sip, -1); } /* Wake up a selecting thread, and set its priority. */ void selwakeuppri(sip, pri) struct selinfo *sip; int pri; { doselwakeup(sip, pri); } /* * Do a wakeup when a selectable event occurs. */ static void doselwakeup(sip, pri) struct selinfo *sip; int pri; { struct selfd *sfp; struct selfd *sfn; struct seltd *stp; /* If it's not initialized there can't be any waiters. */ if (sip->si_mtx == NULL) return; /* * Locking the selinfo locks all selfds associated with it. */ mtx_lock(sip->si_mtx); TAILQ_FOREACH_SAFE(sfp, &sip->si_tdlist, sf_threads, sfn) { /* * Once we remove this sfp from the list and clear the * sf_si seltdclear will know to ignore this si. */ TAILQ_REMOVE(&sip->si_tdlist, sfp, sf_threads); sfp->sf_si = NULL; stp = sfp->sf_td; mtx_lock(&stp->st_mtx); stp->st_flags |= SELTD_PENDING; cv_broadcastpri(&stp->st_wait, pri); mtx_unlock(&stp->st_mtx); } mtx_unlock(sip->si_mtx); } static void seltdinit(struct thread *td) { struct seltd *stp; if ((stp = td->td_sel) != NULL) goto out; td->td_sel = stp = malloc(sizeof(*stp), M_SELECT, M_WAITOK|M_ZERO); mtx_init(&stp->st_mtx, "sellck", NULL, MTX_DEF); cv_init(&stp->st_wait, "select"); out: stp->st_flags = 0; STAILQ_INIT(&stp->st_selq); } static int seltdwait(struct thread *td, int timo) { struct seltd *stp; int error; stp = td->td_sel; /* * An event of interest may occur while we do not hold the seltd * locked so check the pending flag before we sleep. */ mtx_lock(&stp->st_mtx); /* * Any further calls to selrecord will be a rescan. */ stp->st_flags |= SELTD_RESCAN; if (stp->st_flags & SELTD_PENDING) { mtx_unlock(&stp->st_mtx); return (0); } if (timo > 0) error = cv_timedwait_sig(&stp->st_wait, &stp->st_mtx, timo); else error = cv_wait_sig(&stp->st_wait, &stp->st_mtx); mtx_unlock(&stp->st_mtx); return (error); } void seltdfini(struct thread *td) { struct seltd *stp; stp = td->td_sel; if (stp == NULL) return; if (stp->st_free1) uma_zfree(selfd_zone, stp->st_free1); if (stp->st_free2) uma_zfree(selfd_zone, stp->st_free2); td->td_sel = NULL; free(stp, M_SELECT); } /* * Remove the references to the thread from all of the objects we were * polling. 
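selrecord() and the doselwakeup() path above are what device drivers sit on top of. A schematic poll method follows; it is not part of this change, and it assumes the cdevsw d_poll signature of this era plus a hypothetical mydev_softc:

#include <sys/param.h>
#include <sys/conf.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/poll.h>
#include <sys/selinfo.h>

/* Hypothetical per-device state. */
struct mydev_softc {
        struct mtx      sc_mtx;
        struct selinfo  sc_rsel;
        size_t          sc_avail;       /* bytes ready for reading */
};

static int
mydev_poll(struct cdev *dev, int events, struct thread *td)
{
        struct mydev_softc *sc = dev->si_drv1;
        int revents = 0;

        mtx_lock(&sc->sc_mtx);
        if (events & (POLLIN | POLLRDNORM)) {
                if (sc->sc_avail > 0)
                        revents |= events & (POLLIN | POLLRDNORM);
                else
                        selrecord(td, &sc->sc_rsel);    /* park this thread */
        }
        mtx_unlock(&sc->sc_mtx);
        return (revents);
}

/* Interrupt/bottom half: new data arrived, wake any selecting threads. */
static void
mydev_data_ready(struct mydev_softc *sc, size_t len)
{
        mtx_lock(&sc->sc_mtx);
        sc->sc_avail += len;
        mtx_unlock(&sc->sc_mtx);
        selwakeup(&sc->sc_rsel);
}

When data becomes available, selwakeup() walks si_tdlist and sets SELTD_PENDING on each waiting thread's seltd, exactly as doselwakeup() above shows.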
*/ static void seltdclear(struct thread *td) { struct seltd *stp; struct selfd *sfp; struct selfd *sfn; stp = td->td_sel; STAILQ_FOREACH_SAFE(sfp, &stp->st_selq, sf_link, sfn) selfdfree(stp, sfp); stp->st_flags = 0; } static void selectinit(void *); SYSINIT(select, SI_SUB_SYSCALLS, SI_ORDER_ANY, selectinit, NULL); static void selectinit(void *dummy __unused) { selfd_zone = uma_zcreate("selfd", sizeof(struct selfd), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); } Index: head/sys/kern/sys_pipe.c =================================================================== --- head/sys/kern/sys_pipe.c (revision 174987) +++ head/sys/kern/sys_pipe.c (revision 174988) @@ -1,1620 +1,1610 @@ /*- * Copyright (c) 1996 John S. Dyson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice immediately at the beginning of the file, without modification, * this list of conditions, and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Absolutely no warranty of function or purpose is made by the author * John S. Dyson. * 4. Modifications may be freely made to this file if the above conditions * are met. */ /* * This file contains a high-performance replacement for the socket-based * pipes scheme originally used in FreeBSD/4.4Lite. It does not support * all features of sockets, but does do everything that pipes normally * do. */ /* * This code has two modes of operation, a small write mode and a large * write mode. The small write mode acts like conventional pipes with * a kernel buffer. If the buffer is less than PIPE_MINDIRECT, then the * "normal" pipe buffering is done. If the buffer is between PIPE_MINDIRECT * and PIPE_SIZE in size, it is fully mapped and wired into the kernel, and * the receiving process can copy it directly from the pages in the sending * process. * * If the sending process receives a signal, it is possible that it will * go away, and certainly its address space can change, because control * is returned back to the user-mode side. In that case, the pipe code * arranges to copy the buffer supplied by the user process, to a pageable * kernel buffer, and the receiving process will grab the data from the * pageable kernel buffer. Since signals don't happen all that often, * the copy operation is normally eliminated. * * The constant PIPE_MINDIRECT is chosen to make sure that buffering will * happen for small transfers so that the system will not spend all of * its time context switching. * * In order to limit the resource use of pipes, two sysctls exist: * * kern.ipc.maxpipekva - This is a hard limit on the amount of pageable * address space available to us in pipe_map. This value is normally * autotuned, but may also be loader tuned. * * kern.ipc.pipekva - This read-only sysctl tracks the current amount of * memory in use by pipes. * * Based on how large pipekva is relative to maxpipekva, the following * will happen: * * 0% - 50%: * New pipes are given 16K of memory backing, pipes may dynamically * grow to as large as 64K where needed. * 50% - 75%: * New pipes are given 4K (or PAGE_SIZE) of memory backing, * existing pipes may NOT grow. 
* 75% - 100%: * New pipes are given 4K (or PAGE_SIZE) of memory backing, * existing pipes will be shrunk down to 4K whenever possible. * * Resizing may be disabled by setting kern.ipc.piperesizeallowed=0. If * that is set, the only resize that will occur is the 0 -> SMALL_PIPE_SIZE * resize which MUST occur for reverse-direction pipes when they are * first used. * * Additional information about the current state of pipes may be obtained * from kern.ipc.pipes, kern.ipc.pipefragretry, kern.ipc.pipeallocfail, * and kern.ipc.piperesizefail. * * Locking rules: There are two locks present here: A mutex, used via * PIPE_LOCK, and a flag, used via pipelock(). All locking is done via * the flag, as mutexes can not persist over uiomove. The mutex * exists only to guard access to the flag, and is not in itself a * locking mechanism. Also note that there is only a single mutex for * both directions of a pipe. * * As pipelock() may have to sleep before it can acquire the flag, it * is important to reread all data after a call to pipelock(); everything * in the structure may have changed. */ #include __FBSDID("$FreeBSD$"); #include "opt_mac.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Use this define if you want to disable *fancy* VM things. Expect an * approx 30% decrease in transfer rate. This could be useful for * NetBSD or OpenBSD. */ /* #define PIPE_NODIRECT */ /* * interfaces to the outside world */ static fo_rdwr_t pipe_read; static fo_rdwr_t pipe_write; static fo_ioctl_t pipe_ioctl; static fo_poll_t pipe_poll; static fo_kqfilter_t pipe_kqfilter; static fo_stat_t pipe_stat; static fo_close_t pipe_close; static struct fileops pipeops = { .fo_read = pipe_read, .fo_write = pipe_write, .fo_ioctl = pipe_ioctl, .fo_poll = pipe_poll, .fo_kqfilter = pipe_kqfilter, .fo_stat = pipe_stat, .fo_close = pipe_close, .fo_flags = DFLAG_PASSABLE }; static void filt_pipedetach(struct knote *kn); static int filt_piperead(struct knote *kn, long hint); static int filt_pipewrite(struct knote *kn, long hint); static struct filterops pipe_rfiltops = { 1, NULL, filt_pipedetach, filt_piperead }; static struct filterops pipe_wfiltops = { 1, NULL, filt_pipedetach, filt_pipewrite }; /* * Default pipe buffer size(s), this can be kind-of large now because pipe * space is pageable. The pipe code will try to maintain locality of * reference for performance reasons, so small amounts of outstanding I/O * will not wipe the cache. 
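
/*
 * Illustrative sketch only: observing the pipe KVA policy described above
 * from userland via the sysctls declared just below.  A hypothetical
 * stand-alone program; error handling is omitted.
 */
#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
        int maxkva, curkva, resize;
        size_t len;

        len = sizeof(maxkva);
        sysctlbyname("kern.ipc.maxpipekva", &maxkva, &len, NULL, 0);
        len = sizeof(curkva);
        sysctlbyname("kern.ipc.pipekva", &curkva, &len, NULL, 0);
        len = sizeof(resize);
        sysctlbyname("kern.ipc.piperesizeallowed", &resize, &len, NULL, 0);

        /* Above 50% usage new pipes start small; above 75% they may shrink. */
        printf("pipe KVA: %d of %d bytes in use, resizing %sallowed\n",
            curkva, maxkva, resize ? "" : "not ");
        return (0);
}
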
*/ #define MINPIPESIZE (PIPE_SIZE/3) #define MAXPIPESIZE (2*PIPE_SIZE/3) static int amountpipekva; static int pipefragretry; static int pipeallocfail; static int piperesizefail; static int piperesizeallowed = 1; SYSCTL_INT(_kern_ipc, OID_AUTO, maxpipekva, CTLFLAG_RDTUN, &maxpipekva, 0, "Pipe KVA limit"); SYSCTL_INT(_kern_ipc, OID_AUTO, pipekva, CTLFLAG_RD, &amountpipekva, 0, "Pipe KVA usage"); SYSCTL_INT(_kern_ipc, OID_AUTO, pipefragretry, CTLFLAG_RD, &pipefragretry, 0, "Pipe allocation retries due to fragmentation"); SYSCTL_INT(_kern_ipc, OID_AUTO, pipeallocfail, CTLFLAG_RD, &pipeallocfail, 0, "Pipe allocation failures"); SYSCTL_INT(_kern_ipc, OID_AUTO, piperesizefail, CTLFLAG_RD, &piperesizefail, 0, "Pipe resize failures"); SYSCTL_INT(_kern_ipc, OID_AUTO, piperesizeallowed, CTLFLAG_RW, &piperesizeallowed, 0, "Pipe resizing allowed"); static void pipeinit(void *dummy __unused); static void pipeclose(struct pipe *cpipe); static void pipe_free_kmem(struct pipe *cpipe); static int pipe_create(struct pipe *pipe, int backing); static __inline int pipelock(struct pipe *cpipe, int catch); static __inline void pipeunlock(struct pipe *cpipe); static __inline void pipeselwakeup(struct pipe *cpipe); #ifndef PIPE_NODIRECT static int pipe_build_write_buffer(struct pipe *wpipe, struct uio *uio); static void pipe_destroy_write_buffer(struct pipe *wpipe); static int pipe_direct_write(struct pipe *wpipe, struct uio *uio); static void pipe_clone_write_buffer(struct pipe *wpipe); #endif static int pipespace(struct pipe *cpipe, int size); static int pipespace_new(struct pipe *cpipe, int size); static int pipe_zone_ctor(void *mem, int size, void *arg, int flags); static int pipe_zone_init(void *mem, int size, int flags); static void pipe_zone_fini(void *mem, int size); static uma_zone_t pipe_zone; SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_ANY, pipeinit, NULL); static void pipeinit(void *dummy __unused) { pipe_zone = uma_zcreate("pipe", sizeof(struct pipepair), pipe_zone_ctor, NULL, pipe_zone_init, pipe_zone_fini, UMA_ALIGN_PTR, 0); KASSERT(pipe_zone != NULL, ("pipe_zone not initialized")); } static int pipe_zone_ctor(void *mem, int size, void *arg, int flags) { struct pipepair *pp; struct pipe *rpipe, *wpipe; KASSERT(size == sizeof(*pp), ("pipe_zone_ctor: wrong size")); pp = (struct pipepair *)mem; /* * We zero both pipe endpoints to make sure all the kmem pointers * are NULL, flag fields are zero'd, etc. We timestamp both * endpoints with the same time. */ rpipe = &pp->pp_rpipe; bzero(rpipe, sizeof(*rpipe)); vfs_timestamp(&rpipe->pipe_ctime); rpipe->pipe_atime = rpipe->pipe_mtime = rpipe->pipe_ctime; wpipe = &pp->pp_wpipe; bzero(wpipe, sizeof(*wpipe)); wpipe->pipe_ctime = rpipe->pipe_ctime; wpipe->pipe_atime = wpipe->pipe_mtime = rpipe->pipe_ctime; rpipe->pipe_peer = wpipe; rpipe->pipe_pair = pp; wpipe->pipe_peer = rpipe; wpipe->pipe_pair = pp; /* * Mark both endpoints as present; they will later get free'd * one at a time. When both are free'd, then the whole pair * is released. */ rpipe->pipe_present = 1; wpipe->pipe_present = 1; /* * Eventually, the MAC Framework may initialize the label * in ctor or init, but for now we do it elswhere to avoid * blocking in ctor or init. 
*/ pp->pp_label = NULL; return (0); } static int pipe_zone_init(void *mem, int size, int flags) { struct pipepair *pp; KASSERT(size == sizeof(*pp), ("pipe_zone_init: wrong size")); pp = (struct pipepair *)mem; mtx_init(&pp->pp_mtx, "pipe mutex", NULL, MTX_DEF | MTX_RECURSE); return (0); } static void pipe_zone_fini(void *mem, int size) { struct pipepair *pp; KASSERT(size == sizeof(*pp), ("pipe_zone_fini: wrong size")); pp = (struct pipepair *)mem; mtx_destroy(&pp->pp_mtx); } /* * The pipe system call for the DTYPE_PIPE type of pipes. If we fail, let * the zone pick up the pieces via pipeclose(). */ /* ARGSUSED */ int pipe(td, uap) struct thread *td; struct pipe_args /* { int dummy; } */ *uap; { struct filedesc *fdp = td->td_proc->p_fd; struct file *rf, *wf; struct pipepair *pp; struct pipe *rpipe, *wpipe; int fd, error; pp = uma_zalloc(pipe_zone, M_WAITOK); #ifdef MAC /* * The MAC label is shared between the connected endpoints. As a * result mac_pipe_init() and mac_pipe_create() are called once * for the pair, and not on the endpoints. */ mac_pipe_init(pp); mac_pipe_create(td->td_ucred, pp); #endif rpipe = &pp->pp_rpipe; wpipe = &pp->pp_wpipe; knlist_init(&rpipe->pipe_sel.si_note, PIPE_MTX(rpipe), NULL, NULL, NULL); knlist_init(&wpipe->pipe_sel.si_note, PIPE_MTX(wpipe), NULL, NULL, NULL); /* Only the forward direction pipe is backed by default */ if ((error = pipe_create(rpipe, 1)) != 0 || (error = pipe_create(wpipe, 0)) != 0) { pipeclose(rpipe); pipeclose(wpipe); return (error); } rpipe->pipe_state |= PIPE_DIRECTOK; wpipe->pipe_state |= PIPE_DIRECTOK; error = falloc(td, &rf, &fd); if (error) { pipeclose(rpipe); pipeclose(wpipe); return (error); } /* An extra reference on `rf' has been held for us by falloc(). */ td->td_retval[0] = fd; /* * Warning: once we've gotten past allocation of the fd for the * read-side, we can only drop the read side via fdrop() in order * to avoid races against processes which manage to dup() the read * side while we are blocked trying to allocate the write side. */ - FILE_LOCK(rf); - rf->f_flag = FREAD | FWRITE; - rf->f_type = DTYPE_PIPE; - rf->f_data = rpipe; - rf->f_ops = &pipeops; - FILE_UNLOCK(rf); + finit(rf, FREAD | FWRITE, DTYPE_PIPE, rpipe, &pipeops); error = falloc(td, &wf, &fd); if (error) { fdclose(fdp, rf, td->td_retval[0], td); fdrop(rf, td); /* rpipe has been closed by fdrop(). */ pipeclose(wpipe); return (error); } /* An extra reference on `wf' has been held for us by falloc(). */ - FILE_LOCK(wf); - wf->f_flag = FREAD | FWRITE; - wf->f_type = DTYPE_PIPE; - wf->f_data = wpipe; - wf->f_ops = &pipeops; - FILE_UNLOCK(wf); + finit(wf, FREAD | FWRITE, DTYPE_PIPE, wpipe, &pipeops); fdrop(wf, td); td->td_retval[1] = fd; fdrop(rf, td); return (0); } /* * Allocate kva for pipe circular buffer, the space is pageable * This routine will 'realloc' the size of a pipe safely, if it fails * it will retain the old buffer. * If it fails it will return ENOMEM. 
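
/*
 * Illustrative sketch only: minimal userland use of the pipe() system call
 * implemented above.  Note that finit() above marks both descriptors
 * FREAD|FWRITE, although only the conventional forward direction (write on
 * fd[1], read on fd[0]) is given buffer backing up front.
 */
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
        int fd[2];
        char buf[16];

        if (pipe(fd) == -1) {
                perror("pipe");
                return (1);
        }
        write(fd[1], "hello", 5);       /* forward direction */
        read(fd[0], buf, sizeof(buf));
        buf[5] = '\0';
        printf("%s\n", buf);
        close(fd[0]);
        close(fd[1]);
        return (0);
}
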
*/ static int pipespace_new(cpipe, size) struct pipe *cpipe; int size; { caddr_t buffer; int error, cnt, firstseg; static int curfail = 0; static struct timeval lastfail; KASSERT(!mtx_owned(PIPE_MTX(cpipe)), ("pipespace: pipe mutex locked")); KASSERT(!(cpipe->pipe_state & PIPE_DIRECTW), ("pipespace: resize of direct writes not allowed")); retry: cnt = cpipe->pipe_buffer.cnt; if (cnt > size) size = cnt; size = round_page(size); buffer = (caddr_t) vm_map_min(pipe_map); error = vm_map_find(pipe_map, NULL, 0, (vm_offset_t *) &buffer, size, 1, VM_PROT_ALL, VM_PROT_ALL, 0); if (error != KERN_SUCCESS) { if ((cpipe->pipe_buffer.buffer == NULL) && (size > SMALL_PIPE_SIZE)) { size = SMALL_PIPE_SIZE; pipefragretry++; goto retry; } if (cpipe->pipe_buffer.buffer == NULL) { pipeallocfail++; if (ppsratecheck(&lastfail, &curfail, 1)) printf("kern.ipc.maxpipekva exceeded; see tuning(7)\n"); } else { piperesizefail++; } return (ENOMEM); } /* copy data, then free old resources if we're resizing */ if (cnt > 0) { if (cpipe->pipe_buffer.in <= cpipe->pipe_buffer.out) { firstseg = cpipe->pipe_buffer.size - cpipe->pipe_buffer.out; bcopy(&cpipe->pipe_buffer.buffer[cpipe->pipe_buffer.out], buffer, firstseg); if ((cnt - firstseg) > 0) bcopy(cpipe->pipe_buffer.buffer, &buffer[firstseg], cpipe->pipe_buffer.in); } else { bcopy(&cpipe->pipe_buffer.buffer[cpipe->pipe_buffer.out], buffer, cnt); } } pipe_free_kmem(cpipe); cpipe->pipe_buffer.buffer = buffer; cpipe->pipe_buffer.size = size; cpipe->pipe_buffer.in = cnt; cpipe->pipe_buffer.out = 0; cpipe->pipe_buffer.cnt = cnt; atomic_add_int(&amountpipekva, cpipe->pipe_buffer.size); return (0); } /* * Wrapper for pipespace_new() that performs locking assertions. */ static int pipespace(cpipe, size) struct pipe *cpipe; int size; { KASSERT(cpipe->pipe_state & PIPE_LOCKFL, ("Unlocked pipe passed to pipespace")); return (pipespace_new(cpipe, size)); } /* * lock a pipe for I/O, blocking other access */ static __inline int pipelock(cpipe, catch) struct pipe *cpipe; int catch; { int error; PIPE_LOCK_ASSERT(cpipe, MA_OWNED); while (cpipe->pipe_state & PIPE_LOCKFL) { cpipe->pipe_state |= PIPE_LWANT; error = msleep(cpipe, PIPE_MTX(cpipe), catch ? (PRIBIO | PCATCH) : PRIBIO, "pipelk", 0); if (error != 0) return (error); } cpipe->pipe_state |= PIPE_LOCKFL; return (0); } /* * unlock a pipe I/O lock */ static __inline void pipeunlock(cpipe) struct pipe *cpipe; { PIPE_LOCK_ASSERT(cpipe, MA_OWNED); KASSERT(cpipe->pipe_state & PIPE_LOCKFL, ("Unlocked pipe passed to pipeunlock")); cpipe->pipe_state &= ~PIPE_LOCKFL; if (cpipe->pipe_state & PIPE_LWANT) { cpipe->pipe_state &= ~PIPE_LWANT; wakeup(cpipe); } } static __inline void pipeselwakeup(cpipe) struct pipe *cpipe; { PIPE_LOCK_ASSERT(cpipe, MA_OWNED); if (cpipe->pipe_state & PIPE_SEL) { selwakeuppri(&cpipe->pipe_sel, PSOCK); if (!SEL_WAITING(&cpipe->pipe_sel)) cpipe->pipe_state &= ~PIPE_SEL; } if ((cpipe->pipe_state & PIPE_ASYNC) && cpipe->pipe_sigio) pgsigio(&cpipe->pipe_sigio, SIGIO, 0); KNOTE_LOCKED(&cpipe->pipe_sel.si_note, 0); } /* * Initialize and allocate VM and memory for pipe. The structure * will start out zero'd from the ctor, so we just manage the kmem. */ static int pipe_create(pipe, backing) struct pipe *pipe; int backing; { int error; if (backing) { if (amountpipekva > maxpipekva / 2) error = pipespace_new(pipe, SMALL_PIPE_SIZE); else error = pipespace_new(pipe, PIPE_SIZE); } else { /* If we're not backing this pipe, no need to do anything. 
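
/*
 * Illustrative sketch only: the locking pattern pipe_read() and pipe_write()
 * below follow with pipelock()/pipeunlock() above.  The mutex merely guards
 * the PIPE_LOCKFL flag; it is dropped around the data copy because a mutex
 * may not be held across uiomove().  do_copy() is a hypothetical stand-in
 * for the uiomove() calls in the real I/O paths.
 */
static int do_copy(struct pipe *cpipe, struct uio *uio);    /* hypothetical */

static int
example_pipe_io(struct pipe *cpipe, struct uio *uio)
{
        int error;

        PIPE_LOCK(cpipe);
        error = pipelock(cpipe, 1);     /* may sleep, catches signals */
        if (error == 0) {
                PIPE_UNLOCK(cpipe);     /* drop the mutex across the copy */
                error = do_copy(cpipe, uio);
                PIPE_LOCK(cpipe);
                pipeunlock(cpipe);      /* wakes any PIPE_LWANT waiter */
        }
        PIPE_UNLOCK(cpipe);
        return (error);
}
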
*/ error = 0; } return (error); } /* ARGSUSED */ static int pipe_read(fp, uio, active_cred, flags, td) struct file *fp; struct uio *uio; struct ucred *active_cred; struct thread *td; int flags; { struct pipe *rpipe = fp->f_data; int error; int nread = 0; u_int size; PIPE_LOCK(rpipe); ++rpipe->pipe_busy; error = pipelock(rpipe, 1); if (error) goto unlocked_error; #ifdef MAC error = mac_pipe_check_read(active_cred, rpipe->pipe_pair); if (error) goto locked_error; #endif if (amountpipekva > (3 * maxpipekva) / 4) { if (!(rpipe->pipe_state & PIPE_DIRECTW) && (rpipe->pipe_buffer.size > SMALL_PIPE_SIZE) && (rpipe->pipe_buffer.cnt <= SMALL_PIPE_SIZE) && (piperesizeallowed == 1)) { PIPE_UNLOCK(rpipe); pipespace(rpipe, SMALL_PIPE_SIZE); PIPE_LOCK(rpipe); } } while (uio->uio_resid) { /* * normal pipe buffer receive */ if (rpipe->pipe_buffer.cnt > 0) { size = rpipe->pipe_buffer.size - rpipe->pipe_buffer.out; if (size > rpipe->pipe_buffer.cnt) size = rpipe->pipe_buffer.cnt; if (size > (u_int) uio->uio_resid) size = (u_int) uio->uio_resid; PIPE_UNLOCK(rpipe); error = uiomove( &rpipe->pipe_buffer.buffer[rpipe->pipe_buffer.out], size, uio); PIPE_LOCK(rpipe); if (error) break; rpipe->pipe_buffer.out += size; if (rpipe->pipe_buffer.out >= rpipe->pipe_buffer.size) rpipe->pipe_buffer.out = 0; rpipe->pipe_buffer.cnt -= size; /* * If there is no more to read in the pipe, reset * its pointers to the beginning. This improves * cache hit stats. */ if (rpipe->pipe_buffer.cnt == 0) { rpipe->pipe_buffer.in = 0; rpipe->pipe_buffer.out = 0; } nread += size; #ifndef PIPE_NODIRECT /* * Direct copy, bypassing a kernel buffer. */ } else if ((size = rpipe->pipe_map.cnt) && (rpipe->pipe_state & PIPE_DIRECTW)) { if (size > (u_int) uio->uio_resid) size = (u_int) uio->uio_resid; PIPE_UNLOCK(rpipe); error = uiomove_fromphys(rpipe->pipe_map.ms, rpipe->pipe_map.pos, size, uio); PIPE_LOCK(rpipe); if (error) break; nread += size; rpipe->pipe_map.pos += size; rpipe->pipe_map.cnt -= size; if (rpipe->pipe_map.cnt == 0) { rpipe->pipe_state &= ~PIPE_DIRECTW; wakeup(rpipe); } #endif } else { /* * detect EOF condition * read returns 0 on EOF, no need to set error */ if (rpipe->pipe_state & PIPE_EOF) break; /* * If the "write-side" has been blocked, wake it up now. */ if (rpipe->pipe_state & PIPE_WANTW) { rpipe->pipe_state &= ~PIPE_WANTW; wakeup(rpipe); } /* * Break if some data was read. */ if (nread > 0) break; /* * Unlock the pipe buffer for our remaining processing. * We will either break out with an error or we will * sleep and relock to loop. */ pipeunlock(rpipe); /* * Handle non-blocking mode operation or * wait for more data. */ if (fp->f_flag & FNONBLOCK) { error = EAGAIN; } else { rpipe->pipe_state |= PIPE_WANTR; if ((error = msleep(rpipe, PIPE_MTX(rpipe), PRIBIO | PCATCH, "piperd", 0)) == 0) error = pipelock(rpipe, 1); } if (error) goto unlocked_error; } } #ifdef MAC locked_error: #endif pipeunlock(rpipe); /* XXX: should probably do this before getting any locks. */ if (error == 0) vfs_timestamp(&rpipe->pipe_atime); unlocked_error: --rpipe->pipe_busy; /* * PIPE_WANT processing only makes sense if pipe_busy is 0. */ if ((rpipe->pipe_busy == 0) && (rpipe->pipe_state & PIPE_WANT)) { rpipe->pipe_state &= ~(PIPE_WANT|PIPE_WANTW); wakeup(rpipe); } else if (rpipe->pipe_buffer.cnt < MINPIPESIZE) { /* * Handle write blocking hysteresis. 
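
/*
 * Illustrative sketch only: the userland view of the FNONBLOCK branch in
 * pipe_read() above -- reading an empty pipe fails with EAGAIN instead of
 * sleeping in "piperd".
 */
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
        int fd[2];
        char c;

        pipe(fd);
        fcntl(fd[0], F_SETFL, O_NONBLOCK);
        if (read(fd[0], &c, 1) == -1 && errno == EAGAIN)
                printf("empty pipe: read would block\n");
        return (0);
}
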
*/ if (rpipe->pipe_state & PIPE_WANTW) { rpipe->pipe_state &= ~PIPE_WANTW; wakeup(rpipe); } } if ((rpipe->pipe_buffer.size - rpipe->pipe_buffer.cnt) >= PIPE_BUF) pipeselwakeup(rpipe); PIPE_UNLOCK(rpipe); return (error); } #ifndef PIPE_NODIRECT /* * Map the sending processes' buffer into kernel space and wire it. * This is similar to a physical write operation. */ static int pipe_build_write_buffer(wpipe, uio) struct pipe *wpipe; struct uio *uio; { pmap_t pmap; u_int size; int i, j; vm_offset_t addr, endaddr; PIPE_LOCK_ASSERT(wpipe, MA_NOTOWNED); KASSERT(wpipe->pipe_state & PIPE_DIRECTW, ("Clone attempt on non-direct write pipe!")); size = (u_int) uio->uio_iov->iov_len; if (size > wpipe->pipe_buffer.size) size = wpipe->pipe_buffer.size; pmap = vmspace_pmap(curproc->p_vmspace); endaddr = round_page((vm_offset_t)uio->uio_iov->iov_base + size); addr = trunc_page((vm_offset_t)uio->uio_iov->iov_base); for (i = 0; addr < endaddr; addr += PAGE_SIZE, i++) { /* * vm_fault_quick() can sleep. Consequently, * vm_page_lock_queue() and vm_page_unlock_queue() * should not be performed outside of this loop. */ race: if (vm_fault_quick((caddr_t)addr, VM_PROT_READ) < 0) { vm_page_lock_queues(); for (j = 0; j < i; j++) vm_page_unhold(wpipe->pipe_map.ms[j]); vm_page_unlock_queues(); return (EFAULT); } wpipe->pipe_map.ms[i] = pmap_extract_and_hold(pmap, addr, VM_PROT_READ); if (wpipe->pipe_map.ms[i] == NULL) goto race; } /* * set up the control block */ wpipe->pipe_map.npages = i; wpipe->pipe_map.pos = ((vm_offset_t) uio->uio_iov->iov_base) & PAGE_MASK; wpipe->pipe_map.cnt = size; /* * and update the uio data */ uio->uio_iov->iov_len -= size; uio->uio_iov->iov_base = (char *)uio->uio_iov->iov_base + size; if (uio->uio_iov->iov_len == 0) uio->uio_iov++; uio->uio_resid -= size; uio->uio_offset += size; return (0); } /* * unmap and unwire the process buffer */ static void pipe_destroy_write_buffer(wpipe) struct pipe *wpipe; { int i; PIPE_LOCK_ASSERT(wpipe, MA_OWNED); vm_page_lock_queues(); for (i = 0; i < wpipe->pipe_map.npages; i++) { vm_page_unhold(wpipe->pipe_map.ms[i]); } vm_page_unlock_queues(); wpipe->pipe_map.npages = 0; } /* * In the case of a signal, the writing process might go away. This * code copies the data into the circular buffer so that the source * pages can be freed without loss of data. */ static void pipe_clone_write_buffer(wpipe) struct pipe *wpipe; { struct uio uio; struct iovec iov; int size; int pos; PIPE_LOCK_ASSERT(wpipe, MA_OWNED); size = wpipe->pipe_map.cnt; pos = wpipe->pipe_map.pos; wpipe->pipe_buffer.in = size; wpipe->pipe_buffer.out = 0; wpipe->pipe_buffer.cnt = size; wpipe->pipe_state &= ~PIPE_DIRECTW; PIPE_UNLOCK(wpipe); iov.iov_base = wpipe->pipe_buffer.buffer; iov.iov_len = size; uio.uio_iov = &iov; uio.uio_iovcnt = 1; uio.uio_offset = 0; uio.uio_resid = size; uio.uio_segflg = UIO_SYSSPACE; uio.uio_rw = UIO_READ; uio.uio_td = curthread; uiomove_fromphys(wpipe->pipe_map.ms, pos, size, &uio); PIPE_LOCK(wpipe); pipe_destroy_write_buffer(wpipe); } /* * This implements the pipe buffer write mechanism. Note that only * a direct write OR a normal pipe write can be pending at any given time. * If there are any characters in the pipe buffer, the direct write will * be deferred until the receiving process grabs all of the bytes from * the pipe buffer. Then the direct mapping write is set-up. 
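
/*
 * Illustrative sketch only: a userland write large enough that pipe_write()
 * below may take the direct-write path described above -- a blocking write
 * from user space of at least PIPE_MINDIRECT bytes (defined in <sys/pipe.h>).
 * Whether the direct path is actually used also depends on the pipe buffer
 * size and current state.
 */
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
        int fd[2];
        char *big;
        size_t len = 64 * 1024;         /* comfortably above PIPE_MINDIRECT */

        pipe(fd);
        big = malloc(len);
        memset(big, 'x', len);
        if (fork() == 0) {              /* child drains the pipe */
                char buf[4096];

                close(fd[1]);           /* so read() eventually sees EOF */
                while (read(fd[0], buf, sizeof(buf)) > 0)
                        ;
                _exit(0);
        }
        close(fd[0]);
        write(fd[1], big, len);         /* candidate for the direct path */
        close(fd[1]);
        return (0);
}
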
*/ static int pipe_direct_write(wpipe, uio) struct pipe *wpipe; struct uio *uio; { int error; retry: PIPE_LOCK_ASSERT(wpipe, MA_OWNED); error = pipelock(wpipe, 1); if (wpipe->pipe_state & PIPE_EOF) error = EPIPE; if (error) { pipeunlock(wpipe); goto error1; } while (wpipe->pipe_state & PIPE_DIRECTW) { if (wpipe->pipe_state & PIPE_WANTR) { wpipe->pipe_state &= ~PIPE_WANTR; wakeup(wpipe); } pipeselwakeup(wpipe); wpipe->pipe_state |= PIPE_WANTW; pipeunlock(wpipe); error = msleep(wpipe, PIPE_MTX(wpipe), PRIBIO | PCATCH, "pipdww", 0); if (error) goto error1; else goto retry; } wpipe->pipe_map.cnt = 0; /* transfer not ready yet */ if (wpipe->pipe_buffer.cnt > 0) { if (wpipe->pipe_state & PIPE_WANTR) { wpipe->pipe_state &= ~PIPE_WANTR; wakeup(wpipe); } pipeselwakeup(wpipe); wpipe->pipe_state |= PIPE_WANTW; pipeunlock(wpipe); error = msleep(wpipe, PIPE_MTX(wpipe), PRIBIO | PCATCH, "pipdwc", 0); if (error) goto error1; else goto retry; } wpipe->pipe_state |= PIPE_DIRECTW; PIPE_UNLOCK(wpipe); error = pipe_build_write_buffer(wpipe, uio); PIPE_LOCK(wpipe); if (error) { wpipe->pipe_state &= ~PIPE_DIRECTW; pipeunlock(wpipe); goto error1; } error = 0; while (!error && (wpipe->pipe_state & PIPE_DIRECTW)) { if (wpipe->pipe_state & PIPE_EOF) { pipe_destroy_write_buffer(wpipe); pipeselwakeup(wpipe); pipeunlock(wpipe); error = EPIPE; goto error1; } if (wpipe->pipe_state & PIPE_WANTR) { wpipe->pipe_state &= ~PIPE_WANTR; wakeup(wpipe); } pipeselwakeup(wpipe); pipeunlock(wpipe); error = msleep(wpipe, PIPE_MTX(wpipe), PRIBIO | PCATCH, "pipdwt", 0); pipelock(wpipe, 0); } if (wpipe->pipe_state & PIPE_EOF) error = EPIPE; if (wpipe->pipe_state & PIPE_DIRECTW) { /* * this bit of trickery substitutes a kernel buffer for * the process that might be going away. */ pipe_clone_write_buffer(wpipe); } else { pipe_destroy_write_buffer(wpipe); } pipeunlock(wpipe); return (error); error1: wakeup(wpipe); return (error); } #endif static int pipe_write(fp, uio, active_cred, flags, td) struct file *fp; struct uio *uio; struct ucred *active_cred; struct thread *td; int flags; { int error = 0; int desiredsize, orig_resid; struct pipe *wpipe, *rpipe; rpipe = fp->f_data; wpipe = rpipe->pipe_peer; PIPE_LOCK(rpipe); error = pipelock(wpipe, 1); if (error) { PIPE_UNLOCK(rpipe); return (error); } /* * detect loss of pipe read side, issue SIGPIPE if lost. 
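
/*
 * Illustrative sketch only: the userland view of the check for a vanished
 * read side made at the top of pipe_write().  With SIGPIPE ignored, writing
 * after the read side is gone fails with EPIPE instead of killing the
 * process.
 */
#include <errno.h>
#include <signal.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
        int fd[2];

        signal(SIGPIPE, SIG_IGN);       /* turn the signal into an errno */
        pipe(fd);
        close(fd[0]);                   /* drop the read side */
        if (write(fd[1], "x", 1) == -1 && errno == EPIPE)
                printf("write failed with EPIPE as expected\n");
        return (0);
}
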
*/ if ((!wpipe->pipe_present) || (wpipe->pipe_state & PIPE_EOF)) { pipeunlock(wpipe); PIPE_UNLOCK(rpipe); return (EPIPE); } #ifdef MAC error = mac_pipe_check_write(active_cred, wpipe->pipe_pair); if (error) { pipeunlock(wpipe); PIPE_UNLOCK(rpipe); return (error); } #endif ++wpipe->pipe_busy; /* Choose a larger size if it's advantageous */ desiredsize = max(SMALL_PIPE_SIZE, wpipe->pipe_buffer.size); while (desiredsize < wpipe->pipe_buffer.cnt + uio->uio_resid) { if (piperesizeallowed != 1) break; if (amountpipekva > maxpipekva / 2) break; if (desiredsize == BIG_PIPE_SIZE) break; desiredsize = desiredsize * 2; } /* Choose a smaller size if we're in a OOM situation */ if ((amountpipekva > (3 * maxpipekva) / 4) && (wpipe->pipe_buffer.size > SMALL_PIPE_SIZE) && (wpipe->pipe_buffer.cnt <= SMALL_PIPE_SIZE) && (piperesizeallowed == 1)) desiredsize = SMALL_PIPE_SIZE; /* Resize if the above determined that a new size was necessary */ if ((desiredsize != wpipe->pipe_buffer.size) && ((wpipe->pipe_state & PIPE_DIRECTW) == 0)) { PIPE_UNLOCK(wpipe); pipespace(wpipe, desiredsize); PIPE_LOCK(wpipe); } if (wpipe->pipe_buffer.size == 0) { /* * This can only happen for reverse direction use of pipes * in a complete OOM situation. */ error = ENOMEM; --wpipe->pipe_busy; pipeunlock(wpipe); PIPE_UNLOCK(wpipe); return (error); } pipeunlock(wpipe); orig_resid = uio->uio_resid; while (uio->uio_resid) { int space; pipelock(wpipe, 0); if (wpipe->pipe_state & PIPE_EOF) { pipeunlock(wpipe); error = EPIPE; break; } #ifndef PIPE_NODIRECT /* * If the transfer is large, we can gain performance if * we do process-to-process copies directly. * If the write is non-blocking, we don't use the * direct write mechanism. * * The direct write mechanism will detect the reader going * away on us. */ if (uio->uio_segflg == UIO_USERSPACE && uio->uio_iov->iov_len >= PIPE_MINDIRECT && wpipe->pipe_buffer.size >= PIPE_MINDIRECT && (fp->f_flag & FNONBLOCK) == 0) { pipeunlock(wpipe); error = pipe_direct_write(wpipe, uio); if (error) break; continue; } #endif /* * Pipe buffered writes cannot be coincidental with * direct writes. We wait until the currently executing * direct write is completed before we start filling the * pipe buffer. We break out if a signal occurs or the * reader goes away. */ if (wpipe->pipe_state & PIPE_DIRECTW) { if (wpipe->pipe_state & PIPE_WANTR) { wpipe->pipe_state &= ~PIPE_WANTR; wakeup(wpipe); } pipeselwakeup(wpipe); wpipe->pipe_state |= PIPE_WANTW; pipeunlock(wpipe); error = msleep(wpipe, PIPE_MTX(rpipe), PRIBIO | PCATCH, "pipbww", 0); if (error) break; else continue; } space = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt; /* Writes of size <= PIPE_BUF must be atomic. */ if ((space < uio->uio_resid) && (orig_resid <= PIPE_BUF)) space = 0; if (space > 0) { int size; /* Transfer size */ int segsize; /* first segment to transfer */ /* * Transfer size is minimum of uio transfer * and free space in pipe buffer. */ if (space > uio->uio_resid) size = uio->uio_resid; else size = space; /* * First segment to transfer is minimum of * transfer size and contiguous space in * pipe buffer. If first segment to transfer * is less than the transfer size, we've got * a wraparound in the buffer. 
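
/*
 * Illustrative sketch only: the wraparound arithmetic described above as a
 * standalone helper.  For example, with a 16384-byte buffer, in == 16000 and
 * a 1000-byte transfer, the first segment is 384 bytes at the end of the
 * buffer and the remaining 616 bytes land at offset 0.
 */
static void
example_split_transfer(int bufsize, int in, int size,
    int *firstseg, int *secondseg)
{
        *firstseg = bufsize - in;       /* contiguous space up to the end */
        if (*firstseg > size)
                *firstseg = size;       /* no wraparound needed */
        *secondseg = size - *firstseg;  /* copied to the start of the buffer */
}
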
*/ segsize = wpipe->pipe_buffer.size - wpipe->pipe_buffer.in; if (segsize > size) segsize = size; /* Transfer first segment */ PIPE_UNLOCK(rpipe); error = uiomove(&wpipe->pipe_buffer.buffer[wpipe->pipe_buffer.in], segsize, uio); PIPE_LOCK(rpipe); if (error == 0 && segsize < size) { KASSERT(wpipe->pipe_buffer.in + segsize == wpipe->pipe_buffer.size, ("Pipe buffer wraparound disappeared")); /* * Transfer remaining part now, to * support atomic writes. Wraparound * happened. */ PIPE_UNLOCK(rpipe); error = uiomove( &wpipe->pipe_buffer.buffer[0], size - segsize, uio); PIPE_LOCK(rpipe); } if (error == 0) { wpipe->pipe_buffer.in += size; if (wpipe->pipe_buffer.in >= wpipe->pipe_buffer.size) { KASSERT(wpipe->pipe_buffer.in == size - segsize + wpipe->pipe_buffer.size, ("Expected wraparound bad")); wpipe->pipe_buffer.in = size - segsize; } wpipe->pipe_buffer.cnt += size; KASSERT(wpipe->pipe_buffer.cnt <= wpipe->pipe_buffer.size, ("Pipe buffer overflow")); } pipeunlock(wpipe); if (error != 0) break; } else { /* * If the "read-side" has been blocked, wake it up now. */ if (wpipe->pipe_state & PIPE_WANTR) { wpipe->pipe_state &= ~PIPE_WANTR; wakeup(wpipe); } /* * don't block on non-blocking I/O */ if (fp->f_flag & FNONBLOCK) { error = EAGAIN; pipeunlock(wpipe); break; } /* * We have no more space and have something to offer, * wake up select/poll. */ pipeselwakeup(wpipe); wpipe->pipe_state |= PIPE_WANTW; pipeunlock(wpipe); error = msleep(wpipe, PIPE_MTX(rpipe), PRIBIO | PCATCH, "pipewr", 0); if (error != 0) break; } } pipelock(wpipe, 0); --wpipe->pipe_busy; if ((wpipe->pipe_busy == 0) && (wpipe->pipe_state & PIPE_WANT)) { wpipe->pipe_state &= ~(PIPE_WANT | PIPE_WANTR); wakeup(wpipe); } else if (wpipe->pipe_buffer.cnt > 0) { /* * If we have put any characters in the buffer, we wake up * the reader. */ if (wpipe->pipe_state & PIPE_WANTR) { wpipe->pipe_state &= ~PIPE_WANTR; wakeup(wpipe); } } /* * Don't return EPIPE if I/O was successful */ if ((wpipe->pipe_buffer.cnt == 0) && (uio->uio_resid == 0) && (error == EPIPE)) { error = 0; } if (error == 0) vfs_timestamp(&wpipe->pipe_mtime); /* * We have something to offer, * wake up select/poll. */ if (wpipe->pipe_buffer.cnt) pipeselwakeup(wpipe); pipeunlock(wpipe); PIPE_UNLOCK(rpipe); return (error); } /* * we implement a very minimal set of ioctls for compatibility with sockets. */ static int pipe_ioctl(fp, cmd, data, active_cred, td) struct file *fp; u_long cmd; void *data; struct ucred *active_cred; struct thread *td; { struct pipe *mpipe = fp->f_data; int error; PIPE_LOCK(mpipe); #ifdef MAC error = mac_pipe_check_ioctl(active_cred, mpipe->pipe_pair, cmd, data); if (error) { PIPE_UNLOCK(mpipe); return (error); } #endif error = 0; switch (cmd) { case FIONBIO: break; case FIOASYNC: if (*(int *)data) { mpipe->pipe_state |= PIPE_ASYNC; } else { mpipe->pipe_state &= ~PIPE_ASYNC; } break; case FIONREAD: if (mpipe->pipe_state & PIPE_DIRECTW) *(int *)data = mpipe->pipe_map.cnt; else *(int *)data = mpipe->pipe_buffer.cnt; break; case FIOSETOWN: PIPE_UNLOCK(mpipe); error = fsetown(*(int *)data, &mpipe->pipe_sigio); goto out_unlocked; case FIOGETOWN: *(int *)data = fgetown(&mpipe->pipe_sigio); break; /* This is deprecated, FIOSETOWN should be used instead. */ case TIOCSPGRP: PIPE_UNLOCK(mpipe); error = fsetown(-(*(int *)data), &mpipe->pipe_sigio); goto out_unlocked; /* This is deprecated, FIOGETOWN should be used instead. 
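
/*
 * Illustrative sketch only: exercising the FIONREAD case of pipe_ioctl()
 * above from userland to see how many bytes are buffered in the pipe.
 */
#include <sys/ioctl.h>
#include <sys/filio.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
        int fd[2], nread;

        pipe(fd);
        write(fd[1], "hello", 5);
        ioctl(fd[0], FIONREAD, &nread);
        printf("%d bytes ready to read\n", nread);      /* prints 5 */
        return (0);
}
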
*/ case TIOCGPGRP: *(int *)data = -fgetown(&mpipe->pipe_sigio); break; default: error = ENOTTY; break; } PIPE_UNLOCK(mpipe); out_unlocked: return (error); } static int pipe_poll(fp, events, active_cred, td) struct file *fp; int events; struct ucred *active_cred; struct thread *td; { struct pipe *rpipe = fp->f_data; struct pipe *wpipe; int revents = 0; #ifdef MAC int error; #endif wpipe = rpipe->pipe_peer; PIPE_LOCK(rpipe); #ifdef MAC error = mac_pipe_check_poll(active_cred, rpipe->pipe_pair); if (error) goto locked_error; #endif if (events & (POLLIN | POLLRDNORM)) if ((rpipe->pipe_state & PIPE_DIRECTW) || (rpipe->pipe_buffer.cnt > 0) || (rpipe->pipe_state & PIPE_EOF)) revents |= events & (POLLIN | POLLRDNORM); if (events & (POLLOUT | POLLWRNORM)) if (!wpipe->pipe_present || (wpipe->pipe_state & PIPE_EOF) || (((wpipe->pipe_state & PIPE_DIRECTW) == 0) && (wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt) >= PIPE_BUF)) revents |= events & (POLLOUT | POLLWRNORM); if ((rpipe->pipe_state & PIPE_EOF) || (!wpipe->pipe_present) || (wpipe->pipe_state & PIPE_EOF)) revents |= POLLHUP; if (revents == 0) { if (events & (POLLIN | POLLRDNORM)) { selrecord(td, &rpipe->pipe_sel); if (SEL_WAITING(&rpipe->pipe_sel)) rpipe->pipe_state |= PIPE_SEL; } if (events & (POLLOUT | POLLWRNORM)) { selrecord(td, &wpipe->pipe_sel); if (SEL_WAITING(&wpipe->pipe_sel)) wpipe->pipe_state |= PIPE_SEL; } } #ifdef MAC locked_error: #endif PIPE_UNLOCK(rpipe); return (revents); } /* * We shouldn't need locks here as we're doing a read and this should * be a natural race. */ static int pipe_stat(fp, ub, active_cred, td) struct file *fp; struct stat *ub; struct ucred *active_cred; struct thread *td; { struct pipe *pipe = fp->f_data; #ifdef MAC int error; PIPE_LOCK(pipe); error = mac_pipe_check_stat(active_cred, pipe->pipe_pair); PIPE_UNLOCK(pipe); if (error) return (error); #endif bzero(ub, sizeof(*ub)); ub->st_mode = S_IFIFO; ub->st_blksize = PAGE_SIZE; if (pipe->pipe_state & PIPE_DIRECTW) ub->st_size = pipe->pipe_map.cnt; else ub->st_size = pipe->pipe_buffer.cnt; ub->st_blocks = (ub->st_size + ub->st_blksize - 1) / ub->st_blksize; ub->st_atimespec = pipe->pipe_atime; ub->st_mtimespec = pipe->pipe_mtime; ub->st_ctimespec = pipe->pipe_ctime; ub->st_uid = fp->f_cred->cr_uid; ub->st_gid = fp->f_cred->cr_gid; /* * Left as 0: st_dev, st_ino, st_nlink, st_rdev, st_flags, st_gen. * XXX (st_dev, st_ino) should be unique. */ return (0); } /* ARGSUSED */ static int pipe_close(fp, td) struct file *fp; struct thread *td; { struct pipe *cpipe = fp->f_data; fp->f_ops = &badfileops; fp->f_data = NULL; funsetown(&cpipe->pipe_sigio); pipeclose(cpipe); return (0); } static void pipe_free_kmem(cpipe) struct pipe *cpipe; { KASSERT(!mtx_owned(PIPE_MTX(cpipe)), ("pipe_free_kmem: pipe mutex locked")); if (cpipe->pipe_buffer.buffer != NULL) { atomic_subtract_int(&amountpipekva, cpipe->pipe_buffer.size); vm_map_remove(pipe_map, (vm_offset_t)cpipe->pipe_buffer.buffer, (vm_offset_t)cpipe->pipe_buffer.buffer + cpipe->pipe_buffer.size); cpipe->pipe_buffer.buffer = NULL; } #ifndef PIPE_NODIRECT { cpipe->pipe_map.cnt = 0; cpipe->pipe_map.pos = 0; cpipe->pipe_map.npages = 0; } #endif } /* * shutdown the pipe */ static void pipeclose(cpipe) struct pipe *cpipe; { struct pipepair *pp; struct pipe *ppipe; KASSERT(cpipe != NULL, ("pipeclose: cpipe == NULL")); PIPE_LOCK(cpipe); pipelock(cpipe, 0); pp = cpipe->pipe_pair; pipeselwakeup(cpipe); /* * If the other side is blocked, wake it up saying that * we want to close it down. 
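
/*
 * Illustrative sketch only: the userland counterpart of pipe_poll() above.
 * POLLIN is reported once data is buffered and POLLHUP once the write side
 * goes away.
 */
#include <poll.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
        struct pollfd pfd;
        int fd[2];

        pipe(fd);
        write(fd[1], "x", 1);
        pfd.fd = fd[0];
        pfd.events = POLLIN;
        if (poll(&pfd, 1, 1000) > 0 && (pfd.revents & POLLIN))
                printf("pipe is readable\n");
        close(fd[1]);                   /* writer goes away ... */
        poll(&pfd, 1, 1000);
        if (pfd.revents & POLLHUP)      /* ... and pipe_poll() reports POLLHUP */
                printf("writer closed\n");
        return (0);
}
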
*/ cpipe->pipe_state |= PIPE_EOF; while (cpipe->pipe_busy) { wakeup(cpipe); cpipe->pipe_state |= PIPE_WANT; pipeunlock(cpipe); msleep(cpipe, PIPE_MTX(cpipe), PRIBIO, "pipecl", 0); pipelock(cpipe, 0); } /* * Disconnect from peer, if any. */ ppipe = cpipe->pipe_peer; if (ppipe->pipe_present != 0) { pipeselwakeup(ppipe); ppipe->pipe_state |= PIPE_EOF; wakeup(ppipe); KNOTE_LOCKED(&ppipe->pipe_sel.si_note, 0); } /* * Mark this endpoint as free. Release kmem resources. We * don't mark this endpoint as unused until we've finished * doing that, or the pipe might disappear out from under * us. */ PIPE_UNLOCK(cpipe); pipe_free_kmem(cpipe); PIPE_LOCK(cpipe); cpipe->pipe_present = 0; pipeunlock(cpipe); knlist_clear(&cpipe->pipe_sel.si_note, 1); knlist_destroy(&cpipe->pipe_sel.si_note); /* * If both endpoints are now closed, release the memory for the * pipe pair. If not, unlock. */ if (ppipe->pipe_present == 0) { PIPE_UNLOCK(cpipe); #ifdef MAC mac_pipe_destroy(pp); #endif uma_zfree(pipe_zone, cpipe->pipe_pair); } else PIPE_UNLOCK(cpipe); } /*ARGSUSED*/ static int pipe_kqfilter(struct file *fp, struct knote *kn) { struct pipe *cpipe; cpipe = kn->kn_fp->f_data; PIPE_LOCK(cpipe); switch (kn->kn_filter) { case EVFILT_READ: kn->kn_fop = &pipe_rfiltops; break; case EVFILT_WRITE: kn->kn_fop = &pipe_wfiltops; if (!cpipe->pipe_peer->pipe_present) { /* other end of pipe has been closed */ PIPE_UNLOCK(cpipe); return (EPIPE); } cpipe = cpipe->pipe_peer; break; default: PIPE_UNLOCK(cpipe); return (EINVAL); } knlist_add(&cpipe->pipe_sel.si_note, kn, 1); PIPE_UNLOCK(cpipe); return (0); } static void filt_pipedetach(struct knote *kn) { struct pipe *cpipe = (struct pipe *)kn->kn_fp->f_data; PIPE_LOCK(cpipe); if (kn->kn_filter == EVFILT_WRITE) { if (!cpipe->pipe_peer->pipe_present) { PIPE_UNLOCK(cpipe); return; } cpipe = cpipe->pipe_peer; } knlist_remove(&cpipe->pipe_sel.si_note, kn, 1); PIPE_UNLOCK(cpipe); } /*ARGSUSED*/ static int filt_piperead(struct knote *kn, long hint) { struct pipe *rpipe = kn->kn_fp->f_data; struct pipe *wpipe = rpipe->pipe_peer; int ret; PIPE_LOCK(rpipe); kn->kn_data = rpipe->pipe_buffer.cnt; if ((kn->kn_data == 0) && (rpipe->pipe_state & PIPE_DIRECTW)) kn->kn_data = rpipe->pipe_map.cnt; if ((rpipe->pipe_state & PIPE_EOF) || (!wpipe->pipe_present) || (wpipe->pipe_state & PIPE_EOF)) { kn->kn_flags |= EV_EOF; PIPE_UNLOCK(rpipe); return (1); } ret = kn->kn_data > 0; PIPE_UNLOCK(rpipe); return ret; } /*ARGSUSED*/ static int filt_pipewrite(struct knote *kn, long hint) { struct pipe *rpipe = kn->kn_fp->f_data; struct pipe *wpipe = rpipe->pipe_peer; PIPE_LOCK(rpipe); if ((!wpipe->pipe_present) || (wpipe->pipe_state & PIPE_EOF)) { kn->kn_data = 0; kn->kn_flags |= EV_EOF; PIPE_UNLOCK(rpipe); return (1); } kn->kn_data = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt; if (wpipe->pipe_state & PIPE_DIRECTW) kn->kn_data = 0; PIPE_UNLOCK(rpipe); return (kn->kn_data >= PIPE_BUF); } Index: head/sys/kern/uipc_mqueue.c =================================================================== --- head/sys/kern/uipc_mqueue.c (revision 174987) +++ head/sys/kern/uipc_mqueue.c (revision 174988) @@ -1,2481 +1,2480 @@ /*- * Copyright (c) 2005 David Xu * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ /* * POSIX message queue implementation. * * 1) A mqueue filesystem can be mounted, each message queue appears * in mounted directory, user can change queue's permission and * ownership, or remove a queue. Manually creating a file in the * directory causes a message queue to be created in the kernel with * default message queue attributes applied and same name used, this * method is not advocated since mq_open syscall allows user to specify * different attributes. Also the file system can be mounted multiple * times at different mount points but shows same contents. * * 2) Standard POSIX message queue API. The syscalls do not use vfs layer, * but directly operate on internal data structure, this allows user to * use the IPC facility without having to mount mqueue file system. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Limits and constants */ #define MQFS_NAMELEN NAME_MAX #define MQFS_DELEN (8 + MQFS_NAMELEN) /* node types */ typedef enum { mqfstype_none = 0, mqfstype_root, mqfstype_dir, mqfstype_this, mqfstype_parent, mqfstype_file, mqfstype_symlink, } mqfs_type_t; struct mqfs_node; /* * mqfs_info: describes a mqfs instance */ struct mqfs_info { struct sx mi_lock; struct mqfs_node *mi_root; struct unrhdr *mi_unrhdr; }; struct mqfs_vdata { LIST_ENTRY(mqfs_vdata) mv_link; struct mqfs_node *mv_node; struct vnode *mv_vnode; struct task mv_task; }; /* * mqfs_node: describes a node (file or directory) within a mqfs */ struct mqfs_node { char mn_name[MQFS_NAMELEN+1]; struct mqfs_info *mn_info; struct mqfs_node *mn_parent; LIST_HEAD(,mqfs_node) mn_children; LIST_ENTRY(mqfs_node) mn_sibling; LIST_HEAD(,mqfs_vdata) mn_vnodes; int mn_refcount; mqfs_type_t mn_type; int mn_deleted; u_int32_t mn_fileno; void *mn_data; struct timespec mn_birth; struct timespec mn_ctime; struct timespec mn_atime; struct timespec mn_mtime; uid_t mn_uid; gid_t mn_gid; int mn_mode; }; #define VTON(vp) (((struct mqfs_vdata *)((vp)->v_data))->mv_node) #define VTOMQ(vp) ((struct mqueue *)(VTON(vp)->mn_data)) #define VFSTOMQFS(m) ((struct mqfs_info *)((m)->mnt_data)) #define FPTOMQ(fp) ((struct mqueue *)(((struct mqfs_node *) \ (fp)->f_data)->mn_data)) TAILQ_HEAD(msgq, mqueue_msg); struct mqueue; struct mqueue_notifier { LIST_ENTRY(mqueue_notifier) nt_link; struct sigevent 
nt_sigev; ksiginfo_t nt_ksi; struct proc *nt_proc; }; struct mqueue { struct mtx mq_mutex; int mq_flags; long mq_maxmsg; long mq_msgsize; long mq_curmsgs; long mq_totalbytes; struct msgq mq_msgq; int mq_receivers; int mq_senders; struct selinfo mq_rsel; struct selinfo mq_wsel; struct mqueue_notifier *mq_notifier; }; #define MQ_RSEL 0x01 #define MQ_WSEL 0x02 struct mqueue_msg { TAILQ_ENTRY(mqueue_msg) msg_link; unsigned int msg_prio; unsigned int msg_size; /* following real data... */ }; SYSCTL_NODE(_kern, OID_AUTO, mqueue, CTLFLAG_RW, 0, "POSIX real time message queue"); static int default_maxmsg = 10; static int default_msgsize = 1024; static int maxmsg = 100; SYSCTL_INT(_kern_mqueue, OID_AUTO, maxmsg, CTLFLAG_RW, &maxmsg, 0, "Default maximum messages in queue"); static int maxmsgsize = 16384; SYSCTL_INT(_kern_mqueue, OID_AUTO, maxmsgsize, CTLFLAG_RW, &maxmsgsize, 0, "Default maximum message size"); static int maxmq = 100; SYSCTL_INT(_kern_mqueue, OID_AUTO, maxmq, CTLFLAG_RW, &maxmq, 0, "maximum message queues"); static int curmq = 0; SYSCTL_INT(_kern_mqueue, OID_AUTO, curmq, CTLFLAG_RW, &curmq, 0, "current message queue number"); static int unloadable = 0; static MALLOC_DEFINE(M_MQUEUEDATA, "mqdata", "mqueue data"); static eventhandler_tag exit_tag; /* Only one instance per-system */ static struct mqfs_info mqfs_data; static uma_zone_t mqnode_zone; static uma_zone_t mqueue_zone; static uma_zone_t mvdata_zone; static uma_zone_t mqnoti_zone; static struct vop_vector mqfs_vnodeops; static struct fileops mqueueops; /* * Directory structure construction and manipulation */ #ifdef notyet static struct mqfs_node *mqfs_create_dir(struct mqfs_node *parent, const char *name, int namelen, struct ucred *cred, int mode); static struct mqfs_node *mqfs_create_link(struct mqfs_node *parent, const char *name, int namelen, struct ucred *cred, int mode); #endif static struct mqfs_node *mqfs_create_file(struct mqfs_node *parent, const char *name, int namelen, struct ucred *cred, int mode); static int mqfs_destroy(struct mqfs_node *mn); static void mqfs_fileno_alloc(struct mqfs_info *mi, struct mqfs_node *mn); static void mqfs_fileno_free(struct mqfs_info *mi, struct mqfs_node *mn); static int mqfs_allocv(struct mount *mp, struct vnode **vpp, struct mqfs_node *pn); /* * Message queue construction and maniplation */ static struct mqueue *mqueue_alloc(const struct mq_attr *attr); static void mqueue_free(struct mqueue *mq); static int mqueue_send(struct mqueue *mq, const char *msg_ptr, size_t msg_len, unsigned msg_prio, int waitok, const struct timespec *abs_timeout); static int mqueue_receive(struct mqueue *mq, char *msg_ptr, size_t msg_len, unsigned *msg_prio, int waitok, const struct timespec *abs_timeout); static int _mqueue_send(struct mqueue *mq, struct mqueue_msg *msg, int timo); static int _mqueue_recv(struct mqueue *mq, struct mqueue_msg **msg, int timo); static void mqueue_send_notification(struct mqueue *mq); static void mqueue_fdclose(struct thread *td, int fd, struct file *fp); static void mq_proc_exit(void *arg, struct proc *p); /* * kqueue filters */ static void filt_mqdetach(struct knote *kn); static int filt_mqread(struct knote *kn, long hint); static int filt_mqwrite(struct knote *kn, long hint); struct filterops mq_rfiltops = { 1, NULL, filt_mqdetach, filt_mqread }; struct filterops mq_wfiltops = { 1, NULL, filt_mqdetach, filt_mqwrite }; /* * Initialize fileno bitmap */ static void mqfs_fileno_init(struct mqfs_info *mi) { struct unrhdr *up; up = new_unrhdr(1, INT_MAX, NULL); mi->mi_unrhdr = 
up; } /* * Tear down fileno bitmap */ static void mqfs_fileno_uninit(struct mqfs_info *mi) { struct unrhdr *up; up = mi->mi_unrhdr; mi->mi_unrhdr = NULL; delete_unrhdr(up); } /* * Allocate a file number */ static void mqfs_fileno_alloc(struct mqfs_info *mi, struct mqfs_node *mn) { /* make sure our parent has a file number */ if (mn->mn_parent && !mn->mn_parent->mn_fileno) mqfs_fileno_alloc(mi, mn->mn_parent); switch (mn->mn_type) { case mqfstype_root: case mqfstype_dir: case mqfstype_file: case mqfstype_symlink: mn->mn_fileno = alloc_unr(mi->mi_unrhdr); break; case mqfstype_this: KASSERT(mn->mn_parent != NULL, ("mqfstype_this node has no parent")); mn->mn_fileno = mn->mn_parent->mn_fileno; break; case mqfstype_parent: KASSERT(mn->mn_parent != NULL, ("mqfstype_parent node has no parent")); if (mn->mn_parent == mi->mi_root) { mn->mn_fileno = mn->mn_parent->mn_fileno; break; } KASSERT(mn->mn_parent->mn_parent != NULL, ("mqfstype_parent node has no grandparent")); mn->mn_fileno = mn->mn_parent->mn_parent->mn_fileno; break; default: KASSERT(0, ("mqfs_fileno_alloc() called for unknown type node: %d", mn->mn_type)); break; } } /* * Release a file number */ static void mqfs_fileno_free(struct mqfs_info *mi, struct mqfs_node *mn) { switch (mn->mn_type) { case mqfstype_root: case mqfstype_dir: case mqfstype_file: case mqfstype_symlink: free_unr(mi->mi_unrhdr, mn->mn_fileno); break; case mqfstype_this: case mqfstype_parent: /* ignore these, as they don't "own" their file number */ break; default: KASSERT(0, ("mqfs_fileno_free() called for unknown type node: %d", mn->mn_type)); break; } } static __inline struct mqfs_node * mqnode_alloc(void) { return uma_zalloc(mqnode_zone, M_WAITOK | M_ZERO); } static __inline void mqnode_free(struct mqfs_node *node) { uma_zfree(mqnode_zone, node); } static __inline void mqnode_addref(struct mqfs_node *node) { atomic_fetchadd_int(&node->mn_refcount, 1); } static __inline void mqnode_release(struct mqfs_node *node) { int old, exp; old = atomic_fetchadd_int(&node->mn_refcount, -1); if (node->mn_type == mqfstype_dir || node->mn_type == mqfstype_root) exp = 3; /* include . and .. 
*/ else exp = 1; if (old == exp) mqfs_destroy(node); } /* * Add a node to a directory */ static int mqfs_add_node(struct mqfs_node *parent, struct mqfs_node *node) { KASSERT(parent != NULL, ("%s(): parent is NULL", __func__)); KASSERT(parent->mn_info != NULL, ("%s(): parent has no mn_info", __func__)); KASSERT(parent->mn_type == mqfstype_dir || parent->mn_type == mqfstype_root, ("%s(): parent is not a directory", __func__)); node->mn_info = parent->mn_info; node->mn_parent = parent; LIST_INIT(&node->mn_children); LIST_INIT(&node->mn_vnodes); LIST_INSERT_HEAD(&parent->mn_children, node, mn_sibling); mqnode_addref(parent); return (0); } static struct mqfs_node * mqfs_create_node(const char *name, int namelen, struct ucred *cred, int mode, int nodetype) { struct mqfs_node *node; node = mqnode_alloc(); strncpy(node->mn_name, name, namelen); node->mn_type = nodetype; node->mn_refcount = 1; getnanotime(&node->mn_birth); node->mn_ctime = node->mn_atime = node->mn_mtime = node->mn_birth; node->mn_uid = cred->cr_uid; node->mn_gid = cred->cr_gid; node->mn_mode = mode; return (node); } /* * Create a file */ static struct mqfs_node * mqfs_create_file(struct mqfs_node *parent, const char *name, int namelen, struct ucred *cred, int mode) { struct mqfs_node *node; node = mqfs_create_node(name, namelen, cred, mode, mqfstype_file); if (mqfs_add_node(parent, node) != 0) { mqnode_free(node); return (NULL); } return (node); } /* * Add . and .. to a directory */ static int mqfs_fixup_dir(struct mqfs_node *parent) { struct mqfs_node *dir; dir = mqnode_alloc(); dir->mn_name[0] = '.'; dir->mn_type = mqfstype_this; dir->mn_refcount = 1; if (mqfs_add_node(parent, dir) != 0) { mqnode_free(dir); return (-1); } dir = mqnode_alloc(); dir->mn_name[0] = dir->mn_name[1] = '.'; dir->mn_type = mqfstype_parent; dir->mn_refcount = 1; if (mqfs_add_node(parent, dir) != 0) { mqnode_free(dir); return (-1); } return (0); } #ifdef notyet /* * Create a directory */ static struct mqfs_node * mqfs_create_dir(struct mqfs_node *parent, const char *name, int namelen, struct ucred *cred, int mode) { struct mqfs_node *node; node = mqfs_create_node(name, namelen, cred, mode, mqfstype_dir); if (mqfs_add_node(parent, node) != 0) { mqnode_free(node); return (NULL); } if (mqfs_fixup_dir(node) != 0) { mqfs_destroy(node); return (NULL); } return (node); } /* * Create a symlink */ static struct mqfs_node * mqfs_create_link(struct mqfs_node *parent, const char *name, int namelen, struct ucred *cred, int mode) { struct mqfs_node *node; node = mqfs_create_node(name, namelen, cred, mode, mqfstype_symlink); if (mqfs_add_node(parent, node) != 0) { mqnode_free(node); return (NULL); } return (node); } #endif /* * Destroy a node or a tree of nodes */ static int mqfs_destroy(struct mqfs_node *node) { struct mqfs_node *parent; KASSERT(node != NULL, ("%s(): node is NULL", __func__)); KASSERT(node->mn_info != NULL, ("%s(): node has no mn_info", __func__)); /* destroy children */ if (node->mn_type == mqfstype_dir || node->mn_type == mqfstype_root) while (! 
LIST_EMPTY(&node->mn_children)) mqfs_destroy(LIST_FIRST(&node->mn_children)); /* unlink from parent */ if ((parent = node->mn_parent) != NULL) { KASSERT(parent->mn_info == node->mn_info, ("%s(): parent has different mn_info", __func__)); LIST_REMOVE(node, mn_sibling); } if (node->mn_fileno != 0) mqfs_fileno_free(node->mn_info, node); if (node->mn_data != NULL) mqueue_free(node->mn_data); mqnode_free(node); return (0); } /* * Mount a mqfs instance */ static int mqfs_mount(struct mount *mp, struct thread *td) { struct statfs *sbp; if (mp->mnt_flag & MNT_UPDATE) return (EOPNOTSUPP); mp->mnt_data = &mqfs_data; MNT_ILOCK(mp); mp->mnt_flag |= MNT_LOCAL; mp->mnt_kern_flag |= MNTK_MPSAFE; MNT_IUNLOCK(mp); vfs_getnewfsid(mp); sbp = &mp->mnt_stat; vfs_mountedfrom(mp, "mqueue"); sbp->f_bsize = PAGE_SIZE; sbp->f_iosize = PAGE_SIZE; sbp->f_blocks = 1; sbp->f_bfree = 0; sbp->f_bavail = 0; sbp->f_files = 1; sbp->f_ffree = 0; return (0); } /* * Unmount a mqfs instance */ static int mqfs_unmount(struct mount *mp, int mntflags, struct thread *td) { int error; error = vflush(mp, 0, (mntflags & MNT_FORCE) ? FORCECLOSE : 0, td); return (error); } /* * Return a root vnode */ static int mqfs_root(struct mount *mp, int flags, struct vnode **vpp, struct thread *td) { struct mqfs_info *mqfs; int ret; mqfs = VFSTOMQFS(mp); sx_xlock(&mqfs->mi_lock); ret = mqfs_allocv(mp, vpp, mqfs->mi_root); sx_xunlock(&mqfs->mi_lock); return (ret); } /* * Return filesystem stats */ static int mqfs_statfs(struct mount *mp, struct statfs *sbp, struct thread *td) { /* XXX update statistics */ return (0); } /* * Initialize a mqfs instance */ static int mqfs_init(struct vfsconf *vfc) { struct mqfs_node *root; struct mqfs_info *mi; mqnode_zone = uma_zcreate("mqnode", sizeof(struct mqfs_node), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); mqueue_zone = uma_zcreate("mqueue", sizeof(struct mqueue), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); mvdata_zone = uma_zcreate("mvdata", sizeof(struct mqfs_vdata), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); mqnoti_zone = uma_zcreate("mqnotifier", sizeof(struct mqueue_notifier), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); mi = &mqfs_data; sx_init(&mi->mi_lock, "mqfs lock"); /* set up the root diretory */ root = mqfs_create_node("/", 1, curthread->td_ucred, 01777, mqfstype_root); root->mn_info = mi; LIST_INIT(&root->mn_children); LIST_INIT(&root->mn_vnodes); mi->mi_root = root; mqfs_fileno_init(mi); mqfs_fileno_alloc(mi, root); mqfs_fixup_dir(root); exit_tag = EVENTHANDLER_REGISTER(process_exit, mq_proc_exit, NULL, EVENTHANDLER_PRI_ANY); mq_fdclose = mqueue_fdclose; p31b_setcfg(CTL_P1003_1B_MESSAGE_PASSING, _POSIX_MESSAGE_PASSING); return (0); } /* * Destroy a mqfs instance */ static int mqfs_uninit(struct vfsconf *vfc) { struct mqfs_info *mi; if (!unloadable) return (EOPNOTSUPP); EVENTHANDLER_DEREGISTER(process_exit, exit_tag); mi = &mqfs_data; mqfs_destroy(mi->mi_root); mi->mi_root = NULL; mqfs_fileno_uninit(mi); sx_destroy(&mi->mi_lock); uma_zdestroy(mqnode_zone); uma_zdestroy(mqueue_zone); uma_zdestroy(mvdata_zone); uma_zdestroy(mqnoti_zone); return (0); } /* * task routine */ static void do_recycle(void *context, int pending __unused) { struct vnode *vp = (struct vnode *)context; vrecycle(vp, curthread); vdrop(vp); } /* * Allocate a vnode */ static int mqfs_allocv(struct mount *mp, struct vnode **vpp, struct mqfs_node *pn) { struct mqfs_vdata *vd; int error; LIST_FOREACH(vd, &pn->mn_vnodes, mv_link) { if (vd->mv_vnode->v_mount == mp) break; } if (vd != NULL) { if (vget(vd->mv_vnode, 0, curthread) == 0) { 
*vpp = vd->mv_vnode; vn_lock(*vpp, LK_RETRY | LK_EXCLUSIVE, curthread); return (0); } /* XXX if this can happen, we're in trouble */ } error = getnewvnode("mqueue", mp, &mqfs_vnodeops, vpp); if (error) return (error); vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY, curthread); error = insmntque(*vpp, mp); if (error != 0) { *vpp = NULLVP; return (error); } vd = uma_zalloc(mvdata_zone, M_WAITOK); (*vpp)->v_data = vd; vd->mv_vnode = *vpp; vd->mv_node = pn; TASK_INIT(&vd->mv_task, 0, do_recycle, *vpp); LIST_INSERT_HEAD(&pn->mn_vnodes, vd, mv_link); mqnode_addref(pn); switch (pn->mn_type) { case mqfstype_root: (*vpp)->v_vflag = VV_ROOT; /* fall through */ case mqfstype_dir: case mqfstype_this: case mqfstype_parent: (*vpp)->v_type = VDIR; break; case mqfstype_file: (*vpp)->v_type = VREG; break; case mqfstype_symlink: (*vpp)->v_type = VLNK; break; case mqfstype_none: KASSERT(0, ("mqfs_allocf called for null node\n")); default: panic("%s has unexpected type: %d", pn->mn_name, pn->mn_type); } return (0); } /* * Search a directory entry */ static struct mqfs_node * mqfs_search(struct mqfs_node *pd, const char *name, int len) { struct mqfs_node *pn; LIST_FOREACH(pn, &pd->mn_children, mn_sibling) { if (strncmp(pn->mn_name, name, len) == 0) return (pn); } return (NULL); } /* * Look up a file or directory. */ static int mqfs_lookupx(struct vop_cachedlookup_args *ap) { struct componentname *cnp; struct vnode *dvp, **vpp; struct mqfs_node *pd; struct mqfs_node *pn; int nameiop, flags, error, namelen; char *pname; struct thread *td; cnp = ap->a_cnp; vpp = ap->a_vpp; dvp = ap->a_dvp; pname = cnp->cn_nameptr; namelen = cnp->cn_namelen; td = cnp->cn_thread; flags = cnp->cn_flags; nameiop = cnp->cn_nameiop; pd = VTON(dvp); pn = NULL; *vpp = NULLVP; if (dvp->v_type != VDIR) return (ENOTDIR); error = VOP_ACCESS(dvp, VEXEC, cnp->cn_cred, cnp->cn_thread); if (error) return (error); /* shortcut: check if the name is too long */ if (cnp->cn_namelen >= MQFS_NAMELEN) return (ENOENT); /* self */ if (namelen == 1 && pname[0] == '.') { if ((flags & ISLASTCN) && nameiop != LOOKUP) return (EINVAL); pn = pd; *vpp = dvp; VREF(dvp); return (0); } /* parent */ if (cnp->cn_flags & ISDOTDOT) { if (dvp->v_vflag & VV_ROOT) return (EIO); if ((flags & ISLASTCN) && nameiop != LOOKUP) return (EINVAL); VOP_UNLOCK(dvp, 0, cnp->cn_thread); KASSERT(pd->mn_parent, ("non-root directory has no parent")); pn = pd->mn_parent; error = mqfs_allocv(dvp->v_mount, vpp, pn); vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY, td); return (error); } /* named node */ pn = mqfs_search(pd, pname, namelen); /* found */ if (pn != NULL) { /* DELETE */ if (nameiop == DELETE && (flags & ISLASTCN)) { error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred, td); if (error) return (error); if (*vpp == dvp) { VREF(dvp); *vpp = dvp; return (0); } } /* allocate vnode */ error = mqfs_allocv(dvp->v_mount, vpp, pn); if (error == 0 && cnp->cn_flags & MAKEENTRY) cache_enter(dvp, *vpp, cnp); return (error); } /* not found */ /* will create a new entry in the directory ? 
*/ if ((nameiop == CREATE || nameiop == RENAME) && (flags & LOCKPARENT) && (flags & ISLASTCN)) { error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred, td); if (error) return (error); cnp->cn_flags |= SAVENAME; return (EJUSTRETURN); } return (ENOENT); } #if 0 struct vop_lookup_args { struct vop_generic_args a_gen; struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; }; #endif /* * vnode lookup operation */ static int mqfs_lookup(struct vop_cachedlookup_args *ap) { struct mqfs_info *mqfs = VFSTOMQFS(ap->a_dvp->v_mount); int rc; sx_xlock(&mqfs->mi_lock); rc = mqfs_lookupx(ap); sx_xunlock(&mqfs->mi_lock); return (rc); } #if 0 struct vop_create_args { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; }; #endif /* * vnode creation operation */ static int mqfs_create(struct vop_create_args *ap) { struct mqfs_info *mqfs = VFSTOMQFS(ap->a_dvp->v_mount); struct componentname *cnp = ap->a_cnp; struct mqfs_node *pd; struct mqfs_node *pn; struct mqueue *mq; int error; pd = VTON(ap->a_dvp); if (pd->mn_type != mqfstype_root && pd->mn_type != mqfstype_dir) return (ENOTDIR); mq = mqueue_alloc(NULL); if (mq == NULL) return (EAGAIN); sx_xlock(&mqfs->mi_lock); #if 0 /* named node */ pn = mqfs_search(pd, cnp->cn_nameptr, cnp->cn_namelen); if (pn != NULL) { mqueue_free(mq); sx_xunlock(&mqfs->mi_lock); return (EEXIST); } #else if ((cnp->cn_flags & HASBUF) == 0) panic("%s: no name", __func__); #endif pn = mqfs_create_file(pd, cnp->cn_nameptr, cnp->cn_namelen, cnp->cn_cred, ap->a_vap->va_mode); if (pn == NULL) error = ENOSPC; else { error = mqfs_allocv(ap->a_dvp->v_mount, ap->a_vpp, pn); if (error) mqfs_destroy(pn); else pn->mn_data = mq; } sx_xunlock(&mqfs->mi_lock); if (error) mqueue_free(mq); return (error); } /* * Remove an entry */ static int do_unlink(struct mqfs_node *pn, struct ucred *ucred) { struct mqfs_node *parent; struct mqfs_vdata *vd; int error = 0; sx_assert(&pn->mn_info->mi_lock, SX_LOCKED); if (ucred->cr_uid != pn->mn_uid && (error = priv_check_cred(ucred, PRIV_MQ_ADMIN, 0)) != 0) error = EACCES; else if (!pn->mn_deleted) { parent = pn->mn_parent; pn->mn_parent = NULL; pn->mn_deleted = 1; LIST_REMOVE(pn, mn_sibling); LIST_FOREACH(vd, &pn->mn_vnodes, mv_link) { cache_purge(vd->mv_vnode); vhold(vd->mv_vnode); taskqueue_enqueue(taskqueue_thread, &vd->mv_task); } mqnode_release(pn); mqnode_release(parent); } else error = ENOENT; return (error); } #if 0 struct vop_remove_args { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; }; #endif /* * vnode removal operation */ static int mqfs_remove(struct vop_remove_args *ap) { struct mqfs_info *mqfs = VFSTOMQFS(ap->a_dvp->v_mount); struct mqfs_node *pn; int error; if (ap->a_vp->v_type == VDIR) return (EPERM); pn = VTON(ap->a_vp); sx_xlock(&mqfs->mi_lock); error = do_unlink(pn, ap->a_cnp->cn_cred); sx_xunlock(&mqfs->mi_lock); return (error); } #if 0 struct vop_inactive_args { struct vnode *a_vp; struct thread *a_td; }; #endif static int mqfs_inactive(struct vop_inactive_args *ap) { struct mqfs_node *pn = VTON(ap->a_vp); if (pn->mn_deleted) vrecycle(ap->a_vp, ap->a_td); return (0); } #if 0 struct vop_reclaim_args { struct vop_generic_args a_gen; struct vnode *a_vp; struct thread *a_td; }; #endif static int mqfs_reclaim(struct vop_reclaim_args *ap) { struct mqfs_info *mqfs = VFSTOMQFS(ap->a_vp->v_mount); struct vnode *vp = ap->a_vp; struct mqfs_node *pn; struct mqfs_vdata *vd; vd = vp->v_data; pn = vd->mv_node; sx_xlock(&mqfs->mi_lock); vp->v_data = NULL; LIST_REMOVE(vd, mv_link); 
uma_zfree(mvdata_zone, vd); mqnode_release(pn); sx_xunlock(&mqfs->mi_lock); return (0); } #if 0 struct vop_open_args { struct vop_generic_args a_gen; struct vnode *a_vp; int a_mode; struct ucred *a_cred; struct thread *a_td; int a_fdidx; }; #endif static int mqfs_open(struct vop_open_args *ap) { return (0); } #if 0 struct vop_close_args { struct vop_generic_args a_gen; struct vnode *a_vp; int a_fflag; struct ucred *a_cred; struct thread *a_td; }; #endif static int mqfs_close(struct vop_close_args *ap) { return (0); } #if 0 struct vop_access_args { struct vop_generic_args a_gen; struct vnode *a_vp; int a_mode; struct ucred *a_cred; struct thread *a_td; }; #endif /* * Verify permissions */ static int mqfs_access(struct vop_access_args *ap) { struct vnode *vp = ap->a_vp; struct vattr vattr; int error; error = VOP_GETATTR(vp, &vattr, ap->a_cred, ap->a_td); if (error) return (error); error = vaccess(vp->v_type, vattr.va_mode, vattr.va_uid, vattr.va_gid, ap->a_mode, ap->a_cred, NULL); return (error); } #if 0 struct vop_getattr_args { struct vop_generic_args a_gen; struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; struct thread *a_td; }; #endif /* * Get file attributes */ static int mqfs_getattr(struct vop_getattr_args *ap) { struct vnode *vp = ap->a_vp; struct mqfs_node *pn = VTON(vp); struct vattr *vap = ap->a_vap; int error = 0; VATTR_NULL(vap); vap->va_type = vp->v_type; vap->va_mode = pn->mn_mode; vap->va_nlink = 1; vap->va_uid = pn->mn_uid; vap->va_gid = pn->mn_gid; vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0]; vap->va_fileid = pn->mn_fileno; vap->va_size = 0; vap->va_blocksize = PAGE_SIZE; vap->va_bytes = vap->va_size = 0; vap->va_atime = pn->mn_atime; vap->va_mtime = pn->mn_mtime; vap->va_ctime = pn->mn_ctime; vap->va_birthtime = pn->mn_birth; vap->va_gen = 0; vap->va_flags = 0; vap->va_rdev = 0; vap->va_bytes = 0; vap->va_filerev = 0; vap->va_vaflags = 0; return (error); } #if 0 struct vop_setattr_args { struct vop_generic_args a_gen; struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; struct thread *a_td; }; #endif /* * Set attributes */ static int mqfs_setattr(struct vop_setattr_args *ap) { struct mqfs_node *pn; struct vattr *vap; struct vnode *vp; int c, error; uid_t uid; gid_t gid; vap = ap->a_vap; vp = ap->a_vp; if ((vap->va_type != VNON) || (vap->va_nlink != VNOVAL) || (vap->va_fsid != VNOVAL) || (vap->va_fileid != VNOVAL) || (vap->va_blocksize != VNOVAL) || (vap->va_flags != VNOVAL && vap->va_flags != 0) || (vap->va_rdev != VNOVAL) || ((int)vap->va_bytes != VNOVAL) || (vap->va_gen != VNOVAL)) { return (EINVAL); } pn = VTON(vp); error = c = 0; if (vap->va_uid == (uid_t)VNOVAL) uid = pn->mn_uid; else uid = vap->va_uid; if (vap->va_gid == (gid_t)VNOVAL) gid = pn->mn_gid; else gid = vap->va_gid; if (uid != pn->mn_uid || gid != pn->mn_gid) { /* * To modify the ownership of a file, must possess VADMIN * for that file. */ if ((error = VOP_ACCESS(vp, VADMIN, ap->a_cred, ap->a_td))) return (error); /* * XXXRW: Why is there a privilege check here: shouldn't the * check in VOP_ACCESS() be enough? Also, are the group bits * below definitely right? 
*/ if (((ap->a_cred->cr_uid != pn->mn_uid) || uid != pn->mn_uid || (gid != pn->mn_gid && !groupmember(gid, ap->a_cred))) && (error = priv_check(ap->a_td, PRIV_MQ_ADMIN)) != 0) return (error); pn->mn_uid = uid; pn->mn_gid = gid; c = 1; } if (vap->va_mode != (mode_t)VNOVAL) { if ((ap->a_cred->cr_uid != pn->mn_uid) && (error = priv_check(ap->a_td, PRIV_MQ_ADMIN))) return (error); pn->mn_mode = vap->va_mode; c = 1; } if (vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL) { /* See the comment in ufs_vnops::ufs_setattr(). */ if ((error = VOP_ACCESS(vp, VADMIN, ap->a_cred, ap->a_td)) && ((vap->va_vaflags & VA_UTIMES_NULL) == 0 || (error = VOP_ACCESS(vp, VWRITE, ap->a_cred, ap->a_td)))) return (error); if (vap->va_atime.tv_sec != VNOVAL) { pn->mn_atime = vap->va_atime; } if (vap->va_mtime.tv_sec != VNOVAL) { pn->mn_mtime = vap->va_mtime; } c = 1; } if (c) { vfs_timestamp(&pn->mn_ctime); } return (0); } #if 0 struct vop_read_args { struct vop_generic_args a_gen; struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; }; #endif /* * Read from a file */ static int mqfs_read(struct vop_read_args *ap) { char buf[80]; struct vnode *vp = ap->a_vp; struct uio *uio = ap->a_uio; struct mqfs_node *pn; struct mqueue *mq; int len, error; if (vp->v_type != VREG) return (EINVAL); pn = VTON(vp); mq = VTOMQ(vp); snprintf(buf, sizeof(buf), "QSIZE:%-10ld MAXMSG:%-10ld CURMSG:%-10ld MSGSIZE:%-10ld\n", mq->mq_totalbytes, mq->mq_maxmsg, mq->mq_curmsgs, mq->mq_msgsize); buf[sizeof(buf)-1] = '\0'; len = strlen(buf); error = uiomove_frombuf(buf, len, uio); return (error); } #if 0 struct vop_readdir_args { struct vop_generic_args a_gen; struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cred; int *a_eofflag; int *a_ncookies; u_long **a_cookies; }; #endif /* * Return directory entries. */ static int mqfs_readdir(struct vop_readdir_args *ap) { struct vnode *vp; struct mqfs_info *mi; struct mqfs_node *pd; struct mqfs_node *pn; struct dirent entry; struct uio *uio; int *tmp_ncookies = NULL; off_t offset; int error, i; vp = ap->a_vp; mi = VFSTOMQFS(vp->v_mount); pd = VTON(vp); uio = ap->a_uio; if (vp->v_type != VDIR) return (ENOTDIR); if (uio->uio_offset < 0) return (EINVAL); if (ap->a_ncookies != NULL) { tmp_ncookies = ap->a_ncookies; *ap->a_ncookies = 0; ap->a_ncookies = NULL; } error = 0; offset = 0; sx_xlock(&mi->mi_lock); LIST_FOREACH(pn, &pd->mn_children, mn_sibling) { entry.d_reclen = sizeof(entry); if (!pn->mn_fileno) mqfs_fileno_alloc(mi, pn); entry.d_fileno = pn->mn_fileno; for (i = 0; i < MQFS_NAMELEN - 1 && pn->mn_name[i] != '\0'; ++i) entry.d_name[i] = pn->mn_name[i]; entry.d_name[i] = 0; entry.d_namlen = i; switch (pn->mn_type) { case mqfstype_root: case mqfstype_dir: case mqfstype_this: case mqfstype_parent: entry.d_type = DT_DIR; break; case mqfstype_file: entry.d_type = DT_REG; break; case mqfstype_symlink: entry.d_type = DT_LNK; break; default: panic("%s has unexpected node type: %d", pn->mn_name, pn->mn_type); } if (entry.d_reclen > uio->uio_resid) break; if (offset >= uio->uio_offset) { error = vfs_read_dirent(ap, &entry, offset); if (error) break; } offset += entry.d_reclen; } sx_xunlock(&mi->mi_lock); uio->uio_offset = offset; if (tmp_ncookies != NULL) ap->a_ncookies = tmp_ncookies; return (error); } #ifdef notyet #if 0 struct vop_mkdir_args { struct vnode *a_dvp; struvt vnode **a_vpp; struvt componentname *a_cnp; struct vattr *a_vap; }; #endif /* * Create a directory. 
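 *
 * Note: mqfs_mkdir()/mqfs_rmdir() below are compiled only when `notyet'
 * is defined; the vnode operation table further down currently maps
 * .vop_mkdir/.vop_rmdir to VOP_EOPNOTSUPP, so directory creation is not
 * reachable from userland yet.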
*/ static int mqfs_mkdir(struct vop_mkdir_args *ap) { struct mqfs_info *mqfs = VFSTOMQFS(ap->a_dvp->v_mount); struct componentname *cnp = ap->a_cnp; struct mqfs_node *pd = VTON(ap->a_dvp); struct mqfs_node *pn; int error; if (pd->mn_type != mqfstype_root && pd->mn_type != mqfstype_dir) return (ENOTDIR); sx_xlock(&mqfs->mi_lock); #if 0 /* named node */ pn = mqfs_search(pd, cnp->cn_nameptr, cnp->cn_namelen); if (pn != NULL) { sx_xunlock(&mqfs->mi_lock); return (EEXIST); } #else if ((cnp->cn_flags & HASBUF) == 0) panic("%s: no name", __func__); #endif pn = mqfs_create_dir(pd, cnp->cn_nameptr, cnp->cn_namelen, ap->a_vap->cn_cred, ap->a_vap->va_mode); if (pn == NULL) error = ENOSPC; else error = mqfs_allocv(ap->a_dvp->v_mount, ap->a_vpp, pn); sx_xunlock(&mqfs->mi_lock); return (error); } #if 0 struct vop_rmdir_args { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; }; #endif /* * Remove a directory. */ static int mqfs_rmdir(struct vop_rmdir_args *ap) { struct mqfs_info *mqfs = VFSTOMQFS(ap->a_dvp->v_mount); struct mqfs_node *pn = VTON(ap->a_vp); struct mqfs_node *pt; if (pn->mn_type != mqfstype_dir) return (ENOTDIR); sx_xlock(&mqfs->mi_lock); if (pn->mn_deleted) { sx_xunlock(&mqfs->mi_lock); return (ENOENT); } pt = LIST_FIRST(&pn->mn_children); pt = LIST_NEXT(pt, mn_sibling); pt = LIST_NEXT(pt, mn_sibling); if (pt != NULL) { sx_xunlock(&mqfs->mi_lock); return (ENOTEMPTY); } pt = pn->mn_parent; pn->mn_parent = NULL; pn->mn_deleted = 1; LIST_REMOVE(pn, mn_sibling); mqnode_release(pn); mqnode_release(pt); sx_xunlock(&mqfs->mi_lock); cache_purge(ap->a_vp); return (0); } #endif /* notyet */ /* * Allocate a message queue */ static struct mqueue * mqueue_alloc(const struct mq_attr *attr) { struct mqueue *mq; if (curmq >= maxmq) return (NULL); mq = uma_zalloc(mqueue_zone, M_WAITOK | M_ZERO); TAILQ_INIT(&mq->mq_msgq); if (attr != NULL) { mq->mq_maxmsg = attr->mq_maxmsg; mq->mq_msgsize = attr->mq_msgsize; } else { mq->mq_maxmsg = default_maxmsg; mq->mq_msgsize = default_msgsize; } mtx_init(&mq->mq_mutex, "mqueue", NULL, MTX_DEF); knlist_init(&mq->mq_rsel.si_note, &mq->mq_mutex, NULL, NULL, NULL); knlist_init(&mq->mq_wsel.si_note, &mq->mq_mutex, NULL, NULL, NULL); atomic_add_int(&curmq, 1); return (mq); } /* * Destroy a message queue */ static void mqueue_free(struct mqueue *mq) { struct mqueue_msg *msg; while ((msg = TAILQ_FIRST(&mq->mq_msgq)) != NULL) { TAILQ_REMOVE(&mq->mq_msgq, msg, msg_link); FREE(msg, M_MQUEUEDATA); } mtx_destroy(&mq->mq_mutex); knlist_destroy(&mq->mq_rsel.si_note); knlist_destroy(&mq->mq_wsel.si_note); uma_zfree(mqueue_zone, mq); atomic_add_int(&curmq, -1); } /* * Load a message from user space */ static struct mqueue_msg * mqueue_loadmsg(const char *msg_ptr, size_t msg_size, int msg_prio) { struct mqueue_msg *msg; size_t len; int error; len = sizeof(struct mqueue_msg) + msg_size; MALLOC(msg, struct mqueue_msg *, len, M_MQUEUEDATA, M_WAITOK); error = copyin(msg_ptr, ((char *)msg) + sizeof(struct mqueue_msg), msg_size); if (error) { FREE(msg, M_MQUEUEDATA); msg = NULL; } else { msg->msg_size = msg_size; msg->msg_prio = msg_prio; } return (msg); } /* * Save a message to user space */ static int mqueue_savemsg(struct mqueue_msg *msg, char *msg_ptr, int *msg_prio) { int error; error = copyout(((char *)msg) + sizeof(*msg), msg_ptr, msg->msg_size); if (error == 0 && msg_prio != NULL) error = copyout(&msg->msg_prio, msg_prio, sizeof(int)); return (error); } /* * Free a message's memory */ static __inline void mqueue_freemsg(struct mqueue_msg *msg) { FREE(msg, 
M_MQUEUEDATA); } /* * Send a message. if waitok is false, thread will not be * blocked if there is no data in queue, otherwise, absolute * time will be checked. */ int mqueue_send(struct mqueue *mq, const char *msg_ptr, size_t msg_len, unsigned msg_prio, int waitok, const struct timespec *abs_timeout) { struct mqueue_msg *msg; struct timespec ets, ts, ts2; struct timeval tv; int error; if (msg_prio >= MQ_PRIO_MAX) return (EINVAL); if (msg_len > mq->mq_msgsize) return (EMSGSIZE); msg = mqueue_loadmsg(msg_ptr, msg_len, msg_prio); if (msg == NULL) return (EFAULT); /* O_NONBLOCK case */ if (!waitok) { error = _mqueue_send(mq, msg, -1); if (error) goto bad; return (0); } /* we allow a null timeout (wait forever) */ if (abs_timeout == NULL) { error = _mqueue_send(mq, msg, 0); if (error) goto bad; return (0); } /* send it before checking time */ error = _mqueue_send(mq, msg, -1); if (error == 0) return (0); if (error != EAGAIN) goto bad; error = copyin(abs_timeout, &ets, sizeof(ets)); if (error != 0) goto bad; if (ets.tv_nsec >= 1000000000 || ets.tv_nsec < 0) { error = EINVAL; goto bad; } for (;;) { ts2 = ets; getnanotime(&ts); timespecsub(&ts2, &ts); if (ts2.tv_sec < 0 || (ts2.tv_sec == 0 && ts2.tv_nsec <= 0)) { error = ETIMEDOUT; break; } TIMESPEC_TO_TIMEVAL(&tv, &ts2); error = _mqueue_send(mq, msg, tvtohz(&tv)); if (error != ETIMEDOUT) break; } if (error == 0) return (0); bad: mqueue_freemsg(msg); return (error); } /* * Common routine to send a message */ static int _mqueue_send(struct mqueue *mq, struct mqueue_msg *msg, int timo) { struct mqueue_msg *msg2; int error = 0; mtx_lock(&mq->mq_mutex); while (mq->mq_curmsgs >= mq->mq_maxmsg && error == 0) { if (timo < 0) { mtx_unlock(&mq->mq_mutex); return (EAGAIN); } mq->mq_senders++; error = msleep(&mq->mq_senders, &mq->mq_mutex, PCATCH, "mqsend", timo); mq->mq_senders--; if (error == EAGAIN) error = ETIMEDOUT; } if (mq->mq_curmsgs >= mq->mq_maxmsg) { mtx_unlock(&mq->mq_mutex); return (error); } error = 0; if (TAILQ_EMPTY(&mq->mq_msgq)) { TAILQ_INSERT_HEAD(&mq->mq_msgq, msg, msg_link); } else { if (msg->msg_prio <= TAILQ_LAST(&mq->mq_msgq, msgq)->msg_prio) { TAILQ_INSERT_TAIL(&mq->mq_msgq, msg, msg_link); } else { TAILQ_FOREACH(msg2, &mq->mq_msgq, msg_link) { if (msg2->msg_prio < msg->msg_prio) break; } TAILQ_INSERT_BEFORE(msg2, msg, msg_link); } } mq->mq_curmsgs++; mq->mq_totalbytes += msg->msg_size; if (mq->mq_receivers) wakeup_one(&mq->mq_receivers); else if (mq->mq_notifier != NULL) mqueue_send_notification(mq); if (mq->mq_flags & MQ_RSEL) { mq->mq_flags &= ~MQ_RSEL; selwakeup(&mq->mq_rsel); } KNOTE_LOCKED(&mq->mq_rsel.si_note, 0); mtx_unlock(&mq->mq_mutex); return (0); } /* * Send realtime a signal to process which registered itself * successfully by mq_notify. */ static void mqueue_send_notification(struct mqueue *mq) { struct mqueue_notifier *nt; struct proc *p; mtx_assert(&mq->mq_mutex, MA_OWNED); nt = mq->mq_notifier; if (nt->nt_sigev.sigev_notify != SIGEV_NONE) { p = nt->nt_proc; PROC_LOCK(p); if (!KSI_ONQ(&nt->nt_ksi)) psignal_event(p, &nt->nt_sigev, &nt->nt_ksi); PROC_UNLOCK(p); } mq->mq_notifier = NULL; } /* * Get a message. if waitok is false, thread will not be * blocked if there is no data in queue, otherwise, absolute * time will be checked. 
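 *
 * In other words, the blocking condition for a receive is an empty queue;
 * with waitok false the call returns EAGAIN immediately instead of
 * sleeping.  A hedged sketch of the caller patterns (illustrative only;
 * the buffer and timeout names are made up):
 *
 *	error = mqueue_receive(mq, buf, len, &prio, 0, NULL);      non-blocking
 *	error = mqueue_receive(mq, buf, len, &prio, 1, NULL);      wait forever
 *	error = mqueue_receive(mq, buf, len, &prio, 1, uabstime);  wait until *uabstime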
*/ int mqueue_receive(struct mqueue *mq, char *msg_ptr, size_t msg_len, unsigned *msg_prio, int waitok, const struct timespec *abs_timeout) { struct mqueue_msg *msg; struct timespec ets, ts, ts2; struct timeval tv; int error; if (msg_len < mq->mq_msgsize) return (EMSGSIZE); /* O_NONBLOCK case */ if (!waitok) { error = _mqueue_recv(mq, &msg, -1); if (error) return (error); goto received; } /* we allow a null timeout (wait forever). */ if (abs_timeout == NULL) { error = _mqueue_recv(mq, &msg, 0); if (error) return (error); goto received; } /* try to get a message before checking time */ error = _mqueue_recv(mq, &msg, -1); if (error == 0) goto received; if (error != EAGAIN) return (error); error = copyin(abs_timeout, &ets, sizeof(ets)); if (error != 0) return (error); if (ets.tv_nsec >= 1000000000 || ets.tv_nsec < 0) { error = EINVAL; return (error); } for (;;) { ts2 = ets; getnanotime(&ts); timespecsub(&ts2, &ts); if (ts2.tv_sec < 0 || (ts2.tv_sec == 0 && ts2.tv_nsec <= 0)) { error = ETIMEDOUT; return (error); } TIMESPEC_TO_TIMEVAL(&tv, &ts2); error = _mqueue_recv(mq, &msg, tvtohz(&tv)); if (error == 0) break; if (error != ETIMEDOUT) return (error); } received: error = mqueue_savemsg(msg, msg_ptr, msg_prio); if (error == 0) { curthread->td_retval[0] = msg->msg_size; curthread->td_retval[1] = 0; } mqueue_freemsg(msg); return (error); } /* * Common routine to receive a message */ static int _mqueue_recv(struct mqueue *mq, struct mqueue_msg **msg, int timo) { int error = 0; mtx_lock(&mq->mq_mutex); while ((*msg = TAILQ_FIRST(&mq->mq_msgq)) == NULL && error == 0) { if (timo < 0) { mtx_unlock(&mq->mq_mutex); return (EAGAIN); } mq->mq_receivers++; error = msleep(&mq->mq_receivers, &mq->mq_mutex, PCATCH, "mqrecv", timo); mq->mq_receivers--; if (error == EAGAIN) error = ETIMEDOUT; } if (*msg != NULL) { error = 0; TAILQ_REMOVE(&mq->mq_msgq, *msg, msg_link); mq->mq_curmsgs--; mq->mq_totalbytes -= (*msg)->msg_size; if (mq->mq_senders) wakeup_one(&mq->mq_senders); if (mq->mq_flags & MQ_WSEL) { mq->mq_flags &= ~MQ_WSEL; selwakeup(&mq->mq_wsel); } KNOTE_LOCKED(&mq->mq_wsel.si_note, 0); } if (mq->mq_notifier != NULL && mq->mq_receivers == 0 && !TAILQ_EMPTY(&mq->mq_msgq)) { mqueue_send_notification(mq); } mtx_unlock(&mq->mq_mutex); return (error); } static __inline struct mqueue_notifier * notifier_alloc(void) { return (uma_zalloc(mqnoti_zone, M_WAITOK | M_ZERO)); } static __inline void notifier_free(struct mqueue_notifier *p) { uma_zfree(mqnoti_zone, p); } static struct mqueue_notifier * notifier_search(struct proc *p, int fd) { struct mqueue_notifier *nt; LIST_FOREACH(nt, &p->p_mqnotifier, nt_link) { if (nt->nt_ksi.ksi_mqd == fd) break; } return (nt); } static __inline void notifier_insert(struct proc *p, struct mqueue_notifier *nt) { LIST_INSERT_HEAD(&p->p_mqnotifier, nt, nt_link); } static __inline void notifier_delete(struct proc *p, struct mqueue_notifier *nt) { LIST_REMOVE(nt, nt_link); notifier_free(nt); } static void notifier_remove(struct proc *p, struct mqueue *mq, int fd) { struct mqueue_notifier *nt; mtx_assert(&mq->mq_mutex, MA_OWNED); PROC_LOCK(p); nt = notifier_search(p, fd); if (nt != NULL) { if (mq->mq_notifier == nt) mq->mq_notifier = NULL; sigqueue_take(&nt->nt_ksi); notifier_delete(p, nt); } PROC_UNLOCK(p); } /* * Syscall to open a message queue. 
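 *
 * Userland normally reaches this through the mq_open() wrapper; a hedged,
 * illustrative example (the queue name, permissions and attributes are
 * made up):
 *
 *	struct mq_attr attr = { .mq_maxmsg = 16, .mq_msgsize = 128 };
 *	mqd_t mqd = mq_open("/myqueue", O_CREAT | O_RDWR, 0600, &attr);
 *
 * The path must begin with a single '/' and contain no other slashes, and
 * mq_maxmsg/mq_msgsize must stay within the maxmsg/maxmsgsize limits.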
*/ int kmq_open(struct thread *td, struct kmq_open_args *uap) { char path[MQFS_NAMELEN + 1]; struct mq_attr attr, *pattr; struct mqfs_node *pn; struct filedesc *fdp; struct file *fp; struct mqueue *mq; int fd, error, len, flags, cmode; if ((uap->flags & O_ACCMODE) == O_ACCMODE) return (EINVAL); fdp = td->td_proc->p_fd; flags = FFLAGS(uap->flags); cmode = (((uap->mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT); mq = NULL; if ((flags & O_CREAT) && (uap->attr != NULL)) { error = copyin(uap->attr, &attr, sizeof(attr)); if (error) return (error); if (attr.mq_maxmsg <= 0 || attr.mq_maxmsg > maxmsg) return (EINVAL); if (attr.mq_msgsize <= 0 || attr.mq_msgsize > maxmsgsize) return (EINVAL); pattr = &attr; } else pattr = NULL; error = copyinstr(uap->path, path, MQFS_NAMELEN + 1, NULL); if (error) return (error); /* * The first character of name must be a slash (/) character * and the remaining characters of name cannot include any slash * characters. */ len = strlen(path); if (len < 2 || path[0] != '/' || index(path + 1, '/') != NULL) return (EINVAL); error = falloc(td, &fp, &fd); if (error) return (error); sx_xlock(&mqfs_data.mi_lock); pn = mqfs_search(mqfs_data.mi_root, path + 1, len - 1); if (pn == NULL) { if (!(flags & O_CREAT)) { error = ENOENT; } else { mq = mqueue_alloc(pattr); if (mq == NULL) { error = ENFILE; } else { pn = mqfs_create_file(mqfs_data.mi_root, path + 1, len - 1, td->td_ucred, cmode); if (pn == NULL) { error = ENOSPC; mqueue_free(mq); } } } if (error == 0) { pn->mn_data = mq; } } else { if ((flags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL)) { error = EEXIST; } else { int acc_mode = 0; if (flags & FREAD) acc_mode |= VREAD; if (flags & FWRITE) acc_mode |= VWRITE; error = vaccess(VREG, pn->mn_mode, pn->mn_uid, pn->mn_gid, acc_mode, td->td_ucred, NULL); } } if (error) { sx_xunlock(&mqfs_data.mi_lock); fdclose(fdp, fp, fd, td); fdrop(fp, td); return (error); } mqnode_addref(pn); sx_xunlock(&mqfs_data.mi_lock); - FILE_LOCK(fp); - fp->f_flag = (flags & (FREAD | FWRITE | O_NONBLOCK)); - fp->f_type = DTYPE_MQUEUE; - fp->f_data = pn; - fp->f_ops = &mqueueops; - FILE_UNLOCK(fp); + finit(fp, flags & (FREAD | FWRITE | O_NONBLOCK), DTYPE_MQUEUE, pn, + &mqueueops); FILEDESC_XLOCK(fdp); if (fdp->fd_ofiles[fd] == fp) fdp->fd_ofileflags[fd] |= UF_EXCLOSE; FILEDESC_XUNLOCK(fdp); td->td_retval[0] = fd; fdrop(fp, td); return (0); } /* * Syscall to unlink a message queue. 
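 *
 * The path is validated the same way as in kmq_open(): a leading '/' and
 * no further slashes.  The assumed userland counterpart is simply
 * mq_unlink("/myqueue").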
*/ int kmq_unlink(struct thread *td, struct kmq_unlink_args *uap) { char path[MQFS_NAMELEN+1]; struct mqfs_node *pn; int error, len; error = copyinstr(uap->path, path, MQFS_NAMELEN + 1, NULL); if (error) return (error); len = strlen(path); if (len < 2 || path[0] != '/' || index(path + 1, '/') != NULL) return (EINVAL); sx_xlock(&mqfs_data.mi_lock); pn = mqfs_search(mqfs_data.mi_root, path + 1, len - 1); if (pn != NULL) error = do_unlink(pn, td->td_ucred); else error = ENOENT; sx_xunlock(&mqfs_data.mi_lock); return (error); } typedef int (*_fgetf)(struct thread *, int, struct file **); /* * Get message queue by giving file slot */ static int _getmq(struct thread *td, int fd, _fgetf func, struct file **fpp, struct mqfs_node **ppn, struct mqueue **pmq) { struct mqfs_node *pn; int error; error = func(td, fd, fpp); if (error) return (error); if (&mqueueops != (*fpp)->f_ops) { fdrop(*fpp, td); return (EBADF); } pn = (*fpp)->f_data; if (ppn) *ppn = pn; if (pmq) *pmq = pn->mn_data; return (0); } static __inline int getmq(struct thread *td, int fd, struct file **fpp, struct mqfs_node **ppn, struct mqueue **pmq) { return _getmq(td, fd, fget, fpp, ppn, pmq); } static __inline int getmq_read(struct thread *td, int fd, struct file **fpp, struct mqfs_node **ppn, struct mqueue **pmq) { return _getmq(td, fd, fget_read, fpp, ppn, pmq); } static __inline int getmq_write(struct thread *td, int fd, struct file **fpp, struct mqfs_node **ppn, struct mqueue **pmq) { return _getmq(td, fd, fget_write, fpp, ppn, pmq); } int kmq_setattr(struct thread *td, struct kmq_setattr_args *uap) { struct mqueue *mq; struct file *fp; struct mq_attr attr, oattr; + u_int oflag, flag; int error; if (uap->attr) { error = copyin(uap->attr, &attr, sizeof(attr)); if (error) return (error); if (attr.mq_flags & ~O_NONBLOCK) return (EINVAL); } error = getmq(td, uap->mqd, &fp, NULL, &mq); if (error) return (error); oattr.mq_maxmsg = mq->mq_maxmsg; oattr.mq_msgsize = mq->mq_msgsize; oattr.mq_curmsgs = mq->mq_curmsgs; - FILE_LOCK(fp); - oattr.mq_flags = (O_NONBLOCK & fp->f_flag); if (uap->attr) { - fp->f_flag &= ~O_NONBLOCK; - fp->f_flag |= (attr.mq_flags & O_NONBLOCK); - } - FILE_UNLOCK(fp); + do { + oflag = flag = fp->f_flag; + flag &= ~O_NONBLOCK; + flag |= (attr.mq_flags & O_NONBLOCK); + } while (atomic_cmpset_int(&fp->f_flag, oflag, flag) == 0); + } else + oflag = fp->f_flag; + oattr.mq_flags = (O_NONBLOCK & oflag); fdrop(fp, td); if (uap->oattr) error = copyout(&oattr, uap->oattr, sizeof(oattr)); return (error); } int kmq_timedreceive(struct thread *td, struct kmq_timedreceive_args *uap) { struct mqueue *mq; struct file *fp; int error; int waitok; error = getmq_read(td, uap->mqd, &fp, NULL, &mq); if (error) return (error); waitok = !(fp->f_flag & O_NONBLOCK); error = mqueue_receive(mq, uap->msg_ptr, uap->msg_len, uap->msg_prio, waitok, uap->abs_timeout); fdrop(fp, td); return (error); } int kmq_timedsend(struct thread *td, struct kmq_timedsend_args *uap) { struct mqueue *mq; struct file *fp; int error, waitok; error = getmq_write(td, uap->mqd, &fp, NULL, &mq); if (error) return (error); waitok = !(fp->f_flag & O_NONBLOCK); error = mqueue_send(mq, uap->msg_ptr, uap->msg_len, uap->msg_prio, waitok, uap->abs_timeout); fdrop(fp, td); return (error); } int kmq_notify(struct thread *td, struct kmq_notify_args *uap) { struct sigevent ev; struct filedesc *fdp; struct proc *p; struct mqueue *mq; struct file *fp; struct mqueue_notifier *nt, *newnt = NULL; int error; p = td->td_proc; fdp = td->td_proc->p_fd; if (uap->sigev) { error = 
copyin(uap->sigev, &ev, sizeof(ev)); if (error) return (error); if (ev.sigev_notify != SIGEV_SIGNAL && ev.sigev_notify != SIGEV_THREAD_ID && ev.sigev_notify != SIGEV_NONE) return (EINVAL); if ((ev.sigev_notify == SIGEV_SIGNAL || ev.sigev_notify == SIGEV_THREAD_ID) && !_SIG_VALID(ev.sigev_signo)) return (EINVAL); } error = getmq(td, uap->mqd, &fp, NULL, &mq); if (error) return (error); again: FILEDESC_SLOCK(fdp); if (fget_locked(fdp, uap->mqd) != fp) { FILEDESC_SUNLOCK(fdp); error = EBADF; goto out; } mtx_lock(&mq->mq_mutex); FILEDESC_SUNLOCK(fdp); if (uap->sigev != NULL) { if (mq->mq_notifier != NULL) { error = EBUSY; } else { PROC_LOCK(p); nt = notifier_search(p, uap->mqd); if (nt == NULL) { if (newnt == NULL) { PROC_UNLOCK(p); mtx_unlock(&mq->mq_mutex); newnt = notifier_alloc(); goto again; } } if (nt != NULL) { sigqueue_take(&nt->nt_ksi); if (newnt != NULL) { notifier_free(newnt); newnt = NULL; } } else { nt = newnt; newnt = NULL; ksiginfo_init(&nt->nt_ksi); nt->nt_ksi.ksi_flags |= KSI_INS | KSI_EXT; nt->nt_ksi.ksi_code = SI_MESGQ; nt->nt_proc = p; nt->nt_ksi.ksi_mqd = uap->mqd; notifier_insert(p, nt); } nt->nt_sigev = ev; mq->mq_notifier = nt; PROC_UNLOCK(p); /* * if there is no receivers and message queue * is not empty, we should send notification * as soon as possible. */ if (mq->mq_receivers == 0 && !TAILQ_EMPTY(&mq->mq_msgq)) mqueue_send_notification(mq); } } else { notifier_remove(p, mq, uap->mqd); } mtx_unlock(&mq->mq_mutex); out: fdrop(fp, td); if (newnt != NULL) notifier_free(newnt); return (error); } static void mqueue_fdclose(struct thread *td, int fd, struct file *fp) { struct filedesc *fdp; struct mqueue *mq; fdp = td->td_proc->p_fd; FILEDESC_LOCK_ASSERT(fdp); if (fp->f_ops == &mqueueops) { mq = FPTOMQ(fp); mtx_lock(&mq->mq_mutex); notifier_remove(td->td_proc, mq, fd); /* have to wakeup thread in same process */ if (mq->mq_flags & MQ_RSEL) { mq->mq_flags &= ~MQ_RSEL; selwakeup(&mq->mq_rsel); } if (mq->mq_flags & MQ_WSEL) { mq->mq_flags &= ~MQ_WSEL; selwakeup(&mq->mq_wsel); } mtx_unlock(&mq->mq_mutex); } } static void mq_proc_exit(void *arg __unused, struct proc *p) { struct filedesc *fdp; struct file *fp; struct mqueue *mq; int i; fdp = p->p_fd; FILEDESC_SLOCK(fdp); for (i = 0; i < fdp->fd_nfiles; ++i) { fp = fget_locked(fdp, i); if (fp != NULL && fp->f_ops == &mqueueops) { mq = FPTOMQ(fp); mtx_lock(&mq->mq_mutex); notifier_remove(p, FPTOMQ(fp), i); mtx_unlock(&mq->mq_mutex); } } FILEDESC_SUNLOCK(fdp); KASSERT(LIST_EMPTY(&p->p_mqnotifier), ("mq notifiers left")); } static int mqf_read(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { return (EOPNOTSUPP); } static int mqf_write(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { return (EOPNOTSUPP); } static int mqf_ioctl(struct file *fp, u_long cmd, void *data, struct ucred *active_cred, struct thread *td) { return (ENOTTY); } static int mqf_poll(struct file *fp, int events, struct ucred *active_cred, struct thread *td) { struct mqueue *mq = FPTOMQ(fp); int revents = 0; mtx_lock(&mq->mq_mutex); if (events & (POLLIN | POLLRDNORM)) { if (mq->mq_curmsgs) { revents |= events & (POLLIN | POLLRDNORM); } else { mq->mq_flags |= MQ_RSEL; selrecord(td, &mq->mq_rsel); } } if (events & POLLOUT) { if (mq->mq_curmsgs < mq->mq_maxmsg) revents |= POLLOUT; else { mq->mq_flags |= MQ_WSEL; selrecord(td, &mq->mq_wsel); } } mtx_unlock(&mq->mq_mutex); return (revents); } static int mqf_close(struct file *fp, struct thread *td) { struct mqfs_node *pn; fp->f_ops = 
&badfileops; pn = fp->f_data; fp->f_data = NULL; sx_xlock(&mqfs_data.mi_lock); mqnode_release(pn); sx_xunlock(&mqfs_data.mi_lock); return (0); } static int mqf_stat(struct file *fp, struct stat *st, struct ucred *active_cred, struct thread *td) { struct mqfs_node *pn = fp->f_data; bzero(st, sizeof *st); st->st_atimespec = pn->mn_atime; st->st_mtimespec = pn->mn_mtime; st->st_ctimespec = pn->mn_ctime; st->st_birthtimespec = pn->mn_birth; st->st_uid = pn->mn_uid; st->st_gid = pn->mn_gid; st->st_mode = S_IFIFO | pn->mn_mode; return (0); } static int mqf_kqfilter(struct file *fp, struct knote *kn) { struct mqueue *mq = FPTOMQ(fp); int error = 0; if (kn->kn_filter == EVFILT_READ) { kn->kn_fop = &mq_rfiltops; knlist_add(&mq->mq_rsel.si_note, kn, 0); } else if (kn->kn_filter == EVFILT_WRITE) { kn->kn_fop = &mq_wfiltops; knlist_add(&mq->mq_wsel.si_note, kn, 0); } else error = EINVAL; return (error); } static void filt_mqdetach(struct knote *kn) { struct mqueue *mq = FPTOMQ(kn->kn_fp); if (kn->kn_filter == EVFILT_READ) knlist_remove(&mq->mq_rsel.si_note, kn, 0); else if (kn->kn_filter == EVFILT_WRITE) knlist_remove(&mq->mq_wsel.si_note, kn, 0); else panic("filt_mqdetach"); } static int filt_mqread(struct knote *kn, long hint) { struct mqueue *mq = FPTOMQ(kn->kn_fp); mtx_assert(&mq->mq_mutex, MA_OWNED); return (mq->mq_curmsgs != 0); } static int filt_mqwrite(struct knote *kn, long hint) { struct mqueue *mq = FPTOMQ(kn->kn_fp); mtx_assert(&mq->mq_mutex, MA_OWNED); return (mq->mq_curmsgs < mq->mq_maxmsg); } static struct fileops mqueueops = { .fo_read = mqf_read, .fo_write = mqf_write, .fo_ioctl = mqf_ioctl, .fo_poll = mqf_poll, .fo_kqfilter = mqf_kqfilter, .fo_stat = mqf_stat, .fo_close = mqf_close }; static struct vop_vector mqfs_vnodeops = { .vop_default = &default_vnodeops, .vop_access = mqfs_access, .vop_cachedlookup = mqfs_lookup, .vop_lookup = vfs_cache_lookup, .vop_reclaim = mqfs_reclaim, .vop_create = mqfs_create, .vop_remove = mqfs_remove, .vop_inactive = mqfs_inactive, .vop_open = mqfs_open, .vop_close = mqfs_close, .vop_getattr = mqfs_getattr, .vop_setattr = mqfs_setattr, .vop_read = mqfs_read, .vop_write = VOP_EOPNOTSUPP, .vop_readdir = mqfs_readdir, .vop_mkdir = VOP_EOPNOTSUPP, .vop_rmdir = VOP_EOPNOTSUPP }; static struct vfsops mqfs_vfsops = { .vfs_init = mqfs_init, .vfs_uninit = mqfs_uninit, .vfs_mount = mqfs_mount, .vfs_unmount = mqfs_unmount, .vfs_root = mqfs_root, .vfs_statfs = mqfs_statfs, }; SYSCALL_MODULE_HELPER(kmq_open); SYSCALL_MODULE_HELPER(kmq_setattr); SYSCALL_MODULE_HELPER(kmq_timedsend); SYSCALL_MODULE_HELPER(kmq_timedreceive); SYSCALL_MODULE_HELPER(kmq_notify); SYSCALL_MODULE_HELPER(kmq_unlink); VFS_SET(mqfs_vfsops, mqueuefs, VFCF_SYNTHETIC); MODULE_VERSION(mqueuefs, 1); Index: head/sys/kern/uipc_syscalls.c =================================================================== --- head/sys/kern/uipc_syscalls.c (revision 174987) +++ head/sys/kern/uipc_syscalls.c (revision 174988) @@ -1,2652 +1,2629 @@ /*- * Copyright (c) 1982, 1986, 1989, 1990, 1993 * The Regents of the University of California. All rights reserved. * * sendfile(2) and related extensions: * Copyright (c) 1998, David Greenman. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)uipc_syscalls.c 8.4 (Berkeley) 2/21/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_sctp.h" #include "opt_compat.h" #include "opt_ktrace.h" #include "opt_mac.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #include #include #include #include #include #include #include #ifdef SCTP #include #include #endif /* SCTP */ static int sendit(struct thread *td, int s, struct msghdr *mp, int flags); static int recvit(struct thread *td, int s, struct msghdr *mp, void *namelenp); static int accept1(struct thread *td, struct accept_args *uap, int compat); static int do_sendfile(struct thread *td, struct sendfile_args *uap, int compat); static int getsockname1(struct thread *td, struct getsockname_args *uap, int compat); static int getpeername1(struct thread *td, struct getpeername_args *uap, int compat); /* * NSFBUFS-related variables and associated sysctls */ int nsfbufs; int nsfbufspeak; int nsfbufsused; SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufs, CTLFLAG_RDTUN, &nsfbufs, 0, "Maximum number of sendfile(2) sf_bufs available"); SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufspeak, CTLFLAG_RD, &nsfbufspeak, 0, "Number of sendfile(2) sf_bufs at peak usage"); SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufsused, CTLFLAG_RD, &nsfbufsused, 0, "Number of sendfile(2) sf_bufs in use"); /* * Convert a user file descriptor to a kernel file entry. A reference on the * file entry is held upon returning. This is lighter weight than * fgetsock(), which bumps the socket reference drops the file reference * count instead, as this approach avoids several additional mutex operations * associated with the additional reference count. If requested, return the * open file flags. */ static int getsock(struct filedesc *fdp, int fd, struct file **fpp, u_int *fflagp) { struct file *fp; int error; fp = NULL; if (fdp == NULL) error = EBADF; else { FILEDESC_SLOCK(fdp); fp = fget_locked(fdp, fd); if (fp == NULL) error = EBADF; else if (fp->f_type != DTYPE_SOCKET) { fp = NULL; error = ENOTSOCK; } else { fhold(fp); if (fflagp != NULL) *fflagp = fp->f_flag; error = 0; } FILEDESC_SUNLOCK(fdp); } *fpp = fp; return (error); } /* * System call interface to the socket abstraction. 
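 *
 * Most of the syscalls below share one pattern around getsock()/fdrop()
 * (a sketch, not literal code from any single function):
 *
 *	error = getsock(td->td_proc->p_fd, fd, &fp, NULL);
 *	if (error == 0) {
 *		so = fp->f_data;
 *		... operate on the socket ...
 *		fdrop(fp, td);
 *	}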
*/ #if defined(COMPAT_43) #define COMPAT_OLDSOCK #endif int socket(td, uap) struct thread *td; struct socket_args /* { int domain; int type; int protocol; } */ *uap; { struct filedesc *fdp; struct socket *so; struct file *fp; int fd, error; #ifdef MAC error = mac_socket_check_create(td->td_ucred, uap->domain, uap->type, uap->protocol); if (error) return (error); #endif fdp = td->td_proc->p_fd; error = falloc(td, &fp, &fd); if (error) return (error); /* An extra reference on `fp' has been held for us by falloc(). */ error = socreate(uap->domain, &so, uap->type, uap->protocol, td->td_ucred, td); if (error) { fdclose(fdp, fp, fd, td); } else { - FILE_LOCK(fp); - fp->f_data = so; /* already has ref count */ - fp->f_flag = FREAD|FWRITE; - fp->f_type = DTYPE_SOCKET; - fp->f_ops = &socketops; - FILE_UNLOCK(fp); + finit(fp, FREAD | FWRITE, DTYPE_SOCKET, so, &socketops); td->td_retval[0] = fd; } fdrop(fp, td); return (error); } /* ARGSUSED */ int bind(td, uap) struct thread *td; struct bind_args /* { int s; caddr_t name; int namelen; } */ *uap; { struct sockaddr *sa; int error; if ((error = getsockaddr(&sa, uap->name, uap->namelen)) != 0) return (error); error = kern_bind(td, uap->s, sa); free(sa, M_SONAME); return (error); } int kern_bind(td, fd, sa) struct thread *td; int fd; struct sockaddr *sa; { struct socket *so; struct file *fp; int error; error = getsock(td->td_proc->p_fd, fd, &fp, NULL); if (error) return (error); so = fp->f_data; #ifdef MAC SOCK_LOCK(so); error = mac_socket_check_bind(td->td_ucred, so, sa); SOCK_UNLOCK(so); if (error) goto done; #endif error = sobind(so, sa, td); #ifdef MAC done: #endif fdrop(fp, td); return (error); } /* ARGSUSED */ int listen(td, uap) struct thread *td; struct listen_args /* { int s; int backlog; } */ *uap; { struct socket *so; struct file *fp; int error; error = getsock(td->td_proc->p_fd, uap->s, &fp, NULL); if (error == 0) { so = fp->f_data; #ifdef MAC SOCK_LOCK(so); error = mac_socket_check_listen(td->td_ucred, so); SOCK_UNLOCK(so); if (error) goto done; #endif error = solisten(so, uap->backlog, td); #ifdef MAC done: #endif fdrop(fp, td); } return(error); } /* * accept1() */ static int accept1(td, uap, compat) struct thread *td; struct accept_args /* { int s; struct sockaddr * __restrict name; socklen_t * __restrict anamelen; } */ *uap; int compat; { struct sockaddr *name; socklen_t namelen; struct file *fp; int error; if (uap->name == NULL) return (kern_accept(td, uap->s, NULL, NULL, NULL)); error = copyin(uap->anamelen, &namelen, sizeof (namelen)); if (error) return (error); error = kern_accept(td, uap->s, &name, &namelen, &fp); /* * return a namelen of zero for older code which might * ignore the return value from accept. 
*/ if (error) { (void) copyout(&namelen, uap->anamelen, sizeof(*uap->anamelen)); return (error); } if (error == 0 && name != NULL) { #ifdef COMPAT_OLDSOCK if (compat) ((struct osockaddr *)name)->sa_family = name->sa_family; #endif error = copyout(name, uap->name, namelen); } if (error == 0) error = copyout(&namelen, uap->anamelen, sizeof(namelen)); if (error) fdclose(td->td_proc->p_fd, fp, td->td_retval[0], td); fdrop(fp, td); free(name, M_SONAME); return (error); } int kern_accept(struct thread *td, int s, struct sockaddr **name, socklen_t *namelen, struct file **fp) { struct filedesc *fdp; struct file *headfp, *nfp = NULL; struct sockaddr *sa = NULL; int error; struct socket *head, *so; int fd; u_int fflag; pid_t pgid; int tmp; if (name) { *name = NULL; if (*namelen < 0) return (EINVAL); } fdp = td->td_proc->p_fd; error = getsock(fdp, s, &headfp, &fflag); if (error) return (error); head = headfp->f_data; if ((head->so_options & SO_ACCEPTCONN) == 0) { error = EINVAL; goto done; } #ifdef MAC SOCK_LOCK(head); error = mac_socket_check_accept(td->td_ucred, head); SOCK_UNLOCK(head); if (error != 0) goto done; #endif error = falloc(td, &nfp, &fd); if (error) goto done; ACCEPT_LOCK(); if ((head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->so_comp)) { ACCEPT_UNLOCK(); error = EWOULDBLOCK; goto noconnection; } while (TAILQ_EMPTY(&head->so_comp) && head->so_error == 0) { if (head->so_rcv.sb_state & SBS_CANTRCVMORE) { head->so_error = ECONNABORTED; break; } error = msleep(&head->so_timeo, &accept_mtx, PSOCK | PCATCH, "accept", 0); if (error) { ACCEPT_UNLOCK(); goto noconnection; } } if (head->so_error) { error = head->so_error; head->so_error = 0; ACCEPT_UNLOCK(); goto noconnection; } so = TAILQ_FIRST(&head->so_comp); KASSERT(!(so->so_qstate & SQ_INCOMP), ("accept1: so SQ_INCOMP")); KASSERT(so->so_qstate & SQ_COMP, ("accept1: so not SQ_COMP")); /* * Before changing the flags on the socket, we have to bump the * reference count. Otherwise, if the protocol calls sofree(), * the socket will be released due to a zero refcount. */ SOCK_LOCK(so); /* soref() and so_state update */ soref(so); /* file descriptor reference */ TAILQ_REMOVE(&head->so_comp, so, so_list); head->so_qlen--; so->so_state |= (head->so_state & SS_NBIO); so->so_qstate &= ~SQ_COMP; so->so_head = NULL; SOCK_UNLOCK(so); ACCEPT_UNLOCK(); /* An extra reference on `nfp' has been held for us by falloc(). */ td->td_retval[0] = fd; /* connection has been removed from the listen queue */ KNOTE_UNLOCKED(&head->so_rcv.sb_sel.si_note, 0); pgid = fgetown(&head->so_sigio); if (pgid != 0) fsetown(pgid, &so->so_sigio); - FILE_LOCK(nfp); - nfp->f_data = so; /* nfp has ref count from falloc */ - nfp->f_flag = fflag; - nfp->f_type = DTYPE_SOCKET; - nfp->f_ops = &socketops; - FILE_UNLOCK(nfp); + finit(nfp, fflag, DTYPE_SOCKET, so, &socketops); /* Sync socket nonblocking/async state with file flags */ tmp = fflag & FNONBLOCK; (void) fo_ioctl(nfp, FIONBIO, &tmp, td->td_ucred, td); tmp = fflag & FASYNC; (void) fo_ioctl(nfp, FIOASYNC, &tmp, td->td_ucred, td); sa = 0; error = soaccept(so, &sa); if (error) { /* * return a namelen of zero for older code which might * ignore the return value from accept. */ if (name) *namelen = 0; goto noconnection; } if (sa == NULL) { if (name) *namelen = 0; goto done; } if (name) { /* check sa_len before it is destroyed */ if (*namelen > sa->sa_len) *namelen = sa->sa_len; *name = sa; sa = NULL; } noconnection: if (sa) FREE(sa, M_SONAME); /* * close the new descriptor, assuming someone hasn't ripped it * out from under us. 
*/ if (error) fdclose(fdp, nfp, fd, td); /* * Release explicitly held references before returning. We return * a reference on nfp to the caller on success if they request it. */ done: if (fp != NULL) { if (error == 0) { *fp = nfp; nfp = NULL; } else *fp = NULL; } if (nfp != NULL) fdrop(nfp, td); fdrop(headfp, td); return (error); } int accept(td, uap) struct thread *td; struct accept_args *uap; { return (accept1(td, uap, 0)); } #ifdef COMPAT_OLDSOCK int oaccept(td, uap) struct thread *td; struct accept_args *uap; { return (accept1(td, uap, 1)); } #endif /* COMPAT_OLDSOCK */ /* ARGSUSED */ int connect(td, uap) struct thread *td; struct connect_args /* { int s; caddr_t name; int namelen; } */ *uap; { struct sockaddr *sa; int error; error = getsockaddr(&sa, uap->name, uap->namelen); if (error) return (error); error = kern_connect(td, uap->s, sa); free(sa, M_SONAME); return (error); } int kern_connect(td, fd, sa) struct thread *td; int fd; struct sockaddr *sa; { struct socket *so; struct file *fp; int error; int interrupted = 0; error = getsock(td->td_proc->p_fd, fd, &fp, NULL); if (error) return (error); so = fp->f_data; if (so->so_state & SS_ISCONNECTING) { error = EALREADY; goto done1; } #ifdef MAC SOCK_LOCK(so); error = mac_socket_check_connect(td->td_ucred, so, sa); SOCK_UNLOCK(so); if (error) goto bad; #endif error = soconnect(so, sa, td); if (error) goto bad; if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING)) { error = EINPROGRESS; goto done1; } SOCK_LOCK(so); while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) { error = msleep(&so->so_timeo, SOCK_MTX(so), PSOCK | PCATCH, "connec", 0); if (error) { if (error == EINTR || error == ERESTART) interrupted = 1; break; } } if (error == 0) { error = so->so_error; so->so_error = 0; } SOCK_UNLOCK(so); bad: if (!interrupted) so->so_state &= ~SS_ISCONNECTING; if (error == ERESTART) error = EINTR; done1: fdrop(fp, td); return (error); } int socketpair(td, uap) struct thread *td; struct socketpair_args /* { int domain; int type; int protocol; int *rsv; } */ *uap; { struct filedesc *fdp = td->td_proc->p_fd; struct file *fp1, *fp2; struct socket *so1, *so2; int fd, error, sv[2]; #ifdef MAC /* We might want to have a separate check for socket pairs. */ error = mac_socket_check_create(td->td_ucred, uap->domain, uap->type, uap->protocol); if (error) return (error); #endif error = socreate(uap->domain, &so1, uap->type, uap->protocol, td->td_ucred, td); if (error) return (error); error = socreate(uap->domain, &so2, uap->type, uap->protocol, td->td_ucred, td); if (error) goto free1; /* On success extra reference to `fp1' and 'fp2' is set by falloc. */ error = falloc(td, &fp1, &fd); if (error) goto free2; sv[0] = fd; fp1->f_data = so1; /* so1 already has ref count */ error = falloc(td, &fp2, &fd); if (error) goto free3; fp2->f_data = so2; /* so2 already has ref count */ sv[1] = fd; error = soconnect2(so1, so2); if (error) goto free4; if (uap->type == SOCK_DGRAM) { /* * Datagram socket connection is asymmetric. 
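 *
 * i.e. connecting so1 to so2 does not connect so2 back to so1, so a second
 * soconnect2() call in the opposite direction is needed for a datagram
 * socketpair; stream sockets are fully joined by the first call.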
*/ error = soconnect2(so2, so1); if (error) goto free4; } - FILE_LOCK(fp1); - fp1->f_flag = FREAD|FWRITE; - fp1->f_type = DTYPE_SOCKET; - fp1->f_ops = &socketops; - FILE_UNLOCK(fp1); - FILE_LOCK(fp2); - fp2->f_flag = FREAD|FWRITE; - fp2->f_type = DTYPE_SOCKET; - fp2->f_ops = &socketops; - FILE_UNLOCK(fp2); + finit(fp1, FREAD | FWRITE, DTYPE_SOCKET, fp1->f_data, &socketops); + finit(fp2, FREAD | FWRITE, DTYPE_SOCKET, fp2->f_data, &socketops); so1 = so2 = NULL; error = copyout(sv, uap->rsv, 2 * sizeof (int)); if (error) goto free4; fdrop(fp1, td); fdrop(fp2, td); return (0); free4: fdclose(fdp, fp2, sv[1], td); fdrop(fp2, td); free3: fdclose(fdp, fp1, sv[0], td); fdrop(fp1, td); free2: if (so2 != NULL) (void)soclose(so2); free1: if (so1 != NULL) (void)soclose(so1); return (error); } static int sendit(td, s, mp, flags) struct thread *td; int s; struct msghdr *mp; int flags; { struct mbuf *control; struct sockaddr *to; int error; if (mp->msg_name != NULL) { error = getsockaddr(&to, mp->msg_name, mp->msg_namelen); if (error) { to = NULL; goto bad; } mp->msg_name = to; } else { to = NULL; } if (mp->msg_control) { if (mp->msg_controllen < sizeof(struct cmsghdr) #ifdef COMPAT_OLDSOCK && mp->msg_flags != MSG_COMPAT #endif ) { error = EINVAL; goto bad; } error = sockargs(&control, mp->msg_control, mp->msg_controllen, MT_CONTROL); if (error) goto bad; #ifdef COMPAT_OLDSOCK if (mp->msg_flags == MSG_COMPAT) { struct cmsghdr *cm; M_PREPEND(control, sizeof(*cm), M_TRYWAIT); if (control == 0) { error = ENOBUFS; goto bad; } else { cm = mtod(control, struct cmsghdr *); cm->cmsg_len = control->m_len; cm->cmsg_level = SOL_SOCKET; cm->cmsg_type = SCM_RIGHTS; } } #endif } else { control = NULL; } error = kern_sendit(td, s, mp, flags, control, UIO_USERSPACE); bad: if (to) FREE(to, M_SONAME); return (error); } int kern_sendit(td, s, mp, flags, control, segflg) struct thread *td; int s; struct msghdr *mp; int flags; struct mbuf *control; enum uio_seg segflg; { struct file *fp; struct uio auio; struct iovec *iov; struct socket *so; int i; int len, error; #ifdef KTRACE struct uio *ktruio = NULL; #endif error = getsock(td->td_proc->p_fd, s, &fp, NULL); if (error) return (error); so = (struct socket *)fp->f_data; #ifdef MAC SOCK_LOCK(so); error = mac_socket_check_send(td->td_ucred, so); SOCK_UNLOCK(so); if (error) goto bad; #endif auio.uio_iov = mp->msg_iov; auio.uio_iovcnt = mp->msg_iovlen; auio.uio_segflg = segflg; auio.uio_rw = UIO_WRITE; auio.uio_td = td; auio.uio_offset = 0; /* XXX */ auio.uio_resid = 0; iov = mp->msg_iov; for (i = 0; i < mp->msg_iovlen; i++, iov++) { if ((auio.uio_resid += iov->iov_len) < 0) { error = EINVAL; goto bad; } } #ifdef KTRACE if (KTRPOINT(td, KTR_GENIO)) ktruio = cloneuio(&auio); #endif len = auio.uio_resid; error = sosend(so, mp->msg_name, &auio, 0, control, flags, td); if (error) { if (auio.uio_resid != len && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) error = 0; /* Generation of SIGPIPE can be controlled per socket */ if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE) && !(flags & MSG_NOSIGNAL)) { PROC_LOCK(td->td_proc); psignal(td->td_proc, SIGPIPE); PROC_UNLOCK(td->td_proc); } } if (error == 0) td->td_retval[0] = len - auio.uio_resid; #ifdef KTRACE if (ktruio != NULL) { ktruio->uio_resid = td->td_retval[0]; ktrgenio(s, UIO_WRITE, ktruio, error); } #endif bad: fdrop(fp, td); return (error); } int sendto(td, uap) struct thread *td; struct sendto_args /* { int s; caddr_t buf; size_t len; int flags; caddr_t to; int tolen; } */ *uap; { struct msghdr msg; struct 
iovec aiov; int error; msg.msg_name = uap->to; msg.msg_namelen = uap->tolen; msg.msg_iov = &aiov; msg.msg_iovlen = 1; msg.msg_control = 0; #ifdef COMPAT_OLDSOCK msg.msg_flags = 0; #endif aiov.iov_base = uap->buf; aiov.iov_len = uap->len; error = sendit(td, uap->s, &msg, uap->flags); return (error); } #ifdef COMPAT_OLDSOCK int osend(td, uap) struct thread *td; struct osend_args /* { int s; caddr_t buf; int len; int flags; } */ *uap; { struct msghdr msg; struct iovec aiov; int error; msg.msg_name = 0; msg.msg_namelen = 0; msg.msg_iov = &aiov; msg.msg_iovlen = 1; aiov.iov_base = uap->buf; aiov.iov_len = uap->len; msg.msg_control = 0; msg.msg_flags = 0; error = sendit(td, uap->s, &msg, uap->flags); return (error); } int osendmsg(td, uap) struct thread *td; struct osendmsg_args /* { int s; caddr_t msg; int flags; } */ *uap; { struct msghdr msg; struct iovec *iov; int error; error = copyin(uap->msg, &msg, sizeof (struct omsghdr)); if (error) return (error); error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE); if (error) return (error); msg.msg_iov = iov; msg.msg_flags = MSG_COMPAT; error = sendit(td, uap->s, &msg, uap->flags); free(iov, M_IOV); return (error); } #endif int sendmsg(td, uap) struct thread *td; struct sendmsg_args /* { int s; caddr_t msg; int flags; } */ *uap; { struct msghdr msg; struct iovec *iov; int error; error = copyin(uap->msg, &msg, sizeof (msg)); if (error) return (error); error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE); if (error) return (error); msg.msg_iov = iov; #ifdef COMPAT_OLDSOCK msg.msg_flags = 0; #endif error = sendit(td, uap->s, &msg, uap->flags); free(iov, M_IOV); return (error); } int kern_recvit(td, s, mp, fromseg, controlp) struct thread *td; int s; struct msghdr *mp; enum uio_seg fromseg; struct mbuf **controlp; { struct uio auio; struct iovec *iov; int i; socklen_t len; int error; struct mbuf *m, *control = 0; caddr_t ctlbuf; struct file *fp; struct socket *so; struct sockaddr *fromsa = 0; #ifdef KTRACE struct uio *ktruio = NULL; #endif if(controlp != NULL) *controlp = 0; error = getsock(td->td_proc->p_fd, s, &fp, NULL); if (error) return (error); so = fp->f_data; #ifdef MAC SOCK_LOCK(so); error = mac_socket_check_receive(td->td_ucred, so); SOCK_UNLOCK(so); if (error) { fdrop(fp, td); return (error); } #endif auio.uio_iov = mp->msg_iov; auio.uio_iovcnt = mp->msg_iovlen; auio.uio_segflg = UIO_USERSPACE; auio.uio_rw = UIO_READ; auio.uio_td = td; auio.uio_offset = 0; /* XXX */ auio.uio_resid = 0; iov = mp->msg_iov; for (i = 0; i < mp->msg_iovlen; i++, iov++) { if ((auio.uio_resid += iov->iov_len) < 0) { fdrop(fp, td); return (EINVAL); } } #ifdef KTRACE if (KTRPOINT(td, KTR_GENIO)) ktruio = cloneuio(&auio); #endif len = auio.uio_resid; error = soreceive(so, &fromsa, &auio, (struct mbuf **)0, (mp->msg_control || controlp) ? 
&control : (struct mbuf **)0, &mp->msg_flags); if (error) { if (auio.uio_resid != (int)len && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) error = 0; } #ifdef KTRACE if (ktruio != NULL) { ktruio->uio_resid = (int)len - auio.uio_resid; ktrgenio(s, UIO_READ, ktruio, error); } #endif if (error) goto out; td->td_retval[0] = (int)len - auio.uio_resid; if (mp->msg_name) { len = mp->msg_namelen; if (len <= 0 || fromsa == 0) len = 0; else { /* save sa_len before it is destroyed by MSG_COMPAT */ len = MIN(len, fromsa->sa_len); #ifdef COMPAT_OLDSOCK if (mp->msg_flags & MSG_COMPAT) ((struct osockaddr *)fromsa)->sa_family = fromsa->sa_family; #endif if (fromseg == UIO_USERSPACE) { error = copyout(fromsa, mp->msg_name, (unsigned)len); if (error) goto out; } else bcopy(fromsa, mp->msg_name, len); } mp->msg_namelen = len; } if (mp->msg_control && controlp == NULL) { #ifdef COMPAT_OLDSOCK /* * We assume that old recvmsg calls won't receive access * rights and other control info, esp. as control info * is always optional and those options didn't exist in 4.3. * If we receive rights, trim the cmsghdr; anything else * is tossed. */ if (control && mp->msg_flags & MSG_COMPAT) { if (mtod(control, struct cmsghdr *)->cmsg_level != SOL_SOCKET || mtod(control, struct cmsghdr *)->cmsg_type != SCM_RIGHTS) { mp->msg_controllen = 0; goto out; } control->m_len -= sizeof (struct cmsghdr); control->m_data += sizeof (struct cmsghdr); } #endif len = mp->msg_controllen; m = control; mp->msg_controllen = 0; ctlbuf = mp->msg_control; while (m && len > 0) { unsigned int tocopy; if (len >= m->m_len) tocopy = m->m_len; else { mp->msg_flags |= MSG_CTRUNC; tocopy = len; } if ((error = copyout(mtod(m, caddr_t), ctlbuf, tocopy)) != 0) goto out; ctlbuf += tocopy; len -= tocopy; m = m->m_next; } mp->msg_controllen = ctlbuf - (caddr_t)mp->msg_control; } out: fdrop(fp, td); if (fromsa) FREE(fromsa, M_SONAME); if (error == 0 && controlp != NULL) *controlp = control; else if (control) m_freem(control); return (error); } static int recvit(td, s, mp, namelenp) struct thread *td; int s; struct msghdr *mp; void *namelenp; { int error; error = kern_recvit(td, s, mp, UIO_USERSPACE, NULL); if (error) return (error); if (namelenp) { error = copyout(&mp->msg_namelen, namelenp, sizeof (socklen_t)); #ifdef COMPAT_OLDSOCK if (mp->msg_flags & MSG_COMPAT) error = 0; /* old recvfrom didn't check */ #endif } return (error); } int recvfrom(td, uap) struct thread *td; struct recvfrom_args /* { int s; caddr_t buf; size_t len; int flags; struct sockaddr * __restrict from; socklen_t * __restrict fromlenaddr; } */ *uap; { struct msghdr msg; struct iovec aiov; int error; if (uap->fromlenaddr) { error = copyin(uap->fromlenaddr, &msg.msg_namelen, sizeof (msg.msg_namelen)); if (error) goto done2; } else { msg.msg_namelen = 0; } msg.msg_name = uap->from; msg.msg_iov = &aiov; msg.msg_iovlen = 1; aiov.iov_base = uap->buf; aiov.iov_len = uap->len; msg.msg_control = 0; msg.msg_flags = uap->flags; error = recvit(td, uap->s, &msg, uap->fromlenaddr); done2: return(error); } #ifdef COMPAT_OLDSOCK int orecvfrom(td, uap) struct thread *td; struct recvfrom_args *uap; { uap->flags |= MSG_COMPAT; return (recvfrom(td, uap)); } #endif #ifdef COMPAT_OLDSOCK int orecv(td, uap) struct thread *td; struct orecv_args /* { int s; caddr_t buf; int len; int flags; } */ *uap; { struct msghdr msg; struct iovec aiov; int error; msg.msg_name = 0; msg.msg_namelen = 0; msg.msg_iov = &aiov; msg.msg_iovlen = 1; aiov.iov_base = uap->buf; aiov.iov_len = uap->len; msg.msg_control = 
0; msg.msg_flags = uap->flags; error = recvit(td, uap->s, &msg, NULL); return (error); } /* * Old recvmsg. This code takes advantage of the fact that the old msghdr * overlays the new one, missing only the flags, and with the (old) access * rights where the control fields are now. */ int orecvmsg(td, uap) struct thread *td; struct orecvmsg_args /* { int s; struct omsghdr *msg; int flags; } */ *uap; { struct msghdr msg; struct iovec *iov; int error; error = copyin(uap->msg, &msg, sizeof (struct omsghdr)); if (error) return (error); error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE); if (error) return (error); msg.msg_flags = uap->flags | MSG_COMPAT; msg.msg_iov = iov; error = recvit(td, uap->s, &msg, &uap->msg->msg_namelen); if (msg.msg_controllen && error == 0) error = copyout(&msg.msg_controllen, &uap->msg->msg_accrightslen, sizeof (int)); free(iov, M_IOV); return (error); } #endif int recvmsg(td, uap) struct thread *td; struct recvmsg_args /* { int s; struct msghdr *msg; int flags; } */ *uap; { struct msghdr msg; struct iovec *uiov, *iov; int error; error = copyin(uap->msg, &msg, sizeof (msg)); if (error) return (error); error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE); if (error) return (error); msg.msg_flags = uap->flags; #ifdef COMPAT_OLDSOCK msg.msg_flags &= ~MSG_COMPAT; #endif uiov = msg.msg_iov; msg.msg_iov = iov; error = recvit(td, uap->s, &msg, NULL); if (error == 0) { msg.msg_iov = uiov; error = copyout(&msg, uap->msg, sizeof(msg)); } free(iov, M_IOV); return (error); } /* ARGSUSED */ int shutdown(td, uap) struct thread *td; struct shutdown_args /* { int s; int how; } */ *uap; { struct socket *so; struct file *fp; int error; error = getsock(td->td_proc->p_fd, uap->s, &fp, NULL); if (error == 0) { so = fp->f_data; error = soshutdown(so, uap->how); fdrop(fp, td); } return (error); } /* ARGSUSED */ int setsockopt(td, uap) struct thread *td; struct setsockopt_args /* { int s; int level; int name; caddr_t val; int valsize; } */ *uap; { return (kern_setsockopt(td, uap->s, uap->level, uap->name, uap->val, UIO_USERSPACE, uap->valsize)); } int kern_setsockopt(td, s, level, name, val, valseg, valsize) struct thread *td; int s; int level; int name; void *val; enum uio_seg valseg; socklen_t valsize; { int error; struct socket *so; struct file *fp; struct sockopt sopt; if (val == NULL && valsize != 0) return (EFAULT); if ((int)valsize < 0) return (EINVAL); sopt.sopt_dir = SOPT_SET; sopt.sopt_level = level; sopt.sopt_name = name; sopt.sopt_val = val; sopt.sopt_valsize = valsize; switch (valseg) { case UIO_USERSPACE: sopt.sopt_td = td; break; case UIO_SYSSPACE: sopt.sopt_td = NULL; break; default: panic("kern_setsockopt called with bad valseg"); } error = getsock(td->td_proc->p_fd, s, &fp, NULL); if (error == 0) { so = fp->f_data; error = sosetopt(so, &sopt); fdrop(fp, td); } return(error); } /* ARGSUSED */ int getsockopt(td, uap) struct thread *td; struct getsockopt_args /* { int s; int level; int name; void * __restrict val; socklen_t * __restrict avalsize; } */ *uap; { socklen_t valsize; int error; if (uap->val) { error = copyin(uap->avalsize, &valsize, sizeof (valsize)); if (error) return (error); } error = kern_getsockopt(td, uap->s, uap->level, uap->name, uap->val, UIO_USERSPACE, &valsize); if (error == 0) error = copyout(&valsize, uap->avalsize, sizeof (valsize)); return (error); } /* * Kernel version of getsockopt. * optval can be a userland or userspace. optlen is always a kernel pointer. 
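 *
 * More precisely: optval (val) may be either a userland or a kernel
 * address, selected by the valseg argument; optlen (valsize) always refers
 * to kernel memory.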
*/ int kern_getsockopt(td, s, level, name, val, valseg, valsize) struct thread *td; int s; int level; int name; void *val; enum uio_seg valseg; socklen_t *valsize; { int error; struct socket *so; struct file *fp; struct sockopt sopt; if (val == NULL) *valsize = 0; if ((int)*valsize < 0) return (EINVAL); sopt.sopt_dir = SOPT_GET; sopt.sopt_level = level; sopt.sopt_name = name; sopt.sopt_val = val; sopt.sopt_valsize = (size_t)*valsize; /* checked non-negative above */ switch (valseg) { case UIO_USERSPACE: sopt.sopt_td = td; break; case UIO_SYSSPACE: sopt.sopt_td = NULL; break; default: panic("kern_getsockopt called with bad valseg"); } error = getsock(td->td_proc->p_fd, s, &fp, NULL); if (error == 0) { so = fp->f_data; error = sogetopt(so, &sopt); *valsize = sopt.sopt_valsize; fdrop(fp, td); } return (error); } /* * getsockname1() - Get socket name. */ /* ARGSUSED */ static int getsockname1(td, uap, compat) struct thread *td; struct getsockname_args /* { int fdes; struct sockaddr * __restrict asa; socklen_t * __restrict alen; } */ *uap; int compat; { struct sockaddr *sa; socklen_t len; int error; error = copyin(uap->alen, &len, sizeof(len)); if (error) return (error); error = kern_getsockname(td, uap->fdes, &sa, &len); if (error) return (error); if (len != 0) { #ifdef COMPAT_OLDSOCK if (compat) ((struct osockaddr *)sa)->sa_family = sa->sa_family; #endif error = copyout(sa, uap->asa, (u_int)len); } free(sa, M_SONAME); if (error == 0) error = copyout(&len, uap->alen, sizeof(len)); return (error); } int kern_getsockname(struct thread *td, int fd, struct sockaddr **sa, socklen_t *alen) { struct socket *so; struct file *fp; socklen_t len; int error; if (*alen < 0) return (EINVAL); error = getsock(td->td_proc->p_fd, fd, &fp, NULL); if (error) return (error); so = fp->f_data; *sa = NULL; error = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, sa); if (error) goto bad; if (*sa == NULL) len = 0; else len = MIN(*alen, (*sa)->sa_len); *alen = len; bad: fdrop(fp, td); if (error && *sa) { free(*sa, M_SONAME); *sa = NULL; } return (error); } int getsockname(td, uap) struct thread *td; struct getsockname_args *uap; { return (getsockname1(td, uap, 0)); } #ifdef COMPAT_OLDSOCK int ogetsockname(td, uap) struct thread *td; struct getsockname_args *uap; { return (getsockname1(td, uap, 1)); } #endif /* COMPAT_OLDSOCK */ /* * getpeername1() - Get name of peer for connected socket. 
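 *
 * For orientation only, a minimal userland sketch of the call this path
 * serves; the descriptor 's' is illustrative and <sys/socket.h> and
 * <stdio.h> are assumed to be included:
 *
 *	struct sockaddr_storage ss;
 *	socklen_t len = sizeof(ss);
 *
 *	if (getpeername(s, (struct sockaddr *)&ss, &len) == 0)
 *		printf("peer family %d, %u address bytes\n",
 *		    ((struct sockaddr *)&ss)->sa_family, (unsigned)len);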
*/ /* ARGSUSED */ static int getpeername1(td, uap, compat) struct thread *td; struct getpeername_args /* { int fdes; struct sockaddr * __restrict asa; socklen_t * __restrict alen; } */ *uap; int compat; { struct sockaddr *sa; socklen_t len; int error; error = copyin(uap->alen, &len, sizeof (len)); if (error) return (error); error = kern_getpeername(td, uap->fdes, &sa, &len); if (error) return (error); if (len != 0) { #ifdef COMPAT_OLDSOCK if (compat) ((struct osockaddr *)sa)->sa_family = sa->sa_family; #endif error = copyout(sa, uap->asa, (u_int)len); } free(sa, M_SONAME); if (error == 0) error = copyout(&len, uap->alen, sizeof(len)); return (error); } int kern_getpeername(struct thread *td, int fd, struct sockaddr **sa, socklen_t *alen) { struct socket *so; struct file *fp; socklen_t len; int error; if (*alen < 0) return (EINVAL); error = getsock(td->td_proc->p_fd, fd, &fp, NULL); if (error) return (error); so = fp->f_data; if ((so->so_state & (SS_ISCONNECTED|SS_ISCONFIRMING)) == 0) { error = ENOTCONN; goto done; } *sa = NULL; error = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so, sa); if (error) goto bad; if (*sa == NULL) len = 0; else len = MIN(*alen, (*sa)->sa_len); *alen = len; bad: if (error && *sa) { free(*sa, M_SONAME); *sa = NULL; } done: fdrop(fp, td); return (error); } int getpeername(td, uap) struct thread *td; struct getpeername_args *uap; { return (getpeername1(td, uap, 0)); } #ifdef COMPAT_OLDSOCK int ogetpeername(td, uap) struct thread *td; struct ogetpeername_args *uap; { /* XXX uap should have type `getpeername_args *' to begin with. */ return (getpeername1(td, (struct getpeername_args *)uap, 1)); } #endif /* COMPAT_OLDSOCK */ int sockargs(mp, buf, buflen, type) struct mbuf **mp; caddr_t buf; int buflen, type; { struct sockaddr *sa; struct mbuf *m; int error; if ((u_int)buflen > MLEN) { #ifdef COMPAT_OLDSOCK if (type == MT_SONAME && (u_int)buflen <= 112) buflen = MLEN; /* unix domain compat. hack */ else #endif if ((u_int)buflen > MCLBYTES) return (EINVAL); } m = m_get(M_TRYWAIT, type); if (m == NULL) return (ENOBUFS); if ((u_int)buflen > MLEN) { MCLGET(m, M_TRYWAIT); if ((m->m_flags & M_EXT) == 0) { m_free(m); return (ENOBUFS); } } m->m_len = buflen; error = copyin(buf, mtod(m, caddr_t), (u_int)buflen); if (error) (void) m_free(m); else { *mp = m; if (type == MT_SONAME) { sa = mtod(m, struct sockaddr *); #if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN if (sa->sa_family == 0 && sa->sa_len < AF_MAX) sa->sa_family = sa->sa_len; #endif sa->sa_len = buflen; } } return (error); } int getsockaddr(namp, uaddr, len) struct sockaddr **namp; caddr_t uaddr; size_t len; { struct sockaddr *sa; int error; if (len > SOCK_MAXADDRLEN) return (ENAMETOOLONG); if (len < offsetof(struct sockaddr, sa_data[0])) return (EINVAL); MALLOC(sa, struct sockaddr *, len, M_SONAME, M_WAITOK); error = copyin(uaddr, sa, len); if (error) { FREE(sa, M_SONAME); } else { #if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN if (sa->sa_family == 0 && sa->sa_len < AF_MAX) sa->sa_family = sa->sa_len; #endif sa->sa_len = len; *namp = sa; } return (error); } /* * Detach mapped page and release resources back to the system. */ void sf_buf_mext(void *addr, void *args) { vm_page_t m; m = sf_buf_page(args); sf_buf_free(args); vm_page_lock_queues(); vm_page_unwire(m, 0); /* * Check for the object going away on us. This can * happen since we don't hold a reference to it. * If so, we're responsible for freeing the page. 
*/ if (m->wire_count == 0 && m->object == NULL) vm_page_free(m); vm_page_unlock_queues(); } /* * sendfile(2) * * int sendfile(int fd, int s, off_t offset, size_t nbytes, * struct sf_hdtr *hdtr, off_t *sbytes, int flags) * * Send a file specified by 'fd' and starting at 'offset' to a socket * specified by 's'. Send only 'nbytes' of the file or until EOF if nbytes == * 0. Optionally add a header and/or trailer to the socket output. If * specified, write the total number of bytes sent into *sbytes. */ int sendfile(struct thread *td, struct sendfile_args *uap) { return (do_sendfile(td, uap, 0)); } static int do_sendfile(struct thread *td, struct sendfile_args *uap, int compat) { struct sf_hdtr hdtr; struct uio *hdr_uio, *trl_uio; int error; hdr_uio = trl_uio = NULL; if (uap->hdtr != NULL) { error = copyin(uap->hdtr, &hdtr, sizeof(hdtr)); if (error) goto out; if (hdtr.headers != NULL) { error = copyinuio(hdtr.headers, hdtr.hdr_cnt, &hdr_uio); if (error) goto out; } if (hdtr.trailers != NULL) { error = copyinuio(hdtr.trailers, hdtr.trl_cnt, &trl_uio); if (error) goto out; } } error = kern_sendfile(td, uap, hdr_uio, trl_uio, compat); out: if (hdr_uio) free(hdr_uio, M_IOV); if (trl_uio) free(trl_uio, M_IOV); return (error); } #ifdef COMPAT_FREEBSD4 int freebsd4_sendfile(struct thread *td, struct freebsd4_sendfile_args *uap) { struct sendfile_args args; args.fd = uap->fd; args.s = uap->s; args.offset = uap->offset; args.nbytes = uap->nbytes; args.hdtr = uap->hdtr; args.sbytes = uap->sbytes; args.flags = uap->flags; return (do_sendfile(td, &args, 1)); } #endif /* COMPAT_FREEBSD4 */ int kern_sendfile(struct thread *td, struct sendfile_args *uap, struct uio *hdr_uio, struct uio *trl_uio, int compat) { struct file *sock_fp; struct vnode *vp; struct vm_object *obj = NULL; struct socket *so = NULL; struct mbuf *m = NULL; struct sf_buf *sf; struct vm_page *pg; off_t off, xfsize, fsbytes = 0, sbytes = 0, rem = 0; int error, hdrlen = 0, mnw = 0; int vfslocked; /* * The file descriptor must be a regular file and have a * backing VM object. * File offset must be positive. If it goes beyond EOF * we send only the header/trailer and no payload data. */ if ((error = fgetvp_read(td, uap->fd, &vp)) != 0) goto out; vfslocked = VFS_LOCK_GIANT(vp->v_mount); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); obj = vp->v_object; if (obj != NULL) { /* * Temporarily increase the backing VM object's reference * count so that a forced reclamation of its vnode does not * immediately destroy it. */ VM_OBJECT_LOCK(obj); if ((obj->flags & OBJ_DEAD) == 0) { vm_object_reference_locked(obj); VM_OBJECT_UNLOCK(obj); } else { VM_OBJECT_UNLOCK(obj); obj = NULL; } } VOP_UNLOCK(vp, 0, td); VFS_UNLOCK_GIANT(vfslocked); if (obj == NULL) { error = EINVAL; goto out; } if (uap->offset < 0) { error = EINVAL; goto out; } /* * The socket must be a stream socket and connected. * Remember if it a blocking or non-blocking socket. */ if ((error = getsock(td->td_proc->p_fd, uap->s, &sock_fp, NULL)) != 0) goto out; so = sock_fp->f_data; if (so->so_type != SOCK_STREAM) { error = EINVAL; goto out; } if ((so->so_state & SS_ISCONNECTED) == 0) { error = ENOTCONN; goto out; } /* * Do not wait on memory allocations but return ENOMEM for * caller to retry later. * XXX: Experimental. */ if (uap->flags & SF_MNOWAIT) mnw = 1; #ifdef MAC SOCK_LOCK(so); error = mac_socket_check_send(td->td_ucred, so); SOCK_UNLOCK(so); if (error) goto out; #endif /* If headers are specified copy them into mbufs. 
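 *
 * For orientation, a hedged userland sketch of the header usage described
 * in the sendfile(2) block above; 'filefd' and 'sockfd' are assumed, and
 * <sys/types.h>, <sys/socket.h>, <sys/uio.h> and <stdio.h> are included:
 *
 *	struct iovec hdr = { "HTTP/1.0 200 OK\r\n\r\n", 19 };
 *	struct sf_hdtr hdtr = { &hdr, 1, NULL, 0 };
 *	off_t sbytes;
 *
 *	if (sendfile(filefd, sockfd, 0, 0, &hdtr, &sbytes, 0) == 0)
 *		printf("sent %jd bytes, header included\n", (intmax_t)sbytes);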
*/ if (hdr_uio != NULL) { hdr_uio->uio_td = td; hdr_uio->uio_rw = UIO_WRITE; if (hdr_uio->uio_resid > 0) { /* * In FBSD < 5.0 the nbytes to send also included * the header. If compat is specified subtract the * header size from nbytes. */ if (compat) { if (uap->nbytes > hdr_uio->uio_resid) uap->nbytes -= hdr_uio->uio_resid; else uap->nbytes = 0; } m = m_uiotombuf(hdr_uio, (mnw ? M_NOWAIT : M_WAITOK), 0, 0, 0); if (m == NULL) { error = mnw ? EAGAIN : ENOBUFS; goto out; } hdrlen = m_length(m, NULL); } } /* Protect against multiple writers to the socket. */ (void) sblock(&so->so_snd, M_WAITOK); /* * Loop through the pages of the file, starting with the requested * offset. Get a file page (do I/O if necessary), map the file page * into an sf_buf, attach an mbuf header to the sf_buf, and queue * it on the socket. * This is done in two loops. The inner loop turns as many pages * as it can, up to available socket buffer space, without blocking * into mbufs to have it bulk delivered into the socket send buffer. * The outer loop checks the state and available space of the socket * and takes care of the overall progress. */ for (off = uap->offset, rem = uap->nbytes; ; ) { int loopbytes = 0; int space = 0; int done = 0; /* * Check the socket state for ongoing connection, * no errors and space in socket buffer. * If space is low allow for the remainder of the * file to be processed if it fits the socket buffer. * Otherwise block in waiting for sufficient space * to proceed, or if the socket is nonblocking, return * to userland with EAGAIN while reporting how far * we've come. * We wait until the socket buffer has significant free * space to do bulk sends. This makes good use of file * system read ahead and allows packet segmentation * offloading hardware to take over lots of work. If * we were not careful here we would send off only one * sfbuf at a time. */ SOCKBUF_LOCK(&so->so_snd); if (so->so_snd.sb_lowat < so->so_snd.sb_hiwat / 2) so->so_snd.sb_lowat = so->so_snd.sb_hiwat / 2; retry_space: if (so->so_snd.sb_state & SBS_CANTSENDMORE) { error = EPIPE; SOCKBUF_UNLOCK(&so->so_snd); goto done; } else if (so->so_error) { error = so->so_error; so->so_error = 0; SOCKBUF_UNLOCK(&so->so_snd); goto done; } space = sbspace(&so->so_snd); if (space < rem && (space <= 0 || space < so->so_snd.sb_lowat)) { if (so->so_state & SS_NBIO) { SOCKBUF_UNLOCK(&so->so_snd); error = EAGAIN; goto done; } /* * sbwait drops the lock while sleeping. * When we loop back to retry_space the * state may have changed and we retest * for it. */ error = sbwait(&so->so_snd); /* * An error from sbwait usually indicates that we've * been interrupted by a signal. If we've sent anything * then return bytes sent, otherwise return the error. */ if (error) { SOCKBUF_UNLOCK(&so->so_snd); goto done; } goto retry_space; } SOCKBUF_UNLOCK(&so->so_snd); /* * Reduce space in the socket buffer by the size of * the header mbuf chain. * hdrlen is set to 0 after the first loop. */ space -= hdrlen; /* * Loop and construct maximum sized mbuf chain to be bulk * dumped into socket buffer. */ while(space > loopbytes) { vm_pindex_t pindex; vm_offset_t pgoff; struct mbuf *m0; VM_OBJECT_LOCK(obj); /* * Calculate the amount to transfer. * Not to exceed a page, the EOF, * or the passed in nbytes. 
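 *
 * A small worked example (numbers purely illustrative): with
 * PAGE_SIZE = 4096 and off = 10000,
 *
 *	pgoff  = off & PAGE_MASK             = 1808
 *	xfsize = min(PAGE_SIZE - pgoff, ...) <= 2288
 *
 * i.e. at most 2288 bytes can come from this page; xfsize is then further
 * clipped to the bytes left before EOF and, when nbytes was supplied, to
 * the bytes left in the request.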
*/ pgoff = (vm_offset_t)(off & PAGE_MASK); xfsize = omin(PAGE_SIZE - pgoff, obj->un_pager.vnp.vnp_size - uap->offset - fsbytes - loopbytes); if (uap->nbytes) rem = (uap->nbytes - fsbytes - loopbytes); else rem = obj->un_pager.vnp.vnp_size - uap->offset - fsbytes - loopbytes; xfsize = omin(rem, xfsize); if (xfsize <= 0) { VM_OBJECT_UNLOCK(obj); done = 1; /* all data sent */ break; } /* * Don't overflow the send buffer. * Stop here and send out what we've * already got. */ if (space < loopbytes + xfsize) { VM_OBJECT_UNLOCK(obj); break; } /* * Attempt to look up the page. Allocate * if not found or wait and loop if busy. */ pindex = OFF_TO_IDX(off); pg = vm_page_grab(obj, pindex, VM_ALLOC_NOBUSY | VM_ALLOC_NORMAL | VM_ALLOC_WIRED | VM_ALLOC_RETRY); /* * Check if page is valid for what we need, * otherwise initiate I/O. * If we already turned some pages into mbufs, * send them off before we come here again and * block. */ if (pg->valid && vm_page_is_valid(pg, pgoff, xfsize)) VM_OBJECT_UNLOCK(obj); else if (m != NULL) error = EAGAIN; /* send what we already got */ else if (uap->flags & SF_NODISKIO) error = EBUSY; else { int bsize, resid; /* * Ensure that our page is still around * when the I/O completes. */ vm_page_io_start(pg); VM_OBJECT_UNLOCK(obj); /* * Get the page from backing store. */ bsize = vp->v_mount->mnt_stat.f_iosize; vfslocked = VFS_LOCK_GIANT(vp->v_mount); vn_lock(vp, LK_SHARED | LK_RETRY, td); /* * XXXMAC: Because we don't have fp->f_cred * here, we pass in NOCRED. This is probably * wrong, but is consistent with our original * implementation. */ error = vn_rdwr(UIO_READ, vp, NULL, MAXBSIZE, trunc_page(off), UIO_NOCOPY, IO_NODELOCKED | IO_VMIO | ((MAXBSIZE / bsize) << IO_SEQSHIFT), td->td_ucred, NOCRED, &resid, td); VOP_UNLOCK(vp, 0, td); VFS_UNLOCK_GIANT(vfslocked); VM_OBJECT_LOCK(obj); vm_page_io_finish(pg); if (!error) VM_OBJECT_UNLOCK(obj); mbstat.sf_iocnt++; } if (error) { vm_page_lock_queues(); vm_page_unwire(pg, 0); /* * See if anyone else might know about * this page. If not and it is not valid, * then free it. */ if (pg->wire_count == 0 && pg->valid == 0 && pg->busy == 0 && !(pg->oflags & VPO_BUSY) && pg->hold_count == 0) { vm_page_free(pg); } vm_page_unlock_queues(); VM_OBJECT_UNLOCK(obj); if (error == EAGAIN) error = 0; /* not a real error */ break; } /* * Get a sendfile buf. We usually wait as long * as necessary, but this wait can be interrupted. */ if ((sf = sf_buf_alloc(pg, (mnw ? SFB_NOWAIT : SFB_CATCH))) == NULL) { mbstat.sf_allocfail++; vm_page_lock_queues(); vm_page_unwire(pg, 0); /* * XXX: Not same check as above!? */ if (pg->wire_count == 0 && pg->object == NULL) vm_page_free(pg); vm_page_unlock_queues(); error = (mnw ? EAGAIN : EINTR); break; } /* * Get an mbuf and set it up as having * external storage. */ m0 = m_get((mnw ? M_NOWAIT : M_WAITOK), MT_DATA); if (m0 == NULL) { error = (mnw ? EAGAIN : ENOBUFS); sf_buf_mext((void *)sf_buf_kva(sf), sf); break; } MEXTADD(m0, sf_buf_kva(sf), PAGE_SIZE, sf_buf_mext, sf, M_RDONLY, EXT_SFBUF); m0->m_data = (char *)sf_buf_kva(sf) + pgoff; m0->m_len = xfsize; /* Append to mbuf chain. */ if (m != NULL) m_cat(m, m0); else m = m0; /* Keep track of bits processed. */ loopbytes += xfsize; off += xfsize; } /* Add the buffer chain to the socket buffer. */ if (m != NULL) { int mlen, err; mlen = m_length(m, NULL); SOCKBUF_LOCK(&so->so_snd); if (so->so_snd.sb_state & SBS_CANTSENDMORE) { error = EPIPE; SOCKBUF_UNLOCK(&so->so_snd); goto done; } SOCKBUF_UNLOCK(&so->so_snd); /* Avoid error aliasing. 
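 * That is, a failure from pru_send below ('err') is folded into 'error'
 * only while 'error' is still zero, so an error already recorded while
 * building the mbuf chain is not overwritten by a later send failure.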
*/ err = (*so->so_proto->pr_usrreqs->pru_send) (so, 0, m, NULL, NULL, td); if (err == 0) { /* * We need two counters to get the * file offset and nbytes to send * right: * - sbytes contains the total amount * of bytes sent, including headers. * - fsbytes contains the total amount * of bytes sent from the file. */ sbytes += mlen; fsbytes += mlen; if (hdrlen) { fsbytes -= hdrlen; hdrlen = 0; } } else if (error == 0) error = err; m = NULL; /* pru_send always consumes */ } /* Quit outer loop on error or when we're done. */ if (error || done) goto done; } /* * Send trailers. Wimp out and use writev(2). */ if (trl_uio != NULL) { error = kern_writev(td, uap->s, trl_uio); if (error) goto done; sbytes += td->td_retval[0]; } done: sbunlock(&so->so_snd); out: /* * If there was no error we have to clear td->td_retval[0] * because it may have been set by writev. */ if (error == 0) { td->td_retval[0] = 0; } if (uap->sbytes != NULL) { copyout(&sbytes, uap->sbytes, sizeof(off_t)); } if (obj != NULL) vm_object_deallocate(obj); if (vp != NULL) { vfslocked = VFS_LOCK_GIANT(vp->v_mount); vrele(vp); VFS_UNLOCK_GIANT(vfslocked); } if (so) fdrop(sock_fp, td); if (m) m_freem(m); if (error == ERESTART) error = EINTR; return (error); } /* * SCTP syscalls. * Functionality only compiled in if SCTP is defined in the kernel Makefile, * otherwise all return EOPNOTSUPP. * XXX: We should make this loadable one day. */ int sctp_peeloff(td, uap) struct thread *td; struct sctp_peeloff_args /* { int sd; caddr_t name; } */ *uap; { #ifdef SCTP struct filedesc *fdp; struct file *nfp = NULL; int error; struct socket *head, *so; int fd; u_int fflag; fdp = td->td_proc->p_fd; error = fgetsock(td, uap->sd, &head, &fflag); if (error) goto done2; error = sctp_can_peel_off(head, (sctp_assoc_t)uap->name); if (error) goto done2; /* * At this point we know we do have a assoc to pull * we proceed to get the fd setup. This may block * but that is ok. */ error = falloc(td, &nfp, &fd); if (error) goto done; td->td_retval[0] = fd; so = sonewconn(head, SS_ISCONNECTED); if (so == NULL) goto noconnection; /* * Before changing the flags on the socket, we have to bump the * reference count. Otherwise, if the protocol calls sofree(), * the socket will be released due to a zero refcount. */ SOCK_LOCK(so); soref(so); /* file descriptor reference */ SOCK_UNLOCK(so); ACCEPT_LOCK(); TAILQ_REMOVE(&head->so_comp, so, so_list); head->so_qlen--; so->so_state |= (head->so_state & SS_NBIO); so->so_state &= ~SS_NOFDREF; so->so_qstate &= ~SQ_COMP; so->so_head = NULL; ACCEPT_UNLOCK(); - FILE_LOCK(nfp); - nfp->f_data = so; - nfp->f_flag = fflag; - nfp->f_type = DTYPE_SOCKET; - nfp->f_ops = &socketops; - FILE_UNLOCK(nfp); + finit(nfp, fflag, DTYPE_SOCKET, so, &socketops); error = sctp_do_peeloff(head, so, (sctp_assoc_t)uap->name); if (error) goto noconnection; if (head->so_sigio != NULL) fsetown(fgetown(&head->so_sigio), &so->so_sigio); noconnection: /* * close the new descriptor, assuming someone hasn't ripped it * out from under us. */ if (error) fdclose(fdp, nfp, fd, td); /* * Release explicitly held references before returning. 
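 *
 * (Background only, not part of this change: the usual userland entry
 * into this path is the sctp_peeloff(2) wrapper, roughly
 *
 *	sctp_assoc_t id = sinfo.sinfo_assoc_id;	(from a prior sctp_recvmsg())
 *	int newfd = sctp_peeloff(sd, id);
 *
 * where 'sd' is a one-to-many SCTP socket and 'sinfo' a struct
 * sctp_sndrcvinfo; the identifiers are illustrative.)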
*/ done: if (nfp != NULL) fdrop(nfp, td); fputsock(head); done2: return (error); #else /* SCTP */ return (EOPNOTSUPP); #endif /* SCTP */ } int sctp_generic_sendmsg (td, uap) struct thread *td; struct sctp_generic_sendmsg_args /* { int sd, caddr_t msg, int mlen, caddr_t to, __socklen_t tolen, struct sctp_sndrcvinfo *sinfo, int flags } */ *uap; { #ifdef SCTP struct sctp_sndrcvinfo sinfo, *u_sinfo = NULL; struct socket *so; struct file *fp = NULL; int use_rcvinfo = 1; int error = 0, len; struct sockaddr *to = NULL; #ifdef KTRACE struct uio *ktruio = NULL; #endif struct uio auio; struct iovec iov[1]; if (uap->sinfo) { error = copyin(uap->sinfo, &sinfo, sizeof (sinfo)); if (error) return (error); u_sinfo = &sinfo; } if (uap->tolen) { error = getsockaddr(&to, uap->to, uap->tolen); if (error) { to = NULL; goto sctp_bad2; } } error = getsock(td->td_proc->p_fd, uap->sd, &fp, NULL); if (error) goto sctp_bad; iov[0].iov_base = uap->msg; iov[0].iov_len = uap->mlen; so = (struct socket *)fp->f_data; #ifdef MAC SOCK_LOCK(so); error = mac_socket_check_send(td->td_ucred, so); SOCK_UNLOCK(so); if (error) goto sctp_bad; #endif /* MAC */ auio.uio_iov = iov; auio.uio_iovcnt = 1; auio.uio_segflg = UIO_USERSPACE; auio.uio_rw = UIO_WRITE; auio.uio_td = td; auio.uio_offset = 0; /* XXX */ auio.uio_resid = 0; len = auio.uio_resid = uap->mlen; error = sctp_lower_sosend(so, to, &auio, (struct mbuf *)NULL, (struct mbuf *)NULL, uap->flags, use_rcvinfo, u_sinfo, td); if (error) { if (auio.uio_resid != len && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) error = 0; /* Generation of SIGPIPE can be controlled per socket. */ if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE) && !(uap->flags & MSG_NOSIGNAL)) { PROC_LOCK(td->td_proc); psignal(td->td_proc, SIGPIPE); PROC_UNLOCK(td->td_proc); } } if (error == 0) td->td_retval[0] = len - auio.uio_resid; #ifdef KTRACE if (ktruio != NULL) { ktruio->uio_resid = td->td_retval[0]; ktrgenio(uap->sd, UIO_WRITE, ktruio, error); } #endif /* KTRACE */ sctp_bad: if (fp) fdrop(fp, td); sctp_bad2: if (to) free(to, M_SONAME); return (error); #else /* SCTP */ return (EOPNOTSUPP); #endif /* SCTP */ } int sctp_generic_sendmsg_iov(td, uap) struct thread *td; struct sctp_generic_sendmsg_iov_args /* { int sd, struct iovec *iov, int iovlen, caddr_t to, __socklen_t tolen, struct sctp_sndrcvinfo *sinfo, int flags } */ *uap; { #ifdef SCTP struct sctp_sndrcvinfo sinfo, *u_sinfo = NULL; struct socket *so; struct file *fp = NULL; int use_rcvinfo = 1; int error=0, len, i; struct sockaddr *to = NULL; #ifdef KTRACE struct uio *ktruio = NULL; #endif struct uio auio; struct iovec *iov, *tiov; if (uap->sinfo) { error = copyin(uap->sinfo, &sinfo, sizeof (sinfo)); if (error) return (error); u_sinfo = &sinfo; } if (uap->tolen) { error = getsockaddr(&to, uap->to, uap->tolen); if (error) { to = NULL; goto sctp_bad2; } } error = getsock(td->td_proc->p_fd, uap->sd, &fp, NULL); if (error) goto sctp_bad1; error = copyiniov(uap->iov, uap->iovlen, &iov, EMSGSIZE); if (error) goto sctp_bad1; so = (struct socket *)fp->f_data; #ifdef MAC SOCK_LOCK(so); error = mac_socket_check_send(td->td_ucred, so); SOCK_UNLOCK(so); if (error) goto sctp_bad; #endif /* MAC */ auio.uio_iov = iov; auio.uio_iovcnt = uap->iovlen; auio.uio_segflg = UIO_USERSPACE; auio.uio_rw = UIO_WRITE; auio.uio_td = td; auio.uio_offset = 0; /* XXX */ auio.uio_resid = 0; tiov = iov; for (i = 0; i iovlen; i++, tiov++) { if ((auio.uio_resid += tiov->iov_len) < 0) { error = EINVAL; goto sctp_bad; } } len = auio.uio_resid; error = 
sctp_lower_sosend(so, to, &auio, (struct mbuf *)NULL, (struct mbuf *)NULL, uap->flags, use_rcvinfo, u_sinfo, td); if (error) { if (auio.uio_resid != len && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) error = 0; /* Generation of SIGPIPE can be controlled per socket */ if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE) && !(uap->flags & MSG_NOSIGNAL)) { PROC_LOCK(td->td_proc); psignal(td->td_proc, SIGPIPE); PROC_UNLOCK(td->td_proc); } } if (error == 0) td->td_retval[0] = len - auio.uio_resid; #ifdef KTRACE if (ktruio != NULL) { ktruio->uio_resid = td->td_retval[0]; ktrgenio(uap->sd, UIO_WRITE, ktruio, error); } #endif /* KTRACE */ sctp_bad: free(iov, M_IOV); sctp_bad1: if (fp) fdrop(fp, td); sctp_bad2: if (to) free(to, M_SONAME); return (error); #else /* SCTP */ return (EOPNOTSUPP); #endif /* SCTP */ } int sctp_generic_recvmsg(td, uap) struct thread *td; struct sctp_generic_recvmsg_args /* { int sd, struct iovec *iov, int iovlen, struct sockaddr *from, __socklen_t *fromlenaddr, struct sctp_sndrcvinfo *sinfo, int *msg_flags } */ *uap; { #ifdef SCTP u_int8_t sockbufstore[256]; struct uio auio; struct iovec *iov, *tiov; struct sctp_sndrcvinfo sinfo; struct socket *so; struct file *fp = NULL; struct sockaddr *fromsa; int fromlen; int len, i, msg_flags; int error = 0; #ifdef KTRACE struct uio *ktruio = NULL; #endif error = getsock(td->td_proc->p_fd, uap->sd, &fp, NULL); if (error) { return (error); } error = copyiniov(uap->iov, uap->iovlen, &iov, EMSGSIZE); if (error) { goto out1; } so = fp->f_data; #ifdef MAC SOCK_LOCK(so); error = mac_socket_check_receive(td->td_ucred, so); SOCK_UNLOCK(so); if (error) { goto out; return (error); } #endif /* MAC */ if (uap->fromlenaddr) { error = copyin(uap->fromlenaddr, &fromlen, sizeof (fromlen)); if (error) { goto out; } } else { fromlen = 0; } if(uap->msg_flags) { error = copyin(uap->msg_flags, &msg_flags, sizeof (int)); if (error) { goto out; } } else { msg_flags = 0; } auio.uio_iov = iov; auio.uio_iovcnt = uap->iovlen; auio.uio_segflg = UIO_USERSPACE; auio.uio_rw = UIO_READ; auio.uio_td = td; auio.uio_offset = 0; /* XXX */ auio.uio_resid = 0; tiov = iov; for (i = 0; i iovlen; i++, tiov++) { if ((auio.uio_resid += tiov->iov_len) < 0) { error = EINVAL; goto out; } } len = auio.uio_resid; fromsa = (struct sockaddr *)sockbufstore; #ifdef KTRACE if (KTRPOINT(td, KTR_GENIO)) ktruio = cloneuio(&auio); #endif /* KTRACE */ error = sctp_sorecvmsg(so, &auio, (struct mbuf **)NULL, fromsa, fromlen, &msg_flags, (struct sctp_sndrcvinfo *)&sinfo, 1); if (error) { if (auio.uio_resid != (int)len && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) error = 0; } else { if (uap->sinfo) error = copyout(&sinfo, uap->sinfo, sizeof (sinfo)); } #ifdef KTRACE if (ktruio != NULL) { ktruio->uio_resid = (int)len - auio.uio_resid; ktrgenio(uap->sd, UIO_READ, ktruio, error); } #endif /* KTRACE */ if (error) goto out; td->td_retval[0] = (int)len - auio.uio_resid; if (fromlen && uap->from) { len = fromlen; if (len <= 0 || fromsa == 0) len = 0; else { len = MIN(len, fromsa->sa_len); error = copyout(fromsa, uap->from, (unsigned)len); if (error) goto out; } error = copyout(&len, uap->fromlenaddr, sizeof (socklen_t)); if (error) { goto out; } } if (uap->msg_flags) { error = copyout(&msg_flags, uap->msg_flags, sizeof (int)); if (error) { goto out; } } out: free(iov, M_IOV); out1: if (fp) fdrop(fp, td); return (error); #else /* SCTP */ return (EOPNOTSUPP); #endif /* SCTP */ } Index: head/sys/kern/uipc_usrreq.c 
=================================================================== --- head/sys/kern/uipc_usrreq.c (revision 174987) +++ head/sys/kern/uipc_usrreq.c (revision 174988) @@ -1,2285 +1,2223 @@ /*- * Copyright (c) 1982, 1986, 1989, 1991, 1993 * The Regents of the University of California. * Copyright (c) 2004-2007 Robert N. M. Watson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * From: @(#)uipc_usrreq.c 8.3 (Berkeley) 1/4/94 */ /* * UNIX Domain (Local) Sockets * * This is an implementation of UNIX (local) domain sockets. Each socket has * an associated struct unpcb (UNIX protocol control block). Stream sockets * may be connected to 0 or 1 other socket. Datagram sockets may be * connected to 0, 1, or many other sockets. Sockets may be created and * connected in pairs (socketpair(2)), or bound/connected to using the file * system name space. For most purposes, only the receive socket buffer is * used, as sending on one socket delivers directly to the receive socket * buffer of a second socket. * * The implementation is substantially complicated by the fact that * "ancillary data", such as file descriptors or credentials, may be passed * across UNIX domain sockets. The potential for passing UNIX domain sockets * over other UNIX domain sockets requires the implementation of a simple * garbage collector to find and tear down cycles of disconnected sockets. * * TODO: * SEQPACKET, RDM * rethink name space problems * need a proper out-of-band * lock pushdown */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_mac.h" #include #include #include #include /* XXX must be before */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif #include #include static uma_zone_t unp_zone; static unp_gen_t unp_gencnt; static u_int unp_count; /* Count of local sockets. */ static ino_t unp_ino; /* Prototype for fake inode numbers. */ static int unp_rights; /* File descriptors in flight. 
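 * That is, descriptors that have been passed in an SCM_RIGHTS control
 * message and are still queued in some socket's receive buffer.  A hedged
 * userland sketch of how a descriptor ends up in flight; 'unix_sock' and
 * 'fd_to_pass' are assumed, and <sys/socket.h>, <sys/uio.h> and
 * <string.h> are included:
 *
 *	char onebyte = 0;
 *	struct iovec iov = { &onebyte, 1 };
 *	union { struct cmsghdr hdr; char buf[CMSG_SPACE(sizeof(int))]; } cm;
 *	struct msghdr msg = { 0 };
 *	struct cmsghdr *cmsg;
 *
 *	msg.msg_iov = &iov;
 *	msg.msg_iovlen = 1;
 *	msg.msg_control = cm.buf;
 *	msg.msg_controllen = sizeof(cm.buf);
 *	cmsg = CMSG_FIRSTHDR(&msg);
 *	cmsg->cmsg_level = SOL_SOCKET;
 *	cmsg->cmsg_type = SCM_RIGHTS;
 *	cmsg->cmsg_len = CMSG_LEN(sizeof(int));
 *	memcpy(CMSG_DATA(cmsg), &fd_to_pass, sizeof(int));
 *	sendmsg(unix_sock, &msg, 0);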
*/ static struct unp_head unp_shead; /* List of local stream sockets. */ static struct unp_head unp_dhead; /* List of local datagram sockets. */ static const struct sockaddr sun_noname = { sizeof(sun_noname), AF_LOCAL }; /* * Garbage collection of cyclic file descriptor/socket references occurs * asynchronously in a taskqueue context in order to avoid recursion and * reentrance in the UNIX domain socket, file descriptor, and socket layer * code. See unp_gc() for a full description. */ static struct task unp_gc_task; /* * Both send and receive buffers are allocated PIPSIZ bytes of buffering for * stream sockets, although the total for sender and receiver is actually * only PIPSIZ. * * Datagram sockets really use the sendspace as the maximum datagram size, * and don't really want to reserve the sendspace. Their recvspace should be * large enough for at least one max-size datagram plus address. */ #ifndef PIPSIZ #define PIPSIZ 8192 #endif static u_long unpst_sendspace = PIPSIZ; static u_long unpst_recvspace = PIPSIZ; static u_long unpdg_sendspace = 2*1024; /* really max datagram size */ static u_long unpdg_recvspace = 4*1024; SYSCTL_NODE(_net, PF_LOCAL, local, CTLFLAG_RW, 0, "Local domain"); SYSCTL_NODE(_net_local, SOCK_STREAM, stream, CTLFLAG_RW, 0, "SOCK_STREAM"); SYSCTL_NODE(_net_local, SOCK_DGRAM, dgram, CTLFLAG_RW, 0, "SOCK_DGRAM"); SYSCTL_ULONG(_net_local_stream, OID_AUTO, sendspace, CTLFLAG_RW, &unpst_sendspace, 0, ""); SYSCTL_ULONG(_net_local_stream, OID_AUTO, recvspace, CTLFLAG_RW, &unpst_recvspace, 0, ""); SYSCTL_ULONG(_net_local_dgram, OID_AUTO, maxdgram, CTLFLAG_RW, &unpdg_sendspace, 0, ""); SYSCTL_ULONG(_net_local_dgram, OID_AUTO, recvspace, CTLFLAG_RW, &unpdg_recvspace, 0, ""); SYSCTL_INT(_net_local, OID_AUTO, inflight, CTLFLAG_RD, &unp_rights, 0, ""); /*- * Locking and synchronization: * * The global UNIX domain socket rwlock (unp_global_rwlock) protects all * global variables, including the linked lists tracking the set of allocated * UNIX domain sockets. The global rwlock also serves to prevent deadlock * when more than one PCB lock is acquired at a time (i.e., during * connect()). Finally, the global rwlock protects uncounted references from * vnodes to sockets bound to those vnodes: to safely dereference the * v_socket pointer, the global rwlock must be held while a full reference is * acquired. * * UNIX domain sockets each have an unpcb hung off of their so_pcb pointer, * allocated in pru_attach() and freed in pru_detach(). The validity of that * pointer is an invariant, so no lock is required to dereference the so_pcb * pointer if a valid socket reference is held by the caller. In practice, * this is always true during operations performed on a socket. Each unpcb * has a back-pointer to its socket, unp_socket, which will be stable under * the same circumstances. * * This pointer may only be safely dereferenced as long as a valid reference * to the unpcb is held. Typically, this reference will be from the socket, * or from another unpcb when the referring unpcb's lock is held (in order * that the reference not be invalidated during use). For example, to follow * unp->unp_conn->unp_socket, you need unlock the lock on unp, not unp_conn, * as unp_socket remains valid as long as the reference to unp_conn is valid. * * Fields of unpcbss are locked using a per-unpcb lock, unp_mtx. Individual * atomic reads without the lock may be performed "lockless", but more * complex reads and read-modify-writes require the mutex to be held. 
No * lock order is defined between unpcb locks -- multiple unpcb locks may be * acquired at the same time only when holding the global UNIX domain socket * rwlock exclusively, which prevents deadlocks. * * Blocking with UNIX domain sockets is a tricky issue: unlike most network * protocols, bind() is a non-atomic operation, and connect() requires * potential sleeping in the protocol, due to potentially waiting on local or * distributed file systems. We try to separate "lookup" operations, which * may sleep, and the IPC operations themselves, which typically can occur * with relative atomicity as locks can be held over the entire operation. * * Another tricky issue is simultaneous multi-threaded or multi-process * access to a single UNIX domain socket. These are handled by the flags * UNP_CONNECTING and UNP_BINDING, which prevent concurrent connecting or * binding, both of which involve dropping UNIX domain socket locks in order * to perform namei() and other file system operations. */ static struct rwlock unp_global_rwlock; #define UNP_GLOBAL_LOCK_INIT() rw_init(&unp_global_rwlock, \ "unp_global_rwlock") #define UNP_GLOBAL_LOCK_ASSERT() rw_assert(&unp_global_rwlock, \ RA_LOCKED) #define UNP_GLOBAL_UNLOCK_ASSERT() rw_assert(&unp_global_rwlock, \ RA_UNLOCKED) #define UNP_GLOBAL_WLOCK() rw_wlock(&unp_global_rwlock) #define UNP_GLOBAL_WUNLOCK() rw_wunlock(&unp_global_rwlock) #define UNP_GLOBAL_WLOCK_ASSERT() rw_assert(&unp_global_rwlock, \ RA_WLOCKED) #define UNP_GLOBAL_WOWNED() rw_wowned(&unp_global_rwlock) #define UNP_GLOBAL_RLOCK() rw_rlock(&unp_global_rwlock) #define UNP_GLOBAL_RUNLOCK() rw_runlock(&unp_global_rwlock) #define UNP_GLOBAL_RLOCK_ASSERT() rw_assert(&unp_global_rwlock, \ RA_RLOCKED) #define UNP_PCB_LOCK_INIT(unp) mtx_init(&(unp)->unp_mtx, \ "unp_mtx", "unp_mtx", \ MTX_DUPOK|MTX_DEF|MTX_RECURSE) #define UNP_PCB_LOCK_DESTROY(unp) mtx_destroy(&(unp)->unp_mtx) #define UNP_PCB_LOCK(unp) mtx_lock(&(unp)->unp_mtx) #define UNP_PCB_UNLOCK(unp) mtx_unlock(&(unp)->unp_mtx) #define UNP_PCB_LOCK_ASSERT(unp) mtx_assert(&(unp)->unp_mtx, MA_OWNED) static int unp_connect(struct socket *, struct sockaddr *, struct thread *); static int unp_connect2(struct socket *so, struct socket *so2, int); static void unp_disconnect(struct unpcb *unp, struct unpcb *unp2); static void unp_shutdown(struct unpcb *); static void unp_drop(struct unpcb *, int); static void unp_gc(__unused void *, int); static void unp_scan(struct mbuf *, void (*)(struct file *)); -static void unp_mark(struct file *); static void unp_discard(struct file *); static void unp_freerights(struct file **, int); static int unp_internalize(struct mbuf **, struct thread *); +static void unp_internalize_fp(struct file *); +static void unp_externalize_fp(struct file *); static struct mbuf *unp_addsockcred(struct thread *, struct mbuf *); /* * Definitions of protocols supported in the LOCAL domain. 
*/ static struct domain localdomain; static struct protosw localsw[] = { { .pr_type = SOCK_STREAM, .pr_domain = &localdomain, .pr_flags = PR_CONNREQUIRED|PR_WANTRCVD|PR_RIGHTS, .pr_ctloutput = &uipc_ctloutput, .pr_usrreqs = &uipc_usrreqs }, { .pr_type = SOCK_DGRAM, .pr_domain = &localdomain, .pr_flags = PR_ATOMIC|PR_ADDR|PR_RIGHTS, .pr_usrreqs = &uipc_usrreqs }, }; static struct domain localdomain = { .dom_family = AF_LOCAL, .dom_name = "local", .dom_init = unp_init, .dom_externalize = unp_externalize, .dom_dispose = unp_dispose, .dom_protosw = localsw, .dom_protoswNPROTOSW = &localsw[sizeof(localsw)/sizeof(localsw[0])] }; DOMAIN_SET(local); static void uipc_abort(struct socket *so) { struct unpcb *unp, *unp2; unp = sotounpcb(so); KASSERT(unp != NULL, ("uipc_abort: unp == NULL")); UNP_GLOBAL_WLOCK(); UNP_PCB_LOCK(unp); unp2 = unp->unp_conn; if (unp2 != NULL) { UNP_PCB_LOCK(unp2); unp_drop(unp2, ECONNABORTED); UNP_PCB_UNLOCK(unp2); } UNP_PCB_UNLOCK(unp); UNP_GLOBAL_WUNLOCK(); } static int uipc_accept(struct socket *so, struct sockaddr **nam) { struct unpcb *unp, *unp2; const struct sockaddr *sa; /* * Pass back name of connected socket, if it was bound and we are * still connected (our peer may have closed already!). */ unp = sotounpcb(so); KASSERT(unp != NULL, ("uipc_accept: unp == NULL")); *nam = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK); UNP_GLOBAL_RLOCK(); unp2 = unp->unp_conn; if (unp2 != NULL && unp2->unp_addr != NULL) { UNP_PCB_LOCK(unp2); sa = (struct sockaddr *) unp2->unp_addr; bcopy(sa, *nam, sa->sa_len); UNP_PCB_UNLOCK(unp2); } else { sa = &sun_noname; bcopy(sa, *nam, sa->sa_len); } UNP_GLOBAL_RUNLOCK(); return (0); } static int uipc_attach(struct socket *so, int proto, struct thread *td) { u_long sendspace, recvspace; struct unpcb *unp; int error, locked; KASSERT(so->so_pcb == NULL, ("uipc_attach: so_pcb != NULL")); if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) { switch (so->so_type) { case SOCK_STREAM: sendspace = unpst_sendspace; recvspace = unpst_recvspace; break; case SOCK_DGRAM: sendspace = unpdg_sendspace; recvspace = unpdg_recvspace; break; default: panic("uipc_attach"); } error = soreserve(so, sendspace, recvspace); if (error) return (error); } unp = uma_zalloc(unp_zone, M_NOWAIT | M_ZERO); if (unp == NULL) return (ENOBUFS); LIST_INIT(&unp->unp_refs); UNP_PCB_LOCK_INIT(unp); unp->unp_socket = so; so->so_pcb = unp; unp->unp_refcount = 1; /* * uipc_attach() may be called indirectly from within the UNIX domain * socket code via sonewconn() in unp_connect(). Since rwlocks can * not be recursed, we do the closest thing. */ locked = 0; if (!UNP_GLOBAL_WOWNED()) { UNP_GLOBAL_WLOCK(); locked = 1; } unp->unp_gencnt = ++unp_gencnt; unp_count++; LIST_INSERT_HEAD(so->so_type == SOCK_DGRAM ? &unp_dhead : &unp_shead, unp, unp_link); if (locked) UNP_GLOBAL_WUNLOCK(); return (0); } static int uipc_bind(struct socket *so, struct sockaddr *nam, struct thread *td) { struct sockaddr_un *soun = (struct sockaddr_un *)nam; struct vattr vattr; int error, namelen, vfslocked; struct nameidata nd; struct unpcb *unp; struct vnode *vp; struct mount *mp; char *buf; unp = sotounpcb(so); KASSERT(unp != NULL, ("uipc_bind: unp == NULL")); namelen = soun->sun_len - offsetof(struct sockaddr_un, sun_path); if (namelen <= 0) return (EINVAL); /* * We don't allow simultaneous bind() calls on a single UNIX domain * socket, so flag in-progress operations, and return an error if an * operation is already in progress. 
* * Historically, we have not allowed a socket to be rebound, so this * also returns an error. Not allowing re-binding simplifies the * implementation and avoids a great many possible failure modes. */ UNP_PCB_LOCK(unp); if (unp->unp_vnode != NULL) { UNP_PCB_UNLOCK(unp); return (EINVAL); } if (unp->unp_flags & UNP_BINDING) { UNP_PCB_UNLOCK(unp); return (EALREADY); } unp->unp_flags |= UNP_BINDING; UNP_PCB_UNLOCK(unp); buf = malloc(namelen + 1, M_TEMP, M_WAITOK); strlcpy(buf, soun->sun_path, namelen + 1); restart: vfslocked = 0; NDINIT(&nd, CREATE, MPSAFE | NOFOLLOW | LOCKPARENT | SAVENAME, UIO_SYSSPACE, buf, td); /* SHOULD BE ABLE TO ADOPT EXISTING AND wakeup() ALA FIFO's */ error = namei(&nd); if (error) goto error; vp = nd.ni_vp; vfslocked = NDHASGIANT(&nd); if (vp != NULL || vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); if (nd.ni_dvp == vp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); if (vp != NULL) { vrele(vp); error = EADDRINUSE; goto error; } error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH); if (error) goto error; VFS_UNLOCK_GIANT(vfslocked); goto restart; } VATTR_NULL(&vattr); vattr.va_type = VSOCK; vattr.va_mode = (ACCESSPERMS & ~td->td_proc->p_fd->fd_cmask); #ifdef MAC error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd, &vattr); #endif if (error == 0) { VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE); error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr); } NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); if (error) { vn_finished_write(mp); goto error; } vp = nd.ni_vp; ASSERT_VOP_ELOCKED(vp, "uipc_bind"); soun = (struct sockaddr_un *)sodupsockaddr(nam, M_WAITOK); UNP_GLOBAL_WLOCK(); UNP_PCB_LOCK(unp); vp->v_socket = unp->unp_socket; unp->unp_vnode = vp; unp->unp_addr = soun; unp->unp_flags &= ~UNP_BINDING; UNP_PCB_UNLOCK(unp); UNP_GLOBAL_WUNLOCK(); VOP_UNLOCK(vp, 0, td); vn_finished_write(mp); VFS_UNLOCK_GIANT(vfslocked); free(buf, M_TEMP); return (0); error: VFS_UNLOCK_GIANT(vfslocked); UNP_PCB_LOCK(unp); unp->unp_flags &= ~UNP_BINDING; UNP_PCB_UNLOCK(unp); free(buf, M_TEMP); return (error); } static int uipc_connect(struct socket *so, struct sockaddr *nam, struct thread *td) { int error; KASSERT(td == curthread, ("uipc_connect: td != curthread")); UNP_GLOBAL_WLOCK(); error = unp_connect(so, nam, td); UNP_GLOBAL_WUNLOCK(); return (error); } static void uipc_close(struct socket *so) { struct unpcb *unp, *unp2; unp = sotounpcb(so); KASSERT(unp != NULL, ("uipc_close: unp == NULL")); UNP_GLOBAL_WLOCK(); UNP_PCB_LOCK(unp); unp2 = unp->unp_conn; if (unp2 != NULL) { UNP_PCB_LOCK(unp2); unp_disconnect(unp, unp2); UNP_PCB_UNLOCK(unp2); } UNP_PCB_UNLOCK(unp); UNP_GLOBAL_WUNLOCK(); } int uipc_connect2(struct socket *so1, struct socket *so2) { struct unpcb *unp, *unp2; int error; UNP_GLOBAL_WLOCK(); unp = so1->so_pcb; KASSERT(unp != NULL, ("uipc_connect2: unp == NULL")); UNP_PCB_LOCK(unp); unp2 = so2->so_pcb; KASSERT(unp2 != NULL, ("uipc_connect2: unp2 == NULL")); UNP_PCB_LOCK(unp2); error = unp_connect2(so1, so2, PRU_CONNECT2); UNP_PCB_UNLOCK(unp2); UNP_PCB_UNLOCK(unp); UNP_GLOBAL_WUNLOCK(); return (error); } /* control is EOPNOTSUPP */ static void uipc_detach(struct socket *so) { struct unpcb *unp, *unp2; struct sockaddr_un *saved_unp_addr; struct vnode *vp; int freeunp, local_unp_rights; unp = sotounpcb(so); KASSERT(unp != NULL, ("uipc_detach: unp == NULL")); UNP_GLOBAL_WLOCK(); UNP_PCB_LOCK(unp); LIST_REMOVE(unp, unp_link); unp->unp_gencnt = ++unp_gencnt; --unp_count; /* * XXXRW: Should assert vp->v_socket == so. 
*/ if ((vp = unp->unp_vnode) != NULL) { unp->unp_vnode->v_socket = NULL; unp->unp_vnode = NULL; } unp2 = unp->unp_conn; if (unp2 != NULL) { UNP_PCB_LOCK(unp2); unp_disconnect(unp, unp2); UNP_PCB_UNLOCK(unp2); } /* * We hold the global lock, so it's OK to acquire multiple pcb locks * at a time. */ while (!LIST_EMPTY(&unp->unp_refs)) { struct unpcb *ref = LIST_FIRST(&unp->unp_refs); UNP_PCB_LOCK(ref); unp_drop(ref, ECONNRESET); UNP_PCB_UNLOCK(ref); } + local_unp_rights = unp_rights; UNP_GLOBAL_WUNLOCK(); unp->unp_socket->so_pcb = NULL; - local_unp_rights = unp_rights; saved_unp_addr = unp->unp_addr; unp->unp_addr = NULL; unp->unp_refcount--; freeunp = (unp->unp_refcount == 0); if (saved_unp_addr != NULL) FREE(saved_unp_addr, M_SONAME); if (freeunp) { UNP_PCB_LOCK_DESTROY(unp); uma_zfree(unp_zone, unp); } else UNP_PCB_UNLOCK(unp); if (vp) { int vfslocked; vfslocked = VFS_LOCK_GIANT(vp->v_mount); vrele(vp); VFS_UNLOCK_GIANT(vfslocked); } if (local_unp_rights) taskqueue_enqueue(taskqueue_thread, &unp_gc_task); } static int uipc_disconnect(struct socket *so) { struct unpcb *unp, *unp2; unp = sotounpcb(so); KASSERT(unp != NULL, ("uipc_disconnect: unp == NULL")); UNP_GLOBAL_WLOCK(); UNP_PCB_LOCK(unp); unp2 = unp->unp_conn; if (unp2 != NULL) { UNP_PCB_LOCK(unp2); unp_disconnect(unp, unp2); UNP_PCB_UNLOCK(unp2); } UNP_PCB_UNLOCK(unp); UNP_GLOBAL_WUNLOCK(); return (0); } static int uipc_listen(struct socket *so, int backlog, struct thread *td) { struct unpcb *unp; int error; unp = sotounpcb(so); KASSERT(unp != NULL, ("uipc_listen: unp == NULL")); UNP_PCB_LOCK(unp); if (unp->unp_vnode == NULL) { UNP_PCB_UNLOCK(unp); return (EINVAL); } SOCK_LOCK(so); error = solisten_proto_check(so); if (error == 0) { cru2x(td->td_ucred, &unp->unp_peercred); unp->unp_flags |= UNP_HAVEPCCACHED; solisten_proto(so, backlog); } SOCK_UNLOCK(so); UNP_PCB_UNLOCK(unp); return (error); } static int uipc_peeraddr(struct socket *so, struct sockaddr **nam) { struct unpcb *unp, *unp2; const struct sockaddr *sa; unp = sotounpcb(so); KASSERT(unp != NULL, ("uipc_peeraddr: unp == NULL")); *nam = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK); UNP_PCB_LOCK(unp); /* * XXX: It seems that this test always fails even when connection is * established. So, this else clause is added as workaround to * return PF_LOCAL sockaddr. */ unp2 = unp->unp_conn; if (unp2 != NULL) { UNP_PCB_LOCK(unp2); if (unp2->unp_addr != NULL) sa = (struct sockaddr *) unp->unp_conn->unp_addr; else sa = &sun_noname; bcopy(sa, *nam, sa->sa_len); UNP_PCB_UNLOCK(unp2); } else { sa = &sun_noname; bcopy(sa, *nam, sa->sa_len); } UNP_PCB_UNLOCK(unp); return (0); } static int uipc_rcvd(struct socket *so, int flags) { struct unpcb *unp, *unp2; struct socket *so2; u_int mbcnt, sbcc; u_long newhiwat; unp = sotounpcb(so); KASSERT(unp != NULL, ("uipc_rcvd: unp == NULL")); if (so->so_type == SOCK_DGRAM) panic("uipc_rcvd DGRAM?"); if (so->so_type != SOCK_STREAM) panic("uipc_rcvd unknown socktype"); /* * Adjust backpressure on sender and wakeup any waiting to write. * * The unp lock is acquired to maintain the validity of the unp_conn * pointer; no lock on unp2 is required as unp2->unp_socket will be * static as long as we don't permit unp2 to disconnect from unp, * which is prevented by the lock on unp. We cache values from * so_rcv to avoid holding the so_rcv lock over the entire * transaction on the remote so_snd. 
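 *
 * Concretely, the limits on the peer's send buffer are re-opened by the
 * amount the receiver has drained since the last update: sb_mbmax grows
 * by (unp_mbcnt - mbcnt) and the high-water mark by (unp_cc - sbcc), so
 * if, say, 4096 bytes were consumed, newhiwat = sb_hiwat + 4096.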
*/ SOCKBUF_LOCK(&so->so_rcv); mbcnt = so->so_rcv.sb_mbcnt; sbcc = so->so_rcv.sb_cc; SOCKBUF_UNLOCK(&so->so_rcv); UNP_PCB_LOCK(unp); unp2 = unp->unp_conn; if (unp2 == NULL) { UNP_PCB_UNLOCK(unp); return (0); } so2 = unp2->unp_socket; SOCKBUF_LOCK(&so2->so_snd); so2->so_snd.sb_mbmax += unp->unp_mbcnt - mbcnt; newhiwat = so2->so_snd.sb_hiwat + unp->unp_cc - sbcc; (void)chgsbsize(so2->so_cred->cr_uidinfo, &so2->so_snd.sb_hiwat, newhiwat, RLIM_INFINITY); sowwakeup_locked(so2); unp->unp_mbcnt = mbcnt; unp->unp_cc = sbcc; UNP_PCB_UNLOCK(unp); return (0); } /* pru_rcvoob is EOPNOTSUPP */ static int uipc_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, struct mbuf *control, struct thread *td) { struct unpcb *unp, *unp2; struct socket *so2; u_int mbcnt, sbcc; u_long newhiwat; int error = 0; unp = sotounpcb(so); KASSERT(unp != NULL, ("uipc_send: unp == NULL")); if (flags & PRUS_OOB) { error = EOPNOTSUPP; goto release; } if (control != NULL && (error = unp_internalize(&control, td))) goto release; if ((nam != NULL) || (flags & PRUS_EOF)) UNP_GLOBAL_WLOCK(); else UNP_GLOBAL_RLOCK(); switch (so->so_type) { case SOCK_DGRAM: { const struct sockaddr *from; unp2 = unp->unp_conn; if (nam != NULL) { UNP_GLOBAL_WLOCK_ASSERT(); if (unp2 != NULL) { error = EISCONN; break; } error = unp_connect(so, nam, td); if (error) break; unp2 = unp->unp_conn; } /* * Because connect() and send() are non-atomic in a sendto() * with a target address, it's possible that the socket will * have disconnected before the send() can run. In that case * return the slightly counter-intuitive but otherwise * correct error that the socket is not connected. */ if (unp2 == NULL) { error = ENOTCONN; break; } /* Lockless read. */ if (unp2->unp_flags & UNP_WANTCRED) control = unp_addsockcred(td, control); UNP_PCB_LOCK(unp); if (unp->unp_addr != NULL) from = (struct sockaddr *)unp->unp_addr; else from = &sun_noname; so2 = unp2->unp_socket; SOCKBUF_LOCK(&so2->so_rcv); if (sbappendaddr_locked(&so2->so_rcv, from, m, control)) { sorwakeup_locked(so2); m = NULL; control = NULL; } else { SOCKBUF_UNLOCK(&so2->so_rcv); error = ENOBUFS; } if (nam != NULL) { UNP_GLOBAL_WLOCK_ASSERT(); UNP_PCB_LOCK(unp2); unp_disconnect(unp, unp2); UNP_PCB_UNLOCK(unp2); } UNP_PCB_UNLOCK(unp); break; } case SOCK_STREAM: /* * Connect if not connected yet. * * Note: A better implementation would complain if not equal * to the peer's address. */ if ((so->so_state & SS_ISCONNECTED) == 0) { if (nam != NULL) { UNP_GLOBAL_WLOCK_ASSERT(); error = unp_connect(so, nam, td); if (error) break; /* XXX */ } else { error = ENOTCONN; break; } } /* Lockless read. */ if (so->so_snd.sb_state & SBS_CANTSENDMORE) { error = EPIPE; break; } /* * Because connect() and send() are non-atomic in a sendto() * with a target address, it's possible that the socket will * have disconnected before the send() can run. In that case * return the slightly counter-intuitive but otherwise * correct error that the socket is not connected. * * Locking here must be done carefully: the global lock * prevents interconnections between unpcbs from changing, so * we can traverse from unp to unp2 without acquiring unp's * lock. Socket buffer locks follow unpcb locks, so we can * acquire both remote and lock socket buffer locks. */ unp2 = unp->unp_conn; if (unp2 == NULL) { error = ENOTCONN; break; } so2 = unp2->unp_socket; UNP_PCB_LOCK(unp2); SOCKBUF_LOCK(&so2->so_rcv); if (unp2->unp_flags & UNP_WANTCRED) { /* * Credentials are passed only once on SOCK_STREAM. 
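 *
 * (Illustrative userland view, not part of this change: after
 *
 *	int on = 1;
 *	setsockopt(s, 0, LOCAL_CREDS, &on, sizeof(on));
 *
 * the next data read from the stream carries a single SCM_CREDS control
 * message holding a struct cmsgcred; later reads do not, because the
 * UNP_WANTCRED flag is consumed here and only re-armed by setting the
 * option again.)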
*/ unp2->unp_flags &= ~UNP_WANTCRED; control = unp_addsockcred(td, control); } /* * Send to paired receive port, and then reduce send buffer * hiwater marks to maintain backpressure. Wake up readers. */ if (control != NULL) { if (sbappendcontrol_locked(&so2->so_rcv, m, control)) control = NULL; } else sbappend_locked(&so2->so_rcv, m); mbcnt = so2->so_rcv.sb_mbcnt - unp2->unp_mbcnt; unp2->unp_mbcnt = so2->so_rcv.sb_mbcnt; sbcc = so2->so_rcv.sb_cc; sorwakeup_locked(so2); SOCKBUF_LOCK(&so->so_snd); newhiwat = so->so_snd.sb_hiwat - (sbcc - unp2->unp_cc); (void)chgsbsize(so->so_cred->cr_uidinfo, &so->so_snd.sb_hiwat, newhiwat, RLIM_INFINITY); so->so_snd.sb_mbmax -= mbcnt; SOCKBUF_UNLOCK(&so->so_snd); unp2->unp_cc = sbcc; UNP_PCB_UNLOCK(unp2); m = NULL; break; default: panic("uipc_send unknown socktype"); } /* * SEND_EOF is equivalent to a SEND followed by a SHUTDOWN. */ if (flags & PRUS_EOF) { UNP_PCB_LOCK(unp); socantsendmore(so); unp_shutdown(unp); UNP_PCB_UNLOCK(unp); } if ((nam != NULL) || (flags & PRUS_EOF)) UNP_GLOBAL_WUNLOCK(); else UNP_GLOBAL_RUNLOCK(); if (control != NULL && error != 0) unp_dispose(control); release: if (control != NULL) m_freem(control); if (m != NULL) m_freem(m); return (error); } static int uipc_sense(struct socket *so, struct stat *sb) { struct unpcb *unp, *unp2; struct socket *so2; unp = sotounpcb(so); KASSERT(unp != NULL, ("uipc_sense: unp == NULL")); sb->st_blksize = so->so_snd.sb_hiwat; UNP_GLOBAL_RLOCK(); UNP_PCB_LOCK(unp); unp2 = unp->unp_conn; if (so->so_type == SOCK_STREAM && unp2 != NULL) { so2 = unp2->unp_socket; sb->st_blksize += so2->so_rcv.sb_cc; } sb->st_dev = NODEV; if (unp->unp_ino == 0) unp->unp_ino = (++unp_ino == 0) ? ++unp_ino : unp_ino; sb->st_ino = unp->unp_ino; UNP_PCB_UNLOCK(unp); UNP_GLOBAL_RUNLOCK(); return (0); } static int uipc_shutdown(struct socket *so) { struct unpcb *unp; unp = sotounpcb(so); KASSERT(unp != NULL, ("uipc_shutdown: unp == NULL")); UNP_GLOBAL_WLOCK(); UNP_PCB_LOCK(unp); socantsendmore(so); unp_shutdown(unp); UNP_PCB_UNLOCK(unp); UNP_GLOBAL_WUNLOCK(); return (0); } static int uipc_sockaddr(struct socket *so, struct sockaddr **nam) { struct unpcb *unp; const struct sockaddr *sa; unp = sotounpcb(so); KASSERT(unp != NULL, ("uipc_sockaddr: unp == NULL")); *nam = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK); UNP_PCB_LOCK(unp); if (unp->unp_addr != NULL) sa = (struct sockaddr *) unp->unp_addr; else sa = &sun_noname; bcopy(sa, *nam, sa->sa_len); UNP_PCB_UNLOCK(unp); return (0); } struct pr_usrreqs uipc_usrreqs = { .pru_abort = uipc_abort, .pru_accept = uipc_accept, .pru_attach = uipc_attach, .pru_bind = uipc_bind, .pru_connect = uipc_connect, .pru_connect2 = uipc_connect2, .pru_detach = uipc_detach, .pru_disconnect = uipc_disconnect, .pru_listen = uipc_listen, .pru_peeraddr = uipc_peeraddr, .pru_rcvd = uipc_rcvd, .pru_send = uipc_send, .pru_sense = uipc_sense, .pru_shutdown = uipc_shutdown, .pru_sockaddr = uipc_sockaddr, .pru_close = uipc_close, }; int uipc_ctloutput(struct socket *so, struct sockopt *sopt) { struct unpcb *unp; struct xucred xu; int error, optval; if (sopt->sopt_level != 0) return (EINVAL); unp = sotounpcb(so); KASSERT(unp != NULL, ("uipc_ctloutput: unp == NULL")); error = 0; switch (sopt->sopt_dir) { case SOPT_GET: switch (sopt->sopt_name) { case LOCAL_PEERCRED: UNP_PCB_LOCK(unp); if (unp->unp_flags & UNP_HAVEPC) xu = unp->unp_peercred; else { if (so->so_type == SOCK_STREAM) error = ENOTCONN; else error = EINVAL; } UNP_PCB_UNLOCK(unp); if (error == 0) error = sooptcopyout(sopt, &xu, sizeof(xu)); 
break; case LOCAL_CREDS: /* Unocked read. */ optval = unp->unp_flags & UNP_WANTCRED ? 1 : 0; error = sooptcopyout(sopt, &optval, sizeof(optval)); break; case LOCAL_CONNWAIT: /* Unocked read. */ optval = unp->unp_flags & UNP_CONNWAIT ? 1 : 0; error = sooptcopyout(sopt, &optval, sizeof(optval)); break; default: error = EOPNOTSUPP; break; } break; case SOPT_SET: switch (sopt->sopt_name) { case LOCAL_CREDS: case LOCAL_CONNWAIT: error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval)); if (error) break; #define OPTSET(bit) do { \ UNP_PCB_LOCK(unp); \ if (optval) \ unp->unp_flags |= bit; \ else \ unp->unp_flags &= ~bit; \ UNP_PCB_UNLOCK(unp); \ } while (0) switch (sopt->sopt_name) { case LOCAL_CREDS: OPTSET(UNP_WANTCRED); break; case LOCAL_CONNWAIT: OPTSET(UNP_CONNWAIT); break; default: break; } break; #undef OPTSET default: error = ENOPROTOOPT; break; } break; default: error = EOPNOTSUPP; break; } return (error); } static int unp_connect(struct socket *so, struct sockaddr *nam, struct thread *td) { struct sockaddr_un *soun = (struct sockaddr_un *)nam; struct vnode *vp; struct socket *so2, *so3; struct unpcb *unp, *unp2, *unp3; int error, len, vfslocked; struct nameidata nd; char buf[SOCK_MAXADDRLEN]; struct sockaddr *sa; UNP_GLOBAL_WLOCK_ASSERT(); UNP_GLOBAL_WUNLOCK(); unp = sotounpcb(so); KASSERT(unp != NULL, ("unp_connect: unp == NULL")); len = nam->sa_len - offsetof(struct sockaddr_un, sun_path); if (len <= 0) return (EINVAL); strlcpy(buf, soun->sun_path, len + 1); UNP_PCB_LOCK(unp); if (unp->unp_flags & UNP_CONNECTING) { UNP_PCB_UNLOCK(unp); return (EALREADY); } unp->unp_flags |= UNP_CONNECTING; UNP_PCB_UNLOCK(unp); sa = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK); NDINIT(&nd, LOOKUP, MPSAFE | FOLLOW | LOCKLEAF, UIO_SYSSPACE, buf, td); error = namei(&nd); if (error) vp = NULL; else vp = nd.ni_vp; ASSERT_VOP_LOCKED(vp, "unp_connect"); vfslocked = NDHASGIANT(&nd); NDFREE(&nd, NDF_ONLY_PNBUF); if (error) goto bad; if (vp->v_type != VSOCK) { error = ENOTSOCK; goto bad; } #ifdef MAC error = mac_vnode_check_open(td->td_ucred, vp, VWRITE | VREAD); if (error) goto bad; #endif error = VOP_ACCESS(vp, VWRITE, td->td_ucred, td); if (error) goto bad; VFS_UNLOCK_GIANT(vfslocked); unp = sotounpcb(so); KASSERT(unp != NULL, ("unp_connect: unp == NULL")); /* * Lock global lock for two reasons: make sure v_socket is stable, * and to protect simultaneous locking of multiple pcbs. */ UNP_GLOBAL_WLOCK(); so2 = vp->v_socket; if (so2 == NULL) { error = ECONNREFUSED; goto bad2; } if (so->so_type != so2->so_type) { error = EPROTOTYPE; goto bad2; } if (so->so_proto->pr_flags & PR_CONNREQUIRED) { if (so2->so_options & SO_ACCEPTCONN) { /* * We can't drop the global lock here or 'so2' may * become invalid. As a result, we need to handle * possibly lock recursion in uipc_attach. */ so3 = sonewconn(so2, 0); } else so3 = NULL; if (so3 == NULL) { error = ECONNREFUSED; goto bad2; } unp = sotounpcb(so); unp2 = sotounpcb(so2); unp3 = sotounpcb(so3); UNP_PCB_LOCK(unp); UNP_PCB_LOCK(unp2); UNP_PCB_LOCK(unp3); if (unp2->unp_addr != NULL) { bcopy(unp2->unp_addr, sa, unp2->unp_addr->sun_len); unp3->unp_addr = (struct sockaddr_un *) sa; sa = NULL; } /* * unp_peercred management: * * The connecter's (client's) credentials are copied from its * process structure at the time of connect() (which is now). 
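 *
 * (These are the credentials a server later retrieves through the
 * LOCAL_PEERCRED option handled in uipc_ctloutput() above.  A hedged
 * userland sketch, with 'fd' an accepted server-side descriptor and
 * struct xucred coming from <sys/ucred.h>:
 *
 *	struct xucred xuc;
 *	socklen_t len = sizeof(xuc);
 *
 *	if (getsockopt(fd, 0, LOCAL_PEERCRED, &xuc, &len) == 0)
 *		printf("peer uid %d\n", (int)xuc.cr_uid);
 * )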
*/ cru2x(td->td_ucred, &unp3->unp_peercred); unp3->unp_flags |= UNP_HAVEPC; /* * The receiver's (server's) credentials are copied from the * unp_peercred member of socket on which the former called * listen(); uipc_listen() cached that process's credentials * at that time so we can use them now. */ KASSERT(unp2->unp_flags & UNP_HAVEPCCACHED, ("unp_connect: listener without cached peercred")); memcpy(&unp->unp_peercred, &unp2->unp_peercred, sizeof(unp->unp_peercred)); unp->unp_flags |= UNP_HAVEPC; if (unp2->unp_flags & UNP_WANTCRED) unp3->unp_flags |= UNP_WANTCRED; UNP_PCB_UNLOCK(unp3); UNP_PCB_UNLOCK(unp2); UNP_PCB_UNLOCK(unp); #ifdef MAC SOCK_LOCK(so); mac_socketpeer_set_from_socket(so, so3); mac_socketpeer_set_from_socket(so3, so); SOCK_UNLOCK(so); #endif so2 = so3; } unp = sotounpcb(so); KASSERT(unp != NULL, ("unp_connect: unp == NULL")); unp2 = sotounpcb(so2); KASSERT(unp2 != NULL, ("unp_connect: unp2 == NULL")); UNP_PCB_LOCK(unp); UNP_PCB_LOCK(unp2); error = unp_connect2(so, so2, PRU_CONNECT); UNP_PCB_UNLOCK(unp2); UNP_PCB_UNLOCK(unp); bad2: UNP_GLOBAL_WUNLOCK(); if (vfslocked) /* * Giant has been previously acquired. This means filesystem * isn't MPSAFE. Do it once again. */ mtx_lock(&Giant); bad: if (vp != NULL) vput(vp); VFS_UNLOCK_GIANT(vfslocked); free(sa, M_SONAME); UNP_GLOBAL_WLOCK(); UNP_PCB_LOCK(unp); unp->unp_flags &= ~UNP_CONNECTING; UNP_PCB_UNLOCK(unp); return (error); } static int unp_connect2(struct socket *so, struct socket *so2, int req) { struct unpcb *unp; struct unpcb *unp2; unp = sotounpcb(so); KASSERT(unp != NULL, ("unp_connect2: unp == NULL")); unp2 = sotounpcb(so2); KASSERT(unp2 != NULL, ("unp_connect2: unp2 == NULL")); UNP_GLOBAL_WLOCK_ASSERT(); UNP_PCB_LOCK_ASSERT(unp); UNP_PCB_LOCK_ASSERT(unp2); if (so2->so_type != so->so_type) return (EPROTOTYPE); unp->unp_conn = unp2; switch (so->so_type) { case SOCK_DGRAM: LIST_INSERT_HEAD(&unp2->unp_refs, unp, unp_reflink); soisconnected(so); break; case SOCK_STREAM: unp2->unp_conn = unp; if (req == PRU_CONNECT && ((unp->unp_flags | unp2->unp_flags) & UNP_CONNWAIT)) soisconnecting(so); else soisconnected(so); soisconnected(so2); break; default: panic("unp_connect2"); } return (0); } static void unp_disconnect(struct unpcb *unp, struct unpcb *unp2) { struct socket *so; KASSERT(unp2 != NULL, ("unp_disconnect: unp2 == NULL")); UNP_GLOBAL_WLOCK_ASSERT(); UNP_PCB_LOCK_ASSERT(unp); UNP_PCB_LOCK_ASSERT(unp2); unp->unp_conn = NULL; switch (unp->unp_socket->so_type) { case SOCK_DGRAM: LIST_REMOVE(unp, unp_reflink); so = unp->unp_socket; SOCK_LOCK(so); so->so_state &= ~SS_ISCONNECTED; SOCK_UNLOCK(so); break; case SOCK_STREAM: soisdisconnected(unp->unp_socket); unp2->unp_conn = NULL; soisdisconnected(unp2->unp_socket); break; } } /* * unp_pcblist() walks the global list of struct unpcb's to generate a * pointer list, bumping the refcount on each unpcb. It then copies them out * sequentially, validating the generation number on each to see if it has * been detached. All of this is necessary because copyout() may sleep on * disk I/O. */ static int unp_pcblist(SYSCTL_HANDLER_ARGS) { int error, i, n; int freeunp; struct unpcb *unp, **unp_list; unp_gen_t gencnt; struct xunpgen *xug; struct unp_head *head; struct xunpcb *xu; head = ((intptr_t)arg1 == SOCK_DGRAM ? &unp_dhead : &unp_shead); /* * The process of preparing the PCB list is too time-consuming and * resource-intensive to repeat twice on every request. 
*/ if (req->oldptr == NULL) { n = unp_count; req->oldidx = 2 * (sizeof *xug) + (n + n/8) * sizeof(struct xunpcb); return (0); } if (req->newptr != NULL) return (EPERM); /* * OK, now we're committed to doing something. */ xug = malloc(sizeof(*xug), M_TEMP, M_WAITOK); UNP_GLOBAL_RLOCK(); gencnt = unp_gencnt; n = unp_count; UNP_GLOBAL_RUNLOCK(); xug->xug_len = sizeof *xug; xug->xug_count = n; xug->xug_gen = gencnt; xug->xug_sogen = so_gencnt; error = SYSCTL_OUT(req, xug, sizeof *xug); if (error) { free(xug, M_TEMP); return (error); } unp_list = malloc(n * sizeof *unp_list, M_TEMP, M_WAITOK); UNP_GLOBAL_RLOCK(); for (unp = LIST_FIRST(head), i = 0; unp && i < n; unp = LIST_NEXT(unp, unp_link)) { UNP_PCB_LOCK(unp); if (unp->unp_gencnt <= gencnt) { if (cr_cansee(req->td->td_ucred, unp->unp_socket->so_cred)) { UNP_PCB_UNLOCK(unp); continue; } unp_list[i++] = unp; unp->unp_refcount++; } UNP_PCB_UNLOCK(unp); } UNP_GLOBAL_RUNLOCK(); n = i; /* In case we lost some during malloc. */ error = 0; xu = malloc(sizeof(*xu), M_TEMP, M_WAITOK | M_ZERO); for (i = 0; i < n; i++) { unp = unp_list[i]; UNP_PCB_LOCK(unp); unp->unp_refcount--; if (unp->unp_refcount != 0 && unp->unp_gencnt <= gencnt) { xu->xu_len = sizeof *xu; xu->xu_unpp = unp; /* * XXX - need more locking here to protect against * connect/disconnect races for SMP. */ if (unp->unp_addr != NULL) bcopy(unp->unp_addr, &xu->xu_addr, unp->unp_addr->sun_len); if (unp->unp_conn != NULL && unp->unp_conn->unp_addr != NULL) bcopy(unp->unp_conn->unp_addr, &xu->xu_caddr, unp->unp_conn->unp_addr->sun_len); bcopy(unp, &xu->xu_unp, sizeof *unp); sotoxsocket(unp->unp_socket, &xu->xu_socket); UNP_PCB_UNLOCK(unp); error = SYSCTL_OUT(req, xu, sizeof *xu); } else { freeunp = (unp->unp_refcount == 0); UNP_PCB_UNLOCK(unp); if (freeunp) { UNP_PCB_LOCK_DESTROY(unp); uma_zfree(unp_zone, unp); } } } free(xu, M_TEMP); if (!error) { /* * Give the user an updated idea of our state. If the * generation differs from what we told her before, she knows * that something happened while we were processing this * request, and it might be necessary to retry. */ xug->xug_gen = unp_gencnt; xug->xug_sogen = so_gencnt; xug->xug_count = unp_count; error = SYSCTL_OUT(req, xug, sizeof *xug); } free(unp_list, M_TEMP); free(xug, M_TEMP); return (error); } SYSCTL_PROC(_net_local_dgram, OID_AUTO, pcblist, CTLFLAG_RD, (caddr_t)(long)SOCK_DGRAM, 0, unp_pcblist, "S,xunpcb", "List of active local datagram sockets"); SYSCTL_PROC(_net_local_stream, OID_AUTO, pcblist, CTLFLAG_RD, (caddr_t)(long)SOCK_STREAM, 0, unp_pcblist, "S,xunpcb", "List of active local stream sockets"); static void unp_shutdown(struct unpcb *unp) { struct unpcb *unp2; struct socket *so; UNP_GLOBAL_WLOCK_ASSERT(); UNP_PCB_LOCK_ASSERT(unp); unp2 = unp->unp_conn; if (unp->unp_socket->so_type == SOCK_STREAM && unp2 != NULL) { so = unp2->unp_socket; if (so != NULL) socantrcvmore(so); } } static void unp_drop(struct unpcb *unp, int errno) { struct socket *so = unp->unp_socket; struct unpcb *unp2; UNP_GLOBAL_WLOCK_ASSERT(); UNP_PCB_LOCK_ASSERT(unp); so->so_error = errno; unp2 = unp->unp_conn; if (unp2 == NULL) return; UNP_PCB_LOCK(unp2); unp_disconnect(unp, unp2); UNP_PCB_UNLOCK(unp2); } static void unp_freerights(struct file **rp, int fdcount) { int i; struct file *fp; for (i = 0; i < fdcount; i++) { /* * Zero the pointer before calling unp_discard since it may * end up in unp_gc().. * * XXXRW: This is less true than it used to be. 
*/ fp = *rp; *rp++ = NULL; unp_discard(fp); } } int unp_externalize(struct mbuf *control, struct mbuf **controlp) { struct thread *td = curthread; /* XXX */ struct cmsghdr *cm = mtod(control, struct cmsghdr *); int i; int *fdp; struct file **rp; struct file *fp; void *data; socklen_t clen = control->m_len, datalen; int error, newfds; int f; u_int newlen; UNP_GLOBAL_UNLOCK_ASSERT(); error = 0; if (controlp != NULL) /* controlp == NULL => free control messages */ *controlp = NULL; while (cm != NULL) { if (sizeof(*cm) > clen || cm->cmsg_len > clen) { error = EINVAL; break; } data = CMSG_DATA(cm); datalen = (caddr_t)cm + cm->cmsg_len - (caddr_t)data; if (cm->cmsg_level == SOL_SOCKET && cm->cmsg_type == SCM_RIGHTS) { newfds = datalen / sizeof(struct file *); rp = data; /* If we're not outputting the descriptors free them. */ if (error || controlp == NULL) { unp_freerights(rp, newfds); goto next; } FILEDESC_XLOCK(td->td_proc->p_fd); /* if the new FD's will not fit free them. */ if (!fdavail(td, newfds)) { FILEDESC_XUNLOCK(td->td_proc->p_fd); error = EMSGSIZE; unp_freerights(rp, newfds); goto next; } /* * Now change each pointer to an fd in the global * table to an integer that is the index to the local * fd table entry that we set up to point to the * global one we are transferring. */ newlen = newfds * sizeof(int); *controlp = sbcreatecontrol(NULL, newlen, SCM_RIGHTS, SOL_SOCKET); if (*controlp == NULL) { FILEDESC_XUNLOCK(td->td_proc->p_fd); error = E2BIG; unp_freerights(rp, newfds); goto next; } fdp = (int *) CMSG_DATA(mtod(*controlp, struct cmsghdr *)); for (i = 0; i < newfds; i++) { if (fdalloc(td, 0, &f)) panic("unp_externalize fdalloc failed"); fp = *rp++; td->td_proc->p_fd->fd_ofiles[f] = fp; - FILE_LOCK(fp); - fp->f_msgcount--; - FILE_UNLOCK(fp); - unp_rights--; + unp_externalize_fp(fp); *fdp++ = f; } FILEDESC_XUNLOCK(td->td_proc->p_fd); } else { /* We can just copy anything else across. 
*/ if (error || controlp == NULL) goto next; *controlp = sbcreatecontrol(NULL, datalen, cm->cmsg_type, cm->cmsg_level); if (*controlp == NULL) { error = ENOBUFS; goto next; } bcopy(data, CMSG_DATA(mtod(*controlp, struct cmsghdr *)), datalen); } controlp = &(*controlp)->m_next; next: if (CMSG_SPACE(datalen) < clen) { clen -= CMSG_SPACE(datalen); cm = (struct cmsghdr *) ((caddr_t)cm + CMSG_SPACE(datalen)); } else { clen = 0; cm = NULL; } } m_freem(control); return (error); } static void unp_zone_change(void *tag) { uma_zone_set_max(unp_zone, maxsockets); } void unp_init(void) { unp_zone = uma_zcreate("unpcb", sizeof(struct unpcb), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); if (unp_zone == NULL) panic("unp_init"); uma_zone_set_max(unp_zone, maxsockets); EVENTHANDLER_REGISTER(maxsockets_change, unp_zone_change, NULL, EVENTHANDLER_PRI_ANY); LIST_INIT(&unp_dhead); LIST_INIT(&unp_shead); TASK_INIT(&unp_gc_task, 0, unp_gc, NULL); UNP_GLOBAL_LOCK_INIT(); } static int unp_internalize(struct mbuf **controlp, struct thread *td) { struct mbuf *control = *controlp; struct proc *p = td->td_proc; struct filedesc *fdescp = p->p_fd; struct cmsghdr *cm = mtod(control, struct cmsghdr *); struct cmsgcred *cmcred; struct file **rp; struct file *fp; struct timeval *tv; int i, fd, *fdp; void *data; socklen_t clen = control->m_len, datalen; int error, oldfds; u_int newlen; UNP_GLOBAL_UNLOCK_ASSERT(); error = 0; *controlp = NULL; while (cm != NULL) { if (sizeof(*cm) > clen || cm->cmsg_level != SOL_SOCKET || cm->cmsg_len > clen) { error = EINVAL; goto out; } data = CMSG_DATA(cm); datalen = (caddr_t)cm + cm->cmsg_len - (caddr_t)data; switch (cm->cmsg_type) { /* * Fill in credential information. */ case SCM_CREDS: *controlp = sbcreatecontrol(NULL, sizeof(*cmcred), SCM_CREDS, SOL_SOCKET); if (*controlp == NULL) { error = ENOBUFS; goto out; } cmcred = (struct cmsgcred *) CMSG_DATA(mtod(*controlp, struct cmsghdr *)); cmcred->cmcred_pid = p->p_pid; cmcred->cmcred_uid = td->td_ucred->cr_ruid; cmcred->cmcred_gid = td->td_ucred->cr_rgid; cmcred->cmcred_euid = td->td_ucred->cr_uid; cmcred->cmcred_ngroups = MIN(td->td_ucred->cr_ngroups, CMGROUP_MAX); for (i = 0; i < cmcred->cmcred_ngroups; i++) cmcred->cmcred_groups[i] = td->td_ucred->cr_groups[i]; break; case SCM_RIGHTS: oldfds = datalen / sizeof (int); /* * Check that all the FDs passed in refer to legal * files. If not, reject the entire operation. */ fdp = data; FILEDESC_SLOCK(fdescp); for (i = 0; i < oldfds; i++) { fd = *fdp++; if ((unsigned)fd >= fdescp->fd_nfiles || fdescp->fd_ofiles[fd] == NULL) { FILEDESC_SUNLOCK(fdescp); error = EBADF; goto out; } fp = fdescp->fd_ofiles[fd]; if (!(fp->f_ops->fo_flags & DFLAG_PASSABLE)) { FILEDESC_SUNLOCK(fdescp); error = EOPNOTSUPP; goto out; } } /* * Now replace the integer FDs with pointers to * the associated global file table entry.. 
*/ newlen = oldfds * sizeof(struct file *); *controlp = sbcreatecontrol(NULL, newlen, SCM_RIGHTS, SOL_SOCKET); if (*controlp == NULL) { FILEDESC_SUNLOCK(fdescp); error = E2BIG; goto out; } fdp = data; rp = (struct file **) CMSG_DATA(mtod(*controlp, struct cmsghdr *)); for (i = 0; i < oldfds; i++) { fp = fdescp->fd_ofiles[*fdp++]; *rp++ = fp; - FILE_LOCK(fp); - fp->f_count++; - fp->f_msgcount++; - FILE_UNLOCK(fp); - unp_rights++; + fhold(fp); + unp_internalize_fp(fp); } FILEDESC_SUNLOCK(fdescp); break; case SCM_TIMESTAMP: *controlp = sbcreatecontrol(NULL, sizeof(*tv), SCM_TIMESTAMP, SOL_SOCKET); if (*controlp == NULL) { error = ENOBUFS; goto out; } tv = (struct timeval *) CMSG_DATA(mtod(*controlp, struct cmsghdr *)); microtime(tv); break; default: error = EINVAL; goto out; } controlp = &(*controlp)->m_next; if (CMSG_SPACE(datalen) < clen) { clen -= CMSG_SPACE(datalen); cm = (struct cmsghdr *) ((caddr_t)cm + CMSG_SPACE(datalen)); } else { clen = 0; cm = NULL; } } out: m_freem(control); return (error); } static struct mbuf * unp_addsockcred(struct thread *td, struct mbuf *control) { struct mbuf *m, *n, *n_prev; struct sockcred *sc; const struct cmsghdr *cm; int ngroups; int i; ngroups = MIN(td->td_ucred->cr_ngroups, CMGROUP_MAX); m = sbcreatecontrol(NULL, SOCKCREDSIZE(ngroups), SCM_CREDS, SOL_SOCKET); if (m == NULL) return (control); sc = (struct sockcred *) CMSG_DATA(mtod(m, struct cmsghdr *)); sc->sc_uid = td->td_ucred->cr_ruid; sc->sc_euid = td->td_ucred->cr_uid; sc->sc_gid = td->td_ucred->cr_rgid; sc->sc_egid = td->td_ucred->cr_gid; sc->sc_ngroups = ngroups; for (i = 0; i < sc->sc_ngroups; i++) sc->sc_groups[i] = td->td_ucred->cr_groups[i]; /* * Unlink SCM_CREDS control messages (struct cmsgcred), since just * created SCM_CREDS control message (struct sockcred) has another * format. */ if (control != NULL) for (n = control, n_prev = NULL; n != NULL;) { cm = mtod(n, struct cmsghdr *); if (cm->cmsg_level == SOL_SOCKET && cm->cmsg_type == SCM_CREDS) { if (n_prev == NULL) control = n->m_next; else n_prev->m_next = n->m_next; n = m_free(n); } else { n_prev = n; n = n->m_next; } } /* Prepend it to the head. */ m->m_next = control; return (m); } +static struct unpcb * +fptounp(struct file *fp) +{ + struct socket *so; + + if (fp->f_type != DTYPE_SOCKET) + return (NULL); + if ((so = fp->f_data) == NULL) + return (NULL); + if (so->so_proto->pr_domain != &localdomain) + return (NULL); + return sotounpcb(so); +} + +static void +unp_discard(struct file *fp) +{ + + unp_externalize_fp(fp); + (void) closef(fp, (struct thread *)NULL); +} + +static void +unp_internalize_fp(struct file *fp) +{ + struct unpcb *unp; + + UNP_GLOBAL_WLOCK(); + if ((unp = fptounp(fp)) != NULL) { + unp->unp_file = fp; + unp->unp_msgcount++; + } + unp_rights++; + UNP_GLOBAL_WUNLOCK(); +} + +static void +unp_externalize_fp(struct file *fp) +{ + struct unpcb *unp; + + UNP_GLOBAL_WLOCK(); + if ((unp = fptounp(fp)) != NULL) + unp->unp_msgcount--; + unp_rights--; + UNP_GLOBAL_WUNLOCK(); +} + /* * unp_defer indicates whether additional work has been defered for a future * pass through unp_gc(). It is thread local and does not require explicit * synchronization. 
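unp_addsockcred() above is the producer side of LOCAL_CREDS: once the option is set on the receiving socket, the send path (the UNP_WANTCRED handling seen earlier in uipc_send()) attaches an SCM_CREDS control message carrying a struct sockcred. A receive-side sketch, purely illustrative and not part of this change (fd is assumed to be a connected PF_LOCAL socket, and the option must be enabled before the peer sends):

#include <sys/types.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <sys/un.h>
#include <string.h>
#include <stdio.h>

static int
recv_with_creds(int fd)
{
        char data;
        char cbuf[CMSG_SPACE(SOCKCREDSIZE(CMGROUP_MAX))];
        struct iovec iov = { &data, sizeof(data) };
        struct msghdr msg;
        struct cmsghdr *cm;
        int on = 1;

        /* Level 0 with LOCAL_CREDS asks the kernel to append credentials. */
        if (setsockopt(fd, 0, LOCAL_CREDS, &on, sizeof(on)) == -1)
                return (-1);
        memset(&msg, 0, sizeof(msg));
        msg.msg_iov = &iov;
        msg.msg_iovlen = 1;
        msg.msg_control = cbuf;
        msg.msg_controllen = sizeof(cbuf);
        if (recvmsg(fd, &msg, 0) == -1)
                return (-1);
        for (cm = CMSG_FIRSTHDR(&msg); cm != NULL; cm = CMSG_NXTHDR(&msg, cm)) {
                if (cm->cmsg_level == SOL_SOCKET &&
                    cm->cmsg_type == SCM_CREDS) {
                        struct sockcred *sc =
                            (struct sockcred *)CMSG_DATA(cm);

                        printf("sender ruid %u euid %u\n",
                            sc->sc_uid, sc->sc_euid);
                }
        }
        return (0);
}

Sizing the control buffer with SOCKCREDSIZE(CMGROUP_MAX) mirrors the MIN(cr_ngroups, CMGROUP_MAX) clamp used by unp_addsockcred() above.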
*/ -static int unp_defer; +static int unp_marked; +static int unp_unreachable; -static int unp_taskcount; -SYSCTL_INT(_net_local, OID_AUTO, taskcount, CTLFLAG_RD, &unp_taskcount, 0, ""); +static void +unp_accessable(struct file *fp) +{ + struct unpcb *unp; + unp = fptounp(fp); + if (unp == NULL) + return; + if (unp->unp_gcflag & UNPGC_REF) + return; + unp->unp_gcflag &= ~UNPGC_DEAD; + unp->unp_gcflag |= UNPGC_REF; + unp_marked++; +} + +static void +unp_gc_process(struct unpcb *unp) +{ + struct socket *soa; + struct socket *so; + struct file *fp; + + /* Already processed. */ + if (unp->unp_gcflag & UNPGC_SCANNED) + return; + fp = unp->unp_file; + /* + * Check for a socket potentially in a cycle. It must be in a + * queue as indicated by msgcount, and this must equal the file + * reference count. Note that when msgcount is 0 the file is NULL. + */ + if (unp->unp_msgcount != 0 && fp->f_count != 0 && + fp->f_count == unp->unp_msgcount) { + unp->unp_gcflag |= UNPGC_DEAD; + unp_unreachable++; + return; + } + /* + * Mark all sockets we reference with RIGHTS. + */ + so = unp->unp_socket; + SOCKBUF_LOCK(&so->so_rcv); + unp_scan(so->so_rcv.sb_mb, unp_accessable); + SOCKBUF_UNLOCK(&so->so_rcv); + /* + * Mark all sockets in our accept queue. + */ + ACCEPT_LOCK(); + TAILQ_FOREACH(soa, &so->so_comp, so_list) { + SOCKBUF_LOCK(&soa->so_rcv); + unp_scan(soa->so_rcv.sb_mb, unp_accessable); + SOCKBUF_UNLOCK(&soa->so_rcv); + } + ACCEPT_UNLOCK(); + unp->unp_gcflag |= UNPGC_SCANNED; +} + static int unp_recycled; SYSCTL_INT(_net_local, OID_AUTO, recycled, CTLFLAG_RD, &unp_recycled, 0, ""); +static int unp_taskcount; +SYSCTL_INT(_net_local, OID_AUTO, taskcount, CTLFLAG_RD, &unp_taskcount, 0, ""); + static void unp_gc(__unused void *arg, int pending) { - struct file *fp, *nextfp; - struct socket *so; - struct file **extra_ref, **fpp; - int nunref, i; - int nfiles_snap; - int nfiles_slack = 20; + struct unp_head *heads[] = { &unp_dhead, &unp_shead, NULL }; + struct unp_head **head; + struct file **unref; + struct unpcb *unp; + int i; unp_taskcount++; - unp_defer = 0; + UNP_GLOBAL_RLOCK(); /* - * Before going through all this, set all FDs to be NOT deferred and - * NOT externally accessible. + * First clear all gc flags from previous runs. */ - sx_slock(&filelist_lock); - LIST_FOREACH(fp, &filehead, f_list) - fp->f_gcflag &= ~(FMARK|FDEFER); + for (head = heads; *head != NULL; head++) + LIST_FOREACH(unp, *head, unp_link) + unp->unp_gcflag &= ~(UNPGC_REF|UNPGC_DEAD); + /* + * Scan marking all reachable sockets with UNPGC_REF. Once a socket + * is reachable all of the sockets it references are reachable. + * Stop the scan once we do a complete loop without discovering + * a new reachable socket. + */ do { - KASSERT(unp_defer >= 0, ("unp_gc: unp_defer %d", unp_defer)); - LIST_FOREACH(fp, &filehead, f_list) { - FILE_LOCK(fp); - /* - * If the file is not open, skip it -- could be a - * file in the process of being opened, or in the - * process of being closed. If the file is - * "closing", it may have been marked for deferred - * consideration. Clear the flag now if so. - */ - if (fp->f_count == 0) { - if (fp->f_gcflag & FDEFER) - unp_defer--; - fp->f_gcflag &= ~(FMARK|FDEFER); - FILE_UNLOCK(fp); - continue; + unp_unreachable = 0; + unp_marked = 0; + for (head = heads; *head != NULL; head++) + LIST_FOREACH(unp, *head, unp_link) + unp_gc_process(unp); + } while (unp_marked); + UNP_GLOBAL_RUNLOCK(); + if (unp_unreachable == 0) + return; + /* + * Allocate space for a local list of dead unpcbs.
+ */ + unref = malloc(unp_unreachable * sizeof(struct file *), + M_TEMP, M_WAITOK); + /* + * Iterate looking for sockets which have been specifically marked + * as as unreachable and store them locally. + */ + UNP_GLOBAL_RLOCK(); + for (i = 0, head = heads; *head != NULL; head++) + LIST_FOREACH(unp, *head, unp_link) + if (unp->unp_gcflag & UNPGC_DEAD) { + unref[i++] = unp->unp_file; + KASSERT(unp->unp_file != NULL, + ("unp_gc: Invalid unpcb.")); + KASSERT(i <= unp_unreachable, + ("unp_gc: incorrect unreachable count.")); } - /* - * If we already marked it as 'defer' in a - * previous pass, then try to process it this - * time and un-mark it. - */ - if (fp->f_gcflag & FDEFER) { - fp->f_gcflag &= ~FDEFER; - unp_defer--; - } else { - /* - * If it's not deferred, then check if it's - * already marked.. if so skip it - */ - if (fp->f_gcflag & FMARK) { - FILE_UNLOCK(fp); - continue; - } - /* - * If all references are from messages in - * transit, then skip it. it's not externally - * accessible. - */ - if (fp->f_count == fp->f_msgcount) { - FILE_UNLOCK(fp); - continue; - } - /* - * If it got this far then it must be - * externally accessible. - */ - fp->f_gcflag |= FMARK; - } - /* - * Either it was deferred, or it is externally - * accessible and not already marked so. Now check - * if it is possibly one of OUR sockets. - */ - if (fp->f_type != DTYPE_SOCKET || - (so = fp->f_data) == NULL) { - FILE_UNLOCK(fp); - continue; - } - if (so->so_proto->pr_domain != &localdomain || - (so->so_proto->pr_flags & PR_RIGHTS) == 0) { - FILE_UNLOCK(fp); - continue; - } - - /* - * Tell any other threads that do a subsequent - * fdrop() that we are scanning the message - * buffers. - */ - fp->f_gcflag |= FWAIT; - FILE_UNLOCK(fp); - - /* - * So, Ok, it's one of our sockets and it IS - * externally accessible (or was deferred). Now we - * look to see if we hold any file descriptors in its - * message buffers. Follow those links and mark them - * as accessible too. - */ - SOCKBUF_LOCK(&so->so_rcv); - unp_scan(so->so_rcv.sb_mb, unp_mark); - SOCKBUF_UNLOCK(&so->so_rcv); - - /* - * Wake up any threads waiting in fdrop(). - */ - FILE_LOCK(fp); - fp->f_gcflag &= ~FWAIT; - wakeup(&fp->f_gcflag); - FILE_UNLOCK(fp); - } - } while (unp_defer); - sx_sunlock(&filelist_lock); + UNP_GLOBAL_RUNLOCK(); /* - * XXXRW: The following comments need updating for a post-SMPng and - * deferred unp_gc() world, but are still generally accurate. - * - * We grab an extra reference to each of the file table entries that - * are not otherwise accessible and then free the rights that are - * stored in messages on them. - * - * The bug in the orginal code is a little tricky, so I'll describe - * what's wrong with it here. - * - * It is incorrect to simply unp_discard each entry for f_msgcount - * times -- consider the case of sockets A and B that contain - * references to each other. On a last close of some other socket, - * we trigger a gc since the number of outstanding rights (unp_rights) - * is non-zero. If during the sweep phase the gc code unp_discards, - * we end up doing a (full) closef on the descriptor. A closef on A - * results in the following chain. Closef calls soo_close, which - * calls soclose. Soclose calls first (through the switch - * uipc_usrreq) unp_detach, which re-invokes unp_gc. Unp_gc simply - * returns because the previous instance had set unp_gcing, and we - * return all the way back to soclose, which marks the socket with - * SS_NOFDREF, and then calls sofree. 
Sofree calls sorflush to free - * up the rights that are queued in messages on the socket A, i.e., - * the reference on B. The sorflush calls via the dom_dispose switch - * unp_dispose, which unp_scans with unp_discard. This second - * instance of unp_discard just calls closef on B. - * - * Well, a similar chain occurs on B, resulting in a sorflush on B, - * which results in another closef on A. Unfortunately, A is already - * being closed, and the descriptor has already been marked with - * SS_NOFDREF, and soclose panics at this point. - * - * Here, we first take an extra reference to each inaccessible - * descriptor. Then, we call sorflush ourself, since we know it is a - * Unix domain socket anyhow. After we destroy all the rights - * carried in messages, we do a last closef to get rid of our extra - * reference. This is the last close, and the unp_detach etc will - * shut down the socket. - * - * 91/09/19, bsy@cs.cmu.edu + * All further operation is now done on a local list. We first ref + * all sockets to avoid closing them until all are flushed. */ -again: - nfiles_snap = openfiles + nfiles_slack; /* some slack */ - extra_ref = malloc(nfiles_snap * sizeof(struct file *), M_TEMP, - M_WAITOK); - sx_slock(&filelist_lock); - if (nfiles_snap < openfiles) { - sx_sunlock(&filelist_lock); - free(extra_ref, M_TEMP); - nfiles_slack += 20; - goto again; - } - for (nunref = 0, fp = LIST_FIRST(&filehead), fpp = extra_ref; - fp != NULL; fp = nextfp) { - nextfp = LIST_NEXT(fp, f_list); - FILE_LOCK(fp); - /* - * If it's not open, skip it - */ - if (fp->f_count == 0) { - FILE_UNLOCK(fp); - continue; - } - /* - * If all refs are from msgs, and it's not marked accessible - * then it must be referenced from some unreachable cycle of - * (shut-down) FDs, so include it in our list of FDs to - * remove. - */ - if (fp->f_count == fp->f_msgcount && !(fp->f_gcflag & FMARK)) { - *fpp++ = fp; - nunref++; - fp->f_count++; - } - FILE_UNLOCK(fp); - } - sx_sunlock(&filelist_lock); + for (i = 0; i < unp_unreachable; i++) + fhold(unref[i]); /* - * For each FD on our hit list, do the following two things: + * Now flush all sockets, free'ing rights. This will free the + * struct files associated with these sockets but leave each socket + * with one remaining ref. */ - for (i = nunref, fpp = extra_ref; --i >= 0; ++fpp) { - struct file *tfp = *fpp; - FILE_LOCK(tfp); - if (tfp->f_type == DTYPE_SOCKET && - tfp->f_data != NULL) { - FILE_UNLOCK(tfp); - sorflush(tfp->f_data); - } else { - FILE_UNLOCK(tfp); - } - } - for (i = nunref, fpp = extra_ref; --i >= 0; ++fpp) { - closef(*fpp, (struct thread *) NULL); - unp_recycled++; - } - free(extra_ref, M_TEMP); + for (i = 0; i < unp_unreachable; i++) + sorflush(unref[i]->f_data); + /* + * And finally release the sockets so they can be reclaimed. 
+ */ + for (i = 0; i < unp_unreachable; i++) + fdrop(unref[i], NULL); + unp_recycled += unp_unreachable; + free(unref, M_TEMP); } void unp_dispose(struct mbuf *m) { if (m) unp_scan(m, unp_discard); } static void unp_scan(struct mbuf *m0, void (*op)(struct file *)) { struct mbuf *m; struct file **rp; struct cmsghdr *cm; void *data; int i; socklen_t clen, datalen; int qfds; while (m0 != NULL) { for (m = m0; m; m = m->m_next) { if (m->m_type != MT_CONTROL) continue; cm = mtod(m, struct cmsghdr *); clen = m->m_len; while (cm != NULL) { if (sizeof(*cm) > clen || cm->cmsg_len > clen) break; data = CMSG_DATA(cm); datalen = (caddr_t)cm + cm->cmsg_len - (caddr_t)data; if (cm->cmsg_level == SOL_SOCKET && cm->cmsg_type == SCM_RIGHTS) { qfds = datalen / sizeof (struct file *); rp = data; for (i = 0; i < qfds; i++) (*op)(*rp++); } if (CMSG_SPACE(datalen) < clen) { clen -= CMSG_SPACE(datalen); cm = (struct cmsghdr *) ((caddr_t)cm + CMSG_SPACE(datalen)); } else { clen = 0; cm = NULL; } } } m0 = m0->m_act; } -} - -static void -unp_mark(struct file *fp) -{ - - /* XXXRW: Should probably assert file list lock here. */ - - if (fp->f_gcflag & FMARK) - return; - unp_defer++; - fp->f_gcflag |= (FMARK|FDEFER); -} - -static void -unp_discard(struct file *fp) -{ - - UNP_GLOBAL_WLOCK(); - FILE_LOCK(fp); - fp->f_msgcount--; - unp_rights--; - FILE_UNLOCK(fp); - UNP_GLOBAL_WUNLOCK(); - (void) closef(fp, (struct thread *)NULL); } #ifdef DDB static void db_print_indent(int indent) { int i; for (i = 0; i < indent; i++) db_printf(" "); } static void db_print_unpflags(int unp_flags) { int comma; comma = 0; if (unp_flags & UNP_HAVEPC) { db_printf("%sUNP_HAVEPC", comma ? ", " : ""); comma = 1; } if (unp_flags & UNP_HAVEPCCACHED) { db_printf("%sUNP_HAVEPCCACHED", comma ? ", " : ""); comma = 1; } if (unp_flags & UNP_WANTCRED) { db_printf("%sUNP_WANTCRED", comma ? ", " : ""); comma = 1; } if (unp_flags & UNP_CONNWAIT) { db_printf("%sUNP_CONNWAIT", comma ? ", " : ""); comma = 1; } if (unp_flags & UNP_CONNECTING) { db_printf("%sUNP_CONNECTING", comma ? ", " : ""); comma = 1; } if (unp_flags & UNP_BINDING) { db_printf("%sUNP_BINDING", comma ? ", " : ""); comma = 1; } } static void db_print_xucred(int indent, struct xucred *xu) { int comma, i; db_print_indent(indent); db_printf("cr_version: %u cr_uid: %u cr_ngroups: %d\n", xu->cr_version, xu->cr_uid, xu->cr_ngroups); db_print_indent(indent); db_printf("cr_groups: "); comma = 0; for (i = 0; i < xu->cr_ngroups; i++) { db_printf("%s%u", comma ? ", " : "", xu->cr_groups[i]); comma = 1; } db_printf("\n"); } static void db_print_unprefs(int indent, struct unp_head *uh) { struct unpcb *unp; int counter; counter = 0; LIST_FOREACH(unp, uh, unp_reflink) { if (counter % 4 == 0) db_print_indent(indent); db_printf("%p ", unp); if (counter % 4 == 3) db_printf("\n"); counter++; } if (counter != 0 && counter % 4 != 0) db_printf("\n"); } DB_SHOW_COMMAND(unpcb, db_show_unpcb) { struct unpcb *unp; if (!have_addr) { db_printf("usage: show unpcb \n"); return; } unp = (struct unpcb *)addr; db_printf("unp_socket: %p unp_vnode: %p\n", unp->unp_socket, unp->unp_vnode); db_printf("unp_ino: %d unp_conn: %p\n", unp->unp_ino, unp->unp_conn); db_printf("unp_refs:\n"); db_print_unprefs(2, &unp->unp_refs); /* XXXRW: Would be nice to print the full address, if any. 
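The unreachable cycles the rewritten unp_gc() reclaims can be produced entirely from userland, which makes the f_count == unp_msgcount test in unp_gc_process() easier to picture. The following is a self-contained, illustrative sketch (not part of this diff; error handling is kept minimal): each socket is passed over itself with SCM_RIGHTS, so a reference to each struct file ends up queued in the other end's receive buffer, and closing both descriptors leaves a two-file cycle that only the collector can break.

#include <sys/types.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <string.h>
#include <unistd.h>

/* Pass descriptor fd across the PF_LOCAL socket sock using SCM_RIGHTS. */
static int
send_fd(int sock, int fd)
{
        char data = 0;
        char cbuf[CMSG_SPACE(sizeof(int))];
        struct iovec iov = { &data, sizeof(data) };
        struct msghdr msg;
        struct cmsghdr *cm;

        memset(&msg, 0, sizeof(msg));
        memset(cbuf, 0, sizeof(cbuf));
        msg.msg_iov = &iov;
        msg.msg_iovlen = 1;
        msg.msg_control = cbuf;
        msg.msg_controllen = sizeof(cbuf);
        cm = CMSG_FIRSTHDR(&msg);
        cm->cmsg_level = SOL_SOCKET;
        cm->cmsg_type = SCM_RIGHTS;
        cm->cmsg_len = CMSG_LEN(sizeof(int));
        memcpy(CMSG_DATA(cm), &fd, sizeof(int));
        return (sendmsg(sock, &msg, 0) == -1 ? -1 : 0);
}

int
main(void)
{
        int sv[2];

        if (socketpair(PF_LOCAL, SOCK_STREAM, 0, sv) == -1)
                return (1);
        send_fd(sv[0], sv[0]);  /* A reference to sv[0] queues on sv[1]. */
        send_fd(sv[1], sv[1]);  /* A reference to sv[1] queues on sv[0]. */
        close(sv[0]);
        close(sv[1]);
        /*
         * Both files are now referenced only by queued SCM_RIGHTS
         * messages (f_count == unp_msgcount == 1), so a later unp_gc()
         * run marks them UNPGC_DEAD, sorflush()es them, and drops the
         * final references.
         */
        return (0);
}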
*/ db_printf("unp_addr: %p\n", unp->unp_addr); db_printf("unp_cc: %d unp_mbcnt: %d unp_gencnt: %llu\n", unp->unp_cc, unp->unp_mbcnt, (unsigned long long)unp->unp_gencnt); db_printf("unp_flags: %x (", unp->unp_flags); db_print_unpflags(unp->unp_flags); db_printf(")\n"); db_printf("unp_peercred:\n"); db_print_xucred(2, &unp->unp_peercred); db_printf("unp_refcount: %u\n", unp->unp_refcount); } #endif Index: head/sys/kern/vfs_syscalls.c =================================================================== --- head/sys/kern/vfs_syscalls.c (revision 174987) +++ head/sys/kern/vfs_syscalls.c (revision 174988) @@ -1,4358 +1,4354 @@ /*- * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vfs_syscalls.c 8.13 (Berkeley) 4/15/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_mac.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int chroot_refuse_vdir_fds(struct filedesc *fdp); static int getutimes(const struct timeval *, enum uio_seg, struct timespec *); static int setfown(struct thread *td, struct vnode *, uid_t, gid_t); static int setfmode(struct thread *td, struct vnode *, int); static int setfflags(struct thread *td, struct vnode *, int); static int setutimes(struct thread *td, struct vnode *, const struct timespec *, int, int); static int vn_access(struct vnode *vp, int user_flags, struct ucred *cred, struct thread *td); /* * The module initialization routine for POSIX asynchronous I/O will * set this to the version of AIO that it implements. 
(Zero means * that it is not implemented.) This value is used here by pathconf() * and in kern_descrip.c by fpathconf(). */ int async_io_version; #ifdef DEBUG static int syncprt = 0; SYSCTL_INT(_debug, OID_AUTO, syncprt, CTLFLAG_RW, &syncprt, 0, ""); #endif /* * Sync each mounted filesystem. */ #ifndef _SYS_SYSPROTO_H_ struct sync_args { int dummy; }; #endif /* ARGSUSED */ int sync(td, uap) struct thread *td; struct sync_args *uap; { struct mount *mp, *nmp; int vfslocked; mtx_lock(&mountlist_mtx); for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td)) { nmp = TAILQ_NEXT(mp, mnt_list); continue; } vfslocked = VFS_LOCK_GIANT(mp); if ((mp->mnt_flag & MNT_RDONLY) == 0 && vn_start_write(NULL, &mp, V_NOWAIT) == 0) { MNT_ILOCK(mp); mp->mnt_noasync++; mp->mnt_kern_flag &= ~MNTK_ASYNC; MNT_IUNLOCK(mp); vfs_msync(mp, MNT_NOWAIT); VFS_SYNC(mp, MNT_NOWAIT, td); MNT_ILOCK(mp); mp->mnt_noasync--; if ((mp->mnt_flag & MNT_ASYNC) != 0 && mp->mnt_noasync == 0) mp->mnt_kern_flag |= MNTK_ASYNC; MNT_IUNLOCK(mp); vn_finished_write(mp); } VFS_UNLOCK_GIANT(vfslocked); mtx_lock(&mountlist_mtx); nmp = TAILQ_NEXT(mp, mnt_list); vfs_unbusy(mp, td); } mtx_unlock(&mountlist_mtx); return (0); } /* XXX PRISON: could be per prison flag */ static int prison_quotas; #if 0 SYSCTL_INT(_kern_prison, OID_AUTO, quotas, CTLFLAG_RW, &prison_quotas, 0, ""); #endif /* * Change filesystem quotas. */ #ifndef _SYS_SYSPROTO_H_ struct quotactl_args { char *path; int cmd; int uid; caddr_t arg; }; #endif int quotactl(td, uap) struct thread *td; register struct quotactl_args /* { char *path; int cmd; int uid; caddr_t arg; } */ *uap; { struct mount *mp; int vfslocked; int error; struct nameidata nd; AUDIT_ARG(cmd, uap->cmd); AUDIT_ARG(uid, uap->uid); if (jailed(td->td_ucred) && !prison_quotas) return (EPERM); NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE | AUDITVNODE1, UIO_USERSPACE, uap->path, td); if ((error = namei(&nd)) != 0) return (error); vfslocked = NDHASGIANT(&nd); NDFREE(&nd, NDF_ONLY_PNBUF); mp = nd.ni_vp->v_mount; if ((error = vfs_busy(mp, 0, NULL, td))) { vrele(nd.ni_vp); VFS_UNLOCK_GIANT(vfslocked); return (error); } vrele(nd.ni_vp); error = VFS_QUOTACTL(mp, uap->cmd, uap->uid, uap->arg, td); vfs_unbusy(mp, td); VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * Used by statfs conversion routines to scale the block size up if * necessary so that all of the block counts are <= 'max_size'. Note * that 'max_size' should be a bitmask, i.e. 2^n - 1 for some non-zero * value of 'n'. */ void statfs_scale_blocks(struct statfs *sf, long max_size) { uint64_t count; int shift; KASSERT(powerof2(max_size + 1), ("%s: invalid max_size", __func__)); /* * Attempt to scale the block counts to give a more accurate * overview to userland of the ratio of free space to used * space. To do this, find the largest block count and compute * a divisor that lets it fit into a signed integer <= max_size. */ if (sf->f_bavail < 0) count = -sf->f_bavail; else count = sf->f_bavail; count = MAX(sf->f_blocks, MAX(sf->f_bfree, count)); if (count <= max_size) return; count >>= flsl(max_size); shift = 0; while (count > 0) { shift++; count >>=1; } sf->f_bsize <<= shift; sf->f_blocks >>= shift; sf->f_bfree >>= shift; sf->f_bavail >>= shift; } /* * Get filesystem statistics. 
*/ #ifndef _SYS_SYSPROTO_H_ struct statfs_args { char *path; struct statfs *buf; }; #endif int statfs(td, uap) struct thread *td; register struct statfs_args /* { char *path; struct statfs *buf; } */ *uap; { struct statfs sf; int error; error = kern_statfs(td, uap->path, UIO_USERSPACE, &sf); if (error == 0) error = copyout(&sf, uap->buf, sizeof(sf)); return (error); } int kern_statfs(struct thread *td, char *path, enum uio_seg pathseg, struct statfs *buf) { struct mount *mp; struct statfs *sp, sb; int vfslocked; int error; struct nameidata nd; NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | MPSAFE | AUDITVNODE1, pathseg, path, td); error = namei(&nd); if (error) return (error); vfslocked = NDHASGIANT(&nd); mp = nd.ni_vp->v_mount; vfs_ref(mp); NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_vp); #ifdef MAC error = mac_mount_check_stat(td->td_ucred, mp); if (error) goto out; #endif /* * Set these in case the underlying filesystem fails to do so. */ sp = &mp->mnt_stat; sp->f_version = STATFS_VERSION; sp->f_namemax = NAME_MAX; sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; error = VFS_STATFS(mp, sp, td); if (error) goto out; if (priv_check(td, PRIV_VFS_GENERATION)) { bcopy(sp, &sb, sizeof(sb)); sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0; prison_enforce_statfs(td->td_ucred, mp, &sb); sp = &sb; } *buf = *sp; out: vfs_rel(mp); VFS_UNLOCK_GIANT(vfslocked); if (mtx_owned(&Giant)) printf("statfs(%d): %s: %d\n", vfslocked, path, error); return (error); } /* * Get filesystem statistics. */ #ifndef _SYS_SYSPROTO_H_ struct fstatfs_args { int fd; struct statfs *buf; }; #endif int fstatfs(td, uap) struct thread *td; register struct fstatfs_args /* { int fd; struct statfs *buf; } */ *uap; { struct statfs sf; int error; error = kern_fstatfs(td, uap->fd, &sf); if (error == 0) error = copyout(&sf, uap->buf, sizeof(sf)); return (error); } int kern_fstatfs(struct thread *td, int fd, struct statfs *buf) { struct file *fp; struct mount *mp; struct statfs *sp, sb; int vfslocked; struct vnode *vp; int error; AUDIT_ARG(fd, fd); error = getvnode(td->td_proc->p_fd, fd, &fp); if (error) return (error); vp = fp->f_vnode; vfslocked = VFS_LOCK_GIANT(vp->v_mount); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); #ifdef AUDIT AUDIT_ARG(vnode, vp, ARG_VNODE1); #endif mp = vp->v_mount; if (mp) vfs_ref(mp); VOP_UNLOCK(vp, 0, td); fdrop(fp, td); if (vp->v_iflag & VI_DOOMED) { error = EBADF; goto out; } #ifdef MAC error = mac_mount_check_stat(td->td_ucred, mp); if (error) goto out; #endif /* * Set these in case the underlying filesystem fails to do so. */ sp = &mp->mnt_stat; sp->f_version = STATFS_VERSION; sp->f_namemax = NAME_MAX; sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; error = VFS_STATFS(mp, sp, td); if (error) goto out; if (priv_check(td, PRIV_VFS_GENERATION)) { bcopy(sp, &sb, sizeof(sb)); sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0; prison_enforce_statfs(td->td_ucred, mp, &sb); sp = &sb; } *buf = *sp; out: if (mp) vfs_rel(mp); VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * Get statistics on all filesystems. */ #ifndef _SYS_SYSPROTO_H_ struct getfsstat_args { struct statfs *buf; long bufsize; int flags; }; #endif int getfsstat(td, uap) struct thread *td; register struct getfsstat_args /* { struct statfs *buf; long bufsize; int flags; } */ *uap; { return (kern_getfsstat(td, &uap->buf, uap->bufsize, UIO_USERSPACE, uap->flags)); } /* * If (bufsize > 0 && bufseg == UIO_SYSSPACE) * The caller is responsible for freeing memory which will be allocated * in '*buf'. 
*/ int kern_getfsstat(struct thread *td, struct statfs **buf, size_t bufsize, enum uio_seg bufseg, int flags) { struct mount *mp, *nmp; struct statfs *sfsp, *sp, sb; size_t count, maxcount; int vfslocked; int error; maxcount = bufsize / sizeof(struct statfs); if (bufsize == 0) sfsp = NULL; else if (bufseg == UIO_USERSPACE) sfsp = *buf; else /* if (bufseg == UIO_SYSSPACE) */ { count = 0; mtx_lock(&mountlist_mtx); TAILQ_FOREACH(mp, &mountlist, mnt_list) { count++; } mtx_unlock(&mountlist_mtx); if (maxcount > count) maxcount = count; sfsp = *buf = malloc(maxcount * sizeof(struct statfs), M_TEMP, M_WAITOK); } count = 0; mtx_lock(&mountlist_mtx); for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { if (prison_canseemount(td->td_ucred, mp) != 0) { nmp = TAILQ_NEXT(mp, mnt_list); continue; } #ifdef MAC if (mac_mount_check_stat(td->td_ucred, mp) != 0) { nmp = TAILQ_NEXT(mp, mnt_list); continue; } #endif if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td)) { nmp = TAILQ_NEXT(mp, mnt_list); continue; } vfslocked = VFS_LOCK_GIANT(mp); if (sfsp && count < maxcount) { sp = &mp->mnt_stat; /* * Set these in case the underlying filesystem * fails to do so. */ sp->f_version = STATFS_VERSION; sp->f_namemax = NAME_MAX; sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; /* * If MNT_NOWAIT or MNT_LAZY is specified, do not * refresh the fsstat cache. MNT_NOWAIT or MNT_LAZY * overrides MNT_WAIT. */ if (((flags & (MNT_LAZY|MNT_NOWAIT)) == 0 || (flags & MNT_WAIT)) && (error = VFS_STATFS(mp, sp, td))) { VFS_UNLOCK_GIANT(vfslocked); mtx_lock(&mountlist_mtx); nmp = TAILQ_NEXT(mp, mnt_list); vfs_unbusy(mp, td); continue; } if (priv_check(td, PRIV_VFS_GENERATION)) { bcopy(sp, &sb, sizeof(sb)); sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0; prison_enforce_statfs(td->td_ucred, mp, &sb); sp = &sb; } if (bufseg == UIO_SYSSPACE) bcopy(sp, sfsp, sizeof(*sp)); else /* if (bufseg == UIO_USERSPACE) */ { error = copyout(sp, sfsp, sizeof(*sp)); if (error) { vfs_unbusy(mp, td); VFS_UNLOCK_GIANT(vfslocked); return (error); } } sfsp++; } VFS_UNLOCK_GIANT(vfslocked); count++; mtx_lock(&mountlist_mtx); nmp = TAILQ_NEXT(mp, mnt_list); vfs_unbusy(mp, td); } mtx_unlock(&mountlist_mtx); if (sfsp && count > maxcount) td->td_retval[0] = maxcount; else td->td_retval[0] = count; return (0); } #ifdef COMPAT_FREEBSD4 /* * Get old format filesystem statistics. */ static void cvtstatfs(struct statfs *, struct ostatfs *); #ifndef _SYS_SYSPROTO_H_ struct freebsd4_statfs_args { char *path; struct ostatfs *buf; }; #endif int freebsd4_statfs(td, uap) struct thread *td; struct freebsd4_statfs_args /* { char *path; struct ostatfs *buf; } */ *uap; { struct ostatfs osb; struct statfs sf; int error; error = kern_statfs(td, uap->path, UIO_USERSPACE, &sf); if (error) return (error); cvtstatfs(&sf, &osb); return (copyout(&osb, uap->buf, sizeof(osb))); } /* * Get filesystem statistics. */ #ifndef _SYS_SYSPROTO_H_ struct freebsd4_fstatfs_args { int fd; struct ostatfs *buf; }; #endif int freebsd4_fstatfs(td, uap) struct thread *td; struct freebsd4_fstatfs_args /* { int fd; struct ostatfs *buf; } */ *uap; { struct ostatfs osb; struct statfs sf; int error; error = kern_fstatfs(td, uap->fd, &sf); if (error) return (error); cvtstatfs(&sf, &osb); return (copyout(&osb, uap->buf, sizeof(osb))); } /* * Get statistics on all filesystems. 
*/ #ifndef _SYS_SYSPROTO_H_ struct freebsd4_getfsstat_args { struct ostatfs *buf; long bufsize; int flags; }; #endif int freebsd4_getfsstat(td, uap) struct thread *td; register struct freebsd4_getfsstat_args /* { struct ostatfs *buf; long bufsize; int flags; } */ *uap; { struct statfs *buf, *sp; struct ostatfs osb; size_t count, size; int error; count = uap->bufsize / sizeof(struct ostatfs); size = count * sizeof(struct statfs); error = kern_getfsstat(td, &buf, size, UIO_SYSSPACE, uap->flags); if (size > 0) { count = td->td_retval[0]; sp = buf; while (count > 0 && error == 0) { cvtstatfs(sp, &osb); error = copyout(&osb, uap->buf, sizeof(osb)); sp++; uap->buf++; count--; } free(buf, M_TEMP); } return (error); } /* * Implement fstatfs() for (NFS) file handles. */ #ifndef _SYS_SYSPROTO_H_ struct freebsd4_fhstatfs_args { struct fhandle *u_fhp; struct ostatfs *buf; }; #endif int freebsd4_fhstatfs(td, uap) struct thread *td; struct freebsd4_fhstatfs_args /* { struct fhandle *u_fhp; struct ostatfs *buf; } */ *uap; { struct ostatfs osb; struct statfs sf; fhandle_t fh; int error; error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t)); if (error) return (error); error = kern_fhstatfs(td, fh, &sf); if (error) return (error); cvtstatfs(&sf, &osb); return (copyout(&osb, uap->buf, sizeof(osb))); } /* * Convert a new format statfs structure to an old format statfs structure. */ static void cvtstatfs(nsp, osp) struct statfs *nsp; struct ostatfs *osp; { statfs_scale_blocks(nsp, LONG_MAX); bzero(osp, sizeof(*osp)); osp->f_bsize = nsp->f_bsize; osp->f_iosize = MIN(nsp->f_iosize, LONG_MAX); osp->f_blocks = nsp->f_blocks; osp->f_bfree = nsp->f_bfree; osp->f_bavail = nsp->f_bavail; osp->f_files = MIN(nsp->f_files, LONG_MAX); osp->f_ffree = MIN(nsp->f_ffree, LONG_MAX); osp->f_owner = nsp->f_owner; osp->f_type = nsp->f_type; osp->f_flags = nsp->f_flags; osp->f_syncwrites = MIN(nsp->f_syncwrites, LONG_MAX); osp->f_asyncwrites = MIN(nsp->f_asyncwrites, LONG_MAX); osp->f_syncreads = MIN(nsp->f_syncreads, LONG_MAX); osp->f_asyncreads = MIN(nsp->f_asyncreads, LONG_MAX); strlcpy(osp->f_fstypename, nsp->f_fstypename, MIN(MFSNAMELEN, OMFSNAMELEN)); strlcpy(osp->f_mntonname, nsp->f_mntonname, MIN(MNAMELEN, OMNAMELEN)); strlcpy(osp->f_mntfromname, nsp->f_mntfromname, MIN(MNAMELEN, OMNAMELEN)); osp->f_fsid = nsp->f_fsid; } #endif /* COMPAT_FREEBSD4 */ /* * Change current working directory to a given file descriptor. 
*/ #ifndef _SYS_SYSPROTO_H_ struct fchdir_args { int fd; }; #endif int fchdir(td, uap) struct thread *td; struct fchdir_args /* { int fd; } */ *uap; { register struct filedesc *fdp = td->td_proc->p_fd; struct vnode *vp, *tdp, *vpold; struct mount *mp; struct file *fp; int vfslocked; int error; AUDIT_ARG(fd, uap->fd); if ((error = getvnode(fdp, uap->fd, &fp)) != 0) return (error); vp = fp->f_vnode; VREF(vp); fdrop(fp, td); vfslocked = VFS_LOCK_GIANT(vp->v_mount); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); AUDIT_ARG(vnode, vp, ARG_VNODE1); error = change_dir(vp, td); while (!error && (mp = vp->v_mountedhere) != NULL) { int tvfslocked; if (vfs_busy(mp, 0, 0, td)) continue; tvfslocked = VFS_LOCK_GIANT(mp); error = VFS_ROOT(mp, LK_EXCLUSIVE, &tdp, td); vfs_unbusy(mp, td); if (error) { VFS_UNLOCK_GIANT(tvfslocked); break; } vput(vp); VFS_UNLOCK_GIANT(vfslocked); vp = tdp; vfslocked = tvfslocked; } if (error) { vput(vp); VFS_UNLOCK_GIANT(vfslocked); return (error); } VOP_UNLOCK(vp, 0, td); VFS_UNLOCK_GIANT(vfslocked); FILEDESC_XLOCK(fdp); vpold = fdp->fd_cdir; fdp->fd_cdir = vp; FILEDESC_XUNLOCK(fdp); vfslocked = VFS_LOCK_GIANT(vpold->v_mount); vrele(vpold); VFS_UNLOCK_GIANT(vfslocked); return (0); } /* * Change current working directory (``.''). */ #ifndef _SYS_SYSPROTO_H_ struct chdir_args { char *path; }; #endif int chdir(td, uap) struct thread *td; struct chdir_args /* { char *path; } */ *uap; { return (kern_chdir(td, uap->path, UIO_USERSPACE)); } int kern_chdir(struct thread *td, char *path, enum uio_seg pathseg) { register struct filedesc *fdp = td->td_proc->p_fd; int error; struct nameidata nd; struct vnode *vp; int vfslocked; NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1 | MPSAFE, pathseg, path, td); if ((error = namei(&nd)) != 0) return (error); vfslocked = NDHASGIANT(&nd); if ((error = change_dir(nd.ni_vp, td)) != 0) { vput(nd.ni_vp); VFS_UNLOCK_GIANT(vfslocked); NDFREE(&nd, NDF_ONLY_PNBUF); return (error); } VOP_UNLOCK(nd.ni_vp, 0, td); VFS_UNLOCK_GIANT(vfslocked); NDFREE(&nd, NDF_ONLY_PNBUF); FILEDESC_XLOCK(fdp); vp = fdp->fd_cdir; fdp->fd_cdir = nd.ni_vp; FILEDESC_XUNLOCK(fdp); vfslocked = VFS_LOCK_GIANT(vp->v_mount); vrele(vp); VFS_UNLOCK_GIANT(vfslocked); return (0); } /* * Helper function for raised chroot(2) security function: Refuse if * any filedescriptors are open directories. */ static int chroot_refuse_vdir_fds(fdp) struct filedesc *fdp; { struct vnode *vp; struct file *fp; int fd; FILEDESC_LOCK_ASSERT(fdp); for (fd = 0; fd < fdp->fd_nfiles ; fd++) { fp = fget_locked(fdp, fd); if (fp == NULL) continue; if (fp->f_type == DTYPE_VNODE) { vp = fp->f_vnode; if (vp->v_type == VDIR) return (EPERM); } } return (0); } /* * This sysctl determines if we will allow a process to chroot(2) if it * has a directory open: * 0: disallowed for all processes. * 1: allowed for processes that were not already chroot(2)'ed. * 2: allowed for all processes. */ static int chroot_allow_open_directories = 1; SYSCTL_INT(_kern, OID_AUTO, chroot_allow_open_directories, CTLFLAG_RW, &chroot_allow_open_directories, 0, ""); /* * Change notion of root (``/'') directory. 
*/ #ifndef _SYS_SYSPROTO_H_ struct chroot_args { char *path; }; #endif int chroot(td, uap) struct thread *td; struct chroot_args /* { char *path; } */ *uap; { int error; struct nameidata nd; int vfslocked; error = priv_check(td, PRIV_VFS_CHROOT); if (error) return (error); NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | MPSAFE | AUDITVNODE1, UIO_USERSPACE, uap->path, td); error = namei(&nd); if (error) goto error; vfslocked = NDHASGIANT(&nd); if ((error = change_dir(nd.ni_vp, td)) != 0) goto e_vunlock; #ifdef MAC if ((error = mac_vnode_check_chroot(td->td_ucred, nd.ni_vp))) goto e_vunlock; #endif VOP_UNLOCK(nd.ni_vp, 0, td); error = change_root(nd.ni_vp, td); vrele(nd.ni_vp); VFS_UNLOCK_GIANT(vfslocked); NDFREE(&nd, NDF_ONLY_PNBUF); return (error); e_vunlock: vput(nd.ni_vp); VFS_UNLOCK_GIANT(vfslocked); error: NDFREE(&nd, NDF_ONLY_PNBUF); return (error); } /* * Common routine for chroot and chdir. Callers must provide a locked vnode * instance. */ int change_dir(vp, td) struct vnode *vp; struct thread *td; { int error; ASSERT_VOP_LOCKED(vp, "change_dir(): vp not locked"); if (vp->v_type != VDIR) return (ENOTDIR); #ifdef MAC error = mac_vnode_check_chdir(td->td_ucred, vp); if (error) return (error); #endif error = VOP_ACCESS(vp, VEXEC, td->td_ucred, td); return (error); } /* * Common routine for kern_chroot() and jail_attach(). The caller is * responsible for invoking priv_check() and mac_vnode_check_chroot() to * authorize this operation. */ int change_root(vp, td) struct vnode *vp; struct thread *td; { struct filedesc *fdp; struct vnode *oldvp; int vfslocked; int error; VFS_ASSERT_GIANT(vp->v_mount); fdp = td->td_proc->p_fd; FILEDESC_XLOCK(fdp); if (chroot_allow_open_directories == 0 || (chroot_allow_open_directories == 1 && fdp->fd_rdir != rootvnode)) { error = chroot_refuse_vdir_fds(fdp); if (error) { FILEDESC_XUNLOCK(fdp); return (error); } } oldvp = fdp->fd_rdir; fdp->fd_rdir = vp; VREF(fdp->fd_rdir); if (!fdp->fd_jdir) { fdp->fd_jdir = vp; VREF(fdp->fd_jdir); } FILEDESC_XUNLOCK(fdp); vfslocked = VFS_LOCK_GIANT(oldvp->v_mount); vrele(oldvp); VFS_UNLOCK_GIANT(vfslocked); return (0); } /* * Check permissions, allocate an open file structure, and call the device * open routine if any. */ #ifndef _SYS_SYSPROTO_H_ struct open_args { char *path; int flags; int mode; }; #endif int open(td, uap) struct thread *td; register struct open_args /* { char *path; int flags; int mode; } */ *uap; { return kern_open(td, uap->path, UIO_USERSPACE, uap->flags, uap->mode); } int kern_open(struct thread *td, char *path, enum uio_seg pathseg, int flags, int mode) { struct proc *p = td->td_proc; struct filedesc *fdp = p->p_fd; struct file *fp; struct vnode *vp; struct vattr vat; struct mount *mp; int cmode; struct file *nfp; int type, indx, error; struct flock lf; struct nameidata nd; int vfslocked; AUDIT_ARG(fflags, flags); AUDIT_ARG(mode, mode); if ((flags & O_ACCMODE) == O_ACCMODE) return (EINVAL); flags = FFLAGS(flags); error = falloc(td, &nfp, &indx); if (error) return (error); /* An extra reference on `nfp' has been held for us by falloc(). */ fp = nfp; + /* Set the flags early so the finit in devfs can pick them up. 
*/ + fp->f_flag = flags & FMASK; cmode = ((mode &~ fdp->fd_cmask) & ALLPERMS) &~ S_ISTXT; NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNODE1 | MPSAFE, pathseg, path, td); td->td_dupfd = -1; /* XXX check for fdopen */ error = vn_open(&nd, &flags, cmode, fp); if (error) { /* * If the vn_open replaced the method vector, something * wonderous happened deep below and we just pass it up * pretending we know what we do. */ if (error == ENXIO && fp->f_ops != &badfileops) { fdrop(fp, td); td->td_retval[0] = indx; return (0); } /* * handle special fdopen() case. bleh. dupfdopen() is * responsible for dropping the old contents of ofiles[indx] * if it succeeds. */ if ((error == ENODEV || error == ENXIO) && td->td_dupfd >= 0 && /* XXX from fdopen */ (error = dupfdopen(td, fdp, indx, td->td_dupfd, flags, error)) == 0) { td->td_retval[0] = indx; fdrop(fp, td); return (0); } /* * Clean up the descriptor, but only if another thread hadn't * replaced or closed it. */ fdclose(fdp, fp, indx, td); fdrop(fp, td); if (error == ERESTART) error = EINTR; return (error); } td->td_dupfd = 0; vfslocked = NDHASGIANT(&nd); NDFREE(&nd, NDF_ONLY_PNBUF); vp = nd.ni_vp; - FILE_LOCK(fp); - fp->f_vnode = vp; - if (fp->f_data == NULL) - fp->f_data = vp; - fp->f_flag = flags & FMASK; - fp->f_seqcount = 1; - fp->f_type = (vp->v_type == VFIFO ? DTYPE_FIFO : DTYPE_VNODE); - if (fp->f_ops == &badfileops) - fp->f_ops = &vnops; - FILE_UNLOCK(fp); + fp->f_vnode = vp; /* XXX Does devfs need this? */ + /* + * If the file wasn't claimed by devfs bind it to the normal + * vnode operations here. + */ + if (fp->f_ops == &badfileops) { + KASSERT(vp->v_type != VFIFO, ("Unexpected fifo.")); + fp->f_seqcount = 1; + finit(fp, flags & FMASK, DTYPE_VNODE, vp, &vnops); + } VOP_UNLOCK(vp, 0, td); if (flags & (O_EXLOCK | O_SHLOCK)) { lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; if (flags & O_EXLOCK) lf.l_type = F_WRLCK; else lf.l_type = F_RDLCK; type = F_FLOCK; if ((flags & FNONBLOCK) == 0) type |= F_WAIT; if ((error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, type)) != 0) goto bad; - fp->f_flag |= FHASLOCK; + atomic_set_int(&fp->f_flag, FHASLOCK); } if (flags & O_TRUNC) { if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) goto bad; VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); VATTR_NULL(&vat); vat.va_size = 0; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); #ifdef MAC error = mac_vnode_check_write(td->td_ucred, fp->f_cred, vp); if (error == 0) #endif error = VOP_SETATTR(vp, &vat, td->td_ucred, td); VOP_UNLOCK(vp, 0, td); vn_finished_write(mp); if (error) goto bad; } VFS_UNLOCK_GIANT(vfslocked); /* * Release our private reference, leaving the one associated with * the descriptor table intact. */ fdrop(fp, td); td->td_retval[0] = indx; return (0); bad: VFS_UNLOCK_GIANT(vfslocked); fdclose(fdp, fp, indx, td); fdrop(fp, td); return (error); } #ifdef COMPAT_43 /* * Create a file. */ #ifndef _SYS_SYSPROTO_H_ struct ocreat_args { char *path; int mode; }; #endif int ocreat(td, uap) struct thread *td; register struct ocreat_args /* { char *path; int mode; } */ *uap; { return (kern_open(td, uap->path, UIO_USERSPACE, O_WRONLY | O_CREAT | O_TRUNC, uap->mode)); } #endif /* COMPAT_43 */ /* * Create a special file. 
*/ #ifndef _SYS_SYSPROTO_H_ struct mknod_args { char *path; int mode; int dev; }; #endif int mknod(td, uap) struct thread *td; register struct mknod_args /* { char *path; int mode; int dev; } */ *uap; { return (kern_mknod(td, uap->path, UIO_USERSPACE, uap->mode, uap->dev)); } int kern_mknod(struct thread *td, char *path, enum uio_seg pathseg, int mode, int dev) { struct vnode *vp; struct mount *mp; struct vattr vattr; int error; int whiteout = 0; struct nameidata nd; int vfslocked; AUDIT_ARG(mode, mode); AUDIT_ARG(dev, dev); switch (mode & S_IFMT) { case S_IFCHR: case S_IFBLK: error = priv_check(td, PRIV_VFS_MKNOD_DEV); break; case S_IFMT: error = priv_check(td, PRIV_VFS_MKNOD_BAD); break; case S_IFWHT: error = priv_check(td, PRIV_VFS_MKNOD_WHT); break; default: error = EINVAL; break; } if (error) return (error); restart: bwillwrite(); NDINIT(&nd, CREATE, LOCKPARENT | SAVENAME | MPSAFE | AUDITVNODE1, pathseg, path, td); if ((error = namei(&nd)) != 0) return (error); vfslocked = NDHASGIANT(&nd); vp = nd.ni_vp; if (vp != NULL) { NDFREE(&nd, NDF_ONLY_PNBUF); if (vp == nd.ni_dvp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); vrele(vp); VFS_UNLOCK_GIANT(vfslocked); return (EEXIST); } else { VATTR_NULL(&vattr); FILEDESC_SLOCK(td->td_proc->p_fd); vattr.va_mode = (mode & ALLPERMS) & ~td->td_proc->p_fd->fd_cmask; FILEDESC_SUNLOCK(td->td_proc->p_fd); vattr.va_rdev = dev; whiteout = 0; switch (mode & S_IFMT) { case S_IFMT: /* used by badsect to flag bad sectors */ vattr.va_type = VBAD; break; case S_IFCHR: vattr.va_type = VCHR; break; case S_IFBLK: vattr.va_type = VBLK; break; case S_IFWHT: whiteout = 1; break; default: panic("kern_mknod: invalid mode"); } } if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); VFS_UNLOCK_GIANT(vfslocked); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) return (error); goto restart; } #ifdef MAC if (error == 0 && !whiteout) error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd, &vattr); #endif if (!error) { VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE); if (whiteout) error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, CREATE); else { error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr); if (error == 0) vput(nd.ni_vp); } } NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); vn_finished_write(mp); VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * Create a named pipe. 
*/ #ifndef _SYS_SYSPROTO_H_ struct mkfifo_args { char *path; int mode; }; #endif int mkfifo(td, uap) struct thread *td; register struct mkfifo_args /* { char *path; int mode; } */ *uap; { return (kern_mkfifo(td, uap->path, UIO_USERSPACE, uap->mode)); } int kern_mkfifo(struct thread *td, char *path, enum uio_seg pathseg, int mode) { struct mount *mp; struct vattr vattr; int error; struct nameidata nd; int vfslocked; AUDIT_ARG(mode, mode); restart: bwillwrite(); NDINIT(&nd, CREATE, LOCKPARENT | SAVENAME | MPSAFE | AUDITVNODE1, pathseg, path, td); if ((error = namei(&nd)) != 0) return (error); vfslocked = NDHASGIANT(&nd); if (nd.ni_vp != NULL) { NDFREE(&nd, NDF_ONLY_PNBUF); if (nd.ni_vp == nd.ni_dvp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); vrele(nd.ni_vp); VFS_UNLOCK_GIANT(vfslocked); return (EEXIST); } if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); VFS_UNLOCK_GIANT(vfslocked); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) return (error); goto restart; } VATTR_NULL(&vattr); vattr.va_type = VFIFO; FILEDESC_SLOCK(td->td_proc->p_fd); vattr.va_mode = (mode & ALLPERMS) & ~td->td_proc->p_fd->fd_cmask; FILEDESC_SUNLOCK(td->td_proc->p_fd); #ifdef MAC error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd, &vattr); if (error) goto out; #endif VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE); error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr); if (error == 0) vput(nd.ni_vp); #ifdef MAC out: #endif vput(nd.ni_dvp); vn_finished_write(mp); VFS_UNLOCK_GIANT(vfslocked); NDFREE(&nd, NDF_ONLY_PNBUF); return (error); } /* * Make a hard file link. */ #ifndef _SYS_SYSPROTO_H_ struct link_args { char *path; char *link; }; #endif int link(td, uap) struct thread *td; register struct link_args /* { char *path; char *link; } */ *uap; { int error; error = kern_link(td, uap->path, uap->link, UIO_USERSPACE); return (error); } static int hardlink_check_uid = 0; SYSCTL_INT(_security_bsd, OID_AUTO, hardlink_check_uid, CTLFLAG_RW, &hardlink_check_uid, 0, "Unprivileged processes cannot create hard links to files owned by other " "users"); static int hardlink_check_gid = 0; SYSCTL_INT(_security_bsd, OID_AUTO, hardlink_check_gid, CTLFLAG_RW, &hardlink_check_gid, 0, "Unprivileged processes cannot create hard links to files owned by other " "groups"); static int can_hardlink(struct vnode *vp, struct thread *td, struct ucred *cred) { struct vattr va; int error; if (!hardlink_check_uid && !hardlink_check_gid) return (0); error = VOP_GETATTR(vp, &va, cred, td); if (error != 0) return (error); if (hardlink_check_uid && cred->cr_uid != va.va_uid) { error = priv_check_cred(cred, PRIV_VFS_LINK, 0); if (error) return (error); } if (hardlink_check_gid && !groupmember(va.va_gid, cred)) { error = priv_check_cred(cred, PRIV_VFS_LINK, 0); if (error) return (error); } return (0); } int kern_link(struct thread *td, char *path, char *link, enum uio_seg segflg) { struct vnode *vp; struct mount *mp; struct nameidata nd; int vfslocked; int lvfslocked; int error; bwillwrite(); NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE | AUDITVNODE1, segflg, path, td); if ((error = namei(&nd)) != 0) return (error); vfslocked = NDHASGIANT(&nd); NDFREE(&nd, NDF_ONLY_PNBUF); vp = nd.ni_vp; if (vp->v_type == VDIR) { vrele(vp); VFS_UNLOCK_GIANT(vfslocked); return (EPERM); /* POSIX */ } if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) { vrele(vp); VFS_UNLOCK_GIANT(vfslocked); return (error); } NDINIT(&nd, CREATE, LOCKPARENT | SAVENAME | MPSAFE | AUDITVNODE2, segflg, link, 
td); if ((error = namei(&nd)) == 0) { lvfslocked = NDHASGIANT(&nd); if (nd.ni_vp != NULL) { if (nd.ni_dvp == nd.ni_vp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); vrele(nd.ni_vp); error = EEXIST; } else if ((error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td)) == 0) { VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE); VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); error = can_hardlink(vp, td, td->td_ucred); if (error == 0) #ifdef MAC error = mac_vnode_check_link(td->td_ucred, nd.ni_dvp, vp, &nd.ni_cnd); if (error == 0) #endif error = VOP_LINK(nd.ni_dvp, vp, &nd.ni_cnd); VOP_UNLOCK(vp, 0, td); vput(nd.ni_dvp); } NDFREE(&nd, NDF_ONLY_PNBUF); VFS_UNLOCK_GIANT(lvfslocked); } vrele(vp); vn_finished_write(mp); VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * Make a symbolic link. */ #ifndef _SYS_SYSPROTO_H_ struct symlink_args { char *path; char *link; }; #endif int symlink(td, uap) struct thread *td; register struct symlink_args /* { char *path; char *link; } */ *uap; { return (kern_symlink(td, uap->path, uap->link, UIO_USERSPACE)); } int kern_symlink(struct thread *td, char *path, char *link, enum uio_seg segflg) { struct mount *mp; struct vattr vattr; char *syspath; int error; struct nameidata nd; int vfslocked; if (segflg == UIO_SYSSPACE) { syspath = path; } else { syspath = uma_zalloc(namei_zone, M_WAITOK); if ((error = copyinstr(path, syspath, MAXPATHLEN, NULL)) != 0) goto out; } AUDIT_ARG(text, syspath); restart: bwillwrite(); NDINIT(&nd, CREATE, LOCKPARENT | SAVENAME | MPSAFE | AUDITVNODE1, segflg, link, td); if ((error = namei(&nd)) != 0) goto out; vfslocked = NDHASGIANT(&nd); if (nd.ni_vp) { NDFREE(&nd, NDF_ONLY_PNBUF); if (nd.ni_vp == nd.ni_dvp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); vrele(nd.ni_vp); VFS_UNLOCK_GIANT(vfslocked); error = EEXIST; goto out; } if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); VFS_UNLOCK_GIANT(vfslocked); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) goto out; goto restart; } VATTR_NULL(&vattr); FILEDESC_SLOCK(td->td_proc->p_fd); vattr.va_mode = ACCESSPERMS &~ td->td_proc->p_fd->fd_cmask; FILEDESC_SUNLOCK(td->td_proc->p_fd); #ifdef MAC vattr.va_type = VLNK; error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd, &vattr); if (error) goto out2; #endif VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE); error = VOP_SYMLINK(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr, syspath); if (error == 0) vput(nd.ni_vp); #ifdef MAC out2: #endif NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); vn_finished_write(mp); VFS_UNLOCK_GIANT(vfslocked); out: if (segflg != UIO_SYSSPACE) uma_zfree(namei_zone, syspath); return (error); } /* * Delete a whiteout from the filesystem. 
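The security.bsd.hardlink_check_uid and security.bsd.hardlink_check_gid sysctls above gate can_hardlink(): when either knob is enabled, an unprivileged process needs PRIV_VFS_LINK to hard-link a file owned by a different user or group. A minimal userland sketch of the visible effect, assuming the uid knob has been enabled (sysctl security.bsd.hardlink_check_uid=1) and /tmp/otheruser-file is a hypothetical file owned by someone else:

#include <stdio.h>
#include <unistd.h>

/*
 * With security.bsd.hardlink_check_uid=1, an unprivileged process that
 * tries to hard-link a file it does not own fails in can_hardlink()
 * with EPERM from the priv_check_cred(PRIV_VFS_LINK) call.
 */
int
main(void)
{
	/* Hypothetical target owned by another uid. */
	if (link("/tmp/otheruser-file", "/tmp/mylink") == -1)
		perror("link");	/* expected: "link: Operation not permitted" */
	else
		puts("link created");
	return (0);
}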
*/ int undelete(td, uap) struct thread *td; register struct undelete_args /* { char *path; } */ *uap; { int error; struct mount *mp; struct nameidata nd; int vfslocked; restart: bwillwrite(); NDINIT(&nd, DELETE, LOCKPARENT | DOWHITEOUT | MPSAFE | AUDITVNODE1, UIO_USERSPACE, uap->path, td); error = namei(&nd); if (error) return (error); vfslocked = NDHASGIANT(&nd); if (nd.ni_vp != NULLVP || !(nd.ni_cnd.cn_flags & ISWHITEOUT)) { NDFREE(&nd, NDF_ONLY_PNBUF); if (nd.ni_vp == nd.ni_dvp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); if (nd.ni_vp) vrele(nd.ni_vp); VFS_UNLOCK_GIANT(vfslocked); return (EEXIST); } if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); VFS_UNLOCK_GIANT(vfslocked); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) return (error); goto restart; } VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE); error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, DELETE); NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); vn_finished_write(mp); VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * Delete a name from the filesystem. */ #ifndef _SYS_SYSPROTO_H_ struct unlink_args { char *path; }; #endif int unlink(td, uap) struct thread *td; struct unlink_args /* { char *path; } */ *uap; { int error; error = kern_unlink(td, uap->path, UIO_USERSPACE); return (error); } int kern_unlink(struct thread *td, char *path, enum uio_seg pathseg) { struct mount *mp; struct vnode *vp; int error; struct nameidata nd; int vfslocked; restart: bwillwrite(); NDINIT(&nd, DELETE, LOCKPARENT | LOCKLEAF | MPSAFE | AUDITVNODE1, pathseg, path, td); if ((error = namei(&nd)) != 0) return (error == EINVAL ? EPERM : error); vfslocked = NDHASGIANT(&nd); vp = nd.ni_vp; if (vp->v_type == VDIR) error = EPERM; /* POSIX */ else { /* * The root of a mounted filesystem cannot be deleted. * * XXX: can this only be a VDIR case? */ if (vp->v_vflag & VV_ROOT) error = EBUSY; } if (error == 0) { if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); if (vp == nd.ni_dvp) vrele(vp); else vput(vp); VFS_UNLOCK_GIANT(vfslocked); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) return (error); goto restart; } #ifdef MAC error = mac_vnode_check_unlink(td->td_ucred, nd.ni_dvp, vp, &nd.ni_cnd); if (error) goto out; #endif VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE); error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd); #ifdef MAC out: #endif vn_finished_write(mp); } NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); if (vp == nd.ni_dvp) vrele(vp); else vput(vp); VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * Reposition read/write file offset. 
*/ #ifndef _SYS_SYSPROTO_H_ struct lseek_args { int fd; int pad; off_t offset; int whence; }; #endif int lseek(td, uap) struct thread *td; register struct lseek_args /* { int fd; int pad; off_t offset; int whence; } */ *uap; { struct ucred *cred = td->td_ucred; struct file *fp; struct vnode *vp; struct vattr vattr; off_t offset; int error, noneg; int vfslocked; if ((error = fget(td, uap->fd, &fp)) != 0) return (error); if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE)) { fdrop(fp, td); return (ESPIPE); } vp = fp->f_vnode; vfslocked = VFS_LOCK_GIANT(vp->v_mount); noneg = (vp->v_type != VCHR); offset = uap->offset; switch (uap->whence) { case L_INCR: if (noneg && (fp->f_offset < 0 || (offset > 0 && fp->f_offset > OFF_MAX - offset))) { error = EOVERFLOW; break; } offset += fp->f_offset; break; case L_XTND: vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); error = VOP_GETATTR(vp, &vattr, cred, td); VOP_UNLOCK(vp, 0, td); if (error) break; if (noneg && (vattr.va_size > OFF_MAX || (offset > 0 && vattr.va_size > OFF_MAX - offset))) { error = EOVERFLOW; break; } offset += vattr.va_size; break; case L_SET: break; case SEEK_DATA: error = fo_ioctl(fp, FIOSEEKDATA, &offset, cred, td); break; case SEEK_HOLE: error = fo_ioctl(fp, FIOSEEKHOLE, &offset, cred, td); break; default: error = EINVAL; } if (error == 0 && noneg && offset < 0) error = EINVAL; if (error != 0) goto drop; fp->f_offset = offset; *(off_t *)(td->td_retval) = fp->f_offset; drop: fdrop(fp, td); VFS_UNLOCK_GIANT(vfslocked); return (error); } #if defined(COMPAT_43) /* * Reposition read/write file offset. */ #ifndef _SYS_SYSPROTO_H_ struct olseek_args { int fd; long offset; int whence; }; #endif int olseek(td, uap) struct thread *td; register struct olseek_args /* { int fd; long offset; int whence; } */ *uap; { struct lseek_args /* { int fd; int pad; off_t offset; int whence; } */ nuap; nuap.fd = uap->fd; nuap.offset = uap->offset; nuap.whence = uap->whence; return (lseek(td, &nuap)); } #endif /* COMPAT_43 */ /* Version with the 'pad' argument */ int freebsd6_lseek(td, uap) struct thread *td; register struct freebsd6_lseek_args *uap; { struct lseek_args ouap; ouap.fd = uap->fd; ouap.offset = uap->offset; ouap.whence = uap->whence; return (lseek(td, &ouap)); } /* * Check access permissions using passed credentials. */ static int vn_access(vp, user_flags, cred, td) struct vnode *vp; int user_flags; struct ucred *cred; struct thread *td; { int error, flags; /* Flags == 0 means only check for existence. */ error = 0; if (user_flags) { flags = 0; if (user_flags & R_OK) flags |= VREAD; if (user_flags & W_OK) flags |= VWRITE; if (user_flags & X_OK) flags |= VEXEC; #ifdef MAC error = mac_vnode_check_access(cred, vp, flags); if (error) return (error); #endif if ((flags & VWRITE) == 0 || (error = vn_writechk(vp)) == 0) error = VOP_ACCESS(vp, flags, cred, td); } return (error); } /* * Check access permissions using "real" credentials. */ #ifndef _SYS_SYSPROTO_H_ struct access_args { char *path; int flags; }; #endif int access(td, uap) struct thread *td; register struct access_args /* { char *path; int flags; } */ *uap; { return (kern_access(td, uap->path, UIO_USERSPACE, uap->flags)); } int kern_access(struct thread *td, char *path, enum uio_seg pathseg, int flags) { struct ucred *cred, *tmpcred; register struct vnode *vp; struct nameidata nd; int vfslocked; int error; /* * Create and modify a temporary credential instead of one that * is potentially shared. This could also mess up socket * buffer accounting which can run in an interrupt context. 
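lseek() above forwards SEEK_DATA and SEEK_HOLE to the descriptor's ioctl handler as FIOSEEKDATA and FIOSEEKHOLE, so hole probing only works where the underlying filesystem implements those ioctls; elsewhere the probe typically reports the whole file as a single data region. A hedged userland sketch that walks the data regions of a possibly sparse file (the path is hypothetical):

#include <sys/types.h>
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	off_t data, hole, end;
	int fd;

	fd = open("/tmp/sparsefile", O_RDONLY);	/* hypothetical path */
	if (fd == -1) {
		perror("open");
		return (1);
	}
	end = lseek(fd, 0, SEEK_END);
	data = 0;
	/* SEEK_DATA finds the next data region; SEEK_HOLE finds its end. */
	while ((data = lseek(fd, data, SEEK_DATA)) != -1 && data < end) {
		hole = lseek(fd, data, SEEK_HOLE);
		if (hole == -1)
			hole = end;
		printf("data: %jd..%jd\n", (intmax_t)data, (intmax_t)hole);
		data = hole;
	}
	close(fd);
	return (0);
}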
*/ cred = td->td_ucred; tmpcred = crdup(cred); tmpcred->cr_uid = cred->cr_ruid; tmpcred->cr_groups[0] = cred->cr_rgid; td->td_ucred = tmpcred; NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | MPSAFE | AUDITVNODE1, pathseg, path, td); if ((error = namei(&nd)) != 0) goto out1; vfslocked = NDHASGIANT(&nd); vp = nd.ni_vp; error = vn_access(vp, flags, tmpcred, td); NDFREE(&nd, NDF_ONLY_PNBUF); vput(vp); VFS_UNLOCK_GIANT(vfslocked); out1: td->td_ucred = cred; crfree(tmpcred); return (error); } /* * Check access permissions using "effective" credentials. */ #ifndef _SYS_SYSPROTO_H_ struct eaccess_args { char *path; int flags; }; #endif int eaccess(td, uap) struct thread *td; register struct eaccess_args /* { char *path; int flags; } */ *uap; { return (kern_eaccess(td, uap->path, UIO_USERSPACE, uap->flags)); } int kern_eaccess(struct thread *td, char *path, enum uio_seg pathseg, int flags) { struct nameidata nd; struct vnode *vp; int vfslocked; int error; NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | MPSAFE | AUDITVNODE1, pathseg, path, td); if ((error = namei(&nd)) != 0) return (error); vp = nd.ni_vp; vfslocked = NDHASGIANT(&nd); error = vn_access(vp, flags, td->td_ucred, td); NDFREE(&nd, NDF_ONLY_PNBUF); vput(vp); VFS_UNLOCK_GIANT(vfslocked); return (error); } #if defined(COMPAT_43) /* * Get file status; this version follows links. */ #ifndef _SYS_SYSPROTO_H_ struct ostat_args { char *path; struct ostat *ub; }; #endif int ostat(td, uap) struct thread *td; register struct ostat_args /* { char *path; struct ostat *ub; } */ *uap; { struct stat sb; struct ostat osb; int error; error = kern_stat(td, uap->path, UIO_USERSPACE, &sb); if (error) return (error); cvtstat(&sb, &osb); error = copyout(&osb, uap->ub, sizeof (osb)); return (error); } /* * Get file status; this version does not follow links. */ #ifndef _SYS_SYSPROTO_H_ struct olstat_args { char *path; struct ostat *ub; }; #endif int olstat(td, uap) struct thread *td; register struct olstat_args /* { char *path; struct ostat *ub; } */ *uap; { struct stat sb; struct ostat osb; int error; error = kern_lstat(td, uap->path, UIO_USERSPACE, &sb); if (error) return (error); cvtstat(&sb, &osb); error = copyout(&osb, uap->ub, sizeof (osb)); return (error); } /* * Convert from an old to a new stat structure. */ void cvtstat(st, ost) struct stat *st; struct ostat *ost; { ost->st_dev = st->st_dev; ost->st_ino = st->st_ino; ost->st_mode = st->st_mode; ost->st_nlink = st->st_nlink; ost->st_uid = st->st_uid; ost->st_gid = st->st_gid; ost->st_rdev = st->st_rdev; if (st->st_size < (quad_t)1 << 32) ost->st_size = st->st_size; else ost->st_size = -2; ost->st_atime = st->st_atime; ost->st_mtime = st->st_mtime; ost->st_ctime = st->st_ctime; ost->st_blksize = st->st_blksize; ost->st_blocks = st->st_blocks; ost->st_flags = st->st_flags; ost->st_gen = st->st_gen; } #endif /* COMPAT_43 */ /* * Get file status; this version follows links. 
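kern_access() above checks permissions with a temporary credential carrying the real uid and gid, while kern_eaccess() uses the effective credential, so the two calls only differ inside set-id programs. A short sketch, assuming the binary is installed set-uid to another user and the path is hypothetical:

#include <stdio.h>
#include <unistd.h>

/*
 * In a set-uid program, access() answers "could the invoking (real)
 * user open this?" while eaccess() answers for the effective
 * credential the process is actually running with.
 */
int
main(void)
{
	const char *path = "/tmp/private-to-owner";	/* hypothetical */

	printf("real uid %d:      access(W_OK) -> %d\n",
	    (int)getuid(), access(path, W_OK));
	printf("effective uid %d: eaccess(W_OK) -> %d\n",
	    (int)geteuid(), eaccess(path, W_OK));
	return (0);
}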
*/ #ifndef _SYS_SYSPROTO_H_ struct stat_args { char *path; struct stat *ub; }; #endif int stat(td, uap) struct thread *td; register struct stat_args /* { char *path; struct stat *ub; } */ *uap; { struct stat sb; int error; error = kern_stat(td, uap->path, UIO_USERSPACE, &sb); if (error == 0) error = copyout(&sb, uap->ub, sizeof (sb)); return (error); } int kern_stat(struct thread *td, char *path, enum uio_seg pathseg, struct stat *sbp) { struct nameidata nd; struct stat sb; int error, vfslocked; NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | MPSAFE | AUDITVNODE1, pathseg, path, td); if ((error = namei(&nd)) != 0) return (error); vfslocked = NDHASGIANT(&nd); error = vn_stat(nd.ni_vp, &sb, td->td_ucred, NOCRED, td); NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_vp); VFS_UNLOCK_GIANT(vfslocked); if (mtx_owned(&Giant)) printf("stat(%d): %s\n", vfslocked, path); if (error) return (error); *sbp = sb; return (0); } /* * Get file status; this version does not follow links. */ #ifndef _SYS_SYSPROTO_H_ struct lstat_args { char *path; struct stat *ub; }; #endif int lstat(td, uap) struct thread *td; register struct lstat_args /* { char *path; struct stat *ub; } */ *uap; { struct stat sb; int error; error = kern_lstat(td, uap->path, UIO_USERSPACE, &sb); if (error == 0) error = copyout(&sb, uap->ub, sizeof (sb)); return (error); } int kern_lstat(struct thread *td, char *path, enum uio_seg pathseg, struct stat *sbp) { struct vnode *vp; struct stat sb; struct nameidata nd; int error, vfslocked; NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | LOCKSHARED | MPSAFE | AUDITVNODE1, pathseg, path, td); if ((error = namei(&nd)) != 0) return (error); vfslocked = NDHASGIANT(&nd); vp = nd.ni_vp; error = vn_stat(vp, &sb, td->td_ucred, NOCRED, td); NDFREE(&nd, NDF_ONLY_PNBUF); vput(vp); VFS_UNLOCK_GIANT(vfslocked); if (error) return (error); *sbp = sb; return (0); } /* * Implementation of the NetBSD [l]stat() functions. */ void cvtnstat(sb, nsb) struct stat *sb; struct nstat *nsb; { bzero(nsb, sizeof *nsb); nsb->st_dev = sb->st_dev; nsb->st_ino = sb->st_ino; nsb->st_mode = sb->st_mode; nsb->st_nlink = sb->st_nlink; nsb->st_uid = sb->st_uid; nsb->st_gid = sb->st_gid; nsb->st_rdev = sb->st_rdev; nsb->st_atimespec = sb->st_atimespec; nsb->st_mtimespec = sb->st_mtimespec; nsb->st_ctimespec = sb->st_ctimespec; nsb->st_size = sb->st_size; nsb->st_blocks = sb->st_blocks; nsb->st_blksize = sb->st_blksize; nsb->st_flags = sb->st_flags; nsb->st_gen = sb->st_gen; nsb->st_birthtimespec = sb->st_birthtimespec; } #ifndef _SYS_SYSPROTO_H_ struct nstat_args { char *path; struct nstat *ub; }; #endif int nstat(td, uap) struct thread *td; register struct nstat_args /* { char *path; struct nstat *ub; } */ *uap; { struct stat sb; struct nstat nsb; int error; error = kern_stat(td, uap->path, UIO_USERSPACE, &sb); if (error) return (error); cvtnstat(&sb, &nsb); error = copyout(&nsb, uap->ub, sizeof (nsb)); return (error); } /* * NetBSD lstat. Get file status; this version does not follow links. */ #ifndef _SYS_SYSPROTO_H_ struct lstat_args { char *path; struct stat *ub; }; #endif int nlstat(td, uap) struct thread *td; register struct nlstat_args /* { char *path; struct nstat *ub; } */ *uap; { struct stat sb; struct nstat nsb; int error; error = kern_lstat(td, uap->path, UIO_USERSPACE, &sb); if (error) return (error); cvtnstat(&sb, &nsb); error = copyout(&nsb, uap->ub, sizeof (nsb)); return (error); } /* * Get configurable pathname variables. 
*/ #ifndef _SYS_SYSPROTO_H_ struct pathconf_args { char *path; int name; }; #endif int pathconf(td, uap) struct thread *td; register struct pathconf_args /* { char *path; int name; } */ *uap; { return (kern_pathconf(td, uap->path, UIO_USERSPACE, uap->name)); } int kern_pathconf(struct thread *td, char *path, enum uio_seg pathseg, int name) { struct nameidata nd; int error, vfslocked; NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | MPSAFE | AUDITVNODE1, pathseg, path, td); if ((error = namei(&nd)) != 0) return (error); vfslocked = NDHASGIANT(&nd); NDFREE(&nd, NDF_ONLY_PNBUF); /* If asynchronous I/O is available, it works for all files. */ if (name == _PC_ASYNC_IO) td->td_retval[0] = async_io_version; else error = VOP_PATHCONF(nd.ni_vp, name, td->td_retval); vput(nd.ni_vp); VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * Return target name of a symbolic link. */ #ifndef _SYS_SYSPROTO_H_ struct readlink_args { char *path; char *buf; int count; }; #endif int readlink(td, uap) struct thread *td; register struct readlink_args /* { char *path; char *buf; int count; } */ *uap; { return (kern_readlink(td, uap->path, UIO_USERSPACE, uap->buf, UIO_USERSPACE, uap->count)); } int kern_readlink(struct thread *td, char *path, enum uio_seg pathseg, char *buf, enum uio_seg bufseg, int count) { register struct vnode *vp; struct iovec aiov; struct uio auio; int error; struct nameidata nd; int vfslocked; NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | MPSAFE | AUDITVNODE1, pathseg, path, td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); vfslocked = NDHASGIANT(&nd); vp = nd.ni_vp; #ifdef MAC error = mac_vnode_check_readlink(td->td_ucred, vp); if (error) { vput(vp); VFS_UNLOCK_GIANT(vfslocked); return (error); } #endif if (vp->v_type != VLNK) error = EINVAL; else { aiov.iov_base = buf; aiov.iov_len = count; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_offset = 0; auio.uio_rw = UIO_READ; auio.uio_segflg = bufseg; auio.uio_td = td; auio.uio_resid = count; error = VOP_READLINK(vp, &auio, td->td_ucred); } vput(vp); VFS_UNLOCK_GIANT(vfslocked); td->td_retval[0] = count - auio.uio_resid; return (error); } /* * Common implementation code for chflags() and fchflags(). */ static int setfflags(td, vp, flags) struct thread *td; struct vnode *vp; int flags; { int error; struct mount *mp; struct vattr vattr; /* * Prevent non-root users from setting flags on devices. When * a device is reused, users can retain ownership of the device * if they are allowed to set flags and programs assume that * chown can't fail when done as root. */ if (vp->v_type == VCHR || vp->v_type == VBLK) { error = priv_check(td, PRIV_VFS_CHFLAGS_DEV); if (error) return (error); } if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) return (error); VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); VATTR_NULL(&vattr); vattr.va_flags = flags; #ifdef MAC error = mac_vnode_check_setflags(td->td_ucred, vp, vattr.va_flags); if (error == 0) #endif error = VOP_SETATTR(vp, &vattr, td->td_ucred, td); VOP_UNLOCK(vp, 0, td); vn_finished_write(mp); return (error); } /* * Change flags of a file given a path name. 
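kern_readlink() above copies the link target into the caller's buffer and returns the byte count through td_retval; nothing appends a terminating NUL, so callers must add it themselves. A minimal userland sketch (the symlink path is hypothetical):

#include <limits.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	char buf[PATH_MAX];
	ssize_t n;

	/* readlink(2) does not NUL-terminate; reserve room and do it here. */
	n = readlink("/tmp/some-symlink", buf, sizeof(buf) - 1);
	if (n == -1) {
		perror("readlink");
		return (1);
	}
	buf[n] = '\0';
	printf("target: %s\n", buf);
	return (0);
}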
*/ #ifndef _SYS_SYSPROTO_H_ struct chflags_args { char *path; int flags; }; #endif int chflags(td, uap) struct thread *td; register struct chflags_args /* { char *path; int flags; } */ *uap; { int error; struct nameidata nd; int vfslocked; AUDIT_ARG(fflags, uap->flags); NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE | AUDITVNODE1, UIO_USERSPACE, uap->path, td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); vfslocked = NDHASGIANT(&nd); error = setfflags(td, nd.ni_vp, uap->flags); vrele(nd.ni_vp); VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * Same as chflags() but doesn't follow symlinks. */ int lchflags(td, uap) struct thread *td; register struct lchflags_args /* { char *path; int flags; } */ *uap; { int error; struct nameidata nd; int vfslocked; AUDIT_ARG(fflags, uap->flags); NDINIT(&nd, LOOKUP, NOFOLLOW | MPSAFE | AUDITVNODE1, UIO_USERSPACE, uap->path, td); if ((error = namei(&nd)) != 0) return (error); vfslocked = NDHASGIANT(&nd); NDFREE(&nd, NDF_ONLY_PNBUF); error = setfflags(td, nd.ni_vp, uap->flags); vrele(nd.ni_vp); VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * Change flags of a file given a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct fchflags_args { int fd; int flags; }; #endif int fchflags(td, uap) struct thread *td; register struct fchflags_args /* { int fd; int flags; } */ *uap; { struct file *fp; int vfslocked; int error; AUDIT_ARG(fd, uap->fd); AUDIT_ARG(fflags, uap->flags); if ((error = getvnode(td->td_proc->p_fd, uap->fd, &fp)) != 0) return (error); vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount); #ifdef AUDIT vn_lock(fp->f_vnode, LK_EXCLUSIVE | LK_RETRY, td); AUDIT_ARG(vnode, fp->f_vnode, ARG_VNODE1); VOP_UNLOCK(fp->f_vnode, 0, td); #endif error = setfflags(td, fp->f_vnode, uap->flags); VFS_UNLOCK_GIANT(vfslocked); fdrop(fp, td); return (error); } /* * Common implementation code for chmod(), lchmod() and fchmod(). */ static int setfmode(td, vp, mode) struct thread *td; struct vnode *vp; int mode; { int error; struct mount *mp; struct vattr vattr; if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) return (error); VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); VATTR_NULL(&vattr); vattr.va_mode = mode & ALLPERMS; #ifdef MAC error = mac_vnode_check_setmode(td->td_ucred, vp, vattr.va_mode); if (error == 0) #endif error = VOP_SETATTR(vp, &vattr, td->td_ucred, td); VOP_UNLOCK(vp, 0, td); vn_finished_write(mp); return (error); } /* * Change mode of a file given path name. */ #ifndef _SYS_SYSPROTO_H_ struct chmod_args { char *path; int mode; }; #endif int chmod(td, uap) struct thread *td; register struct chmod_args /* { char *path; int mode; } */ *uap; { return (kern_chmod(td, uap->path, UIO_USERSPACE, uap->mode)); } int kern_chmod(struct thread *td, char *path, enum uio_seg pathseg, int mode) { int error; struct nameidata nd; int vfslocked; AUDIT_ARG(mode, mode); NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE | AUDITVNODE1, pathseg, path, td); if ((error = namei(&nd)) != 0) return (error); vfslocked = NDHASGIANT(&nd); NDFREE(&nd, NDF_ONLY_PNBUF); error = setfmode(td, nd.ni_vp, mode); vrele(nd.ni_vp); VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * Change mode of a file given path name (don't follow links.) 
*/ #ifndef _SYS_SYSPROTO_H_ struct lchmod_args { char *path; int mode; }; #endif int lchmod(td, uap) struct thread *td; register struct lchmod_args /* { char *path; int mode; } */ *uap; { int error; struct nameidata nd; int vfslocked; AUDIT_ARG(mode, (mode_t)uap->mode); NDINIT(&nd, LOOKUP, NOFOLLOW | MPSAFE | AUDITVNODE1, UIO_USERSPACE, uap->path, td); if ((error = namei(&nd)) != 0) return (error); vfslocked = NDHASGIANT(&nd); NDFREE(&nd, NDF_ONLY_PNBUF); error = setfmode(td, nd.ni_vp, uap->mode); vrele(nd.ni_vp); VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * Change mode of a file given a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct fchmod_args { int fd; int mode; }; #endif int fchmod(td, uap) struct thread *td; register struct fchmod_args /* { int fd; int mode; } */ *uap; { struct file *fp; int vfslocked; int error; AUDIT_ARG(fd, uap->fd); AUDIT_ARG(mode, uap->mode); if ((error = getvnode(td->td_proc->p_fd, uap->fd, &fp)) != 0) return (error); vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount); #ifdef AUDIT vn_lock(fp->f_vnode, LK_EXCLUSIVE | LK_RETRY, td); AUDIT_ARG(vnode, fp->f_vnode, ARG_VNODE1); VOP_UNLOCK(fp->f_vnode, 0, td); #endif error = setfmode(td, fp->f_vnode, uap->mode); VFS_UNLOCK_GIANT(vfslocked); fdrop(fp, td); return (error); } /* * Common implementation for chown(), lchown(), and fchown() */ static int setfown(td, vp, uid, gid) struct thread *td; struct vnode *vp; uid_t uid; gid_t gid; { int error; struct mount *mp; struct vattr vattr; if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) return (error); VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); VATTR_NULL(&vattr); vattr.va_uid = uid; vattr.va_gid = gid; #ifdef MAC error = mac_vnode_check_setowner(td->td_ucred, vp, vattr.va_uid, vattr.va_gid); if (error == 0) #endif error = VOP_SETATTR(vp, &vattr, td->td_ucred, td); VOP_UNLOCK(vp, 0, td); vn_finished_write(mp); return (error); } /* * Set ownership given a path name. */ #ifndef _SYS_SYSPROTO_H_ struct chown_args { char *path; int uid; int gid; }; #endif int chown(td, uap) struct thread *td; register struct chown_args /* { char *path; int uid; int gid; } */ *uap; { return (kern_chown(td, uap->path, UIO_USERSPACE, uap->uid, uap->gid)); } int kern_chown(struct thread *td, char *path, enum uio_seg pathseg, int uid, int gid) { int error; struct nameidata nd; int vfslocked; AUDIT_ARG(owner, uid, gid); NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE | AUDITVNODE1, pathseg, path, td); if ((error = namei(&nd)) != 0) return (error); vfslocked = NDHASGIANT(&nd); NDFREE(&nd, NDF_ONLY_PNBUF); error = setfown(td, nd.ni_vp, uid, gid); vrele(nd.ni_vp); VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * Set ownership given a path name, do not cross symlinks. */ #ifndef _SYS_SYSPROTO_H_ struct lchown_args { char *path; int uid; int gid; }; #endif int lchown(td, uap) struct thread *td; register struct lchown_args /* { char *path; int uid; int gid; } */ *uap; { return (kern_lchown(td, uap->path, UIO_USERSPACE, uap->uid, uap->gid)); } int kern_lchown(struct thread *td, char *path, enum uio_seg pathseg, int uid, int gid) { int error; struct nameidata nd; int vfslocked; AUDIT_ARG(owner, uid, gid); NDINIT(&nd, LOOKUP, NOFOLLOW | MPSAFE | AUDITVNODE1, pathseg, path, td); if ((error = namei(&nd)) != 0) return (error); vfslocked = NDHASGIANT(&nd); NDFREE(&nd, NDF_ONLY_PNBUF); error = setfown(td, nd.ni_vp, uid, gid); vrele(nd.ni_vp); VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * Set ownership given a file descriptor. 
*/ #ifndef _SYS_SYSPROTO_H_ struct fchown_args { int fd; int uid; int gid; }; #endif int fchown(td, uap) struct thread *td; register struct fchown_args /* { int fd; int uid; int gid; } */ *uap; { struct file *fp; int vfslocked; int error; AUDIT_ARG(fd, uap->fd); AUDIT_ARG(owner, uap->uid, uap->gid); if ((error = getvnode(td->td_proc->p_fd, uap->fd, &fp)) != 0) return (error); vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount); #ifdef AUDIT vn_lock(fp->f_vnode, LK_EXCLUSIVE | LK_RETRY, td); AUDIT_ARG(vnode, fp->f_vnode, ARG_VNODE1); VOP_UNLOCK(fp->f_vnode, 0, td); #endif error = setfown(td, fp->f_vnode, uap->uid, uap->gid); VFS_UNLOCK_GIANT(vfslocked); fdrop(fp, td); return (error); } /* * Common implementation code for utimes(), lutimes(), and futimes(). */ static int getutimes(usrtvp, tvpseg, tsp) const struct timeval *usrtvp; enum uio_seg tvpseg; struct timespec *tsp; { struct timeval tv[2]; const struct timeval *tvp; int error; if (usrtvp == NULL) { microtime(&tv[0]); TIMEVAL_TO_TIMESPEC(&tv[0], &tsp[0]); tsp[1] = tsp[0]; } else { if (tvpseg == UIO_SYSSPACE) { tvp = usrtvp; } else { if ((error = copyin(usrtvp, tv, sizeof(tv))) != 0) return (error); tvp = tv; } if (tvp[0].tv_usec < 0 || tvp[0].tv_usec >= 1000000 || tvp[1].tv_usec < 0 || tvp[1].tv_usec >= 1000000) return (EINVAL); TIMEVAL_TO_TIMESPEC(&tvp[0], &tsp[0]); TIMEVAL_TO_TIMESPEC(&tvp[1], &tsp[1]); } return (0); } /* * Common implementation code for utimes(), lutimes(), and futimes(). */ static int setutimes(td, vp, ts, numtimes, nullflag) struct thread *td; struct vnode *vp; const struct timespec *ts; int numtimes; int nullflag; { int error, setbirthtime; struct mount *mp; struct vattr vattr; if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) return (error); VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); setbirthtime = 0; if (numtimes < 3 && VOP_GETATTR(vp, &vattr, td->td_ucred, td) == 0 && timespeccmp(&ts[1], &vattr.va_birthtime, < )) setbirthtime = 1; VATTR_NULL(&vattr); vattr.va_atime = ts[0]; vattr.va_mtime = ts[1]; if (setbirthtime) vattr.va_birthtime = ts[1]; if (numtimes > 2) vattr.va_birthtime = ts[2]; if (nullflag) vattr.va_vaflags |= VA_UTIMES_NULL; #ifdef MAC error = mac_vnode_check_setutimes(td->td_ucred, vp, vattr.va_atime, vattr.va_mtime); #endif if (error == 0) error = VOP_SETATTR(vp, &vattr, td->td_ucred, td); VOP_UNLOCK(vp, 0, td); vn_finished_write(mp); return (error); } /* * Set the access and modification times of a file. */ #ifndef _SYS_SYSPROTO_H_ struct utimes_args { char *path; struct timeval *tptr; }; #endif int utimes(td, uap) struct thread *td; register struct utimes_args /* { char *path; struct timeval *tptr; } */ *uap; { return (kern_utimes(td, uap->path, UIO_USERSPACE, uap->tptr, UIO_USERSPACE)); } int kern_utimes(struct thread *td, char *path, enum uio_seg pathseg, struct timeval *tptr, enum uio_seg tptrseg) { struct timespec ts[2]; int error; struct nameidata nd; int vfslocked; if ((error = getutimes(tptr, tptrseg, ts)) != 0) return (error); NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE | AUDITVNODE1, pathseg, path, td); if ((error = namei(&nd)) != 0) return (error); vfslocked = NDHASGIANT(&nd); NDFREE(&nd, NDF_ONLY_PNBUF); error = setutimes(td, nd.ni_vp, ts, 2, tptr == NULL); vrele(nd.ni_vp); VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * Set the access and modification times of a file. 
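getutimes() above substitutes the current time when the caller passes a NULL timeval pointer and rejects any tv_usec outside [0, 1000000); setutimes() then sets VA_UTIMES_NULL for the NULL case so the filesystem can apply the relaxed "owner or writable" permission rule. A short sketch of both calling styles (the path is hypothetical):

#include <sys/time.h>
#include <stdio.h>

int
main(void)
{
	const char *path = "/tmp/somefile";	/* hypothetical */
	struct timeval tv[2];

	/* NULL pointer: set both times to "now" (the VA_UTIMES_NULL path). */
	if (utimes(path, NULL) == -1)
		perror("utimes(now)");

	/* Explicit times: tv[0] is the access time, tv[1] the modification time. */
	tv[0].tv_sec = 0;
	tv[0].tv_usec = 0;
	tv[1].tv_sec = 0;
	tv[1].tv_usec = 0;
	if (utimes(path, tv) == -1)
		perror("utimes(epoch)");
	return (0);
}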
*/ #ifndef _SYS_SYSPROTO_H_ struct lutimes_args { char *path; struct timeval *tptr; }; #endif int lutimes(td, uap) struct thread *td; register struct lutimes_args /* { char *path; struct timeval *tptr; } */ *uap; { return (kern_lutimes(td, uap->path, UIO_USERSPACE, uap->tptr, UIO_USERSPACE)); } int kern_lutimes(struct thread *td, char *path, enum uio_seg pathseg, struct timeval *tptr, enum uio_seg tptrseg) { struct timespec ts[2]; int error; struct nameidata nd; int vfslocked; if ((error = getutimes(tptr, tptrseg, ts)) != 0) return (error); NDINIT(&nd, LOOKUP, NOFOLLOW | MPSAFE | AUDITVNODE1, pathseg, path, td); if ((error = namei(&nd)) != 0) return (error); vfslocked = NDHASGIANT(&nd); NDFREE(&nd, NDF_ONLY_PNBUF); error = setutimes(td, nd.ni_vp, ts, 2, tptr == NULL); vrele(nd.ni_vp); VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * Set the access and modification times of a file. */ #ifndef _SYS_SYSPROTO_H_ struct futimes_args { int fd; struct timeval *tptr; }; #endif int futimes(td, uap) struct thread *td; register struct futimes_args /* { int fd; struct timeval *tptr; } */ *uap; { return (kern_futimes(td, uap->fd, uap->tptr, UIO_USERSPACE)); } int kern_futimes(struct thread *td, int fd, struct timeval *tptr, enum uio_seg tptrseg) { struct timespec ts[2]; struct file *fp; int vfslocked; int error; AUDIT_ARG(fd, fd); if ((error = getutimes(tptr, tptrseg, ts)) != 0) return (error); if ((error = getvnode(td->td_proc->p_fd, fd, &fp)) != 0) return (error); vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount); #ifdef AUDIT vn_lock(fp->f_vnode, LK_EXCLUSIVE | LK_RETRY, td); AUDIT_ARG(vnode, fp->f_vnode, ARG_VNODE1); VOP_UNLOCK(fp->f_vnode, 0, td); #endif error = setutimes(td, fp->f_vnode, ts, 2, tptr == NULL); VFS_UNLOCK_GIANT(vfslocked); fdrop(fp, td); return (error); } /* * Truncate a file given its path name. */ #ifndef _SYS_SYSPROTO_H_ struct truncate_args { char *path; int pad; off_t length; }; #endif int truncate(td, uap) struct thread *td; register struct truncate_args /* { char *path; int pad; off_t length; } */ *uap; { return (kern_truncate(td, uap->path, UIO_USERSPACE, uap->length)); } int kern_truncate(struct thread *td, char *path, enum uio_seg pathseg, off_t length) { struct mount *mp; struct vnode *vp; struct vattr vattr; int error; struct nameidata nd; int vfslocked; if (length < 0) return(EINVAL); NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE | AUDITVNODE1, pathseg, path, td); if ((error = namei(&nd)) != 0) return (error); vfslocked = NDHASGIANT(&nd); vp = nd.ni_vp; if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) { vrele(vp); VFS_UNLOCK_GIANT(vfslocked); return (error); } NDFREE(&nd, NDF_ONLY_PNBUF); VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); if (vp->v_type == VDIR) error = EISDIR; #ifdef MAC else if ((error = mac_vnode_check_write(td->td_ucred, NOCRED, vp))) { } #endif else if ((error = vn_writechk(vp)) == 0 && (error = VOP_ACCESS(vp, VWRITE, td->td_ucred, td)) == 0) { VATTR_NULL(&vattr); vattr.va_size = length; error = VOP_SETATTR(vp, &vattr, td->td_ucred, td); } vput(vp); vn_finished_write(mp); VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * Truncate a file given a file descriptor. 
*/ #ifndef _SYS_SYSPROTO_H_ struct ftruncate_args { int fd; int pad; off_t length; }; #endif int ftruncate(td, uap) struct thread *td; register struct ftruncate_args /* { int fd; int pad; off_t length; } */ *uap; { struct mount *mp; struct vattr vattr; struct vnode *vp; struct file *fp; int vfslocked; int error; AUDIT_ARG(fd, uap->fd); if (uap->length < 0) return(EINVAL); if ((error = getvnode(td->td_proc->p_fd, uap->fd, &fp)) != 0) return (error); if ((fp->f_flag & FWRITE) == 0) { fdrop(fp, td); return (EINVAL); } vp = fp->f_vnode; vfslocked = VFS_LOCK_GIANT(vp->v_mount); if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) goto drop; VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); AUDIT_ARG(vnode, vp, ARG_VNODE1); if (vp->v_type == VDIR) error = EISDIR; #ifdef MAC else if ((error = mac_vnode_check_write(td->td_ucred, fp->f_cred, vp))) { } #endif else if ((error = vn_writechk(vp)) == 0) { VATTR_NULL(&vattr); vattr.va_size = uap->length; error = VOP_SETATTR(vp, &vattr, fp->f_cred, td); } VOP_UNLOCK(vp, 0, td); vn_finished_write(mp); drop: VFS_UNLOCK_GIANT(vfslocked); fdrop(fp, td); return (error); } #if defined(COMPAT_43) /* * Truncate a file given its path name. */ #ifndef _SYS_SYSPROTO_H_ struct otruncate_args { char *path; long length; }; #endif int otruncate(td, uap) struct thread *td; register struct otruncate_args /* { char *path; long length; } */ *uap; { struct truncate_args /* { char *path; int pad; off_t length; } */ nuap; nuap.path = uap->path; nuap.length = uap->length; return (truncate(td, &nuap)); } /* * Truncate a file given a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct oftruncate_args { int fd; long length; }; #endif int oftruncate(td, uap) struct thread *td; register struct oftruncate_args /* { int fd; long length; } */ *uap; { struct ftruncate_args /* { int fd; int pad; off_t length; } */ nuap; nuap.fd = uap->fd; nuap.length = uap->length; return (ftruncate(td, &nuap)); } #endif /* COMPAT_43 */ /* Versions with the pad argument */ int freebsd6_truncate(struct thread *td, struct freebsd6_truncate_args *uap) { struct truncate_args ouap; ouap.path = uap->path; ouap.length = uap->length; return (truncate(td, &ouap)); } int freebsd6_ftruncate(struct thread *td, struct freebsd6_ftruncate_args *uap) { struct ftruncate_args ouap; ouap.fd = uap->fd; ouap.length = uap->length; return (ftruncate(td, &ouap)); } /* * Sync an open file. */ #ifndef _SYS_SYSPROTO_H_ struct fsync_args { int fd; }; #endif int fsync(td, uap) struct thread *td; struct fsync_args /* { int fd; } */ *uap; { struct vnode *vp; struct mount *mp; struct file *fp; int vfslocked; int error; AUDIT_ARG(fd, uap->fd); if ((error = getvnode(td->td_proc->p_fd, uap->fd, &fp)) != 0) return (error); vp = fp->f_vnode; vfslocked = VFS_LOCK_GIANT(vp->v_mount); if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) goto drop; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); AUDIT_ARG(vnode, vp, ARG_VNODE1); if (vp->v_object != NULL) { VM_OBJECT_LOCK(vp->v_object); vm_object_page_clean(vp->v_object, 0, 0, 0); VM_OBJECT_UNLOCK(vp->v_object); } error = VOP_FSYNC(vp, MNT_WAIT, td); VOP_UNLOCK(vp, 0, td); vn_finished_write(mp); drop: VFS_UNLOCK_GIANT(vfslocked); fdrop(fp, td); return (error); } /* * Rename files. Source and destination must either both be directories, or * both not be directories. If target is a directory, it must be empty. 
*/ #ifndef _SYS_SYSPROTO_H_ struct rename_args { char *from; char *to; }; #endif int rename(td, uap) struct thread *td; register struct rename_args /* { char *from; char *to; } */ *uap; { return (kern_rename(td, uap->from, uap->to, UIO_USERSPACE)); } int kern_rename(struct thread *td, char *from, char *to, enum uio_seg pathseg) { struct mount *mp = NULL; struct vnode *tvp, *fvp, *tdvp; struct nameidata fromnd, tond; int tvfslocked; int fvfslocked; int error; bwillwrite(); #ifdef MAC NDINIT(&fromnd, DELETE, LOCKPARENT | LOCKLEAF | SAVESTART | MPSAFE | AUDITVNODE1, pathseg, from, td); #else NDINIT(&fromnd, DELETE, WANTPARENT | SAVESTART | MPSAFE | AUDITVNODE1, pathseg, from, td); #endif if ((error = namei(&fromnd)) != 0) return (error); fvfslocked = NDHASGIANT(&fromnd); tvfslocked = 0; #ifdef MAC error = mac_vnode_check_rename_from(td->td_ucred, fromnd.ni_dvp, fromnd.ni_vp, &fromnd.ni_cnd); VOP_UNLOCK(fromnd.ni_dvp, 0, td); if (fromnd.ni_dvp != fromnd.ni_vp) VOP_UNLOCK(fromnd.ni_vp, 0, td); #endif fvp = fromnd.ni_vp; if (error == 0) error = vn_start_write(fvp, &mp, V_WAIT | PCATCH); if (error != 0) { NDFREE(&fromnd, NDF_ONLY_PNBUF); vrele(fromnd.ni_dvp); vrele(fvp); goto out1; } NDINIT(&tond, RENAME, LOCKPARENT | LOCKLEAF | NOCACHE | SAVESTART | MPSAFE | AUDITVNODE2, pathseg, to, td); if (fromnd.ni_vp->v_type == VDIR) tond.ni_cnd.cn_flags |= WILLBEDIR; if ((error = namei(&tond)) != 0) { /* Translate error code for rename("dir1", "dir2/."). */ if (error == EISDIR && fvp->v_type == VDIR) error = EINVAL; NDFREE(&fromnd, NDF_ONLY_PNBUF); vrele(fromnd.ni_dvp); vrele(fvp); vn_finished_write(mp); goto out1; } tvfslocked = NDHASGIANT(&tond); tdvp = tond.ni_dvp; tvp = tond.ni_vp; if (tvp != NULL) { if (fvp->v_type == VDIR && tvp->v_type != VDIR) { error = ENOTDIR; goto out; } else if (fvp->v_type != VDIR && tvp->v_type == VDIR) { error = EISDIR; goto out; } } if (fvp == tdvp) error = EINVAL; /* * If the source is the same as the destination (that is, if they * are links to the same vnode), then there is nothing to do. */ if (fvp == tvp) error = -1; #ifdef MAC else error = mac_vnode_check_rename_to(td->td_ucred, tdvp, tond.ni_vp, fromnd.ni_dvp == tdvp, &tond.ni_cnd); #endif out: if (!error) { VOP_LEASE(tdvp, td, td->td_ucred, LEASE_WRITE); if (fromnd.ni_dvp != tdvp) { VOP_LEASE(fromnd.ni_dvp, td, td->td_ucred, LEASE_WRITE); } if (tvp) { VOP_LEASE(tvp, td, td->td_ucred, LEASE_WRITE); } error = VOP_RENAME(fromnd.ni_dvp, fromnd.ni_vp, &fromnd.ni_cnd, tond.ni_dvp, tond.ni_vp, &tond.ni_cnd); NDFREE(&fromnd, NDF_ONLY_PNBUF); NDFREE(&tond, NDF_ONLY_PNBUF); } else { NDFREE(&fromnd, NDF_ONLY_PNBUF); NDFREE(&tond, NDF_ONLY_PNBUF); if (tvp) vput(tvp); if (tdvp == tvp) vrele(tdvp); else vput(tdvp); vrele(fromnd.ni_dvp); vrele(fvp); } vrele(tond.ni_startdir); vn_finished_write(mp); out1: if (fromnd.ni_startdir) vrele(fromnd.ni_startdir); VFS_UNLOCK_GIANT(fvfslocked); VFS_UNLOCK_GIANT(tvfslocked); if (error == -1) return (0); return (error); } /* * Make a directory file. 
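kern_rename() above treats a source and destination that resolve to the same vnode as a no-op: it records error = -1 internally and converts that to success on return, and it rejects mixing directories with non-directories (ENOTDIR/EISDIR). A small illustration, assuming /tmp/a and /tmp/b are hypothetical names:

#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	/* Make "a" and "b" hard links to the same file. */
	if (link("/tmp/a", "/tmp/b") == -1)
		perror("link");

	/*
	 * Both names now reference one vnode, so the kernel's fvp == tvp
	 * check turns this rename into a successful no-op: both names
	 * remain afterwards.
	 */
	if (rename("/tmp/a", "/tmp/b") == 0)
		puts("rename succeeded; /tmp/a still exists");
	return (0);
}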
*/ #ifndef _SYS_SYSPROTO_H_ struct mkdir_args { char *path; int mode; }; #endif int mkdir(td, uap) struct thread *td; register struct mkdir_args /* { char *path; int mode; } */ *uap; { return (kern_mkdir(td, uap->path, UIO_USERSPACE, uap->mode)); } int kern_mkdir(struct thread *td, char *path, enum uio_seg segflg, int mode) { struct mount *mp; struct vnode *vp; struct vattr vattr; int error; struct nameidata nd; int vfslocked; AUDIT_ARG(mode, mode); restart: bwillwrite(); NDINIT(&nd, CREATE, LOCKPARENT | SAVENAME | MPSAFE | AUDITVNODE1, segflg, path, td); nd.ni_cnd.cn_flags |= WILLBEDIR; if ((error = namei(&nd)) != 0) return (error); vfslocked = NDHASGIANT(&nd); vp = nd.ni_vp; if (vp != NULL) { NDFREE(&nd, NDF_ONLY_PNBUF); /* * XXX namei called with LOCKPARENT but not LOCKLEAF has * the strange behaviour of leaving the vnode unlocked * if the target is the same vnode as the parent. */ if (vp == nd.ni_dvp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); vrele(vp); VFS_UNLOCK_GIANT(vfslocked); return (EEXIST); } if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); VFS_UNLOCK_GIANT(vfslocked); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) return (error); goto restart; } VATTR_NULL(&vattr); vattr.va_type = VDIR; FILEDESC_SLOCK(td->td_proc->p_fd); vattr.va_mode = (mode & ACCESSPERMS) &~ td->td_proc->p_fd->fd_cmask; FILEDESC_SUNLOCK(td->td_proc->p_fd); #ifdef MAC error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd, &vattr); if (error) goto out; #endif VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE); error = VOP_MKDIR(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr); #ifdef MAC out: #endif NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); if (!error) vput(nd.ni_vp); vn_finished_write(mp); VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * Remove a directory file. */ #ifndef _SYS_SYSPROTO_H_ struct rmdir_args { char *path; }; #endif int rmdir(td, uap) struct thread *td; struct rmdir_args /* { char *path; } */ *uap; { return (kern_rmdir(td, uap->path, UIO_USERSPACE)); } int kern_rmdir(struct thread *td, char *path, enum uio_seg pathseg) { struct mount *mp; struct vnode *vp; int error; struct nameidata nd; int vfslocked; restart: bwillwrite(); NDINIT(&nd, DELETE, LOCKPARENT | LOCKLEAF | MPSAFE | AUDITVNODE1, pathseg, path, td); if ((error = namei(&nd)) != 0) return (error); vfslocked = NDHASGIANT(&nd); vp = nd.ni_vp; if (vp->v_type != VDIR) { error = ENOTDIR; goto out; } /* * No rmdir "." please. */ if (nd.ni_dvp == vp) { error = EINVAL; goto out; } /* * The root of a mounted filesystem cannot be deleted. */ if (vp->v_vflag & VV_ROOT) { error = EBUSY; goto out; } #ifdef MAC error = mac_vnode_check_unlink(td->td_ucred, nd.ni_dvp, vp, &nd.ni_cnd); if (error) goto out; #endif if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); vput(vp); if (nd.ni_dvp == vp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); VFS_UNLOCK_GIANT(vfslocked); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) return (error); goto restart; } VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE); VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); error = VOP_RMDIR(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd); vn_finished_write(mp); out: NDFREE(&nd, NDF_ONLY_PNBUF); vput(vp); if (nd.ni_dvp == vp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); VFS_UNLOCK_GIANT(vfslocked); return (error); } #ifdef COMPAT_43 /* * Read a block of directory entries in a filesystem independent format. 
*/ #ifndef _SYS_SYSPROTO_H_ struct ogetdirentries_args { int fd; char *buf; u_int count; long *basep; }; #endif int ogetdirentries(td, uap) struct thread *td; register struct ogetdirentries_args /* { int fd; char *buf; u_int count; long *basep; } */ *uap; { struct vnode *vp; struct file *fp; struct uio auio, kuio; struct iovec aiov, kiov; struct dirent *dp, *edp; caddr_t dirbuf; int error, eofflag, readcnt, vfslocked; long loff; /* XXX arbitrary sanity limit on `count'. */ if (uap->count > 64 * 1024) return (EINVAL); if ((error = getvnode(td->td_proc->p_fd, uap->fd, &fp)) != 0) return (error); if ((fp->f_flag & FREAD) == 0) { fdrop(fp, td); return (EBADF); } vp = fp->f_vnode; unionread: vfslocked = VFS_LOCK_GIANT(vp->v_mount); if (vp->v_type != VDIR) { VFS_UNLOCK_GIANT(vfslocked); fdrop(fp, td); return (EINVAL); } aiov.iov_base = uap->buf; aiov.iov_len = uap->count; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_rw = UIO_READ; auio.uio_segflg = UIO_USERSPACE; auio.uio_td = td; auio.uio_resid = uap->count; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); loff = auio.uio_offset = fp->f_offset; #ifdef MAC error = mac_vnode_check_readdir(td->td_ucred, vp); if (error) { VOP_UNLOCK(vp, 0, td); VFS_UNLOCK_GIANT(vfslocked); fdrop(fp, td); return (error); } #endif # if (BYTE_ORDER != LITTLE_ENDIAN) if (vp->v_mount->mnt_maxsymlinklen <= 0) { error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, NULL, NULL); fp->f_offset = auio.uio_offset; } else # endif { kuio = auio; kuio.uio_iov = &kiov; kuio.uio_segflg = UIO_SYSSPACE; kiov.iov_len = uap->count; MALLOC(dirbuf, caddr_t, uap->count, M_TEMP, M_WAITOK); kiov.iov_base = dirbuf; error = VOP_READDIR(vp, &kuio, fp->f_cred, &eofflag, NULL, NULL); fp->f_offset = kuio.uio_offset; if (error == 0) { readcnt = uap->count - kuio.uio_resid; edp = (struct dirent *)&dirbuf[readcnt]; for (dp = (struct dirent *)dirbuf; dp < edp; ) { # if (BYTE_ORDER == LITTLE_ENDIAN) /* * The expected low byte of * dp->d_namlen is our dp->d_type. * The high MBZ byte of dp->d_namlen * is our dp->d_namlen. */ dp->d_type = dp->d_namlen; dp->d_namlen = 0; # else /* * The dp->d_type is the high byte * of the expected dp->d_namlen, * so must be zero'ed. */ dp->d_type = 0; # endif if (dp->d_reclen > 0) { dp = (struct dirent *) ((char *)dp + dp->d_reclen); } else { error = EIO; break; } } if (dp >= edp) error = uiomove(dirbuf, readcnt, &auio); } FREE(dirbuf, M_TEMP); } if (error) { VOP_UNLOCK(vp, 0, td); VFS_UNLOCK_GIANT(vfslocked); fdrop(fp, td); return (error); } if (uap->count == auio.uio_resid && (vp->v_vflag & VV_ROOT) && (vp->v_mount->mnt_flag & MNT_UNION)) { struct vnode *tvp = vp; vp = vp->v_mount->mnt_vnodecovered; VREF(vp); fp->f_vnode = vp; fp->f_data = vp; fp->f_offset = 0; vput(tvp); VFS_UNLOCK_GIANT(vfslocked); goto unionread; } VOP_UNLOCK(vp, 0, td); VFS_UNLOCK_GIANT(vfslocked); error = copyout(&loff, uap->basep, sizeof(long)); fdrop(fp, td); td->td_retval[0] = uap->count - auio.uio_resid; return (error); } #endif /* COMPAT_43 */ /* * Read a block of directory entries in a filesystem independent format. 
*/ #ifndef _SYS_SYSPROTO_H_ struct getdirentries_args { int fd; char *buf; u_int count; long *basep; }; #endif int getdirentries(td, uap) struct thread *td; register struct getdirentries_args /* { int fd; char *buf; u_int count; long *basep; } */ *uap; { struct vnode *vp; struct file *fp; struct uio auio; struct iovec aiov; int vfslocked; long loff; int error, eofflag; AUDIT_ARG(fd, uap->fd); if ((error = getvnode(td->td_proc->p_fd, uap->fd, &fp)) != 0) return (error); if ((fp->f_flag & FREAD) == 0) { fdrop(fp, td); return (EBADF); } vp = fp->f_vnode; unionread: vfslocked = VFS_LOCK_GIANT(vp->v_mount); if (vp->v_type != VDIR) { VFS_UNLOCK_GIANT(vfslocked); error = EINVAL; goto fail; } aiov.iov_base = uap->buf; aiov.iov_len = uap->count; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_rw = UIO_READ; auio.uio_segflg = UIO_USERSPACE; auio.uio_td = td; auio.uio_resid = uap->count; /* vn_lock(vp, LK_SHARED | LK_RETRY, td); */ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); AUDIT_ARG(vnode, vp, ARG_VNODE1); loff = auio.uio_offset = fp->f_offset; #ifdef MAC error = mac_vnode_check_readdir(td->td_ucred, vp); if (error == 0) #endif error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, NULL, NULL); fp->f_offset = auio.uio_offset; if (error) { VOP_UNLOCK(vp, 0, td); VFS_UNLOCK_GIANT(vfslocked); goto fail; } if (uap->count == auio.uio_resid && (vp->v_vflag & VV_ROOT) && (vp->v_mount->mnt_flag & MNT_UNION)) { struct vnode *tvp = vp; vp = vp->v_mount->mnt_vnodecovered; VREF(vp); fp->f_vnode = vp; fp->f_data = vp; fp->f_offset = 0; vput(tvp); VFS_UNLOCK_GIANT(vfslocked); goto unionread; } VOP_UNLOCK(vp, 0, td); VFS_UNLOCK_GIANT(vfslocked); if (uap->basep != NULL) { error = copyout(&loff, uap->basep, sizeof(long)); } td->td_retval[0] = uap->count - auio.uio_resid; fail: fdrop(fp, td); return (error); } #ifndef _SYS_SYSPROTO_H_ struct getdents_args { int fd; char *buf; size_t count; }; #endif int getdents(td, uap) struct thread *td; register struct getdents_args /* { int fd; char *buf; u_int count; } */ *uap; { struct getdirentries_args ap; ap.fd = uap->fd; ap.buf = uap->buf; ap.count = uap->count; ap.basep = NULL; return (getdirentries(td, &ap)); } /* * Set the mode mask for creation of filesystem nodes. */ #ifndef _SYS_SYSPROTO_H_ struct umask_args { int newmask; }; #endif int umask(td, uap) struct thread *td; struct umask_args /* { int newmask; } */ *uap; { register struct filedesc *fdp; FILEDESC_XLOCK(td->td_proc->p_fd); fdp = td->td_proc->p_fd; td->td_retval[0] = fdp->fd_cmask; fdp->fd_cmask = uap->newmask & ALLPERMS; FILEDESC_XUNLOCK(td->td_proc->p_fd); return (0); } /* * Void all references to file by ripping underlying filesystem away from * vnode. 
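getdirentries() above fills the user buffer with variable-length struct dirent records that are walked via d_reclen, and getdents() is the same call with a NULL base pointer. A hedged userland sketch that consumes the buffer the same way; the basep type follows the long * declaration used here, while newer libc prototypes use off_t *:

#include <sys/types.h>
#include <dirent.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	char buf[4096];
	long base;
	int fd, nbytes;

	fd = open("/tmp", O_RDONLY);	/* any directory will do */
	if (fd == -1) {
		perror("open");
		return (1);
	}
	while ((nbytes = getdirentries(fd, buf, sizeof(buf), &base)) > 0) {
		char *p = buf;

		/* Records are variable length; d_reclen advances to the next. */
		while (p < buf + nbytes) {
			struct dirent *dp = (struct dirent *)p;

			printf("%s\n", dp->d_name);
			p += dp->d_reclen;
		}
	}
	close(fd);
	return (0);
}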
*/ #ifndef _SYS_SYSPROTO_H_ struct revoke_args { char *path; }; #endif int revoke(td, uap) struct thread *td; register struct revoke_args /* { char *path; } */ *uap; { struct vnode *vp; struct vattr vattr; int error; struct nameidata nd; int vfslocked; NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | MPSAFE | AUDITVNODE1, UIO_USERSPACE, uap->path, td); if ((error = namei(&nd)) != 0) return (error); vfslocked = NDHASGIANT(&nd); vp = nd.ni_vp; NDFREE(&nd, NDF_ONLY_PNBUF); if (vp->v_type != VCHR) { error = EINVAL; goto out; } #ifdef MAC error = mac_vnode_check_revoke(td->td_ucred, vp); if (error) goto out; #endif error = VOP_GETATTR(vp, &vattr, td->td_ucred, td); if (error) goto out; if (td->td_ucred->cr_uid != vattr.va_uid) { error = priv_check(td, PRIV_VFS_ADMIN); if (error) goto out; } if (vcount(vp) > 1) VOP_REVOKE(vp, REVOKEALL); out: vput(vp); VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * Convert a user file descriptor to a kernel file entry. * A reference on the file entry is held upon returning. */ int getvnode(fdp, fd, fpp) struct filedesc *fdp; int fd; struct file **fpp; { int error; struct file *fp; fp = NULL; if (fdp == NULL) error = EBADF; else { FILEDESC_SLOCK(fdp); if ((u_int)fd >= fdp->fd_nfiles || (fp = fdp->fd_ofiles[fd]) == NULL) error = EBADF; else if (fp->f_vnode == NULL) { fp = NULL; error = EINVAL; } else { fhold(fp); error = 0; } FILEDESC_SUNLOCK(fdp); } *fpp = fp; return (error); } /* * Get an (NFS) file handle. */ #ifndef _SYS_SYSPROTO_H_ struct lgetfh_args { char *fname; fhandle_t *fhp; }; #endif int lgetfh(td, uap) struct thread *td; register struct lgetfh_args *uap; { struct nameidata nd; fhandle_t fh; register struct vnode *vp; int vfslocked; int error; error = priv_check(td, PRIV_VFS_GETFH); if (error) return (error); NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | MPSAFE | AUDITVNODE1, UIO_USERSPACE, uap->fname, td); error = namei(&nd); if (error) return (error); vfslocked = NDHASGIANT(&nd); NDFREE(&nd, NDF_ONLY_PNBUF); vp = nd.ni_vp; bzero(&fh, sizeof(fh)); fh.fh_fsid = vp->v_mount->mnt_stat.f_fsid; error = VOP_VPTOFH(vp, &fh.fh_fid); vput(vp); VFS_UNLOCK_GIANT(vfslocked); if (error) return (error); error = copyout(&fh, uap->fhp, sizeof (fh)); return (error); } #ifndef _SYS_SYSPROTO_H_ struct getfh_args { char *fname; fhandle_t *fhp; }; #endif int getfh(td, uap) struct thread *td; register struct getfh_args *uap; { struct nameidata nd; fhandle_t fh; register struct vnode *vp; int vfslocked; int error; error = priv_check(td, PRIV_VFS_GETFH); if (error) return (error); NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | MPSAFE | AUDITVNODE1, UIO_USERSPACE, uap->fname, td); error = namei(&nd); if (error) return (error); vfslocked = NDHASGIANT(&nd); NDFREE(&nd, NDF_ONLY_PNBUF); vp = nd.ni_vp; bzero(&fh, sizeof(fh)); fh.fh_fsid = vp->v_mount->mnt_stat.f_fsid; error = VOP_VPTOFH(vp, &fh.fh_fid); vput(vp); VFS_UNLOCK_GIANT(vfslocked); if (error) return (error); error = copyout(&fh, uap->fhp, sizeof (fh)); return (error); } /* * syscall for the rpc.lockd to use to translate a NFS file handle into an * open descriptor. * * warning: do not remove the priv_check() call or this becomes one giant * security hole. 
*/ #ifndef _SYS_SYSPROTO_H_ struct fhopen_args { const struct fhandle *u_fhp; int flags; }; #endif int fhopen(td, uap) struct thread *td; struct fhopen_args /* { const struct fhandle *u_fhp; int flags; } */ *uap; { struct proc *p = td->td_proc; struct mount *mp; struct vnode *vp; struct fhandle fhp; struct vattr vat; struct vattr *vap = &vat; struct flock lf; struct file *fp; register struct filedesc *fdp = p->p_fd; int fmode, mode, error, type; struct file *nfp; int vfslocked; int indx; error = priv_check(td, PRIV_VFS_FHOPEN); if (error) return (error); fmode = FFLAGS(uap->flags); /* why not allow a non-read/write open for our lockd? */ if (((fmode & (FREAD | FWRITE)) == 0) || (fmode & O_CREAT)) return (EINVAL); error = copyin(uap->u_fhp, &fhp, sizeof(fhp)); if (error) return(error); /* find the mount point */ mp = vfs_getvfs(&fhp.fh_fsid); if (mp == NULL) return (ESTALE); vfslocked = VFS_LOCK_GIANT(mp); /* now give me my vnode, it gets returned to me locked */ error = VFS_FHTOVP(mp, &fhp.fh_fid, &vp); if (error) goto out; /* * from now on we have to make sure not * to forget about the vnode * any error that causes an abort must vput(vp) * just set error = err and 'goto bad;'. */ /* * from vn_open */ if (vp->v_type == VLNK) { error = EMLINK; goto bad; } if (vp->v_type == VSOCK) { error = EOPNOTSUPP; goto bad; } mode = 0; if (fmode & (FWRITE | O_TRUNC)) { if (vp->v_type == VDIR) { error = EISDIR; goto bad; } error = vn_writechk(vp); if (error) goto bad; mode |= VWRITE; } if (fmode & FREAD) mode |= VREAD; if (fmode & O_APPEND) mode |= VAPPEND; #ifdef MAC error = mac_vnode_check_open(td->td_ucred, vp, mode); if (error) goto bad; #endif if (mode) { error = VOP_ACCESS(vp, mode, td->td_ucred, td); if (error) goto bad; } if (fmode & O_TRUNC) { VOP_UNLOCK(vp, 0, td); /* XXX */ if ((error = vn_start_write(NULL, &mp, V_WAIT | PCATCH)) != 0) { vrele(vp); goto out; } VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); /* XXX */ #ifdef MAC /* * We don't yet have fp->f_cred, so use td->td_ucred, which * should be right. */ error = mac_vnode_check_write(td->td_ucred, td->td_ucred, vp); if (error == 0) { #endif VATTR_NULL(vap); vap->va_size = 0; error = VOP_SETATTR(vp, vap, td->td_ucred, td); #ifdef MAC } #endif vn_finished_write(mp); if (error) goto bad; } error = VOP_OPEN(vp, fmode, td->td_ucred, td, NULL); if (error) goto bad; if (fmode & FWRITE) vp->v_writecount++; /* * end of vn_open code */ if ((error = falloc(td, &nfp, &indx)) != 0) { if (fmode & FWRITE) vp->v_writecount--; goto bad; } /* An extra reference on `nfp' has been held for us by falloc(). */ fp = nfp; - - FILE_LOCK(nfp); nfp->f_vnode = vp; - nfp->f_data = vp; - nfp->f_flag = fmode & FMASK; - nfp->f_type = DTYPE_VNODE; - nfp->f_ops = &vnops; - FILE_UNLOCK(nfp); + finit(nfp, fmode & FMASK, DTYPE_VNODE, vp, &vnops); if (fmode & (O_EXLOCK | O_SHLOCK)) { lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; if (fmode & O_EXLOCK) lf.l_type = F_WRLCK; else lf.l_type = F_RDLCK; type = F_FLOCK; if ((fmode & FNONBLOCK) == 0) type |= F_WAIT; VOP_UNLOCK(vp, 0, td); if ((error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, type)) != 0) { /* * The lock request failed. Normally close the * descriptor but handle the case where someone might * have dup()d or close()d it when we weren't looking. 
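The substantive change in this fhopen() hunk (and in the FHASLOCK update just below) replaces the open-coded FILE_LOCK/assignment/FILE_UNLOCK sequence with a single finit() call and switches the later f_flag |= FHASLOCK to atomic_set_int(). Based only on the fields the removed lines set, a hedged sketch of what the helper is expected to cover; the real finit() lives elsewhere in the kernel, so treat both the name finit_sketch and the body as assumptions:

/*
 * Sketch of the initialization finit() now performs for fhopen(); it is
 * reconstructed from the assignments the diff removes, not copied from
 * the actual kernel implementation.
 */
static void
finit_sketch(struct file *fp, u_int flag, short type, void *data,
    struct fileops *ops)
{
	fp->f_data = data;	/* the vnode, for DTYPE_VNODE files */
	fp->f_flag = flag;	/* fmode & FMASK */
	fp->f_type = type;	/* DTYPE_VNODE */
	fp->f_ops = ops;	/* &vnops */
}

Centralizing the field initialization and updating f_flag with atomic_set_int() appear to point the same way: this path no longer relies on the per-file mutex to protect those fields.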
*/ fdclose(fdp, fp, indx, td); /* * release our private reference */ fdrop(fp, td); goto out; } vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); - fp->f_flag |= FHASLOCK; + atomic_set_int(&fp->f_flag, FHASLOCK); } VOP_UNLOCK(vp, 0, td); fdrop(fp, td); vfs_rel(mp); VFS_UNLOCK_GIANT(vfslocked); td->td_retval[0] = indx; return (0); bad: vput(vp); out: vfs_rel(mp); VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * Stat an (NFS) file handle. */ #ifndef _SYS_SYSPROTO_H_ struct fhstat_args { struct fhandle *u_fhp; struct stat *sb; }; #endif int fhstat(td, uap) struct thread *td; register struct fhstat_args /* { struct fhandle *u_fhp; struct stat *sb; } */ *uap; { struct stat sb; fhandle_t fh; struct mount *mp; struct vnode *vp; int vfslocked; int error; error = priv_check(td, PRIV_VFS_FHSTAT); if (error) return (error); error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t)); if (error) return (error); if ((mp = vfs_getvfs(&fh.fh_fsid)) == NULL) return (ESTALE); vfslocked = VFS_LOCK_GIANT(mp); if ((error = VFS_FHTOVP(mp, &fh.fh_fid, &vp))) { vfs_rel(mp); VFS_UNLOCK_GIANT(vfslocked); return (error); } error = vn_stat(vp, &sb, td->td_ucred, NOCRED, td); vput(vp); vfs_rel(mp); VFS_UNLOCK_GIANT(vfslocked); if (error) return (error); error = copyout(&sb, uap->sb, sizeof(sb)); return (error); } /* * Implement fstatfs() for (NFS) file handles. */ #ifndef _SYS_SYSPROTO_H_ struct fhstatfs_args { struct fhandle *u_fhp; struct statfs *buf; }; #endif int fhstatfs(td, uap) struct thread *td; struct fhstatfs_args /* { struct fhandle *u_fhp; struct statfs *buf; } */ *uap; { struct statfs sf; fhandle_t fh; int error; error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t)); if (error) return (error); error = kern_fhstatfs(td, fh, &sf); if (error) return (error); return (copyout(&sf, uap->buf, sizeof(sf))); } int kern_fhstatfs(struct thread *td, fhandle_t fh, struct statfs *buf) { struct statfs *sp; struct mount *mp; struct vnode *vp; int vfslocked; int error; error = priv_check(td, PRIV_VFS_FHSTATFS); if (error) return (error); if ((mp = vfs_getvfs(&fh.fh_fsid)) == NULL) return (ESTALE); vfslocked = VFS_LOCK_GIANT(mp); error = VFS_FHTOVP(mp, &fh.fh_fid, &vp); if (error) { VFS_UNLOCK_GIANT(vfslocked); vfs_rel(mp); return (error); } vput(vp); error = prison_canseemount(td->td_ucred, mp); if (error) goto out; #ifdef MAC error = mac_mount_check_stat(td->td_ucred, mp); if (error) goto out; #endif /* * Set these in case the underlying filesystem fails to do so. */ sp = &mp->mnt_stat; sp->f_version = STATFS_VERSION; sp->f_namemax = NAME_MAX; sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; error = VFS_STATFS(mp, sp, td); if (error == 0) *buf = *sp; out: vfs_rel(mp); VFS_UNLOCK_GIANT(vfslocked); return (error); } Index: head/sys/kern/vfs_vnops.c =================================================================== --- head/sys/kern/vfs_vnops.c (revision 174987) +++ head/sys/kern/vfs_vnops.c (revision 174988) @@ -1,1252 +1,1256 @@ /*- * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vfs_vnops.c 8.2 (Berkeley) 1/21/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_mac.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static fo_rdwr_t vn_read; static fo_rdwr_t vn_write; static fo_ioctl_t vn_ioctl; static fo_poll_t vn_poll; static fo_kqfilter_t vn_kqfilter; static fo_stat_t vn_statfile; static fo_close_t vn_closefile; struct fileops vnops = { .fo_read = vn_read, .fo_write = vn_write, .fo_ioctl = vn_ioctl, .fo_poll = vn_poll, .fo_kqfilter = vn_kqfilter, .fo_stat = vn_statfile, .fo_close = vn_closefile, .fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE }; int vn_open(ndp, flagp, cmode, fp) struct nameidata *ndp; int *flagp, cmode; struct file *fp; { struct thread *td = ndp->ni_cnd.cn_thread; return (vn_open_cred(ndp, flagp, cmode, td->td_ucred, fp)); } /* * Common code for vnode open operations. * Check permissions, and call the VOP_OPEN or VOP_CREATE routine. * * Note that this does NOT free nameidata for the successful case, * due to the NDINIT being done elsewhere. 
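/*
 * Illustrative sketch, not from the original source: vn_open() expects a
 * nameidata prepared with NDINIT(), and as the comment above notes it does
 * not free the path buffer on success.  A hypothetical in-kernel caller
 * following the usual pattern of this era (example_open_path is a made-up
 * name):
 */
static int
example_open_path(const char *path, struct thread *td, struct vnode **vpp)
{
	struct nameidata nd;
	int flags, error;

	flags = FREAD;
	NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, path, td);
	if ((error = vn_open(&nd, &flags, 0, NULL)) != 0)
		return (error);
	NDFREE(&nd, NDF_ONLY_PNBUF);	/* caller frees the path buffer */
	*vpp = nd.ni_vp;		/* returned locked by vn_open() */
	VOP_UNLOCK(nd.ni_vp, 0, td);
	return (0);
}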
*/ int vn_open_cred(ndp, flagp, cmode, cred, fp) struct nameidata *ndp; int *flagp, cmode; struct ucred *cred; struct file *fp; { struct vnode *vp; struct mount *mp; struct thread *td = ndp->ni_cnd.cn_thread; struct vattr vat; struct vattr *vap = &vat; int mode, fmode, error; int vfslocked, mpsafe; mpsafe = ndp->ni_cnd.cn_flags & MPSAFE; restart: vfslocked = 0; fmode = *flagp; if (fmode & O_CREAT) { ndp->ni_cnd.cn_nameiop = CREATE; ndp->ni_cnd.cn_flags = ISOPEN | LOCKPARENT | LOCKLEAF | MPSAFE | AUDITVNODE1; if ((fmode & O_EXCL) == 0 && (fmode & O_NOFOLLOW) == 0) ndp->ni_cnd.cn_flags |= FOLLOW; bwillwrite(); if ((error = namei(ndp)) != 0) return (error); vfslocked = NDHASGIANT(ndp); if (!mpsafe) ndp->ni_cnd.cn_flags &= ~MPSAFE; if (ndp->ni_vp == NULL) { VATTR_NULL(vap); vap->va_type = VREG; vap->va_mode = cmode; if (fmode & O_EXCL) vap->va_vaflags |= VA_EXCLUSIVE; if (vn_start_write(ndp->ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE(ndp, NDF_ONLY_PNBUF); vput(ndp->ni_dvp); VFS_UNLOCK_GIANT(vfslocked); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) return (error); goto restart; } #ifdef MAC error = mac_vnode_check_create(cred, ndp->ni_dvp, &ndp->ni_cnd, vap); if (error == 0) { #endif VOP_LEASE(ndp->ni_dvp, td, cred, LEASE_WRITE); error = VOP_CREATE(ndp->ni_dvp, &ndp->ni_vp, &ndp->ni_cnd, vap); #ifdef MAC } #endif vput(ndp->ni_dvp); vn_finished_write(mp); if (error) { VFS_UNLOCK_GIANT(vfslocked); NDFREE(ndp, NDF_ONLY_PNBUF); return (error); } fmode &= ~O_TRUNC; vp = ndp->ni_vp; } else { if (ndp->ni_dvp == ndp->ni_vp) vrele(ndp->ni_dvp); else vput(ndp->ni_dvp); ndp->ni_dvp = NULL; vp = ndp->ni_vp; if (fmode & O_EXCL) { error = EEXIST; goto bad; } fmode &= ~O_CREAT; } } else { ndp->ni_cnd.cn_nameiop = LOOKUP; ndp->ni_cnd.cn_flags = ISOPEN | ((fmode & O_NOFOLLOW) ? NOFOLLOW : FOLLOW) | LOCKLEAF | MPSAFE | AUDITVNODE1; if ((error = namei(ndp)) != 0) return (error); if (!mpsafe) ndp->ni_cnd.cn_flags &= ~MPSAFE; vfslocked = NDHASGIANT(ndp); vp = ndp->ni_vp; } if (vp->v_type == VLNK) { error = EMLINK; goto bad; } if (vp->v_type == VSOCK) { error = EOPNOTSUPP; goto bad; } mode = 0; if (fmode & (FWRITE | O_TRUNC)) { if (vp->v_type == VDIR) { error = EISDIR; goto bad; } mode |= VWRITE; } if (fmode & FREAD) mode |= VREAD; if (fmode & O_APPEND) mode |= VAPPEND; #ifdef MAC error = mac_vnode_check_open(cred, vp, mode); if (error) goto bad; #endif if ((fmode & O_CREAT) == 0) { if (mode & VWRITE) { error = vn_writechk(vp); if (error) goto bad; } if (mode) { error = VOP_ACCESS(vp, mode, cred, td); if (error) goto bad; } } if ((error = VOP_OPEN(vp, fmode, cred, td, fp)) != 0) goto bad; if (fmode & FWRITE) vp->v_writecount++; *flagp = fmode; ASSERT_VOP_ELOCKED(vp, "vn_open_cred"); if (!mpsafe) VFS_UNLOCK_GIANT(vfslocked); return (0); bad: NDFREE(ndp, NDF_ONLY_PNBUF); vput(vp); VFS_UNLOCK_GIANT(vfslocked); *flagp = fmode; ndp->ni_vp = NULL; return (error); } /* * Check for write permissions on the specified vnode. * Prototype text segments cannot be written. */ int vn_writechk(vp) register struct vnode *vp; { ASSERT_VOP_LOCKED(vp, "vn_writechk"); /* * If there's shared text associated with * the vnode, try to free it up once. If * we fail, we can't allow writing. 
*/ if (vp->v_vflag & VV_TEXT) return (ETXTBSY); return (0); } /* * Vnode close call */ int vn_close(vp, flags, file_cred, td) register struct vnode *vp; int flags; struct ucred *file_cred; struct thread *td; { struct mount *mp; int error; VFS_ASSERT_GIANT(vp->v_mount); vn_start_write(vp, &mp, V_WAIT); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); if (flags & FWRITE) { VNASSERT(vp->v_writecount > 0, vp, ("vn_close: negative writecount")); vp->v_writecount--; } error = VOP_CLOSE(vp, flags, file_cred, td); vput(vp); vn_finished_write(mp); return (error); } /* * Sequential heuristic - detect sequential operation */ static __inline int sequential_heuristic(struct uio *uio, struct file *fp) { if ((uio->uio_offset == 0 && fp->f_seqcount > 0) || uio->uio_offset == fp->f_nextoff) { /* * XXX we assume that the filesystem block size is * the default. Not true, but still gives us a pretty * good indicator of how sequential the read operations * are. */ fp->f_seqcount += (uio->uio_resid + BKVASIZE - 1) / BKVASIZE; if (fp->f_seqcount > IO_SEQMAX) fp->f_seqcount = IO_SEQMAX; return(fp->f_seqcount << IO_SEQSHIFT); } /* * Not sequential, quick draw-down of seqcount */ if (fp->f_seqcount > 1) fp->f_seqcount = 1; else fp->f_seqcount = 0; return(0); } /* * Package up an I/O request on a vnode into a uio and do it. */ int vn_rdwr(rw, vp, base, len, offset, segflg, ioflg, active_cred, file_cred, aresid, td) enum uio_rw rw; struct vnode *vp; void *base; int len; off_t offset; enum uio_seg segflg; int ioflg; struct ucred *active_cred; struct ucred *file_cred; int *aresid; struct thread *td; { struct uio auio; struct iovec aiov; struct mount *mp; struct ucred *cred; int error; VFS_ASSERT_GIANT(vp->v_mount); if ((ioflg & IO_NODELOCKED) == 0) { mp = NULL; if (rw == UIO_WRITE) { if (vp->v_type != VCHR && (error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) return (error); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); } else { /* * XXX This should be LK_SHARED but I don't trust VFS * enough to leave it like that until it has been * reviewed further. */ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); } } ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held"); auio.uio_iov = &aiov; auio.uio_iovcnt = 1; aiov.iov_base = base; aiov.iov_len = len; auio.uio_resid = len; auio.uio_offset = offset; auio.uio_segflg = segflg; auio.uio_rw = rw; auio.uio_td = td; error = 0; #ifdef MAC if ((ioflg & IO_NOMACCHECK) == 0) { if (rw == UIO_READ) error = mac_vnode_check_read(active_cred, file_cred, vp); else error = mac_vnode_check_write(active_cred, file_cred, vp); } #endif if (error == 0) { if (file_cred) cred = file_cred; else cred = active_cred; if (rw == UIO_READ) error = VOP_READ(vp, &auio, ioflg, cred); else error = VOP_WRITE(vp, &auio, ioflg, cred); } if (aresid) *aresid = auio.uio_resid; else if (auio.uio_resid && error == 0) error = EIO; if ((ioflg & IO_NODELOCKED) == 0) { if (rw == UIO_WRITE && vp->v_type != VCHR) vn_finished_write(mp); VOP_UNLOCK(vp, 0, td); } return (error); } /* * Package up an I/O request on a vnode into a uio and do it. The I/O * request is split up into smaller chunks and we try to avoid saturating * the buffer cache while potentially holding a vnode locked, so we * check bwillwrite() before calling vn_rdwr(). We also call uio_yield() * to give other processes a chance to lock the vnode (either other processes * core'ing the same binary, or unrelated processes scanning the directory). 
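/*
 * Illustrative sketch, not part of this change: vn_rdwr() packages a
 * single kernel I/O request into a uio, using the argument order defined
 * above.  A hypothetical one-shot read into a kernel buffer
 * (example_read_file is a made-up name):
 */
static int
example_read_file(struct vnode *vp, void *buf, int len, off_t off,
    struct ucred *cred, struct thread *td)
{
	int resid, error;

	error = vn_rdwr(UIO_READ, vp, buf, len, off, UIO_SYSSPACE,
	    0, cred, NOCRED, &resid, td);
	if (error == 0 && resid != 0)
		error = EIO;		/* treat a short read as an error */
	return (error);
}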
*/ int vn_rdwr_inchunks(rw, vp, base, len, offset, segflg, ioflg, active_cred, file_cred, aresid, td) enum uio_rw rw; struct vnode *vp; void *base; size_t len; off_t offset; enum uio_seg segflg; int ioflg; struct ucred *active_cred; struct ucred *file_cred; size_t *aresid; struct thread *td; { int error = 0; int iaresid; VFS_ASSERT_GIANT(vp->v_mount); do { int chunk; /* * Force `offset' to a multiple of MAXBSIZE except possibly * for the first chunk, so that filesystems only need to * write full blocks except possibly for the first and last * chunks. */ chunk = MAXBSIZE - (uoff_t)offset % MAXBSIZE; if (chunk > len) chunk = len; if (rw != UIO_READ && vp->v_type == VREG) bwillwrite(); iaresid = 0; error = vn_rdwr(rw, vp, base, chunk, offset, segflg, ioflg, active_cred, file_cred, &iaresid, td); len -= chunk; /* aresid calc already includes length */ if (error) break; offset += chunk; base = (char *)base + chunk; uio_yield(); } while (len); if (aresid) *aresid = len + iaresid; return (error); } /* * File table vnode read routine. */ static int vn_read(fp, uio, active_cred, flags, td) struct file *fp; struct uio *uio; struct ucred *active_cred; struct thread *td; int flags; { struct vnode *vp; int error, ioflag; + struct mtx *mtxp; int vfslocked; KASSERT(uio->uio_td == td, ("uio_td %p is not td %p", uio->uio_td, td)); + mtxp = NULL; vp = fp->f_vnode; ioflag = 0; if (fp->f_flag & FNONBLOCK) ioflag |= IO_NDELAY; if (fp->f_flag & O_DIRECT) ioflag |= IO_DIRECT; vfslocked = VFS_LOCK_GIANT(vp->v_mount); VOP_LEASE(vp, td, fp->f_cred, LEASE_READ); /* * According to McKusick the vn lock was protecting f_offset here. * It is now protected by the FOFFSET_LOCKED flag. */ if ((flags & FOF_OFFSET) == 0) { - FILE_LOCK(fp); + mtxp = mtx_pool_find(mtxpool_sleep, fp); + mtx_lock(mtxp); while(fp->f_vnread_flags & FOFFSET_LOCKED) { fp->f_vnread_flags |= FOFFSET_LOCK_WAITING; - msleep(&fp->f_vnread_flags,fp->f_mtxp,PUSER -1,"vnread offlock",0); + msleep(&fp->f_vnread_flags, mtxp, PUSER -1, + "vnread offlock", 0); } fp->f_vnread_flags |= FOFFSET_LOCKED; - FILE_UNLOCK(fp); + mtx_unlock(mtxp); vn_lock(vp, LK_SHARED | LK_RETRY, td); uio->uio_offset = fp->f_offset; } else vn_lock(vp, LK_SHARED | LK_RETRY, td); ioflag |= sequential_heuristic(uio, fp); #ifdef MAC error = mac_vnode_check_read(active_cred, fp->f_cred, vp); if (error == 0) #endif error = VOP_READ(vp, uio, ioflag, fp->f_cred); if ((flags & FOF_OFFSET) == 0) { fp->f_offset = uio->uio_offset; - FILE_LOCK(fp); + mtx_lock(mtxp); if (fp->f_vnread_flags & FOFFSET_LOCK_WAITING) wakeup(&fp->f_vnread_flags); fp->f_vnread_flags = 0; - FILE_UNLOCK(fp); + mtx_unlock(mtxp); } fp->f_nextoff = uio->uio_offset; VOP_UNLOCK(vp, 0, td); VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * File table vnode write routine. 
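/*
 * Illustrative sketch, not part of the original diff: the vn_read() hunks
 * above replace FILE_LOCK()/FILE_UNLOCK() with a mutex borrowed from the
 * shared sleep-mutex pool, keyed on the file pointer, to serialize
 * f_offset updates.  The locking skeleton on its own
 * (example_foffset_lock is a made-up name):
 */
static void
example_foffset_lock(struct file *fp)
{
	struct mtx *mtxp;

	/* The pool hands back the same mutex for the same pointer. */
	mtxp = mtx_pool_find(mtxpool_sleep, fp);
	mtx_lock(mtxp);
	while (fp->f_vnread_flags & FOFFSET_LOCKED) {
		fp->f_vnread_flags |= FOFFSET_LOCK_WAITING;
		msleep(&fp->f_vnread_flags, mtxp, PUSER - 1, "offlock", 0);
	}
	fp->f_vnread_flags |= FOFFSET_LOCKED;
	mtx_unlock(mtxp);
}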
*/ static int vn_write(fp, uio, active_cred, flags, td) struct file *fp; struct uio *uio; struct ucred *active_cred; struct thread *td; int flags; { struct vnode *vp; struct mount *mp; int error, ioflag; int vfslocked; KASSERT(uio->uio_td == td, ("uio_td %p is not td %p", uio->uio_td, td)); vp = fp->f_vnode; vfslocked = VFS_LOCK_GIANT(vp->v_mount); if (vp->v_type == VREG) bwillwrite(); ioflag = IO_UNIT; if (vp->v_type == VREG && (fp->f_flag & O_APPEND)) ioflag |= IO_APPEND; if (fp->f_flag & FNONBLOCK) ioflag |= IO_NDELAY; if (fp->f_flag & O_DIRECT) ioflag |= IO_DIRECT; if ((fp->f_flag & O_FSYNC) || (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS))) ioflag |= IO_SYNC; mp = NULL; if (vp->v_type != VCHR && (error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) goto unlock; VOP_LEASE(vp, td, fp->f_cred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); if ((flags & FOF_OFFSET) == 0) uio->uio_offset = fp->f_offset; ioflag |= sequential_heuristic(uio, fp); #ifdef MAC error = mac_vnode_check_write(active_cred, fp->f_cred, vp); if (error == 0) #endif error = VOP_WRITE(vp, uio, ioflag, fp->f_cred); if ((flags & FOF_OFFSET) == 0) fp->f_offset = uio->uio_offset; fp->f_nextoff = uio->uio_offset; VOP_UNLOCK(vp, 0, td); if (vp->v_type != VCHR) vn_finished_write(mp); unlock: VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * File table vnode stat routine. */ static int vn_statfile(fp, sb, active_cred, td) struct file *fp; struct stat *sb; struct ucred *active_cred; struct thread *td; { struct vnode *vp = fp->f_vnode; int vfslocked; int error; vfslocked = VFS_LOCK_GIANT(vp->v_mount); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); error = vn_stat(vp, sb, active_cred, fp->f_cred, td); VOP_UNLOCK(vp, 0, td); VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * Stat a vnode; implementation for the stat syscall */ int vn_stat(vp, sb, active_cred, file_cred, td) struct vnode *vp; register struct stat *sb; struct ucred *active_cred; struct ucred *file_cred; struct thread *td; { struct vattr vattr; register struct vattr *vap; int error; u_short mode; #ifdef MAC error = mac_vnode_check_stat(active_cred, file_cred, vp); if (error) return (error); #endif vap = &vattr; error = VOP_GETATTR(vp, vap, active_cred, td); if (error) return (error); /* * Zero the spare stat fields */ bzero(sb, sizeof *sb); /* * Copy from vattr table */ if (vap->va_fsid != VNOVAL) sb->st_dev = vap->va_fsid; else sb->st_dev = vp->v_mount->mnt_stat.f_fsid.val[0]; sb->st_ino = vap->va_fileid; mode = vap->va_mode; switch (vap->va_type) { case VREG: mode |= S_IFREG; break; case VDIR: mode |= S_IFDIR; break; case VBLK: mode |= S_IFBLK; break; case VCHR: mode |= S_IFCHR; break; case VLNK: mode |= S_IFLNK; /* This is a cosmetic change, symlinks do not have a mode. */ if (vp->v_mount->mnt_flag & MNT_NOSYMFOLLOW) sb->st_mode &= ~ACCESSPERMS; /* 0000 */ else sb->st_mode |= ACCESSPERMS; /* 0777 */ break; case VSOCK: mode |= S_IFSOCK; break; case VFIFO: mode |= S_IFIFO; break; default: return (EBADF); }; sb->st_mode = mode; sb->st_nlink = vap->va_nlink; sb->st_uid = vap->va_uid; sb->st_gid = vap->va_gid; sb->st_rdev = vap->va_rdev; if (vap->va_size > OFF_MAX) return (EOVERFLOW); sb->st_size = vap->va_size; sb->st_atimespec = vap->va_atime; sb->st_mtimespec = vap->va_mtime; sb->st_ctimespec = vap->va_ctime; sb->st_birthtimespec = vap->va_birthtime; /* * According to www.opengroup.org, the meaning of st_blksize is * "a filesystem-specific preferred I/O block size for this * object. 
In some filesystem types, this may vary from file * to file" * Default to PAGE_SIZE after much discussion. * XXX: min(PAGE_SIZE, vp->v_bufobj.bo_bsize) may be more correct. */ sb->st_blksize = PAGE_SIZE; sb->st_flags = vap->va_flags; if (priv_check(td, PRIV_VFS_GENERATION)) sb->st_gen = 0; else sb->st_gen = vap->va_gen; sb->st_blocks = vap->va_bytes / S_BLKSIZE; return (0); } /* * File table vnode ioctl routine. */ static int vn_ioctl(fp, com, data, active_cred, td) struct file *fp; u_long com; void *data; struct ucred *active_cred; struct thread *td; { struct vnode *vp = fp->f_vnode; struct vattr vattr; int vfslocked; int error; vfslocked = VFS_LOCK_GIANT(vp->v_mount); error = ENOTTY; switch (vp->v_type) { case VREG: case VDIR: if (com == FIONREAD) { vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); error = VOP_GETATTR(vp, &vattr, active_cred, td); VOP_UNLOCK(vp, 0, td); if (!error) *(int *)data = vattr.va_size - fp->f_offset; } if (com == FIONBIO || com == FIOASYNC) /* XXX */ error = 0; else error = VOP_IOCTL(vp, com, data, fp->f_flag, active_cred, td); break; default: break; } VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * File table vnode poll routine. */ static int vn_poll(fp, events, active_cred, td) struct file *fp; int events; struct ucred *active_cred; struct thread *td; { struct vnode *vp; int vfslocked; int error; vp = fp->f_vnode; vfslocked = VFS_LOCK_GIANT(vp->v_mount); #ifdef MAC vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); error = mac_vnode_check_poll(active_cred, fp->f_cred, vp); VOP_UNLOCK(vp, 0, td); if (!error) #endif error = VOP_POLL(vp, events, fp->f_cred, td); VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * Check that the vnode is still valid, and if so * acquire requested lock. */ int _vn_lock(struct vnode *vp, int flags, struct thread *td, char *file, int line) { int error; do { if ((flags & LK_INTERLOCK) == 0) VI_LOCK(vp); if ((flags & LK_NOWAIT || (flags & LK_TYPE_MASK) == 0) && vp->v_iflag & VI_DOOMED) { VI_UNLOCK(vp); return (ENOENT); } /* * Just polling to check validity. */ if ((flags & LK_TYPE_MASK) == 0) { VI_UNLOCK(vp); return (0); } /* * lockmgr drops interlock before it will return for * any reason. So force the code above to relock it. */ error = VOP_LOCK1(vp, flags | LK_INTERLOCK, td, file, line); flags &= ~LK_INTERLOCK; KASSERT((flags & LK_RETRY) == 0 || error == 0, ("LK_RETRY set with incompatible flags %d\n", flags)); /* * Callers specify LK_RETRY if they wish to get dead vnodes. * If RETRY is not set, we return ENOENT instead. */ if (error == 0 && vp->v_iflag & VI_DOOMED && (flags & LK_RETRY) == 0) { VOP_UNLOCK(vp, 0, td); error = ENOENT; break; } } while (flags & LK_RETRY && error != 0); return (error); } /* * File table vnode close routine. */ static int vn_closefile(fp, td) struct file *fp; struct thread *td; { struct vnode *vp; struct flock lf; int vfslocked; int error; vp = fp->f_vnode; vfslocked = VFS_LOCK_GIANT(vp->v_mount); if (fp->f_type == DTYPE_VNODE && fp->f_flag & FHASLOCK) { lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; lf.l_type = F_UNLCK; (void) VOP_ADVLOCK(vp, fp, F_UNLCK, &lf, F_FLOCK); } fp->f_ops = &badfileops; error = vn_close(vp, fp->f_flag, fp->f_cred, td); VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * Preparing to start a filesystem write operation. If the operation is * permitted, then we bump the count of operations in progress and * proceed. If a suspend request is in progress, we wait until the * suspension is over, and then proceed. 
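/*
 * Illustrative sketch, not from the original source: writers bracket
 * filesystem-modifying work with vn_start_write()/vn_finished_write()
 * (defined just below) so that a suspension can drain them, the same way
 * fhopen() and vn_extattr_set() do in this change.  Reduced pattern, with
 * a made-up caller name:
 */
static int
example_modify_vnode(struct vnode *vp, struct thread *td)
{
	struct mount *mp;
	int error;

	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
		return (error);
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
	/* ... the actual VOP_WRITE()/VOP_SETATTR() style work goes here ... */
	VOP_UNLOCK(vp, 0, td);
	vn_finished_write(mp);
	return (error);
}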
*/ int vn_start_write(vp, mpp, flags) struct vnode *vp; struct mount **mpp; int flags; { struct mount *mp; int error; error = 0; /* * If a vnode is provided, get and return the mount point that * to which it will write. */ if (vp != NULL) { if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) { *mpp = NULL; if (error != EOPNOTSUPP) return (error); return (0); } } if ((mp = *mpp) == NULL) return (0); MNT_ILOCK(mp); if (vp == NULL) MNT_REF(mp); /* * Check on status of suspension. */ while ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) { if (flags & V_NOWAIT) { error = EWOULDBLOCK; goto unlock; } error = msleep(&mp->mnt_flag, MNT_MTX(mp), (PUSER - 1) | (flags & PCATCH), "suspfs", 0); if (error) goto unlock; } if (flags & V_XSLEEP) goto unlock; mp->mnt_writeopcount++; unlock: MNT_REL(mp); MNT_IUNLOCK(mp); return (error); } /* * Secondary suspension. Used by operations such as vop_inactive * routines that are needed by the higher level functions. These * are allowed to proceed until all the higher level functions have * completed (indicated by mnt_writeopcount dropping to zero). At that * time, these operations are halted until the suspension is over. */ int vn_write_suspend_wait(vp, mp, flags) struct vnode *vp; struct mount *mp; int flags; { int error; if (vp != NULL) { if ((error = VOP_GETWRITEMOUNT(vp, &mp)) != 0) { if (error != EOPNOTSUPP) return (error); return (0); } } /* * If we are not suspended or have not yet reached suspended * mode, then let the operation proceed. */ if (mp == NULL) return (0); MNT_ILOCK(mp); if (vp == NULL) MNT_REF(mp); if ((mp->mnt_kern_flag & MNTK_SUSPENDED) == 0) { MNT_REL(mp); MNT_IUNLOCK(mp); return (0); } if (flags & V_NOWAIT) { MNT_REL(mp); MNT_IUNLOCK(mp); return (EWOULDBLOCK); } /* * Wait for the suspension to finish. */ error = msleep(&mp->mnt_flag, MNT_MTX(mp), (PUSER - 1) | (flags & PCATCH) | PDROP, "suspfs", 0); vfs_rel(mp); return (error); } /* * Secondary suspension. Used by operations such as vop_inactive * routines that are needed by the higher level functions. These * are allowed to proceed until all the higher level functions have * completed (indicated by mnt_writeopcount dropping to zero). At that * time, these operations are halted until the suspension is over. */ int vn_start_secondary_write(vp, mpp, flags) struct vnode *vp; struct mount **mpp; int flags; { struct mount *mp; int error; retry: if (vp != NULL) { if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) { *mpp = NULL; if (error != EOPNOTSUPP) return (error); return (0); } } /* * If we are not suspended or have not yet reached suspended * mode, then let the operation proceed. */ if ((mp = *mpp) == NULL) return (0); MNT_ILOCK(mp); if (vp == NULL) MNT_REF(mp); if ((mp->mnt_kern_flag & (MNTK_SUSPENDED | MNTK_SUSPEND2)) == 0) { mp->mnt_secondary_writes++; mp->mnt_secondary_accwrites++; MNT_REL(mp); MNT_IUNLOCK(mp); return (0); } if (flags & V_NOWAIT) { MNT_REL(mp); MNT_IUNLOCK(mp); return (EWOULDBLOCK); } /* * Wait for the suspension to finish. */ error = msleep(&mp->mnt_flag, MNT_MTX(mp), (PUSER - 1) | (flags & PCATCH) | PDROP, "suspfs", 0); vfs_rel(mp); if (error == 0) goto retry; return (error); } /* * Filesystem write operation has completed. If we are suspending and this * operation is the last one, notify the suspender that the suspension is * now in effect. 
*/ void vn_finished_write(mp) struct mount *mp; { if (mp == NULL) return; MNT_ILOCK(mp); mp->mnt_writeopcount--; if (mp->mnt_writeopcount < 0) panic("vn_finished_write: neg cnt"); if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 && mp->mnt_writeopcount <= 0) wakeup(&mp->mnt_writeopcount); MNT_IUNLOCK(mp); } /* * Filesystem secondary write operation has completed. If we are * suspending and this operation is the last one, notify the suspender * that the suspension is now in effect. */ void vn_finished_secondary_write(mp) struct mount *mp; { if (mp == NULL) return; MNT_ILOCK(mp); mp->mnt_secondary_writes--; if (mp->mnt_secondary_writes < 0) panic("vn_finished_secondary_write: neg cnt"); if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 && mp->mnt_secondary_writes <= 0) wakeup(&mp->mnt_secondary_writes); MNT_IUNLOCK(mp); } /* * Request a filesystem to suspend write operations. */ int vfs_write_suspend(mp) struct mount *mp; { struct thread *td = curthread; int error; MNT_ILOCK(mp); if (mp->mnt_kern_flag & MNTK_SUSPEND) { MNT_IUNLOCK(mp); return (0); } mp->mnt_kern_flag |= MNTK_SUSPEND; if (mp->mnt_writeopcount > 0) (void) msleep(&mp->mnt_writeopcount, MNT_MTX(mp), (PUSER - 1)|PDROP, "suspwt", 0); else MNT_IUNLOCK(mp); if ((error = VFS_SYNC(mp, MNT_SUSPEND, td)) != 0) vfs_write_resume(mp); return (error); } /* * Request a filesystem to resume write operations. */ void vfs_write_resume(mp) struct mount *mp; { MNT_ILOCK(mp); if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) { mp->mnt_kern_flag &= ~(MNTK_SUSPEND | MNTK_SUSPEND2 | MNTK_SUSPENDED); wakeup(&mp->mnt_writeopcount); wakeup(&mp->mnt_flag); } MNT_IUNLOCK(mp); } /* * Implement kqueues for files by translating it to vnode operation. */ static int vn_kqfilter(struct file *fp, struct knote *kn) { int vfslocked; int error; vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount); error = VOP_KQFILTER(fp->f_vnode, kn); VFS_UNLOCK_GIANT(vfslocked); return error; } /* * Simplified in-kernel wrapper calls for extended attribute access. * Both calls pass in a NULL credential, authorizing as "kernel" access. * Set IO_NODELOCKED in ioflg if the vnode is already locked. */ int vn_extattr_get(struct vnode *vp, int ioflg, int attrnamespace, const char *attrname, int *buflen, char *buf, struct thread *td) { struct uio auio; struct iovec iov; int error; iov.iov_len = *buflen; iov.iov_base = buf; auio.uio_iov = &iov; auio.uio_iovcnt = 1; auio.uio_rw = UIO_READ; auio.uio_segflg = UIO_SYSSPACE; auio.uio_td = td; auio.uio_offset = 0; auio.uio_resid = *buflen; if ((ioflg & IO_NODELOCKED) == 0) vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held"); /* authorize attribute retrieval as kernel */ error = VOP_GETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, NULL, td); if ((ioflg & IO_NODELOCKED) == 0) VOP_UNLOCK(vp, 0, td); if (error == 0) { *buflen = *buflen - auio.uio_resid; } return (error); } /* * XXX failure mode if partially written? 
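/*
 * Illustrative sketch, not part of this change: vn_extattr_get() above
 * takes the buffer length by reference and trims it to the number of
 * bytes actually read.  A hypothetical caller (the attribute name and
 * helper name are made up):
 */
static int
example_read_extattr(struct vnode *vp, char *buf, int buflen,
    struct thread *td)
{
	int len, error;

	len = buflen;
	error = vn_extattr_get(vp, 0, EXTATTR_NAMESPACE_SYSTEM,
	    "example.attr", &len, buf, td);
	if (error == 0)
		printf("example.attr: %d bytes\n", len);
	return (error);
}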
*/ int vn_extattr_set(struct vnode *vp, int ioflg, int attrnamespace, const char *attrname, int buflen, char *buf, struct thread *td) { struct uio auio; struct iovec iov; struct mount *mp; int error; iov.iov_len = buflen; iov.iov_base = buf; auio.uio_iov = &iov; auio.uio_iovcnt = 1; auio.uio_rw = UIO_WRITE; auio.uio_segflg = UIO_SYSSPACE; auio.uio_td = td; auio.uio_offset = 0; auio.uio_resid = buflen; if ((ioflg & IO_NODELOCKED) == 0) { if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0) return (error); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); } ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held"); /* authorize attribute setting as kernel */ error = VOP_SETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, td); if ((ioflg & IO_NODELOCKED) == 0) { vn_finished_write(mp); VOP_UNLOCK(vp, 0, td); } return (error); } int vn_extattr_rm(struct vnode *vp, int ioflg, int attrnamespace, const char *attrname, struct thread *td) { struct mount *mp; int error; if ((ioflg & IO_NODELOCKED) == 0) { if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0) return (error); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); } ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held"); /* authorize attribute removal as kernel */ error = VOP_DELETEEXTATTR(vp, attrnamespace, attrname, NULL, td); if (error == EOPNOTSUPP) error = VOP_SETEXTATTR(vp, attrnamespace, attrname, NULL, NULL, td); if ((ioflg & IO_NODELOCKED) == 0) { vn_finished_write(mp); VOP_UNLOCK(vp, 0, td); } return (error); } Index: head/sys/netgraph/ng_socket.c =================================================================== --- head/sys/netgraph/ng_socket.c (revision 174987) +++ head/sys/netgraph/ng_socket.c (revision 174988) @@ -1,1140 +1,1140 @@ /* * ng_socket.c */ /*- * Copyright (c) 1996-1999 Whistle Communications, Inc. * All rights reserved. * * Subject to the following obligations and disclaimer of warranty, use and * redistribution of this software, in source or object code forms, with or * without modifications are expressly permitted by Whistle Communications; * provided, however, that: * 1. Any and all reproductions of the source or object code must include the * copyright notice above and the following disclaimer of warranties; and * 2. No rights are granted, in any manner or form, to use Whistle * Communications, Inc. trademarks, including the mark "WHISTLE * COMMUNICATIONS" on advertising, endorsements, or otherwise except as * such appears in the above copyright notice or in the software. * * THIS SOFTWARE IS BEING PROVIDED BY WHISTLE COMMUNICATIONS "AS IS", AND * TO THE MAXIMUM EXTENT PERMITTED BY LAW, WHISTLE COMMUNICATIONS MAKES NO * REPRESENTATIONS OR WARRANTIES, EXPRESS OR IMPLIED, REGARDING THIS SOFTWARE, * INCLUDING WITHOUT LIMITATION, ANY AND ALL IMPLIED WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT. * WHISTLE COMMUNICATIONS DOES NOT WARRANT, GUARANTEE, OR MAKE ANY * REPRESENTATIONS REGARDING THE USE OF, OR THE RESULTS OF THE USE OF THIS * SOFTWARE IN TERMS OF ITS CORRECTNESS, ACCURACY, RELIABILITY OR OTHERWISE. 
* IN NO EVENT SHALL WHISTLE COMMUNICATIONS BE LIABLE FOR ANY DAMAGES * RESULTING FROM OR ARISING OUT OF ANY USE OF THIS SOFTWARE, INCLUDING * WITHOUT LIMITATION, ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, * PUNITIVE, OR CONSEQUENTIAL DAMAGES, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES, LOSS OF USE, DATA OR PROFITS, HOWEVER CAUSED AND UNDER ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF WHISTLE COMMUNICATIONS IS ADVISED OF THE POSSIBILITY * OF SUCH DAMAGE. * * Author: Julian Elischer * * $FreeBSD$ * $Whistle: ng_socket.c,v 1.28 1999/11/01 09:24:52 julian Exp $ */ /* * Netgraph socket nodes * * There are two types of netgraph sockets, control and data. * Control sockets have a netgraph node, but data sockets are * parasitic on control sockets, and have no node of their own. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef NOTYET #include #endif #include #include #include #include #ifdef NG_SEPARATE_MALLOC MALLOC_DEFINE(M_NETGRAPH_PATH, "netgraph_path", "netgraph path info "); MALLOC_DEFINE(M_NETGRAPH_SOCK, "netgraph_sock", "netgraph socket info "); #else #define M_NETGRAPH_PATH M_NETGRAPH #define M_NETGRAPH_SOCK M_NETGRAPH #endif /* * It's Ascii-art time! * +-------------+ +-------------+ * |socket (ctl)| |socket (data)| * +-------------+ +-------------+ * ^ ^ * | | * v v * +-----------+ +-----------+ * |pcb (ctl)| |pcb (data)| * +-----------+ +-----------+ * ^ ^ * | | * v v * +--------------------------+ * | Socket type private | * | data | * +--------------------------+ * ^ * | * v * +----------------+ * | struct ng_node | * +----------------+ */ /* Netgraph node methods */ static ng_constructor_t ngs_constructor; static ng_rcvmsg_t ngs_rcvmsg; static ng_shutdown_t ngs_shutdown; static ng_newhook_t ngs_newhook; static ng_connect_t ngs_connect; static ng_rcvdata_t ngs_rcvdata; static ng_disconnect_t ngs_disconnect; /* Internal methods */ static int ng_attach_data(struct socket *so); static int ng_attach_cntl(struct socket *so); static int ng_attach_common(struct socket *so, int type); static void ng_detach_common(struct ngpcb *pcbp, int type); static void ng_socket_free_priv(struct ngsock *priv); #ifdef NOTYET static int ng_internalize(struct mbuf *m, struct thread *p); #endif static int ng_connect_data(struct sockaddr *nam, struct ngpcb *pcbp); static int ng_bind(struct sockaddr *nam, struct ngpcb *pcbp); static int ngs_mod_event(module_t mod, int event, void *data); static void ng_socket_item_applied(void *context, int error); /* Netgraph type descriptor */ static struct ng_type typestruct = { .version = NG_ABI_VERSION, .name = NG_SOCKET_NODE_TYPE, .mod_event = ngs_mod_event, .constructor = ngs_constructor, .rcvmsg = ngs_rcvmsg, .shutdown = ngs_shutdown, .newhook = ngs_newhook, .connect = ngs_connect, .rcvdata = ngs_rcvdata, .disconnect = ngs_disconnect, }; NETGRAPH_INIT_ORDERED(socket, &typestruct, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY); /* Buffer space */ static u_long ngpdg_sendspace = 20 * 1024; /* really max datagram size */ SYSCTL_INT(_net_graph, OID_AUTO, maxdgram, CTLFLAG_RW, &ngpdg_sendspace , 0, "Maximum outgoing Netgraph datagram size"); static u_long ngpdg_recvspace = 20 * 1024; SYSCTL_INT(_net_graph, OID_AUTO, recvspace, CTLFLAG_RW, &ngpdg_recvspace , 0, "Maximum space for incoming Netgraph datagrams"); #define sotongpcb(so) ((struct ngpcb 
*)(so)->so_pcb) /* If getting unexplained errors returned, set this to "kdb_enter("X"); */ #ifndef TRAP_ERROR #define TRAP_ERROR #endif /*************************************************************** Control sockets ***************************************************************/ static int ngc_attach(struct socket *so, int proto, struct thread *td) { struct ngpcb *const pcbp = sotongpcb(so); int error; error = priv_check(td, PRIV_NETGRAPH_CONTROL); if (error) return (error); if (pcbp != NULL) return (EISCONN); return (ng_attach_cntl(so)); } static void ngc_detach(struct socket *so) { struct ngpcb *const pcbp = sotongpcb(so); KASSERT(pcbp != NULL, ("ngc_detach: pcbp == NULL")); ng_detach_common(pcbp, NG_CONTROL); } static int ngc_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr, struct mbuf *control, struct thread *td) { struct ngpcb *const pcbp = sotongpcb(so); struct ngsock *const priv = NG_NODE_PRIVATE(pcbp->sockdata->node); struct sockaddr_ng *const sap = (struct sockaddr_ng *) addr; struct ng_mesg *msg; struct mbuf *m0; item_p item; char *path = NULL; int len, error = 0; struct ng_apply_info apply; #ifdef NOTYET if (control && (error = ng_internalize(control, td))) { if (pcbp->sockdata == NULL) { error = ENOTCONN; goto release; } } #else /* NOTYET */ if (control) { error = EINVAL; goto release; } #endif /* NOTYET */ /* Require destination as there may be >= 1 hooks on this node. */ if (addr == NULL) { error = EDESTADDRREQ; goto release; } /* * Allocate an expendable buffer for the path, chop off * the sockaddr header, and make sure it's NUL terminated. */ len = sap->sg_len - 2; path = malloc(len + 1, M_NETGRAPH_PATH, M_WAITOK); bcopy(sap->sg_data, path, len); path[len] = '\0'; /* * Move the actual message out of mbufs into a linear buffer. * Start by adding up the size of the data. (could use mh_len?) */ for (len = 0, m0 = m; m0 != NULL; m0 = m0->m_next) len += m0->m_len; /* * Move the data into a linear buffer as well. * Messages are not delivered in mbufs. */ msg = malloc(len + 1, M_NETGRAPH_MSG, M_WAITOK); m_copydata(m, 0, len, (char *)msg); if (msg->header.version != NG_VERSION) { free(msg, M_NETGRAPH_MSG); error = EINVAL; goto release; } /* * Hack alert! * We look into the message and if it mkpeers a node of unknown type, we * try to load it. We need to do this now, in syscall thread, because if * message gets queued and applied later we will get panic. */ if (msg->header.typecookie == NGM_GENERIC_COOKIE && msg->header.cmd == NGM_MKPEER) { struct ngm_mkpeer *const mkp = (struct ngm_mkpeer *) msg->data; struct ng_type *type; if ((type = ng_findtype(mkp->type)) == NULL) { char filename[NG_TYPESIZ + 3]; int fileid; /* Not found, try to load it as a loadable module. */ snprintf(filename, sizeof(filename), "ng_%s", mkp->type); error = kern_kldload(curthread, filename, &fileid); if (error != 0) { free(msg, M_NETGRAPH_MSG); goto release; } /* See if type has been loaded successfully. 
*/ if ((type = ng_findtype(mkp->type)) == NULL) { free(msg, M_NETGRAPH_MSG); (void)kern_kldunload(curthread, fileid, LINKER_UNLOAD_NORMAL); error = ENXIO; goto release; } } } item = ng_package_msg(msg, M_WAITOK); if ((error = ng_address_path((pcbp->sockdata->node), item, path, 0)) != 0) { #ifdef TRACE_MESSAGES printf("ng_address_path: errx=%d\n", error); #endif goto release; } #ifdef TRACE_MESSAGES printf("[%x]:<---------[socket]: c=<%d>cmd=%x(%s) f=%x #%d (%s)\n", item->el_dest->nd_ID, msg->header.typecookie, msg->header.cmd, msg->header.cmdstr, msg->header.flags, msg->header.token, item->el_dest->nd_type->name); #endif SAVE_LINE(item); /* * We do not want to return from syscall until the item * is processed by destination node. We register callback * on the item, which will update priv->error when item * was applied. * If ng_snd_item() has queued item, we sleep until * callback wakes us up. */ bzero(&apply, sizeof(apply)); apply.apply = ng_socket_item_applied; apply.context = priv; item->apply = &apply; priv->error = -1; error = ng_snd_item(item, 0); mtx_lock(&priv->mtx); if (priv->error == -1) msleep(priv, &priv->mtx, 0, "ngsock", 0); mtx_unlock(&priv->mtx); KASSERT(priv->error != -1, ("ng_socket: priv->error wasn't updated")); error = priv->error; release: if (path != NULL) free(path, M_NETGRAPH_PATH); if (control != NULL) m_freem(control); if (m != NULL) m_freem(m); return (error); } static int ngc_bind(struct socket *so, struct sockaddr *nam, struct thread *td) { struct ngpcb *const pcbp = sotongpcb(so); if (pcbp == 0) return (EINVAL); return (ng_bind(nam, pcbp)); } static int ngc_connect(struct socket *so, struct sockaddr *nam, struct thread *td) { /* * At this time refuse to do this.. it used to * do something but it was undocumented and not used. */ printf("program tried to connect control socket to remote node\n"); return (EINVAL); } /*************************************************************** Data sockets ***************************************************************/ static int ngd_attach(struct socket *so, int proto, struct thread *td) { struct ngpcb *const pcbp = sotongpcb(so); if (pcbp != NULL) return (EISCONN); return (ng_attach_data(so)); } static void ngd_detach(struct socket *so) { struct ngpcb *const pcbp = sotongpcb(so); KASSERT(pcbp != NULL, ("ngd_detach: pcbp == NULL")); ng_detach_common(pcbp, NG_DATA); } static int ngd_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr, struct mbuf *control, struct thread *td) { struct ngpcb *const pcbp = sotongpcb(so); struct sockaddr_ng *const sap = (struct sockaddr_ng *) addr; int len, error; hook_p hook = NULL; char hookname[NG_HOOKSIZ]; if ((pcbp == NULL) || (control != NULL)) { error = EINVAL; goto release; } if (pcbp->sockdata == NULL) { error = ENOTCONN; goto release; } if (sap == NULL) len = 0; /* Make compiler happy. */ else len = sap->sg_len - 2; /* * If the user used any of these ways to not specify an address * then handle specially. */ if ((sap == NULL) || (len <= 0) || (*sap->sg_data == '\0')) { if (NG_NODE_NUMHOOKS(pcbp->sockdata->node) != 1) { error = EDESTADDRREQ; goto release; } /* * If exactly one hook exists, just use it. * Special case to allow write(2) to work on an ng_socket. 
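/*
 * Illustrative sketch, not from the original source: ngc_send() above
 * registers an apply callback on the item and sleeps until
 * ng_socket_item_applied() (defined later in this file) records the result
 * and wakes it, so the syscall returns the real outcome even when
 * ng_snd_item() queued the message.  The synchronization skeleton with the
 * netgraph specifics stripped out (all names here are made up):
 */
struct example_waiter {
	struct mtx	mtx;
	int		error;		/* -1 means "not applied yet" */
};

static void
example_applied(void *context, int error)
{
	struct example_waiter *w = context;

	mtx_lock(&w->mtx);
	w->error = error;		/* record the result ... */
	wakeup(w);			/* ... and wake the waiting sender */
	mtx_unlock(&w->mtx);
}

static int
example_wait_for_apply(struct example_waiter *w)
{
	mtx_lock(&w->mtx);
	while (w->error == -1)
		msleep(w, &w->mtx, 0, "exwait", 0);
	mtx_unlock(&w->mtx);
	return (w->error);
}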
*/ hook = LIST_FIRST(&pcbp->sockdata->node->nd_hooks); } else { if (len >= NG_HOOKSIZ) { error = EINVAL; goto release; } /* * chop off the sockaddr header, and make sure it's NUL * terminated */ bcopy(sap->sg_data, hookname, len); hookname[len] = '\0'; /* Find the correct hook from 'hookname' */ hook = ng_findhook(pcbp->sockdata->node, hookname); if (hook == NULL) { error = EHOSTUNREACH; goto release; } } /* Send data. */ NG_SEND_DATA_FLAGS(error, hook, m, NG_WAITOK); release: if (control != NULL) m_freem(control); if (m != NULL) m_freem(m); return (error); } static int ngd_connect(struct socket *so, struct sockaddr *nam, struct thread *td) { struct ngpcb *const pcbp = sotongpcb(so); if (pcbp == 0) return (EINVAL); return (ng_connect_data(nam, pcbp)); } /* * Used for both data and control sockets */ static int ng_getsockaddr(struct socket *so, struct sockaddr **addr) { struct ngpcb *pcbp; struct sockaddr_ng *sg; int sg_len; int error = 0; /* Why isn't sg_data a `char[1]' ? :-( */ sg_len = sizeof(struct sockaddr_ng) - sizeof(sg->sg_data) + 1; pcbp = sotongpcb(so); if ((pcbp == NULL) || (pcbp->sockdata == NULL)) /* XXXGL: can this still happen? */ return (EINVAL); mtx_lock(&pcbp->sockdata->mtx); if (pcbp->sockdata->node != NULL) { node_p node = pcbp->sockdata->node; int namelen = 0; /* silence compiler! */ if (NG_NODE_HAS_NAME(node)) sg_len += namelen = strlen(NG_NODE_NAME(node)); sg = malloc(sg_len, M_SONAME, M_WAITOK | M_ZERO); if (NG_NODE_HAS_NAME(node)) bcopy(NG_NODE_NAME(node), sg->sg_data, namelen); sg->sg_len = sg_len; sg->sg_family = AF_NETGRAPH; *addr = (struct sockaddr *)sg; mtx_unlock(&pcbp->sockdata->mtx); } else { mtx_unlock(&pcbp->sockdata->mtx); error = EINVAL; } return (error); } /* * Attach a socket to it's protocol specific partner. * For a control socket, actually create a netgraph node and attach * to it as well. */ static int ng_attach_cntl(struct socket *so) { struct ngsock *priv; struct ngpcb *pcbp; int error; /* Allocate node private info */ priv = malloc(sizeof(*priv), M_NETGRAPH_SOCK, M_WAITOK | M_ZERO); /* Setup protocol control block */ if ((error = ng_attach_common(so, NG_CONTROL)) != 0) { free(priv, M_NETGRAPH_SOCK); return (error); } pcbp = sotongpcb(so); /* Link the pcb the private data. */ priv->ctlsock = pcbp; pcbp->sockdata = priv; priv->refs++; /* Initialize mutex. */ mtx_init(&priv->mtx, "ng_socket", NULL, MTX_DEF); /* Make the generic node components */ if ((error = ng_make_node_common(&typestruct, &priv->node)) != 0) { free(priv, M_NETGRAPH_SOCK); ng_detach_common(pcbp, NG_CONTROL); return (error); } /* Link the node and the private data. */ NG_NODE_SET_PRIVATE(priv->node, priv); NG_NODE_REF(priv->node); priv->refs++; return (0); } static int ng_attach_data(struct socket *so) { return (ng_attach_common(so, NG_DATA)); } /* * Set up a socket protocol control block. * This code is shared between control and data sockets. */ static int ng_attach_common(struct socket *so, int type) { struct ngpcb *pcbp; int error; /* Standard socket setup stuff. */ error = soreserve(so, ngpdg_sendspace, ngpdg_recvspace); if (error) return (error); /* Allocate the pcb. */ pcbp = malloc(sizeof(struct ngpcb), M_PCB, M_WAITOK | M_ZERO); pcbp->type = type; /* Link the pcb and the socket. */ so->so_pcb = (caddr_t)pcbp; pcbp->ng_socket = so; return (0); } /* * Disassociate the socket from it's protocol specific * partner. If it's attached to a node's private data structure, * then unlink from that too. If we were the last socket attached to it, * then shut down the entire node. 
Shared code for control and data sockets. */ static void ng_detach_common(struct ngpcb *pcbp, int which) { struct ngsock *priv = pcbp->sockdata; if (priv != NULL) { mtx_lock(&priv->mtx); switch (which) { case NG_CONTROL: priv->ctlsock = NULL; break; case NG_DATA: priv->datasock = NULL; break; default: panic(__func__); } pcbp->sockdata = NULL; ng_socket_free_priv(priv); } pcbp->ng_socket->so_pcb = NULL; free(pcbp, M_PCB); } /* * Remove a reference from node private data. */ static void ng_socket_free_priv(struct ngsock *priv) { mtx_assert(&priv->mtx, MA_OWNED); priv->refs--; if (priv->refs == 0) { mtx_destroy(&priv->mtx); free(priv, M_NETGRAPH_SOCK); return; } if ((priv->refs == 1) && (priv->node != NULL)) { node_p node = priv->node; priv->node = NULL; mtx_unlock(&priv->mtx); NG_NODE_UNREF(node); ng_rmnode_self(node); } else mtx_unlock(&priv->mtx); } #ifdef NOTYET /* * File descriptors can be passed into an AF_NETGRAPH socket. * Note, that file descriptors cannot be passed OUT. * Only character device descriptors are accepted. * Character devices are useful to connect a graph to a device, * which after all is the purpose of this whole system. */ static int ng_internalize(struct mbuf *control, struct thread *td) { const struct cmsghdr *cm = mtod(control, const struct cmsghdr *); struct file *fp; struct vnode *vn; int oldfds; int fd; if (cm->cmsg_type != SCM_RIGHTS || cm->cmsg_level != SOL_SOCKET || cm->cmsg_len != control->m_len) { TRAP_ERROR; return (EINVAL); } /* Check there is only one FD. XXX what would more than one signify? */ oldfds = ((caddr_t)cm + cm->cmsg_len - (caddr_t)data) / sizeof (int); if (oldfds != 1) { TRAP_ERROR; return (EINVAL); } /* Check that the FD given is legit. and change it to a pointer to a * struct file. */ fd = CMSG_DATA(cm); if ((error = fget(td, fd, &fp)) != 0) return (error); /* Depending on what kind of resource it is, act differently. For * devices, we treat it as a file. For an AF_NETGRAPH socket, * shortcut straight to the node. */ switch (fp->f_type) { case DTYPE_VNODE: vn = fp->f_data; if (vn && (vn->v_type == VCHR)) { /* for a VCHR, actually reference the FILE */ - fp->f_count++; + fhold(fp); /* XXX then what :) */ /* how to pass on to other modules? */ } else { fdrop(fp, td); TRAP_ERROR; return (EINVAL); } break; default: fdrop(fp, td); TRAP_ERROR; return (EINVAL); } fdrop(fp, td); return (0); } #endif /* NOTYET */ /* * Connect the data socket to a named control socket node. */ static int ng_connect_data(struct sockaddr *nam, struct ngpcb *pcbp) { struct sockaddr_ng *sap; node_p farnode; struct ngsock *priv; int error; item_p item; /* If we are already connected, don't do it again. */ if (pcbp->sockdata != NULL) return (EISCONN); /* * Find the target (victim) and check it doesn't already have * a data socket. Also check it is a 'socket' type node. * Use ng_package_data() and ng_address_path() to do this. */ sap = (struct sockaddr_ng *) nam; /* The item will hold the node reference. */ item = ng_package_data(NULL, NG_WAITOK); if ((error = ng_address_path(NULL, item, sap->sg_data, 0))) return (error); /* item is freed on failure */ /* * Extract node from item and free item. Remember we now have * a reference on the node. The item holds it for us. * when we free the item we release the reference. 
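/*
 * Illustrative sketch, not part of the original diff: the ng_internalize()
 * hunk above replaces a raw "fp->f_count++" with fhold(), consistent with
 * the rest of this change set, which stops manipulating struct file
 * reference counts directly.  The usual fget()/fhold()/fdrop() pairing
 * (example_hold_fd is a made-up name):
 */
static int
example_hold_fd(struct thread *td, int fd, struct file **fpp)
{
	struct file *fp;
	int error;

	if ((error = fget(td, fd, &fp)) != 0)
		return (error);
	fhold(fp);		/* extra reference handed back to the caller */
	*fpp = fp;
	fdrop(fp, td);		/* drop the reference fget() took */
	return (0);
}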
*/ farnode = item->el_dest; /* shortcut */ if (strcmp(farnode->nd_type->name, NG_SOCKET_NODE_TYPE) != 0) { NG_FREE_ITEM(item); /* drop the reference to the node */ return (EINVAL); } priv = NG_NODE_PRIVATE(farnode); if (priv->datasock != NULL) { NG_FREE_ITEM(item); /* drop the reference to the node */ return (EADDRINUSE); } /* * Link the PCB and the private data struct. and note the extra * reference. Drop the extra reference on the node. */ mtx_lock(&priv->mtx); priv->datasock = pcbp; pcbp->sockdata = priv; priv->refs++; mtx_unlock(&priv->mtx); NG_FREE_ITEM(item); /* drop the reference to the node */ return (0); } /* * Binding a socket means giving the corresponding node a name */ static int ng_bind(struct sockaddr *nam, struct ngpcb *pcbp) { struct ngsock *const priv = pcbp->sockdata; struct sockaddr_ng *const sap = (struct sockaddr_ng *) nam; if (priv == NULL) { TRAP_ERROR; return (EINVAL); } if ((sap->sg_len < 4) || (sap->sg_len > (NG_NODESIZ + 2)) || (sap->sg_data[0] == '\0') || (sap->sg_data[sap->sg_len - 3] != '\0')) { TRAP_ERROR; return (EINVAL); } return (ng_name_node(priv->node, sap->sg_data)); } /*************************************************************** Netgraph node ***************************************************************/ /* * You can only create new nodes from the socket end of things. */ static int ngs_constructor(node_p nodep) { return (EINVAL); } /* * We allow any hook to be connected to the node. * There is no per-hook private information though. */ static int ngs_newhook(node_p node, hook_p hook, const char *name) { NG_HOOK_SET_PRIVATE(hook, NG_NODE_PRIVATE(node)); return (0); } /* * If only one hook, allow read(2) and write(2) to work. */ static int ngs_connect(hook_p hook) { node_p node = NG_HOOK_NODE(hook); struct ngsock *priv = NG_NODE_PRIVATE(node); if ((priv->datasock) && (priv->datasock->ng_socket)) { if (NG_NODE_NUMHOOKS(node) == 1) priv->datasock->ng_socket->so_state |= SS_ISCONNECTED; else priv->datasock->ng_socket->so_state &= ~SS_ISCONNECTED; } return (0); } /* * Incoming messages get passed up to the control socket. * Unless they are for us specifically (socket_type) */ static int ngs_rcvmsg(node_p node, item_p item, hook_p lasthook) { struct ngsock *const priv = NG_NODE_PRIVATE(node); struct ngpcb *const pcbp = priv->ctlsock; struct socket *so; struct sockaddr_ng addr; struct ng_mesg *msg; struct mbuf *m; ng_ID_t retaddr = NGI_RETADDR(item); int addrlen; int error = 0; NGI_GET_MSG(item, msg); NG_FREE_ITEM(item); /* * Only allow mesgs to be passed if we have the control socket. * Data sockets can only support the generic messages. */ if (pcbp == NULL) { TRAP_ERROR; NG_FREE_MSG(msg); return (EINVAL); } so = pcbp->ng_socket; #ifdef TRACE_MESSAGES printf("[%x]:---------->[socket]: c=<%d>cmd=%x(%s) f=%x #%d\n", retaddr, msg->header.typecookie, msg->header.cmd, msg->header.cmdstr, msg->header.flags, msg->header.token); #endif if (msg->header.typecookie == NGM_SOCKET_COOKIE) { switch (msg->header.cmd) { case NGM_SOCK_CMD_NOLINGER: priv->flags |= NGS_FLAG_NOLINGER; break; case NGM_SOCK_CMD_LINGER: priv->flags &= ~NGS_FLAG_NOLINGER; break; default: error = EINVAL; /* unknown command */ } /* Free the message and return. */ NG_FREE_MSG(msg); return (error); } /* Get the return address into a sockaddr. 
*/ bzero(&addr, sizeof(addr)); addr.sg_len = sizeof(addr); addr.sg_family = AF_NETGRAPH; addrlen = snprintf((char *)&addr.sg_data, sizeof(addr.sg_data), "[%x]:", retaddr); if (addrlen < 0 || addrlen > sizeof(addr.sg_data)) { printf("%s: snprintf([%x]) failed - %d\n", __func__, retaddr, addrlen); NG_FREE_MSG(msg); return (EINVAL); } /* Copy the message itself into an mbuf chain. */ m = m_devget((caddr_t)msg, sizeof(struct ng_mesg) + msg->header.arglen, 0, NULL, NULL); /* * Here we free the message. We need to do that * regardless of whether we got mbufs. */ NG_FREE_MSG(msg); if (m == NULL) { TRAP_ERROR; return (ENOBUFS); } /* Send it up to the socket. */ if (sbappendaddr(&so->so_rcv, (struct sockaddr *)&addr, m, NULL) == 0) { TRAP_ERROR; m_freem(m); error = so->so_error = ENOBUFS; } sorwakeup(so); return (error); } /* * Receive data on a hook */ static int ngs_rcvdata(hook_p hook, item_p item) { struct ngsock *const priv = NG_NODE_PRIVATE(NG_HOOK_NODE(hook)); struct ngpcb *const pcbp = priv->datasock; struct socket *so; struct sockaddr_ng *addr; char *addrbuf[NG_HOOKSIZ + 4]; int addrlen; struct mbuf *m; NGI_GET_M(item, m); NG_FREE_ITEM(item); /* If there is no data socket, black-hole it. */ if (pcbp == NULL) { NG_FREE_M(m); return (0); } so = pcbp->ng_socket; /* Get the return address into a sockaddr. */ addrlen = strlen(NG_HOOK_NAME(hook)); /* <= NG_HOOKSIZ - 1 */ addr = (struct sockaddr_ng *) addrbuf; addr->sg_len = addrlen + 3; addr->sg_family = AF_NETGRAPH; bcopy(NG_HOOK_NAME(hook), addr->sg_data, addrlen); addr->sg_data[addrlen] = '\0'; /* Try to tell the socket which hook it came in on. */ if (sbappendaddr(&so->so_rcv, (struct sockaddr *)addr, m, NULL) == 0) { m_freem(m); TRAP_ERROR; return (ENOBUFS); } sorwakeup(so); return (0); } /* * Hook disconnection * * For this type, removal of the last link destroys the node * if the NOLINGER flag is set. */ static int ngs_disconnect(hook_p hook) { node_p node = NG_HOOK_NODE(hook); struct ngsock *const priv = NG_NODE_PRIVATE(node); if ((priv->datasock) && (priv->datasock->ng_socket)) { if (NG_NODE_NUMHOOKS(node) == 1) priv->datasock->ng_socket->so_state |= SS_ISCONNECTED; else priv->datasock->ng_socket->so_state &= ~SS_ISCONNECTED; } if ((priv->flags & NGS_FLAG_NOLINGER) && (NG_NODE_NUMHOOKS(node) == 0) && (NG_NODE_IS_VALID(node))) ng_rmnode_self(node); return (0); } /* * Do local shutdown processing. * In this case, that involves making sure the socket * knows we should be shutting down. */ static int ngs_shutdown(node_p node) { struct ngsock *const priv = NG_NODE_PRIVATE(node); struct ngpcb *const dpcbp = priv->datasock; struct ngpcb *const pcbp = priv->ctlsock; if (dpcbp != NULL) soisdisconnected(dpcbp->ng_socket); if (pcbp != NULL) soisdisconnected(pcbp->ng_socket); mtx_lock(&priv->mtx); priv->node = NULL; NG_NODE_SET_PRIVATE(node, NULL); ng_socket_free_priv(priv); NG_NODE_UNREF(node); return (0); } static void ng_socket_item_applied(void *context, int error) { struct ngsock *const priv = (struct ngsock *)context; mtx_lock(&priv->mtx); priv->error = error; wakeup(priv); mtx_unlock(&priv->mtx); } static int dummy_disconnect(struct socket *so) { return (0); } /* * Control and data socket type descriptors * * XXXRW: Perhaps _close should do something? 
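/*
 * Illustrative sketch, not from the original source: ngs_rcvmsg() and
 * ngs_rcvdata() above deliver data to the socket by building an mbuf chain
 * and appending it, tagged with a source address, to the receive buffer.
 * The delivery idiom in isolation (example_deliver is a made-up name):
 */
static int
example_deliver(struct socket *so, struct sockaddr *addr, caddr_t buf,
    int len)
{
	struct mbuf *m;

	m = m_devget(buf, len, 0, NULL, NULL);
	if (m == NULL)
		return (ENOBUFS);
	if (sbappendaddr(&so->so_rcv, addr, m, NULL) == 0) {
		m_freem(m);		/* receive buffer full */
		return (ENOBUFS);
	}
	sorwakeup(so);			/* notify sleeping readers */
	return (0);
}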
*/ static struct pr_usrreqs ngc_usrreqs = { .pru_abort = NULL, .pru_attach = ngc_attach, .pru_bind = ngc_bind, .pru_connect = ngc_connect, .pru_detach = ngc_detach, .pru_disconnect = dummy_disconnect, .pru_peeraddr = NULL, .pru_send = ngc_send, .pru_shutdown = NULL, .pru_sockaddr = ng_getsockaddr, .pru_close = NULL, }; static struct pr_usrreqs ngd_usrreqs = { .pru_abort = NULL, .pru_attach = ngd_attach, .pru_bind = NULL, .pru_connect = ngd_connect, .pru_detach = ngd_detach, .pru_disconnect = dummy_disconnect, .pru_peeraddr = NULL, .pru_send = ngd_send, .pru_shutdown = NULL, .pru_sockaddr = ng_getsockaddr, .pru_close = NULL, }; /* * Definitions of protocols supported in the NETGRAPH domain. */ extern struct domain ngdomain; /* stop compiler warnings */ static struct protosw ngsw[] = { { .pr_type = SOCK_DGRAM, .pr_domain = &ngdomain, .pr_protocol = NG_CONTROL, .pr_flags = PR_ATOMIC | PR_ADDR /* | PR_RIGHTS */, .pr_usrreqs = &ngc_usrreqs }, { .pr_type = SOCK_DGRAM, .pr_domain = &ngdomain, .pr_protocol = NG_DATA, .pr_flags = PR_ATOMIC | PR_ADDR, .pr_usrreqs = &ngd_usrreqs } }; struct domain ngdomain = { .dom_family = AF_NETGRAPH, .dom_name = "netgraph", .dom_protosw = ngsw, .dom_protoswNPROTOSW = &ngsw[sizeof(ngsw) / sizeof(ngsw[0])] }; /* * Handle loading and unloading for this node type. * This is to handle auxiliary linkages (e.g protocol domain addition). */ static int ngs_mod_event(module_t mod, int event, void *data) { int error = 0; switch (event) { case MOD_LOAD: /* Register protocol domain. */ net_add_domain(&ngdomain); break; case MOD_UNLOAD: #ifdef NOTYET /* Unregister protocol domain XXX can't do this yet.. */ if ((error = net_rm_domain(&ngdomain)) != 0) break; else #endif error = EBUSY; break; default: error = EOPNOTSUPP; break; } return (error); } SYSCTL_INT(_net_graph, OID_AUTO, family, CTLFLAG_RD, 0, AF_NETGRAPH, ""); SYSCTL_NODE(_net_graph, OID_AUTO, data, CTLFLAG_RW, 0, "DATA"); SYSCTL_INT(_net_graph_data, OID_AUTO, proto, CTLFLAG_RD, 0, NG_DATA, ""); SYSCTL_NODE(_net_graph, OID_AUTO, control, CTLFLAG_RW, 0, "CONTROL"); SYSCTL_INT(_net_graph_control, OID_AUTO, proto, CTLFLAG_RD, 0, NG_CONTROL, ""); Index: head/sys/opencrypto/cryptodev.c =================================================================== --- head/sys/opencrypto/cryptodev.c (revision 174987) +++ head/sys/opencrypto/cryptodev.c (revision 174988) @@ -1,906 +1,901 @@ /* $OpenBSD: cryptodev.c,v 1.52 2002/06/19 07:22:46 deraadt Exp $ */ /*- * Copyright (c) 2001 Theo de Raadt * Copyright (c) 2002-2006 Sam Leffler, Errno Consulting * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * Effort sponsored in part by the Defense Advanced Research Projects * Agency (DARPA) and Air Force Research Laboratory, Air Force * Materiel Command, USAF, under agreement number F30602-01-2-0537. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include struct csession { TAILQ_ENTRY(csession) next; u_int64_t sid; u_int32_t ses; struct mtx lock; /* for op submission */ u_int32_t cipher; struct enc_xform *txform; u_int32_t mac; struct auth_hash *thash; caddr_t key; int keylen; u_char tmp_iv[EALG_MAX_BLOCK_LEN]; caddr_t mackey; int mackeylen; struct iovec iovec; struct uio uio; int error; }; struct fcrypt { TAILQ_HEAD(csessionlist, csession) csessions; int sesn; }; static int cryptof_rw(struct file *fp, struct uio *uio, struct ucred *cred, int flags, struct thread *); static int cryptof_ioctl(struct file *, u_long, void *, struct ucred *, struct thread *); static int cryptof_poll(struct file *, int, struct ucred *, struct thread *); static int cryptof_kqfilter(struct file *, struct knote *); static int cryptof_stat(struct file *, struct stat *, struct ucred *, struct thread *); static int cryptof_close(struct file *, struct thread *); static struct fileops cryptofops = { .fo_read = cryptof_rw, .fo_write = cryptof_rw, .fo_ioctl = cryptof_ioctl, .fo_poll = cryptof_poll, .fo_kqfilter = cryptof_kqfilter, .fo_stat = cryptof_stat, .fo_close = cryptof_close }; static struct csession *csefind(struct fcrypt *, u_int); static int csedelete(struct fcrypt *, struct csession *); static struct csession *cseadd(struct fcrypt *, struct csession *); static struct csession *csecreate(struct fcrypt *, u_int64_t, caddr_t, u_int64_t, caddr_t, u_int64_t, u_int32_t, u_int32_t, struct enc_xform *, struct auth_hash *); static int csefree(struct csession *); static int cryptodev_op(struct csession *, struct crypt_op *, struct ucred *, struct thread *td); static int cryptodev_key(struct crypt_kop *); static int cryptodev_find(struct crypt_find_op *); static int cryptof_rw( struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { return (EIO); } /* * Check a crypto identifier to see if it requested * a software device/driver. This can be done either * by device name/class or through search constraints. 
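/*
 * Illustrative sketch, not part of this change: from user space the ioctl
 * handler below is driven through /dev/crypto, first creating a session
 * (CIOCGSESSION) and then issuing operations against it (CIOCCRYPT).
 * The constants and session_op fields follow the standard
 * <crypto/cryptodev.h> interface; the device path, key handling and helper
 * name are assumptions made for illustration.
 */
#include <sys/ioctl.h>
#include <crypto/cryptodev.h>
#include <fcntl.h>
#include <string.h>
#include <unistd.h>

static int
example_aes_session(char *key, int keylen)
{
	struct session_op sop;
	int fd;

	if ((fd = open("/dev/crypto", O_RDWR)) < 0)
		return (-1);
	memset(&sop, 0, sizeof(sop));
	sop.cipher = CRYPTO_AES_CBC;	/* handled by enc_xform_rijndael128 */
	sop.keylen = keylen;
	sop.key = key;
	if (ioctl(fd, CIOCGSESSION, &sop) == -1) {
		close(fd);
		return (-1);
	}
	/* sop.ses now names the session for CIOCCRYPT and CIOCFSESSION. */
	return (fd);
}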
*/ static int checkforsoftware(int crid) { if (crid & CRYPTOCAP_F_SOFTWARE) return EINVAL; /* XXX */ if ((crid & CRYPTOCAP_F_HARDWARE) == 0 && (crypto_getcaps(crid) & CRYPTOCAP_F_HARDWARE) == 0) return EINVAL; /* XXX */ return 0; } /* ARGSUSED */ static int cryptof_ioctl( struct file *fp, u_long cmd, void *data, struct ucred *active_cred, struct thread *td) { #define SES2(p) ((struct session2_op *)p) struct cryptoini cria, crie; struct fcrypt *fcr = fp->f_data; struct csession *cse; struct session_op *sop; struct crypt_op *cop; struct enc_xform *txform = NULL; struct auth_hash *thash = NULL; struct crypt_kop *kop; u_int64_t sid; u_int32_t ses; int error = 0, crid; switch (cmd) { case CIOCGSESSION: case CIOCGSESSION2: sop = (struct session_op *)data; switch (sop->cipher) { case 0: break; case CRYPTO_DES_CBC: txform = &enc_xform_des; break; case CRYPTO_3DES_CBC: txform = &enc_xform_3des; break; case CRYPTO_BLF_CBC: txform = &enc_xform_blf; break; case CRYPTO_CAST_CBC: txform = &enc_xform_cast5; break; case CRYPTO_SKIPJACK_CBC: txform = &enc_xform_skipjack; break; case CRYPTO_AES_CBC: txform = &enc_xform_rijndael128; break; case CRYPTO_NULL_CBC: txform = &enc_xform_null; break; case CRYPTO_ARC4: txform = &enc_xform_arc4; break; case CRYPTO_CAMELLIA_CBC: txform = &enc_xform_camellia; break; default: return (EINVAL); } switch (sop->mac) { case 0: break; case CRYPTO_MD5_HMAC: thash = &auth_hash_hmac_md5; break; case CRYPTO_SHA1_HMAC: thash = &auth_hash_hmac_sha1; break; case CRYPTO_SHA2_256_HMAC: thash = &auth_hash_hmac_sha2_256; break; case CRYPTO_SHA2_384_HMAC: thash = &auth_hash_hmac_sha2_384; break; case CRYPTO_SHA2_512_HMAC: thash = &auth_hash_hmac_sha2_512; break; case CRYPTO_RIPEMD160_HMAC: thash = &auth_hash_hmac_ripemd_160; break; #ifdef notdef case CRYPTO_MD5: thash = &auth_hash_md5; break; case CRYPTO_SHA1: thash = &auth_hash_sha1; break; #endif case CRYPTO_NULL_HMAC: thash = &auth_hash_null; break; default: return (EINVAL); } bzero(&crie, sizeof(crie)); bzero(&cria, sizeof(cria)); if (txform) { crie.cri_alg = txform->type; crie.cri_klen = sop->keylen * 8; if (sop->keylen > txform->maxkey || sop->keylen < txform->minkey) { error = EINVAL; goto bail; } MALLOC(crie.cri_key, u_int8_t *, crie.cri_klen / 8, M_XDATA, M_WAITOK); if ((error = copyin(sop->key, crie.cri_key, crie.cri_klen / 8))) goto bail; if (thash) crie.cri_next = &cria; } if (thash) { cria.cri_alg = thash->type; cria.cri_klen = sop->mackeylen * 8; if (sop->mackeylen != thash->keysize) { error = EINVAL; goto bail; } if (cria.cri_klen) { MALLOC(cria.cri_key, u_int8_t *, cria.cri_klen / 8, M_XDATA, M_WAITOK); if ((error = copyin(sop->mackey, cria.cri_key, cria.cri_klen / 8))) goto bail; } } /* NB: CIOGSESSION2 has the crid */ if (cmd == CIOCGSESSION2) { crid = SES2(sop)->crid; error = checkforsoftware(crid); if (error) goto bail; } else crid = CRYPTOCAP_F_HARDWARE; error = crypto_newsession(&sid, (txform ? 
&crie : &cria), crid); if (error) goto bail; cse = csecreate(fcr, sid, crie.cri_key, crie.cri_klen, cria.cri_key, cria.cri_klen, sop->cipher, sop->mac, txform, thash); if (cse == NULL) { crypto_freesession(sid); error = EINVAL; goto bail; } sop->ses = cse->ses; if (cmd == CIOCGSESSION2) { /* return hardware/driver id */ SES2(sop)->crid = CRYPTO_SESID2HID(cse->sid); } bail: if (error) { if (crie.cri_key) FREE(crie.cri_key, M_XDATA); if (cria.cri_key) FREE(cria.cri_key, M_XDATA); } break; case CIOCFSESSION: ses = *(u_int32_t *)data; cse = csefind(fcr, ses); if (cse == NULL) return (EINVAL); csedelete(fcr, cse); error = csefree(cse); break; case CIOCCRYPT: cop = (struct crypt_op *)data; cse = csefind(fcr, cop->ses); if (cse == NULL) return (EINVAL); error = cryptodev_op(cse, cop, active_cred, td); break; case CIOCKEY: case CIOCKEY2: if (!crypto_userasymcrypto) return (EPERM); /* XXX compat? */ mtx_lock(&Giant); kop = (struct crypt_kop *)data; if (cmd == CIOCKEY) { /* NB: crypto core enforces s/w driver use */ kop->crk_crid = CRYPTOCAP_F_HARDWARE | CRYPTOCAP_F_SOFTWARE; } error = cryptodev_key(kop); mtx_unlock(&Giant); break; case CIOCASYMFEAT: if (!crypto_userasymcrypto) { /* * NB: if user asym crypto operations are * not permitted return "no algorithms" * so well-behaved applications will just * fallback to doing them in software. */ *(int *)data = 0; } else error = crypto_getfeat((int *)data); break; case CIOCFINDDEV: error = cryptodev_find((struct crypt_find_op *)data); break; default: error = EINVAL; break; } return (error); #undef SES2 } static int cryptodev_cb(void *); static int cryptodev_op( struct csession *cse, struct crypt_op *cop, struct ucred *active_cred, struct thread *td) { struct cryptop *crp = NULL; struct cryptodesc *crde = NULL, *crda = NULL; int error; if (cop->len > 256*1024-4) return (E2BIG); if (cse->txform) { if (cop->len == 0 || (cop->len % cse->txform->blocksize) != 0) return (EINVAL); } cse->uio.uio_iov = &cse->iovec; cse->uio.uio_iovcnt = 1; cse->uio.uio_offset = 0; cse->uio.uio_resid = cop->len; cse->uio.uio_segflg = UIO_SYSSPACE; cse->uio.uio_rw = UIO_WRITE; cse->uio.uio_td = td; cse->uio.uio_iov[0].iov_len = cop->len; if (cse->thash) cse->uio.uio_iov[0].iov_len += cse->thash->hashsize; cse->uio.uio_iov[0].iov_base = malloc(cse->uio.uio_iov[0].iov_len, M_XDATA, M_WAITOK); crp = crypto_getreq((cse->txform != NULL) + (cse->thash != NULL)); if (crp == NULL) { error = ENOMEM; goto bail; } if (cse->thash) { crda = crp->crp_desc; if (cse->txform) crde = crda->crd_next; } else { if (cse->txform) crde = crp->crp_desc; else { error = EINVAL; goto bail; } } if ((error = copyin(cop->src, cse->uio.uio_iov[0].iov_base, cop->len))) goto bail; if (crda) { crda->crd_skip = 0; crda->crd_len = cop->len; crda->crd_inject = cop->len; crda->crd_alg = cse->mac; crda->crd_key = cse->mackey; crda->crd_klen = cse->mackeylen * 8; } if (crde) { if (cop->op == COP_ENCRYPT) crde->crd_flags |= CRD_F_ENCRYPT; else crde->crd_flags &= ~CRD_F_ENCRYPT; crde->crd_len = cop->len; crde->crd_inject = 0; crde->crd_alg = cse->cipher; crde->crd_key = cse->key; crde->crd_klen = cse->keylen * 8; } crp->crp_ilen = cop->len; crp->crp_flags = CRYPTO_F_IOV | CRYPTO_F_CBIMM | (cop->flags & COP_F_BATCH); crp->crp_buf = (caddr_t)&cse->uio; crp->crp_callback = (int (*) (struct cryptop *)) cryptodev_cb; crp->crp_sid = cse->sid; crp->crp_opaque = (void *)cse; if (cop->iv) { if (crde == NULL) { error = EINVAL; goto bail; } if (cse->cipher == CRYPTO_ARC4) { /* XXX use flag? 
*/ error = EINVAL; goto bail; } if ((error = copyin(cop->iv, cse->tmp_iv, cse->txform->blocksize))) goto bail; bcopy(cse->tmp_iv, crde->crd_iv, cse->txform->blocksize); crde->crd_flags |= CRD_F_IV_EXPLICIT | CRD_F_IV_PRESENT; crde->crd_skip = 0; } else if (cse->cipher == CRYPTO_ARC4) { /* XXX use flag? */ crde->crd_skip = 0; } else if (crde) { crde->crd_flags |= CRD_F_IV_PRESENT; crde->crd_skip = cse->txform->blocksize; crde->crd_len -= cse->txform->blocksize; } if (cop->mac && crda == NULL) { error = EINVAL; goto bail; } /* * Let the dispatch run unlocked, then, interlock against the * callback before checking if the operation completed and going * to sleep. This insures drivers don't inherit our lock which * results in a lock order reversal between crypto_dispatch forced * entry and the crypto_done callback into us. */ error = crypto_dispatch(crp); mtx_lock(&cse->lock); if (error == 0 && (crp->crp_flags & CRYPTO_F_DONE) == 0) error = msleep(crp, &cse->lock, PWAIT, "crydev", 0); mtx_unlock(&cse->lock); if (error != 0) goto bail; if (crp->crp_etype != 0) { error = crp->crp_etype; goto bail; } if (cse->error) { error = cse->error; goto bail; } if (cop->dst && (error = copyout(cse->uio.uio_iov[0].iov_base, cop->dst, cop->len))) goto bail; if (cop->mac && (error = copyout((caddr_t)cse->uio.uio_iov[0].iov_base + cop->len, cop->mac, cse->thash->hashsize))) goto bail; bail: if (crp) crypto_freereq(crp); if (cse->uio.uio_iov[0].iov_base) free(cse->uio.uio_iov[0].iov_base, M_XDATA); return (error); } static int cryptodev_cb(void *op) { struct cryptop *crp = (struct cryptop *) op; struct csession *cse = (struct csession *)crp->crp_opaque; int error; error = crp->crp_etype; if (error == EAGAIN) error = crypto_dispatch(crp); mtx_lock(&cse->lock); if (error != 0 || (crp->crp_flags & CRYPTO_F_DONE)) { cse->error = error; wakeup_one(crp); } mtx_unlock(&cse->lock); return (0); } static int cryptodevkey_cb(void *op) { struct cryptkop *krp = (struct cryptkop *) op; wakeup_one(krp); return (0); } static int cryptodev_key(struct crypt_kop *kop) { struct cryptkop *krp = NULL; int error = EINVAL; int in, out, size, i; if (kop->crk_iparams + kop->crk_oparams > CRK_MAXPARAM) { return (EFBIG); } in = kop->crk_iparams; out = kop->crk_oparams; switch (kop->crk_op) { case CRK_MOD_EXP: if (in == 3 && out == 1) break; return (EINVAL); case CRK_MOD_EXP_CRT: if (in == 6 && out == 1) break; return (EINVAL); case CRK_DSA_SIGN: if (in == 5 && out == 2) break; return (EINVAL); case CRK_DSA_VERIFY: if (in == 7 && out == 0) break; return (EINVAL); case CRK_DH_COMPUTE_KEY: if (in == 3 && out == 1) break; return (EINVAL); default: return (EINVAL); } krp = (struct cryptkop *)malloc(sizeof *krp, M_XDATA, M_WAITOK|M_ZERO); if (!krp) return (ENOMEM); krp->krp_op = kop->crk_op; krp->krp_status = kop->crk_status; krp->krp_iparams = kop->crk_iparams; krp->krp_oparams = kop->crk_oparams; krp->krp_crid = kop->crk_crid; krp->krp_status = 0; krp->krp_callback = (int (*) (struct cryptkop *)) cryptodevkey_cb; for (i = 0; i < CRK_MAXPARAM; i++) { if (kop->crk_param[i].crp_nbits > 65536) /* Limit is the same as in OpenBSD */ goto fail; krp->krp_param[i].crp_nbits = kop->crk_param[i].crp_nbits; } for (i = 0; i < krp->krp_iparams + krp->krp_oparams; i++) { size = (krp->krp_param[i].crp_nbits + 7) / 8; if (size == 0) continue; MALLOC(krp->krp_param[i].crp_p, caddr_t, size, M_XDATA, M_WAITOK); if (i >= krp->krp_iparams) continue; error = copyin(kop->crk_param[i].crp_p, krp->krp_param[i].crp_p, size); if (error) goto fail; } error = 
crypto_kdispatch(krp); if (error) goto fail; error = tsleep(krp, PSOCK, "crydev", 0); if (error) { /* XXX can this happen? if so, how do we recover? */ goto fail; } kop->crk_crid = krp->krp_crid; /* device that did the work */ if (krp->krp_status != 0) { error = krp->krp_status; goto fail; } for (i = krp->krp_iparams; i < krp->krp_iparams + krp->krp_oparams; i++) { size = (krp->krp_param[i].crp_nbits + 7) / 8; if (size == 0) continue; error = copyout(krp->krp_param[i].crp_p, kop->crk_param[i].crp_p, size); if (error) goto fail; } fail: if (krp) { kop->crk_status = krp->krp_status; for (i = 0; i < CRK_MAXPARAM; i++) { if (krp->krp_param[i].crp_p) FREE(krp->krp_param[i].crp_p, M_XDATA); } free(krp, M_XDATA); } return (error); } static int cryptodev_find(struct crypt_find_op *find) { device_t dev; if (find->crid != -1) { dev = crypto_find_device_byhid(find->crid); if (dev == NULL) return (ENOENT); strlcpy(find->name, device_get_nameunit(dev), sizeof(find->name)); } else { find->crid = crypto_find_driver(find->name); if (find->crid == -1) return (ENOENT); } return (0); } /* ARGSUSED */ static int cryptof_poll( struct file *fp, int events, struct ucred *active_cred, struct thread *td) { return (0); } /* ARGSUSED */ static int cryptof_kqfilter(struct file *fp, struct knote *kn) { return (0); } /* ARGSUSED */ static int cryptof_stat( struct file *fp, struct stat *sb, struct ucred *active_cred, struct thread *td) { return (EOPNOTSUPP); } /* ARGSUSED */ static int cryptof_close(struct file *fp, struct thread *td) { struct fcrypt *fcr = fp->f_data; struct csession *cse; while ((cse = TAILQ_FIRST(&fcr->csessions))) { TAILQ_REMOVE(&fcr->csessions, cse, next); (void)csefree(cse); } FREE(fcr, M_XDATA); fp->f_data = NULL; return 0; } static struct csession * csefind(struct fcrypt *fcr, u_int ses) { struct csession *cse; TAILQ_FOREACH(cse, &fcr->csessions, next) if (cse->ses == ses) return (cse); return (NULL); } static int csedelete(struct fcrypt *fcr, struct csession *cse_del) { struct csession *cse; TAILQ_FOREACH(cse, &fcr->csessions, next) { if (cse == cse_del) { TAILQ_REMOVE(&fcr->csessions, cse, next); return (1); } } return (0); } static struct csession * cseadd(struct fcrypt *fcr, struct csession *cse) { TAILQ_INSERT_TAIL(&fcr->csessions, cse, next); cse->ses = fcr->sesn++; return (cse); } struct csession * csecreate(struct fcrypt *fcr, u_int64_t sid, caddr_t key, u_int64_t keylen, caddr_t mackey, u_int64_t mackeylen, u_int32_t cipher, u_int32_t mac, struct enc_xform *txform, struct auth_hash *thash) { struct csession *cse; #ifdef INVARIANTS /* NB: required when mtx_init is built with INVARIANTS */ MALLOC(cse, struct csession *, sizeof(struct csession), M_XDATA, M_NOWAIT | M_ZERO); #else MALLOC(cse, struct csession *, sizeof(struct csession), M_XDATA, M_NOWAIT); #endif if (cse == NULL) return NULL; mtx_init(&cse->lock, "cryptodev", "crypto session lock", MTX_DEF); cse->key = key; cse->keylen = keylen/8; cse->mackey = mackey; cse->mackeylen = mackeylen/8; cse->sid = sid; cse->cipher = cipher; cse->mac = mac; cse->txform = txform; cse->thash = thash; cseadd(fcr, cse); return (cse); } static int csefree(struct csession *cse) { int error; error = crypto_freesession(cse->sid); mtx_destroy(&cse->lock); if (cse->key) FREE(cse->key, M_XDATA); if (cse->mackey) FREE(cse->mackey, M_XDATA); FREE(cse, M_XDATA); return (error); } static int cryptoopen(struct cdev *dev, int oflags, int devtype, struct thread *td) { return (0); } static int cryptoread(struct cdev *dev, struct uio *uio, int ioflag) { return (EIO); 
} static int cryptowrite(struct cdev *dev, struct uio *uio, int ioflag) { return (EIO); } static int cryptoioctl(struct cdev *dev, u_long cmd, caddr_t data, int flag, struct thread *td) { struct file *f; struct fcrypt *fcr; int fd, error; switch (cmd) { case CRIOGET: MALLOC(fcr, struct fcrypt *, sizeof(struct fcrypt), M_XDATA, M_WAITOK); TAILQ_INIT(&fcr->csessions); fcr->sesn = 0; error = falloc(td, &f, &fd); if (error) { FREE(fcr, M_XDATA); return (error); } /* falloc automatically provides an extra reference to 'f'. */ - FILE_LOCK(f); - f->f_flag = FREAD | FWRITE; - f->f_type = DTYPE_CRYPTO; - f->f_data = fcr; - f->f_ops = &cryptofops; - FILE_UNLOCK(f); + finit(f, FREAD | FWRITE, DTYPE_CRYPTO, fcr, &cryptofops); *(u_int32_t *)data = fd; fdrop(f, td); break; case CRIOFINDDEV: error = cryptodev_find((struct crypt_find_op *)data); break; case CRIOASYMFEAT: error = crypto_getfeat((int *)data); break; default: error = EINVAL; break; } return (error); } static struct cdevsw crypto_cdevsw = { .d_version = D_VERSION, .d_flags = D_NEEDGIANT, .d_open = cryptoopen, .d_read = cryptoread, .d_write = cryptowrite, .d_ioctl = cryptoioctl, .d_name = "crypto", }; static struct cdev *crypto_dev; /* * Initialization code, both for static and dynamic loading. */ static int cryptodev_modevent(module_t mod, int type, void *unused) { switch (type) { case MOD_LOAD: if (bootverbose) printf("crypto: \n"); crypto_dev = make_dev(&crypto_cdevsw, 0, UID_ROOT, GID_WHEEL, 0666, "crypto"); return 0; case MOD_UNLOAD: /*XXX disallow if active sessions */ destroy_dev(crypto_dev); return 0; } return EINVAL; } static moduledata_t cryptodev_mod = { "cryptodev", cryptodev_modevent, 0 }; MODULE_VERSION(cryptodev, 1); DECLARE_MODULE(cryptodev, cryptodev_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); MODULE_DEPEND(cryptodev, crypto, 1, 1, 1); MODULE_DEPEND(cryptodev, zlib, 1, 1, 1); Index: head/sys/sys/file.h =================================================================== --- head/sys/sys/file.h (revision 174987) +++ head/sys/sys/file.h (revision 174988) @@ -1,311 +1,282 @@ /*- * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)file.h 8.3 (Berkeley) 1/9/95 * $FreeBSD$ */ #ifndef _SYS_FILE_H_ #define _SYS_FILE_H_ #ifndef _KERNEL #include /* XXX */ #include #include #else #include #include #include struct stat; struct thread; struct uio; struct knote; struct vnode; struct socket; #endif /* _KERNEL */ #define DTYPE_VNODE 1 /* file */ #define DTYPE_SOCKET 2 /* communications endpoint */ #define DTYPE_PIPE 3 /* pipe */ #define DTYPE_FIFO 4 /* fifo (named pipe) */ #define DTYPE_KQUEUE 5 /* event queue */ #define DTYPE_CRYPTO 6 /* crypto */ #define DTYPE_MQUEUE 7 /* posix message queue */ #ifdef _KERNEL struct file; struct ucred; typedef int fo_rdwr_t(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td); #define FOF_OFFSET 1 /* Use the offset in uio argument */ typedef int fo_ioctl_t(struct file *fp, u_long com, void *data, struct ucred *active_cred, struct thread *td); typedef int fo_poll_t(struct file *fp, int events, struct ucred *active_cred, struct thread *td); typedef int fo_kqfilter_t(struct file *fp, struct knote *kn); typedef int fo_stat_t(struct file *fp, struct stat *sb, struct ucred *active_cred, struct thread *td); typedef int fo_close_t(struct file *fp, struct thread *td); typedef int fo_flags_t; struct fileops { fo_rdwr_t *fo_read; fo_rdwr_t *fo_write; fo_ioctl_t *fo_ioctl; fo_poll_t *fo_poll; fo_kqfilter_t *fo_kqfilter; fo_stat_t *fo_stat; fo_close_t *fo_close; fo_flags_t fo_flags; /* DFLAG_* below */ }; #define DFLAG_PASSABLE 0x01 /* may be passed via unix sockets. */ #define DFLAG_SEEKABLE 0x02 /* seekable / nonsequential */ /* * Kernel descriptor table. * One entry for each open kernel vnode and socket. * * Below is the list of locks that protects members in struct file. * - * (fl) filelist_lock - * (f) f_mtx in struct file + * (f) protected with mtx_lock(mtx_pool_find(fp)) * none not locked */ struct file { - LIST_ENTRY(file) f_list;/* (fl) list of active files */ - short f_type; /* descriptor type */ - void *f_data; /* file descriptor specific data */ - u_int f_flag; /* see fcntl.h */ - struct mtx *f_mtxp; /* mutex to protect data */ - struct fileops *f_ops; /* File operations */ - struct ucred *f_cred; /* credentials associated with descriptor */ - int f_count; /* (f) reference count */ - struct vnode *f_vnode; /* NULL or applicable vnode */ - - /* DFLAG_SEEKABLE specific fields */ - off_t f_offset; - short f_vnread_flags; /* - * (f) home grown sleep lock for f_offset - * Used only for shared vnode locking in - * vnread() - */ -#define FOFFSET_LOCKED 0x1 -#define FOFFSET_LOCK_WAITING 0x2 - /* DTYPE_SOCKET specific fields */ - short f_gcflag; /* used by thread doing fd garbage collection */ -#define FMARK 0x1 /* mark during gc() */ -#define FDEFER 0x2 /* defer for next gc pass */ -#define FWAIT 0x4 /* gc is scanning message buffers */ - int f_msgcount; /* (f) references from message queue */ - - /* DTYPE_VNODE specific fields */ - int f_seqcount; /* - * count of sequential accesses -- cleared - * by most seek operations. 
- */ - off_t f_nextoff; /* - * offset of next expected read or write - */ - void *f_label; /* Place-holder for struct label pointer. */ + void *f_data; /* file descriptor specific data */ + struct fileops *f_ops; /* File operations */ + struct ucred *f_cred; /* associated credentials. */ + struct vnode *f_vnode; /* NULL or applicable vnode */ + short f_type; /* descriptor type */ + short f_vnread_flags; /* (f) Sleep lock for f_offset */ + volatile u_int f_flag; /* see fcntl.h */ + volatile int f_count; /* reference count */ + /* + * DTYPE_VNODE specific fields. + */ + int f_seqcount; /* Count of sequential accesses. */ + off_t f_nextoff; /* next expected read/write offset. */ + /* + * DFLAG_SEEKABLE specific fields + */ + off_t f_offset; + /* + * Mandatory Access control information. + */ + void *f_label; /* Place-holder for MAC label. */ }; +#define FOFFSET_LOCKED 0x1 +#define FOFFSET_LOCK_WAITING 0x2 + #endif /* _KERNEL */ /* * Userland version of struct file, for sysctl */ struct xfile { size_t xf_size; /* size of struct xfile */ pid_t xf_pid; /* owning process */ uid_t xf_uid; /* effective uid of owning process */ int xf_fd; /* descriptor number */ void *xf_file; /* address of struct file */ short xf_type; /* descriptor type */ int xf_count; /* reference count */ int xf_msgcount; /* references from message queue */ off_t xf_offset; /* file offset */ void *xf_data; /* file descriptor specific data */ void *xf_vnode; /* vnode pointer */ u_int xf_flag; /* flags (see fcntl.h) */ }; #ifdef _KERNEL #ifdef MALLOC_DECLARE MALLOC_DECLARE(M_FILE); #endif -LIST_HEAD(filelist, file); -extern struct filelist filehead; /* (fl) head of list of open files */ extern struct fileops vnops; extern struct fileops badfileops; extern struct fileops socketops; extern int maxfiles; /* kernel limit on number of open files */ extern int maxfilesperproc; /* per process limit on number of open files */ -extern int openfiles; /* (fl) actual number of open files */ -extern struct sx filelist_lock; /* sx to protect filelist and openfiles */ +extern volatile int openfiles; /* actual number of open files */ int fget(struct thread *td, int fd, struct file **fpp); int fget_read(struct thread *td, int fd, struct file **fpp); int fget_write(struct thread *td, int fd, struct file **fpp); -int fdrop(struct file *fp, struct thread *td); +int _fdrop(struct file *fp, struct thread *td); /* * The socket operations are used a couple of places. * XXX: This is wrong, they should go through the operations vector for * XXX: sockets instead of going directly for the individual functions. /phk */ fo_rdwr_t soo_read; fo_rdwr_t soo_write; fo_ioctl_t soo_ioctl; fo_poll_t soo_poll; fo_kqfilter_t soo_kqfilter; fo_stat_t soo_stat; fo_close_t soo_close; -/* Lock a file. 
*/ -#define FILE_LOCK(f) mtx_lock((f)->f_mtxp) -#define FILE_UNLOCK(f) mtx_unlock((f)->f_mtxp) -#define FILE_LOCKED(f) mtx_owned((f)->f_mtxp) -#define FILE_LOCK_ASSERT(f, type) mtx_assert((f)->f_mtxp, (type)) - +void finit(struct file *, u_int, short, void *, struct fileops *); int fgetvp(struct thread *td, int fd, struct vnode **vpp); int fgetvp_read(struct thread *td, int fd, struct vnode **vpp); int fgetvp_write(struct thread *td, int fd, struct vnode **vpp); int fgetsock(struct thread *td, int fd, struct socket **spp, u_int *fflagp); void fputsock(struct socket *sp); -#define fhold_locked(fp) \ - do { \ - FILE_LOCK_ASSERT(fp, MA_OWNED); \ - (fp)->f_count++; \ - } while (0) - -#define fhold(fp) \ - do { \ - FILE_LOCK(fp); \ - (fp)->f_count++; \ - FILE_UNLOCK(fp); \ - } while (0) +#define fhold(fp) atomic_add_int(&(fp)->f_count, 1) +#define fdrop(fp, td) \ + (atomic_fetchadd_int(&(fp)->f_count, -1) <= 1 ? _fdrop((fp), (td)) : 0) static __inline fo_rdwr_t fo_read; static __inline fo_rdwr_t fo_write; static __inline fo_ioctl_t fo_ioctl; static __inline fo_poll_t fo_poll; static __inline fo_kqfilter_t fo_kqfilter; static __inline fo_stat_t fo_stat; static __inline fo_close_t fo_close; static __inline int fo_read(fp, uio, active_cred, flags, td) struct file *fp; struct uio *uio; struct ucred *active_cred; int flags; struct thread *td; { return ((*fp->f_ops->fo_read)(fp, uio, active_cred, flags, td)); } static __inline int fo_write(fp, uio, active_cred, flags, td) struct file *fp; struct uio *uio; struct ucred *active_cred; int flags; struct thread *td; { return ((*fp->f_ops->fo_write)(fp, uio, active_cred, flags, td)); } static __inline int fo_ioctl(fp, com, data, active_cred, td) struct file *fp; u_long com; void *data; struct ucred *active_cred; struct thread *td; { return ((*fp->f_ops->fo_ioctl)(fp, com, data, active_cred, td)); } static __inline int fo_poll(fp, events, active_cred, td) struct file *fp; int events; struct ucred *active_cred; struct thread *td; { return ((*fp->f_ops->fo_poll)(fp, events, active_cred, td)); } static __inline int fo_stat(fp, sb, active_cred, td) struct file *fp; struct stat *sb; struct ucred *active_cred; struct thread *td; { return ((*fp->f_ops->fo_stat)(fp, sb, active_cred, td)); } static __inline int fo_close(fp, td) struct file *fp; struct thread *td; { return ((*fp->f_ops->fo_close)(fp, td)); } static __inline int fo_kqfilter(fp, kn) struct file *fp; struct knote *kn; { return ((*fp->f_ops->fo_kqfilter)(fp, kn)); } #endif /* _KERNEL */ #endif /* !SYS_FILE_H */ Index: head/sys/sys/unpcb.h =================================================================== --- head/sys/sys/unpcb.h (revision 174987) +++ head/sys/sys/unpcb.h (revision 174988) @@ -1,141 +1,148 @@ /*- * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)unpcb.h 8.1 (Berkeley) 6/2/93 * $FreeBSD$ */ #ifndef _SYS_UNPCB_H_ #define _SYS_UNPCB_H_ #include #include /* * Protocol control block for an active * instance of a UNIX internal protocol. * * A socket may be associated with a vnode in the * filesystem. If so, the unp_vnode pointer holds * a reference count to this vnode, which should be irele'd * when the socket goes away. * * A socket may be connected to another socket, in which * case the control block of the socket to which it is connected * is given by unp_conn. * * A socket may be referenced by a number of sockets (e.g. several * sockets may be connected to a datagram socket.) These sockets * are in a linked list starting with unp_refs, linked through * unp_nextref and null-terminated. Note that a socket may be referenced * by a number of other sockets and may also reference a socket (not * necessarily one which is referencing it). This generates * the need for unp_refs and unp_nextref to be separate fields. * * Stream sockets keep copies of receive sockbuf sb_cc and sb_mbcnt * so that changes in the sockbuf may be computed to modify * back pressure on the sender accordingly. */ typedef u_quad_t unp_gen_t; LIST_HEAD(unp_head, unpcb); struct unpcb { LIST_ENTRY(unpcb) unp_link; /* glue on list of all PCBs */ struct socket *unp_socket; /* pointer back to socket */ + struct file *unp_file; /* back-pointer to file for gc. */ struct vnode *unp_vnode; /* if associated with file */ ino_t unp_ino; /* fake inode number */ struct unpcb *unp_conn; /* control block of connected socket */ struct unp_head unp_refs; /* referencing socket linked list */ LIST_ENTRY(unpcb) unp_reflink; /* link in unp_refs list */ struct sockaddr_un *unp_addr; /* bound address of socket */ int unp_cc; /* copy of rcv.sb_cc */ int unp_mbcnt; /* copy of rcv.sb_mbcnt */ unp_gen_t unp_gencnt; /* generation count of this instance */ - int unp_flags; /* flags */ + short unp_flags; /* flags */ + short unp_gcflag; /* Garbage collector flags. */ struct xucred unp_peercred; /* peer credentials, if applicable */ u_int unp_refcount; + u_int unp_msgcount; /* references from message queue */ struct mtx unp_mtx; /* mutex */ }; /* * Flags in unp_flags. * * UNP_HAVEPC - indicates that the unp_peercred member is filled in * and is really the credentials of the connected peer. This is used * to determine whether the contents should be sent to the user or * not. * * UNP_HAVEPCCACHED - indicates that the unp_peercred member is filled * in, but does *not* contain the credentials of the connected peer * (there may not even be a peer). This is set in unp_listen() when * it fills in unp_peercred for later consumption by unp_connect(). 
*/ #define UNP_HAVEPC 0x001 #define UNP_HAVEPCCACHED 0x002 #define UNP_WANTCRED 0x004 /* credentials wanted */ #define UNP_CONNWAIT 0x008 /* connect blocks until accepted */ + +#define UNPGC_REF 0x1 /* unpcb has external ref. */ +#define UNPGC_DEAD 0x2 /* unpcb might be dead. */ +#define UNPGC_SCANNED 0x4 /* Has been scanned. */ /* * These flags are used to handle non-atomicity in connect() and bind() * operations on a socket: in particular, to avoid races between multiple * threads or processes operating simultaneously on the same socket. */ #define UNP_CONNECTING 0x010 /* Currently connecting. */ #define UNP_BINDING 0x020 /* Currently binding. */ #define sotounpcb(so) ((struct unpcb *)((so)->so_pcb)) /* Hack alert -- this structure depends on . */ #ifdef _SYS_SOCKETVAR_H_ struct xunpcb { size_t xu_len; /* length of this structure */ struct unpcb *xu_unpp; /* to help netstat, fstat */ struct unpcb xu_unp; /* our information */ union { struct sockaddr_un xuu_addr; /* our bound address */ char xu_dummy1[256]; } xu_au; #define xu_addr xu_au.xuu_addr union { struct sockaddr_un xuu_caddr; /* their bound address */ char xu_dummy2[256]; } xu_cau; #define xu_caddr xu_cau.xuu_caddr struct xsocket xu_socket; u_quad_t xu_alignment_hack; }; struct xunpgen { size_t xug_len; u_int xug_count; unp_gen_t xug_gen; so_gen_t xug_sogen; }; #endif /* _SYS_SOCKETVAR_H_ */ #endif /* _SYS_UNPCB_H_ */
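The fdrop() macro that replaces the old FILE_LOCK()-protected reference counting in sys/file.h works because atomic_fetchadd_int() returns the value the counter held *before* the decrement, so exactly one thread (the one that observed a count of 1) falls through into _fdrop() and tears the file down; fhold() becomes an unconditional atomic increment. Below is a minimal user-space model of that protocol, using C11 <stdatomic.h> in place of the kernel atomic_*() primitives; struct xfile_model, file_hold() and file_drop() are invented names used only for this illustration, not kernel interfaces.

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

/* Stand-in for struct file; only the reference count matters here. */
struct xfile_model {
	atomic_int	 f_count;
	void		*f_data;
};

/* Model of fhold(): an unconditional atomic increment, no lock required. */
static void
file_hold(struct xfile_model *fp)
{

	atomic_fetch_add(&fp->f_count, 1);
}

/*
 * Model of fdrop(): atomic_fetch_sub() returns the pre-decrement value, so a
 * result of 1 (the kernel macro tests "<= 1") means this caller released the
 * last reference and is the only thread allowed to free the object.
 */
static void
file_drop(struct xfile_model *fp)
{

	if (atomic_fetch_sub(&fp->f_count, 1) <= 1) {
		free(fp->f_data);
		free(fp);
		printf("last reference dropped, file freed\n");
	}
}

int
main(void)
{
	struct xfile_model *fp;

	fp = calloc(1, sizeof(*fp));
	atomic_init(&fp->f_count, 1);	/* reference owned by the creator */
	fp->f_data = calloc(1, 16);

	file_hold(fp);			/* e.g. a second descriptor-table slot */
	file_drop(fp);			/* 2 -> 1, object survives */
	file_drop(fp);			/* 1 -> 0, object is freed */
	return (0);
}

Because the last-reference decision is made from the value returned by the decrement itself, callers of fdrop() never need a per-file mutex to learn whether they were last, which is what allows f_mtxp and the FILE_LOCK() macros to disappear from struct file above.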
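On the unpcb.h side, the new unp_file back-pointer, unp_msgcount field and UNPGC_* flags carry the state the UNIX-domain garbage collector previously kept in the now-removed f_msgcount and f_gcflag members of struct file. The program below is a deliberately simplified user-space model of how mark flags of this kind drive a mark-and-scan pass over sockets whose only references are in-flight messages; it is not the kernel's unp_gc() implementation, and everything except the UNPGC_* constant names (struct pcb_model, struct msg_model, gc_model()) is invented for the illustration.

#include <stdio.h>

#define UNPGC_REF	0x1	/* reachable from a descriptor (a GC root) */
#define UNPGC_DEAD	0x2	/* so far only reachable via queued messages */
#define UNPGC_SCANNED	0x4	/* message queue already walked this pass */

struct pcb_model {
	unsigned	flags;
	unsigned	refcount;	/* all references to this object */
	unsigned	msgcount;	/* references held by in-flight messages */
};

/* A message queued on 'holder' that carries a reference to 'target'. */
struct msg_model {
	unsigned	holder;
	unsigned	target;
};

static void
gc_model(struct pcb_model *p, unsigned np, const struct msg_model *m, unsigned nm)
{
	unsigned i, j, progress;

	/*
	 * Mark: anything whose references all come from queued messages is
	 * provisionally dead; anything with an external reference is a root.
	 */
	for (i = 0; i < np; i++) {
		p[i].flags &= ~(UNPGC_REF | UNPGC_DEAD | UNPGC_SCANNED);
		p[i].flags |= (p[i].refcount == p[i].msgcount) ?
		    UNPGC_DEAD : UNPGC_REF;
	}

	/*
	 * Scan: walk the message queue of every reachable, not-yet-scanned
	 * object and rescue whatever its messages reference.  Repeat until a
	 * pass scans nothing new; UNPGC_SCANNED is what bounds the loop.
	 */
	do {
		progress = 0;
		for (i = 0; i < np; i++) {
			if (!(p[i].flags & UNPGC_REF) ||
			    (p[i].flags & UNPGC_SCANNED))
				continue;
			p[i].flags |= UNPGC_SCANNED;
			progress = 1;
			for (j = 0; j < nm; j++) {
				if (m[j].holder != i)
					continue;
				p[m[j].target].flags &= ~UNPGC_DEAD;
				p[m[j].target].flags |= UNPGC_REF;
			}
		}
	} while (progress);
}

int
main(void)
{
	struct pcb_model pcbs[3] = {
		{ 0, 1, 0 },	/* open descriptor, nothing in flight */
		{ 0, 1, 1 },	/* only reference is a message on pcbs[0] */
		{ 0, 1, 1 },	/* only reference is a message on itself */
	};
	struct msg_model msgs[2] = {
		{ 0, 1 },	/* message on pcbs[0] carrying pcbs[1] */
		{ 2, 2 },	/* message on pcbs[2] carrying pcbs[2] */
	};
	unsigned i;

	gc_model(pcbs, 3, msgs, 2);
	for (i = 0; i < 3; i++)
		printf("pcb %u: %s\n", i,
		    (pcbs[i].flags & UNPGC_DEAD) ? "garbage" : "live");
	return (0);
}

In this toy run pcbs[1] is rescued because a reachable socket's queue carries it, while pcbs[2], whose only reference is a message queued on itself, stays marked UNPGC_DEAD and would be swept as an unreachable cycle.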