Index: stable/11/sys/compat/cloudabi/cloudabi_sock.c =================================================================== --- stable/11/sys/compat/cloudabi/cloudabi_sock.c (revision 315311) +++ stable/11/sys/compat/cloudabi/cloudabi_sock.c (revision 315312) @@ -1,252 +1,252 @@ /*- * Copyright (c) 2015 Nuxi, https://nuxi.nl/ * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* Converts FreeBSD's struct sockaddr to CloudABI's cloudabi_sockaddr_t. */ void cloudabi_convert_sockaddr(const struct sockaddr *sa, socklen_t sal, cloudabi_sockaddr_t *rsa) { const struct sockaddr_in *sin; const struct sockaddr_in6 *sin6; /* Zero-sized socket address. */ if (sal < offsetof(struct sockaddr, sa_family) + sizeof(sa->sa_family)) return; switch (sa->sa_family) { case AF_INET: if (sal < sizeof(struct sockaddr_in)) return; sin = (const struct sockaddr_in *)sa; rsa->sa_family = CLOUDABI_AF_INET; memcpy(&rsa->sa_inet.addr, &sin->sin_addr, sizeof(rsa->sa_inet.addr)); rsa->sa_inet.port = ntohs(sin->sin_port); return; case AF_INET6: if (sal < sizeof(struct sockaddr_in6)) return; sin6 = (const struct sockaddr_in6 *)sa; rsa->sa_family = CLOUDABI_AF_INET6; memcpy(&rsa->sa_inet6.addr, &sin6->sin6_addr, sizeof(rsa->sa_inet6.addr)); rsa->sa_inet6.port = ntohs(sin6->sin6_port); return; case AF_UNIX: rsa->sa_family = CLOUDABI_AF_UNIX; return; } } /* Copies a pathname into a UNIX socket address structure. */ static int copyin_sockaddr_un(const char *path, size_t pathlen, struct sockaddr_un *sun) { int error; /* Copy in pathname string if there's enough space. */ if (pathlen >= sizeof(sun->sun_path)) return (ENAMETOOLONG); error = copyin(path, &sun->sun_path, pathlen); if (error != 0) return (error); if (memchr(sun->sun_path, '\0', pathlen) != NULL) return (EINVAL); /* Initialize the rest of the socket address. */ sun->sun_path[pathlen] = '\0'; sun->sun_family = AF_UNIX; sun->sun_len = sizeof(*sun); return (0); } int cloudabi_sys_sock_accept(struct thread *td, struct cloudabi_sys_sock_accept_args *uap) { struct sockaddr *sa; cloudabi_sockstat_t ss = {}; socklen_t sal; int error; if (uap->buf == NULL) { /* Only return the new file descriptor number. */ return (kern_accept(td, uap->sock, NULL, NULL, NULL)); } else { /* Also return properties of the new socket descriptor. */ sal = MAX(sizeof(struct sockaddr_in), sizeof(struct sockaddr_in6)); error = kern_accept(td, uap->sock, (void *)&sa, &sal, NULL); if (error != 0) return (error); /* TODO(ed): Fill the other members of cloudabi_sockstat_t. */ cloudabi_convert_sockaddr(sa, sal, &ss.ss_peername); free(sa, M_SONAME); return (copyout(&ss, uap->buf, sizeof(ss))); } } int cloudabi_sys_sock_bind(struct thread *td, struct cloudabi_sys_sock_bind_args *uap) { struct sockaddr_un sun; int error; error = copyin_sockaddr_un(uap->path, uap->pathlen, &sun); if (error != 0) return (error); return (kern_bindat(td, uap->fd, uap->sock, (struct sockaddr *)&sun)); } int cloudabi_sys_sock_connect(struct thread *td, struct cloudabi_sys_sock_connect_args *uap) { struct sockaddr_un sun; int error; error = copyin_sockaddr_un(uap->path, uap->pathlen, &sun); if (error != 0) return (error); return (kern_connectat(td, uap->fd, uap->sock, (struct sockaddr *)&sun)); } int cloudabi_sys_sock_listen(struct thread *td, struct cloudabi_sys_sock_listen_args *uap) { struct listen_args listen_args = { .s = uap->sock, .backlog = uap->backlog, }; return (sys_listen(td, &listen_args)); } int cloudabi_sys_sock_shutdown(struct thread *td, struct cloudabi_sys_sock_shutdown_args *uap) { struct shutdown_args shutdown_args = { .s = uap->sock, }; switch (uap->how) { case CLOUDABI_SHUT_RD: shutdown_args.how = SHUT_RD; break; case CLOUDABI_SHUT_WR: shutdown_args.how = SHUT_WR; break; case CLOUDABI_SHUT_RD | CLOUDABI_SHUT_WR: shutdown_args.how = SHUT_RDWR; break; default: return (EINVAL); } return (sys_shutdown(td, &shutdown_args)); } int cloudabi_sys_sock_stat_get(struct thread *td, struct cloudabi_sys_sock_stat_get_args *uap) { cloudabi_sockstat_t ss = {}; cap_rights_t rights; struct file *fp; struct sockaddr *sa; struct socket *so; int error; error = getsock_cap(td, uap->sock, cap_rights_init(&rights, - CAP_GETSOCKOPT, CAP_GETPEERNAME, CAP_GETSOCKNAME), &fp, NULL); + CAP_GETSOCKOPT, CAP_GETPEERNAME, CAP_GETSOCKNAME), &fp, NULL, NULL); if (error != 0) return (error); so = fp->f_data; CURVNET_SET(so->so_vnet); /* Set ss_sockname. */ error = so->so_proto->pr_usrreqs->pru_sockaddr(so, &sa); if (error == 0) { cloudabi_convert_sockaddr(sa, sa->sa_len, &ss.ss_sockname); free(sa, M_SONAME); } /* Set ss_peername. */ if ((so->so_state & (SS_ISCONNECTED | SS_ISCONFIRMING)) != 0) { error = so->so_proto->pr_usrreqs->pru_peeraddr(so, &sa); if (error == 0) { cloudabi_convert_sockaddr(sa, sa->sa_len, &ss.ss_peername); free(sa, M_SONAME); } } CURVNET_RESTORE(); /* Set ss_error. */ SOCK_LOCK(so); ss.ss_error = cloudabi_convert_errno(so->so_error); if ((uap->flags & CLOUDABI_SOCKSTAT_CLEAR_ERROR) != 0) so->so_error = 0; SOCK_UNLOCK(so); /* Set ss_state. */ if ((so->so_options & SO_ACCEPTCONN) != 0) ss.ss_state |= CLOUDABI_SOCKSTATE_ACCEPTCONN; fdrop(fp, td); return (copyout(&ss, uap->buf, sizeof(ss))); } Index: stable/11/sys/compat/linux/linux_socket.c =================================================================== --- stable/11/sys/compat/linux/linux_socket.c (revision 315311) +++ stable/11/sys/compat/linux/linux_socket.c (revision 315312) @@ -1,1789 +1,1789 @@ /*- * Copyright (c) 1995 Søren Schmidt * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer * in this position and unchanged. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* XXX we use functions that might not exist. */ #include "opt_compat.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #include #endif #ifdef COMPAT_LINUX32 #include #include #else #include #include #endif #include #include #include #include static int linux_to_bsd_domain(int); static int linux_sendmsg_common(struct thread *, l_int, struct l_msghdr *, l_uint); static int linux_recvmsg_common(struct thread *, l_int, struct l_msghdr *, l_uint, struct msghdr *); static int linux_set_socket_flags(int, int *); /* * Reads a linux sockaddr and does any necessary translation. * Linux sockaddrs don't have a length field, only a family. * Copy the osockaddr structure pointed to by osa to kernel, adjust * family and convert to sockaddr. */ static int linux_getsockaddr(struct sockaddr **sap, const struct osockaddr *osa, int salen) { struct sockaddr *sa; struct osockaddr *kosa; #ifdef INET6 struct sockaddr_in6 *sin6; int oldv6size; #endif char *name; int bdom, error, hdrlen, namelen; if (salen < 2 || salen > UCHAR_MAX || !osa) return (EINVAL); #ifdef INET6 oldv6size = 0; /* * Check for old (pre-RFC2553) sockaddr_in6. We may accept it * if it's a v4-mapped address, so reserve the proper space * for it. */ if (salen == sizeof(struct sockaddr_in6) - sizeof(uint32_t)) { salen += sizeof(uint32_t); oldv6size = 1; } #endif kosa = malloc(salen, M_SONAME, M_WAITOK); if ((error = copyin(osa, kosa, salen))) goto out; bdom = linux_to_bsd_domain(kosa->sa_family); if (bdom == -1) { error = EAFNOSUPPORT; goto out; } #ifdef INET6 /* * Older Linux IPv6 code uses obsolete RFC2133 struct sockaddr_in6, * which lacks the scope id compared with RFC2553 one. If we detect * the situation, reject the address and write a message to system log. * * Still accept addresses for which the scope id is not used. */ if (oldv6size) { if (bdom == AF_INET6) { sin6 = (struct sockaddr_in6 *)kosa; if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr) || (!IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr) && !IN6_IS_ADDR_SITELOCAL(&sin6->sin6_addr) && !IN6_IS_ADDR_V4COMPAT(&sin6->sin6_addr) && !IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr) && !IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr))) { sin6->sin6_scope_id = 0; } else { log(LOG_DEBUG, "obsolete pre-RFC2553 sockaddr_in6 rejected\n"); error = EINVAL; goto out; } } else salen -= sizeof(uint32_t); } #endif if (bdom == AF_INET) { if (salen < sizeof(struct sockaddr_in)) { error = EINVAL; goto out; } salen = sizeof(struct sockaddr_in); } if (bdom == AF_LOCAL && salen > sizeof(struct sockaddr_un)) { hdrlen = offsetof(struct sockaddr_un, sun_path); name = ((struct sockaddr_un *)kosa)->sun_path; if (*name == '\0') { /* * Linux abstract namespace starts with a NULL byte. * XXX We do not support abstract namespace yet. */ namelen = strnlen(name + 1, salen - hdrlen - 1) + 1; } else namelen = strnlen(name, salen - hdrlen); salen = hdrlen + namelen; if (salen > sizeof(struct sockaddr_un)) { error = ENAMETOOLONG; goto out; } } sa = (struct sockaddr *)kosa; sa->sa_family = bdom; sa->sa_len = salen; *sap = sa; return (0); out: free(kosa, M_SONAME); return (error); } static int linux_to_bsd_domain(int domain) { switch (domain) { case LINUX_AF_UNSPEC: return (AF_UNSPEC); case LINUX_AF_UNIX: return (AF_LOCAL); case LINUX_AF_INET: return (AF_INET); case LINUX_AF_INET6: return (AF_INET6); case LINUX_AF_AX25: return (AF_CCITT); case LINUX_AF_IPX: return (AF_IPX); case LINUX_AF_APPLETALK: return (AF_APPLETALK); } return (-1); } static int bsd_to_linux_domain(int domain) { switch (domain) { case AF_UNSPEC: return (LINUX_AF_UNSPEC); case AF_LOCAL: return (LINUX_AF_UNIX); case AF_INET: return (LINUX_AF_INET); case AF_INET6: return (LINUX_AF_INET6); case AF_CCITT: return (LINUX_AF_AX25); case AF_IPX: return (LINUX_AF_IPX); case AF_APPLETALK: return (LINUX_AF_APPLETALK); } return (-1); } static int linux_to_bsd_sockopt_level(int level) { switch (level) { case LINUX_SOL_SOCKET: return (SOL_SOCKET); } return (level); } static int bsd_to_linux_sockopt_level(int level) { switch (level) { case SOL_SOCKET: return (LINUX_SOL_SOCKET); } return (level); } static int linux_to_bsd_ip_sockopt(int opt) { switch (opt) { case LINUX_IP_TOS: return (IP_TOS); case LINUX_IP_TTL: return (IP_TTL); case LINUX_IP_OPTIONS: return (IP_OPTIONS); case LINUX_IP_MULTICAST_IF: return (IP_MULTICAST_IF); case LINUX_IP_MULTICAST_TTL: return (IP_MULTICAST_TTL); case LINUX_IP_MULTICAST_LOOP: return (IP_MULTICAST_LOOP); case LINUX_IP_ADD_MEMBERSHIP: return (IP_ADD_MEMBERSHIP); case LINUX_IP_DROP_MEMBERSHIP: return (IP_DROP_MEMBERSHIP); case LINUX_IP_HDRINCL: return (IP_HDRINCL); } return (-1); } static int linux_to_bsd_ip6_sockopt(int opt) { switch (opt) { case LINUX_IPV6_NEXTHOP: return (IPV6_NEXTHOP); case LINUX_IPV6_UNICAST_HOPS: return (IPV6_UNICAST_HOPS); case LINUX_IPV6_MULTICAST_IF: return (IPV6_MULTICAST_IF); case LINUX_IPV6_MULTICAST_HOPS: return (IPV6_MULTICAST_HOPS); case LINUX_IPV6_MULTICAST_LOOP: return (IPV6_MULTICAST_LOOP); case LINUX_IPV6_ADD_MEMBERSHIP: return (IPV6_JOIN_GROUP); case LINUX_IPV6_DROP_MEMBERSHIP: return (IPV6_LEAVE_GROUP); case LINUX_IPV6_V6ONLY: return (IPV6_V6ONLY); case LINUX_IPV6_DONTFRAG: return (IPV6_DONTFRAG); #if 0 case LINUX_IPV6_CHECKSUM: return (IPV6_CHECKSUM); case LINUX_IPV6_RECVPKTINFO: return (IPV6_RECVPKTINFO); case LINUX_IPV6_PKTINFO: return (IPV6_PKTINFO); case LINUX_IPV6_RECVHOPLIMIT: return (IPV6_RECVHOPLIMIT); case LINUX_IPV6_HOPLIMIT: return (IPV6_HOPLIMIT); case LINUX_IPV6_RECVHOPOPTS: return (IPV6_RECVHOPOPTS); case LINUX_IPV6_HOPOPTS: return (IPV6_HOPOPTS); case LINUX_IPV6_RTHDRDSTOPTS: return (IPV6_RTHDRDSTOPTS); case LINUX_IPV6_RECVRTHDR: return (IPV6_RECVRTHDR); case LINUX_IPV6_RTHDR: return (IPV6_RTHDR); case LINUX_IPV6_RECVDSTOPTS: return (IPV6_RECVDSTOPTS); case LINUX_IPV6_DSTOPTS: return (IPV6_DSTOPTS); case LINUX_IPV6_RECVPATHMTU: return (IPV6_RECVPATHMTU); case LINUX_IPV6_PATHMTU: return (IPV6_PATHMTU); #endif } return (-1); } static int linux_to_bsd_so_sockopt(int opt) { switch (opt) { case LINUX_SO_DEBUG: return (SO_DEBUG); case LINUX_SO_REUSEADDR: return (SO_REUSEADDR); case LINUX_SO_TYPE: return (SO_TYPE); case LINUX_SO_ERROR: return (SO_ERROR); case LINUX_SO_DONTROUTE: return (SO_DONTROUTE); case LINUX_SO_BROADCAST: return (SO_BROADCAST); case LINUX_SO_SNDBUF: return (SO_SNDBUF); case LINUX_SO_RCVBUF: return (SO_RCVBUF); case LINUX_SO_KEEPALIVE: return (SO_KEEPALIVE); case LINUX_SO_OOBINLINE: return (SO_OOBINLINE); case LINUX_SO_LINGER: return (SO_LINGER); case LINUX_SO_PEERCRED: return (LOCAL_PEERCRED); case LINUX_SO_RCVLOWAT: return (SO_RCVLOWAT); case LINUX_SO_SNDLOWAT: return (SO_SNDLOWAT); case LINUX_SO_RCVTIMEO: return (SO_RCVTIMEO); case LINUX_SO_SNDTIMEO: return (SO_SNDTIMEO); case LINUX_SO_TIMESTAMP: return (SO_TIMESTAMP); case LINUX_SO_ACCEPTCONN: return (SO_ACCEPTCONN); } return (-1); } static int linux_to_bsd_tcp_sockopt(int opt) { switch (opt) { case LINUX_TCP_NODELAY: return (TCP_NODELAY); case LINUX_TCP_MAXSEG: return (TCP_MAXSEG); case LINUX_TCP_KEEPIDLE: return (TCP_KEEPIDLE); case LINUX_TCP_KEEPINTVL: return (TCP_KEEPINTVL); case LINUX_TCP_KEEPCNT: return (TCP_KEEPCNT); case LINUX_TCP_MD5SIG: return (TCP_MD5SIG); } return (-1); } static int linux_to_bsd_msg_flags(int flags) { int ret_flags = 0; if (flags & LINUX_MSG_OOB) ret_flags |= MSG_OOB; if (flags & LINUX_MSG_PEEK) ret_flags |= MSG_PEEK; if (flags & LINUX_MSG_DONTROUTE) ret_flags |= MSG_DONTROUTE; if (flags & LINUX_MSG_CTRUNC) ret_flags |= MSG_CTRUNC; if (flags & LINUX_MSG_TRUNC) ret_flags |= MSG_TRUNC; if (flags & LINUX_MSG_DONTWAIT) ret_flags |= MSG_DONTWAIT; if (flags & LINUX_MSG_EOR) ret_flags |= MSG_EOR; if (flags & LINUX_MSG_WAITALL) ret_flags |= MSG_WAITALL; if (flags & LINUX_MSG_NOSIGNAL) ret_flags |= MSG_NOSIGNAL; #if 0 /* not handled */ if (flags & LINUX_MSG_PROXY) ; if (flags & LINUX_MSG_FIN) ; if (flags & LINUX_MSG_SYN) ; if (flags & LINUX_MSG_CONFIRM) ; if (flags & LINUX_MSG_RST) ; if (flags & LINUX_MSG_ERRQUEUE) ; #endif return (ret_flags); } /* * If bsd_to_linux_sockaddr() or linux_to_bsd_sockaddr() faults, then the * native syscall will fault. Thus, we don't really need to check the * return values for these functions. */ static int bsd_to_linux_sockaddr(struct sockaddr *arg) { struct sockaddr sa; size_t sa_len = sizeof(struct sockaddr); int error, bdom; if ((error = copyin(arg, &sa, sa_len))) return (error); bdom = bsd_to_linux_domain(sa.sa_family); if (bdom == -1) return (EAFNOSUPPORT); *(u_short *)&sa = bdom; return (copyout(&sa, arg, sa_len)); } static int linux_to_bsd_sockaddr(struct sockaddr *arg, int len) { struct sockaddr sa; size_t sa_len = sizeof(struct sockaddr); int error, bdom; if ((error = copyin(arg, &sa, sa_len))) return (error); bdom = linux_to_bsd_domain(*(sa_family_t *)&sa); if (bdom == -1) return (EAFNOSUPPORT); sa.sa_family = bdom; sa.sa_len = len; return (copyout(&sa, arg, sa_len)); } static int linux_sa_put(struct osockaddr *osa) { struct osockaddr sa; int error, bdom; /* * Only read/write the osockaddr family part, the rest is * not changed. */ error = copyin(osa, &sa, sizeof(sa.sa_family)); if (error) return (error); bdom = bsd_to_linux_domain(sa.sa_family); if (bdom == -1) return (EINVAL); sa.sa_family = bdom; return (copyout(&sa, osa, sizeof(sa.sa_family))); } static int linux_to_bsd_cmsg_type(int cmsg_type) { switch (cmsg_type) { case LINUX_SCM_RIGHTS: return (SCM_RIGHTS); case LINUX_SCM_CREDENTIALS: return (SCM_CREDS); } return (-1); } static int bsd_to_linux_cmsg_type(int cmsg_type) { switch (cmsg_type) { case SCM_RIGHTS: return (LINUX_SCM_RIGHTS); case SCM_CREDS: return (LINUX_SCM_CREDENTIALS); case SCM_TIMESTAMP: return (LINUX_SCM_TIMESTAMP); } return (-1); } static int linux_to_bsd_msghdr(struct msghdr *bhdr, const struct l_msghdr *lhdr) { if (lhdr->msg_controllen > INT_MAX) return (ENOBUFS); bhdr->msg_name = PTRIN(lhdr->msg_name); bhdr->msg_namelen = lhdr->msg_namelen; bhdr->msg_iov = PTRIN(lhdr->msg_iov); bhdr->msg_iovlen = lhdr->msg_iovlen; bhdr->msg_control = PTRIN(lhdr->msg_control); /* * msg_controllen is skipped since BSD and LINUX control messages * are potentially different sizes (e.g. the cred structure used * by SCM_CREDS is different between the two operating system). * * The caller can set it (if necessary) after converting all the * control messages. */ bhdr->msg_flags = linux_to_bsd_msg_flags(lhdr->msg_flags); return (0); } static int bsd_to_linux_msghdr(const struct msghdr *bhdr, struct l_msghdr *lhdr) { lhdr->msg_name = PTROUT(bhdr->msg_name); lhdr->msg_namelen = bhdr->msg_namelen; lhdr->msg_iov = PTROUT(bhdr->msg_iov); lhdr->msg_iovlen = bhdr->msg_iovlen; lhdr->msg_control = PTROUT(bhdr->msg_control); /* * msg_controllen is skipped since BSD and LINUX control messages * are potentially different sizes (e.g. the cred structure used * by SCM_CREDS is different between the two operating system). * * The caller can set it (if necessary) after converting all the * control messages. */ /* msg_flags skipped */ return (0); } static int linux_set_socket_flags(int lflags, int *flags) { if (lflags & ~(LINUX_SOCK_CLOEXEC | LINUX_SOCK_NONBLOCK)) return (EINVAL); if (lflags & LINUX_SOCK_NONBLOCK) *flags |= SOCK_NONBLOCK; if (lflags & LINUX_SOCK_CLOEXEC) *flags |= SOCK_CLOEXEC; return (0); } static int linux_sendit(struct thread *td, int s, struct msghdr *mp, int flags, struct mbuf *control, enum uio_seg segflg) { struct sockaddr *to; int error; if (mp->msg_name != NULL) { error = linux_getsockaddr(&to, mp->msg_name, mp->msg_namelen); if (error) return (error); mp->msg_name = to; } else to = NULL; error = kern_sendit(td, s, mp, linux_to_bsd_msg_flags(flags), control, segflg); if (to) free(to, M_SONAME); return (error); } /* Return 0 if IP_HDRINCL is set for the given socket. */ static int linux_check_hdrincl(struct thread *td, int s) { int error, optval; socklen_t size_val; size_val = sizeof(optval); error = kern_getsockopt(td, s, IPPROTO_IP, IP_HDRINCL, &optval, UIO_SYSSPACE, &size_val); if (error) return (error); return (optval == 0); } /* * Updated sendto() when IP_HDRINCL is set: * tweak endian-dependent fields in the IP packet. */ static int linux_sendto_hdrincl(struct thread *td, struct linux_sendto_args *linux_args) { /* * linux_ip_copysize defines how many bytes we should copy * from the beginning of the IP packet before we customize it for BSD. * It should include all the fields we modify (ip_len and ip_off). */ #define linux_ip_copysize 8 struct ip *packet; struct msghdr msg; struct iovec aiov[1]; int error; /* Check that the packet isn't too big or too small. */ if (linux_args->len < linux_ip_copysize || linux_args->len > IP_MAXPACKET) return (EINVAL); packet = (struct ip *)malloc(linux_args->len, M_LINUX, M_WAITOK); /* Make kernel copy of the packet to be sent */ if ((error = copyin(PTRIN(linux_args->msg), packet, linux_args->len))) goto goout; /* Convert fields from Linux to BSD raw IP socket format */ packet->ip_len = linux_args->len; packet->ip_off = ntohs(packet->ip_off); /* Prepare the msghdr and iovec structures describing the new packet */ msg.msg_name = PTRIN(linux_args->to); msg.msg_namelen = linux_args->tolen; msg.msg_iov = aiov; msg.msg_iovlen = 1; msg.msg_control = NULL; msg.msg_flags = 0; aiov[0].iov_base = (char *)packet; aiov[0].iov_len = linux_args->len; error = linux_sendit(td, linux_args->s, &msg, linux_args->flags, NULL, UIO_SYSSPACE); goout: free(packet, M_LINUX); return (error); } int linux_socket(struct thread *td, struct linux_socket_args *args) { struct socket_args /* { int domain; int type; int protocol; } */ bsd_args; int retval_socket; bsd_args.protocol = args->protocol; bsd_args.type = args->type & LINUX_SOCK_TYPE_MASK; if (bsd_args.type < 0 || bsd_args.type > LINUX_SOCK_MAX) return (EINVAL); retval_socket = linux_set_socket_flags(args->type & ~LINUX_SOCK_TYPE_MASK, &bsd_args.type); if (retval_socket != 0) return (retval_socket); bsd_args.domain = linux_to_bsd_domain(args->domain); if (bsd_args.domain == -1) return (EAFNOSUPPORT); retval_socket = sys_socket(td, &bsd_args); if (retval_socket) return (retval_socket); if (bsd_args.type == SOCK_RAW && (bsd_args.protocol == IPPROTO_RAW || bsd_args.protocol == 0) && bsd_args.domain == PF_INET) { /* It's a raw IP socket: set the IP_HDRINCL option. */ int hdrincl; hdrincl = 1; /* We ignore any error returned by kern_setsockopt() */ kern_setsockopt(td, td->td_retval[0], IPPROTO_IP, IP_HDRINCL, &hdrincl, UIO_SYSSPACE, sizeof(hdrincl)); } #ifdef INET6 /* * Linux AF_INET6 socket has IPV6_V6ONLY setsockopt set to 0 by default * and some apps depend on this. So, set V6ONLY to 0 for Linux apps. * For simplicity we do this unconditionally of the net.inet6.ip6.v6only * sysctl value. */ if (bsd_args.domain == PF_INET6) { int v6only; v6only = 0; /* We ignore any error returned by setsockopt() */ kern_setsockopt(td, td->td_retval[0], IPPROTO_IPV6, IPV6_V6ONLY, &v6only, UIO_SYSSPACE, sizeof(v6only)); } #endif return (retval_socket); } int linux_bind(struct thread *td, struct linux_bind_args *args) { struct sockaddr *sa; int error; error = linux_getsockaddr(&sa, PTRIN(args->name), args->namelen); if (error) return (error); error = kern_bindat(td, AT_FDCWD, args->s, sa); free(sa, M_SONAME); if (error == EADDRNOTAVAIL && args->namelen != sizeof(struct sockaddr_in)) return (EINVAL); return (error); } int linux_connect(struct thread *td, struct linux_connect_args *args) { cap_rights_t rights; struct socket *so; struct sockaddr *sa; + struct file *fp; u_int fflag; int error; error = linux_getsockaddr(&sa, (struct osockaddr *)PTRIN(args->name), args->namelen); if (error) return (error); error = kern_connectat(td, AT_FDCWD, args->s, sa); free(sa, M_SONAME); if (error != EISCONN) return (error); /* * Linux doesn't return EISCONN the first time it occurs, * when on a non-blocking socket. Instead it returns the * error getsockopt(SOL_SOCKET, SO_ERROR) would return on BSD. - * - * XXXRW: Instead of using fgetsock(), check that it is a - * socket and use the file descriptor reference instead of - * creating a new one. */ - error = fgetsock(td, args->s, cap_rights_init(&rights, CAP_CONNECT), - &so, &fflag); - if (error == 0) { - error = EISCONN; - if (fflag & FNONBLOCK) { - SOCK_LOCK(so); - if (so->so_emuldata == 0) - error = so->so_error; - so->so_emuldata = (void *)1; - SOCK_UNLOCK(so); - } - fputsock(so); + error = getsock_cap(td, args->s, cap_rights_init(&rights, CAP_CONNECT), + &fp, &fflag, NULL); + if (error != 0) + return (error); + + error = EISCONN; + so = fp->f_data; + if (fflag & FNONBLOCK) { + SOCK_LOCK(so); + if (so->so_emuldata == 0) + error = so->so_error; + so->so_emuldata = (void *)1; + SOCK_UNLOCK(so); } + fdrop(fp, td); + return (error); } int linux_listen(struct thread *td, struct linux_listen_args *args) { struct listen_args /* { int s; int backlog; } */ bsd_args; bsd_args.s = args->s; bsd_args.backlog = args->backlog; return (sys_listen(td, &bsd_args)); } static int linux_accept_common(struct thread *td, int s, l_uintptr_t addr, l_uintptr_t namelen, int flags) { struct accept4_args /* { int s; struct sockaddr * __restrict name; socklen_t * __restrict anamelen; int flags; } */ bsd_args; cap_rights_t rights; struct socket *so; struct file *fp; int error, error1; bsd_args.s = s; /* XXX: */ bsd_args.name = (struct sockaddr * __restrict)PTRIN(addr); bsd_args.anamelen = PTRIN(namelen);/* XXX */ bsd_args.flags = 0; error = linux_set_socket_flags(flags, &bsd_args.flags); if (error != 0) return (error); error = sys_accept4(td, &bsd_args); bsd_to_linux_sockaddr((struct sockaddr *)bsd_args.name); if (error) { if (error == EFAULT && namelen != sizeof(struct sockaddr_in)) return (EINVAL); if (error == EINVAL) { - error1 = getsock_cap(td, s, &rights, &fp, NULL); + error1 = getsock_cap(td, s, &rights, &fp, NULL, NULL); if (error1 != 0) return (error1); so = fp->f_data; if (so->so_type == SOCK_DGRAM) { fdrop(fp, td); return (EOPNOTSUPP); } fdrop(fp, td); } return (error); } if (addr) error = linux_sa_put(PTRIN(addr)); if (error) { (void)kern_close(td, td->td_retval[0]); td->td_retval[0] = 0; } return (error); } int linux_accept(struct thread *td, struct linux_accept_args *args) { return (linux_accept_common(td, args->s, args->addr, args->namelen, 0)); } int linux_accept4(struct thread *td, struct linux_accept4_args *args) { return (linux_accept_common(td, args->s, args->addr, args->namelen, args->flags)); } int linux_getsockname(struct thread *td, struct linux_getsockname_args *args) { struct getsockname_args /* { int fdes; struct sockaddr * __restrict asa; socklen_t * __restrict alen; } */ bsd_args; int error; bsd_args.fdes = args->s; /* XXX: */ bsd_args.asa = (struct sockaddr * __restrict)PTRIN(args->addr); bsd_args.alen = PTRIN(args->namelen); /* XXX */ error = sys_getsockname(td, &bsd_args); bsd_to_linux_sockaddr((struct sockaddr *)bsd_args.asa); if (error) return (error); return (linux_sa_put(PTRIN(args->addr))); } int linux_getpeername(struct thread *td, struct linux_getpeername_args *args) { struct getpeername_args /* { int fdes; caddr_t asa; int *alen; } */ bsd_args; int error; bsd_args.fdes = args->s; bsd_args.asa = (struct sockaddr *)PTRIN(args->addr); bsd_args.alen = (socklen_t *)PTRIN(args->namelen); error = sys_getpeername(td, &bsd_args); bsd_to_linux_sockaddr((struct sockaddr *)bsd_args.asa); if (error) return (error); return (linux_sa_put(PTRIN(args->addr))); } int linux_socketpair(struct thread *td, struct linux_socketpair_args *args) { struct socketpair_args /* { int domain; int type; int protocol; int *rsv; } */ bsd_args; int error; bsd_args.domain = linux_to_bsd_domain(args->domain); if (bsd_args.domain != PF_LOCAL) return (EAFNOSUPPORT); bsd_args.type = args->type & LINUX_SOCK_TYPE_MASK; if (bsd_args.type < 0 || bsd_args.type > LINUX_SOCK_MAX) return (EINVAL); error = linux_set_socket_flags(args->type & ~LINUX_SOCK_TYPE_MASK, &bsd_args.type); if (error != 0) return (error); if (args->protocol != 0 && args->protocol != PF_UNIX) /* * Use of PF_UNIX as protocol argument is not right, * but Linux does it. * Do not map PF_UNIX as its Linux value is identical * to FreeBSD one. */ return (EPROTONOSUPPORT); else bsd_args.protocol = 0; bsd_args.rsv = (int *)PTRIN(args->rsv); return (sys_socketpair(td, &bsd_args)); } #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32)) struct linux_send_args { register_t s; register_t msg; register_t len; register_t flags; }; static int linux_send(struct thread *td, struct linux_send_args *args) { struct sendto_args /* { int s; caddr_t buf; int len; int flags; caddr_t to; int tolen; } */ bsd_args; bsd_args.s = args->s; bsd_args.buf = (caddr_t)PTRIN(args->msg); bsd_args.len = args->len; bsd_args.flags = args->flags; bsd_args.to = NULL; bsd_args.tolen = 0; return (sys_sendto(td, &bsd_args)); } struct linux_recv_args { register_t s; register_t msg; register_t len; register_t flags; }; static int linux_recv(struct thread *td, struct linux_recv_args *args) { struct recvfrom_args /* { int s; caddr_t buf; int len; int flags; struct sockaddr *from; socklen_t fromlenaddr; } */ bsd_args; bsd_args.s = args->s; bsd_args.buf = (caddr_t)PTRIN(args->msg); bsd_args.len = args->len; bsd_args.flags = linux_to_bsd_msg_flags(args->flags); bsd_args.from = NULL; bsd_args.fromlenaddr = 0; return (sys_recvfrom(td, &bsd_args)); } #endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */ int linux_sendto(struct thread *td, struct linux_sendto_args *args) { struct msghdr msg; struct iovec aiov; if (linux_check_hdrincl(td, args->s) == 0) /* IP_HDRINCL set, tweak the packet before sending */ return (linux_sendto_hdrincl(td, args)); msg.msg_name = PTRIN(args->to); msg.msg_namelen = args->tolen; msg.msg_iov = &aiov; msg.msg_iovlen = 1; msg.msg_control = NULL; msg.msg_flags = 0; aiov.iov_base = PTRIN(args->msg); aiov.iov_len = args->len; return (linux_sendit(td, args->s, &msg, args->flags, NULL, UIO_USERSPACE)); } int linux_recvfrom(struct thread *td, struct linux_recvfrom_args *args) { struct msghdr msg; struct iovec aiov; int error, fromlen; if (PTRIN(args->fromlen) != NULL) { error = copyin(PTRIN(args->fromlen), &fromlen, sizeof(fromlen)); if (error != 0) return (error); if (fromlen < 0) return (EINVAL); msg.msg_namelen = fromlen; } else msg.msg_namelen = 0; msg.msg_name = (struct sockaddr * __restrict)PTRIN(args->from); msg.msg_iov = &aiov; msg.msg_iovlen = 1; aiov.iov_base = PTRIN(args->buf); aiov.iov_len = args->len; msg.msg_control = 0; msg.msg_flags = linux_to_bsd_msg_flags(args->flags); error = kern_recvit(td, args->s, &msg, UIO_USERSPACE, NULL); if (error != 0) return (error); if (PTRIN(args->from) != NULL) { error = bsd_to_linux_sockaddr((struct sockaddr *) PTRIN(args->from)); if (error != 0) return (error); error = linux_sa_put((struct osockaddr *) PTRIN(args->from)); } if (PTRIN(args->fromlen) != NULL) error = copyout(&msg.msg_namelen, PTRIN(args->fromlen), sizeof(msg.msg_namelen)); return (error); } static int linux_sendmsg_common(struct thread *td, l_int s, struct l_msghdr *msghdr, l_uint flags) { struct cmsghdr *cmsg; struct cmsgcred cmcred; struct mbuf *control; struct msghdr msg; struct l_cmsghdr linux_cmsg; struct l_cmsghdr *ptr_cmsg; struct l_msghdr linux_msg; struct iovec *iov; socklen_t datalen; struct sockaddr *sa; sa_family_t sa_family; void *data; int error; error = copyin(msghdr, &linux_msg, sizeof(linux_msg)); if (error != 0) return (error); /* * Some Linux applications (ping) define a non-NULL control data * pointer, but a msg_controllen of 0, which is not allowed in the * FreeBSD system call interface. NULL the msg_control pointer in * order to handle this case. This should be checked, but allows the * Linux ping to work. */ if (PTRIN(linux_msg.msg_control) != NULL && linux_msg.msg_controllen == 0) linux_msg.msg_control = PTROUT(NULL); error = linux_to_bsd_msghdr(&msg, &linux_msg); if (error != 0) return (error); #ifdef COMPAT_LINUX32 error = linux32_copyiniov(PTRIN(msg.msg_iov), msg.msg_iovlen, &iov, EMSGSIZE); #else error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE); #endif if (error != 0) return (error); control = NULL; cmsg = NULL; if ((ptr_cmsg = LINUX_CMSG_FIRSTHDR(&linux_msg)) != NULL) { error = kern_getsockname(td, s, &sa, &datalen); if (error != 0) goto bad; sa_family = sa->sa_family; free(sa, M_SONAME); error = ENOBUFS; cmsg = malloc(CMSG_HDRSZ, M_LINUX, M_WAITOK|M_ZERO); control = m_get(M_WAITOK, MT_CONTROL); do { error = copyin(ptr_cmsg, &linux_cmsg, sizeof(struct l_cmsghdr)); if (error != 0) goto bad; error = EINVAL; if (linux_cmsg.cmsg_len < sizeof(struct l_cmsghdr)) goto bad; /* * Now we support only SCM_RIGHTS and SCM_CRED, * so return EINVAL in any other cmsg_type */ cmsg->cmsg_type = linux_to_bsd_cmsg_type(linux_cmsg.cmsg_type); cmsg->cmsg_level = linux_to_bsd_sockopt_level(linux_cmsg.cmsg_level); if (cmsg->cmsg_type == -1 || cmsg->cmsg_level != SOL_SOCKET) goto bad; /* * Some applications (e.g. pulseaudio) attempt to * send ancillary data even if the underlying protocol * doesn't support it which is not allowed in the * FreeBSD system call interface. */ if (sa_family != AF_UNIX) continue; data = LINUX_CMSG_DATA(ptr_cmsg); datalen = linux_cmsg.cmsg_len - L_CMSG_HDRSZ; switch (cmsg->cmsg_type) { case SCM_RIGHTS: break; case SCM_CREDS: data = &cmcred; datalen = sizeof(cmcred); /* * The lower levels will fill in the structure */ bzero(data, datalen); break; } cmsg->cmsg_len = CMSG_LEN(datalen); error = ENOBUFS; if (!m_append(control, CMSG_HDRSZ, (c_caddr_t)cmsg)) goto bad; if (!m_append(control, datalen, (c_caddr_t)data)) goto bad; } while ((ptr_cmsg = LINUX_CMSG_NXTHDR(&linux_msg, ptr_cmsg))); if (m_length(control, NULL) == 0) { m_freem(control); control = NULL; } } msg.msg_iov = iov; msg.msg_flags = 0; error = linux_sendit(td, s, &msg, flags, control, UIO_USERSPACE); control = NULL; bad: m_freem(control); free(iov, M_IOV); if (cmsg) free(cmsg, M_LINUX); return (error); } int linux_sendmsg(struct thread *td, struct linux_sendmsg_args *args) { return (linux_sendmsg_common(td, args->s, PTRIN(args->msg), args->flags)); } int linux_sendmmsg(struct thread *td, struct linux_sendmmsg_args *args) { struct l_mmsghdr *msg; l_uint retval; int error, datagrams; if (args->vlen > UIO_MAXIOV) args->vlen = UIO_MAXIOV; msg = PTRIN(args->msg); datagrams = 0; while (datagrams < args->vlen) { error = linux_sendmsg_common(td, args->s, &msg->msg_hdr, args->flags); if (error != 0) break; retval = td->td_retval[0]; error = copyout(&retval, &msg->msg_len, sizeof(msg->msg_len)); if (error != 0) break; ++msg; ++datagrams; } if (error == 0) td->td_retval[0] = datagrams; return (error); } static int linux_recvmsg_common(struct thread *td, l_int s, struct l_msghdr *msghdr, l_uint flags, struct msghdr *msg) { struct cmsghdr *cm; struct cmsgcred *cmcred; struct l_cmsghdr *linux_cmsg = NULL; struct l_ucred linux_ucred; socklen_t datalen, outlen; struct l_msghdr linux_msg; struct iovec *iov, *uiov; struct mbuf *control = NULL; struct mbuf **controlp; struct timeval *ftmvl; l_timeval ltmvl; caddr_t outbuf; void *data; int error, i, fd, fds, *fdp; error = copyin(msghdr, &linux_msg, sizeof(linux_msg)); if (error != 0) return (error); error = linux_to_bsd_msghdr(msg, &linux_msg); if (error != 0) return (error); #ifdef COMPAT_LINUX32 error = linux32_copyiniov(PTRIN(msg->msg_iov), msg->msg_iovlen, &iov, EMSGSIZE); #else error = copyiniov(msg->msg_iov, msg->msg_iovlen, &iov, EMSGSIZE); #endif if (error != 0) return (error); if (msg->msg_name) { error = linux_to_bsd_sockaddr((struct sockaddr *)msg->msg_name, msg->msg_namelen); if (error != 0) goto bad; } uiov = msg->msg_iov; msg->msg_iov = iov; controlp = (msg->msg_control != NULL) ? &control : NULL; error = kern_recvit(td, s, msg, UIO_USERSPACE, controlp); msg->msg_iov = uiov; if (error != 0) goto bad; error = bsd_to_linux_msghdr(msg, &linux_msg); if (error != 0) goto bad; if (linux_msg.msg_name) { error = bsd_to_linux_sockaddr((struct sockaddr *) PTRIN(linux_msg.msg_name)); if (error != 0) goto bad; } if (linux_msg.msg_name && linux_msg.msg_namelen > 2) { error = linux_sa_put(PTRIN(linux_msg.msg_name)); if (error != 0) goto bad; } outbuf = PTRIN(linux_msg.msg_control); outlen = 0; if (control) { linux_cmsg = malloc(L_CMSG_HDRSZ, M_LINUX, M_WAITOK | M_ZERO); msg->msg_control = mtod(control, struct cmsghdr *); msg->msg_controllen = control->m_len; cm = CMSG_FIRSTHDR(msg); while (cm != NULL) { linux_cmsg->cmsg_type = bsd_to_linux_cmsg_type(cm->cmsg_type); linux_cmsg->cmsg_level = bsd_to_linux_sockopt_level(cm->cmsg_level); if (linux_cmsg->cmsg_type == -1 || cm->cmsg_level != SOL_SOCKET) { error = EINVAL; goto bad; } data = CMSG_DATA(cm); datalen = (caddr_t)cm + cm->cmsg_len - (caddr_t)data; switch (cm->cmsg_type) { case SCM_RIGHTS: if (flags & LINUX_MSG_CMSG_CLOEXEC) { fds = datalen / sizeof(int); fdp = data; for (i = 0; i < fds; i++) { fd = *fdp++; (void)kern_fcntl(td, fd, F_SETFD, FD_CLOEXEC); } } break; case SCM_CREDS: /* * Currently LOCAL_CREDS is never in * effect for Linux so no need to worry * about sockcred */ if (datalen != sizeof(*cmcred)) { error = EMSGSIZE; goto bad; } cmcred = (struct cmsgcred *)data; bzero(&linux_ucred, sizeof(linux_ucred)); linux_ucred.pid = cmcred->cmcred_pid; linux_ucred.uid = cmcred->cmcred_uid; linux_ucred.gid = cmcred->cmcred_gid; data = &linux_ucred; datalen = sizeof(linux_ucred); break; case SCM_TIMESTAMP: if (datalen != sizeof(struct timeval)) { error = EMSGSIZE; goto bad; } ftmvl = (struct timeval *)data; ltmvl.tv_sec = ftmvl->tv_sec; ltmvl.tv_usec = ftmvl->tv_usec; data = <mvl; datalen = sizeof(ltmvl); break; } if (outlen + LINUX_CMSG_LEN(datalen) > linux_msg.msg_controllen) { if (outlen == 0) { error = EMSGSIZE; goto bad; } else { linux_msg.msg_flags |= LINUX_MSG_CTRUNC; goto out; } } linux_cmsg->cmsg_len = LINUX_CMSG_LEN(datalen); error = copyout(linux_cmsg, outbuf, L_CMSG_HDRSZ); if (error) goto bad; outbuf += L_CMSG_HDRSZ; error = copyout(data, outbuf, datalen); if (error) goto bad; outbuf += LINUX_CMSG_ALIGN(datalen); outlen += LINUX_CMSG_LEN(datalen); cm = CMSG_NXTHDR(msg, cm); } } out: linux_msg.msg_controllen = outlen; error = copyout(&linux_msg, msghdr, sizeof(linux_msg)); bad: free(iov, M_IOV); m_freem(control); free(linux_cmsg, M_LINUX); return (error); } int linux_recvmsg(struct thread *td, struct linux_recvmsg_args *args) { struct msghdr bsd_msg; return (linux_recvmsg_common(td, args->s, PTRIN(args->msg), args->flags, &bsd_msg)); } int linux_recvmmsg(struct thread *td, struct linux_recvmmsg_args *args) { struct l_mmsghdr *msg; struct msghdr bsd_msg; struct l_timespec lts; struct timespec ts, tts; l_uint retval; int error, datagrams; if (args->timeout) { error = copyin(args->timeout, <s, sizeof(struct l_timespec)); if (error != 0) return (error); error = linux_to_native_timespec(&ts, <s); if (error != 0) return (error); getnanotime(&tts); timespecadd(&tts, &ts); } msg = PTRIN(args->msg); datagrams = 0; while (datagrams < args->vlen) { error = linux_recvmsg_common(td, args->s, &msg->msg_hdr, args->flags & ~LINUX_MSG_WAITFORONE, &bsd_msg); if (error != 0) break; retval = td->td_retval[0]; error = copyout(&retval, &msg->msg_len, sizeof(msg->msg_len)); if (error != 0) break; ++msg; ++datagrams; /* * MSG_WAITFORONE turns on MSG_DONTWAIT after one packet. */ if (args->flags & LINUX_MSG_WAITFORONE) args->flags |= LINUX_MSG_DONTWAIT; /* * See BUGS section of recvmmsg(2). */ if (args->timeout) { getnanotime(&ts); timespecsub(&ts, &tts); if (!timespecisset(&ts) || ts.tv_sec > 0) break; } /* Out of band data, return right away. */ if (bsd_msg.msg_flags & MSG_OOB) break; } if (error == 0) td->td_retval[0] = datagrams; return (error); } int linux_shutdown(struct thread *td, struct linux_shutdown_args *args) { struct shutdown_args /* { int s; int how; } */ bsd_args; bsd_args.s = args->s; bsd_args.how = args->how; return (sys_shutdown(td, &bsd_args)); } int linux_setsockopt(struct thread *td, struct linux_setsockopt_args *args) { struct setsockopt_args /* { int s; int level; int name; caddr_t val; int valsize; } */ bsd_args; l_timeval linux_tv; struct timeval tv; int error, name; bsd_args.s = args->s; bsd_args.level = linux_to_bsd_sockopt_level(args->level); switch (bsd_args.level) { case SOL_SOCKET: name = linux_to_bsd_so_sockopt(args->optname); switch (name) { case SO_RCVTIMEO: /* FALLTHROUGH */ case SO_SNDTIMEO: error = copyin(PTRIN(args->optval), &linux_tv, sizeof(linux_tv)); if (error) return (error); tv.tv_sec = linux_tv.tv_sec; tv.tv_usec = linux_tv.tv_usec; return (kern_setsockopt(td, args->s, bsd_args.level, name, &tv, UIO_SYSSPACE, sizeof(tv))); /* NOTREACHED */ break; default: break; } break; case IPPROTO_IP: name = linux_to_bsd_ip_sockopt(args->optname); break; case IPPROTO_IPV6: name = linux_to_bsd_ip6_sockopt(args->optname); break; case IPPROTO_TCP: name = linux_to_bsd_tcp_sockopt(args->optname); break; default: name = -1; break; } if (name == -1) return (ENOPROTOOPT); bsd_args.name = name; bsd_args.val = PTRIN(args->optval); bsd_args.valsize = args->optlen; if (name == IPV6_NEXTHOP) { linux_to_bsd_sockaddr((struct sockaddr *)bsd_args.val, bsd_args.valsize); error = sys_setsockopt(td, &bsd_args); bsd_to_linux_sockaddr((struct sockaddr *)bsd_args.val); } else error = sys_setsockopt(td, &bsd_args); return (error); } int linux_getsockopt(struct thread *td, struct linux_getsockopt_args *args) { struct getsockopt_args /* { int s; int level; int name; caddr_t val; int *avalsize; } */ bsd_args; l_timeval linux_tv; struct timeval tv; socklen_t tv_len, xulen, len; struct xucred xu; struct l_ucred lxu; int error, name, newval; bsd_args.s = args->s; bsd_args.level = linux_to_bsd_sockopt_level(args->level); switch (bsd_args.level) { case SOL_SOCKET: name = linux_to_bsd_so_sockopt(args->optname); switch (name) { case SO_RCVTIMEO: /* FALLTHROUGH */ case SO_SNDTIMEO: tv_len = sizeof(tv); error = kern_getsockopt(td, args->s, bsd_args.level, name, &tv, UIO_SYSSPACE, &tv_len); if (error) return (error); linux_tv.tv_sec = tv.tv_sec; linux_tv.tv_usec = tv.tv_usec; return (copyout(&linux_tv, PTRIN(args->optval), sizeof(linux_tv))); /* NOTREACHED */ break; case LOCAL_PEERCRED: if (args->optlen != sizeof(lxu)) return (EINVAL); xulen = sizeof(xu); error = kern_getsockopt(td, args->s, bsd_args.level, name, &xu, UIO_SYSSPACE, &xulen); if (error) return (error); /* * XXX Use 0 for pid as the FreeBSD does not cache peer pid. */ lxu.pid = 0; lxu.uid = xu.cr_uid; lxu.gid = xu.cr_gid; return (copyout(&lxu, PTRIN(args->optval), sizeof(lxu))); /* NOTREACHED */ break; case SO_ERROR: len = sizeof(newval); error = kern_getsockopt(td, args->s, bsd_args.level, name, &newval, UIO_SYSSPACE, &len); if (error) return (error); newval = -SV_ABI_ERRNO(td->td_proc, newval); return (copyout(&newval, PTRIN(args->optval), len)); /* NOTREACHED */ default: break; } break; case IPPROTO_IP: name = linux_to_bsd_ip_sockopt(args->optname); break; case IPPROTO_IPV6: name = linux_to_bsd_ip6_sockopt(args->optname); break; case IPPROTO_TCP: name = linux_to_bsd_tcp_sockopt(args->optname); break; default: name = -1; break; } if (name == -1) return (EINVAL); bsd_args.name = name; bsd_args.val = PTRIN(args->optval); bsd_args.avalsize = PTRIN(args->optlen); if (name == IPV6_NEXTHOP) { error = sys_getsockopt(td, &bsd_args); bsd_to_linux_sockaddr((struct sockaddr *)bsd_args.val); } else error = sys_getsockopt(td, &bsd_args); return (error); } #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32)) /* Argument list sizes for linux_socketcall */ static const unsigned char lxs_args_cnt[] = { 0 /* unused*/, 3 /* socket */, 3 /* bind */, 3 /* connect */, 2 /* listen */, 3 /* accept */, 3 /* getsockname */, 3 /* getpeername */, 4 /* socketpair */, 4 /* send */, 4 /* recv */, 6 /* sendto */, 6 /* recvfrom */, 2 /* shutdown */, 5 /* setsockopt */, 5 /* getsockopt */, 3 /* sendmsg */, 3 /* recvmsg */, 4 /* accept4 */, 5 /* recvmmsg */, 4 /* sendmmsg */ }; #define LINUX_ARGS_CNT (nitems(lxs_args_cnt) - 1) #define LINUX_ARG_SIZE(x) (lxs_args_cnt[x] * sizeof(l_ulong)) int linux_socketcall(struct thread *td, struct linux_socketcall_args *args) { l_ulong a[6]; #if defined(__amd64__) && defined(COMPAT_LINUX32) register_t l_args[6]; #endif void *arg; int error; if (args->what < LINUX_SOCKET || args->what > LINUX_ARGS_CNT) return (EINVAL); error = copyin(PTRIN(args->args), a, LINUX_ARG_SIZE(args->what)); if (error != 0) return (error); #if defined(__amd64__) && defined(COMPAT_LINUX32) for (int i = 0; i < lxs_args_cnt[args->what]; ++i) l_args[i] = a[i]; arg = l_args; #else arg = a; #endif switch (args->what) { case LINUX_SOCKET: return (linux_socket(td, arg)); case LINUX_BIND: return (linux_bind(td, arg)); case LINUX_CONNECT: return (linux_connect(td, arg)); case LINUX_LISTEN: return (linux_listen(td, arg)); case LINUX_ACCEPT: return (linux_accept(td, arg)); case LINUX_GETSOCKNAME: return (linux_getsockname(td, arg)); case LINUX_GETPEERNAME: return (linux_getpeername(td, arg)); case LINUX_SOCKETPAIR: return (linux_socketpair(td, arg)); case LINUX_SEND: return (linux_send(td, arg)); case LINUX_RECV: return (linux_recv(td, arg)); case LINUX_SENDTO: return (linux_sendto(td, arg)); case LINUX_RECVFROM: return (linux_recvfrom(td, arg)); case LINUX_SHUTDOWN: return (linux_shutdown(td, arg)); case LINUX_SETSOCKOPT: return (linux_setsockopt(td, arg)); case LINUX_GETSOCKOPT: return (linux_getsockopt(td, arg)); case LINUX_SENDMSG: return (linux_sendmsg(td, arg)); case LINUX_RECVMSG: return (linux_recvmsg(td, arg)); case LINUX_ACCEPT4: return (linux_accept4(td, arg)); case LINUX_RECVMMSG: return (linux_recvmmsg(td, arg)); case LINUX_SENDMMSG: return (linux_sendmmsg(td, arg)); } uprintf("LINUX: 'socket' typ=%d not implemented\n", args->what); return (ENOSYS); } #endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */ Index: stable/11/sys/dev/iscsi_initiator/isc_soc.c =================================================================== --- stable/11/sys/dev/iscsi_initiator/isc_soc.c (revision 315311) +++ stable/11/sys/dev/iscsi_initiator/isc_soc.c (revision 315312) @@ -1,702 +1,701 @@ /*- * Copyright (c) 2005-2010 Daniel Braniss * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ /* | $Id: isc_soc.c 998 2009-12-20 10:32:45Z danny $ */ #include __FBSDID("$FreeBSD$"); #include "opt_iscsi_initiator.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifndef NO_USE_MBUF #define USE_MBUF #endif #ifdef USE_MBUF static int ou_refcnt = 0; /* | function for freeing external storage for mbuf */ static void ext_free(struct mbuf *m, void *a, void *b) { pduq_t *pq = b; if(pq->buf != NULL) { debug(3, "ou_refcnt=%d a=%p b=%p", ou_refcnt, a, pq->buf); free(pq->buf, M_ISCSIBUF); pq->buf = NULL; } } int isc_sendPDU(isc_session_t *sp, pduq_t *pq) { struct mbuf *mh, **mp; pdu_t *pp = &pq->pdu; int len, error; debug_called(8); /* | mbuf for the iSCSI header */ MGETHDR(mh, M_WAITOK, MT_DATA); mh->m_pkthdr.rcvif = NULL; mh->m_next = NULL; mh->m_len = sizeof(union ipdu_u); if(ISOK2DIG(sp->hdrDigest, pp)) { pp->hdr_dig = sp->hdrDigest(&pp->ipdu, sizeof(union ipdu_u), 0); mh->m_len += sizeof(pp->hdr_dig); if(pp->ahs_len) { debug(2, "ahs_len=%d", pp->ahs_len); pp->hdr_dig = sp->hdrDigest(&pp->ahs_addr, pp->ahs_len, pp->hdr_dig); } debug(3, "pp->hdr_dig=%04x", htonl(pp->hdr_dig)); } if(pp->ahs_len) { /* | Add any AHS to the iSCSI hdr mbuf */ if((mh->m_len + pp->ahs_len) < MHLEN) { M_ALIGN(mh, mh->m_len + pp->ahs_len); bcopy(&pp->ipdu, mh->m_data, mh->m_len); bcopy(pp->ahs_addr, mh->m_data + mh->m_len, pp->ahs_len); mh->m_len += pp->ahs_len; } else panic("len AHS=%d too big, not impleneted yet", pp->ahs_len); } else { M_ALIGN(mh, mh->m_len); bcopy(&pp->ipdu, mh->m_data, mh->m_len); } mh->m_pkthdr.len = mh->m_len; mp = &mh->m_next; if(pp->ds_len && pq->pdu.ds_addr) { struct mbuf *md; int off = 0; len = pp->ds_len; while(len > 0) { int l; MGET(md, M_WAITOK, MT_DATA); md->m_ext.ext_cnt = &ou_refcnt; l = min(MCLBYTES, len); debug(4, "setting ext_free(arg=%p len/l=%d/%d)", pq->buf, len, l); MEXTADD(md, pp->ds_addr + off, l, ext_free, #if __FreeBSD_version >= 800000 pp->ds_addr + off, #endif pq, 0, EXT_EXTREF); md->m_len = l; md->m_next = NULL; mh->m_pkthdr.len += l; *mp = md; mp = &md->m_next; len -= l; off += l; } if(((pp->ds_len & 03) != 0) || ISOK2DIG(sp->dataDigest, pp)) { MGET(md, M_WAITOK, MT_DATA); if(pp->ds_len & 03) len = 4 - (pp->ds_len & 03); else len = 0; md->m_len = len; if(ISOK2DIG(sp->dataDigest, pp)) md->m_len += sizeof(pp->ds_dig); M_ALIGN(md, md->m_len); if(ISOK2DIG(sp->dataDigest, pp)) { pp->ds_dig = sp->dataDigest(pp->ds_addr, pp->ds_len, 0); if(len) { bzero(md->m_data, len); // RFC says SHOULD be 0 pp->ds_dig = sp->dataDigest(md->m_data, len, pp->ds_dig); } bcopy(&pp->ds_dig, md->m_data+len, sizeof(pp->ds_dig)); } md->m_next = NULL; mh->m_pkthdr.len += md->m_len; *mp = md; } } if((error = sosend(sp->soc, NULL, NULL, mh, 0, 0, sp->td)) != 0) { sdebug(2, "error=%d", error); return error; } sp->stats.nsent++; getbintime(&sp->stats.t_sent); return 0; } #else /* NO_USE_MBUF */ int isc_sendPDU(isc_session_t *sp, pduq_t *pq) { struct uio *uio = &pq->uio; struct iovec *iv; pdu_t *pp = &pq->pdu; int len, error; debug_called(8); bzero(uio, sizeof(struct uio)); uio->uio_rw = UIO_WRITE; uio->uio_segflg = UIO_SYSSPACE; uio->uio_td = sp->td; uio->uio_iov = iv = pq->iov; iv->iov_base = &pp->ipdu; iv->iov_len = sizeof(union ipdu_u); uio->uio_resid = iv->iov_len; iv++; if(ISOK2DIG(sp->hdrDigest, pp)) pq->pdu.hdr_dig = sp->hdrDigest(&pp->ipdu, sizeof(union ipdu_u), 0); if(pp->ahs_len) { iv->iov_base = pp->ahs_addr; iv->iov_len = pp->ahs_len; uio->uio_resid += iv->iov_len; iv++; if(ISOK2DIG(sp->hdrDigest, pp)) pp->hdr_dig = sp->hdrDigest(&pp->ahs_addr, pp->ahs_len, pp->hdr_dig); } if(ISOK2DIG(sp->hdrDigest, pp)) { debug(3, "hdr_dig=%04x", htonl(pp->hdr_dig)); iv->iov_base = &pp->hdr_dig; iv->iov_len = sizeof(int); uio->uio_resid += iv->iov_len ; iv++; } if(pq->pdu.ds_addr && pp->ds_len) { iv->iov_base = pp->ds_addr; iv->iov_len = pp->ds_len; while(iv->iov_len & 03) // the specs say it must be int aligned iv->iov_len++; uio->uio_resid += iv->iov_len ; iv++; if(ISOK2DIG(sp->dataDigest, pp)) { pp->ds_dig = sp->dataDigest(pp->ds, pp->ds_len, 0); iv->iov_base = &pp->ds_dig; iv->iov_len = sizeof(pp->ds_dig); uio->uio_resid += iv->iov_len ; iv++; } } uio->uio_iovcnt = iv - pq->iov; sdebug(4, "pq->len=%d uio->uio_resid=%d uio->uio_iovcnt=%d", pq->len, uio->uio_resid, uio->uio_iovcnt); sdebug(4, "opcode=%x iovcnt=%d uio_resid=%d itt=%x", pp->ipdu.bhs.opcode, uio->uio_iovcnt, uio->uio_resid, ntohl(pp->ipdu.bhs.itt)); sdebug(5, "sp=%p sp->soc=%p uio=%p sp->td=%p", sp, sp->soc, uio, sp->td); do { len = uio->uio_resid; error = sosend(sp->soc, NULL, uio, 0, 0, 0, sp->td); if(uio->uio_resid == 0 || error || len == uio->uio_resid) { if(uio->uio_resid) { sdebug(2, "uio->uio_resid=%d uio->uio_iovcnt=%d error=%d len=%d", uio->uio_resid, uio->uio_iovcnt, error, len); if(error == 0) error = EAGAIN; // 35 } break; } /* | XXX: untested code */ sdebug(1, "uio->uio_resid=%d uio->uio_iovcnt=%d", uio->uio_resid, uio->uio_iovcnt); iv = uio->uio_iov; len -= uio->uio_resid; while(uio->uio_iovcnt > 0) { if(iv->iov_len > len) { caddr_t bp = (caddr_t)iv->iov_base; iv->iov_len -= len; iv->iov_base = (void *)&bp[len]; break; } len -= iv->iov_len; uio->uio_iovcnt--; uio->uio_iov++; iv++; } } while(uio->uio_resid); if(error == 0) { sp->stats.nsent++; getbintime(&sp->stats.t_sent); } return error; } #endif /* USE_MBUF */ /* | wait till a PDU header is received | from the socket. */ /* The format of the BHS is: Byte/ 0 | 1 | 2 | 3 | / | | | | |0 1 2 3 4 5 6 7|0 1 2 3 4 5 6 7|0 1 2 3 4 5 6 7|0 1 2 3 4 5 6 7| +---------------+---------------+---------------+---------------+ 0|.|I| Opcode |F| Opcode-specific fields | +---------------+---------------+---------------+---------------+ 4|TotalAHSLength | DataSegmentLength | +---------------+---------------+---------------+---------------+ 8| LUN or Opcode-specific fields | + + 12| | +---------------+---------------+---------------+---------------+ 16| Initiator Task Tag | +---------------+---------------+---------------+---------------+ 20/ Opcode-specific fields / +/ / +---------------+---------------+---------------+---------------+ 48 */ static __inline int so_getbhs(isc_session_t *sp) { bhs_t *bhs = &sp->bhs; struct uio *uio = &sp->uio; struct iovec *iov = &sp->iov; int error, flags; debug_called(8); iov->iov_base = bhs; iov->iov_len = sizeof(bhs_t); uio->uio_iov = iov; uio->uio_iovcnt = 1; uio->uio_rw = UIO_READ; uio->uio_segflg = UIO_SYSSPACE; uio->uio_td = curthread; // why ... uio->uio_resid = sizeof(bhs_t); flags = MSG_WAITALL; error = soreceive(sp->soc, NULL, uio, 0, 0, &flags); if(error) debug(2, #if __FreeBSD_version > 800000 "error=%d so_error=%d uio->uio_resid=%zd iov.iov_len=%zd", #else "error=%d so_error=%d uio->uio_resid=%d iov.iov_len=%zd", #endif error, sp->soc->so_error, uio->uio_resid, iov->iov_len); if(!error && (uio->uio_resid > 0)) { error = EPIPE; // was EAGAIN debug(2, #if __FreeBSD_version > 800000 "error=%d so_error=%d uio->uio_resid=%zd iov.iov_len=%zd so_state=%x", #else "error=%d so_error=%d uio->uio_resid=%d iov.iov_len=%zd so_state=%x", #endif error, sp->soc->so_error, uio->uio_resid, iov->iov_len, sp->soc->so_state); } return error; } /* | so_recv gets called when | an iSCSI header has been received. | Note: the designers had no intentions | in making programmer's life easy. */ static int so_recv(isc_session_t *sp, pduq_t *pq) { sn_t *sn = &sp->sn; struct uio *uio = &pq->uio; pdu_t *pp = &pq->pdu; bhs_t *bhs = &pp->ipdu.bhs; struct iovec *iov = pq->iov; int error; u_int len; u_int max, exp; int flags = MSG_WAITALL; debug_called(8); /* | now calculate how much data should be in the buffer */ uio->uio_iov = iov; uio->uio_iovcnt = 0; len = 0; if(bhs->AHSLength) { debug(2, "bhs->AHSLength=%d", bhs->AHSLength); pp->ahs_len = bhs->AHSLength * 4; len += pp->ahs_len; pp->ahs_addr = malloc(pp->ahs_len, M_TEMP, M_WAITOK); // XXX: could get stuck here iov->iov_base = pp->ahs_addr; iov->iov_len = pp->ahs_len; uio->uio_iovcnt++; iov++; } if(ISOK2DIG(sp->hdrDigest, pp)) { len += sizeof(pp->hdr_dig); iov->iov_base = &pp->hdr_dig; iov->iov_len = sizeof(pp->hdr_dig); uio->uio_iovcnt++; } if(len) { uio->uio_rw = UIO_READ; uio->uio_segflg = UIO_SYSSPACE; uio->uio_resid = len; uio->uio_td = sp->td; // why ... error = soreceive(sp->soc, NULL, uio, NULL, NULL, &flags); //if(error == EAGAIN) // XXX: this needs work! it hangs iscontrol if(error || uio->uio_resid) { debug(2, #if __FreeBSD_version > 800000 "len=%d error=%d uio->uio_resid=%zd", #else "len=%d error=%d uio->uio_resid=%d", #endif len, error, uio->uio_resid); goto out; } if(ISOK2DIG(sp->hdrDigest, pp)) { bhs_t *bhs; u_int digest; bhs = (bhs_t *)&pp->ipdu; digest = sp->hdrDigest(bhs, sizeof(bhs_t), 0); if(pp->ahs_len) digest = sp->hdrDigest(pp->ahs_addr, pp->ahs_len, digest); if(pp->hdr_dig != digest) { debug(2, "bad header digest: received=%x calculated=%x", pp->hdr_dig, digest); // XXX: now what? error = EIO; goto out; } } if(pp->ahs_len) { debug(2, "ahs len=%x type=%x spec=%x", pp->ahs_addr->len, pp->ahs_addr->type, pp->ahs_addr->spec); // XXX: till I figure out what to do with this free(pp->ahs_addr, M_TEMP); } pq->len += len; // XXX: who needs this? bzero(uio, sizeof(struct uio)); len = 0; } if(bhs->DSLength) { len = bhs->DSLength; #if BYTE_ORDER == LITTLE_ENDIAN len = ((len & 0x00ff0000) >> 16) | (len & 0x0000ff00) | ((len & 0x000000ff) << 16); #endif pp->ds_len = len; if((sp->opt.maxRecvDataSegmentLength > 0) && (len > sp->opt.maxRecvDataSegmentLength)) { xdebug("impossible PDU length(%d) opt.maxRecvDataSegmentLength=%d", len, sp->opt.maxRecvDataSegmentLength); log(LOG_ERR, "so_recv: impossible PDU length(%d) from iSCSI %s/%s\n", len, sp->opt.targetAddress, sp->opt.targetName); /* | XXX: this will really screwup the stream. | should clear up the buffer till a valid header | is found, or just close connection ... | should read the RFC. */ error = E2BIG; goto out; } while(len & 03) len++; if(ISOK2DIG(sp->dataDigest, pp)) len += 4; uio->uio_resid = len; uio->uio_td = sp->td; // why ... pq->len += len; // XXX: do we need this? error = soreceive(sp->soc, NULL, uio, &pq->mp, NULL, &flags); //if(error == EAGAIN) // XXX: this needs work! it hangs iscontrol if(error || uio->uio_resid) goto out; if(ISOK2DIG(sp->dataDigest, pp)) { struct mbuf *m; u_int digest, ds_len, cnt; // get the received digest m_copydata(pq->mp, len - sizeof(pp->ds_dig), sizeof(pp->ds_dig), (caddr_t)&pp->ds_dig); // calculate all mbufs digest = 0; ds_len = len - sizeof(pp->ds_dig); for(m = pq->mp; m != NULL; m = m->m_next) { cnt = MIN(ds_len, m->m_len); digest = sp->dataDigest(mtod(m, char *), cnt, digest); ds_len -= cnt; if(ds_len == 0) break; } if(digest != pp->ds_dig) { sdebug(1, "bad data digest: received=%x calculated=%x", pp->ds_dig, digest); error = EIO; // XXX: find a better error goto out; } KASSERT(ds_len == 0, ("ds_len not zero")); } } sdebug(6, "len=%d] opcode=0x%x ahs_len=0x%x ds_len=0x%x", pq->len, bhs->opcode, pp->ahs_len, pp->ds_len); max = ntohl(bhs->MaxCmdSN); exp = ntohl(bhs->ExpStSN); if(max < exp - 1 && max > exp - _MAXINCR) { sdebug(2, "bad cmd window size"); error = EIO; // XXX: for now; goto out; // error } if(SNA_GT(max, sn->maxCmd)) sn->maxCmd = max; if(SNA_GT(exp, sn->expCmd)) sn->expCmd = exp; /* | remove from the holding queue packets | that have been acked and don't need | further processing. */ i_acked_hld(sp, NULL); sp->cws = sn->maxCmd - sn->expCmd + 1; return 0; out: // XXX: need some work here if(pp->ahs_len) { // XXX: till I figure out what to do with this free(pp->ahs_addr, M_TEMP); } xdebug("have a problem, error=%d", error); pdu_free(sp->isc, pq); if(!error && uio->uio_resid > 0) error = EPIPE; return error; } /* | wait for something to arrive. | and if the pdu is without errors, process it. */ static int so_input(isc_session_t *sp) { pduq_t *pq; int error; debug_called(8); /* | first read in the iSCSI header */ error = so_getbhs(sp); if(error == 0) { /* | now read the rest. */ pq = pdu_alloc(sp->isc, M_NOWAIT); if(pq == NULL) { // XXX: might cause a deadlock ... debug(2, "out of pdus, wait"); pq = pdu_alloc(sp->isc, M_WAITOK); // OK to WAIT } pq->pdu.ipdu.bhs = sp->bhs; pq->len = sizeof(bhs_t); // so far only the header was read error = so_recv(sp, pq); if(error != 0) { error += 0x800; // XXX: just to see the error. // terminal error // XXX: close connection and exit } else { sp->stats.nrecv++; getbintime(&sp->stats.t_recv); ism_recv(sp, pq); } } return error; } /* | one per active (connected) session. | this thread is responsible for reading | in packets from the target. */ static void isc_in(void *vp) { isc_session_t *sp = (isc_session_t *)vp; struct socket *so = sp->soc; int error; debug_called(8); sp->flags |= ISC_CON_RUNNING; error = 0; while((sp->flags & (ISC_CON_RUN | ISC_LINK_UP)) == (ISC_CON_RUN | ISC_LINK_UP)) { // XXX: hunting ... if(sp->soc == NULL || !(so->so_state & SS_ISCONNECTED)) { debug(2, "sp->soc=%p", sp->soc); break; } error = so_input(sp); if(error == 0) { mtx_lock(&sp->io_mtx); if(sp->flags & ISC_OWAITING) { wakeup(&sp->flags); } mtx_unlock(&sp->io_mtx); } else if(error == EPIPE) { break; } else if(error == EAGAIN) { if(so->so_state & SS_ISCONNECTED) // there seems to be a problem in 6.0 ... tsleep(sp, PRIBIO, "isc_soc", 2*hz); } } sdebug(2, "terminated, flags=%x so_count=%d so_state=%x error=%d proc=%p", sp->flags, so->so_count, so->so_state, error, sp->proc); if((sp->proc != NULL) && sp->signal) { PROC_LOCK(sp->proc); kern_psignal(sp->proc, sp->signal); PROC_UNLOCK(sp->proc); sp->flags |= ISC_SIGNALED; sdebug(2, "pid=%d signaled(%d)", sp->proc->p_pid, sp->signal); } else { // we have to do something ourselves // like closing this session ... } /* | we've been terminated */ // do we need this mutex ...? mtx_lock(&sp->io_mtx); sp->flags &= ~(ISC_CON_RUNNING | ISC_LINK_UP); wakeup(&sp->soc); mtx_unlock(&sp->io_mtx); sdebug(2, "dropped ISC_CON_RUNNING"); #if __FreeBSD_version >= 800000 kproc_exit(0); #else kthread_exit(0); #endif } void isc_stop_receiver(isc_session_t *sp) { int n; debug_called(8); sdebug(3, "sp=%p sp->soc=%p", sp, sp? sp->soc: 0); mtx_lock(&sp->io_mtx); sp->flags &= ~ISC_LINK_UP; msleep(&sp->soc, &sp->io_mtx, PRIBIO|PDROP, "isc_stpc", 5*hz); soshutdown(sp->soc, SHUT_RD); mtx_lock(&sp->io_mtx); sdebug(3, "soshutdown"); sp->flags &= ~ISC_CON_RUN; n = 2; while(n-- && (sp->flags & ISC_CON_RUNNING)) { sdebug(3, "waiting n=%d... flags=%x", n, sp->flags); msleep(&sp->soc, &sp->io_mtx, PRIBIO, "isc_stpc", 5*hz); } mtx_unlock(&sp->io_mtx); if(sp->fp != NULL) fdrop(sp->fp, sp->td); - fputsock(sp->soc); sp->soc = NULL; sp->fp = NULL; sdebug(3, "done"); } void isc_start_receiver(isc_session_t *sp) { debug_called(8); sp->flags |= ISC_CON_RUN | ISC_LINK_UP; #if __FreeBSD_version >= 800000 kproc_create #else kthread_create #endif (isc_in, sp, &sp->soc_proc, 0, 0, "isc_in %d", sp->sid); } Index: stable/11/sys/dev/iscsi_initiator/iscsi.c =================================================================== --- stable/11/sys/dev/iscsi_initiator/iscsi.c (revision 315311) +++ stable/11/sys/dev/iscsi_initiator/iscsi.c (revision 315312) @@ -1,882 +1,876 @@ /*- * Copyright (c) 2005-2011 Daniel Braniss * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ /* | $Id: iscsi.c 752 2009-08-20 11:23:28Z danny $ */ #include __FBSDID("$FreeBSD$"); #include "opt_iscsi_initiator.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static char *iscsi_driver_version = "2.3.1"; static struct isc_softc *isc; MALLOC_DEFINE(M_ISCSI, "iSCSI", "iSCSI driver"); MALLOC_DEFINE(M_ISCSIBUF, "iSCbuf", "iSCSI buffers"); static MALLOC_DEFINE(M_TMP, "iSCtmp", "iSCSI tmp"); #ifdef ISCSI_INITIATOR_DEBUG int iscsi_debug = ISCSI_INITIATOR_DEBUG; SYSCTL_INT(_debug, OID_AUTO, iscsi_initiator, CTLFLAG_RW, &iscsi_debug, 0, "iSCSI driver debug flag"); struct mtx iscsi_dbg_mtx; #endif static int max_sessions = MAX_SESSIONS; SYSCTL_INT(_net, OID_AUTO, iscsi_initiator_max_sessions, CTLFLAG_RDTUN, &max_sessions, 0, "Max sessions allowed"); static int max_pdus = MAX_PDUS; SYSCTL_INT(_net, OID_AUTO, iscsi_initiator_max_pdus, CTLFLAG_RDTUN, &max_pdus, 0, "Max PDU pool"); static char isid[6+1] = { 0x80, 'D', 'I', 'B', '0', '0', 0 }; static int i_create_session(struct cdev *dev, int *ndev); static int i_ping(struct cdev *dev); static int i_send(struct cdev *dev, caddr_t arg, struct thread *td); static int i_recv(struct cdev *dev, caddr_t arg, struct thread *td); static int i_setsoc(isc_session_t *sp, int fd, struct thread *td); static int i_fullfeature(struct cdev *dev, int flag); static d_open_t iscsi_open; static d_close_t iscsi_close; static d_ioctl_t iscsi_ioctl; #ifdef ISCSI_INITIATOR_DEBUG static d_read_t iscsi_read; #endif static struct cdevsw iscsi_cdevsw = { .d_version = D_VERSION, .d_open = iscsi_open, .d_close = iscsi_close, .d_ioctl = iscsi_ioctl, #ifdef ISCSI_INITIATOR_DEBUG .d_read = iscsi_read, #endif .d_name = "iSCSI", }; static int iscsi_open(struct cdev *dev, int flags, int otype, struct thread *td) { debug_called(8); debug(7, "dev=%d", dev2unit(dev)); if(dev2unit(dev) > max_sessions) { // should not happen return ENODEV; } return 0; } static int iscsi_close(struct cdev *dev, int flag, int otyp, struct thread *td) { isc_session_t *sp; debug_called(8); debug(3, "session=%d flag=%x", dev2unit(dev), flag); if(dev2unit(dev) == max_sessions) { return 0; } sp = dev->si_drv2; if(sp != NULL) { sdebug(3, "sp->flags=%x", sp->flags ); /* | if still in full phase, this probably means | that something went really bad. | it could be a result from 'shutdown', in which case | we will ignore it (so buffers can be flushed). | the problem is that there is no way of differentiating | between a shutdown procedure and 'iscontrol' dying. */ if(sp->flags & ISC_FFPHASE) // delay in case this is a shutdown. tsleep(sp, PRIBIO, "isc-cls", 60*hz); ism_stop(sp); } debug(2, "done"); return 0; } static int iscsi_ioctl(struct cdev *dev, u_long cmd, caddr_t arg, int mode, struct thread *td) { struct isc_softc *sc; isc_session_t *sp; isc_opt_t *opt; int error; debug_called(8); error = 0; if(dev2unit(dev) == max_sessions) { /* | non Session commands */ sc = dev->si_drv1; if(sc == NULL) return ENXIO; switch(cmd) { case ISCSISETSES: error = i_create_session(dev, (int *)arg); if(error == 0) break; default: error = ENXIO; } return error; } /* | session commands */ sp = dev->si_drv2; if(sp == NULL) return ENXIO; sdebug(6, "dev=%d cmd=%d", dev2unit(dev), (int)(cmd & 0xff)); switch(cmd) { case ISCSISETSOC: error = i_setsoc(sp, *(u_int *)arg, td); break; case ISCSISETOPT: opt = (isc_opt_t *)arg; error = i_setopt(sp, opt); break; case ISCSISEND: error = i_send(dev, arg, td); break; case ISCSIRECV: error = i_recv(dev, arg, td); break; case ISCSIPING: error = i_ping(dev); break; case ISCSISTART: error = sp->soc == NULL? ENOTCONN: i_fullfeature(dev, 1); if(error == 0) { sp->proc = td->td_proc; SYSCTL_ADD_INT(&sp->clist, SYSCTL_CHILDREN(sp->oid), OID_AUTO, "pid", CTLFLAG_RD, &sp->proc->p_pid, sizeof(pid_t), "control process id"); } break; case ISCSIRESTART: error = sp->soc == NULL? ENOTCONN: i_fullfeature(dev, 2); break; case ISCSISTOP: error = i_fullfeature(dev, 0); break; case ISCSISIGNAL: { int sig = *(int *)arg; if(sig < 0 || sig > _SIG_MAXSIG) error = EINVAL; else sp->signal = sig; break; } case ISCSIGETCAM: { iscsi_cam_t *cp = (iscsi_cam_t *)arg; error = ic_getCamVals(sp, cp); break; } default: error = ENOIOCTL; } return error; } static int iscsi_read(struct cdev *dev, struct uio *uio, int ioflag) { #ifdef ISCSI_INITIATOR_DEBUG struct isc_softc *sc; isc_session_t *sp; pduq_t *pq; char buf[1024]; sc = dev->si_drv1; sp = dev->si_drv2; if(dev2unit(dev) == max_sessions) { sprintf(buf, "/----- Session ------/\n"); uiomove(buf, strlen(buf), uio); int i = 0; TAILQ_FOREACH(sp, &sc->isc_sess, sp_link) { if(uio->uio_resid == 0) return 0; sprintf(buf, "%03d] '%s' '%s'\n", i++, sp->opt.targetAddress, sp->opt.targetName); uiomove(buf, strlen(buf), uio); } sprintf(buf, "free npdu_alloc=%d, npdu_max=%d\n", sc->npdu_alloc, sc->npdu_max); uiomove(buf, strlen(buf), uio); } else { int i = 0; struct socket *so = sp->soc; #define pukeit(i, pq) do {\ sprintf(buf, "%03d] %06x %02x %06x %06x %jd\n",\ i, ntohl(pq->pdu.ipdu.bhs.CmdSN),\ pq->pdu.ipdu.bhs.opcode, ntohl(pq->pdu.ipdu.bhs.itt),\ ntohl(pq->pdu.ipdu.bhs.ExpStSN),\ (intmax_t)pq->ts.sec);\ } while(0) sprintf(buf, "%d/%d /---- hld -----/\n", sp->stats.nhld, sp->stats.max_hld); uiomove(buf, strlen(buf), uio); TAILQ_FOREACH(pq, &sp->hld, pq_link) { if(uio->uio_resid == 0) return 0; pukeit(i, pq); i++; uiomove(buf, strlen(buf), uio); } sprintf(buf, "%d/%d /---- rsp -----/\n", sp->stats.nrsp, sp->stats.max_rsp); uiomove(buf, strlen(buf), uio); i = 0; TAILQ_FOREACH(pq, &sp->rsp, pq_link) { if(uio->uio_resid == 0) return 0; pukeit(i, pq); i++; uiomove(buf, strlen(buf), uio); } sprintf(buf, "%d/%d /---- csnd -----/\n", sp->stats.ncsnd, sp->stats.max_csnd); i = 0; uiomove(buf, strlen(buf), uio); TAILQ_FOREACH(pq, &sp->csnd, pq_link) { if(uio->uio_resid == 0) return 0; pukeit(i, pq); i++; uiomove(buf, strlen(buf), uio); } sprintf(buf, "%d/%d /---- wsnd -----/\n", sp->stats.nwsnd, sp->stats.max_wsnd); i = 0; uiomove(buf, strlen(buf), uio); TAILQ_FOREACH(pq, &sp->wsnd, pq_link) { if(uio->uio_resid == 0) return 0; pukeit(i, pq); i++; uiomove(buf, strlen(buf), uio); } sprintf(buf, "%d/%d /---- isnd -----/\n", sp->stats.nisnd, sp->stats.max_isnd); i = 0; uiomove(buf, strlen(buf), uio); TAILQ_FOREACH(pq, &sp->isnd, pq_link) { if(uio->uio_resid == 0) return 0; pukeit(i, pq); i++; uiomove(buf, strlen(buf), uio); } sprintf(buf, "/---- Stats ---/\n"); uiomove(buf, strlen(buf), uio); sprintf(buf, "recv=%d sent=%d\n", sp->stats.nrecv, sp->stats.nsent); uiomove(buf, strlen(buf), uio); sprintf(buf, "flags=%x pdus: alloc=%d max=%d\n", sp->flags, sc->npdu_alloc, sc->npdu_max); uiomove(buf, strlen(buf), uio); sprintf(buf, "cws=%d last cmd=%x exp=%x max=%x stat=%x itt=%x\n", sp->cws, sp->sn.cmd, sp->sn.expCmd, sp->sn.maxCmd, sp->sn.stat, sp->sn.itt); uiomove(buf, strlen(buf), uio); sprintf(buf, "/---- socket -----/\nso_count=%d so_state=%x\n", so->so_count, so->so_state); uiomove(buf, strlen(buf), uio); } #endif return 0; } static int i_ping(struct cdev *dev) { return 0; } /* | low level I/O */ static int i_setsoc(isc_session_t *sp, int fd, struct thread *td) { cap_rights_t rights; int error = 0; if(sp->soc != NULL) isc_stop_receiver(sp); - error = fget(td, fd, cap_rights_init(&rights, CAP_SOCK_CLIENT), &sp->fp); + error = getsock_cap(td, fd, cap_rights_init(&rights, CAP_SOCK_CLIENT), + &sp->fp, NULL, NULL); if(error) return error; - error = fgetsock(td, fd, cap_rights_init(&rights, CAP_SOCK_CLIENT), - &sp->soc, 0); - if(error == 0) { - sp->td = td; - isc_start_receiver(sp); - } - else { - fdrop(sp->fp, td); - sp->fp = NULL; - } + sp->soc = sp->fp->f_data; + sp->td = td; + isc_start_receiver(sp); return error; } static int i_send(struct cdev *dev, caddr_t arg, struct thread *td) { isc_session_t *sp = dev->si_drv2; caddr_t bp; pduq_t *pq; pdu_t *pp; int n, error; debug_called(8); if(sp->soc == NULL) return ENOTCONN; if((pq = pdu_alloc(sp->isc, M_NOWAIT)) == NULL) return EAGAIN; pp = &pq->pdu; pq->pdu = *(pdu_t *)arg; if((error = i_prepPDU(sp, pq)) != 0) goto out; bp = NULL; if((pq->len - sizeof(union ipdu_u)) > 0) { pq->buf = bp = malloc(pq->len - sizeof(union ipdu_u), M_ISCSIBUF, M_NOWAIT); if(pq->buf == NULL) { error = EAGAIN; goto out; } } else pq->buf = NULL; // just in case? sdebug(2, "len=%d ahs_len=%d ds_len=%d buf=%zu@%p", pq->len, pp->ahs_len, pp->ds_len, pq->len - sizeof(union ipdu_u), bp); if(pp->ahs_len) { // XXX: never tested, looks suspicious n = pp->ahs_len; error = copyin(pp->ahs_addr, bp, n); if(error != 0) { sdebug(3, "copyin ahs: error=%d", error); goto out; } pp->ahs_addr = (ahs_t *)bp; bp += n; } if(pp->ds_len) { n = pp->ds_len; error = copyin(pp->ds_addr, bp, n); if(error != 0) { sdebug(3, "copyin ds: error=%d", error); goto out; } pp->ds_addr = bp; bp += n; while(n & 03) { n++; *bp++ = 0; } } error = isc_qout(sp, pq); if(error == 0) wakeup(&sp->flags); // XXX: to 'push' proc_out ... out: if(error) pdu_free(sp->isc, pq); return error; } static int i_recv(struct cdev *dev, caddr_t arg, struct thread *td) { isc_session_t *sp = dev->si_drv2; pduq_t *pq; pdu_t *pp, *up; caddr_t bp; int error, mustfree, cnt; size_t need, have, n; debug_called(8); if(sp == NULL) return EIO; if(sp->soc == NULL) return ENOTCONN; cnt = 6; // XXX: maybe the user can request a time out? mtx_lock(&sp->rsp_mtx); while((pq = TAILQ_FIRST(&sp->rsp)) == NULL) { msleep(&sp->rsp, &sp->rsp_mtx, PRIBIO, "isc_rsp", hz*10); if(cnt-- == 0) break; // XXX: for now, needs work } if(pq != NULL) { sp->stats.nrsp--; TAILQ_REMOVE(&sp->rsp, pq, pq_link); } mtx_unlock(&sp->rsp_mtx); sdebug(6, "cnt=%d", cnt); if(pq == NULL) { error = ENOTCONN; sdebug(3, "error=%d sp->flags=%x ", error, sp->flags); return error; } up = (pdu_t *)arg; pp = &pq->pdu; up->ipdu = pp->ipdu; n = 0; up->ds_len = 0; up->ahs_len = 0; error = 0; if(pq->mp) { u_int len; // Grr... len = 0; if(pp->ahs_len) { len += pp->ahs_len; } if(pp->ds_len) { len += pp->ds_len; } mustfree = 0; if(len > pq->mp->m_len) { mustfree++; bp = malloc(len, M_TMP, M_WAITOK); sdebug(4, "need mbufcopy: %d", len); i_mbufcopy(pq->mp, bp, len); } else bp = mtod(pq->mp, caddr_t); if(pp->ahs_len) { need = pp->ahs_len; n = MIN(up->ahs_size, need); error = copyout(bp, (caddr_t)up->ahs_addr, n); up->ahs_len = n; bp += need; } if(!error && pp->ds_len) { need = pp->ds_len; if((have = up->ds_size) == 0) { have = up->ahs_size - n; up->ds_addr = (caddr_t)up->ahs_addr + n; } n = MIN(have, need); error = copyout(bp, (caddr_t)up->ds_addr, n); up->ds_len = n; } if(mustfree) free(bp, M_TMP); } sdebug(6, "len=%d ahs_len=%d ds_len=%d", pq->len, pp->ahs_len, pp->ds_len); pdu_free(sp->isc, pq); return error; } static int i_fullfeature(struct cdev *dev, int flag) { isc_session_t *sp = dev->si_drv2; int error; sdebug(2, "flag=%d", flag); error = 0; switch(flag) { case 0: // stop sp->flags &= ~ISC_FFPHASE; break; case 1: // start sp->flags |= ISC_FFPHASE; error = ic_init(sp); break; case 2: // restart sp->flags |= ISC_FFPHASE; ism_restart(sp); break; } return error; } static int i_create_session(struct cdev *dev, int *ndev) { struct isc_softc *sc = dev->si_drv1; isc_session_t *sp; int error, n; debug_called(8); sp = malloc(sizeof(isc_session_t), M_ISCSI, M_WAITOK | M_ZERO); if(sp == NULL) return ENOMEM; sx_xlock(&sc->unit_sx); if((n = alloc_unr(sc->unit)) < 0) { sx_unlock(&sc->unit_sx); free(sp, M_ISCSI); xdebug("too many sessions!"); return EPERM; } sx_unlock(&sc->unit_sx); mtx_lock(&sc->isc_mtx); TAILQ_INSERT_TAIL(&sc->isc_sess, sp, sp_link); isc->nsess++; mtx_unlock(&sc->isc_mtx); sp->dev = make_dev(&iscsi_cdevsw, n, UID_ROOT, GID_WHEEL, 0600, "iscsi%d", n); *ndev = sp->sid = n; sp->isc = sc; sp->dev->si_drv1 = sc; sp->dev->si_drv2 = sp; sp->opt.maxRecvDataSegmentLength = 8192; sp->opt.maxXmitDataSegmentLength = 8192; sp->opt.maxBurstLength = 65536; // 64k sp->opt.maxluns = ISCSI_MAX_LUNS; error = ism_start(sp); return error; } #ifdef notused static void iscsi_counters(isc_session_t *sp) { int h, r, s; pduq_t *pq; #define _puke(i, pq) do {\ debug(2, "%03d] %06x %02x %x %ld %jd %x\n",\ i, ntohl( pq->pdu.ipdu.bhs.CmdSN), \ pq->pdu.ipdu.bhs.opcode, ntohl(pq->pdu.ipdu.bhs.itt),\ (long)pq->ts.sec, pq->ts.frac, pq->flags);\ } while(0) h = r = s = 0; TAILQ_FOREACH(pq, &sp->hld, pq_link) { _puke(h, pq); h++; } TAILQ_FOREACH(pq, &sp->rsp, pq_link) r++; TAILQ_FOREACH(pq, &sp->csnd, pq_link) s++; TAILQ_FOREACH(pq, &sp->wsnd, pq_link) s++; TAILQ_FOREACH(pq, &sp->isnd, pq_link) s++; debug(2, "hld=%d rsp=%d snd=%d", h, r, s); } #endif static void iscsi_shutdown(void *v) { struct isc_softc *sc = v; isc_session_t *sp; int n; debug_called(8); if(sc == NULL) { xdebug("sc is NULL!"); return; } #ifdef DO_EVENTHANDLER if(sc->eh == NULL) debug(2, "sc->eh is NULL"); else { EVENTHANDLER_DEREGISTER(shutdown_pre_sync, sc->eh); debug(2, "done n=%d", sc->nsess); } #endif n = 0; TAILQ_FOREACH(sp, &sc->isc_sess, sp_link) { debug(2, "%2d] sp->flags=0x%08x", n, sp->flags); n++; } debug(2, "done"); } static void free_pdus(struct isc_softc *sc) { debug_called(8); if(sc->pdu_zone != NULL) { uma_zdestroy(sc->pdu_zone); sc->pdu_zone = NULL; } } static int iscsi_start(void) { debug_called(8); isc = malloc(sizeof(struct isc_softc), M_ISCSI, M_ZERO|M_WAITOK); mtx_init(&isc->isc_mtx, "iscsi-isc", NULL, MTX_DEF); TAILQ_INIT(&isc->isc_sess); /* | now init the free pdu list */ isc->pdu_zone = uma_zcreate("pdu", sizeof(pduq_t), NULL, NULL, NULL, NULL, 0, 0); uma_zone_set_max(isc->pdu_zone, max_pdus); isc->unit = new_unrhdr(0, max_sessions-1, NULL); sx_init(&isc->unit_sx, "iscsi sx"); #ifdef DO_EVENTHANDLER if((isc->eh = EVENTHANDLER_REGISTER(shutdown_pre_sync, iscsi_shutdown, sc, SHUTDOWN_PRI_DEFAULT-1)) == NULL) xdebug("shutdown event registration failed\n"); #endif /* | sysctl stuff */ sysctl_ctx_init(&isc->clist); isc->oid = SYSCTL_ADD_NODE(&isc->clist, SYSCTL_STATIC_CHILDREN(_net), OID_AUTO, "iscsi_initiator", CTLFLAG_RD, 0, "iSCSI Subsystem"); SYSCTL_ADD_STRING(&isc->clist, SYSCTL_CHILDREN(isc->oid), OID_AUTO, "driver_version", CTLFLAG_RD, iscsi_driver_version, 0, "iscsi driver version"); SYSCTL_ADD_STRING(&isc->clist, SYSCTL_CHILDREN(isc->oid), OID_AUTO, "isid", CTLFLAG_RW, isid, 6+1, "initiator part of the Session Identifier"); SYSCTL_ADD_INT(&isc->clist, SYSCTL_CHILDREN(isc->oid), OID_AUTO, "sessions", CTLFLAG_RD, &isc->nsess, sizeof(isc->nsess), "number of active session"); #ifdef ISCSI_INITIATOR_DEBUG mtx_init(&iscsi_dbg_mtx, "iscsi_dbg", NULL, MTX_DEF); #endif isc->dev = make_dev_credf(MAKEDEV_CHECKNAME, &iscsi_cdevsw, max_sessions, NULL, UID_ROOT, GID_WHEEL, 0600, "iscsi"); if (isc->dev == NULL) { xdebug("iscsi_initiator: make_dev_credf failed"); return (EEXIST); } isc->dev->si_drv1 = isc; printf("iscsi: version %s\n", iscsi_driver_version); return (0); } /* | Notes: | unload SHOULD fail if there is activity | activity: there is/are active session/s */ static void iscsi_stop(void) { isc_session_t *sp, *sp_tmp; debug_called(8); /* | go through all the sessions | Note: close should have done this ... */ TAILQ_FOREACH_SAFE(sp, &isc->isc_sess, sp_link, sp_tmp) { //XXX: check for activity ... ism_stop(sp); } mtx_destroy(&isc->isc_mtx); sx_destroy(&isc->unit_sx); free_pdus(isc); if(isc->dev) destroy_dev(isc->dev); if(sysctl_ctx_free(&isc->clist)) xdebug("sysctl_ctx_free failed"); iscsi_shutdown(isc); // XXX: check EVENTHANDLER_ ... #ifdef ISCSI_INITIATOR_DEBUG mtx_destroy(&iscsi_dbg_mtx); #endif free(isc, M_ISCSI); } static int iscsi_modevent(module_t mod, int what, void *arg) { int error = 0; debug_called(8); switch(what) { case MOD_LOAD: error = iscsi_start(); break; case MOD_QUIESCE: if(isc->nsess) { xdebug("iscsi module busy(nsess=%d), cannot unload", isc->nsess); log(LOG_ERR, "iscsi module busy, cannot unload"); } return isc->nsess; case MOD_SHUTDOWN: break; case MOD_UNLOAD: iscsi_stop(); break; default: break; } return (error); } moduledata_t iscsi_mod = { "iscsi_initiator", (modeventhand_t) iscsi_modevent, 0 }; #ifdef ISCSI_ROOT static void iscsi_rootconf(void) { #if 0 nfs_setup_diskless(); if (nfs_diskless_valid) rootdevnames[0] = "nfs:"; #endif printf("** iscsi_rootconf **\n"); } SYSINIT(cpu_rootconf1, SI_SUB_ROOT_CONF, SI_ORDER_FIRST, iscsi_rootconf, NULL) #endif DECLARE_MODULE(iscsi_initiator, iscsi_mod, SI_SUB_DRIVERS, SI_ORDER_MIDDLE); MODULE_DEPEND(iscsi_initiator, cam, 1, 1, 1); Index: stable/11/sys/kern/kern_descrip.c =================================================================== --- stable/11/sys/kern/kern_descrip.c (revision 315311) +++ stable/11/sys/kern/kern_descrip.c (revision 315312) @@ -1,4108 +1,4127 @@ /*- * Copyright (c) 1982, 1986, 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_descrip.c 8.6 (Berkeley) 4/19/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_capsicum.h" #include "opt_compat.h" #include "opt_ddb.h" #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include -#include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #include #include #include #include #include static MALLOC_DEFINE(M_FILEDESC, "filedesc", "Open file descriptor table"); static MALLOC_DEFINE(M_FILEDESC_TO_LEADER, "filedesc_to_leader", "file desc to leader structures"); static MALLOC_DEFINE(M_SIGIO, "sigio", "sigio structures"); MALLOC_DEFINE(M_FILECAPS, "filecaps", "descriptor capabilities"); MALLOC_DECLARE(M_FADVISE); static uma_zone_t file_zone; static uma_zone_t filedesc0_zone; static int closefp(struct filedesc *fdp, int fd, struct file *fp, struct thread *td, int holdleaders); static int fd_first_free(struct filedesc *fdp, int low, int size); static int fd_last_used(struct filedesc *fdp, int size); static void fdgrowtable(struct filedesc *fdp, int nfd); static void fdgrowtable_exp(struct filedesc *fdp, int nfd); static void fdunused(struct filedesc *fdp, int fd); static void fdused(struct filedesc *fdp, int fd); static int getmaxfd(struct thread *td); /* * Each process has: * * - An array of open file descriptors (fd_ofiles) * - An array of file flags (fd_ofileflags) * - A bitmap recording which descriptors are in use (fd_map) * * A process starts out with NDFILE descriptors. The value of NDFILE has * been selected based the historical limit of 20 open files, and an * assumption that the majority of processes, especially short-lived * processes like shells, will never need more. * * If this initial allocation is exhausted, a larger descriptor table and * map are allocated dynamically, and the pointers in the process's struct * filedesc are updated to point to those. This is repeated every time * the process runs out of file descriptors (provided it hasn't hit its * resource limit). * * Since threads may hold references to individual descriptor table * entries, the tables are never freed. Instead, they are placed on a * linked list and freed only when the struct filedesc is released. */ #define NDFILE 20 #define NDSLOTSIZE sizeof(NDSLOTTYPE) #define NDENTRIES (NDSLOTSIZE * __CHAR_BIT) #define NDSLOT(x) ((x) / NDENTRIES) #define NDBIT(x) ((NDSLOTTYPE)1 << ((x) % NDENTRIES)) #define NDSLOTS(x) (((x) + NDENTRIES - 1) / NDENTRIES) /* * SLIST entry used to keep track of ofiles which must be reclaimed when * the process exits. */ struct freetable { struct fdescenttbl *ft_table; SLIST_ENTRY(freetable) ft_next; }; /* * Initial allocation: a filedesc structure + the head of SLIST used to * keep track of old ofiles + enough space for NDFILE descriptors. */ struct fdescenttbl0 { int fdt_nfiles; struct filedescent fdt_ofiles[NDFILE]; }; struct filedesc0 { struct filedesc fd_fd; SLIST_HEAD(, freetable) fd_free; struct fdescenttbl0 fd_dfiles; NDSLOTTYPE fd_dmap[NDSLOTS(NDFILE)]; }; /* * Descriptor management. */ volatile int openfiles; /* actual number of open files */ struct mtx sigio_lock; /* mtx to protect pointers to sigio */ void (*mq_fdclose)(struct thread *td, int fd, struct file *fp); /* * If low >= size, just return low. Otherwise find the first zero bit in the * given bitmap, starting at low and not exceeding size - 1. Return size if * not found. */ static int fd_first_free(struct filedesc *fdp, int low, int size) { NDSLOTTYPE *map = fdp->fd_map; NDSLOTTYPE mask; int off, maxoff; if (low >= size) return (low); off = NDSLOT(low); if (low % NDENTRIES) { mask = ~(~(NDSLOTTYPE)0 >> (NDENTRIES - (low % NDENTRIES))); if ((mask &= ~map[off]) != 0UL) return (off * NDENTRIES + ffsl(mask) - 1); ++off; } for (maxoff = NDSLOTS(size); off < maxoff; ++off) if (map[off] != ~0UL) return (off * NDENTRIES + ffsl(~map[off]) - 1); return (size); } /* * Find the highest non-zero bit in the given bitmap, starting at 0 and * not exceeding size - 1. Return -1 if not found. */ static int fd_last_used(struct filedesc *fdp, int size) { NDSLOTTYPE *map = fdp->fd_map; NDSLOTTYPE mask; int off, minoff; off = NDSLOT(size); if (size % NDENTRIES) { mask = ~(~(NDSLOTTYPE)0 << (size % NDENTRIES)); if ((mask &= map[off]) != 0) return (off * NDENTRIES + flsl(mask) - 1); --off; } for (minoff = NDSLOT(0); off >= minoff; --off) if (map[off] != 0) return (off * NDENTRIES + flsl(map[off]) - 1); return (-1); } static int fdisused(struct filedesc *fdp, int fd) { KASSERT(fd >= 0 && fd < fdp->fd_nfiles, ("file descriptor %d out of range (0, %d)", fd, fdp->fd_nfiles)); return ((fdp->fd_map[NDSLOT(fd)] & NDBIT(fd)) != 0); } /* * Mark a file descriptor as used. */ static void fdused_init(struct filedesc *fdp, int fd) { KASSERT(!fdisused(fdp, fd), ("fd=%d is already used", fd)); fdp->fd_map[NDSLOT(fd)] |= NDBIT(fd); } static void fdused(struct filedesc *fdp, int fd) { FILEDESC_XLOCK_ASSERT(fdp); fdused_init(fdp, fd); if (fd > fdp->fd_lastfile) fdp->fd_lastfile = fd; if (fd == fdp->fd_freefile) fdp->fd_freefile = fd_first_free(fdp, fd, fdp->fd_nfiles); } /* * Mark a file descriptor as unused. */ static void fdunused(struct filedesc *fdp, int fd) { FILEDESC_XLOCK_ASSERT(fdp); KASSERT(fdisused(fdp, fd), ("fd=%d is already unused", fd)); KASSERT(fdp->fd_ofiles[fd].fde_file == NULL, ("fd=%d is still in use", fd)); fdp->fd_map[NDSLOT(fd)] &= ~NDBIT(fd); if (fd < fdp->fd_freefile) fdp->fd_freefile = fd; if (fd == fdp->fd_lastfile) fdp->fd_lastfile = fd_last_used(fdp, fd); } /* * Free a file descriptor. * * Avoid some work if fdp is about to be destroyed. */ static inline void fdefree_last(struct filedescent *fde) { filecaps_free(&fde->fde_caps); } static inline void fdfree(struct filedesc *fdp, int fd) { struct filedescent *fde; fde = &fdp->fd_ofiles[fd]; #ifdef CAPABILITIES seq_write_begin(&fde->fde_seq); #endif fdefree_last(fde); fde->fde_file = NULL; fdunused(fdp, fd); #ifdef CAPABILITIES seq_write_end(&fde->fde_seq); #endif } void pwd_ensure_dirs(void) { struct filedesc *fdp; fdp = curproc->p_fd; FILEDESC_XLOCK(fdp); if (fdp->fd_cdir == NULL) { fdp->fd_cdir = rootvnode; vrefact(rootvnode); } if (fdp->fd_rdir == NULL) { fdp->fd_rdir = rootvnode; vrefact(rootvnode); } FILEDESC_XUNLOCK(fdp); } /* * System calls on descriptors. */ #ifndef _SYS_SYSPROTO_H_ struct getdtablesize_args { int dummy; }; #endif /* ARGSUSED */ int sys_getdtablesize(struct thread *td, struct getdtablesize_args *uap) { #ifdef RACCT uint64_t lim; #endif td->td_retval[0] = min((int)lim_cur(td, RLIMIT_NOFILE), maxfilesperproc); #ifdef RACCT PROC_LOCK(td->td_proc); lim = racct_get_limit(td->td_proc, RACCT_NOFILE); PROC_UNLOCK(td->td_proc); if (lim < td->td_retval[0]) td->td_retval[0] = lim; #endif return (0); } /* * Duplicate a file descriptor to a particular value. * * Note: keep in mind that a potential race condition exists when closing * descriptors from a shared descriptor table (via rfork). */ #ifndef _SYS_SYSPROTO_H_ struct dup2_args { u_int from; u_int to; }; #endif /* ARGSUSED */ int sys_dup2(struct thread *td, struct dup2_args *uap) { return (kern_dup(td, FDDUP_FIXED, 0, (int)uap->from, (int)uap->to)); } /* * Duplicate a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct dup_args { u_int fd; }; #endif /* ARGSUSED */ int sys_dup(struct thread *td, struct dup_args *uap) { return (kern_dup(td, FDDUP_NORMAL, 0, (int)uap->fd, 0)); } /* * The file control system call. */ #ifndef _SYS_SYSPROTO_H_ struct fcntl_args { int fd; int cmd; long arg; }; #endif /* ARGSUSED */ int sys_fcntl(struct thread *td, struct fcntl_args *uap) { return (kern_fcntl_freebsd(td, uap->fd, uap->cmd, uap->arg)); } int kern_fcntl_freebsd(struct thread *td, int fd, int cmd, long arg) { struct flock fl; struct __oflock ofl; intptr_t arg1; int error, newcmd; error = 0; newcmd = cmd; switch (cmd) { case F_OGETLK: case F_OSETLK: case F_OSETLKW: /* * Convert old flock structure to new. */ error = copyin((void *)(intptr_t)arg, &ofl, sizeof(ofl)); fl.l_start = ofl.l_start; fl.l_len = ofl.l_len; fl.l_pid = ofl.l_pid; fl.l_type = ofl.l_type; fl.l_whence = ofl.l_whence; fl.l_sysid = 0; switch (cmd) { case F_OGETLK: newcmd = F_GETLK; break; case F_OSETLK: newcmd = F_SETLK; break; case F_OSETLKW: newcmd = F_SETLKW; break; } arg1 = (intptr_t)&fl; break; case F_GETLK: case F_SETLK: case F_SETLKW: case F_SETLK_REMOTE: error = copyin((void *)(intptr_t)arg, &fl, sizeof(fl)); arg1 = (intptr_t)&fl; break; default: arg1 = arg; break; } if (error) return (error); error = kern_fcntl(td, fd, newcmd, arg1); if (error) return (error); if (cmd == F_OGETLK) { ofl.l_start = fl.l_start; ofl.l_len = fl.l_len; ofl.l_pid = fl.l_pid; ofl.l_type = fl.l_type; ofl.l_whence = fl.l_whence; error = copyout(&ofl, (void *)(intptr_t)arg, sizeof(ofl)); } else if (cmd == F_GETLK) { error = copyout(&fl, (void *)(intptr_t)arg, sizeof(fl)); } return (error); } int kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg) { struct filedesc *fdp; struct flock *flp; struct file *fp, *fp2; struct filedescent *fde; struct proc *p; struct vnode *vp; cap_rights_t rights; int error, flg, tmp; uint64_t bsize; off_t foffset; error = 0; flg = F_POSIX; p = td->td_proc; fdp = p->p_fd; switch (cmd) { case F_DUPFD: tmp = arg; error = kern_dup(td, FDDUP_FCNTL, 0, fd, tmp); break; case F_DUPFD_CLOEXEC: tmp = arg; error = kern_dup(td, FDDUP_FCNTL, FDDUP_FLAG_CLOEXEC, fd, tmp); break; case F_DUP2FD: tmp = arg; error = kern_dup(td, FDDUP_FIXED, 0, fd, tmp); break; case F_DUP2FD_CLOEXEC: tmp = arg; error = kern_dup(td, FDDUP_FIXED, FDDUP_FLAG_CLOEXEC, fd, tmp); break; case F_GETFD: + error = EBADF; FILEDESC_SLOCK(fdp); - if (fget_locked(fdp, fd) == NULL) { - FILEDESC_SUNLOCK(fdp); - error = EBADF; - break; + fde = fdeget_locked(fdp, fd); + if (fde != NULL) { + td->td_retval[0] = + (fde->fde_flags & UF_EXCLOSE) ? FD_CLOEXEC : 0; + error = 0; } - fde = &fdp->fd_ofiles[fd]; - td->td_retval[0] = - (fde->fde_flags & UF_EXCLOSE) ? FD_CLOEXEC : 0; FILEDESC_SUNLOCK(fdp); break; case F_SETFD: + error = EBADF; FILEDESC_XLOCK(fdp); - if (fget_locked(fdp, fd) == NULL) { - FILEDESC_XUNLOCK(fdp); - error = EBADF; - break; + fde = fdeget_locked(fdp, fd); + if (fde != NULL) { + fde->fde_flags = (fde->fde_flags & ~UF_EXCLOSE) | + (arg & FD_CLOEXEC ? UF_EXCLOSE : 0); + error = 0; } - fde = &fdp->fd_ofiles[fd]; - fde->fde_flags = (fde->fde_flags & ~UF_EXCLOSE) | - (arg & FD_CLOEXEC ? UF_EXCLOSE : 0); FILEDESC_XUNLOCK(fdp); break; case F_GETFL: error = fget_fcntl(td, fd, cap_rights_init(&rights, CAP_FCNTL), F_GETFL, &fp); if (error != 0) break; td->td_retval[0] = OFLAGS(fp->f_flag); fdrop(fp, td); break; case F_SETFL: error = fget_fcntl(td, fd, cap_rights_init(&rights, CAP_FCNTL), F_SETFL, &fp); if (error != 0) break; do { tmp = flg = fp->f_flag; tmp &= ~FCNTLFLAGS; tmp |= FFLAGS(arg & ~O_ACCMODE) & FCNTLFLAGS; } while(atomic_cmpset_int(&fp->f_flag, flg, tmp) == 0); tmp = fp->f_flag & FNONBLOCK; error = fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td); if (error != 0) { fdrop(fp, td); break; } tmp = fp->f_flag & FASYNC; error = fo_ioctl(fp, FIOASYNC, &tmp, td->td_ucred, td); if (error == 0) { fdrop(fp, td); break; } atomic_clear_int(&fp->f_flag, FNONBLOCK); tmp = 0; (void)fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td); fdrop(fp, td); break; case F_GETOWN: error = fget_fcntl(td, fd, cap_rights_init(&rights, CAP_FCNTL), F_GETOWN, &fp); if (error != 0) break; error = fo_ioctl(fp, FIOGETOWN, &tmp, td->td_ucred, td); if (error == 0) td->td_retval[0] = tmp; fdrop(fp, td); break; case F_SETOWN: error = fget_fcntl(td, fd, cap_rights_init(&rights, CAP_FCNTL), F_SETOWN, &fp); if (error != 0) break; tmp = arg; error = fo_ioctl(fp, FIOSETOWN, &tmp, td->td_ucred, td); fdrop(fp, td); break; case F_SETLK_REMOTE: error = priv_check(td, PRIV_NFS_LOCKD); if (error) return (error); flg = F_REMOTE; goto do_setlk; case F_SETLKW: flg |= F_WAIT; /* FALLTHROUGH F_SETLK */ case F_SETLK: do_setlk: cap_rights_init(&rights, CAP_FLOCK); error = fget_unlocked(fdp, fd, &rights, &fp, NULL); if (error != 0) break; if (fp->f_type != DTYPE_VNODE) { error = EBADF; fdrop(fp, td); break; } flp = (struct flock *)arg; if (flp->l_whence == SEEK_CUR) { foffset = foffset_get(fp); if (foffset < 0 || (flp->l_start > 0 && foffset > OFF_MAX - flp->l_start)) { error = EOVERFLOW; fdrop(fp, td); break; } flp->l_start += foffset; } vp = fp->f_vnode; switch (flp->l_type) { case F_RDLCK: if ((fp->f_flag & FREAD) == 0) { error = EBADF; break; } PROC_LOCK(p->p_leader); p->p_leader->p_flag |= P_ADVLOCK; PROC_UNLOCK(p->p_leader); error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK, flp, flg); break; case F_WRLCK: if ((fp->f_flag & FWRITE) == 0) { error = EBADF; break; } PROC_LOCK(p->p_leader); p->p_leader->p_flag |= P_ADVLOCK; PROC_UNLOCK(p->p_leader); error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK, flp, flg); break; case F_UNLCK: error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_UNLCK, flp, flg); break; case F_UNLCKSYS: /* * Temporary api for testing remote lock * infrastructure. */ if (flg != F_REMOTE) { error = EINVAL; break; } error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_UNLCKSYS, flp, flg); break; default: error = EINVAL; break; } if (error != 0 || flp->l_type == F_UNLCK || flp->l_type == F_UNLCKSYS) { fdrop(fp, td); break; } /* * Check for a race with close. * * The vnode is now advisory locked (or unlocked, but this case * is not really important) as the caller requested. * We had to drop the filedesc lock, so we need to recheck if * the descriptor is still valid, because if it was closed * in the meantime we need to remove advisory lock from the * vnode - close on any descriptor leading to an advisory * locked vnode, removes that lock. * We will return 0 on purpose in that case, as the result of * successful advisory lock might have been externally visible * already. This is fine - effectively we pretend to the caller * that the closing thread was a bit slower and that the * advisory lock succeeded before the close. */ error = fget_unlocked(fdp, fd, &rights, &fp2, NULL); if (error != 0) { fdrop(fp, td); break; } if (fp != fp2) { flp->l_whence = SEEK_SET; flp->l_start = 0; flp->l_len = 0; flp->l_type = F_UNLCK; (void) VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_UNLCK, flp, F_POSIX); } fdrop(fp, td); fdrop(fp2, td); break; case F_GETLK: error = fget_unlocked(fdp, fd, cap_rights_init(&rights, CAP_FLOCK), &fp, NULL); if (error != 0) break; if (fp->f_type != DTYPE_VNODE) { error = EBADF; fdrop(fp, td); break; } flp = (struct flock *)arg; if (flp->l_type != F_RDLCK && flp->l_type != F_WRLCK && flp->l_type != F_UNLCK) { error = EINVAL; fdrop(fp, td); break; } if (flp->l_whence == SEEK_CUR) { foffset = foffset_get(fp); if ((flp->l_start > 0 && foffset > OFF_MAX - flp->l_start) || (flp->l_start < 0 && foffset < OFF_MIN - flp->l_start)) { error = EOVERFLOW; fdrop(fp, td); break; } flp->l_start += foffset; } vp = fp->f_vnode; error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_GETLK, flp, F_POSIX); fdrop(fp, td); break; case F_RDAHEAD: arg = arg ? 128 * 1024: 0; /* FALLTHROUGH */ case F_READAHEAD: error = fget_unlocked(fdp, fd, cap_rights_init(&rights), &fp, NULL); if (error != 0) break; if (fp->f_type != DTYPE_VNODE) { fdrop(fp, td); error = EBADF; break; } vp = fp->f_vnode; /* * Exclusive lock synchronizes against f_seqcount reads and * writes in sequential_heuristic(). */ error = vn_lock(vp, LK_EXCLUSIVE); if (error != 0) { fdrop(fp, td); break; } if (arg >= 0) { bsize = fp->f_vnode->v_mount->mnt_stat.f_iosize; fp->f_seqcount = (arg + bsize - 1) / bsize; atomic_set_int(&fp->f_flag, FRDAHEAD); } else { atomic_clear_int(&fp->f_flag, FRDAHEAD); } VOP_UNLOCK(vp, 0); fdrop(fp, td); break; default: error = EINVAL; break; } return (error); } static int getmaxfd(struct thread *td) { return (min((int)lim_cur(td, RLIMIT_NOFILE), maxfilesperproc)); } /* * Common code for dup, dup2, fcntl(F_DUPFD) and fcntl(F_DUP2FD). */ int kern_dup(struct thread *td, u_int mode, int flags, int old, int new) { struct filedesc *fdp; struct filedescent *oldfde, *newfde; struct proc *p; struct file *delfp; int error, maxfd; p = td->td_proc; fdp = p->p_fd; MPASS((flags & ~(FDDUP_FLAG_CLOEXEC)) == 0); MPASS(mode < FDDUP_LASTMODE); AUDIT_ARG_FD(old); /* XXXRW: if (flags & FDDUP_FIXED) AUDIT_ARG_FD2(new); */ /* * Verify we have a valid descriptor to dup from and possibly to * dup to. Unlike dup() and dup2(), fcntl()'s F_DUPFD should * return EINVAL when the new descriptor is out of bounds. */ if (old < 0) return (EBADF); if (new < 0) return (mode == FDDUP_FCNTL ? EINVAL : EBADF); maxfd = getmaxfd(td); if (new >= maxfd) return (mode == FDDUP_FCNTL ? EINVAL : EBADF); error = EBADF; FILEDESC_XLOCK(fdp); if (fget_locked(fdp, old) == NULL) goto unlock; if ((mode == FDDUP_FIXED || mode == FDDUP_MUSTREPLACE) && old == new) { td->td_retval[0] = new; if (flags & FDDUP_FLAG_CLOEXEC) fdp->fd_ofiles[new].fde_flags |= UF_EXCLOSE; error = 0; goto unlock; } /* * If the caller specified a file descriptor, make sure the file * table is large enough to hold it, and grab it. Otherwise, just * allocate a new descriptor the usual way. */ switch (mode) { case FDDUP_NORMAL: case FDDUP_FCNTL: if ((error = fdalloc(td, new, &new)) != 0) goto unlock; break; case FDDUP_MUSTREPLACE: /* Target file descriptor must exist. */ if (fget_locked(fdp, new) == NULL) goto unlock; break; case FDDUP_FIXED: if (new >= fdp->fd_nfiles) { /* * The resource limits are here instead of e.g. * fdalloc(), because the file descriptor table may be * shared between processes, so we can't really use * racct_add()/racct_sub(). Instead of counting the * number of actually allocated descriptors, just put * the limit on the size of the file descriptor table. */ #ifdef RACCT if (racct_enable) { PROC_LOCK(p); error = racct_set(p, RACCT_NOFILE, new + 1); PROC_UNLOCK(p); if (error != 0) { error = EMFILE; goto unlock; } } #endif fdgrowtable_exp(fdp, new + 1); } if (!fdisused(fdp, new)) fdused(fdp, new); break; default: KASSERT(0, ("%s unsupported mode %d", __func__, mode)); } KASSERT(old != new, ("new fd is same as old")); oldfde = &fdp->fd_ofiles[old]; fhold(oldfde->fde_file); newfde = &fdp->fd_ofiles[new]; delfp = newfde->fde_file; /* * Duplicate the source descriptor. */ #ifdef CAPABILITIES seq_write_begin(&newfde->fde_seq); #endif filecaps_free(&newfde->fde_caps); memcpy(newfde, oldfde, fde_change_size); filecaps_copy(&oldfde->fde_caps, &newfde->fde_caps, true); if ((flags & FDDUP_FLAG_CLOEXEC) != 0) newfde->fde_flags = oldfde->fde_flags | UF_EXCLOSE; else newfde->fde_flags = oldfde->fde_flags & ~UF_EXCLOSE; #ifdef CAPABILITIES seq_write_end(&newfde->fde_seq); #endif td->td_retval[0] = new; error = 0; if (delfp != NULL) { (void) closefp(fdp, new, delfp, td, 1); FILEDESC_UNLOCK_ASSERT(fdp); } else { unlock: FILEDESC_XUNLOCK(fdp); } return (error); } /* * If sigio is on the list associated with a process or process group, * disable signalling from the device, remove sigio from the list and * free sigio. */ void funsetown(struct sigio **sigiop) { struct sigio *sigio; if (*sigiop == NULL) return; SIGIO_LOCK(); sigio = *sigiop; if (sigio == NULL) { SIGIO_UNLOCK(); return; } *(sigio->sio_myref) = NULL; if ((sigio)->sio_pgid < 0) { struct pgrp *pg = (sigio)->sio_pgrp; PGRP_LOCK(pg); SLIST_REMOVE(&sigio->sio_pgrp->pg_sigiolst, sigio, sigio, sio_pgsigio); PGRP_UNLOCK(pg); } else { struct proc *p = (sigio)->sio_proc; PROC_LOCK(p); SLIST_REMOVE(&sigio->sio_proc->p_sigiolst, sigio, sigio, sio_pgsigio); PROC_UNLOCK(p); } SIGIO_UNLOCK(); crfree(sigio->sio_ucred); free(sigio, M_SIGIO); } /* * Free a list of sigio structures. * We only need to lock the SIGIO_LOCK because we have made ourselves * inaccessible to callers of fsetown and therefore do not need to lock * the proc or pgrp struct for the list manipulation. */ void funsetownlst(struct sigiolst *sigiolst) { struct proc *p; struct pgrp *pg; struct sigio *sigio; sigio = SLIST_FIRST(sigiolst); if (sigio == NULL) return; p = NULL; pg = NULL; /* * Every entry of the list should belong * to a single proc or pgrp. */ if (sigio->sio_pgid < 0) { pg = sigio->sio_pgrp; PGRP_LOCK_ASSERT(pg, MA_NOTOWNED); } else /* if (sigio->sio_pgid > 0) */ { p = sigio->sio_proc; PROC_LOCK_ASSERT(p, MA_NOTOWNED); } SIGIO_LOCK(); while ((sigio = SLIST_FIRST(sigiolst)) != NULL) { *(sigio->sio_myref) = NULL; if (pg != NULL) { KASSERT(sigio->sio_pgid < 0, ("Proc sigio in pgrp sigio list")); KASSERT(sigio->sio_pgrp == pg, ("Bogus pgrp in sigio list")); PGRP_LOCK(pg); SLIST_REMOVE(&pg->pg_sigiolst, sigio, sigio, sio_pgsigio); PGRP_UNLOCK(pg); } else /* if (p != NULL) */ { KASSERT(sigio->sio_pgid > 0, ("Pgrp sigio in proc sigio list")); KASSERT(sigio->sio_proc == p, ("Bogus proc in sigio list")); PROC_LOCK(p); SLIST_REMOVE(&p->p_sigiolst, sigio, sigio, sio_pgsigio); PROC_UNLOCK(p); } SIGIO_UNLOCK(); crfree(sigio->sio_ucred); free(sigio, M_SIGIO); SIGIO_LOCK(); } SIGIO_UNLOCK(); } /* * This is common code for FIOSETOWN ioctl called by fcntl(fd, F_SETOWN, arg). * * After permission checking, add a sigio structure to the sigio list for * the process or process group. */ int fsetown(pid_t pgid, struct sigio **sigiop) { struct proc *proc; struct pgrp *pgrp; struct sigio *sigio; int ret; if (pgid == 0) { funsetown(sigiop); return (0); } ret = 0; /* Allocate and fill in the new sigio out of locks. */ sigio = malloc(sizeof(struct sigio), M_SIGIO, M_WAITOK); sigio->sio_pgid = pgid; sigio->sio_ucred = crhold(curthread->td_ucred); sigio->sio_myref = sigiop; sx_slock(&proctree_lock); if (pgid > 0) { proc = pfind(pgid); if (proc == NULL) { ret = ESRCH; goto fail; } /* * Policy - Don't allow a process to FSETOWN a process * in another session. * * Remove this test to allow maximum flexibility or * restrict FSETOWN to the current process or process * group for maximum safety. */ PROC_UNLOCK(proc); if (proc->p_session != curthread->td_proc->p_session) { ret = EPERM; goto fail; } pgrp = NULL; } else /* if (pgid < 0) */ { pgrp = pgfind(-pgid); if (pgrp == NULL) { ret = ESRCH; goto fail; } PGRP_UNLOCK(pgrp); /* * Policy - Don't allow a process to FSETOWN a process * in another session. * * Remove this test to allow maximum flexibility or * restrict FSETOWN to the current process or process * group for maximum safety. */ if (pgrp->pg_session != curthread->td_proc->p_session) { ret = EPERM; goto fail; } proc = NULL; } funsetown(sigiop); if (pgid > 0) { PROC_LOCK(proc); /* * Since funsetownlst() is called without the proctree * locked, we need to check for P_WEXIT. * XXX: is ESRCH correct? */ if ((proc->p_flag & P_WEXIT) != 0) { PROC_UNLOCK(proc); ret = ESRCH; goto fail; } SLIST_INSERT_HEAD(&proc->p_sigiolst, sigio, sio_pgsigio); sigio->sio_proc = proc; PROC_UNLOCK(proc); } else { PGRP_LOCK(pgrp); SLIST_INSERT_HEAD(&pgrp->pg_sigiolst, sigio, sio_pgsigio); sigio->sio_pgrp = pgrp; PGRP_UNLOCK(pgrp); } sx_sunlock(&proctree_lock); SIGIO_LOCK(); *sigiop = sigio; SIGIO_UNLOCK(); return (0); fail: sx_sunlock(&proctree_lock); crfree(sigio->sio_ucred); free(sigio, M_SIGIO); return (ret); } /* * This is common code for FIOGETOWN ioctl called by fcntl(fd, F_GETOWN, arg). */ pid_t fgetown(sigiop) struct sigio **sigiop; { pid_t pgid; SIGIO_LOCK(); pgid = (*sigiop != NULL) ? (*sigiop)->sio_pgid : 0; SIGIO_UNLOCK(); return (pgid); } /* * Function drops the filedesc lock on return. */ static int closefp(struct filedesc *fdp, int fd, struct file *fp, struct thread *td, int holdleaders) { int error; FILEDESC_XLOCK_ASSERT(fdp); if (holdleaders) { if (td->td_proc->p_fdtol != NULL) { /* * Ask fdfree() to sleep to ensure that all relevant * process leaders can be traversed in closef(). */ fdp->fd_holdleaderscount++; } else { holdleaders = 0; } } /* * We now hold the fp reference that used to be owned by the * descriptor array. We have to unlock the FILEDESC *AFTER* * knote_fdclose to prevent a race of the fd getting opened, a knote * added, and deleteing a knote for the new fd. */ knote_fdclose(td, fd); /* * We need to notify mqueue if the object is of type mqueue. */ if (fp->f_type == DTYPE_MQUEUE) mq_fdclose(td, fd, fp); FILEDESC_XUNLOCK(fdp); error = closef(fp, td); if (holdleaders) { FILEDESC_XLOCK(fdp); fdp->fd_holdleaderscount--; if (fdp->fd_holdleaderscount == 0 && fdp->fd_holdleaderswakeup != 0) { fdp->fd_holdleaderswakeup = 0; wakeup(&fdp->fd_holdleaderscount); } FILEDESC_XUNLOCK(fdp); } return (error); } /* * Close a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct close_args { int fd; }; #endif /* ARGSUSED */ int sys_close(struct thread *td, struct close_args *uap) { return (kern_close(td, uap->fd)); } int kern_close(struct thread *td, int fd) { struct filedesc *fdp; struct file *fp; fdp = td->td_proc->p_fd; AUDIT_SYSCLOSE(td, fd); FILEDESC_XLOCK(fdp); if ((fp = fget_locked(fdp, fd)) == NULL) { FILEDESC_XUNLOCK(fdp); return (EBADF); } fdfree(fdp, fd); /* closefp() drops the FILEDESC lock for us. */ return (closefp(fdp, fd, fp, td, 1)); } /* * Close open file descriptors. */ #ifndef _SYS_SYSPROTO_H_ struct closefrom_args { int lowfd; }; #endif /* ARGSUSED */ int sys_closefrom(struct thread *td, struct closefrom_args *uap) { struct filedesc *fdp; int fd; fdp = td->td_proc->p_fd; AUDIT_ARG_FD(uap->lowfd); /* * Treat negative starting file descriptor values identical to * closefrom(0) which closes all files. */ if (uap->lowfd < 0) uap->lowfd = 0; FILEDESC_SLOCK(fdp); for (fd = uap->lowfd; fd <= fdp->fd_lastfile; fd++) { if (fdp->fd_ofiles[fd].fde_file != NULL) { FILEDESC_SUNLOCK(fdp); (void)kern_close(td, fd); FILEDESC_SLOCK(fdp); } } FILEDESC_SUNLOCK(fdp); return (0); } #if defined(COMPAT_43) /* * Return status information about a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct ofstat_args { int fd; struct ostat *sb; }; #endif /* ARGSUSED */ int ofstat(struct thread *td, struct ofstat_args *uap) { struct ostat oub; struct stat ub; int error; error = kern_fstat(td, uap->fd, &ub); if (error == 0) { cvtstat(&ub, &oub); error = copyout(&oub, uap->sb, sizeof(oub)); } return (error); } #endif /* COMPAT_43 */ /* * Return status information about a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct fstat_args { int fd; struct stat *sb; }; #endif /* ARGSUSED */ int sys_fstat(struct thread *td, struct fstat_args *uap) { struct stat ub; int error; error = kern_fstat(td, uap->fd, &ub); if (error == 0) error = copyout(&ub, uap->sb, sizeof(ub)); return (error); } int kern_fstat(struct thread *td, int fd, struct stat *sbp) { struct file *fp; cap_rights_t rights; int error; AUDIT_ARG_FD(fd); error = fget(td, fd, cap_rights_init(&rights, CAP_FSTAT), &fp); if (error != 0) return (error); AUDIT_ARG_FILE(td->td_proc, fp); error = fo_stat(fp, sbp, td->td_ucred, td); fdrop(fp, td); #ifdef KTRACE if (error == 0 && KTRPOINT(td, KTR_STRUCT)) ktrstat(sbp); #endif return (error); } /* * Return status information about a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct nfstat_args { int fd; struct nstat *sb; }; #endif /* ARGSUSED */ int sys_nfstat(struct thread *td, struct nfstat_args *uap) { struct nstat nub; struct stat ub; int error; error = kern_fstat(td, uap->fd, &ub); if (error == 0) { cvtnstat(&ub, &nub); error = copyout(&nub, uap->sb, sizeof(nub)); } return (error); } /* * Return pathconf information about a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct fpathconf_args { int fd; int name; }; #endif /* ARGSUSED */ int sys_fpathconf(struct thread *td, struct fpathconf_args *uap) { struct file *fp; struct vnode *vp; cap_rights_t rights; int error; error = fget(td, uap->fd, cap_rights_init(&rights, CAP_FPATHCONF), &fp); if (error != 0) return (error); if (uap->name == _PC_ASYNC_IO) { td->td_retval[0] = _POSIX_ASYNCHRONOUS_IO; goto out; } vp = fp->f_vnode; if (vp != NULL) { vn_lock(vp, LK_SHARED | LK_RETRY); error = VOP_PATHCONF(vp, uap->name, td->td_retval); VOP_UNLOCK(vp, 0); } else if (fp->f_type == DTYPE_PIPE || fp->f_type == DTYPE_SOCKET) { if (uap->name != _PC_PIPE_BUF) { error = EINVAL; } else { td->td_retval[0] = PIPE_BUF; error = 0; } } else { error = EOPNOTSUPP; } out: fdrop(fp, td); return (error); } /* * Initialize filecaps structure. */ void filecaps_init(struct filecaps *fcaps) { bzero(fcaps, sizeof(*fcaps)); fcaps->fc_nioctls = -1; } /* * Copy filecaps structure allocating memory for ioctls array if needed. * * The last parameter indicates whether the fdtable is locked. If it is not and * ioctls are encountered, copying fails and the caller must lock the table. * * Note that if the table was not locked, the caller has to check the relevant * sequence counter to determine whether the operation was successful. */ int filecaps_copy(const struct filecaps *src, struct filecaps *dst, bool locked) { size_t size; *dst = *src; if (src->fc_ioctls == NULL) return (0); if (!locked) return (1); KASSERT(src->fc_nioctls > 0, ("fc_ioctls != NULL, but fc_nioctls=%hd", src->fc_nioctls)); size = sizeof(src->fc_ioctls[0]) * src->fc_nioctls; dst->fc_ioctls = malloc(size, M_FILECAPS, M_WAITOK); bcopy(src->fc_ioctls, dst->fc_ioctls, size); return (0); } /* * Move filecaps structure to the new place and clear the old place. */ void filecaps_move(struct filecaps *src, struct filecaps *dst) { *dst = *src; bzero(src, sizeof(*src)); } /* * Fill the given filecaps structure with full rights. */ static void filecaps_fill(struct filecaps *fcaps) { CAP_ALL(&fcaps->fc_rights); fcaps->fc_ioctls = NULL; fcaps->fc_nioctls = -1; fcaps->fc_fcntls = CAP_FCNTL_ALL; } /* * Free memory allocated within filecaps structure. */ void filecaps_free(struct filecaps *fcaps) { free(fcaps->fc_ioctls, M_FILECAPS); bzero(fcaps, sizeof(*fcaps)); } /* * Validate the given filecaps structure. */ static void filecaps_validate(const struct filecaps *fcaps, const char *func) { KASSERT(cap_rights_is_valid(&fcaps->fc_rights), ("%s: invalid rights", func)); KASSERT((fcaps->fc_fcntls & ~CAP_FCNTL_ALL) == 0, ("%s: invalid fcntls", func)); KASSERT(fcaps->fc_fcntls == 0 || cap_rights_is_set(&fcaps->fc_rights, CAP_FCNTL), ("%s: fcntls without CAP_FCNTL", func)); KASSERT(fcaps->fc_ioctls != NULL ? fcaps->fc_nioctls > 0 : (fcaps->fc_nioctls == -1 || fcaps->fc_nioctls == 0), ("%s: invalid ioctls", func)); KASSERT(fcaps->fc_nioctls == 0 || cap_rights_is_set(&fcaps->fc_rights, CAP_IOCTL), ("%s: ioctls without CAP_IOCTL", func)); } static void fdgrowtable_exp(struct filedesc *fdp, int nfd) { int nfd1; FILEDESC_XLOCK_ASSERT(fdp); nfd1 = fdp->fd_nfiles * 2; if (nfd1 < nfd) nfd1 = nfd; fdgrowtable(fdp, nfd1); } /* * Grow the file table to accommodate (at least) nfd descriptors. */ static void fdgrowtable(struct filedesc *fdp, int nfd) { struct filedesc0 *fdp0; struct freetable *ft; struct fdescenttbl *ntable; struct fdescenttbl *otable; int nnfiles, onfiles; NDSLOTTYPE *nmap, *omap; /* * If lastfile is -1 this struct filedesc was just allocated and we are * growing it to accommodate for the one we are going to copy from. There * is no need to have a lock on this one as it's not visible to anyone. */ if (fdp->fd_lastfile != -1) FILEDESC_XLOCK_ASSERT(fdp); KASSERT(fdp->fd_nfiles > 0, ("zero-length file table")); /* save old values */ onfiles = fdp->fd_nfiles; otable = fdp->fd_files; omap = fdp->fd_map; /* compute the size of the new table */ nnfiles = NDSLOTS(nfd) * NDENTRIES; /* round up */ if (nnfiles <= onfiles) /* the table is already large enough */ return; /* * Allocate a new table. We need enough space for the number of * entries, file entries themselves and the struct freetable we will use * when we decommission the table and place it on the freelist. * We place the struct freetable in the middle so we don't have * to worry about padding. */ ntable = malloc(offsetof(struct fdescenttbl, fdt_ofiles) + nnfiles * sizeof(ntable->fdt_ofiles[0]) + sizeof(struct freetable), M_FILEDESC, M_ZERO | M_WAITOK); /* copy the old data */ ntable->fdt_nfiles = nnfiles; memcpy(ntable->fdt_ofiles, otable->fdt_ofiles, onfiles * sizeof(ntable->fdt_ofiles[0])); /* * Allocate a new map only if the old is not large enough. It will * grow at a slower rate than the table as it can map more * entries than the table can hold. */ if (NDSLOTS(nnfiles) > NDSLOTS(onfiles)) { nmap = malloc(NDSLOTS(nnfiles) * NDSLOTSIZE, M_FILEDESC, M_ZERO | M_WAITOK); /* copy over the old data and update the pointer */ memcpy(nmap, omap, NDSLOTS(onfiles) * sizeof(*omap)); fdp->fd_map = nmap; } /* * Make sure that ntable is correctly initialized before we replace * fd_files poiner. Otherwise fget_unlocked() may see inconsistent * data. */ atomic_store_rel_ptr((volatile void *)&fdp->fd_files, (uintptr_t)ntable); /* * Do not free the old file table, as some threads may still * reference entries within it. Instead, place it on a freelist * which will be processed when the struct filedesc is released. * * Note that if onfiles == NDFILE, we're dealing with the original * static allocation contained within (struct filedesc0 *)fdp, * which must not be freed. */ if (onfiles > NDFILE) { ft = (struct freetable *)&otable->fdt_ofiles[onfiles]; fdp0 = (struct filedesc0 *)fdp; ft->ft_table = otable; SLIST_INSERT_HEAD(&fdp0->fd_free, ft, ft_next); } /* * The map does not have the same possibility of threads still * holding references to it. So always free it as long as it * does not reference the original static allocation. */ if (NDSLOTS(onfiles) > NDSLOTS(NDFILE)) free(omap, M_FILEDESC); } /* * Allocate a file descriptor for the process. */ int fdalloc(struct thread *td, int minfd, int *result) { struct proc *p = td->td_proc; struct filedesc *fdp = p->p_fd; int fd, maxfd, allocfd; #ifdef RACCT int error; #endif FILEDESC_XLOCK_ASSERT(fdp); if (fdp->fd_freefile > minfd) minfd = fdp->fd_freefile; maxfd = getmaxfd(td); /* * Search the bitmap for a free descriptor starting at minfd. * If none is found, grow the file table. */ fd = fd_first_free(fdp, minfd, fdp->fd_nfiles); if (fd >= maxfd) return (EMFILE); if (fd >= fdp->fd_nfiles) { allocfd = min(fd * 2, maxfd); #ifdef RACCT if (racct_enable) { PROC_LOCK(p); error = racct_set(p, RACCT_NOFILE, allocfd); PROC_UNLOCK(p); if (error != 0) return (EMFILE); } #endif /* * fd is already equal to first free descriptor >= minfd, so * we only need to grow the table and we are done. */ fdgrowtable_exp(fdp, allocfd); } /* * Perform some sanity checks, then mark the file descriptor as * used and return it to the caller. */ KASSERT(fd >= 0 && fd < min(maxfd, fdp->fd_nfiles), ("invalid descriptor %d", fd)); KASSERT(!fdisused(fdp, fd), ("fd_first_free() returned non-free descriptor")); KASSERT(fdp->fd_ofiles[fd].fde_file == NULL, ("file descriptor isn't free")); fdused(fdp, fd); *result = fd; return (0); } /* * Allocate n file descriptors for the process. */ int fdallocn(struct thread *td, int minfd, int *fds, int n) { struct proc *p = td->td_proc; struct filedesc *fdp = p->p_fd; int i; FILEDESC_XLOCK_ASSERT(fdp); for (i = 0; i < n; i++) if (fdalloc(td, 0, &fds[i]) != 0) break; if (i < n) { for (i--; i >= 0; i--) fdunused(fdp, fds[i]); return (EMFILE); } return (0); } /* * Create a new open file structure and allocate a file descriptor for the * process that refers to it. We add one reference to the file for the * descriptor table and one reference for resultfp. This is to prevent us * being preempted and the entry in the descriptor table closed after we * release the FILEDESC lock. */ int falloc_caps(struct thread *td, struct file **resultfp, int *resultfd, int flags, struct filecaps *fcaps) { struct file *fp; int error, fd; error = falloc_noinstall(td, &fp); if (error) return (error); /* no reference held on error */ error = finstall(td, fp, &fd, flags, fcaps); if (error) { fdrop(fp, td); /* one reference (fp only) */ return (error); } if (resultfp != NULL) *resultfp = fp; /* copy out result */ else fdrop(fp, td); /* release local reference */ if (resultfd != NULL) *resultfd = fd; return (0); } /* * Create a new open file structure without allocating a file descriptor. */ int falloc_noinstall(struct thread *td, struct file **resultfp) { struct file *fp; int maxuserfiles = maxfiles - (maxfiles / 20); int openfiles_new; static struct timeval lastfail; static int curfail; KASSERT(resultfp != NULL, ("%s: resultfp == NULL", __func__)); openfiles_new = atomic_fetchadd_int(&openfiles, 1) + 1; if ((openfiles_new >= maxuserfiles && priv_check(td, PRIV_MAXFILES) != 0) || openfiles_new >= maxfiles) { atomic_subtract_int(&openfiles, 1); if (ppsratecheck(&lastfail, &curfail, 1)) { printf("kern.maxfiles limit exceeded by uid %i, (%s) " "please see tuning(7).\n", td->td_ucred->cr_ruid, td->td_proc->p_comm); } return (ENFILE); } fp = uma_zalloc(file_zone, M_WAITOK | M_ZERO); refcount_init(&fp->f_count, 1); fp->f_cred = crhold(td->td_ucred); fp->f_ops = &badfileops; *resultfp = fp; return (0); } /* * Install a file in a file descriptor table. */ void _finstall(struct filedesc *fdp, struct file *fp, int fd, int flags, struct filecaps *fcaps) { struct filedescent *fde; MPASS(fp != NULL); if (fcaps != NULL) filecaps_validate(fcaps, __func__); FILEDESC_XLOCK_ASSERT(fdp); fde = &fdp->fd_ofiles[fd]; #ifdef CAPABILITIES seq_write_begin(&fde->fde_seq); #endif fde->fde_file = fp; fde->fde_flags = (flags & O_CLOEXEC) != 0 ? UF_EXCLOSE : 0; if (fcaps != NULL) filecaps_move(fcaps, &fde->fde_caps); else filecaps_fill(&fde->fde_caps); #ifdef CAPABILITIES seq_write_end(&fde->fde_seq); #endif } int finstall(struct thread *td, struct file *fp, int *fd, int flags, struct filecaps *fcaps) { struct filedesc *fdp = td->td_proc->p_fd; int error; MPASS(fd != NULL); FILEDESC_XLOCK(fdp); if ((error = fdalloc(td, 0, fd))) { FILEDESC_XUNLOCK(fdp); return (error); } fhold(fp); _finstall(fdp, fp, *fd, flags, fcaps); FILEDESC_XUNLOCK(fdp); return (0); } /* * Build a new filedesc structure from another. * Copy the current, root, and jail root vnode references. * * If fdp is not NULL, return with it shared locked. */ struct filedesc * fdinit(struct filedesc *fdp, bool prepfiles) { struct filedesc0 *newfdp0; struct filedesc *newfdp; newfdp0 = uma_zalloc(filedesc0_zone, M_WAITOK | M_ZERO); newfdp = &newfdp0->fd_fd; /* Create the file descriptor table. */ FILEDESC_LOCK_INIT(newfdp); refcount_init(&newfdp->fd_refcnt, 1); refcount_init(&newfdp->fd_holdcnt, 1); newfdp->fd_cmask = CMASK; newfdp->fd_map = newfdp0->fd_dmap; newfdp->fd_lastfile = -1; newfdp->fd_files = (struct fdescenttbl *)&newfdp0->fd_dfiles; newfdp->fd_files->fdt_nfiles = NDFILE; if (fdp == NULL) return (newfdp); if (prepfiles && fdp->fd_lastfile >= newfdp->fd_nfiles) fdgrowtable(newfdp, fdp->fd_lastfile + 1); FILEDESC_SLOCK(fdp); newfdp->fd_cdir = fdp->fd_cdir; if (newfdp->fd_cdir) vrefact(newfdp->fd_cdir); newfdp->fd_rdir = fdp->fd_rdir; if (newfdp->fd_rdir) vrefact(newfdp->fd_rdir); newfdp->fd_jdir = fdp->fd_jdir; if (newfdp->fd_jdir) vrefact(newfdp->fd_jdir); if (!prepfiles) { FILEDESC_SUNLOCK(fdp); } else { while (fdp->fd_lastfile >= newfdp->fd_nfiles) { FILEDESC_SUNLOCK(fdp); fdgrowtable(newfdp, fdp->fd_lastfile + 1); FILEDESC_SLOCK(fdp); } } return (newfdp); } static struct filedesc * fdhold(struct proc *p) { struct filedesc *fdp; PROC_LOCK_ASSERT(p, MA_OWNED); fdp = p->p_fd; if (fdp != NULL) refcount_acquire(&fdp->fd_holdcnt); return (fdp); } static void fddrop(struct filedesc *fdp) { if (fdp->fd_holdcnt > 1) { if (refcount_release(&fdp->fd_holdcnt) == 0) return; } FILEDESC_LOCK_DESTROY(fdp); uma_zfree(filedesc0_zone, fdp); } /* * Share a filedesc structure. */ struct filedesc * fdshare(struct filedesc *fdp) { refcount_acquire(&fdp->fd_refcnt); return (fdp); } /* * Unshare a filedesc structure, if necessary by making a copy */ void fdunshare(struct thread *td) { struct filedesc *tmp; struct proc *p = td->td_proc; if (p->p_fd->fd_refcnt == 1) return; tmp = fdcopy(p->p_fd); fdescfree(td); p->p_fd = tmp; } void fdinstall_remapped(struct thread *td, struct filedesc *fdp) { fdescfree(td); td->td_proc->p_fd = fdp; } /* * Copy a filedesc structure. A NULL pointer in returns a NULL reference, * this is to ease callers, not catch errors. */ struct filedesc * fdcopy(struct filedesc *fdp) { struct filedesc *newfdp; struct filedescent *nfde, *ofde; int i; MPASS(fdp != NULL); newfdp = fdinit(fdp, true); /* copy all passable descriptors (i.e. not kqueue) */ newfdp->fd_freefile = -1; for (i = 0; i <= fdp->fd_lastfile; ++i) { ofde = &fdp->fd_ofiles[i]; if (ofde->fde_file == NULL || (ofde->fde_file->f_ops->fo_flags & DFLAG_PASSABLE) == 0) { if (newfdp->fd_freefile == -1) newfdp->fd_freefile = i; continue; } nfde = &newfdp->fd_ofiles[i]; *nfde = *ofde; filecaps_copy(&ofde->fde_caps, &nfde->fde_caps, true); fhold(nfde->fde_file); fdused_init(newfdp, i); newfdp->fd_lastfile = i; } if (newfdp->fd_freefile == -1) newfdp->fd_freefile = i; newfdp->fd_cmask = fdp->fd_cmask; FILEDESC_SUNLOCK(fdp); return (newfdp); } /* * Copies a filedesc structure, while remapping all file descriptors * stored inside using a translation table. * * File descriptors are copied over to the new file descriptor table, * regardless of whether the close-on-exec flag is set. */ int fdcopy_remapped(struct filedesc *fdp, const int *fds, size_t nfds, struct filedesc **ret) { struct filedesc *newfdp; struct filedescent *nfde, *ofde; int error, i; MPASS(fdp != NULL); newfdp = fdinit(fdp, true); if (nfds > fdp->fd_lastfile + 1) { /* New table cannot be larger than the old one. */ error = E2BIG; goto bad; } /* Copy all passable descriptors (i.e. not kqueue). */ newfdp->fd_freefile = nfds; for (i = 0; i < nfds; ++i) { if (fds[i] < 0 || fds[i] > fdp->fd_lastfile) { /* File descriptor out of bounds. */ error = EBADF; goto bad; } ofde = &fdp->fd_ofiles[fds[i]]; if (ofde->fde_file == NULL) { /* Unused file descriptor. */ error = EBADF; goto bad; } if ((ofde->fde_file->f_ops->fo_flags & DFLAG_PASSABLE) == 0) { /* File descriptor cannot be passed. */ error = EINVAL; goto bad; } nfde = &newfdp->fd_ofiles[i]; *nfde = *ofde; filecaps_copy(&ofde->fde_caps, &nfde->fde_caps, true); fhold(nfde->fde_file); fdused_init(newfdp, i); newfdp->fd_lastfile = i; } newfdp->fd_cmask = fdp->fd_cmask; FILEDESC_SUNLOCK(fdp); *ret = newfdp; return (0); bad: FILEDESC_SUNLOCK(fdp); fdescfree_remapped(newfdp); return (error); } /* * Clear POSIX style locks. This is only used when fdp looses a reference (i.e. * one of processes using it exits) and the table used to be shared. */ static void fdclearlocks(struct thread *td) { struct filedesc *fdp; struct filedesc_to_leader *fdtol; struct flock lf; struct file *fp; struct proc *p; struct vnode *vp; int i; p = td->td_proc; fdp = p->p_fd; fdtol = p->p_fdtol; MPASS(fdtol != NULL); FILEDESC_XLOCK(fdp); KASSERT(fdtol->fdl_refcount > 0, ("filedesc_to_refcount botch: fdl_refcount=%d", fdtol->fdl_refcount)); if (fdtol->fdl_refcount == 1 && (p->p_leader->p_flag & P_ADVLOCK) != 0) { for (i = 0; i <= fdp->fd_lastfile; i++) { fp = fdp->fd_ofiles[i].fde_file; if (fp == NULL || fp->f_type != DTYPE_VNODE) continue; fhold(fp); FILEDESC_XUNLOCK(fdp); lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; lf.l_type = F_UNLCK; vp = fp->f_vnode; (void) VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_UNLCK, &lf, F_POSIX); FILEDESC_XLOCK(fdp); fdrop(fp, td); } } retry: if (fdtol->fdl_refcount == 1) { if (fdp->fd_holdleaderscount > 0 && (p->p_leader->p_flag & P_ADVLOCK) != 0) { /* * close() or kern_dup() has cleared a reference * in a shared file descriptor table. */ fdp->fd_holdleaderswakeup = 1; sx_sleep(&fdp->fd_holdleaderscount, FILEDESC_LOCK(fdp), PLOCK, "fdlhold", 0); goto retry; } if (fdtol->fdl_holdcount > 0) { /* * Ensure that fdtol->fdl_leader remains * valid in closef(). */ fdtol->fdl_wakeup = 1; sx_sleep(fdtol, FILEDESC_LOCK(fdp), PLOCK, "fdlhold", 0); goto retry; } } fdtol->fdl_refcount--; if (fdtol->fdl_refcount == 0 && fdtol->fdl_holdcount == 0) { fdtol->fdl_next->fdl_prev = fdtol->fdl_prev; fdtol->fdl_prev->fdl_next = fdtol->fdl_next; } else fdtol = NULL; p->p_fdtol = NULL; FILEDESC_XUNLOCK(fdp); if (fdtol != NULL) free(fdtol, M_FILEDESC_TO_LEADER); } /* * Release a filedesc structure. */ static void fdescfree_fds(struct thread *td, struct filedesc *fdp, bool needclose) { struct filedesc0 *fdp0; struct freetable *ft, *tft; struct filedescent *fde; struct file *fp; int i; for (i = 0; i <= fdp->fd_lastfile; i++) { fde = &fdp->fd_ofiles[i]; fp = fde->fde_file; if (fp != NULL) { fdefree_last(fde); if (needclose) (void) closef(fp, td); else fdrop(fp, td); } } if (NDSLOTS(fdp->fd_nfiles) > NDSLOTS(NDFILE)) free(fdp->fd_map, M_FILEDESC); if (fdp->fd_nfiles > NDFILE) free(fdp->fd_files, M_FILEDESC); fdp0 = (struct filedesc0 *)fdp; SLIST_FOREACH_SAFE(ft, &fdp0->fd_free, ft_next, tft) free(ft->ft_table, M_FILEDESC); fddrop(fdp); } void fdescfree(struct thread *td) { struct proc *p; struct filedesc *fdp; struct vnode *cdir, *jdir, *rdir; p = td->td_proc; fdp = p->p_fd; MPASS(fdp != NULL); #ifdef RACCT if (racct_enable) { PROC_LOCK(p); racct_set(p, RACCT_NOFILE, 0); PROC_UNLOCK(p); } #endif if (p->p_fdtol != NULL) fdclearlocks(td); PROC_LOCK(p); p->p_fd = NULL; PROC_UNLOCK(p); if (refcount_release(&fdp->fd_refcnt) == 0) return; FILEDESC_XLOCK(fdp); cdir = fdp->fd_cdir; fdp->fd_cdir = NULL; rdir = fdp->fd_rdir; fdp->fd_rdir = NULL; jdir = fdp->fd_jdir; fdp->fd_jdir = NULL; FILEDESC_XUNLOCK(fdp); if (cdir != NULL) vrele(cdir); if (rdir != NULL) vrele(rdir); if (jdir != NULL) vrele(jdir); fdescfree_fds(td, fdp, 1); } void fdescfree_remapped(struct filedesc *fdp) { if (fdp->fd_cdir != NULL) vrele(fdp->fd_cdir); if (fdp->fd_rdir != NULL) vrele(fdp->fd_rdir); if (fdp->fd_jdir != NULL) vrele(fdp->fd_jdir); fdescfree_fds(curthread, fdp, 0); } /* * For setugid programs, we don't want to people to use that setugidness * to generate error messages which write to a file which otherwise would * otherwise be off-limits to the process. We check for filesystems where * the vnode can change out from under us after execve (like [lin]procfs). * * Since fdsetugidsafety calls this only for fd 0, 1 and 2, this check is * sufficient. We also don't check for setugidness since we know we are. */ static bool is_unsafe(struct file *fp) { struct vnode *vp; if (fp->f_type != DTYPE_VNODE) return (false); vp = fp->f_vnode; return ((vp->v_vflag & VV_PROCDEP) != 0); } /* * Make this setguid thing safe, if at all possible. */ void fdsetugidsafety(struct thread *td) { struct filedesc *fdp; struct file *fp; int i; fdp = td->td_proc->p_fd; KASSERT(fdp->fd_refcnt == 1, ("the fdtable should not be shared")); MPASS(fdp->fd_nfiles >= 3); for (i = 0; i <= 2; i++) { fp = fdp->fd_ofiles[i].fde_file; if (fp != NULL && is_unsafe(fp)) { FILEDESC_XLOCK(fdp); knote_fdclose(td, i); /* * NULL-out descriptor prior to close to avoid * a race while close blocks. */ fdfree(fdp, i); FILEDESC_XUNLOCK(fdp); (void) closef(fp, td); } } } /* * If a specific file object occupies a specific file descriptor, close the * file descriptor entry and drop a reference on the file object. This is a * convenience function to handle a subsequent error in a function that calls * falloc() that handles the race that another thread might have closed the * file descriptor out from under the thread creating the file object. */ void fdclose(struct thread *td, struct file *fp, int idx) { struct filedesc *fdp = td->td_proc->p_fd; FILEDESC_XLOCK(fdp); if (fdp->fd_ofiles[idx].fde_file == fp) { fdfree(fdp, idx); FILEDESC_XUNLOCK(fdp); fdrop(fp, td); } else FILEDESC_XUNLOCK(fdp); } /* * Close any files on exec? */ void fdcloseexec(struct thread *td) { struct filedesc *fdp; struct filedescent *fde; struct file *fp; int i; fdp = td->td_proc->p_fd; KASSERT(fdp->fd_refcnt == 1, ("the fdtable should not be shared")); for (i = 0; i <= fdp->fd_lastfile; i++) { fde = &fdp->fd_ofiles[i]; fp = fde->fde_file; if (fp != NULL && (fp->f_type == DTYPE_MQUEUE || (fde->fde_flags & UF_EXCLOSE))) { FILEDESC_XLOCK(fdp); fdfree(fdp, i); (void) closefp(fdp, i, fp, td, 0); FILEDESC_UNLOCK_ASSERT(fdp); } } } /* * It is unsafe for set[ug]id processes to be started with file * descriptors 0..2 closed, as these descriptors are given implicit * significance in the Standard C library. fdcheckstd() will create a * descriptor referencing /dev/null for each of stdin, stdout, and * stderr that is not already open. */ int fdcheckstd(struct thread *td) { struct filedesc *fdp; register_t save; int i, error, devnull; fdp = td->td_proc->p_fd; KASSERT(fdp->fd_refcnt == 1, ("the fdtable should not be shared")); MPASS(fdp->fd_nfiles >= 3); devnull = -1; for (i = 0; i <= 2; i++) { if (fdp->fd_ofiles[i].fde_file != NULL) continue; save = td->td_retval[0]; if (devnull != -1) { error = kern_dup(td, FDDUP_FIXED, 0, devnull, i); } else { error = kern_openat(td, AT_FDCWD, "/dev/null", UIO_SYSSPACE, O_RDWR, 0); if (error == 0) { devnull = td->td_retval[0]; KASSERT(devnull == i, ("we didn't get our fd")); } } td->td_retval[0] = save; if (error != 0) return (error); } return (0); } /* * Internal form of close. Decrement reference count on file structure. * Note: td may be NULL when closing a file that was being passed in a * message. * * XXXRW: Giant is not required for the caller, but often will be held; this * makes it moderately likely the Giant will be recursed in the VFS case. */ int closef(struct file *fp, struct thread *td) { struct vnode *vp; struct flock lf; struct filedesc_to_leader *fdtol; struct filedesc *fdp; /* * POSIX record locking dictates that any close releases ALL * locks owned by this process. This is handled by setting * a flag in the unlock to free ONLY locks obeying POSIX * semantics, and not to free BSD-style file locks. * If the descriptor was in a message, POSIX-style locks * aren't passed with the descriptor, and the thread pointer * will be NULL. Callers should be careful only to pass a * NULL thread pointer when there really is no owning * context that might have locks, or the locks will be * leaked. */ if (fp->f_type == DTYPE_VNODE && td != NULL) { vp = fp->f_vnode; if ((td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) { lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; lf.l_type = F_UNLCK; (void) VOP_ADVLOCK(vp, (caddr_t)td->td_proc->p_leader, F_UNLCK, &lf, F_POSIX); } fdtol = td->td_proc->p_fdtol; if (fdtol != NULL) { /* * Handle special case where file descriptor table is * shared between multiple process leaders. */ fdp = td->td_proc->p_fd; FILEDESC_XLOCK(fdp); for (fdtol = fdtol->fdl_next; fdtol != td->td_proc->p_fdtol; fdtol = fdtol->fdl_next) { if ((fdtol->fdl_leader->p_flag & P_ADVLOCK) == 0) continue; fdtol->fdl_holdcount++; FILEDESC_XUNLOCK(fdp); lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; lf.l_type = F_UNLCK; vp = fp->f_vnode; (void) VOP_ADVLOCK(vp, (caddr_t)fdtol->fdl_leader, F_UNLCK, &lf, F_POSIX); FILEDESC_XLOCK(fdp); fdtol->fdl_holdcount--; if (fdtol->fdl_holdcount == 0 && fdtol->fdl_wakeup != 0) { fdtol->fdl_wakeup = 0; wakeup(fdtol); } } FILEDESC_XUNLOCK(fdp); } } return (fdrop(fp, td)); } /* * Initialize the file pointer with the specified properties. * * The ops are set with release semantics to be certain that the flags, type, * and data are visible when ops is. This is to prevent ops methods from being * called with bad data. */ void finit(struct file *fp, u_int flag, short type, void *data, struct fileops *ops) { fp->f_data = data; fp->f_flag = flag; fp->f_type = type; atomic_store_rel_ptr((volatile uintptr_t *)&fp->f_ops, (uintptr_t)ops); } int +fget_cap_locked(struct filedesc *fdp, int fd, cap_rights_t *needrightsp, + struct file **fpp, struct filecaps *havecapsp) +{ + struct filedescent *fde; + int error; + + FILEDESC_LOCK_ASSERT(fdp); + + fde = fdeget_locked(fdp, fd); + if (fde == NULL) { + error = EBADF; + goto out; + } + +#ifdef CAPABILITIES + error = cap_check(cap_rights_fde(fde), needrightsp); + if (error != 0) + goto out; +#endif + + if (havecapsp != NULL) + filecaps_copy(&fde->fde_caps, havecapsp, true); + + *fpp = fde->fde_file; + + error = 0; +out: + return (error); +} + +int +fget_cap(struct thread *td, int fd, cap_rights_t *needrightsp, + struct file **fpp, struct filecaps *havecapsp) +{ + struct filedesc *fdp = td->td_proc->p_fd; + int error; +#ifndef CAPABILITIES + error = fget_unlocked(fdp, fd, needrightsp, fpp, NULL); + if (error == 0 && havecapsp != NULL) + filecaps_fill(havecapsp); +#else + struct file *fp; + seq_t seq; + + for (;;) { + error = fget_unlocked(fdp, fd, needrightsp, &fp, &seq); + if (error != 0) + return (error); + + if (havecapsp != NULL) { + if (!filecaps_copy(&fdp->fd_ofiles[fd].fde_caps, + havecapsp, false)) { + fdrop(fp, td); + goto get_locked; + } + } + + if (!fd_modified(fdp, fd, seq)) + break; + fdrop(fp, td); + } + + *fpp = fp; + return (0); + +get_locked: + FILEDESC_SLOCK(fdp); + error = fget_cap_locked(fdp, fd, needrightsp, fpp, havecapsp); + if (error == 0) + fhold(*fpp); + FILEDESC_SUNLOCK(fdp); +#endif + return (error); +} + +int fget_unlocked(struct filedesc *fdp, int fd, cap_rights_t *needrightsp, struct file **fpp, seq_t *seqp) { #ifdef CAPABILITIES struct filedescent *fde; #endif struct fdescenttbl *fdt; struct file *fp; u_int count; #ifdef CAPABILITIES seq_t seq; cap_rights_t haverights; int error; #endif fdt = fdp->fd_files; if ((u_int)fd >= fdt->fdt_nfiles) return (EBADF); /* * Fetch the descriptor locklessly. We avoid fdrop() races by * never raising a refcount above 0. To accomplish this we have * to use a cmpset loop rather than an atomic_add. The descriptor * must be re-verified once we acquire a reference to be certain * that the identity is still correct and we did not lose a race * due to preemption. */ for (;;) { #ifdef CAPABILITIES seq = seq_read(fd_seq(fdt, fd)); fde = &fdt->fdt_ofiles[fd]; haverights = *cap_rights_fde(fde); fp = fde->fde_file; if (!seq_consistent(fd_seq(fdt, fd), seq)) continue; #else fp = fdt->fdt_ofiles[fd].fde_file; #endif if (fp == NULL) return (EBADF); #ifdef CAPABILITIES error = cap_check(&haverights, needrightsp); if (error != 0) return (error); #endif retry: count = fp->f_count; if (count == 0) { /* * Force a reload. Other thread could reallocate the * table before this fd was closed, so it possible that * there is a stale fp pointer in cached version. */ fdt = *(struct fdescenttbl * volatile *)&(fdp->fd_files); continue; } /* * Use an acquire barrier to force re-reading of fdt so it is * refreshed for verification. */ if (atomic_cmpset_acq_int(&fp->f_count, count, count + 1) == 0) goto retry; fdt = fdp->fd_files; #ifdef CAPABILITIES if (seq_consistent_nomb(fd_seq(fdt, fd), seq)) #else if (fp == fdt->fdt_ofiles[fd].fde_file) #endif break; fdrop(fp, curthread); } *fpp = fp; if (seqp != NULL) { #ifdef CAPABILITIES *seqp = seq; #endif } return (0); } /* * Extract the file pointer associated with the specified descriptor for the * current user process. * * If the descriptor doesn't exist or doesn't match 'flags', EBADF is * returned. * * File's rights will be checked against the capability rights mask. * * If an error occurred the non-zero error is returned and *fpp is set to * NULL. Otherwise *fpp is held and set and zero is returned. Caller is * responsible for fdrop(). */ static __inline int _fget(struct thread *td, int fd, struct file **fpp, int flags, cap_rights_t *needrightsp, seq_t *seqp) { struct filedesc *fdp; struct file *fp; int error; *fpp = NULL; fdp = td->td_proc->p_fd; error = fget_unlocked(fdp, fd, needrightsp, &fp, seqp); if (error != 0) return (error); if (fp->f_ops == &badfileops) { fdrop(fp, td); return (EBADF); } /* * FREAD and FWRITE failure return EBADF as per POSIX. */ error = 0; switch (flags) { case FREAD: case FWRITE: if ((fp->f_flag & flags) == 0) error = EBADF; break; case FEXEC: if ((fp->f_flag & (FREAD | FEXEC)) == 0 || ((fp->f_flag & FWRITE) != 0)) error = EBADF; break; case 0: break; default: KASSERT(0, ("wrong flags")); } if (error != 0) { fdrop(fp, td); return (error); } *fpp = fp; return (0); } int fget(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp) { return (_fget(td, fd, fpp, 0, rightsp, NULL)); } int fget_mmap(struct thread *td, int fd, cap_rights_t *rightsp, u_char *maxprotp, struct file **fpp) { int error; #ifndef CAPABILITIES error = _fget(td, fd, fpp, 0, rightsp, NULL); if (maxprotp != NULL) *maxprotp = VM_PROT_ALL; #else struct filedesc *fdp = td->td_proc->p_fd; seq_t seq; MPASS(cap_rights_is_set(rightsp, CAP_MMAP)); for (;;) { error = _fget(td, fd, fpp, 0, rightsp, &seq); if (error != 0) return (error); /* * If requested, convert capability rights to access flags. */ if (maxprotp != NULL) *maxprotp = cap_rights_to_vmprot(cap_rights(fdp, fd)); if (!fd_modified(fdp, fd, seq)) break; fdrop(*fpp, td); } #endif return (error); } int fget_read(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp) { return (_fget(td, fd, fpp, FREAD, rightsp, NULL)); } int fget_write(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp) { return (_fget(td, fd, fpp, FWRITE, rightsp, NULL)); } int fget_fcntl(struct thread *td, int fd, cap_rights_t *rightsp, int needfcntl, struct file **fpp) { struct filedesc *fdp = td->td_proc->p_fd; #ifndef CAPABILITIES return (fget_unlocked(fdp, fd, rightsp, fpp, NULL)); #else int error; seq_t seq; MPASS(cap_rights_is_set(rightsp, CAP_FCNTL)); for (;;) { error = fget_unlocked(fdp, fd, rightsp, fpp, &seq); if (error != 0) return (error); error = cap_fcntl_check(fdp, fd, needfcntl); if (!fd_modified(fdp, fd, seq)) break; fdrop(*fpp, td); } if (error != 0) { fdrop(*fpp, td); *fpp = NULL; } return (error); #endif } /* * Like fget() but loads the underlying vnode, or returns an error if the * descriptor does not represent a vnode. Note that pipes use vnodes but * never have VM objects. The returned vnode will be vref()'d. * * XXX: what about the unused flags ? */ static __inline int _fgetvp(struct thread *td, int fd, int flags, cap_rights_t *needrightsp, struct vnode **vpp) { struct file *fp; int error; *vpp = NULL; error = _fget(td, fd, &fp, flags, needrightsp, NULL); if (error != 0) return (error); if (fp->f_vnode == NULL) { error = EINVAL; } else { *vpp = fp->f_vnode; vrefact(*vpp); } fdrop(fp, td); return (error); } int fgetvp(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp) { return (_fgetvp(td, fd, 0, rightsp, vpp)); } int fgetvp_rights(struct thread *td, int fd, cap_rights_t *needrightsp, struct filecaps *havecaps, struct vnode **vpp) { struct filedesc *fdp; + struct filecaps caps; struct file *fp; -#ifdef CAPABILITIES int error; -#endif fdp = td->td_proc->p_fd; - fp = fget_locked(fdp, fd); - if (fp == NULL || fp->f_ops == &badfileops) - return (EBADF); - -#ifdef CAPABILITIES - error = cap_check(cap_rights(fdp, fd), needrightsp); + error = fget_cap_locked(fdp, fd, needrightsp, &fp, &caps); if (error != 0) return (error); -#endif + if (fp->f_ops == &badfileops) { + error = EBADF; + goto out; + } + if (fp->f_vnode == NULL) { + error = EINVAL; + goto out; + } - if (fp->f_vnode == NULL) - return (EINVAL); - + *havecaps = caps; *vpp = fp->f_vnode; vrefact(*vpp); - filecaps_copy(&fdp->fd_ofiles[fd].fde_caps, havecaps, true); return (0); +out: + filecaps_free(&caps); + return (error); } int fgetvp_read(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp) { return (_fgetvp(td, fd, FREAD, rightsp, vpp)); } int fgetvp_exec(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp) { return (_fgetvp(td, fd, FEXEC, rightsp, vpp)); } #ifdef notyet int fgetvp_write(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp) { return (_fgetvp(td, fd, FWRITE, rightsp, vpp)); } #endif - -/* - * Like fget() but loads the underlying socket, or returns an error if the - * descriptor does not represent a socket. - * - * We bump the ref count on the returned socket. XXX Also obtain the SX lock - * in the future. - * - * Note: fgetsock() and fputsock() are deprecated, as consumers should rely - * on their file descriptor reference to prevent the socket from being free'd - * during use. - */ -int -fgetsock(struct thread *td, int fd, cap_rights_t *rightsp, struct socket **spp, - u_int *fflagp) -{ - struct file *fp; - int error; - - *spp = NULL; - if (fflagp != NULL) - *fflagp = 0; - if ((error = _fget(td, fd, &fp, 0, rightsp, NULL)) != 0) - return (error); - if (fp->f_type != DTYPE_SOCKET) { - error = ENOTSOCK; - } else { - *spp = fp->f_data; - if (fflagp) - *fflagp = fp->f_flag; - SOCK_LOCK(*spp); - soref(*spp); - SOCK_UNLOCK(*spp); - } - fdrop(fp, td); - - return (error); -} - -/* - * Drop the reference count on the socket and XXX release the SX lock in the - * future. The last reference closes the socket. - * - * Note: fputsock() is deprecated, see comment for fgetsock(). - */ -void -fputsock(struct socket *so) -{ - - ACCEPT_LOCK(); - SOCK_LOCK(so); - CURVNET_SET(so->so_vnet); - sorele(so); - CURVNET_RESTORE(); -} /* * Handle the last reference to a file being closed. */ int _fdrop(struct file *fp, struct thread *td) { int error; if (fp->f_count != 0) panic("fdrop: count %d", fp->f_count); error = fo_close(fp, td); atomic_subtract_int(&openfiles, 1); crfree(fp->f_cred); free(fp->f_advice, M_FADVISE); uma_zfree(file_zone, fp); return (error); } /* * Apply an advisory lock on a file descriptor. * * Just attempt to get a record lock of the requested type on the entire file * (l_whence = SEEK_SET, l_start = 0, l_len = 0). */ #ifndef _SYS_SYSPROTO_H_ struct flock_args { int fd; int how; }; #endif /* ARGSUSED */ int sys_flock(struct thread *td, struct flock_args *uap) { struct file *fp; struct vnode *vp; struct flock lf; cap_rights_t rights; int error; error = fget(td, uap->fd, cap_rights_init(&rights, CAP_FLOCK), &fp); if (error != 0) return (error); if (fp->f_type != DTYPE_VNODE) { fdrop(fp, td); return (EOPNOTSUPP); } vp = fp->f_vnode; lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; if (uap->how & LOCK_UN) { lf.l_type = F_UNLCK; atomic_clear_int(&fp->f_flag, FHASLOCK); error = VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, F_FLOCK); goto done2; } if (uap->how & LOCK_EX) lf.l_type = F_WRLCK; else if (uap->how & LOCK_SH) lf.l_type = F_RDLCK; else { error = EBADF; goto done2; } atomic_set_int(&fp->f_flag, FHASLOCK); error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, (uap->how & LOCK_NB) ? F_FLOCK : F_FLOCK | F_WAIT); done2: fdrop(fp, td); return (error); } /* * Duplicate the specified descriptor to a free descriptor. */ int dupfdopen(struct thread *td, struct filedesc *fdp, int dfd, int mode, int openerror, int *indxp) { struct filedescent *newfde, *oldfde; struct file *fp; int error, indx; KASSERT(openerror == ENODEV || openerror == ENXIO, ("unexpected error %d in %s", openerror, __func__)); /* * If the to-be-dup'd fd number is greater than the allowed number * of file descriptors, or the fd to be dup'd has already been * closed, then reject. */ FILEDESC_XLOCK(fdp); if ((fp = fget_locked(fdp, dfd)) == NULL) { FILEDESC_XUNLOCK(fdp); return (EBADF); } error = fdalloc(td, 0, &indx); if (error != 0) { FILEDESC_XUNLOCK(fdp); return (error); } /* * There are two cases of interest here. * * For ENODEV simply dup (dfd) to file descriptor (indx) and return. * * For ENXIO steal away the file structure from (dfd) and store it in * (indx). (dfd) is effectively closed by this operation. */ switch (openerror) { case ENODEV: /* * Check that the mode the file is being opened for is a * subset of the mode of the existing descriptor. */ if (((mode & (FREAD|FWRITE)) | fp->f_flag) != fp->f_flag) { fdunused(fdp, indx); FILEDESC_XUNLOCK(fdp); return (EACCES); } fhold(fp); newfde = &fdp->fd_ofiles[indx]; oldfde = &fdp->fd_ofiles[dfd]; #ifdef CAPABILITIES seq_write_begin(&newfde->fde_seq); #endif memcpy(newfde, oldfde, fde_change_size); filecaps_copy(&oldfde->fde_caps, &newfde->fde_caps, true); #ifdef CAPABILITIES seq_write_end(&newfde->fde_seq); #endif break; case ENXIO: /* * Steal away the file pointer from dfd and stuff it into indx. */ newfde = &fdp->fd_ofiles[indx]; oldfde = &fdp->fd_ofiles[dfd]; #ifdef CAPABILITIES seq_write_begin(&newfde->fde_seq); #endif memcpy(newfde, oldfde, fde_change_size); oldfde->fde_file = NULL; fdunused(fdp, dfd); #ifdef CAPABILITIES seq_write_end(&newfde->fde_seq); #endif break; } FILEDESC_XUNLOCK(fdp); *indxp = indx; return (0); } /* * This sysctl determines if we will allow a process to chroot(2) if it * has a directory open: * 0: disallowed for all processes. * 1: allowed for processes that were not already chroot(2)'ed. * 2: allowed for all processes. */ static int chroot_allow_open_directories = 1; SYSCTL_INT(_kern, OID_AUTO, chroot_allow_open_directories, CTLFLAG_RW, &chroot_allow_open_directories, 0, "Allow a process to chroot(2) if it has a directory open"); /* * Helper function for raised chroot(2) security function: Refuse if * any filedescriptors are open directories. */ static int chroot_refuse_vdir_fds(struct filedesc *fdp) { struct vnode *vp; struct file *fp; int fd; FILEDESC_LOCK_ASSERT(fdp); for (fd = 0; fd <= fdp->fd_lastfile; fd++) { fp = fget_locked(fdp, fd); if (fp == NULL) continue; if (fp->f_type == DTYPE_VNODE) { vp = fp->f_vnode; if (vp->v_type == VDIR) return (EPERM); } } return (0); } /* * Common routine for kern_chroot() and jail_attach(). The caller is * responsible for invoking priv_check() and mac_vnode_check_chroot() to * authorize this operation. */ int pwd_chroot(struct thread *td, struct vnode *vp) { struct filedesc *fdp; struct vnode *oldvp; int error; fdp = td->td_proc->p_fd; FILEDESC_XLOCK(fdp); if (chroot_allow_open_directories == 0 || (chroot_allow_open_directories == 1 && fdp->fd_rdir != rootvnode)) { error = chroot_refuse_vdir_fds(fdp); if (error != 0) { FILEDESC_XUNLOCK(fdp); return (error); } } oldvp = fdp->fd_rdir; vrefact(vp); fdp->fd_rdir = vp; if (fdp->fd_jdir == NULL) { vrefact(vp); fdp->fd_jdir = vp; } FILEDESC_XUNLOCK(fdp); vrele(oldvp); return (0); } void pwd_chdir(struct thread *td, struct vnode *vp) { struct filedesc *fdp; struct vnode *oldvp; fdp = td->td_proc->p_fd; FILEDESC_XLOCK(fdp); VNASSERT(vp->v_usecount > 0, vp, ("chdir to a vnode with zero usecount")); oldvp = fdp->fd_cdir; fdp->fd_cdir = vp; FILEDESC_XUNLOCK(fdp); vrele(oldvp); } /* * Scan all active processes and prisons to see if any of them have a current * or root directory of `olddp'. If so, replace them with the new mount point. */ void mountcheckdirs(struct vnode *olddp, struct vnode *newdp) { struct filedesc *fdp; struct prison *pr; struct proc *p; int nrele; if (vrefcnt(olddp) == 1) return; nrele = 0; sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); fdp = fdhold(p); PROC_UNLOCK(p); if (fdp == NULL) continue; FILEDESC_XLOCK(fdp); if (fdp->fd_cdir == olddp) { vrefact(newdp); fdp->fd_cdir = newdp; nrele++; } if (fdp->fd_rdir == olddp) { vrefact(newdp); fdp->fd_rdir = newdp; nrele++; } if (fdp->fd_jdir == olddp) { vrefact(newdp); fdp->fd_jdir = newdp; nrele++; } FILEDESC_XUNLOCK(fdp); fddrop(fdp); } sx_sunlock(&allproc_lock); if (rootvnode == olddp) { vrefact(newdp); rootvnode = newdp; nrele++; } mtx_lock(&prison0.pr_mtx); if (prison0.pr_root == olddp) { vrefact(newdp); prison0.pr_root = newdp; nrele++; } mtx_unlock(&prison0.pr_mtx); sx_slock(&allprison_lock); TAILQ_FOREACH(pr, &allprison, pr_list) { mtx_lock(&pr->pr_mtx); if (pr->pr_root == olddp) { vrefact(newdp); pr->pr_root = newdp; nrele++; } mtx_unlock(&pr->pr_mtx); } sx_sunlock(&allprison_lock); while (nrele--) vrele(olddp); } struct filedesc_to_leader * filedesc_to_leader_alloc(struct filedesc_to_leader *old, struct filedesc *fdp, struct proc *leader) { struct filedesc_to_leader *fdtol; fdtol = malloc(sizeof(struct filedesc_to_leader), M_FILEDESC_TO_LEADER, M_WAITOK); fdtol->fdl_refcount = 1; fdtol->fdl_holdcount = 0; fdtol->fdl_wakeup = 0; fdtol->fdl_leader = leader; if (old != NULL) { FILEDESC_XLOCK(fdp); fdtol->fdl_next = old->fdl_next; fdtol->fdl_prev = old; old->fdl_next = fdtol; fdtol->fdl_next->fdl_prev = fdtol; FILEDESC_XUNLOCK(fdp); } else { fdtol->fdl_next = fdtol; fdtol->fdl_prev = fdtol; } return (fdtol); } static int sysctl_kern_proc_nfds(SYSCTL_HANDLER_ARGS) { struct filedesc *fdp; int i, count, slots; if (*(int *)arg1 != 0) return (EINVAL); fdp = curproc->p_fd; count = 0; FILEDESC_SLOCK(fdp); slots = NDSLOTS(fdp->fd_lastfile + 1); for (i = 0; i < slots; i++) count += bitcountl(fdp->fd_map[i]); FILEDESC_SUNLOCK(fdp); return (SYSCTL_OUT(req, &count, sizeof(count))); } static SYSCTL_NODE(_kern_proc, KERN_PROC_NFDS, nfds, CTLFLAG_RD|CTLFLAG_CAPRD|CTLFLAG_MPSAFE, sysctl_kern_proc_nfds, "Number of open file descriptors"); /* * Get file structures globally. */ static int sysctl_kern_file(SYSCTL_HANDLER_ARGS) { struct xfile xf; struct filedesc *fdp; struct file *fp; struct proc *p; int error, n; error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); if (req->oldptr == NULL) { n = 0; sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); if (p->p_state == PRS_NEW) { PROC_UNLOCK(p); continue; } fdp = fdhold(p); PROC_UNLOCK(p); if (fdp == NULL) continue; /* overestimates sparse tables. */ if (fdp->fd_lastfile > 0) n += fdp->fd_lastfile; fddrop(fdp); } sx_sunlock(&allproc_lock); return (SYSCTL_OUT(req, 0, n * sizeof(xf))); } error = 0; bzero(&xf, sizeof(xf)); xf.xf_size = sizeof(xf); sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); if (p->p_state == PRS_NEW) { PROC_UNLOCK(p); continue; } if (p_cansee(req->td, p) != 0) { PROC_UNLOCK(p); continue; } xf.xf_pid = p->p_pid; xf.xf_uid = p->p_ucred->cr_uid; fdp = fdhold(p); PROC_UNLOCK(p); if (fdp == NULL) continue; FILEDESC_SLOCK(fdp); for (n = 0; fdp->fd_refcnt > 0 && n <= fdp->fd_lastfile; ++n) { if ((fp = fdp->fd_ofiles[n].fde_file) == NULL) continue; xf.xf_fd = n; xf.xf_file = fp; xf.xf_data = fp->f_data; xf.xf_vnode = fp->f_vnode; xf.xf_type = fp->f_type; xf.xf_count = fp->f_count; xf.xf_msgcount = 0; xf.xf_offset = foffset_get(fp); xf.xf_flag = fp->f_flag; error = SYSCTL_OUT(req, &xf, sizeof(xf)); if (error) break; } FILEDESC_SUNLOCK(fdp); fddrop(fdp); if (error) break; } sx_sunlock(&allproc_lock); return (error); } SYSCTL_PROC(_kern, KERN_FILE, file, CTLTYPE_OPAQUE|CTLFLAG_RD|CTLFLAG_MPSAFE, 0, 0, sysctl_kern_file, "S,xfile", "Entire file table"); #ifdef KINFO_FILE_SIZE CTASSERT(sizeof(struct kinfo_file) == KINFO_FILE_SIZE); #endif static int xlate_fflags(int fflags) { static const struct { int fflag; int kf_fflag; } fflags_table[] = { { FAPPEND, KF_FLAG_APPEND }, { FASYNC, KF_FLAG_ASYNC }, { FFSYNC, KF_FLAG_FSYNC }, { FHASLOCK, KF_FLAG_HASLOCK }, { FNONBLOCK, KF_FLAG_NONBLOCK }, { FREAD, KF_FLAG_READ }, { FWRITE, KF_FLAG_WRITE }, { O_CREAT, KF_FLAG_CREAT }, { O_DIRECT, KF_FLAG_DIRECT }, { O_EXCL, KF_FLAG_EXCL }, { O_EXEC, KF_FLAG_EXEC }, { O_EXLOCK, KF_FLAG_EXLOCK }, { O_NOFOLLOW, KF_FLAG_NOFOLLOW }, { O_SHLOCK, KF_FLAG_SHLOCK }, { O_TRUNC, KF_FLAG_TRUNC } }; unsigned int i; int kflags; kflags = 0; for (i = 0; i < nitems(fflags_table); i++) if (fflags & fflags_table[i].fflag) kflags |= fflags_table[i].kf_fflag; return (kflags); } /* Trim unused data from kf_path by truncating the structure size. */ static void pack_kinfo(struct kinfo_file *kif) { kif->kf_structsize = offsetof(struct kinfo_file, kf_path) + strlen(kif->kf_path) + 1; kif->kf_structsize = roundup(kif->kf_structsize, sizeof(uint64_t)); } static void export_file_to_kinfo(struct file *fp, int fd, cap_rights_t *rightsp, struct kinfo_file *kif, struct filedesc *fdp, int flags) { int error; bzero(kif, sizeof(*kif)); /* Set a default type to allow for empty fill_kinfo() methods. */ kif->kf_type = KF_TYPE_UNKNOWN; kif->kf_flags = xlate_fflags(fp->f_flag); if (rightsp != NULL) kif->kf_cap_rights = *rightsp; else cap_rights_init(&kif->kf_cap_rights); kif->kf_fd = fd; kif->kf_ref_count = fp->f_count; kif->kf_offset = foffset_get(fp); /* * This may drop the filedesc lock, so the 'fp' cannot be * accessed after this call. */ error = fo_fill_kinfo(fp, kif, fdp); if (error == 0) kif->kf_status |= KF_ATTR_VALID; if ((flags & KERN_FILEDESC_PACK_KINFO) != 0) pack_kinfo(kif); else kif->kf_structsize = roundup2(sizeof(*kif), sizeof(uint64_t)); } static void export_vnode_to_kinfo(struct vnode *vp, int fd, int fflags, struct kinfo_file *kif, int flags) { int error; bzero(kif, sizeof(*kif)); kif->kf_type = KF_TYPE_VNODE; error = vn_fill_kinfo_vnode(vp, kif); if (error == 0) kif->kf_status |= KF_ATTR_VALID; kif->kf_flags = xlate_fflags(fflags); cap_rights_init(&kif->kf_cap_rights); kif->kf_fd = fd; kif->kf_ref_count = -1; kif->kf_offset = -1; if ((flags & KERN_FILEDESC_PACK_KINFO) != 0) pack_kinfo(kif); else kif->kf_structsize = roundup2(sizeof(*kif), sizeof(uint64_t)); vrele(vp); } struct export_fd_buf { struct filedesc *fdp; struct sbuf *sb; ssize_t remainder; struct kinfo_file kif; int flags; }; static int export_kinfo_to_sb(struct export_fd_buf *efbuf) { struct kinfo_file *kif; kif = &efbuf->kif; if (efbuf->remainder != -1) { if (efbuf->remainder < kif->kf_structsize) { /* Terminate export. */ efbuf->remainder = 0; return (0); } efbuf->remainder -= kif->kf_structsize; } return (sbuf_bcat(efbuf->sb, kif, kif->kf_structsize) == 0 ? 0 : ENOMEM); } static int export_file_to_sb(struct file *fp, int fd, cap_rights_t *rightsp, struct export_fd_buf *efbuf) { int error; if (efbuf->remainder == 0) return (0); export_file_to_kinfo(fp, fd, rightsp, &efbuf->kif, efbuf->fdp, efbuf->flags); FILEDESC_SUNLOCK(efbuf->fdp); error = export_kinfo_to_sb(efbuf); FILEDESC_SLOCK(efbuf->fdp); return (error); } static int export_vnode_to_sb(struct vnode *vp, int fd, int fflags, struct export_fd_buf *efbuf) { int error; if (efbuf->remainder == 0) return (0); if (efbuf->fdp != NULL) FILEDESC_SUNLOCK(efbuf->fdp); export_vnode_to_kinfo(vp, fd, fflags, &efbuf->kif, efbuf->flags); error = export_kinfo_to_sb(efbuf); if (efbuf->fdp != NULL) FILEDESC_SLOCK(efbuf->fdp); return (error); } /* * Store a process file descriptor information to sbuf. * * Takes a locked proc as argument, and returns with the proc unlocked. */ int kern_proc_filedesc_out(struct proc *p, struct sbuf *sb, ssize_t maxlen, int flags) { struct file *fp; struct filedesc *fdp; struct export_fd_buf *efbuf; struct vnode *cttyvp, *textvp, *tracevp; int error, i; cap_rights_t rights; PROC_LOCK_ASSERT(p, MA_OWNED); /* ktrace vnode */ tracevp = p->p_tracevp; if (tracevp != NULL) vrefact(tracevp); /* text vnode */ textvp = p->p_textvp; if (textvp != NULL) vrefact(textvp); /* Controlling tty. */ cttyvp = NULL; if (p->p_pgrp != NULL && p->p_pgrp->pg_session != NULL) { cttyvp = p->p_pgrp->pg_session->s_ttyvp; if (cttyvp != NULL) vrefact(cttyvp); } fdp = fdhold(p); PROC_UNLOCK(p); efbuf = malloc(sizeof(*efbuf), M_TEMP, M_WAITOK); efbuf->fdp = NULL; efbuf->sb = sb; efbuf->remainder = maxlen; efbuf->flags = flags; if (tracevp != NULL) export_vnode_to_sb(tracevp, KF_FD_TYPE_TRACE, FREAD | FWRITE, efbuf); if (textvp != NULL) export_vnode_to_sb(textvp, KF_FD_TYPE_TEXT, FREAD, efbuf); if (cttyvp != NULL) export_vnode_to_sb(cttyvp, KF_FD_TYPE_CTTY, FREAD | FWRITE, efbuf); error = 0; if (fdp == NULL) goto fail; efbuf->fdp = fdp; FILEDESC_SLOCK(fdp); /* working directory */ if (fdp->fd_cdir != NULL) { vrefact(fdp->fd_cdir); export_vnode_to_sb(fdp->fd_cdir, KF_FD_TYPE_CWD, FREAD, efbuf); } /* root directory */ if (fdp->fd_rdir != NULL) { vrefact(fdp->fd_rdir); export_vnode_to_sb(fdp->fd_rdir, KF_FD_TYPE_ROOT, FREAD, efbuf); } /* jail directory */ if (fdp->fd_jdir != NULL) { vrefact(fdp->fd_jdir); export_vnode_to_sb(fdp->fd_jdir, KF_FD_TYPE_JAIL, FREAD, efbuf); } for (i = 0; fdp->fd_refcnt > 0 && i <= fdp->fd_lastfile; i++) { if ((fp = fdp->fd_ofiles[i].fde_file) == NULL) continue; #ifdef CAPABILITIES rights = *cap_rights(fdp, i); #else /* !CAPABILITIES */ cap_rights_init(&rights); #endif /* * Create sysctl entry. It is OK to drop the filedesc * lock inside of export_file_to_sb() as we will * re-validate and re-evaluate its properties when the * loop continues. */ error = export_file_to_sb(fp, i, &rights, efbuf); if (error != 0 || efbuf->remainder == 0) break; } FILEDESC_SUNLOCK(fdp); fddrop(fdp); fail: free(efbuf, M_TEMP); return (error); } #define FILEDESC_SBUF_SIZE (sizeof(struct kinfo_file) * 5) /* * Get per-process file descriptors for use by procstat(1), et al. */ static int sysctl_kern_proc_filedesc(SYSCTL_HANDLER_ARGS) { struct sbuf sb; struct proc *p; ssize_t maxlen; int error, error2, *name; name = (int *)arg1; sbuf_new_for_sysctl(&sb, NULL, FILEDESC_SBUF_SIZE, req); sbuf_clear_flags(&sb, SBUF_INCLUDENUL); error = pget((pid_t)name[0], PGET_CANDEBUG | PGET_NOTWEXIT, &p); if (error != 0) { sbuf_delete(&sb); return (error); } maxlen = req->oldptr != NULL ? req->oldlen : -1; error = kern_proc_filedesc_out(p, &sb, maxlen, KERN_FILEDESC_PACK_KINFO); error2 = sbuf_finish(&sb); sbuf_delete(&sb); return (error != 0 ? error : error2); } #ifdef KINFO_OFILE_SIZE CTASSERT(sizeof(struct kinfo_ofile) == KINFO_OFILE_SIZE); #endif #ifdef COMPAT_FREEBSD7 static void kinfo_to_okinfo(struct kinfo_file *kif, struct kinfo_ofile *okif) { okif->kf_structsize = sizeof(*okif); okif->kf_type = kif->kf_type; okif->kf_fd = kif->kf_fd; okif->kf_ref_count = kif->kf_ref_count; okif->kf_flags = kif->kf_flags & (KF_FLAG_READ | KF_FLAG_WRITE | KF_FLAG_APPEND | KF_FLAG_ASYNC | KF_FLAG_FSYNC | KF_FLAG_NONBLOCK | KF_FLAG_DIRECT | KF_FLAG_HASLOCK); okif->kf_offset = kif->kf_offset; okif->kf_vnode_type = kif->kf_vnode_type; okif->kf_sock_domain = kif->kf_sock_domain; okif->kf_sock_type = kif->kf_sock_type; okif->kf_sock_protocol = kif->kf_sock_protocol; strlcpy(okif->kf_path, kif->kf_path, sizeof(okif->kf_path)); okif->kf_sa_local = kif->kf_sa_local; okif->kf_sa_peer = kif->kf_sa_peer; } static int export_vnode_for_osysctl(struct vnode *vp, int type, struct kinfo_file *kif, struct kinfo_ofile *okif, struct filedesc *fdp, struct sysctl_req *req) { int error; vrefact(vp); FILEDESC_SUNLOCK(fdp); export_vnode_to_kinfo(vp, type, 0, kif, KERN_FILEDESC_PACK_KINFO); kinfo_to_okinfo(kif, okif); error = SYSCTL_OUT(req, okif, sizeof(*okif)); FILEDESC_SLOCK(fdp); return (error); } /* * Get per-process file descriptors for use by procstat(1), et al. */ static int sysctl_kern_proc_ofiledesc(SYSCTL_HANDLER_ARGS) { struct kinfo_ofile *okif; struct kinfo_file *kif; struct filedesc *fdp; int error, i, *name; struct file *fp; struct proc *p; name = (int *)arg1; error = pget((pid_t)name[0], PGET_CANDEBUG | PGET_NOTWEXIT, &p); if (error != 0) return (error); fdp = fdhold(p); PROC_UNLOCK(p); if (fdp == NULL) return (ENOENT); kif = malloc(sizeof(*kif), M_TEMP, M_WAITOK); okif = malloc(sizeof(*okif), M_TEMP, M_WAITOK); FILEDESC_SLOCK(fdp); if (fdp->fd_cdir != NULL) export_vnode_for_osysctl(fdp->fd_cdir, KF_FD_TYPE_CWD, kif, okif, fdp, req); if (fdp->fd_rdir != NULL) export_vnode_for_osysctl(fdp->fd_rdir, KF_FD_TYPE_ROOT, kif, okif, fdp, req); if (fdp->fd_jdir != NULL) export_vnode_for_osysctl(fdp->fd_jdir, KF_FD_TYPE_JAIL, kif, okif, fdp, req); for (i = 0; fdp->fd_refcnt > 0 && i <= fdp->fd_lastfile; i++) { if ((fp = fdp->fd_ofiles[i].fde_file) == NULL) continue; export_file_to_kinfo(fp, i, NULL, kif, fdp, KERN_FILEDESC_PACK_KINFO); FILEDESC_SUNLOCK(fdp); kinfo_to_okinfo(kif, okif); error = SYSCTL_OUT(req, okif, sizeof(*okif)); FILEDESC_SLOCK(fdp); if (error) break; } FILEDESC_SUNLOCK(fdp); fddrop(fdp); free(kif, M_TEMP); free(okif, M_TEMP); return (0); } static SYSCTL_NODE(_kern_proc, KERN_PROC_OFILEDESC, ofiledesc, CTLFLAG_RD|CTLFLAG_MPSAFE, sysctl_kern_proc_ofiledesc, "Process ofiledesc entries"); #endif /* COMPAT_FREEBSD7 */ int vntype_to_kinfo(int vtype) { struct { int vtype; int kf_vtype; } vtypes_table[] = { { VBAD, KF_VTYPE_VBAD }, { VBLK, KF_VTYPE_VBLK }, { VCHR, KF_VTYPE_VCHR }, { VDIR, KF_VTYPE_VDIR }, { VFIFO, KF_VTYPE_VFIFO }, { VLNK, KF_VTYPE_VLNK }, { VNON, KF_VTYPE_VNON }, { VREG, KF_VTYPE_VREG }, { VSOCK, KF_VTYPE_VSOCK } }; unsigned int i; /* * Perform vtype translation. */ for (i = 0; i < nitems(vtypes_table); i++) if (vtypes_table[i].vtype == vtype) return (vtypes_table[i].kf_vtype); return (KF_VTYPE_UNKNOWN); } static SYSCTL_NODE(_kern_proc, KERN_PROC_FILEDESC, filedesc, CTLFLAG_RD|CTLFLAG_MPSAFE, sysctl_kern_proc_filedesc, "Process filedesc entries"); /* * Store a process current working directory information to sbuf. * * Takes a locked proc as argument, and returns with the proc unlocked. */ int kern_proc_cwd_out(struct proc *p, struct sbuf *sb, ssize_t maxlen) { struct filedesc *fdp; struct export_fd_buf *efbuf; int error; PROC_LOCK_ASSERT(p, MA_OWNED); fdp = fdhold(p); PROC_UNLOCK(p); if (fdp == NULL) return (EINVAL); efbuf = malloc(sizeof(*efbuf), M_TEMP, M_WAITOK); efbuf->fdp = fdp; efbuf->sb = sb; efbuf->remainder = maxlen; FILEDESC_SLOCK(fdp); if (fdp->fd_cdir == NULL) error = EINVAL; else { vrefact(fdp->fd_cdir); error = export_vnode_to_sb(fdp->fd_cdir, KF_FD_TYPE_CWD, FREAD, efbuf); } FILEDESC_SUNLOCK(fdp); fddrop(fdp); free(efbuf, M_TEMP); return (error); } /* * Get per-process current working directory. */ static int sysctl_kern_proc_cwd(SYSCTL_HANDLER_ARGS) { struct sbuf sb; struct proc *p; ssize_t maxlen; int error, error2, *name; name = (int *)arg1; sbuf_new_for_sysctl(&sb, NULL, sizeof(struct kinfo_file), req); sbuf_clear_flags(&sb, SBUF_INCLUDENUL); error = pget((pid_t)name[0], PGET_CANDEBUG | PGET_NOTWEXIT, &p); if (error != 0) { sbuf_delete(&sb); return (error); } maxlen = req->oldptr != NULL ? req->oldlen : -1; error = kern_proc_cwd_out(p, &sb, maxlen); error2 = sbuf_finish(&sb); sbuf_delete(&sb); return (error != 0 ? error : error2); } static SYSCTL_NODE(_kern_proc, KERN_PROC_CWD, cwd, CTLFLAG_RD|CTLFLAG_MPSAFE, sysctl_kern_proc_cwd, "Process current working directory"); #ifdef DDB /* * For the purposes of debugging, generate a human-readable string for the * file type. */ static const char * file_type_to_name(short type) { switch (type) { case 0: return ("zero"); case DTYPE_VNODE: return ("vnod"); case DTYPE_SOCKET: return ("sock"); case DTYPE_PIPE: return ("pipe"); case DTYPE_FIFO: return ("fifo"); case DTYPE_KQUEUE: return ("kque"); case DTYPE_CRYPTO: return ("crpt"); case DTYPE_MQUEUE: return ("mque"); case DTYPE_SHM: return ("shm"); case DTYPE_SEM: return ("ksem"); default: return ("unkn"); } } /* * For the purposes of debugging, identify a process (if any, perhaps one of * many) that references the passed file in its file descriptor array. Return * NULL if none. */ static struct proc * file_to_first_proc(struct file *fp) { struct filedesc *fdp; struct proc *p; int n; FOREACH_PROC_IN_SYSTEM(p) { if (p->p_state == PRS_NEW) continue; fdp = p->p_fd; if (fdp == NULL) continue; for (n = 0; n <= fdp->fd_lastfile; n++) { if (fp == fdp->fd_ofiles[n].fde_file) return (p); } } return (NULL); } static void db_print_file(struct file *fp, int header) { struct proc *p; if (header) db_printf("%8s %4s %8s %8s %4s %5s %6s %8s %5s %12s\n", "File", "Type", "Data", "Flag", "GCFl", "Count", "MCount", "Vnode", "FPID", "FCmd"); p = file_to_first_proc(fp); db_printf("%8p %4s %8p %08x %04x %5d %6d %8p %5d %12s\n", fp, file_type_to_name(fp->f_type), fp->f_data, fp->f_flag, 0, fp->f_count, 0, fp->f_vnode, p != NULL ? p->p_pid : -1, p != NULL ? p->p_comm : "-"); } DB_SHOW_COMMAND(file, db_show_file) { struct file *fp; if (!have_addr) { db_printf("usage: show file \n"); return; } fp = (struct file *)addr; db_print_file(fp, 1); } DB_SHOW_COMMAND(files, db_show_files) { struct filedesc *fdp; struct file *fp; struct proc *p; int header; int n; header = 1; FOREACH_PROC_IN_SYSTEM(p) { if (p->p_state == PRS_NEW) continue; if ((fdp = p->p_fd) == NULL) continue; for (n = 0; n <= fdp->fd_lastfile; ++n) { if ((fp = fdp->fd_ofiles[n].fde_file) == NULL) continue; db_print_file(fp, header); header = 0; } } } #endif SYSCTL_INT(_kern, KERN_MAXFILESPERPROC, maxfilesperproc, CTLFLAG_RW, &maxfilesperproc, 0, "Maximum files allowed open per process"); SYSCTL_INT(_kern, KERN_MAXFILES, maxfiles, CTLFLAG_RW, &maxfiles, 0, "Maximum number of files"); SYSCTL_INT(_kern, OID_AUTO, openfiles, CTLFLAG_RD, __DEVOLATILE(int *, &openfiles), 0, "System-wide number of open files"); /* ARGSUSED*/ static void filelistinit(void *dummy) { file_zone = uma_zcreate("Files", sizeof(struct file), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); filedesc0_zone = uma_zcreate("filedesc0", sizeof(struct filedesc0), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); mtx_init(&sigio_lock, "sigio lock", NULL, MTX_DEF); } SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, filelistinit, NULL); /*-------------------------------------------------------------------*/ static int badfo_readwrite(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { return (EBADF); } static int badfo_truncate(struct file *fp, off_t length, struct ucred *active_cred, struct thread *td) { return (EINVAL); } static int badfo_ioctl(struct file *fp, u_long com, void *data, struct ucred *active_cred, struct thread *td) { return (EBADF); } static int badfo_poll(struct file *fp, int events, struct ucred *active_cred, struct thread *td) { return (0); } static int badfo_kqfilter(struct file *fp, struct knote *kn) { return (EBADF); } static int badfo_stat(struct file *fp, struct stat *sb, struct ucred *active_cred, struct thread *td) { return (EBADF); } static int badfo_close(struct file *fp, struct thread *td) { return (0); } static int badfo_chmod(struct file *fp, mode_t mode, struct ucred *active_cred, struct thread *td) { return (EBADF); } static int badfo_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred, struct thread *td) { return (EBADF); } static int badfo_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio, struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags, struct thread *td) { return (EBADF); } static int badfo_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) { return (0); } struct fileops badfileops = { .fo_read = badfo_readwrite, .fo_write = badfo_readwrite, .fo_truncate = badfo_truncate, .fo_ioctl = badfo_ioctl, .fo_poll = badfo_poll, .fo_kqfilter = badfo_kqfilter, .fo_stat = badfo_stat, .fo_close = badfo_close, .fo_chmod = badfo_chmod, .fo_chown = badfo_chown, .fo_sendfile = badfo_sendfile, .fo_fill_kinfo = badfo_fill_kinfo, }; int invfo_rdwr(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { return (EOPNOTSUPP); } int invfo_truncate(struct file *fp, off_t length, struct ucred *active_cred, struct thread *td) { return (EINVAL); } int invfo_ioctl(struct file *fp, u_long com, void *data, struct ucred *active_cred, struct thread *td) { return (ENOTTY); } int invfo_poll(struct file *fp, int events, struct ucred *active_cred, struct thread *td) { return (poll_no_poll(events)); } int invfo_kqfilter(struct file *fp, struct knote *kn) { return (EINVAL); } int invfo_chmod(struct file *fp, mode_t mode, struct ucred *active_cred, struct thread *td) { return (EINVAL); } int invfo_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred, struct thread *td) { return (EINVAL); } int invfo_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio, struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags, struct thread *td) { return (EINVAL); } /*-------------------------------------------------------------------*/ /* * File Descriptor pseudo-device driver (/dev/fd/). * * Opening minor device N dup()s the file (if any) connected to file * descriptor N belonging to the calling process. Note that this driver * consists of only the ``open()'' routine, because all subsequent * references to this file will be direct to the other driver. * * XXX: we could give this one a cloning event handler if necessary. */ /* ARGSUSED */ static int fdopen(struct cdev *dev, int mode, int type, struct thread *td) { /* * XXX Kludge: set curthread->td_dupfd to contain the value of the * the file descriptor being sought for duplication. The error * return ensures that the vnode for this device will be released * by vn_open. Open will detect this special error and take the * actions in dupfdopen below. Other callers of vn_open or VOP_OPEN * will simply report the error. */ td->td_dupfd = dev2unit(dev); return (ENODEV); } static struct cdevsw fildesc_cdevsw = { .d_version = D_VERSION, .d_open = fdopen, .d_name = "FD", }; static void fildesc_drvinit(void *unused) { struct cdev *dev; dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 0, NULL, UID_ROOT, GID_WHEEL, 0666, "fd/0"); make_dev_alias(dev, "stdin"); dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 1, NULL, UID_ROOT, GID_WHEEL, 0666, "fd/1"); make_dev_alias(dev, "stdout"); dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 2, NULL, UID_ROOT, GID_WHEEL, 0666, "fd/2"); make_dev_alias(dev, "stderr"); } SYSINIT(fildescdev, SI_SUB_DRIVERS, SI_ORDER_MIDDLE, fildesc_drvinit, NULL); Index: stable/11/sys/kern/kern_sendfile.c =================================================================== --- stable/11/sys/kern/kern_sendfile.c (revision 315311) +++ stable/11/sys/kern/kern_sendfile.c (revision 315312) @@ -1,1025 +1,1025 @@ /*- * Copyright (c) 2013-2015 Gleb Smirnoff * Copyright (c) 1998, David Greenman. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Structure describing a single sendfile(2) I/O, which may consist of * several underlying pager I/Os. * * The syscall context allocates the structure and initializes 'nios' * to 1. As sendfile_swapin() runs through pages and starts asynchronous * paging operations, it increments 'nios'. * * Every I/O completion calls sendfile_iodone(), which decrements the 'nios', * and the syscall also calls sendfile_iodone() after allocating all mbufs, * linking them and sending to socket. Whoever reaches zero 'nios' is * responsible to * call pru_ready on the socket, to notify it of readyness * of the data. */ struct sf_io { volatile u_int nios; u_int error; int npages; struct file *sock_fp; struct mbuf *m; vm_page_t pa[]; }; /* * Structure used to track requests with SF_SYNC flag. */ struct sendfile_sync { struct mtx mtx; struct cv cv; unsigned count; }; counter_u64_t sfstat[sizeof(struct sfstat) / sizeof(uint64_t)]; static void sfstat_init(const void *unused) { COUNTER_ARRAY_ALLOC(sfstat, sizeof(struct sfstat) / sizeof(uint64_t), M_WAITOK); } SYSINIT(sfstat, SI_SUB_MBUF, SI_ORDER_FIRST, sfstat_init, NULL); static int sfstat_sysctl(SYSCTL_HANDLER_ARGS) { struct sfstat s; COUNTER_ARRAY_COPY(sfstat, &s, sizeof(s) / sizeof(uint64_t)); if (req->newptr) COUNTER_ARRAY_ZERO(sfstat, sizeof(s) / sizeof(uint64_t)); return (SYSCTL_OUT(req, &s, sizeof(s))); } SYSCTL_PROC(_kern_ipc, OID_AUTO, sfstat, CTLTYPE_OPAQUE | CTLFLAG_RW, NULL, 0, sfstat_sysctl, "I", "sendfile statistics"); /* * Detach mapped page and release resources back to the system. Called * by mbuf(9) code when last reference to a page is freed. */ void sf_ext_free(void *arg1, void *arg2) { struct sf_buf *sf = arg1; struct sendfile_sync *sfs = arg2; vm_page_t pg = sf_buf_page(sf); sf_buf_free(sf); vm_page_lock(pg); /* * Check for the object going away on us. This can * happen since we don't hold a reference to it. * If so, we're responsible for freeing the page. */ if (vm_page_unwire(pg, PQ_INACTIVE) && pg->object == NULL) vm_page_free(pg); vm_page_unlock(pg); if (sfs != NULL) { mtx_lock(&sfs->mtx); KASSERT(sfs->count > 0, ("Sendfile sync botchup count == 0")); if (--sfs->count == 0) cv_signal(&sfs->cv); mtx_unlock(&sfs->mtx); } } /* * Same as above, but forces the page to be detached from the object * and go into free pool. */ void sf_ext_free_nocache(void *arg1, void *arg2) { struct sf_buf *sf = arg1; struct sendfile_sync *sfs = arg2; vm_page_t pg = sf_buf_page(sf); sf_buf_free(sf); vm_page_lock(pg); if (vm_page_unwire(pg, PQ_NONE)) { vm_object_t obj; /* Try to free the page, but only if it is cheap to. */ if ((obj = pg->object) == NULL) vm_page_free(pg); else if (!vm_page_xbusied(pg) && VM_OBJECT_TRYWLOCK(obj)) { vm_page_free(pg); VM_OBJECT_WUNLOCK(obj); } else vm_page_deactivate(pg); } vm_page_unlock(pg); if (sfs != NULL) { mtx_lock(&sfs->mtx); KASSERT(sfs->count > 0, ("Sendfile sync botchup count == 0")); if (--sfs->count == 0) cv_signal(&sfs->cv); mtx_unlock(&sfs->mtx); } } /* * Helper function to calculate how much data to put into page i of n. * Only first and last pages are special. */ static inline off_t xfsize(int i, int n, off_t off, off_t len) { if (i == 0) return (omin(PAGE_SIZE - (off & PAGE_MASK), len)); if (i == n - 1 && ((off + len) & PAGE_MASK) > 0) return ((off + len) & PAGE_MASK); return (PAGE_SIZE); } /* * Helper function to get offset within object for i page. */ static inline vm_offset_t vmoff(int i, off_t off) { if (i == 0) return ((vm_offset_t)off); return (trunc_page(off + i * PAGE_SIZE)); } /* * Helper function used when allocation of a page or sf_buf failed. * Pretend as if we don't have enough space, subtract xfsize() of * all pages that failed. */ static inline void fixspace(int old, int new, off_t off, int *space) { KASSERT(old > new, ("%s: old %d new %d", __func__, old, new)); /* Subtract last one. */ *space -= xfsize(old - 1, old, off, *space); old--; if (new == old) /* There was only one page. */ return; /* Subtract first one. */ if (new == 0) { *space -= xfsize(0, old, off, *space); new++; } /* Rest of pages are full sized. */ *space -= (old - new) * PAGE_SIZE; KASSERT(*space >= 0, ("%s: space went backwards", __func__)); } /* * I/O completion callback. */ static void sendfile_iodone(void *arg, vm_page_t *pg, int count, int error) { struct sf_io *sfio = arg; struct socket *so; for (int i = 0; i < count; i++) vm_page_xunbusy(pg[i]); if (error) sfio->error = error; if (!refcount_release(&sfio->nios)) return; so = sfio->sock_fp->f_data; if (sfio->error) { struct mbuf *m; /* * I/O operation failed. The state of data in the socket * is now inconsistent, and all what we can do is to tear * it down. Protocol abort method would tear down protocol * state, free all ready mbufs and detach not ready ones. * We will free the mbufs corresponding to this I/O manually. * * The socket would be marked with EIO and made available * for read, so that application receives EIO on next * syscall and eventually closes the socket. */ so->so_proto->pr_usrreqs->pru_abort(so); so->so_error = EIO; m = sfio->m; for (int i = 0; i < sfio->npages; i++) m = m_free(m); } else { CURVNET_SET(so->so_vnet); (void )(so->so_proto->pr_usrreqs->pru_ready)(so, sfio->m, sfio->npages); CURVNET_RESTORE(); } /* XXXGL: curthread */ fdrop(sfio->sock_fp, curthread); free(sfio, M_TEMP); } /* * Iterate through pages vector and request paging for non-valid pages. */ static int sendfile_swapin(vm_object_t obj, struct sf_io *sfio, off_t off, off_t len, int npages, int rhpages, int flags) { vm_page_t *pa = sfio->pa; int nios; nios = 0; flags = (flags & SF_NODISKIO) ? VM_ALLOC_NOWAIT : 0; /* * First grab all the pages and wire them. Note that we grab * only required pages. Readahead pages are dealt with later. */ VM_OBJECT_WLOCK(obj); for (int i = 0; i < npages; i++) { pa[i] = vm_page_grab(obj, OFF_TO_IDX(vmoff(i, off)), VM_ALLOC_WIRED | VM_ALLOC_NORMAL | flags); if (pa[i] == NULL) { npages = i; rhpages = 0; break; } } for (int i = 0; i < npages;) { int j, a, count, rv; /* Skip valid pages. */ if (vm_page_is_valid(pa[i], vmoff(i, off) & PAGE_MASK, xfsize(i, npages, off, len))) { vm_page_xunbusy(pa[i]); SFSTAT_INC(sf_pages_valid); i++; continue; } /* * Now 'i' points to first invalid page, iterate further * to make 'j' point at first valid after a bunch of * invalid ones. */ for (j = i + 1; j < npages; j++) if (vm_page_is_valid(pa[j], vmoff(j, off) & PAGE_MASK, xfsize(j, npages, off, len))) { SFSTAT_INC(sf_pages_valid); break; } /* * Now we got region of invalid pages between 'i' and 'j'. * Check that they belong to pager. They may not be there, * which is a regular situation for shmem pager. For vnode * pager this happens only in case of sparse file. * * Important feature of vm_pager_has_page() is the hint * stored in 'a', about how many pages we can pagein after * this page in a single I/O. */ while (!vm_pager_has_page(obj, OFF_TO_IDX(vmoff(i, off)), NULL, &a) && i < j) { pmap_zero_page(pa[i]); pa[i]->valid = VM_PAGE_BITS_ALL; pa[i]->dirty = 0; vm_page_xunbusy(pa[i]); i++; } if (i == j) continue; /* * We want to pagein as many pages as possible, limited only * by the 'a' hint and actual request. * * We should not pagein into already valid page, thus if * 'j' didn't reach last page, trim by that page. * * When the pagein fulfils the request, also specify readahead. */ if (j < npages) a = min(a, j - i - 1); count = min(a + 1, npages - i); refcount_acquire(&sfio->nios); rv = vm_pager_get_pages_async(obj, pa + i, count, NULL, i + count == npages ? &rhpages : NULL, &sendfile_iodone, sfio); KASSERT(rv == VM_PAGER_OK, ("%s: pager fail obj %p page %p", __func__, obj, pa[i])); SFSTAT_INC(sf_iocnt); SFSTAT_ADD(sf_pages_read, count); if (i + count == npages) SFSTAT_ADD(sf_rhpages_read, rhpages); #ifdef INVARIANTS for (j = i; j < i + count && j < npages; j++) KASSERT(pa[j] == vm_page_lookup(obj, OFF_TO_IDX(vmoff(j, off))), ("pa[j] %p lookup %p\n", pa[j], vm_page_lookup(obj, OFF_TO_IDX(vmoff(j, off))))); #endif i += count; nios++; } VM_OBJECT_WUNLOCK(obj); if (nios == 0 && npages != 0) SFSTAT_INC(sf_noiocnt); return (nios); } static int sendfile_getobj(struct thread *td, struct file *fp, vm_object_t *obj_res, struct vnode **vp_res, struct shmfd **shmfd_res, off_t *obj_size, int *bsize) { struct vattr va; vm_object_t obj; struct vnode *vp; struct shmfd *shmfd; int error; vp = *vp_res = NULL; obj = NULL; shmfd = *shmfd_res = NULL; *bsize = 0; /* * The file descriptor must be a regular file and have a * backing VM object. */ if (fp->f_type == DTYPE_VNODE) { vp = fp->f_vnode; vn_lock(vp, LK_SHARED | LK_RETRY); if (vp->v_type != VREG) { error = EINVAL; goto out; } *bsize = vp->v_mount->mnt_stat.f_iosize; error = VOP_GETATTR(vp, &va, td->td_ucred); if (error != 0) goto out; *obj_size = va.va_size; obj = vp->v_object; if (obj == NULL) { error = EINVAL; goto out; } } else if (fp->f_type == DTYPE_SHM) { error = 0; shmfd = fp->f_data; obj = shmfd->shm_object; *obj_size = shmfd->shm_size; } else { error = EINVAL; goto out; } VM_OBJECT_WLOCK(obj); if ((obj->flags & OBJ_DEAD) != 0) { VM_OBJECT_WUNLOCK(obj); error = EBADF; goto out; } /* * Temporarily increase the backing VM object's reference * count so that a forced reclamation of its vnode does not * immediately destroy it. */ vm_object_reference_locked(obj); VM_OBJECT_WUNLOCK(obj); *obj_res = obj; *vp_res = vp; *shmfd_res = shmfd; out: if (vp != NULL) VOP_UNLOCK(vp, 0); return (error); } static int sendfile_getsock(struct thread *td, int s, struct file **sock_fp, struct socket **so) { cap_rights_t rights; int error; *sock_fp = NULL; *so = NULL; /* * The socket must be a stream socket and connected. */ error = getsock_cap(td, s, cap_rights_init(&rights, CAP_SEND), - sock_fp, NULL); + sock_fp, NULL, NULL); if (error != 0) return (error); *so = (*sock_fp)->f_data; if ((*so)->so_type != SOCK_STREAM) return (EINVAL); if (((*so)->so_state & SS_ISCONNECTED) == 0) return (ENOTCONN); return (0); } int vn_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio, struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags, struct thread *td) { struct file *sock_fp; struct vnode *vp; struct vm_object *obj; struct socket *so; struct mbuf *m, *mh, *mhtail; struct sf_buf *sf; struct shmfd *shmfd; struct sendfile_sync *sfs; struct vattr va; off_t off, sbytes, rem, obj_size; int error, softerr, bsize, hdrlen; obj = NULL; so = NULL; m = mh = NULL; sfs = NULL; hdrlen = sbytes = 0; softerr = 0; error = sendfile_getobj(td, fp, &obj, &vp, &shmfd, &obj_size, &bsize); if (error != 0) return (error); error = sendfile_getsock(td, sockfd, &sock_fp, &so); if (error != 0) goto out; #ifdef MAC error = mac_socket_check_send(td->td_ucred, so); if (error != 0) goto out; #endif SFSTAT_INC(sf_syscalls); SFSTAT_ADD(sf_rhpages_requested, SF_READAHEAD(flags)); if (flags & SF_SYNC) { sfs = malloc(sizeof *sfs, M_TEMP, M_WAITOK | M_ZERO); mtx_init(&sfs->mtx, "sendfile", NULL, MTX_DEF); cv_init(&sfs->cv, "sendfile"); } rem = nbytes ? omin(nbytes, obj_size - offset) : obj_size - offset; /* * Protect against multiple writers to the socket. * * XXXRW: Historically this has assumed non-interruptibility, so now * we implement that, but possibly shouldn't. */ (void)sblock(&so->so_snd, SBL_WAIT | SBL_NOINTR); /* * Loop through the pages of the file, starting with the requested * offset. Get a file page (do I/O if necessary), map the file page * into an sf_buf, attach an mbuf header to the sf_buf, and queue * it on the socket. * This is done in two loops. The inner loop turns as many pages * as it can, up to available socket buffer space, without blocking * into mbufs to have it bulk delivered into the socket send buffer. * The outer loop checks the state and available space of the socket * and takes care of the overall progress. */ for (off = offset; rem > 0; ) { struct sf_io *sfio; vm_page_t *pa; struct mbuf *mtail; int nios, space, npages, rhpages; mtail = NULL; /* * Check the socket state for ongoing connection, * no errors and space in socket buffer. * If space is low allow for the remainder of the * file to be processed if it fits the socket buffer. * Otherwise block in waiting for sufficient space * to proceed, or if the socket is nonblocking, return * to userland with EAGAIN while reporting how far * we've come. * We wait until the socket buffer has significant free * space to do bulk sends. This makes good use of file * system read ahead and allows packet segmentation * offloading hardware to take over lots of work. If * we were not careful here we would send off only one * sfbuf at a time. */ SOCKBUF_LOCK(&so->so_snd); if (so->so_snd.sb_lowat < so->so_snd.sb_hiwat / 2) so->so_snd.sb_lowat = so->so_snd.sb_hiwat / 2; retry_space: if (so->so_snd.sb_state & SBS_CANTSENDMORE) { error = EPIPE; SOCKBUF_UNLOCK(&so->so_snd); goto done; } else if (so->so_error) { error = so->so_error; so->so_error = 0; SOCKBUF_UNLOCK(&so->so_snd); goto done; } space = sbspace(&so->so_snd); if (space < rem && (space <= 0 || space < so->so_snd.sb_lowat)) { if (so->so_state & SS_NBIO) { SOCKBUF_UNLOCK(&so->so_snd); error = EAGAIN; goto done; } /* * sbwait drops the lock while sleeping. * When we loop back to retry_space the * state may have changed and we retest * for it. */ error = sbwait(&so->so_snd); /* * An error from sbwait usually indicates that we've * been interrupted by a signal. If we've sent anything * then return bytes sent, otherwise return the error. */ if (error != 0) { SOCKBUF_UNLOCK(&so->so_snd); goto done; } goto retry_space; } SOCKBUF_UNLOCK(&so->so_snd); /* * At the beginning of the first loop check if any headers * are specified and copy them into mbufs. Reduce space in * the socket buffer by the size of the header mbuf chain. * Clear hdr_uio here and hdrlen at the end of the first loop. */ if (hdr_uio != NULL && hdr_uio->uio_resid > 0) { hdr_uio->uio_td = td; hdr_uio->uio_rw = UIO_WRITE; mh = m_uiotombuf(hdr_uio, M_WAITOK, space, 0, 0); hdrlen = m_length(mh, &mhtail); space -= hdrlen; /* * If header consumed all the socket buffer space, * don't waste CPU cycles and jump to the end. */ if (space == 0) { sfio = NULL; nios = 0; goto prepend_header; } hdr_uio = NULL; } if (vp != NULL) { error = vn_lock(vp, LK_SHARED); if (error != 0) goto done; error = VOP_GETATTR(vp, &va, td->td_ucred); if (error != 0 || off >= va.va_size) { VOP_UNLOCK(vp, 0); goto done; } if (va.va_size != obj_size) { if (nbytes == 0) rem += va.va_size - obj_size; else if (offset + nbytes > va.va_size) rem -= (offset + nbytes - va.va_size); obj_size = va.va_size; } } if (space > rem) space = rem; npages = howmany(space + (off & PAGE_MASK), PAGE_SIZE); /* * Calculate maximum allowed number of pages for readahead * at this iteration. First, we allow readahead up to "rem". * If application wants more, let it be, but there is no * reason to go above MAXPHYS. Also check against "obj_size", * since vm_pager_has_page() can hint beyond EOF. */ rhpages = howmany(rem + (off & PAGE_MASK), PAGE_SIZE) - npages; rhpages += SF_READAHEAD(flags); rhpages = min(howmany(MAXPHYS, PAGE_SIZE), rhpages); rhpages = min(howmany(obj_size - trunc_page(off), PAGE_SIZE) - npages, rhpages); sfio = malloc(sizeof(struct sf_io) + npages * sizeof(vm_page_t), M_TEMP, M_WAITOK); refcount_init(&sfio->nios, 1); sfio->error = 0; nios = sendfile_swapin(obj, sfio, off, space, npages, rhpages, flags); /* * Loop and construct maximum sized mbuf chain to be bulk * dumped into socket buffer. */ pa = sfio->pa; for (int i = 0; i < npages; i++) { struct mbuf *m0; /* * If a page wasn't grabbed successfully, then * trim the array. Can happen only with SF_NODISKIO. */ if (pa[i] == NULL) { SFSTAT_INC(sf_busy); fixspace(npages, i, off, &space); npages = i; softerr = EBUSY; break; } /* * Get a sendfile buf. When allocating the * first buffer for mbuf chain, we usually * wait as long as necessary, but this wait * can be interrupted. For consequent * buffers, do not sleep, since several * threads might exhaust the buffers and then * deadlock. */ sf = sf_buf_alloc(pa[i], m != NULL ? SFB_NOWAIT : SFB_CATCH); if (sf == NULL) { SFSTAT_INC(sf_allocfail); for (int j = i; j < npages; j++) { vm_page_lock(pa[j]); vm_page_unwire(pa[j], PQ_INACTIVE); vm_page_unlock(pa[j]); } if (m == NULL) softerr = ENOBUFS; fixspace(npages, i, off, &space); npages = i; break; } m0 = m_get(M_WAITOK, MT_DATA); m0->m_ext.ext_buf = (char *)sf_buf_kva(sf); m0->m_ext.ext_size = PAGE_SIZE; m0->m_ext.ext_arg1 = sf; m0->m_ext.ext_arg2 = sfs; /* * SF_NOCACHE sets the page as being freed upon send. * However, we ignore it for the last page in 'space', * if the page is truncated, and we got more data to * send (rem > space), or if we have readahead * configured (rhpages > 0). */ if ((flags & SF_NOCACHE) == 0 || (i == npages - 1 && ((off + space) & PAGE_MASK) && (rem > space || rhpages > 0))) m0->m_ext.ext_type = EXT_SFBUF; else m0->m_ext.ext_type = EXT_SFBUF_NOCACHE; m0->m_ext.ext_flags = EXT_FLAG_EMBREF; m0->m_ext.ext_count = 1; m0->m_flags |= (M_EXT | M_RDONLY); if (nios) m0->m_flags |= M_NOTREADY; m0->m_data = (char *)sf_buf_kva(sf) + (vmoff(i, off) & PAGE_MASK); m0->m_len = xfsize(i, npages, off, space); if (i == 0) sfio->m = m0; /* Append to mbuf chain. */ if (mtail != NULL) mtail->m_next = m0; else m = m0; mtail = m0; if (sfs != NULL) { mtx_lock(&sfs->mtx); sfs->count++; mtx_unlock(&sfs->mtx); } } if (vp != NULL) VOP_UNLOCK(vp, 0); /* Keep track of bytes processed. */ off += space; rem -= space; /* Prepend header, if any. */ if (hdrlen) { prepend_header: mhtail->m_next = m; m = mh; mh = NULL; } if (m == NULL) { KASSERT(softerr, ("%s: m NULL, no error", __func__)); error = softerr; free(sfio, M_TEMP); goto done; } /* Add the buffer chain to the socket buffer. */ KASSERT(m_length(m, NULL) == space + hdrlen, ("%s: mlen %u space %d hdrlen %d", __func__, m_length(m, NULL), space, hdrlen)); CURVNET_SET(so->so_vnet); if (nios == 0) { /* * If sendfile_swapin() didn't initiate any I/Os, * which happens if all data is cached in VM, then * we can send data right now without the * PRUS_NOTREADY flag. */ free(sfio, M_TEMP); error = (*so->so_proto->pr_usrreqs->pru_send) (so, 0, m, NULL, NULL, td); } else { sfio->sock_fp = sock_fp; sfio->npages = npages; fhold(sock_fp); error = (*so->so_proto->pr_usrreqs->pru_send) (so, PRUS_NOTREADY, m, NULL, NULL, td); sendfile_iodone(sfio, NULL, 0, 0); } CURVNET_RESTORE(); m = NULL; /* pru_send always consumes */ if (error) goto done; sbytes += space + hdrlen; if (hdrlen) hdrlen = 0; if (softerr) { error = softerr; goto done; } } /* * Send trailers. Wimp out and use writev(2). */ if (trl_uio != NULL) { sbunlock(&so->so_snd); error = kern_writev(td, sockfd, trl_uio); if (error == 0) sbytes += td->td_retval[0]; goto out; } done: sbunlock(&so->so_snd); out: /* * If there was no error we have to clear td->td_retval[0] * because it may have been set by writev. */ if (error == 0) { td->td_retval[0] = 0; } if (sent != NULL) { (*sent) = sbytes; } if (obj != NULL) vm_object_deallocate(obj); if (so) fdrop(sock_fp, td); if (m) m_freem(m); if (mh) m_freem(mh); if (sfs != NULL) { mtx_lock(&sfs->mtx); if (sfs->count != 0) cv_wait(&sfs->cv, &sfs->mtx); KASSERT(sfs->count == 0, ("sendfile sync still busy")); cv_destroy(&sfs->cv); mtx_destroy(&sfs->mtx); free(sfs, M_TEMP); } if (error == ERESTART) error = EINTR; return (error); } static int sendfile(struct thread *td, struct sendfile_args *uap, int compat) { struct sf_hdtr hdtr; struct uio *hdr_uio, *trl_uio; struct file *fp; cap_rights_t rights; off_t sbytes; int error; /* * File offset must be positive. If it goes beyond EOF * we send only the header/trailer and no payload data. */ if (uap->offset < 0) return (EINVAL); hdr_uio = trl_uio = NULL; if (uap->hdtr != NULL) { error = copyin(uap->hdtr, &hdtr, sizeof(hdtr)); if (error != 0) goto out; if (hdtr.headers != NULL) { error = copyinuio(hdtr.headers, hdtr.hdr_cnt, &hdr_uio); if (error != 0) goto out; #ifdef COMPAT_FREEBSD4 /* * In FreeBSD < 5.0 the nbytes to send also included * the header. If compat is specified subtract the * header size from nbytes. */ if (compat) { if (uap->nbytes > hdr_uio->uio_resid) uap->nbytes -= hdr_uio->uio_resid; else uap->nbytes = 0; } #endif } if (hdtr.trailers != NULL) { error = copyinuio(hdtr.trailers, hdtr.trl_cnt, &trl_uio); if (error != 0) goto out; } } AUDIT_ARG_FD(uap->fd); /* * sendfile(2) can start at any offset within a file so we require * CAP_READ+CAP_SEEK = CAP_PREAD. */ if ((error = fget_read(td, uap->fd, cap_rights_init(&rights, CAP_PREAD), &fp)) != 0) { goto out; } error = fo_sendfile(fp, uap->s, hdr_uio, trl_uio, uap->offset, uap->nbytes, &sbytes, uap->flags, td); fdrop(fp, td); if (uap->sbytes != NULL) copyout(&sbytes, uap->sbytes, sizeof(off_t)); out: free(hdr_uio, M_IOV); free(trl_uio, M_IOV); return (error); } /* * sendfile(2) * * int sendfile(int fd, int s, off_t offset, size_t nbytes, * struct sf_hdtr *hdtr, off_t *sbytes, int flags) * * Send a file specified by 'fd' and starting at 'offset' to a socket * specified by 's'. Send only 'nbytes' of the file or until EOF if nbytes == * 0. Optionally add a header and/or trailer to the socket output. If * specified, write the total number of bytes sent into *sbytes. */ int sys_sendfile(struct thread *td, struct sendfile_args *uap) { return (sendfile(td, uap, 0)); } #ifdef COMPAT_FREEBSD4 int freebsd4_sendfile(struct thread *td, struct freebsd4_sendfile_args *uap) { struct sendfile_args args; args.fd = uap->fd; args.s = uap->s; args.offset = uap->offset; args.nbytes = uap->nbytes; args.hdtr = uap->hdtr; args.sbytes = uap->sbytes; args.flags = uap->flags; return (sendfile(td, &args, 1)); } #endif /* COMPAT_FREEBSD4 */ Index: stable/11/sys/kern/uipc_syscalls.c =================================================================== --- stable/11/sys/kern/uipc_syscalls.c (revision 315311) +++ stable/11/sys/kern/uipc_syscalls.c (revision 315312) @@ -1,1762 +1,1769 @@ /*- * Copyright (c) 1982, 1986, 1989, 1990, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)uipc_syscalls.c 8.4 (Berkeley) 2/21/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_capsicum.h" #include "opt_inet.h" #include "opt_inet6.h" #include "opt_compat.h" #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #ifdef COMPAT_FREEBSD32 #include #endif #include #include #include /* * Flags for accept1() and kern_accept4(), in addition to SOCK_CLOEXEC * and SOCK_NONBLOCK. */ #define ACCEPT4_INHERIT 0x1 #define ACCEPT4_COMPAT 0x2 static int sendit(struct thread *td, int s, struct msghdr *mp, int flags); static int recvit(struct thread *td, int s, struct msghdr *mp, void *namelenp); static int accept1(struct thread *td, int s, struct sockaddr *uname, socklen_t *anamelen, int flags); static int getsockname1(struct thread *td, struct getsockname_args *uap, int compat); static int getpeername1(struct thread *td, struct getpeername_args *uap, int compat); static int sockargs(struct mbuf **, char *, socklen_t, int); /* * Convert a user file descriptor to a kernel file entry and check if required * capability rights are present. + * If required copy of current set of capability rights is returned. * A reference on the file entry is held upon returning. */ int getsock_cap(struct thread *td, int fd, cap_rights_t *rightsp, - struct file **fpp, u_int *fflagp) + struct file **fpp, u_int *fflagp, struct filecaps *havecapsp) { struct file *fp; int error; - error = fget_unlocked(td->td_proc->p_fd, fd, rightsp, &fp, NULL); + error = fget_cap(td, fd, rightsp, &fp, havecapsp); if (error != 0) return (error); if (fp->f_type != DTYPE_SOCKET) { fdrop(fp, td); + if (havecapsp != NULL) + filecaps_free(havecapsp); return (ENOTSOCK); } if (fflagp != NULL) *fflagp = fp->f_flag; *fpp = fp; return (0); } /* * System call interface to the socket abstraction. */ #if defined(COMPAT_43) #define COMPAT_OLDSOCK #endif int sys_socket(td, uap) struct thread *td; struct socket_args /* { int domain; int type; int protocol; } */ *uap; { struct socket *so; struct file *fp; int fd, error, type, oflag, fflag; AUDIT_ARG_SOCKET(uap->domain, uap->type, uap->protocol); type = uap->type; oflag = 0; fflag = 0; if ((type & SOCK_CLOEXEC) != 0) { type &= ~SOCK_CLOEXEC; oflag |= O_CLOEXEC; } if ((type & SOCK_NONBLOCK) != 0) { type &= ~SOCK_NONBLOCK; fflag |= FNONBLOCK; } #ifdef MAC error = mac_socket_check_create(td->td_ucred, uap->domain, type, uap->protocol); if (error != 0) return (error); #endif error = falloc(td, &fp, &fd, oflag); if (error != 0) return (error); /* An extra reference on `fp' has been held for us by falloc(). */ error = socreate(uap->domain, &so, type, uap->protocol, td->td_ucred, td); if (error != 0) { fdclose(td, fp, fd); } else { finit(fp, FREAD | FWRITE | fflag, DTYPE_SOCKET, so, &socketops); if ((fflag & FNONBLOCK) != 0) (void) fo_ioctl(fp, FIONBIO, &fflag, td->td_ucred, td); td->td_retval[0] = fd; } fdrop(fp, td); return (error); } /* ARGSUSED */ int sys_bind(td, uap) struct thread *td; struct bind_args /* { int s; caddr_t name; int namelen; } */ *uap; { struct sockaddr *sa; int error; error = getsockaddr(&sa, uap->name, uap->namelen); if (error == 0) { error = kern_bindat(td, AT_FDCWD, uap->s, sa); free(sa, M_SONAME); } return (error); } int kern_bindat(struct thread *td, int dirfd, int fd, struct sockaddr *sa) { struct socket *so; struct file *fp; cap_rights_t rights; int error; AUDIT_ARG_FD(fd); AUDIT_ARG_SOCKADDR(td, dirfd, sa); error = getsock_cap(td, fd, cap_rights_init(&rights, CAP_BIND), - &fp, NULL); + &fp, NULL, NULL); if (error != 0) return (error); so = fp->f_data; #ifdef KTRACE if (KTRPOINT(td, KTR_STRUCT)) ktrsockaddr(sa); #endif #ifdef MAC error = mac_socket_check_bind(td->td_ucred, so, sa); if (error == 0) { #endif if (dirfd == AT_FDCWD) error = sobind(so, sa, td); else error = sobindat(dirfd, so, sa, td); #ifdef MAC } #endif fdrop(fp, td); return (error); } /* ARGSUSED */ int sys_bindat(td, uap) struct thread *td; struct bindat_args /* { int fd; int s; caddr_t name; int namelen; } */ *uap; { struct sockaddr *sa; int error; error = getsockaddr(&sa, uap->name, uap->namelen); if (error == 0) { error = kern_bindat(td, uap->fd, uap->s, sa); free(sa, M_SONAME); } return (error); } /* ARGSUSED */ int sys_listen(td, uap) struct thread *td; struct listen_args /* { int s; int backlog; } */ *uap; { struct socket *so; struct file *fp; cap_rights_t rights; int error; AUDIT_ARG_FD(uap->s); error = getsock_cap(td, uap->s, cap_rights_init(&rights, CAP_LISTEN), - &fp, NULL); + &fp, NULL, NULL); if (error == 0) { so = fp->f_data; #ifdef MAC error = mac_socket_check_listen(td->td_ucred, so); if (error == 0) #endif error = solisten(so, uap->backlog, td); fdrop(fp, td); } return(error); } /* * accept1() */ static int accept1(td, s, uname, anamelen, flags) struct thread *td; int s; struct sockaddr *uname; socklen_t *anamelen; int flags; { struct sockaddr *name; socklen_t namelen; struct file *fp; int error; if (uname == NULL) return (kern_accept4(td, s, NULL, NULL, flags, NULL)); error = copyin(anamelen, &namelen, sizeof (namelen)); if (error != 0) return (error); error = kern_accept4(td, s, &name, &namelen, flags, &fp); if (error != 0) return (error); if (error == 0 && uname != NULL) { #ifdef COMPAT_OLDSOCK if (flags & ACCEPT4_COMPAT) ((struct osockaddr *)name)->sa_family = name->sa_family; #endif error = copyout(name, uname, namelen); } if (error == 0) error = copyout(&namelen, anamelen, sizeof(namelen)); if (error != 0) fdclose(td, fp, td->td_retval[0]); fdrop(fp, td); free(name, M_SONAME); return (error); } int kern_accept(struct thread *td, int s, struct sockaddr **name, socklen_t *namelen, struct file **fp) { return (kern_accept4(td, s, name, namelen, ACCEPT4_INHERIT, fp)); } int kern_accept4(struct thread *td, int s, struct sockaddr **name, socklen_t *namelen, int flags, struct file **fp) { struct file *headfp, *nfp = NULL; struct sockaddr *sa = NULL; struct socket *head, *so; + struct filecaps fcaps; cap_rights_t rights; u_int fflag; pid_t pgid; int error, fd, tmp; if (name != NULL) *name = NULL; AUDIT_ARG_FD(s); error = getsock_cap(td, s, cap_rights_init(&rights, CAP_ACCEPT), - &headfp, &fflag); + &headfp, &fflag, &fcaps); if (error != 0) return (error); head = headfp->f_data; if ((head->so_options & SO_ACCEPTCONN) == 0) { error = EINVAL; goto done; } #ifdef MAC error = mac_socket_check_accept(td->td_ucred, head); if (error != 0) goto done; #endif - error = falloc(td, &nfp, &fd, (flags & SOCK_CLOEXEC) ? O_CLOEXEC : 0); + error = falloc_caps(td, &nfp, &fd, + (flags & SOCK_CLOEXEC) ? O_CLOEXEC : 0, &fcaps); if (error != 0) goto done; ACCEPT_LOCK(); if ((head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->so_comp)) { ACCEPT_UNLOCK(); error = EWOULDBLOCK; goto noconnection; } while (TAILQ_EMPTY(&head->so_comp) && head->so_error == 0) { if (head->so_rcv.sb_state & SBS_CANTRCVMORE) { head->so_error = ECONNABORTED; break; } error = msleep(&head->so_timeo, &accept_mtx, PSOCK | PCATCH, "accept", 0); if (error != 0) { ACCEPT_UNLOCK(); goto noconnection; } } if (head->so_error) { error = head->so_error; head->so_error = 0; ACCEPT_UNLOCK(); goto noconnection; } so = TAILQ_FIRST(&head->so_comp); KASSERT(!(so->so_qstate & SQ_INCOMP), ("accept1: so SQ_INCOMP")); KASSERT(so->so_qstate & SQ_COMP, ("accept1: so not SQ_COMP")); /* * Before changing the flags on the socket, we have to bump the * reference count. Otherwise, if the protocol calls sofree(), * the socket will be released due to a zero refcount. */ SOCK_LOCK(so); /* soref() and so_state update */ soref(so); /* file descriptor reference */ TAILQ_REMOVE(&head->so_comp, so, so_list); head->so_qlen--; if (flags & ACCEPT4_INHERIT) so->so_state |= (head->so_state & SS_NBIO); else so->so_state |= (flags & SOCK_NONBLOCK) ? SS_NBIO : 0; so->so_qstate &= ~SQ_COMP; so->so_head = NULL; SOCK_UNLOCK(so); ACCEPT_UNLOCK(); /* An extra reference on `nfp' has been held for us by falloc(). */ td->td_retval[0] = fd; /* connection has been removed from the listen queue */ KNOTE_UNLOCKED(&head->so_rcv.sb_sel.si_note, 0); if (flags & ACCEPT4_INHERIT) { pgid = fgetown(&head->so_sigio); if (pgid != 0) fsetown(pgid, &so->so_sigio); } else { fflag &= ~(FNONBLOCK | FASYNC); if (flags & SOCK_NONBLOCK) fflag |= FNONBLOCK; } finit(nfp, fflag, DTYPE_SOCKET, so, &socketops); /* Sync socket nonblocking/async state with file flags */ tmp = fflag & FNONBLOCK; (void) fo_ioctl(nfp, FIONBIO, &tmp, td->td_ucred, td); tmp = fflag & FASYNC; (void) fo_ioctl(nfp, FIOASYNC, &tmp, td->td_ucred, td); sa = NULL; error = soaccept(so, &sa); if (error != 0) goto noconnection; if (sa == NULL) { if (name) *namelen = 0; goto done; } AUDIT_ARG_SOCKADDR(td, AT_FDCWD, sa); if (name) { /* check sa_len before it is destroyed */ if (*namelen > sa->sa_len) *namelen = sa->sa_len; #ifdef KTRACE if (KTRPOINT(td, KTR_STRUCT)) ktrsockaddr(sa); #endif *name = sa; sa = NULL; } noconnection: free(sa, M_SONAME); /* * close the new descriptor, assuming someone hasn't ripped it * out from under us. */ if (error != 0) fdclose(td, nfp, fd); /* * Release explicitly held references before returning. We return * a reference on nfp to the caller on success if they request it. */ done: + if (nfp == NULL) + filecaps_free(&fcaps); if (fp != NULL) { if (error == 0) { *fp = nfp; nfp = NULL; } else *fp = NULL; } if (nfp != NULL) fdrop(nfp, td); fdrop(headfp, td); return (error); } int sys_accept(td, uap) struct thread *td; struct accept_args *uap; { return (accept1(td, uap->s, uap->name, uap->anamelen, ACCEPT4_INHERIT)); } int sys_accept4(td, uap) struct thread *td; struct accept4_args *uap; { if (uap->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK)) return (EINVAL); return (accept1(td, uap->s, uap->name, uap->anamelen, uap->flags)); } #ifdef COMPAT_OLDSOCK int oaccept(td, uap) struct thread *td; struct accept_args *uap; { return (accept1(td, uap->s, uap->name, uap->anamelen, ACCEPT4_INHERIT | ACCEPT4_COMPAT)); } #endif /* COMPAT_OLDSOCK */ /* ARGSUSED */ int sys_connect(td, uap) struct thread *td; struct connect_args /* { int s; caddr_t name; int namelen; } */ *uap; { struct sockaddr *sa; int error; error = getsockaddr(&sa, uap->name, uap->namelen); if (error == 0) { error = kern_connectat(td, AT_FDCWD, uap->s, sa); free(sa, M_SONAME); } return (error); } int kern_connectat(struct thread *td, int dirfd, int fd, struct sockaddr *sa) { struct socket *so; struct file *fp; cap_rights_t rights; int error, interrupted = 0; AUDIT_ARG_FD(fd); AUDIT_ARG_SOCKADDR(td, dirfd, sa); error = getsock_cap(td, fd, cap_rights_init(&rights, CAP_CONNECT), - &fp, NULL); + &fp, NULL, NULL); if (error != 0) return (error); so = fp->f_data; if (so->so_state & SS_ISCONNECTING) { error = EALREADY; goto done1; } #ifdef KTRACE if (KTRPOINT(td, KTR_STRUCT)) ktrsockaddr(sa); #endif #ifdef MAC error = mac_socket_check_connect(td->td_ucred, so, sa); if (error != 0) goto bad; #endif if (dirfd == AT_FDCWD) error = soconnect(so, sa, td); else error = soconnectat(dirfd, so, sa, td); if (error != 0) goto bad; if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING)) { error = EINPROGRESS; goto done1; } SOCK_LOCK(so); while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) { error = msleep(&so->so_timeo, SOCK_MTX(so), PSOCK | PCATCH, "connec", 0); if (error != 0) { if (error == EINTR || error == ERESTART) interrupted = 1; break; } } if (error == 0) { error = so->so_error; so->so_error = 0; } SOCK_UNLOCK(so); bad: if (!interrupted) so->so_state &= ~SS_ISCONNECTING; if (error == ERESTART) error = EINTR; done1: fdrop(fp, td); return (error); } /* ARGSUSED */ int sys_connectat(td, uap) struct thread *td; struct connectat_args /* { int fd; int s; caddr_t name; int namelen; } */ *uap; { struct sockaddr *sa; int error; error = getsockaddr(&sa, uap->name, uap->namelen); if (error == 0) { error = kern_connectat(td, uap->fd, uap->s, sa); free(sa, M_SONAME); } return (error); } int kern_socketpair(struct thread *td, int domain, int type, int protocol, int *rsv) { struct file *fp1, *fp2; struct socket *so1, *so2; int fd, error, oflag, fflag; AUDIT_ARG_SOCKET(domain, type, protocol); oflag = 0; fflag = 0; if ((type & SOCK_CLOEXEC) != 0) { type &= ~SOCK_CLOEXEC; oflag |= O_CLOEXEC; } if ((type & SOCK_NONBLOCK) != 0) { type &= ~SOCK_NONBLOCK; fflag |= FNONBLOCK; } #ifdef MAC /* We might want to have a separate check for socket pairs. */ error = mac_socket_check_create(td->td_ucred, domain, type, protocol); if (error != 0) return (error); #endif error = socreate(domain, &so1, type, protocol, td->td_ucred, td); if (error != 0) return (error); error = socreate(domain, &so2, type, protocol, td->td_ucred, td); if (error != 0) goto free1; /* On success extra reference to `fp1' and 'fp2' is set by falloc. */ error = falloc(td, &fp1, &fd, oflag); if (error != 0) goto free2; rsv[0] = fd; fp1->f_data = so1; /* so1 already has ref count */ error = falloc(td, &fp2, &fd, oflag); if (error != 0) goto free3; fp2->f_data = so2; /* so2 already has ref count */ rsv[1] = fd; error = soconnect2(so1, so2); if (error != 0) goto free4; if (type == SOCK_DGRAM) { /* * Datagram socket connection is asymmetric. */ error = soconnect2(so2, so1); if (error != 0) goto free4; } finit(fp1, FREAD | FWRITE | fflag, DTYPE_SOCKET, fp1->f_data, &socketops); finit(fp2, FREAD | FWRITE | fflag, DTYPE_SOCKET, fp2->f_data, &socketops); if ((fflag & FNONBLOCK) != 0) { (void) fo_ioctl(fp1, FIONBIO, &fflag, td->td_ucred, td); (void) fo_ioctl(fp2, FIONBIO, &fflag, td->td_ucred, td); } fdrop(fp1, td); fdrop(fp2, td); return (0); free4: fdclose(td, fp2, rsv[1]); fdrop(fp2, td); free3: fdclose(td, fp1, rsv[0]); fdrop(fp1, td); free2: if (so2 != NULL) (void)soclose(so2); free1: if (so1 != NULL) (void)soclose(so1); return (error); } int sys_socketpair(struct thread *td, struct socketpair_args *uap) { int error, sv[2]; error = kern_socketpair(td, uap->domain, uap->type, uap->protocol, sv); if (error != 0) return (error); error = copyout(sv, uap->rsv, 2 * sizeof(int)); if (error != 0) { (void)kern_close(td, sv[0]); (void)kern_close(td, sv[1]); } return (error); } static int sendit(td, s, mp, flags) struct thread *td; int s; struct msghdr *mp; int flags; { struct mbuf *control; struct sockaddr *to; int error; #ifdef CAPABILITY_MODE if (IN_CAPABILITY_MODE(td) && (mp->msg_name != NULL)) return (ECAPMODE); #endif if (mp->msg_name != NULL) { error = getsockaddr(&to, mp->msg_name, mp->msg_namelen); if (error != 0) { to = NULL; goto bad; } mp->msg_name = to; } else { to = NULL; } if (mp->msg_control) { if (mp->msg_controllen < sizeof(struct cmsghdr) #ifdef COMPAT_OLDSOCK && mp->msg_flags != MSG_COMPAT #endif ) { error = EINVAL; goto bad; } error = sockargs(&control, mp->msg_control, mp->msg_controllen, MT_CONTROL); if (error != 0) goto bad; #ifdef COMPAT_OLDSOCK if (mp->msg_flags == MSG_COMPAT) { struct cmsghdr *cm; M_PREPEND(control, sizeof(*cm), M_WAITOK); cm = mtod(control, struct cmsghdr *); cm->cmsg_len = control->m_len; cm->cmsg_level = SOL_SOCKET; cm->cmsg_type = SCM_RIGHTS; } #endif } else { control = NULL; } error = kern_sendit(td, s, mp, flags, control, UIO_USERSPACE); bad: free(to, M_SONAME); return (error); } int kern_sendit(td, s, mp, flags, control, segflg) struct thread *td; int s; struct msghdr *mp; int flags; struct mbuf *control; enum uio_seg segflg; { struct file *fp; struct uio auio; struct iovec *iov; struct socket *so; cap_rights_t rights; #ifdef KTRACE struct uio *ktruio = NULL; #endif ssize_t len; int i, error; AUDIT_ARG_FD(s); cap_rights_init(&rights, CAP_SEND); if (mp->msg_name != NULL) { AUDIT_ARG_SOCKADDR(td, AT_FDCWD, mp->msg_name); cap_rights_set(&rights, CAP_CONNECT); } - error = getsock_cap(td, s, &rights, &fp, NULL); + error = getsock_cap(td, s, &rights, &fp, NULL, NULL); if (error != 0) { m_freem(control); return (error); } so = (struct socket *)fp->f_data; #ifdef KTRACE if (mp->msg_name != NULL && KTRPOINT(td, KTR_STRUCT)) ktrsockaddr(mp->msg_name); #endif #ifdef MAC if (mp->msg_name != NULL) { error = mac_socket_check_connect(td->td_ucred, so, mp->msg_name); if (error != 0) { m_freem(control); goto bad; } } error = mac_socket_check_send(td->td_ucred, so); if (error != 0) { m_freem(control); goto bad; } #endif auio.uio_iov = mp->msg_iov; auio.uio_iovcnt = mp->msg_iovlen; auio.uio_segflg = segflg; auio.uio_rw = UIO_WRITE; auio.uio_td = td; auio.uio_offset = 0; /* XXX */ auio.uio_resid = 0; iov = mp->msg_iov; for (i = 0; i < mp->msg_iovlen; i++, iov++) { if ((auio.uio_resid += iov->iov_len) < 0) { error = EINVAL; m_freem(control); goto bad; } } #ifdef KTRACE if (KTRPOINT(td, KTR_GENIO)) ktruio = cloneuio(&auio); #endif len = auio.uio_resid; error = sosend(so, mp->msg_name, &auio, 0, control, flags, td); if (error != 0) { if (auio.uio_resid != len && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) error = 0; /* Generation of SIGPIPE can be controlled per socket */ if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE) && !(flags & MSG_NOSIGNAL)) { PROC_LOCK(td->td_proc); tdsignal(td, SIGPIPE); PROC_UNLOCK(td->td_proc); } } if (error == 0) td->td_retval[0] = len - auio.uio_resid; #ifdef KTRACE if (ktruio != NULL) { ktruio->uio_resid = td->td_retval[0]; ktrgenio(s, UIO_WRITE, ktruio, error); } #endif bad: fdrop(fp, td); return (error); } int sys_sendto(td, uap) struct thread *td; struct sendto_args /* { int s; caddr_t buf; size_t len; int flags; caddr_t to; int tolen; } */ *uap; { struct msghdr msg; struct iovec aiov; msg.msg_name = uap->to; msg.msg_namelen = uap->tolen; msg.msg_iov = &aiov; msg.msg_iovlen = 1; msg.msg_control = 0; #ifdef COMPAT_OLDSOCK msg.msg_flags = 0; #endif aiov.iov_base = uap->buf; aiov.iov_len = uap->len; return (sendit(td, uap->s, &msg, uap->flags)); } #ifdef COMPAT_OLDSOCK int osend(td, uap) struct thread *td; struct osend_args /* { int s; caddr_t buf; int len; int flags; } */ *uap; { struct msghdr msg; struct iovec aiov; msg.msg_name = 0; msg.msg_namelen = 0; msg.msg_iov = &aiov; msg.msg_iovlen = 1; aiov.iov_base = uap->buf; aiov.iov_len = uap->len; msg.msg_control = 0; msg.msg_flags = 0; return (sendit(td, uap->s, &msg, uap->flags)); } int osendmsg(td, uap) struct thread *td; struct osendmsg_args /* { int s; caddr_t msg; int flags; } */ *uap; { struct msghdr msg; struct iovec *iov; int error; error = copyin(uap->msg, &msg, sizeof (struct omsghdr)); if (error != 0) return (error); error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE); if (error != 0) return (error); msg.msg_iov = iov; msg.msg_flags = MSG_COMPAT; error = sendit(td, uap->s, &msg, uap->flags); free(iov, M_IOV); return (error); } #endif int sys_sendmsg(td, uap) struct thread *td; struct sendmsg_args /* { int s; caddr_t msg; int flags; } */ *uap; { struct msghdr msg; struct iovec *iov; int error; error = copyin(uap->msg, &msg, sizeof (msg)); if (error != 0) return (error); error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE); if (error != 0) return (error); msg.msg_iov = iov; #ifdef COMPAT_OLDSOCK msg.msg_flags = 0; #endif error = sendit(td, uap->s, &msg, uap->flags); free(iov, M_IOV); return (error); } int kern_recvit(td, s, mp, fromseg, controlp) struct thread *td; int s; struct msghdr *mp; enum uio_seg fromseg; struct mbuf **controlp; { struct uio auio; struct iovec *iov; struct mbuf *m, *control = NULL; caddr_t ctlbuf; struct file *fp; struct socket *so; struct sockaddr *fromsa = NULL; cap_rights_t rights; #ifdef KTRACE struct uio *ktruio = NULL; #endif ssize_t len; int error, i; if (controlp != NULL) *controlp = NULL; AUDIT_ARG_FD(s); error = getsock_cap(td, s, cap_rights_init(&rights, CAP_RECV), - &fp, NULL); + &fp, NULL, NULL); if (error != 0) return (error); so = fp->f_data; #ifdef MAC error = mac_socket_check_receive(td->td_ucred, so); if (error != 0) { fdrop(fp, td); return (error); } #endif auio.uio_iov = mp->msg_iov; auio.uio_iovcnt = mp->msg_iovlen; auio.uio_segflg = UIO_USERSPACE; auio.uio_rw = UIO_READ; auio.uio_td = td; auio.uio_offset = 0; /* XXX */ auio.uio_resid = 0; iov = mp->msg_iov; for (i = 0; i < mp->msg_iovlen; i++, iov++) { if ((auio.uio_resid += iov->iov_len) < 0) { fdrop(fp, td); return (EINVAL); } } #ifdef KTRACE if (KTRPOINT(td, KTR_GENIO)) ktruio = cloneuio(&auio); #endif len = auio.uio_resid; error = soreceive(so, &fromsa, &auio, NULL, (mp->msg_control || controlp) ? &control : NULL, &mp->msg_flags); if (error != 0) { if (auio.uio_resid != len && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) error = 0; } if (fromsa != NULL) AUDIT_ARG_SOCKADDR(td, AT_FDCWD, fromsa); #ifdef KTRACE if (ktruio != NULL) { ktruio->uio_resid = len - auio.uio_resid; ktrgenio(s, UIO_READ, ktruio, error); } #endif if (error != 0) goto out; td->td_retval[0] = len - auio.uio_resid; if (mp->msg_name) { len = mp->msg_namelen; if (len <= 0 || fromsa == NULL) len = 0; else { /* save sa_len before it is destroyed by MSG_COMPAT */ len = MIN(len, fromsa->sa_len); #ifdef COMPAT_OLDSOCK if (mp->msg_flags & MSG_COMPAT) ((struct osockaddr *)fromsa)->sa_family = fromsa->sa_family; #endif if (fromseg == UIO_USERSPACE) { error = copyout(fromsa, mp->msg_name, (unsigned)len); if (error != 0) goto out; } else bcopy(fromsa, mp->msg_name, len); } mp->msg_namelen = len; } if (mp->msg_control && controlp == NULL) { #ifdef COMPAT_OLDSOCK /* * We assume that old recvmsg calls won't receive access * rights and other control info, esp. as control info * is always optional and those options didn't exist in 4.3. * If we receive rights, trim the cmsghdr; anything else * is tossed. */ if (control && mp->msg_flags & MSG_COMPAT) { if (mtod(control, struct cmsghdr *)->cmsg_level != SOL_SOCKET || mtod(control, struct cmsghdr *)->cmsg_type != SCM_RIGHTS) { mp->msg_controllen = 0; goto out; } control->m_len -= sizeof (struct cmsghdr); control->m_data += sizeof (struct cmsghdr); } #endif len = mp->msg_controllen; m = control; mp->msg_controllen = 0; ctlbuf = mp->msg_control; while (m && len > 0) { unsigned int tocopy; if (len >= m->m_len) tocopy = m->m_len; else { mp->msg_flags |= MSG_CTRUNC; tocopy = len; } if ((error = copyout(mtod(m, caddr_t), ctlbuf, tocopy)) != 0) goto out; ctlbuf += tocopy; len -= tocopy; m = m->m_next; } mp->msg_controllen = ctlbuf - (caddr_t)mp->msg_control; } out: fdrop(fp, td); #ifdef KTRACE if (fromsa && KTRPOINT(td, KTR_STRUCT)) ktrsockaddr(fromsa); #endif free(fromsa, M_SONAME); if (error == 0 && controlp != NULL) *controlp = control; else if (control) m_freem(control); return (error); } static int recvit(td, s, mp, namelenp) struct thread *td; int s; struct msghdr *mp; void *namelenp; { int error; error = kern_recvit(td, s, mp, UIO_USERSPACE, NULL); if (error != 0) return (error); if (namelenp != NULL) { error = copyout(&mp->msg_namelen, namelenp, sizeof (socklen_t)); #ifdef COMPAT_OLDSOCK if (mp->msg_flags & MSG_COMPAT) error = 0; /* old recvfrom didn't check */ #endif } return (error); } int sys_recvfrom(td, uap) struct thread *td; struct recvfrom_args /* { int s; caddr_t buf; size_t len; int flags; struct sockaddr * __restrict from; socklen_t * __restrict fromlenaddr; } */ *uap; { struct msghdr msg; struct iovec aiov; int error; if (uap->fromlenaddr) { error = copyin(uap->fromlenaddr, &msg.msg_namelen, sizeof (msg.msg_namelen)); if (error != 0) goto done2; } else { msg.msg_namelen = 0; } msg.msg_name = uap->from; msg.msg_iov = &aiov; msg.msg_iovlen = 1; aiov.iov_base = uap->buf; aiov.iov_len = uap->len; msg.msg_control = 0; msg.msg_flags = uap->flags; error = recvit(td, uap->s, &msg, uap->fromlenaddr); done2: return (error); } #ifdef COMPAT_OLDSOCK int orecvfrom(td, uap) struct thread *td; struct recvfrom_args *uap; { uap->flags |= MSG_COMPAT; return (sys_recvfrom(td, uap)); } #endif #ifdef COMPAT_OLDSOCK int orecv(td, uap) struct thread *td; struct orecv_args /* { int s; caddr_t buf; int len; int flags; } */ *uap; { struct msghdr msg; struct iovec aiov; msg.msg_name = 0; msg.msg_namelen = 0; msg.msg_iov = &aiov; msg.msg_iovlen = 1; aiov.iov_base = uap->buf; aiov.iov_len = uap->len; msg.msg_control = 0; msg.msg_flags = uap->flags; return (recvit(td, uap->s, &msg, NULL)); } /* * Old recvmsg. This code takes advantage of the fact that the old msghdr * overlays the new one, missing only the flags, and with the (old) access * rights where the control fields are now. */ int orecvmsg(td, uap) struct thread *td; struct orecvmsg_args /* { int s; struct omsghdr *msg; int flags; } */ *uap; { struct msghdr msg; struct iovec *iov; int error; error = copyin(uap->msg, &msg, sizeof (struct omsghdr)); if (error != 0) return (error); error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE); if (error != 0) return (error); msg.msg_flags = uap->flags | MSG_COMPAT; msg.msg_iov = iov; error = recvit(td, uap->s, &msg, &uap->msg->msg_namelen); if (msg.msg_controllen && error == 0) error = copyout(&msg.msg_controllen, &uap->msg->msg_accrightslen, sizeof (int)); free(iov, M_IOV); return (error); } #endif int sys_recvmsg(td, uap) struct thread *td; struct recvmsg_args /* { int s; struct msghdr *msg; int flags; } */ *uap; { struct msghdr msg; struct iovec *uiov, *iov; int error; error = copyin(uap->msg, &msg, sizeof (msg)); if (error != 0) return (error); error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE); if (error != 0) return (error); msg.msg_flags = uap->flags; #ifdef COMPAT_OLDSOCK msg.msg_flags &= ~MSG_COMPAT; #endif uiov = msg.msg_iov; msg.msg_iov = iov; error = recvit(td, uap->s, &msg, NULL); if (error == 0) { msg.msg_iov = uiov; error = copyout(&msg, uap->msg, sizeof(msg)); } free(iov, M_IOV); return (error); } /* ARGSUSED */ int sys_shutdown(td, uap) struct thread *td; struct shutdown_args /* { int s; int how; } */ *uap; { struct socket *so; struct file *fp; cap_rights_t rights; int error; AUDIT_ARG_FD(uap->s); error = getsock_cap(td, uap->s, cap_rights_init(&rights, CAP_SHUTDOWN), - &fp, NULL); + &fp, NULL, NULL); if (error == 0) { so = fp->f_data; error = soshutdown(so, uap->how); /* * Previous versions did not return ENOTCONN, but 0 in * case the socket was not connected. Some important * programs like syslogd up to r279016, 2015-02-19, * still depend on this behavior. */ if (error == ENOTCONN && td->td_proc->p_osrel < P_OSREL_SHUTDOWN_ENOTCONN) error = 0; fdrop(fp, td); } return (error); } /* ARGSUSED */ int sys_setsockopt(td, uap) struct thread *td; struct setsockopt_args /* { int s; int level; int name; caddr_t val; int valsize; } */ *uap; { return (kern_setsockopt(td, uap->s, uap->level, uap->name, uap->val, UIO_USERSPACE, uap->valsize)); } int kern_setsockopt(td, s, level, name, val, valseg, valsize) struct thread *td; int s; int level; int name; void *val; enum uio_seg valseg; socklen_t valsize; { struct socket *so; struct file *fp; struct sockopt sopt; cap_rights_t rights; int error; if (val == NULL && valsize != 0) return (EFAULT); if ((int)valsize < 0) return (EINVAL); sopt.sopt_dir = SOPT_SET; sopt.sopt_level = level; sopt.sopt_name = name; sopt.sopt_val = val; sopt.sopt_valsize = valsize; switch (valseg) { case UIO_USERSPACE: sopt.sopt_td = td; break; case UIO_SYSSPACE: sopt.sopt_td = NULL; break; default: panic("kern_setsockopt called with bad valseg"); } AUDIT_ARG_FD(s); error = getsock_cap(td, s, cap_rights_init(&rights, CAP_SETSOCKOPT), - &fp, NULL); + &fp, NULL, NULL); if (error == 0) { so = fp->f_data; error = sosetopt(so, &sopt); fdrop(fp, td); } return(error); } /* ARGSUSED */ int sys_getsockopt(td, uap) struct thread *td; struct getsockopt_args /* { int s; int level; int name; void * __restrict val; socklen_t * __restrict avalsize; } */ *uap; { socklen_t valsize; int error; if (uap->val) { error = copyin(uap->avalsize, &valsize, sizeof (valsize)); if (error != 0) return (error); } error = kern_getsockopt(td, uap->s, uap->level, uap->name, uap->val, UIO_USERSPACE, &valsize); if (error == 0) error = copyout(&valsize, uap->avalsize, sizeof (valsize)); return (error); } /* * Kernel version of getsockopt. * optval can be a userland or userspace. optlen is always a kernel pointer. */ int kern_getsockopt(td, s, level, name, val, valseg, valsize) struct thread *td; int s; int level; int name; void *val; enum uio_seg valseg; socklen_t *valsize; { struct socket *so; struct file *fp; struct sockopt sopt; cap_rights_t rights; int error; if (val == NULL) *valsize = 0; if ((int)*valsize < 0) return (EINVAL); sopt.sopt_dir = SOPT_GET; sopt.sopt_level = level; sopt.sopt_name = name; sopt.sopt_val = val; sopt.sopt_valsize = (size_t)*valsize; /* checked non-negative above */ switch (valseg) { case UIO_USERSPACE: sopt.sopt_td = td; break; case UIO_SYSSPACE: sopt.sopt_td = NULL; break; default: panic("kern_getsockopt called with bad valseg"); } AUDIT_ARG_FD(s); error = getsock_cap(td, s, cap_rights_init(&rights, CAP_GETSOCKOPT), - &fp, NULL); + &fp, NULL, NULL); if (error == 0) { so = fp->f_data; error = sogetopt(so, &sopt); *valsize = sopt.sopt_valsize; fdrop(fp, td); } return (error); } /* * getsockname1() - Get socket name. */ /* ARGSUSED */ static int getsockname1(td, uap, compat) struct thread *td; struct getsockname_args /* { int fdes; struct sockaddr * __restrict asa; socklen_t * __restrict alen; } */ *uap; int compat; { struct sockaddr *sa; socklen_t len; int error; error = copyin(uap->alen, &len, sizeof(len)); if (error != 0) return (error); error = kern_getsockname(td, uap->fdes, &sa, &len); if (error != 0) return (error); if (len != 0) { #ifdef COMPAT_OLDSOCK if (compat) ((struct osockaddr *)sa)->sa_family = sa->sa_family; #endif error = copyout(sa, uap->asa, (u_int)len); } free(sa, M_SONAME); if (error == 0) error = copyout(&len, uap->alen, sizeof(len)); return (error); } int kern_getsockname(struct thread *td, int fd, struct sockaddr **sa, socklen_t *alen) { struct socket *so; struct file *fp; cap_rights_t rights; socklen_t len; int error; AUDIT_ARG_FD(fd); error = getsock_cap(td, fd, cap_rights_init(&rights, CAP_GETSOCKNAME), - &fp, NULL); + &fp, NULL, NULL); if (error != 0) return (error); so = fp->f_data; *sa = NULL; CURVNET_SET(so->so_vnet); error = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, sa); CURVNET_RESTORE(); if (error != 0) goto bad; if (*sa == NULL) len = 0; else len = MIN(*alen, (*sa)->sa_len); *alen = len; #ifdef KTRACE if (KTRPOINT(td, KTR_STRUCT)) ktrsockaddr(*sa); #endif bad: fdrop(fp, td); if (error != 0 && *sa != NULL) { free(*sa, M_SONAME); *sa = NULL; } return (error); } int sys_getsockname(td, uap) struct thread *td; struct getsockname_args *uap; { return (getsockname1(td, uap, 0)); } #ifdef COMPAT_OLDSOCK int ogetsockname(td, uap) struct thread *td; struct getsockname_args *uap; { return (getsockname1(td, uap, 1)); } #endif /* COMPAT_OLDSOCK */ /* * getpeername1() - Get name of peer for connected socket. */ /* ARGSUSED */ static int getpeername1(td, uap, compat) struct thread *td; struct getpeername_args /* { int fdes; struct sockaddr * __restrict asa; socklen_t * __restrict alen; } */ *uap; int compat; { struct sockaddr *sa; socklen_t len; int error; error = copyin(uap->alen, &len, sizeof (len)); if (error != 0) return (error); error = kern_getpeername(td, uap->fdes, &sa, &len); if (error != 0) return (error); if (len != 0) { #ifdef COMPAT_OLDSOCK if (compat) ((struct osockaddr *)sa)->sa_family = sa->sa_family; #endif error = copyout(sa, uap->asa, (u_int)len); } free(sa, M_SONAME); if (error == 0) error = copyout(&len, uap->alen, sizeof(len)); return (error); } int kern_getpeername(struct thread *td, int fd, struct sockaddr **sa, socklen_t *alen) { struct socket *so; struct file *fp; cap_rights_t rights; socklen_t len; int error; AUDIT_ARG_FD(fd); error = getsock_cap(td, fd, cap_rights_init(&rights, CAP_GETPEERNAME), - &fp, NULL); + &fp, NULL, NULL); if (error != 0) return (error); so = fp->f_data; if ((so->so_state & (SS_ISCONNECTED|SS_ISCONFIRMING)) == 0) { error = ENOTCONN; goto done; } *sa = NULL; CURVNET_SET(so->so_vnet); error = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so, sa); CURVNET_RESTORE(); if (error != 0) goto bad; if (*sa == NULL) len = 0; else len = MIN(*alen, (*sa)->sa_len); *alen = len; #ifdef KTRACE if (KTRPOINT(td, KTR_STRUCT)) ktrsockaddr(*sa); #endif bad: if (error != 0 && *sa != NULL) { free(*sa, M_SONAME); *sa = NULL; } done: fdrop(fp, td); return (error); } int sys_getpeername(td, uap) struct thread *td; struct getpeername_args *uap; { return (getpeername1(td, uap, 0)); } #ifdef COMPAT_OLDSOCK int ogetpeername(td, uap) struct thread *td; struct ogetpeername_args *uap; { /* XXX uap should have type `getpeername_args *' to begin with. */ return (getpeername1(td, (struct getpeername_args *)uap, 1)); } #endif /* COMPAT_OLDSOCK */ static int sockargs(struct mbuf **mp, char *buf, socklen_t buflen, int type) { struct sockaddr *sa; struct mbuf *m; int error; if (buflen > MLEN) { #ifdef COMPAT_OLDSOCK if (type == MT_SONAME && buflen <= 112) buflen = MLEN; /* unix domain compat. hack */ else #endif if (buflen > MCLBYTES) return (EINVAL); } m = m_get2(buflen, M_WAITOK, type, 0); m->m_len = buflen; error = copyin(buf, mtod(m, void *), buflen); if (error != 0) (void) m_free(m); else { *mp = m; if (type == MT_SONAME) { sa = mtod(m, struct sockaddr *); #if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN if (sa->sa_family == 0 && sa->sa_len < AF_MAX) sa->sa_family = sa->sa_len; #endif sa->sa_len = buflen; } } return (error); } int getsockaddr(namp, uaddr, len) struct sockaddr **namp; caddr_t uaddr; size_t len; { struct sockaddr *sa; int error; if (len > SOCK_MAXADDRLEN) return (ENAMETOOLONG); if (len < offsetof(struct sockaddr, sa_data[0])) return (EINVAL); sa = malloc(len, M_SONAME, M_WAITOK); error = copyin(uaddr, sa, len); if (error != 0) { free(sa, M_SONAME); } else { #if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN if (sa->sa_family == 0 && sa->sa_len < AF_MAX) sa->sa_family = sa->sa_len; #endif sa->sa_len = len; *namp = sa; } return (error); } Index: stable/11/sys/netinet/sctp_syscalls.c =================================================================== --- stable/11/sys/netinet/sctp_syscalls.c (revision 315311) +++ stable/11/sys/netinet/sctp_syscalls.c (revision 315312) @@ -1,596 +1,597 @@ /*- * Copyright (c) 1982, 1986, 1989, 1990, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ #include __FBSDID("$FreeBSD$"); #include "opt_capsicum.h" #include "opt_inet.h" #include "opt_inet6.h" #include "opt_sctp.h" #include "opt_compat.h" #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #ifdef COMPAT_FREEBSD32 #include #endif #include #include #include #include #include static struct syscall_helper_data sctp_syscalls[] = { SYSCALL_INIT_HELPER(sctp_peeloff), SYSCALL_INIT_HELPER(sctp_generic_sendmsg), SYSCALL_INIT_HELPER(sctp_generic_sendmsg_iov), SYSCALL_INIT_HELPER(sctp_generic_recvmsg), SYSCALL_INIT_LAST }; static void sctp_syscalls_init(void *unused __unused) { int error; error = syscall_helper_register(sctp_syscalls, SY_THR_STATIC); KASSERT((error == 0), ("%s: syscall_helper_register failed for sctp syscalls", __func__)); #ifdef COMPAT_FREEBSD32 error = syscall32_helper_register(sctp_syscalls, SY_THR_STATIC); KASSERT((error == 0), ("%s: syscall32_helper_register failed for sctp syscalls", __func__)); #endif } SYSINIT(sctp_syscalls, SI_SUB_SYSCALLS, SI_ORDER_ANY, sctp_syscalls_init, NULL); /* * SCTP syscalls. * Functionality only compiled in if SCTP is defined in the kernel Makefile, * otherwise all return EOPNOTSUPP. * XXX: We should make this loadable one day. */ int sys_sctp_peeloff(td, uap) struct thread *td; struct sctp_peeloff_args /* { int sd; caddr_t name; } */ *uap; { #if (defined(INET) || defined(INET6)) && defined(SCTP) - struct file *nfp = NULL; + struct file *headfp, *nfp = NULL; struct socket *head, *so; cap_rights_t rights; u_int fflag; int error, fd; AUDIT_ARG_FD(uap->sd); - error = fgetsock(td, uap->sd, cap_rights_init(&rights, CAP_PEELOFF), - &head, &fflag); + error = getsock_cap(td, uap->sd, cap_rights_init(&rights, CAP_PEELOFF), + &headfp, &fflag, NULL); if (error != 0) goto done2; + head = headfp->f_data; if (head->so_proto->pr_protocol != IPPROTO_SCTP) { error = EOPNOTSUPP; goto done; } error = sctp_can_peel_off(head, (sctp_assoc_t)uap->name); if (error != 0) goto done; /* * At this point we know we do have a assoc to pull * we proceed to get the fd setup. This may block * but that is ok. */ error = falloc(td, &nfp, &fd, 0); if (error != 0) goto done; td->td_retval[0] = fd; CURVNET_SET(head->so_vnet); so = sonewconn(head, SS_ISCONNECTED); if (so == NULL) { error = ENOMEM; goto noconnection; } /* * Before changing the flags on the socket, we have to bump the * reference count. Otherwise, if the protocol calls sofree(), * the socket will be released due to a zero refcount. */ SOCK_LOCK(so); soref(so); /* file descriptor reference */ SOCK_UNLOCK(so); ACCEPT_LOCK(); TAILQ_REMOVE(&head->so_comp, so, so_list); head->so_qlen--; so->so_state |= (head->so_state & SS_NBIO); so->so_state &= ~SS_NOFDREF; so->so_qstate &= ~SQ_COMP; so->so_head = NULL; ACCEPT_UNLOCK(); finit(nfp, fflag, DTYPE_SOCKET, so, &socketops); error = sctp_do_peeloff(head, so, (sctp_assoc_t)uap->name); if (error != 0) goto noconnection; if (head->so_sigio != NULL) fsetown(fgetown(&head->so_sigio), &so->so_sigio); noconnection: /* * close the new descriptor, assuming someone hasn't ripped it * out from under us. */ if (error != 0) fdclose(td, nfp, fd); /* * Release explicitly held references before returning. */ CURVNET_RESTORE(); done: if (nfp != NULL) fdrop(nfp, td); - fputsock(head); + fdrop(headfp, td); done2: return (error); #else /* SCTP */ return (EOPNOTSUPP); #endif /* SCTP */ } int sys_sctp_generic_sendmsg (td, uap) struct thread *td; struct sctp_generic_sendmsg_args /* { int sd, caddr_t msg, int mlen, caddr_t to, __socklen_t tolen, struct sctp_sndrcvinfo *sinfo, int flags } */ *uap; { #if (defined(INET) || defined(INET6)) && defined(SCTP) struct sctp_sndrcvinfo sinfo, *u_sinfo = NULL; struct socket *so; struct file *fp = NULL; struct sockaddr *to = NULL; #ifdef KTRACE struct uio *ktruio = NULL; #endif struct uio auio; struct iovec iov[1]; cap_rights_t rights; int error = 0, len; if (uap->sinfo != NULL) { error = copyin(uap->sinfo, &sinfo, sizeof (sinfo)); if (error != 0) return (error); u_sinfo = &sinfo; } cap_rights_init(&rights, CAP_SEND); if (uap->tolen != 0) { error = getsockaddr(&to, uap->to, uap->tolen); if (error != 0) { to = NULL; goto sctp_bad2; } cap_rights_set(&rights, CAP_CONNECT); } AUDIT_ARG_FD(uap->sd); - error = getsock_cap(td, uap->sd, &rights, &fp, NULL); + error = getsock_cap(td, uap->sd, &rights, &fp, NULL, NULL); if (error != 0) goto sctp_bad; #ifdef KTRACE if (to && (KTRPOINT(td, KTR_STRUCT))) ktrsockaddr(to); #endif iov[0].iov_base = uap->msg; iov[0].iov_len = uap->mlen; so = (struct socket *)fp->f_data; if (so->so_proto->pr_protocol != IPPROTO_SCTP) { error = EOPNOTSUPP; goto sctp_bad; } #ifdef MAC error = mac_socket_check_send(td->td_ucred, so); if (error != 0) goto sctp_bad; #endif /* MAC */ auio.uio_iov = iov; auio.uio_iovcnt = 1; auio.uio_segflg = UIO_USERSPACE; auio.uio_rw = UIO_WRITE; auio.uio_td = td; auio.uio_offset = 0; /* XXX */ auio.uio_resid = 0; #ifdef KTRACE if (KTRPOINT(td, KTR_GENIO)) ktruio = cloneuio(&auio); #endif /* KTRACE */ len = auio.uio_resid = uap->mlen; CURVNET_SET(so->so_vnet); error = sctp_lower_sosend(so, to, &auio, (struct mbuf *)NULL, (struct mbuf *)NULL, uap->flags, u_sinfo, td); CURVNET_RESTORE(); if (error != 0) { if (auio.uio_resid != len && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) error = 0; /* Generation of SIGPIPE can be controlled per socket. */ if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE) && !(uap->flags & MSG_NOSIGNAL)) { PROC_LOCK(td->td_proc); tdsignal(td, SIGPIPE); PROC_UNLOCK(td->td_proc); } } if (error == 0) td->td_retval[0] = len - auio.uio_resid; #ifdef KTRACE if (ktruio != NULL) { ktruio->uio_resid = td->td_retval[0]; ktrgenio(uap->sd, UIO_WRITE, ktruio, error); } #endif /* KTRACE */ sctp_bad: if (fp != NULL) fdrop(fp, td); sctp_bad2: free(to, M_SONAME); return (error); #else /* SCTP */ return (EOPNOTSUPP); #endif /* SCTP */ } int sys_sctp_generic_sendmsg_iov(td, uap) struct thread *td; struct sctp_generic_sendmsg_iov_args /* { int sd, struct iovec *iov, int iovlen, caddr_t to, __socklen_t tolen, struct sctp_sndrcvinfo *sinfo, int flags } */ *uap; { #if (defined(INET) || defined(INET6)) && defined(SCTP) struct sctp_sndrcvinfo sinfo, *u_sinfo = NULL; struct socket *so; struct file *fp = NULL; struct sockaddr *to = NULL; #ifdef KTRACE struct uio *ktruio = NULL; #endif struct uio auio; struct iovec *iov, *tiov; cap_rights_t rights; ssize_t len; int error, i; if (uap->sinfo != NULL) { error = copyin(uap->sinfo, &sinfo, sizeof (sinfo)); if (error != 0) return (error); u_sinfo = &sinfo; } cap_rights_init(&rights, CAP_SEND); if (uap->tolen != 0) { error = getsockaddr(&to, uap->to, uap->tolen); if (error != 0) { to = NULL; goto sctp_bad2; } cap_rights_set(&rights, CAP_CONNECT); } AUDIT_ARG_FD(uap->sd); - error = getsock_cap(td, uap->sd, &rights, &fp, NULL); + error = getsock_cap(td, uap->sd, &rights, &fp, NULL, NULL); if (error != 0) goto sctp_bad1; #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) error = freebsd32_copyiniov((struct iovec32 *)uap->iov, uap->iovlen, &iov, EMSGSIZE); else #endif error = copyiniov(uap->iov, uap->iovlen, &iov, EMSGSIZE); if (error != 0) goto sctp_bad1; #ifdef KTRACE if (to && (KTRPOINT(td, KTR_STRUCT))) ktrsockaddr(to); #endif so = (struct socket *)fp->f_data; if (so->so_proto->pr_protocol != IPPROTO_SCTP) { error = EOPNOTSUPP; goto sctp_bad; } #ifdef MAC error = mac_socket_check_send(td->td_ucred, so); if (error != 0) goto sctp_bad; #endif /* MAC */ auio.uio_iov = iov; auio.uio_iovcnt = uap->iovlen; auio.uio_segflg = UIO_USERSPACE; auio.uio_rw = UIO_WRITE; auio.uio_td = td; auio.uio_offset = 0; /* XXX */ auio.uio_resid = 0; tiov = iov; for (i = 0; i iovlen; i++, tiov++) { if ((auio.uio_resid += tiov->iov_len) < 0) { error = EINVAL; goto sctp_bad; } } #ifdef KTRACE if (KTRPOINT(td, KTR_GENIO)) ktruio = cloneuio(&auio); #endif /* KTRACE */ len = auio.uio_resid; CURVNET_SET(so->so_vnet); error = sctp_lower_sosend(so, to, &auio, (struct mbuf *)NULL, (struct mbuf *)NULL, uap->flags, u_sinfo, td); CURVNET_RESTORE(); if (error != 0) { if (auio.uio_resid != len && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) error = 0; /* Generation of SIGPIPE can be controlled per socket */ if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE) && !(uap->flags & MSG_NOSIGNAL)) { PROC_LOCK(td->td_proc); tdsignal(td, SIGPIPE); PROC_UNLOCK(td->td_proc); } } if (error == 0) td->td_retval[0] = len - auio.uio_resid; #ifdef KTRACE if (ktruio != NULL) { ktruio->uio_resid = td->td_retval[0]; ktrgenio(uap->sd, UIO_WRITE, ktruio, error); } #endif /* KTRACE */ sctp_bad: free(iov, M_IOV); sctp_bad1: if (fp != NULL) fdrop(fp, td); sctp_bad2: free(to, M_SONAME); return (error); #else /* SCTP */ return (EOPNOTSUPP); #endif /* SCTP */ } int sys_sctp_generic_recvmsg(td, uap) struct thread *td; struct sctp_generic_recvmsg_args /* { int sd, struct iovec *iov, int iovlen, struct sockaddr *from, __socklen_t *fromlenaddr, struct sctp_sndrcvinfo *sinfo, int *msg_flags } */ *uap; { #if (defined(INET) || defined(INET6)) && defined(SCTP) uint8_t sockbufstore[256]; struct uio auio; struct iovec *iov, *tiov; struct sctp_sndrcvinfo sinfo; struct socket *so; struct file *fp = NULL; struct sockaddr *fromsa; cap_rights_t rights; #ifdef KTRACE struct uio *ktruio = NULL; #endif ssize_t len; int error, fromlen, i, msg_flags; AUDIT_ARG_FD(uap->sd); error = getsock_cap(td, uap->sd, cap_rights_init(&rights, CAP_RECV), - &fp, NULL); + &fp, NULL, NULL); if (error != 0) return (error); #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) error = freebsd32_copyiniov((struct iovec32 *)uap->iov, uap->iovlen, &iov, EMSGSIZE); else #endif error = copyiniov(uap->iov, uap->iovlen, &iov, EMSGSIZE); if (error != 0) goto out1; so = fp->f_data; if (so->so_proto->pr_protocol != IPPROTO_SCTP) { error = EOPNOTSUPP; goto out; } #ifdef MAC error = mac_socket_check_receive(td->td_ucred, so); if (error != 0) goto out; #endif /* MAC */ if (uap->fromlenaddr != NULL) { error = copyin(uap->fromlenaddr, &fromlen, sizeof (fromlen)); if (error != 0) goto out; } else { fromlen = 0; } if (uap->msg_flags) { error = copyin(uap->msg_flags, &msg_flags, sizeof (int)); if (error != 0) goto out; } else { msg_flags = 0; } auio.uio_iov = iov; auio.uio_iovcnt = uap->iovlen; auio.uio_segflg = UIO_USERSPACE; auio.uio_rw = UIO_READ; auio.uio_td = td; auio.uio_offset = 0; /* XXX */ auio.uio_resid = 0; tiov = iov; for (i = 0; i iovlen; i++, tiov++) { if ((auio.uio_resid += tiov->iov_len) < 0) { error = EINVAL; goto out; } } len = auio.uio_resid; fromsa = (struct sockaddr *)sockbufstore; #ifdef KTRACE if (KTRPOINT(td, KTR_GENIO)) ktruio = cloneuio(&auio); #endif /* KTRACE */ memset(&sinfo, 0, sizeof(struct sctp_sndrcvinfo)); CURVNET_SET(so->so_vnet); error = sctp_sorecvmsg(so, &auio, (struct mbuf **)NULL, fromsa, fromlen, &msg_flags, (struct sctp_sndrcvinfo *)&sinfo, 1); CURVNET_RESTORE(); if (error != 0) { if (auio.uio_resid != len && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) error = 0; } else { if (uap->sinfo) error = copyout(&sinfo, uap->sinfo, sizeof (sinfo)); } #ifdef KTRACE if (ktruio != NULL) { ktruio->uio_resid = len - auio.uio_resid; ktrgenio(uap->sd, UIO_READ, ktruio, error); } #endif /* KTRACE */ if (error != 0) goto out; td->td_retval[0] = len - auio.uio_resid; if (fromlen && uap->from) { len = fromlen; if (len <= 0 || fromsa == NULL) len = 0; else { len = MIN(len, fromsa->sa_len); error = copyout(fromsa, uap->from, (size_t)len); if (error != 0) goto out; } error = copyout(&len, uap->fromlenaddr, sizeof (socklen_t)); if (error != 0) goto out; } #ifdef KTRACE if (KTRPOINT(td, KTR_STRUCT)) ktrsockaddr(fromsa); #endif if (uap->msg_flags) { error = copyout(&msg_flags, uap->msg_flags, sizeof (int)); if (error != 0) goto out; } out: free(iov, M_IOV); out1: if (fp != NULL) fdrop(fp, td); return (error); #else /* SCTP */ return (EOPNOTSUPP); #endif /* SCTP */ } Index: stable/11/sys/sys/file.h =================================================================== --- stable/11/sys/sys/file.h (revision 315311) +++ stable/11/sys/sys/file.h (revision 315312) @@ -1,422 +1,416 @@ /*- * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)file.h 8.3 (Berkeley) 1/9/95 * $FreeBSD$ */ #ifndef _SYS_FILE_H_ #define _SYS_FILE_H_ #ifndef _KERNEL #include /* XXX */ #include #include #else #include #include #include #include #include struct filedesc; struct stat; struct thread; struct uio; struct knote; struct vnode; -struct socket; - #endif /* _KERNEL */ #define DTYPE_NONE 0 /* not yet initialized */ #define DTYPE_VNODE 1 /* file */ #define DTYPE_SOCKET 2 /* communications endpoint */ #define DTYPE_PIPE 3 /* pipe */ #define DTYPE_FIFO 4 /* fifo (named pipe) */ #define DTYPE_KQUEUE 5 /* event queue */ #define DTYPE_CRYPTO 6 /* crypto */ #define DTYPE_MQUEUE 7 /* posix message queue */ #define DTYPE_SHM 8 /* swap-backed shared memory */ #define DTYPE_SEM 9 /* posix semaphore */ #define DTYPE_PTS 10 /* pseudo teletype master device */ #define DTYPE_DEV 11 /* Device specific fd type */ #define DTYPE_PROCDESC 12 /* process descriptor */ #define DTYPE_LINUXEFD 13 /* emulation eventfd type */ #ifdef _KERNEL struct file; struct filecaps; struct kaiocb; struct kinfo_file; struct ucred; #define FOF_OFFSET 0x01 /* Use the offset in uio argument */ #define FOF_NOLOCK 0x02 /* Do not take FOFFSET_LOCK */ #define FOF_NEXTOFF 0x04 /* Also update f_nextoff */ #define FOF_NOUPDATE 0x10 /* Do not update f_offset */ off_t foffset_lock(struct file *fp, int flags); void foffset_lock_uio(struct file *fp, struct uio *uio, int flags); void foffset_unlock(struct file *fp, off_t val, int flags); void foffset_unlock_uio(struct file *fp, struct uio *uio, int flags); static inline off_t foffset_get(struct file *fp) { return (foffset_lock(fp, FOF_NOLOCK)); } typedef int fo_rdwr_t(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td); typedef int fo_truncate_t(struct file *fp, off_t length, struct ucred *active_cred, struct thread *td); typedef int fo_ioctl_t(struct file *fp, u_long com, void *data, struct ucred *active_cred, struct thread *td); typedef int fo_poll_t(struct file *fp, int events, struct ucred *active_cred, struct thread *td); typedef int fo_kqfilter_t(struct file *fp, struct knote *kn); typedef int fo_stat_t(struct file *fp, struct stat *sb, struct ucred *active_cred, struct thread *td); typedef int fo_close_t(struct file *fp, struct thread *td); typedef int fo_chmod_t(struct file *fp, mode_t mode, struct ucred *active_cred, struct thread *td); typedef int fo_chown_t(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred, struct thread *td); typedef int fo_sendfile_t(struct file *fp, int sockfd, struct uio *hdr_uio, struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags, struct thread *td); typedef int fo_seek_t(struct file *fp, off_t offset, int whence, struct thread *td); typedef int fo_fill_kinfo_t(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp); typedef int fo_mmap_t(struct file *fp, vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot, vm_prot_t cap_maxprot, int flags, vm_ooffset_t foff, struct thread *td); typedef int fo_aio_queue_t(struct file *fp, struct kaiocb *job); typedef int fo_flags_t; struct fileops { fo_rdwr_t *fo_read; fo_rdwr_t *fo_write; fo_truncate_t *fo_truncate; fo_ioctl_t *fo_ioctl; fo_poll_t *fo_poll; fo_kqfilter_t *fo_kqfilter; fo_stat_t *fo_stat; fo_close_t *fo_close; fo_chmod_t *fo_chmod; fo_chown_t *fo_chown; fo_sendfile_t *fo_sendfile; fo_seek_t *fo_seek; fo_fill_kinfo_t *fo_fill_kinfo; fo_mmap_t *fo_mmap; fo_aio_queue_t *fo_aio_queue; fo_flags_t fo_flags; /* DFLAG_* below */ }; #define DFLAG_PASSABLE 0x01 /* may be passed via unix sockets. */ #define DFLAG_SEEKABLE 0x02 /* seekable / nonsequential */ #endif /* _KERNEL */ #if defined(_KERNEL) || defined(_WANT_FILE) /* * Kernel descriptor table. * One entry for each open kernel vnode and socket. * * Below is the list of locks that protects members in struct file. * * (a) f_vnode lock required (shared allows both reads and writes) * (f) protected with mtx_lock(mtx_pool_find(fp)) * (d) cdevpriv_mtx * none not locked */ struct fadvise_info { int fa_advice; /* (f) FADV_* type. */ off_t fa_start; /* (f) Region start. */ off_t fa_end; /* (f) Region end. */ }; struct file { void *f_data; /* file descriptor specific data */ struct fileops *f_ops; /* File operations */ struct ucred *f_cred; /* associated credentials. */ struct vnode *f_vnode; /* NULL or applicable vnode */ short f_type; /* descriptor type */ short f_vnread_flags; /* (f) Sleep lock for f_offset */ volatile u_int f_flag; /* see fcntl.h */ volatile u_int f_count; /* reference count */ /* * DTYPE_VNODE specific fields. */ int f_seqcount; /* (a) Count of sequential accesses. */ off_t f_nextoff; /* next expected read/write offset. */ union { struct cdev_privdata *fvn_cdevpriv; /* (d) Private data for the cdev. */ struct fadvise_info *fvn_advice; } f_vnun; /* * DFLAG_SEEKABLE specific fields */ off_t f_offset; /* * Mandatory Access control information. */ void *f_label; /* Place-holder for MAC label. */ }; #define f_cdevpriv f_vnun.fvn_cdevpriv #define f_advice f_vnun.fvn_advice #define FOFFSET_LOCKED 0x1 #define FOFFSET_LOCK_WAITING 0x2 #define FDEVFS_VNODE 0x4 #endif /* _KERNEL || _WANT_FILE */ /* * Userland version of struct file, for sysctl */ struct xfile { size_t xf_size; /* size of struct xfile */ pid_t xf_pid; /* owning process */ uid_t xf_uid; /* effective uid of owning process */ int xf_fd; /* descriptor number */ void *xf_file; /* address of struct file */ short xf_type; /* descriptor type */ int xf_count; /* reference count */ int xf_msgcount; /* references from message queue */ off_t xf_offset; /* file offset */ void *xf_data; /* file descriptor specific data */ void *xf_vnode; /* vnode pointer */ u_int xf_flag; /* flags (see fcntl.h) */ }; #ifdef _KERNEL extern struct fileops vnops; extern struct fileops badfileops; extern struct fileops socketops; extern int maxfiles; /* kernel limit on number of open files */ extern int maxfilesperproc; /* per process limit on number of open files */ extern volatile int openfiles; /* actual number of open files */ int fget(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp); int fget_mmap(struct thread *td, int fd, cap_rights_t *rightsp, u_char *maxprotp, struct file **fpp); int fget_read(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp); int fget_write(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp); int fget_fcntl(struct thread *td, int fd, cap_rights_t *rightsp, int needfcntl, struct file **fpp); int _fdrop(struct file *fp, struct thread *td); fo_rdwr_t invfo_rdwr; fo_truncate_t invfo_truncate; fo_ioctl_t invfo_ioctl; fo_poll_t invfo_poll; fo_kqfilter_t invfo_kqfilter; fo_chmod_t invfo_chmod; fo_chown_t invfo_chown; fo_sendfile_t invfo_sendfile; fo_sendfile_t vn_sendfile; fo_seek_t vn_seek; fo_fill_kinfo_t vn_fill_kinfo; int vn_fill_kinfo_vnode(struct vnode *vp, struct kinfo_file *kif); void finit(struct file *, u_int, short, void *, struct fileops *); int fgetvp(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp); int fgetvp_exec(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp); int fgetvp_rights(struct thread *td, int fd, cap_rights_t *needrightsp, struct filecaps *havecaps, struct vnode **vpp); int fgetvp_read(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp); int fgetvp_write(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp); - -int fgetsock(struct thread *td, int fd, cap_rights_t *rightsp, - struct socket **spp, u_int *fflagp); -void fputsock(struct socket *sp); static __inline int _fnoop(void) { return (0); } #define fhold(fp) \ (refcount_acquire(&(fp)->f_count)) #define fdrop(fp, td) \ (refcount_release(&(fp)->f_count) ? _fdrop((fp), (td)) : _fnoop()) static __inline fo_rdwr_t fo_read; static __inline fo_rdwr_t fo_write; static __inline fo_truncate_t fo_truncate; static __inline fo_ioctl_t fo_ioctl; static __inline fo_poll_t fo_poll; static __inline fo_kqfilter_t fo_kqfilter; static __inline fo_stat_t fo_stat; static __inline fo_close_t fo_close; static __inline fo_chmod_t fo_chmod; static __inline fo_chown_t fo_chown; static __inline fo_sendfile_t fo_sendfile; static __inline int fo_read(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { return ((*fp->f_ops->fo_read)(fp, uio, active_cred, flags, td)); } static __inline int fo_write(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { return ((*fp->f_ops->fo_write)(fp, uio, active_cred, flags, td)); } static __inline int fo_truncate(struct file *fp, off_t length, struct ucred *active_cred, struct thread *td) { return ((*fp->f_ops->fo_truncate)(fp, length, active_cred, td)); } static __inline int fo_ioctl(struct file *fp, u_long com, void *data, struct ucred *active_cred, struct thread *td) { return ((*fp->f_ops->fo_ioctl)(fp, com, data, active_cred, td)); } static __inline int fo_poll(struct file *fp, int events, struct ucred *active_cred, struct thread *td) { return ((*fp->f_ops->fo_poll)(fp, events, active_cred, td)); } static __inline int fo_stat(struct file *fp, struct stat *sb, struct ucred *active_cred, struct thread *td) { return ((*fp->f_ops->fo_stat)(fp, sb, active_cred, td)); } static __inline int fo_close(struct file *fp, struct thread *td) { return ((*fp->f_ops->fo_close)(fp, td)); } static __inline int fo_kqfilter(struct file *fp, struct knote *kn) { return ((*fp->f_ops->fo_kqfilter)(fp, kn)); } static __inline int fo_chmod(struct file *fp, mode_t mode, struct ucred *active_cred, struct thread *td) { return ((*fp->f_ops->fo_chmod)(fp, mode, active_cred, td)); } static __inline int fo_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred, struct thread *td) { return ((*fp->f_ops->fo_chown)(fp, uid, gid, active_cred, td)); } static __inline int fo_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio, struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags, struct thread *td) { return ((*fp->f_ops->fo_sendfile)(fp, sockfd, hdr_uio, trl_uio, offset, nbytes, sent, flags, td)); } static __inline int fo_seek(struct file *fp, off_t offset, int whence, struct thread *td) { return ((*fp->f_ops->fo_seek)(fp, offset, whence, td)); } static __inline int fo_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) { return ((*fp->f_ops->fo_fill_kinfo)(fp, kif, fdp)); } static __inline int fo_mmap(struct file *fp, vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot, vm_prot_t cap_maxprot, int flags, vm_ooffset_t foff, struct thread *td) { if (fp->f_ops->fo_mmap == NULL) return (ENODEV); return ((*fp->f_ops->fo_mmap)(fp, map, addr, size, prot, cap_maxprot, flags, foff, td)); } static __inline int fo_aio_queue(struct file *fp, struct kaiocb *job) { return ((*fp->f_ops->fo_aio_queue)(fp, job)); } #endif /* _KERNEL */ #endif /* !SYS_FILE_H */ Index: stable/11/sys/sys/filedesc.h =================================================================== --- stable/11/sys/sys/filedesc.h (revision 315311) +++ stable/11/sys/sys/filedesc.h (revision 315312) @@ -1,224 +1,241 @@ /*- * Copyright (c) 1990, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)filedesc.h 8.1 (Berkeley) 6/2/93 * $FreeBSD$ */ #ifndef _SYS_FILEDESC_H_ #define _SYS_FILEDESC_H_ #include #include #include #include #include #include #include #include struct filecaps { cap_rights_t fc_rights; /* per-descriptor capability rights */ u_long *fc_ioctls; /* per-descriptor allowed ioctls */ int16_t fc_nioctls; /* fc_ioctls array size */ uint32_t fc_fcntls; /* per-descriptor allowed fcntls */ }; struct filedescent { struct file *fde_file; /* file structure for open file */ struct filecaps fde_caps; /* per-descriptor rights */ uint8_t fde_flags; /* per-process open file flags */ seq_t fde_seq; /* keep file and caps in sync */ }; #define fde_rights fde_caps.fc_rights #define fde_fcntls fde_caps.fc_fcntls #define fde_ioctls fde_caps.fc_ioctls #define fde_nioctls fde_caps.fc_nioctls #define fde_change_size (offsetof(struct filedescent, fde_seq)) struct fdescenttbl { int fdt_nfiles; /* number of open files allocated */ struct filedescent fdt_ofiles[0]; /* open files */ }; #define fd_seq(fdt, fd) (&(fdt)->fdt_ofiles[(fd)].fde_seq) /* * This structure is used for the management of descriptors. It may be * shared by multiple processes. */ #define NDSLOTTYPE u_long struct filedesc { struct fdescenttbl *fd_files; /* open files table */ struct vnode *fd_cdir; /* current directory */ struct vnode *fd_rdir; /* root directory */ struct vnode *fd_jdir; /* jail root directory */ NDSLOTTYPE *fd_map; /* bitmap of free fds */ int fd_lastfile; /* high-water mark of fd_ofiles */ int fd_freefile; /* approx. next free file */ u_short fd_cmask; /* mask for file creation */ int fd_refcnt; /* thread reference count */ int fd_holdcnt; /* hold count on structure + mutex */ struct sx fd_sx; /* protects members of this struct */ struct kqlist fd_kqlist; /* list of kqueues on this filedesc */ int fd_holdleaderscount; /* block fdfree() for shared close() */ int fd_holdleaderswakeup; /* fdfree() needs wakeup */ }; /* * Structure to keep track of (process leader, struct fildedesc) tuples. * Each process has a pointer to such a structure when detailed tracking * is needed, e.g., when rfork(RFPROC | RFMEM) causes a file descriptor * table to be shared by processes having different "p_leader" pointers * and thus distinct POSIX style locks. * * fdl_refcount and fdl_holdcount are protected by struct filedesc mtx. */ struct filedesc_to_leader { int fdl_refcount; /* references from struct proc */ int fdl_holdcount; /* temporary hold during closef */ int fdl_wakeup; /* fdfree() waits on closef() */ struct proc *fdl_leader; /* owner of POSIX locks */ /* Circular list: */ struct filedesc_to_leader *fdl_prev; struct filedesc_to_leader *fdl_next; }; #define fd_nfiles fd_files->fdt_nfiles #define fd_ofiles fd_files->fdt_ofiles /* * Per-process open flags. */ #define UF_EXCLOSE 0x01 /* auto-close on exec */ #ifdef _KERNEL /* Lock a file descriptor table. */ #define FILEDESC_LOCK_INIT(fdp) sx_init(&(fdp)->fd_sx, "filedesc structure") #define FILEDESC_LOCK_DESTROY(fdp) sx_destroy(&(fdp)->fd_sx) #define FILEDESC_LOCK(fdp) (&(fdp)->fd_sx) #define FILEDESC_XLOCK(fdp) sx_xlock(&(fdp)->fd_sx) #define FILEDESC_XUNLOCK(fdp) sx_xunlock(&(fdp)->fd_sx) #define FILEDESC_SLOCK(fdp) sx_slock(&(fdp)->fd_sx) #define FILEDESC_SUNLOCK(fdp) sx_sunlock(&(fdp)->fd_sx) #define FILEDESC_LOCK_ASSERT(fdp) sx_assert(&(fdp)->fd_sx, SX_LOCKED | \ SX_NOTRECURSED) #define FILEDESC_XLOCK_ASSERT(fdp) sx_assert(&(fdp)->fd_sx, SX_XLOCKED | \ SX_NOTRECURSED) #define FILEDESC_UNLOCK_ASSERT(fdp) sx_assert(&(fdp)->fd_sx, SX_UNLOCKED) /* Operation types for kern_dup(). */ enum { FDDUP_NORMAL, /* dup() behavior. */ FDDUP_FCNTL, /* fcntl()-style errors. */ FDDUP_FIXED, /* Force fixed allocation. */ FDDUP_MUSTREPLACE, /* Target must exist. */ FDDUP_LASTMODE, }; /* Flags for kern_dup(). */ #define FDDUP_FLAG_CLOEXEC 0x1 /* Atomically set UF_EXCLOSE. */ /* For backward compatibility. */ #define falloc(td, resultfp, resultfd, flags) \ falloc_caps(td, resultfp, resultfd, flags, NULL) struct thread; void filecaps_init(struct filecaps *fcaps); int filecaps_copy(const struct filecaps *src, struct filecaps *dst, bool locked); void filecaps_move(struct filecaps *src, struct filecaps *dst); void filecaps_free(struct filecaps *fcaps); int closef(struct file *fp, struct thread *td); int dupfdopen(struct thread *td, struct filedesc *fdp, int dfd, int mode, int openerror, int *indxp); int falloc_caps(struct thread *td, struct file **resultfp, int *resultfd, int flags, struct filecaps *fcaps); int falloc_noinstall(struct thread *td, struct file **resultfp); void _finstall(struct filedesc *fdp, struct file *fp, int fd, int flags, struct filecaps *fcaps); int finstall(struct thread *td, struct file *fp, int *resultfd, int flags, struct filecaps *fcaps); int fdalloc(struct thread *td, int minfd, int *result); int fdallocn(struct thread *td, int minfd, int *fds, int n); int fdcheckstd(struct thread *td); void fdclose(struct thread *td, struct file *fp, int idx); void fdcloseexec(struct thread *td); void fdsetugidsafety(struct thread *td); struct filedesc *fdcopy(struct filedesc *fdp); int fdcopy_remapped(struct filedesc *fdp, const int *fds, size_t nfds, struct filedesc **newfdp); void fdinstall_remapped(struct thread *td, struct filedesc *fdp); void fdunshare(struct thread *td); void fdescfree(struct thread *td); void fdescfree_remapped(struct filedesc *fdp); struct filedesc *fdinit(struct filedesc *fdp, bool prepfiles); struct filedesc *fdshare(struct filedesc *fdp); struct filedesc_to_leader * filedesc_to_leader_alloc(struct filedesc_to_leader *old, struct filedesc *fdp, struct proc *leader); int getvnode(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp); void mountcheckdirs(struct vnode *olddp, struct vnode *newdp); +int fget_cap_locked(struct filedesc *fdp, int fd, cap_rights_t *needrightsp, + struct file **fpp, struct filecaps *havecapsp); +int fget_cap(struct thread *td, int fd, cap_rights_t *needrightsp, + struct file **fpp, struct filecaps *havecapsp); + /* Return a referenced file from an unlocked descriptor. */ int fget_unlocked(struct filedesc *fdp, int fd, cap_rights_t *needrightsp, struct file **fpp, seq_t *seqp); /* Requires a FILEDESC_{S,X}LOCK held and returns without a ref. */ static __inline struct file * fget_locked(struct filedesc *fdp, int fd) { FILEDESC_LOCK_ASSERT(fdp); if (fd < 0 || fd > fdp->fd_lastfile) return (NULL); return (fdp->fd_ofiles[fd].fde_file); +} + +static __inline struct filedescent * +fdeget_locked(struct filedesc *fdp, int fd) +{ + + FILEDESC_LOCK_ASSERT(fdp); + + if ((u_int)fd > fdp->fd_lastfile) + return (NULL); + + return (&fdp->fd_ofiles[fd]); } static __inline bool fd_modified(struct filedesc *fdp, int fd, seq_t seq) { return (!seq_consistent(fd_seq(fdp->fd_files, fd), seq)); } /* cdir/rdir/jdir manipulation functions. */ void pwd_chdir(struct thread *td, struct vnode *vp); int pwd_chroot(struct thread *td, struct vnode *vp); void pwd_ensure_dirs(void); #endif /* _KERNEL */ #endif /* !_SYS_FILEDESC_H_ */ Index: stable/11/sys/sys/param.h =================================================================== --- stable/11/sys/sys/param.h (revision 315311) +++ stable/11/sys/sys/param.h (revision 315312) @@ -1,363 +1,363 @@ /*- * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)param.h 8.3 (Berkeley) 4/4/95 * $FreeBSD$ */ #ifndef _SYS_PARAM_H_ #define _SYS_PARAM_H_ #include #define BSD 199506 /* System version (year & month). */ #define BSD4_3 1 #define BSD4_4 1 /* * __FreeBSD_version numbers are documented in the Porter's Handbook. * If you bump the version for any reason, you should update the documentation * there. * Currently this lives here in the doc/ repository: * * head/en_US.ISO8859-1/books/porters-handbook/versions/chapter.xml * * scheme is: Rxx * 'R' is in the range 0 to 4 if this is a release branch or * x.0-CURRENT before RELENG_*_0 is created, otherwise 'R' is * in the range 5 to 9. */ #undef __FreeBSD_version -#define __FreeBSD_version 1100509 /* Master, propagated to newvers */ +#define __FreeBSD_version 1100510 /* Master, propagated to newvers */ /* * __FreeBSD_kernel__ indicates that this system uses the kernel of FreeBSD, * which by definition is always true on FreeBSD. This macro is also defined * on other systems that use the kernel of FreeBSD, such as GNU/kFreeBSD. * * It is tempting to use this macro in userland code when we want to enable * kernel-specific routines, and in fact it's fine to do this in code that * is part of FreeBSD itself. However, be aware that as presence of this * macro is still not widespread (e.g. older FreeBSD versions, 3rd party * compilers, etc), it is STRONGLY DISCOURAGED to check for this macro in * external applications without also checking for __FreeBSD__ as an * alternative. */ #undef __FreeBSD_kernel__ #define __FreeBSD_kernel__ #ifdef _KERNEL #define P_OSREL_SIGWAIT 700000 #define P_OSREL_SIGSEGV 700004 #define P_OSREL_MAP_ANON 800104 #define P_OSREL_MAP_FSTRICT 1100036 #define P_OSREL_SHUTDOWN_ENOTCONN 1100077 #define P_OSREL_MAJOR(x) ((x) / 100000) #endif #ifndef LOCORE #include #endif /* * Machine-independent constants (some used in following include files). * Redefined constants are from POSIX 1003.1 limits file. * * MAXCOMLEN should be >= sizeof(ac_comm) (see ) */ #include #define MAXCOMLEN 19 /* max command name remembered */ #define MAXINTERP PATH_MAX /* max interpreter file name length */ #define MAXLOGNAME 33 /* max login name length (incl. NUL) */ #define MAXUPRC CHILD_MAX /* max simultaneous processes */ #define NCARGS ARG_MAX /* max bytes for an exec function */ #define NGROUPS (NGROUPS_MAX+1) /* max number groups */ #define NOFILE OPEN_MAX /* max open files per process */ #define NOGROUP 65535 /* marker for empty group set member */ #define MAXHOSTNAMELEN 256 /* max hostname size */ #define SPECNAMELEN 63 /* max length of devicename */ /* More types and definitions used throughout the kernel. */ #ifdef _KERNEL #include #include #ifndef LOCORE #include #include #endif #ifndef FALSE #define FALSE 0 #endif #ifndef TRUE #define TRUE 1 #endif #endif #ifndef _KERNEL /* Signals. */ #include #endif /* Machine type dependent parameters. */ #include #ifndef _KERNEL #include #endif #ifndef DEV_BSHIFT #define DEV_BSHIFT 9 /* log2(DEV_BSIZE) */ #endif #define DEV_BSIZE (1<>PAGE_SHIFT) #endif /* * btodb() is messy and perhaps slow because `bytes' may be an off_t. We * want to shift an unsigned type to avoid sign extension and we don't * want to widen `bytes' unnecessarily. Assume that the result fits in * a daddr_t. */ #ifndef btodb #define btodb(bytes) /* calculates (bytes / DEV_BSIZE) */ \ (sizeof (bytes) > sizeof(long) \ ? (daddr_t)((unsigned long long)(bytes) >> DEV_BSHIFT) \ : (daddr_t)((unsigned long)(bytes) >> DEV_BSHIFT)) #endif #ifndef dbtob #define dbtob(db) /* calculates (db * DEV_BSIZE) */ \ ((off_t)(db) << DEV_BSHIFT) #endif #define PRIMASK 0x0ff #define PCATCH 0x100 /* OR'd with pri for tsleep to check signals */ #define PDROP 0x200 /* OR'd with pri to stop re-entry of interlock mutex */ #define NZERO 0 /* default "nice" */ #define NBBY 8 /* number of bits in a byte */ #define NBPW sizeof(int) /* number of bytes per word (integer) */ #define CMASK 022 /* default file mask: S_IWGRP|S_IWOTH */ #define NODEV (dev_t)(-1) /* non-existent device */ /* * File system parameters and macros. * * MAXBSIZE - Filesystems are made out of blocks of at most MAXBSIZE bytes * per block. MAXBSIZE may be made larger without effecting * any existing filesystems as long as it does not exceed MAXPHYS, * and may be made smaller at the risk of not being able to use * filesystems which require a block size exceeding MAXBSIZE. * * MAXBCACHEBUF - Maximum size of a buffer in the buffer cache. This must * be >= MAXBSIZE and can be set differently for different * architectures by defining it in . * Making this larger allows NFS to do larger reads/writes. * * BKVASIZE - Nominal buffer space per buffer, in bytes. BKVASIZE is the * minimum KVM memory reservation the kernel is willing to make. * Filesystems can of course request smaller chunks. Actual * backing memory uses a chunk size of a page (PAGE_SIZE). * The default value here can be overridden on a per-architecture * basis by defining it in . This should * probably be done to increase its value, when MAXBCACHEBUF is * defined as a larger value in . * * If you make BKVASIZE too small you risk seriously fragmenting * the buffer KVM map which may slow things down a bit. If you * make it too big the kernel will not be able to optimally use * the KVM memory reserved for the buffer cache and will wind * up with too-few buffers. * * The default is 16384, roughly 2x the block size used by a * normal UFS filesystem. */ #define MAXBSIZE 65536 /* must be power of 2 */ #ifndef MAXBCACHEBUF #define MAXBCACHEBUF MAXBSIZE /* must be a power of 2 >= MAXBSIZE */ #endif #ifndef BKVASIZE #define BKVASIZE 16384 /* must be power of 2 */ #endif #define BKVAMASK (BKVASIZE-1) /* * MAXPATHLEN defines the longest permissible path length after expanding * symbolic links. It is used to allocate a temporary buffer from the buffer * pool in which to do the name expansion, hence should be a power of two, * and must be less than or equal to MAXBSIZE. MAXSYMLINKS defines the * maximum number of symbolic links that may be expanded in a path name. * It should be set high enough to allow all legitimate uses, but halt * infinite loops reasonably quickly. */ #define MAXPATHLEN PATH_MAX #define MAXSYMLINKS 32 /* Bit map related macros. */ #define setbit(a,i) (((unsigned char *)(a))[(i)/NBBY] |= 1<<((i)%NBBY)) #define clrbit(a,i) (((unsigned char *)(a))[(i)/NBBY] &= ~(1<<((i)%NBBY))) #define isset(a,i) \ (((const unsigned char *)(a))[(i)/NBBY] & (1<<((i)%NBBY))) #define isclr(a,i) \ ((((const unsigned char *)(a))[(i)/NBBY] & (1<<((i)%NBBY))) == 0) /* Macros for counting and rounding. */ #ifndef howmany #define howmany(x, y) (((x)+((y)-1))/(y)) #endif #define nitems(x) (sizeof((x)) / sizeof((x)[0])) #define rounddown(x, y) (((x)/(y))*(y)) #define rounddown2(x, y) ((x)&(~((y)-1))) /* if y is power of two */ #define roundup(x, y) ((((x)+((y)-1))/(y))*(y)) /* to any y */ #define roundup2(x, y) (((x)+((y)-1))&(~((y)-1))) /* if y is powers of two */ #define powerof2(x) ((((x)-1)&(x))==0) /* Macros for min/max. */ #define MIN(a,b) (((a)<(b))?(a):(b)) #define MAX(a,b) (((a)>(b))?(a):(b)) #ifdef _KERNEL /* * Basic byte order function prototypes for non-inline functions. */ #ifndef LOCORE #ifndef _BYTEORDER_PROTOTYPED #define _BYTEORDER_PROTOTYPED __BEGIN_DECLS __uint32_t htonl(__uint32_t); __uint16_t htons(__uint16_t); __uint32_t ntohl(__uint32_t); __uint16_t ntohs(__uint16_t); __END_DECLS #endif #endif #ifndef lint #ifndef _BYTEORDER_FUNC_DEFINED #define _BYTEORDER_FUNC_DEFINED #define htonl(x) __htonl(x) #define htons(x) __htons(x) #define ntohl(x) __ntohl(x) #define ntohs(x) __ntohs(x) #endif /* !_BYTEORDER_FUNC_DEFINED */ #endif /* lint */ #endif /* _KERNEL */ /* * Scale factor for scaled integers used to count %cpu time and load avgs. * * The number of CPU `tick's that map to a unique `%age' can be expressed * by the formula (1 / (2 ^ (FSHIFT - 11))). The maximum load average that * can be calculated (assuming 32 bits) can be closely approximated using * the formula (2 ^ (2 * (16 - FSHIFT))) for (FSHIFT < 15). * * For the scheduler to maintain a 1:1 mapping of CPU `tick' to `%age', * FSHIFT must be at least 11; this gives us a maximum load avg of ~1024. */ #define FSHIFT 11 /* bits to right of fixed binary point */ #define FSCALE (1<> (PAGE_SHIFT - DEV_BSHIFT)) #define ctodb(db) /* calculates pages to devblks */ \ ((db) << (PAGE_SHIFT - DEV_BSHIFT)) /* * Old spelling of __containerof(). */ #define member2struct(s, m, x) \ ((struct s *)(void *)((char *)(x) - offsetof(struct s, m))) /* * Access a variable length array that has been declared as a fixed * length array. */ #define __PAST_END(array, offset) (((__typeof__(*(array)) *)(array))[offset]) #endif /* _SYS_PARAM_H_ */ Index: stable/11/sys/sys/socketvar.h =================================================================== --- stable/11/sys/sys/socketvar.h (revision 315311) +++ stable/11/sys/sys/socketvar.h (revision 315312) @@ -1,422 +1,423 @@ /*- * Copyright (c) 1982, 1986, 1990, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)socketvar.h 8.3 (Berkeley) 2/19/95 * * $FreeBSD$ */ #ifndef _SYS_SOCKETVAR_H_ #define _SYS_SOCKETVAR_H_ #include /* for TAILQ macros */ #include /* for struct selinfo */ #include #include #include #include #include #include #ifdef _KERNEL #include #include #endif struct vnet; /* * Kernel structure per socket. * Contains send and receive buffer queues, * handle on protocol and pointer to protocol * private data and error information. */ typedef u_quad_t so_gen_t; struct socket; /*- * Locking key to struct socket: * (a) constant after allocation, no locking required. * (b) locked by SOCK_LOCK(so). * (c) locked by SOCKBUF_LOCK(&so->so_rcv). * (e) locked by ACCEPT_LOCK(). * (f) not locked since integer reads/writes are atomic. * (g) used only as a sleep/wakeup address, no value. * (h) locked by global mutex so_global_mtx. */ struct socket { int so_count; /* (b) reference count */ short so_type; /* (a) generic type, see socket.h */ short so_options; /* from socket call, see socket.h */ short so_linger; /* time to linger while closing */ short so_state; /* (b) internal state flags SS_* */ int so_qstate; /* (e) internal state flags SQ_* */ void *so_pcb; /* protocol control block */ struct vnet *so_vnet; /* (a) network stack instance */ struct protosw *so_proto; /* (a) protocol handle */ /* * Variables for connection queuing. * Socket where accepts occur is so_head in all subsidiary sockets. * If so_head is 0, socket is not related to an accept. * For head socket so_incomp queues partially completed connections, * while so_comp is a queue of connections ready to be accepted. * If a connection is aborted and it has so_head set, then * it has to be pulled out of either so_incomp or so_comp. * We allow connections to queue up based on current queue lengths * and limit on number of queued connections for this socket. */ struct socket *so_head; /* (e) back pointer to listen socket */ TAILQ_HEAD(, socket) so_incomp; /* (e) queue of partial unaccepted connections */ TAILQ_HEAD(, socket) so_comp; /* (e) queue of complete unaccepted connections */ TAILQ_ENTRY(socket) so_list; /* (e) list of unaccepted connections */ u_int so_qlen; /* (e) number of unaccepted connections */ u_int so_incqlen; /* (e) number of unaccepted incomplete connections */ u_int so_qlimit; /* (e) max number queued connections */ short so_timeo; /* (g) connection timeout */ u_short so_error; /* (f) error affecting connection */ struct sigio *so_sigio; /* [sg] information for async I/O or out of band data (SIGURG) */ u_long so_oobmark; /* (c) chars to oob mark */ struct sockbuf so_rcv, so_snd; struct ucred *so_cred; /* (a) user credentials */ struct label *so_label; /* (b) MAC label for socket */ struct label *so_peerlabel; /* (b) cached MAC label for peer */ /* NB: generation count must not be first. */ so_gen_t so_gencnt; /* (h) generation count */ void *so_emuldata; /* (b) private data for emulators */ struct so_accf { struct accept_filter *so_accept_filter; void *so_accept_filter_arg; /* saved filter args */ char *so_accept_filter_str; /* saved user args */ } *so_accf; struct osd osd; /* Object Specific extensions */ /* * so_fibnum, so_user_cookie and friends can be used to attach * some user-specified metadata to a socket, which then can be * used by the kernel for various actions. * so_user_cookie is used by ipfw/dummynet. */ int so_fibnum; /* routing domain for this socket */ uint32_t so_user_cookie; void *so_pspare[2]; /* packet pacing / general use */ int so_ispare[2]; /* packet pacing / general use */ }; /* * Global accept mutex to serialize access to accept queues and * fields associated with multiple sockets. This allows us to * avoid defining a lock order between listen and accept sockets * until such time as it proves to be a good idea. */ extern struct mtx accept_mtx; #define ACCEPT_LOCK_ASSERT() mtx_assert(&accept_mtx, MA_OWNED) #define ACCEPT_UNLOCK_ASSERT() mtx_assert(&accept_mtx, MA_NOTOWNED) #define ACCEPT_LOCK() mtx_lock(&accept_mtx) #define ACCEPT_UNLOCK() mtx_unlock(&accept_mtx) /* * Per-socket mutex: we reuse the receive socket buffer mutex for space * efficiency. This decision should probably be revisited as we optimize * locking for the socket code. */ #define SOCK_MTX(_so) SOCKBUF_MTX(&(_so)->so_rcv) #define SOCK_LOCK(_so) SOCKBUF_LOCK(&(_so)->so_rcv) #define SOCK_OWNED(_so) SOCKBUF_OWNED(&(_so)->so_rcv) #define SOCK_UNLOCK(_so) SOCKBUF_UNLOCK(&(_so)->so_rcv) #define SOCK_LOCK_ASSERT(_so) SOCKBUF_LOCK_ASSERT(&(_so)->so_rcv) /* * Socket state bits stored in so_qstate. */ #define SQ_INCOMP 0x0800 /* unaccepted, incomplete connection */ #define SQ_COMP 0x1000 /* unaccepted, complete connection */ /* * Externalized form of struct socket used by the sysctl(3) interface. */ struct xsocket { size_t xso_len; /* length of this structure */ struct socket *xso_so; /* makes a convenient handle sometimes */ short so_type; short so_options; short so_linger; short so_state; caddr_t so_pcb; /* another convenient handle */ int xso_protocol; int xso_family; u_int so_qlen; u_int so_incqlen; u_int so_qlimit; short so_timeo; u_short so_error; pid_t so_pgid; u_long so_oobmark; struct xsockbuf so_rcv, so_snd; uid_t so_uid; /* XXX */ }; #ifdef _KERNEL /* * Macros for sockets and socket buffering. */ /* * Flags to sblock(). */ #define SBL_WAIT 0x00000001 /* Wait if not immediately available. */ #define SBL_NOINTR 0x00000002 /* Force non-interruptible sleep. */ #define SBL_VALID (SBL_WAIT | SBL_NOINTR) /* * Do we need to notify the other side when I/O is possible? */ #define sb_notify(sb) (((sb)->sb_flags & (SB_WAIT | SB_SEL | SB_ASYNC | \ SB_UPCALL | SB_AIO | SB_KNOTE)) != 0) /* do we have to send all at once on a socket? */ #define sosendallatonce(so) \ ((so)->so_proto->pr_flags & PR_ATOMIC) /* can we read something from so? */ #define soreadabledata(so) \ (sbavail(&(so)->so_rcv) >= (so)->so_rcv.sb_lowat || \ !TAILQ_EMPTY(&(so)->so_comp) || (so)->so_error) #define soreadable(so) \ (soreadabledata(so) || ((so)->so_rcv.sb_state & SBS_CANTRCVMORE)) /* can we write something to so? */ #define sowriteable(so) \ ((sbspace(&(so)->so_snd) >= (so)->so_snd.sb_lowat && \ (((so)->so_state&SS_ISCONNECTED) || \ ((so)->so_proto->pr_flags&PR_CONNREQUIRED)==0)) || \ ((so)->so_snd.sb_state & SBS_CANTSENDMORE) || \ (so)->so_error) /* * soref()/sorele() ref-count the socket structure. Note that you must * still explicitly close the socket, but the last ref count will free * the structure. */ #define soref(so) do { \ SOCK_LOCK_ASSERT(so); \ ++(so)->so_count; \ } while (0) #define sorele(so) do { \ ACCEPT_LOCK_ASSERT(); \ SOCK_LOCK_ASSERT(so); \ if ((so)->so_count <= 0) \ panic("sorele"); \ if (--(so)->so_count == 0) \ sofree(so); \ else { \ SOCK_UNLOCK(so); \ ACCEPT_UNLOCK(); \ } \ } while (0) /* * In sorwakeup() and sowwakeup(), acquire the socket buffer lock to * avoid a non-atomic test-and-wakeup. However, sowakeup is * responsible for releasing the lock if it is called. We unlock only * if we don't call into sowakeup. If any code is introduced that * directly invokes the underlying sowakeup() primitives, it must * maintain the same semantics. */ #define sorwakeup_locked(so) do { \ SOCKBUF_LOCK_ASSERT(&(so)->so_rcv); \ if (sb_notify(&(so)->so_rcv)) \ sowakeup((so), &(so)->so_rcv); \ else \ SOCKBUF_UNLOCK(&(so)->so_rcv); \ } while (0) #define sorwakeup(so) do { \ SOCKBUF_LOCK(&(so)->so_rcv); \ sorwakeup_locked(so); \ } while (0) #define sowwakeup_locked(so) do { \ SOCKBUF_LOCK_ASSERT(&(so)->so_snd); \ if (sb_notify(&(so)->so_snd)) \ sowakeup((so), &(so)->so_snd); \ else \ SOCKBUF_UNLOCK(&(so)->so_snd); \ } while (0) #define sowwakeup(so) do { \ SOCKBUF_LOCK(&(so)->so_snd); \ sowwakeup_locked(so); \ } while (0) struct accept_filter { char accf_name[16]; int (*accf_callback) (struct socket *so, void *arg, int waitflag); void * (*accf_create) (struct socket *so, char *arg); void (*accf_destroy) (struct socket *so); SLIST_ENTRY(accept_filter) accf_next; }; #ifdef MALLOC_DECLARE MALLOC_DECLARE(M_ACCF); MALLOC_DECLARE(M_PCB); MALLOC_DECLARE(M_SONAME); #endif /* * Socket specific helper hook point identifiers * Do not leave holes in the sequence, hook registration is a loop. */ #define HHOOK_SOCKET_OPT 0 #define HHOOK_SOCKET_CREATE 1 #define HHOOK_SOCKET_RCV 2 #define HHOOK_SOCKET_SND 3 #define HHOOK_FILT_SOREAD 4 #define HHOOK_FILT_SOWRITE 5 #define HHOOK_SOCKET_CLOSE 6 #define HHOOK_SOCKET_LAST HHOOK_SOCKET_CLOSE struct socket_hhook_data { struct socket *so; struct mbuf *m; void *hctx; /* hook point specific data*/ int status; }; extern int maxsockets; extern u_long sb_max; extern so_gen_t so_gencnt; struct file; +struct filecaps; struct filedesc; struct mbuf; struct sockaddr; struct ucred; struct uio; /* 'which' values for socket upcalls. */ #define SO_RCV 1 #define SO_SND 2 /* Return values for socket upcalls. */ #define SU_OK 0 #define SU_ISCONNECTED 1 /* * From uipc_socket and friends */ int getsockaddr(struct sockaddr **namp, caddr_t uaddr, size_t len); int getsock_cap(struct thread *td, int fd, cap_rights_t *rightsp, - struct file **fpp, u_int *fflagp); + struct file **fpp, u_int *fflagp, struct filecaps *havecaps); void soabort(struct socket *so); int soaccept(struct socket *so, struct sockaddr **nam); void soaio_enqueue(struct task *task); void soaio_rcv(void *context, int pending); void soaio_snd(void *context, int pending); int socheckuid(struct socket *so, uid_t uid); int sobind(struct socket *so, struct sockaddr *nam, struct thread *td); int sobindat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td); int soclose(struct socket *so); int soconnect(struct socket *so, struct sockaddr *nam, struct thread *td); int soconnectat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td); int soconnect2(struct socket *so1, struct socket *so2); int socreate(int dom, struct socket **aso, int type, int proto, struct ucred *cred, struct thread *td); int sodisconnect(struct socket *so); struct sockaddr *sodupsockaddr(const struct sockaddr *sa, int mflags); void sofree(struct socket *so); void sohasoutofband(struct socket *so); int solisten(struct socket *so, int backlog, struct thread *td); void solisten_proto(struct socket *so, int backlog); int solisten_proto_check(struct socket *so); struct socket * sonewconn(struct socket *head, int connstatus); int sopoll(struct socket *so, int events, struct ucred *active_cred, struct thread *td); int sopoll_generic(struct socket *so, int events, struct ucred *active_cred, struct thread *td); int soreceive(struct socket *so, struct sockaddr **paddr, struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp); int soreceive_stream(struct socket *so, struct sockaddr **paddr, struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp); int soreceive_dgram(struct socket *so, struct sockaddr **paddr, struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp); int soreceive_generic(struct socket *so, struct sockaddr **paddr, struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp); int soreserve(struct socket *so, u_long sndcc, u_long rcvcc); void sorflush(struct socket *so); int sosend(struct socket *so, struct sockaddr *addr, struct uio *uio, struct mbuf *top, struct mbuf *control, int flags, struct thread *td); int sosend_dgram(struct socket *so, struct sockaddr *addr, struct uio *uio, struct mbuf *top, struct mbuf *control, int flags, struct thread *td); int sosend_generic(struct socket *so, struct sockaddr *addr, struct uio *uio, struct mbuf *top, struct mbuf *control, int flags, struct thread *td); int soshutdown(struct socket *so, int how); void sotoxsocket(struct socket *so, struct xsocket *xso); void soupcall_clear(struct socket *so, int which); void soupcall_set(struct socket *so, int which, int (*func)(struct socket *, void *, int), void *arg); void sowakeup(struct socket *so, struct sockbuf *sb); void sowakeup_aio(struct socket *so, struct sockbuf *sb); int selsocket(struct socket *so, int events, struct timeval *tv, struct thread *td); /* * Accept filter functions (duh). */ int accept_filt_add(struct accept_filter *filt); int accept_filt_del(char *name); struct accept_filter *accept_filt_get(char *name); #ifdef ACCEPT_FILTER_MOD #ifdef SYSCTL_DECL SYSCTL_DECL(_net_inet_accf); #endif int accept_filt_generic_mod_event(module_t mod, int event, void *data); #endif #endif /* _KERNEL */ #endif /* !_SYS_SOCKETVAR_H_ */ Index: stable/11 =================================================================== --- stable/11 (revision 315311) +++ stable/11 (revision 315312) Property changes on: stable/11 ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /head:r305093,305756,306174,306184,306226,306232,311474,312079,312081,312087