Index: head/sys/compat/cloudabi/cloudabi_sock.c =================================================================== --- head/sys/compat/cloudabi/cloudabi_sock.c (revision 306173) +++ head/sys/compat/cloudabi/cloudabi_sock.c (revision 306174) @@ -1,252 +1,252 @@ /*- * Copyright (c) 2015 Nuxi, https://nuxi.nl/ * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* Converts FreeBSD's struct sockaddr to CloudABI's cloudabi_sockaddr_t. 
*/ void cloudabi_convert_sockaddr(const struct sockaddr *sa, socklen_t sal, cloudabi_sockaddr_t *rsa) { const struct sockaddr_in *sin; const struct sockaddr_in6 *sin6; /* Zero-sized socket address. */ if (sal < offsetof(struct sockaddr, sa_family) + sizeof(sa->sa_family)) return; switch (sa->sa_family) { case AF_INET: if (sal < sizeof(struct sockaddr_in)) return; sin = (const struct sockaddr_in *)sa; rsa->sa_family = CLOUDABI_AF_INET; memcpy(&rsa->sa_inet.addr, &sin->sin_addr, sizeof(rsa->sa_inet.addr)); rsa->sa_inet.port = ntohs(sin->sin_port); return; case AF_INET6: if (sal < sizeof(struct sockaddr_in6)) return; sin6 = (const struct sockaddr_in6 *)sa; rsa->sa_family = CLOUDABI_AF_INET6; memcpy(&rsa->sa_inet6.addr, &sin6->sin6_addr, sizeof(rsa->sa_inet6.addr)); rsa->sa_inet6.port = ntohs(sin6->sin6_port); return; case AF_UNIX: rsa->sa_family = CLOUDABI_AF_UNIX; return; } } /* Copies a pathname into a UNIX socket address structure. */ static int copyin_sockaddr_un(const char *path, size_t pathlen, struct sockaddr_un *sun) { int error; /* Copy in pathname string if there's enough space. */ if (pathlen >= sizeof(sun->sun_path)) return (ENAMETOOLONG); error = copyin(path, &sun->sun_path, pathlen); if (error != 0) return (error); if (memchr(sun->sun_path, '\0', pathlen) != NULL) return (EINVAL); /* Initialize the rest of the socket address. */ sun->sun_path[pathlen] = '\0'; sun->sun_family = AF_UNIX; sun->sun_len = sizeof(*sun); return (0); } int cloudabi_sys_sock_accept(struct thread *td, struct cloudabi_sys_sock_accept_args *uap) { struct sockaddr *sa; cloudabi_sockstat_t ss = {}; socklen_t sal; int error; if (uap->buf == NULL) { /* Only return the new file descriptor number. */ return (kern_accept(td, uap->sock, NULL, NULL, NULL)); } else { /* Also return properties of the new socket descriptor. 
*/ sal = MAX(sizeof(struct sockaddr_in), sizeof(struct sockaddr_in6)); error = kern_accept(td, uap->sock, (void *)&sa, &sal, NULL); if (error != 0) return (error); /* TODO(ed): Fill the other members of cloudabi_sockstat_t. */ cloudabi_convert_sockaddr(sa, sal, &ss.ss_peername); free(sa, M_SONAME); return (copyout(&ss, uap->buf, sizeof(ss))); } } int cloudabi_sys_sock_bind(struct thread *td, struct cloudabi_sys_sock_bind_args *uap) { struct sockaddr_un sun; int error; error = copyin_sockaddr_un(uap->path, uap->pathlen, &sun); if (error != 0) return (error); return (kern_bindat(td, uap->fd, uap->sock, (struct sockaddr *)&sun)); } int cloudabi_sys_sock_connect(struct thread *td, struct cloudabi_sys_sock_connect_args *uap) { struct sockaddr_un sun; int error; error = copyin_sockaddr_un(uap->path, uap->pathlen, &sun); if (error != 0) return (error); return (kern_connectat(td, uap->fd, uap->sock, (struct sockaddr *)&sun)); } int cloudabi_sys_sock_listen(struct thread *td, struct cloudabi_sys_sock_listen_args *uap) { struct listen_args listen_args = { .s = uap->sock, .backlog = uap->backlog, }; return (sys_listen(td, &listen_args)); } int cloudabi_sys_sock_shutdown(struct thread *td, struct cloudabi_sys_sock_shutdown_args *uap) { struct shutdown_args shutdown_args = { .s = uap->sock, }; switch (uap->how) { case CLOUDABI_SHUT_RD: shutdown_args.how = SHUT_RD; break; case CLOUDABI_SHUT_WR: shutdown_args.how = SHUT_WR; break; case CLOUDABI_SHUT_RD | CLOUDABI_SHUT_WR: shutdown_args.how = SHUT_RDWR; break; default: return (EINVAL); } return (sys_shutdown(td, &shutdown_args)); } int cloudabi_sys_sock_stat_get(struct thread *td, struct cloudabi_sys_sock_stat_get_args *uap) { cloudabi_sockstat_t ss = {}; cap_rights_t rights; struct file *fp; struct sockaddr *sa; struct socket *so; int error; error = getsock_cap(td, uap->sock, cap_rights_init(&rights, - CAP_GETSOCKOPT, CAP_GETPEERNAME, CAP_GETSOCKNAME), &fp, NULL); + CAP_GETSOCKOPT, CAP_GETPEERNAME, CAP_GETSOCKNAME), &fp, NULL, 
NULL); if (error != 0) return (error); so = fp->f_data; CURVNET_SET(so->so_vnet); /* Set ss_sockname. */ error = so->so_proto->pr_usrreqs->pru_sockaddr(so, &sa); if (error == 0) { cloudabi_convert_sockaddr(sa, sa->sa_len, &ss.ss_sockname); free(sa, M_SONAME); } /* Set ss_peername. */ if ((so->so_state & (SS_ISCONNECTED | SS_ISCONFIRMING)) != 0) { error = so->so_proto->pr_usrreqs->pru_peeraddr(so, &sa); if (error == 0) { cloudabi_convert_sockaddr(sa, sa->sa_len, &ss.ss_peername); free(sa, M_SONAME); } } CURVNET_RESTORE(); /* Set ss_error. */ SOCK_LOCK(so); ss.ss_error = cloudabi_convert_errno(so->so_error); if ((uap->flags & CLOUDABI_SOCKSTAT_CLEAR_ERROR) != 0) so->so_error = 0; SOCK_UNLOCK(so); /* Set ss_state. */ if ((so->so_options & SO_ACCEPTCONN) != 0) ss.ss_state |= CLOUDABI_SOCKSTATE_ACCEPTCONN; fdrop(fp, td); return (copyout(&ss, uap->buf, sizeof(ss))); } Index: head/sys/compat/linux/linux_socket.c =================================================================== --- head/sys/compat/linux/linux_socket.c (revision 306173) +++ head/sys/compat/linux/linux_socket.c (revision 306174) @@ -1,1783 +1,1783 @@ /*- * Copyright (c) 1995 Søren Schmidt * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer * in this position and unchanged. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. 
The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* XXX we use functions that might not exist. */ #include "opt_compat.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #include #endif #ifdef COMPAT_LINUX32 #include #include #else #include #include #endif #include #include #include #include static int linux_to_bsd_domain(int); static int linux_sendmsg_common(struct thread *, l_int, struct l_msghdr *, l_uint); static int linux_recvmsg_common(struct thread *, l_int, struct l_msghdr *, l_uint, struct msghdr *); static int linux_set_socket_flags(int, int *); /* * Reads a linux sockaddr and does any necessary translation. * Linux sockaddrs don't have a length field, only a family. * Copy the osockaddr structure pointed to by osa to kernel, adjust * family and convert to sockaddr. 
*/ static int linux_getsockaddr(struct sockaddr **sap, const struct osockaddr *osa, int salen) { struct sockaddr *sa; struct osockaddr *kosa; #ifdef INET6 struct sockaddr_in6 *sin6; int oldv6size; #endif char *name; int bdom, error, hdrlen, namelen; if (salen < 2 || salen > UCHAR_MAX || !osa) return (EINVAL); #ifdef INET6 oldv6size = 0; /* * Check for old (pre-RFC2553) sockaddr_in6. We may accept it * if it's a v4-mapped address, so reserve the proper space * for it. */ if (salen == sizeof(struct sockaddr_in6) - sizeof(uint32_t)) { salen += sizeof(uint32_t); oldv6size = 1; } #endif kosa = malloc(salen, M_SONAME, M_WAITOK); if ((error = copyin(osa, kosa, salen))) goto out; bdom = linux_to_bsd_domain(kosa->sa_family); if (bdom == -1) { error = EAFNOSUPPORT; goto out; } #ifdef INET6 /* * Older Linux IPv6 code uses obsolete RFC2133 struct sockaddr_in6, * which lacks the scope id compared with RFC2553 one. If we detect * the situation, reject the address and write a message to system log. * * Still accept addresses for which the scope id is not used. */ if (oldv6size) { if (bdom == AF_INET6) { sin6 = (struct sockaddr_in6 *)kosa; if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr) || (!IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr) && !IN6_IS_ADDR_SITELOCAL(&sin6->sin6_addr) && !IN6_IS_ADDR_V4COMPAT(&sin6->sin6_addr) && !IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr) && !IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr))) { sin6->sin6_scope_id = 0; } else { log(LOG_DEBUG, "obsolete pre-RFC2553 sockaddr_in6 rejected\n"); error = EINVAL; goto out; } } else salen -= sizeof(uint32_t); } #endif if (bdom == AF_INET) { if (salen < sizeof(struct sockaddr_in)) { error = EINVAL; goto out; } salen = sizeof(struct sockaddr_in); } if (bdom == AF_LOCAL && salen > sizeof(struct sockaddr_un)) { hdrlen = offsetof(struct sockaddr_un, sun_path); name = ((struct sockaddr_un *)kosa)->sun_path; if (*name == '\0') { /* * Linux abstract namespace starts with a NULL byte. * XXX We do not support abstract namespace yet. 
*/ namelen = strnlen(name + 1, salen - hdrlen - 1) + 1; } else namelen = strnlen(name, salen - hdrlen); salen = hdrlen + namelen; if (salen > sizeof(struct sockaddr_un)) { error = ENAMETOOLONG; goto out; } } sa = (struct sockaddr *)kosa; sa->sa_family = bdom; sa->sa_len = salen; *sap = sa; return (0); out: free(kosa, M_SONAME); return (error); } static int linux_to_bsd_domain(int domain) { switch (domain) { case LINUX_AF_UNSPEC: return (AF_UNSPEC); case LINUX_AF_UNIX: return (AF_LOCAL); case LINUX_AF_INET: return (AF_INET); case LINUX_AF_INET6: return (AF_INET6); case LINUX_AF_AX25: return (AF_CCITT); case LINUX_AF_IPX: return (AF_IPX); case LINUX_AF_APPLETALK: return (AF_APPLETALK); } return (-1); } static int bsd_to_linux_domain(int domain) { switch (domain) { case AF_UNSPEC: return (LINUX_AF_UNSPEC); case AF_LOCAL: return (LINUX_AF_UNIX); case AF_INET: return (LINUX_AF_INET); case AF_INET6: return (LINUX_AF_INET6); case AF_CCITT: return (LINUX_AF_AX25); case AF_IPX: return (LINUX_AF_IPX); case AF_APPLETALK: return (LINUX_AF_APPLETALK); } return (-1); } static int linux_to_bsd_sockopt_level(int level) { switch (level) { case LINUX_SOL_SOCKET: return (SOL_SOCKET); } return (level); } static int bsd_to_linux_sockopt_level(int level) { switch (level) { case SOL_SOCKET: return (LINUX_SOL_SOCKET); } return (level); } static int linux_to_bsd_ip_sockopt(int opt) { switch (opt) { case LINUX_IP_TOS: return (IP_TOS); case LINUX_IP_TTL: return (IP_TTL); case LINUX_IP_OPTIONS: return (IP_OPTIONS); case LINUX_IP_MULTICAST_IF: return (IP_MULTICAST_IF); case LINUX_IP_MULTICAST_TTL: return (IP_MULTICAST_TTL); case LINUX_IP_MULTICAST_LOOP: return (IP_MULTICAST_LOOP); case LINUX_IP_ADD_MEMBERSHIP: return (IP_ADD_MEMBERSHIP); case LINUX_IP_DROP_MEMBERSHIP: return (IP_DROP_MEMBERSHIP); case LINUX_IP_HDRINCL: return (IP_HDRINCL); } return (-1); } static int linux_to_bsd_ip6_sockopt(int opt) { switch (opt) { case LINUX_IPV6_NEXTHOP: return (IPV6_NEXTHOP); case LINUX_IPV6_UNICAST_HOPS: 
return (IPV6_UNICAST_HOPS); case LINUX_IPV6_MULTICAST_IF: return (IPV6_MULTICAST_IF); case LINUX_IPV6_MULTICAST_HOPS: return (IPV6_MULTICAST_HOPS); case LINUX_IPV6_MULTICAST_LOOP: return (IPV6_MULTICAST_LOOP); case LINUX_IPV6_ADD_MEMBERSHIP: return (IPV6_JOIN_GROUP); case LINUX_IPV6_DROP_MEMBERSHIP: return (IPV6_LEAVE_GROUP); case LINUX_IPV6_V6ONLY: return (IPV6_V6ONLY); case LINUX_IPV6_DONTFRAG: return (IPV6_DONTFRAG); #if 0 case LINUX_IPV6_CHECKSUM: return (IPV6_CHECKSUM); case LINUX_IPV6_RECVPKTINFO: return (IPV6_RECVPKTINFO); case LINUX_IPV6_PKTINFO: return (IPV6_PKTINFO); case LINUX_IPV6_RECVHOPLIMIT: return (IPV6_RECVHOPLIMIT); case LINUX_IPV6_HOPLIMIT: return (IPV6_HOPLIMIT); case LINUX_IPV6_RECVHOPOPTS: return (IPV6_RECVHOPOPTS); case LINUX_IPV6_HOPOPTS: return (IPV6_HOPOPTS); case LINUX_IPV6_RTHDRDSTOPTS: return (IPV6_RTHDRDSTOPTS); case LINUX_IPV6_RECVRTHDR: return (IPV6_RECVRTHDR); case LINUX_IPV6_RTHDR: return (IPV6_RTHDR); case LINUX_IPV6_RECVDSTOPTS: return (IPV6_RECVDSTOPTS); case LINUX_IPV6_DSTOPTS: return (IPV6_DSTOPTS); case LINUX_IPV6_RECVPATHMTU: return (IPV6_RECVPATHMTU); case LINUX_IPV6_PATHMTU: return (IPV6_PATHMTU); #endif } return (-1); } static int linux_to_bsd_so_sockopt(int opt) { switch (opt) { case LINUX_SO_DEBUG: return (SO_DEBUG); case LINUX_SO_REUSEADDR: return (SO_REUSEADDR); case LINUX_SO_TYPE: return (SO_TYPE); case LINUX_SO_ERROR: return (SO_ERROR); case LINUX_SO_DONTROUTE: return (SO_DONTROUTE); case LINUX_SO_BROADCAST: return (SO_BROADCAST); case LINUX_SO_SNDBUF: return (SO_SNDBUF); case LINUX_SO_RCVBUF: return (SO_RCVBUF); case LINUX_SO_KEEPALIVE: return (SO_KEEPALIVE); case LINUX_SO_OOBINLINE: return (SO_OOBINLINE); case LINUX_SO_LINGER: return (SO_LINGER); case LINUX_SO_PEERCRED: return (LOCAL_PEERCRED); case LINUX_SO_RCVLOWAT: return (SO_RCVLOWAT); case LINUX_SO_SNDLOWAT: return (SO_SNDLOWAT); case LINUX_SO_RCVTIMEO: return (SO_RCVTIMEO); case LINUX_SO_SNDTIMEO: return (SO_SNDTIMEO); case LINUX_SO_TIMESTAMP: return 
(SO_TIMESTAMP); case LINUX_SO_ACCEPTCONN: return (SO_ACCEPTCONN); } return (-1); } static int linux_to_bsd_tcp_sockopt(int opt) { switch (opt) { case LINUX_TCP_NODELAY: return (TCP_NODELAY); case LINUX_TCP_MAXSEG: return (TCP_MAXSEG); case LINUX_TCP_KEEPIDLE: return (TCP_KEEPIDLE); case LINUX_TCP_KEEPINTVL: return (TCP_KEEPINTVL); case LINUX_TCP_KEEPCNT: return (TCP_KEEPCNT); case LINUX_TCP_MD5SIG: return (TCP_MD5SIG); } return (-1); } static int linux_to_bsd_msg_flags(int flags) { int ret_flags = 0; if (flags & LINUX_MSG_OOB) ret_flags |= MSG_OOB; if (flags & LINUX_MSG_PEEK) ret_flags |= MSG_PEEK; if (flags & LINUX_MSG_DONTROUTE) ret_flags |= MSG_DONTROUTE; if (flags & LINUX_MSG_CTRUNC) ret_flags |= MSG_CTRUNC; if (flags & LINUX_MSG_TRUNC) ret_flags |= MSG_TRUNC; if (flags & LINUX_MSG_DONTWAIT) ret_flags |= MSG_DONTWAIT; if (flags & LINUX_MSG_EOR) ret_flags |= MSG_EOR; if (flags & LINUX_MSG_WAITALL) ret_flags |= MSG_WAITALL; if (flags & LINUX_MSG_NOSIGNAL) ret_flags |= MSG_NOSIGNAL; #if 0 /* not handled */ if (flags & LINUX_MSG_PROXY) ; if (flags & LINUX_MSG_FIN) ; if (flags & LINUX_MSG_SYN) ; if (flags & LINUX_MSG_CONFIRM) ; if (flags & LINUX_MSG_RST) ; if (flags & LINUX_MSG_ERRQUEUE) ; #endif return (ret_flags); } /* * If bsd_to_linux_sockaddr() or linux_to_bsd_sockaddr() faults, then the * native syscall will fault. Thus, we don't really need to check the * return values for these functions. 
*/ static int bsd_to_linux_sockaddr(struct sockaddr *arg) { struct sockaddr sa; size_t sa_len = sizeof(struct sockaddr); int error, bdom; if ((error = copyin(arg, &sa, sa_len))) return (error); bdom = bsd_to_linux_domain(sa.sa_family); if (bdom == -1) return (EAFNOSUPPORT); *(u_short *)&sa = bdom; return (copyout(&sa, arg, sa_len)); } static int linux_to_bsd_sockaddr(struct sockaddr *arg, int len) { struct sockaddr sa; size_t sa_len = sizeof(struct sockaddr); int error, bdom; if ((error = copyin(arg, &sa, sa_len))) return (error); bdom = linux_to_bsd_domain(*(sa_family_t *)&sa); if (bdom == -1) return (EAFNOSUPPORT); sa.sa_family = bdom; sa.sa_len = len; return (copyout(&sa, arg, sa_len)); } static int linux_sa_put(struct osockaddr *osa) { struct osockaddr sa; int error, bdom; /* * Only read/write the osockaddr family part, the rest is * not changed. */ error = copyin(osa, &sa, sizeof(sa.sa_family)); if (error) return (error); bdom = bsd_to_linux_domain(sa.sa_family); if (bdom == -1) return (EINVAL); sa.sa_family = bdom; return (copyout(&sa, osa, sizeof(sa.sa_family))); } static int linux_to_bsd_cmsg_type(int cmsg_type) { switch (cmsg_type) { case LINUX_SCM_RIGHTS: return (SCM_RIGHTS); case LINUX_SCM_CREDENTIALS: return (SCM_CREDS); } return (-1); } static int bsd_to_linux_cmsg_type(int cmsg_type) { switch (cmsg_type) { case SCM_RIGHTS: return (LINUX_SCM_RIGHTS); case SCM_CREDS: return (LINUX_SCM_CREDENTIALS); case SCM_TIMESTAMP: return (LINUX_SCM_TIMESTAMP); } return (-1); } static int linux_to_bsd_msghdr(struct msghdr *bhdr, const struct l_msghdr *lhdr) { if (lhdr->msg_controllen > INT_MAX) return (ENOBUFS); bhdr->msg_name = PTRIN(lhdr->msg_name); bhdr->msg_namelen = lhdr->msg_namelen; bhdr->msg_iov = PTRIN(lhdr->msg_iov); bhdr->msg_iovlen = lhdr->msg_iovlen; bhdr->msg_control = PTRIN(lhdr->msg_control); /* * msg_controllen is skipped since BSD and LINUX control messages * are potentially different sizes (e.g. 
the cred structure used * by SCM_CREDS is different between the two operating system). * * The caller can set it (if necessary) after converting all the * control messages. */ bhdr->msg_flags = linux_to_bsd_msg_flags(lhdr->msg_flags); return (0); } static int bsd_to_linux_msghdr(const struct msghdr *bhdr, struct l_msghdr *lhdr) { lhdr->msg_name = PTROUT(bhdr->msg_name); lhdr->msg_namelen = bhdr->msg_namelen; lhdr->msg_iov = PTROUT(bhdr->msg_iov); lhdr->msg_iovlen = bhdr->msg_iovlen; lhdr->msg_control = PTROUT(bhdr->msg_control); /* * msg_controllen is skipped since BSD and LINUX control messages * are potentially different sizes (e.g. the cred structure used * by SCM_CREDS is different between the two operating system). * * The caller can set it (if necessary) after converting all the * control messages. */ /* msg_flags skipped */ return (0); } static int linux_set_socket_flags(int lflags, int *flags) { if (lflags & ~(LINUX_SOCK_CLOEXEC | LINUX_SOCK_NONBLOCK)) return (EINVAL); if (lflags & LINUX_SOCK_NONBLOCK) *flags |= SOCK_NONBLOCK; if (lflags & LINUX_SOCK_CLOEXEC) *flags |= SOCK_CLOEXEC; return (0); } static int linux_sendit(struct thread *td, int s, struct msghdr *mp, int flags, struct mbuf *control, enum uio_seg segflg) { struct sockaddr *to; int error; if (mp->msg_name != NULL) { error = linux_getsockaddr(&to, mp->msg_name, mp->msg_namelen); if (error) return (error); mp->msg_name = to; } else to = NULL; error = kern_sendit(td, s, mp, linux_to_bsd_msg_flags(flags), control, segflg); if (to) free(to, M_SONAME); return (error); } /* Return 0 if IP_HDRINCL is set for the given socket. */ static int linux_check_hdrincl(struct thread *td, int s) { int error, optval; socklen_t size_val; size_val = sizeof(optval); error = kern_getsockopt(td, s, IPPROTO_IP, IP_HDRINCL, &optval, UIO_SYSSPACE, &size_val); if (error) return (error); return (optval == 0); } /* * Updated sendto() when IP_HDRINCL is set: * tweak endian-dependent fields in the IP packet. 
*/ static int linux_sendto_hdrincl(struct thread *td, struct linux_sendto_args *linux_args) { /* * linux_ip_copysize defines how many bytes we should copy * from the beginning of the IP packet before we customize it for BSD. * It should include all the fields we modify (ip_len and ip_off). */ #define linux_ip_copysize 8 struct ip *packet; struct msghdr msg; struct iovec aiov[1]; int error; /* Check that the packet isn't too big or too small. */ if (linux_args->len < linux_ip_copysize || linux_args->len > IP_MAXPACKET) return (EINVAL); packet = (struct ip *)malloc(linux_args->len, M_LINUX, M_WAITOK); /* Make kernel copy of the packet to be sent */ if ((error = copyin(PTRIN(linux_args->msg), packet, linux_args->len))) goto goout; /* Convert fields from Linux to BSD raw IP socket format */ packet->ip_len = linux_args->len; packet->ip_off = ntohs(packet->ip_off); /* Prepare the msghdr and iovec structures describing the new packet */ msg.msg_name = PTRIN(linux_args->to); msg.msg_namelen = linux_args->tolen; msg.msg_iov = aiov; msg.msg_iovlen = 1; msg.msg_control = NULL; msg.msg_flags = 0; aiov[0].iov_base = (char *)packet; aiov[0].iov_len = linux_args->len; error = linux_sendit(td, linux_args->s, &msg, linux_args->flags, NULL, UIO_SYSSPACE); goout: free(packet, M_LINUX); return (error); } int linux_socket(struct thread *td, struct linux_socket_args *args) { struct socket_args /* { int domain; int type; int protocol; } */ bsd_args; int retval_socket; bsd_args.protocol = args->protocol; bsd_args.type = args->type & LINUX_SOCK_TYPE_MASK; if (bsd_args.type < 0 || bsd_args.type > LINUX_SOCK_MAX) return (EINVAL); retval_socket = linux_set_socket_flags(args->type & ~LINUX_SOCK_TYPE_MASK, &bsd_args.type); if (retval_socket != 0) return (retval_socket); bsd_args.domain = linux_to_bsd_domain(args->domain); if (bsd_args.domain == -1) return (EAFNOSUPPORT); retval_socket = sys_socket(td, &bsd_args); if (retval_socket) return (retval_socket); if (bsd_args.type == SOCK_RAW && 
(bsd_args.protocol == IPPROTO_RAW || bsd_args.protocol == 0) && bsd_args.domain == PF_INET) { /* It's a raw IP socket: set the IP_HDRINCL option. */ int hdrincl; hdrincl = 1; /* We ignore any error returned by kern_setsockopt() */ kern_setsockopt(td, td->td_retval[0], IPPROTO_IP, IP_HDRINCL, &hdrincl, UIO_SYSSPACE, sizeof(hdrincl)); } #ifdef INET6 /* * Linux AF_INET6 socket has IPV6_V6ONLY setsockopt set to 0 by default * and some apps depend on this. So, set V6ONLY to 0 for Linux apps. * For simplicity we do this unconditionally of the net.inet6.ip6.v6only * sysctl value. */ if (bsd_args.domain == PF_INET6) { int v6only; v6only = 0; /* We ignore any error returned by setsockopt() */ kern_setsockopt(td, td->td_retval[0], IPPROTO_IPV6, IPV6_V6ONLY, &v6only, UIO_SYSSPACE, sizeof(v6only)); } #endif return (retval_socket); } int linux_bind(struct thread *td, struct linux_bind_args *args) { struct sockaddr *sa; int error; error = linux_getsockaddr(&sa, PTRIN(args->name), args->namelen); if (error) return (error); error = kern_bindat(td, AT_FDCWD, args->s, sa); free(sa, M_SONAME); if (error == EADDRNOTAVAIL && args->namelen != sizeof(struct sockaddr_in)) return (EINVAL); return (error); } int linux_connect(struct thread *td, struct linux_connect_args *args) { cap_rights_t rights; struct socket *so; struct sockaddr *sa; u_int fflag; int error; error = linux_getsockaddr(&sa, (struct osockaddr *)PTRIN(args->name), args->namelen); if (error) return (error); error = kern_connectat(td, AT_FDCWD, args->s, sa); free(sa, M_SONAME); if (error != EISCONN) return (error); /* * Linux doesn't return EISCONN the first time it occurs, * when on a non-blocking socket. Instead it returns the * error getsockopt(SOL_SOCKET, SO_ERROR) would return on BSD. * * XXXRW: Instead of using fgetsock(), check that it is a * socket and use the file descriptor reference instead of * creating a new one. 
*/ error = fgetsock(td, args->s, cap_rights_init(&rights, CAP_CONNECT), &so, &fflag); if (error == 0) { error = EISCONN; if (fflag & FNONBLOCK) { SOCK_LOCK(so); if (so->so_emuldata == 0) error = so->so_error; so->so_emuldata = (void *)1; SOCK_UNLOCK(so); } fputsock(so); } return (error); } int linux_listen(struct thread *td, struct linux_listen_args *args) { struct listen_args /* { int s; int backlog; } */ bsd_args; bsd_args.s = args->s; bsd_args.backlog = args->backlog; return (sys_listen(td, &bsd_args)); } static int linux_accept_common(struct thread *td, int s, l_uintptr_t addr, l_uintptr_t namelen, int flags) { struct accept4_args /* { int s; struct sockaddr * __restrict name; socklen_t * __restrict anamelen; int flags; } */ bsd_args; cap_rights_t rights; struct socket *so; struct file *fp; int error, error1; bsd_args.s = s; /* XXX: */ bsd_args.name = (struct sockaddr * __restrict)PTRIN(addr); bsd_args.anamelen = PTRIN(namelen);/* XXX */ bsd_args.flags = 0; error = linux_set_socket_flags(flags, &bsd_args.flags); if (error != 0) return (error); error = sys_accept4(td, &bsd_args); bsd_to_linux_sockaddr((struct sockaddr *)bsd_args.name); if (error) { if (error == EFAULT && namelen != sizeof(struct sockaddr_in)) return (EINVAL); if (error == EINVAL) { - error1 = getsock_cap(td, s, &rights, &fp, NULL); + error1 = getsock_cap(td, s, &rights, &fp, NULL, NULL); if (error1 != 0) return (error1); so = fp->f_data; if (so->so_type == SOCK_DGRAM) { fdrop(fp, td); return (EOPNOTSUPP); } fdrop(fp, td); } return (error); } if (addr) error = linux_sa_put(PTRIN(addr)); if (error) { (void)kern_close(td, td->td_retval[0]); td->td_retval[0] = 0; } return (error); } int linux_accept(struct thread *td, struct linux_accept_args *args) { return (linux_accept_common(td, args->s, args->addr, args->namelen, 0)); } int linux_accept4(struct thread *td, struct linux_accept4_args *args) { return (linux_accept_common(td, args->s, args->addr, args->namelen, args->flags)); } int 
linux_getsockname(struct thread *td, struct linux_getsockname_args *args) { struct getsockname_args /* { int fdes; struct sockaddr * __restrict asa; socklen_t * __restrict alen; } */ bsd_args; int error; bsd_args.fdes = args->s; /* XXX: */ bsd_args.asa = (struct sockaddr * __restrict)PTRIN(args->addr); bsd_args.alen = PTRIN(args->namelen); /* XXX */ error = sys_getsockname(td, &bsd_args); bsd_to_linux_sockaddr((struct sockaddr *)bsd_args.asa); if (error) return (error); return (linux_sa_put(PTRIN(args->addr))); } int linux_getpeername(struct thread *td, struct linux_getpeername_args *args) { struct getpeername_args /* { int fdes; caddr_t asa; int *alen; } */ bsd_args; int error; bsd_args.fdes = args->s; bsd_args.asa = (struct sockaddr *)PTRIN(args->addr); bsd_args.alen = (socklen_t *)PTRIN(args->namelen); error = sys_getpeername(td, &bsd_args); bsd_to_linux_sockaddr((struct sockaddr *)bsd_args.asa); if (error) return (error); return (linux_sa_put(PTRIN(args->addr))); } int linux_socketpair(struct thread *td, struct linux_socketpair_args *args) { struct socketpair_args /* { int domain; int type; int protocol; int *rsv; } */ bsd_args; int error; bsd_args.domain = linux_to_bsd_domain(args->domain); if (bsd_args.domain != PF_LOCAL) return (EAFNOSUPPORT); bsd_args.type = args->type & LINUX_SOCK_TYPE_MASK; if (bsd_args.type < 0 || bsd_args.type > LINUX_SOCK_MAX) return (EINVAL); error = linux_set_socket_flags(args->type & ~LINUX_SOCK_TYPE_MASK, &bsd_args.type); if (error != 0) return (error); if (args->protocol != 0 && args->protocol != PF_UNIX) /* * Use of PF_UNIX as protocol argument is not right, * but Linux does it. * Do not map PF_UNIX as its Linux value is identical * to FreeBSD one. 
*/ return (EPROTONOSUPPORT); else bsd_args.protocol = 0; bsd_args.rsv = (int *)PTRIN(args->rsv); return (sys_socketpair(td, &bsd_args)); } #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32)) struct linux_send_args { int s; l_uintptr_t msg; int len; int flags; }; static int linux_send(struct thread *td, struct linux_send_args *args) { struct sendto_args /* { int s; caddr_t buf; int len; int flags; caddr_t to; int tolen; } */ bsd_args; bsd_args.s = args->s; bsd_args.buf = (caddr_t)PTRIN(args->msg); bsd_args.len = args->len; bsd_args.flags = args->flags; bsd_args.to = NULL; bsd_args.tolen = 0; return (sys_sendto(td, &bsd_args)); } struct linux_recv_args { int s; l_uintptr_t msg; int len; int flags; }; static int linux_recv(struct thread *td, struct linux_recv_args *args) { struct recvfrom_args /* { int s; caddr_t buf; int len; int flags; struct sockaddr *from; socklen_t fromlenaddr; } */ bsd_args; bsd_args.s = args->s; bsd_args.buf = (caddr_t)PTRIN(args->msg); bsd_args.len = args->len; bsd_args.flags = linux_to_bsd_msg_flags(args->flags); bsd_args.from = NULL; bsd_args.fromlenaddr = 0; return (sys_recvfrom(td, &bsd_args)); } #endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */ int linux_sendto(struct thread *td, struct linux_sendto_args *args) { struct msghdr msg; struct iovec aiov; if (linux_check_hdrincl(td, args->s) == 0) /* IP_HDRINCL set, tweak the packet before sending */ return (linux_sendto_hdrincl(td, args)); msg.msg_name = PTRIN(args->to); msg.msg_namelen = args->tolen; msg.msg_iov = &aiov; msg.msg_iovlen = 1; msg.msg_control = NULL; msg.msg_flags = 0; aiov.iov_base = PTRIN(args->msg); aiov.iov_len = args->len; return (linux_sendit(td, args->s, &msg, args->flags, NULL, UIO_USERSPACE)); } int linux_recvfrom(struct thread *td, struct linux_recvfrom_args *args) { struct msghdr msg; struct iovec aiov; int error, fromlen; if (PTRIN(args->fromlen) != NULL) { error = copyin(PTRIN(args->fromlen), &fromlen, sizeof(fromlen)); if (error != 0) 
return (error); if (fromlen < 0) return (EINVAL); msg.msg_namelen = fromlen; } else msg.msg_namelen = 0; msg.msg_name = (struct sockaddr * __restrict)PTRIN(args->from); msg.msg_iov = &aiov; msg.msg_iovlen = 1; aiov.iov_base = PTRIN(args->buf); aiov.iov_len = args->len; msg.msg_control = 0; msg.msg_flags = linux_to_bsd_msg_flags(args->flags); error = kern_recvit(td, args->s, &msg, UIO_USERSPACE, NULL); if (error != 0) return (error); if (PTRIN(args->from) != NULL) { error = bsd_to_linux_sockaddr((struct sockaddr *) PTRIN(args->from)); if (error != 0) return (error); error = linux_sa_put((struct osockaddr *) PTRIN(args->from)); } if (PTRIN(args->fromlen) != NULL) error = copyout(&msg.msg_namelen, PTRIN(args->fromlen), sizeof(msg.msg_namelen)); return (error); } static int linux_sendmsg_common(struct thread *td, l_int s, struct l_msghdr *msghdr, l_uint flags) { struct cmsghdr *cmsg; struct cmsgcred cmcred; struct mbuf *control; struct msghdr msg; struct l_cmsghdr linux_cmsg; struct l_cmsghdr *ptr_cmsg; struct l_msghdr linux_msg; struct iovec *iov; socklen_t datalen; struct sockaddr *sa; sa_family_t sa_family; void *data; int error; error = copyin(msghdr, &linux_msg, sizeof(linux_msg)); if (error != 0) return (error); /* * Some Linux applications (ping) define a non-NULL control data * pointer, but a msg_controllen of 0, which is not allowed in the * FreeBSD system call interface. NULL the msg_control pointer in * order to handle this case. This should be checked, but allows the * Linux ping to work. 
*/ if (PTRIN(linux_msg.msg_control) != NULL && linux_msg.msg_controllen == 0) linux_msg.msg_control = PTROUT(NULL); error = linux_to_bsd_msghdr(&msg, &linux_msg); if (error != 0) return (error); #ifdef COMPAT_LINUX32 error = linux32_copyiniov(PTRIN(msg.msg_iov), msg.msg_iovlen, &iov, EMSGSIZE); #else error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE); #endif if (error != 0) return (error); control = NULL; cmsg = NULL; if ((ptr_cmsg = LINUX_CMSG_FIRSTHDR(&linux_msg)) != NULL) { error = kern_getsockname(td, s, &sa, &datalen); if (error != 0) goto bad; sa_family = sa->sa_family; free(sa, M_SONAME); error = ENOBUFS; cmsg = malloc(CMSG_HDRSZ, M_LINUX, M_WAITOK|M_ZERO); control = m_get(M_WAITOK, MT_CONTROL); do { error = copyin(ptr_cmsg, &linux_cmsg, sizeof(struct l_cmsghdr)); if (error != 0) goto bad; error = EINVAL; if (linux_cmsg.cmsg_len < sizeof(struct l_cmsghdr)) goto bad; /* * Now we support only SCM_RIGHTS and SCM_CRED, * so return EINVAL in any other cmsg_type */ cmsg->cmsg_type = linux_to_bsd_cmsg_type(linux_cmsg.cmsg_type); cmsg->cmsg_level = linux_to_bsd_sockopt_level(linux_cmsg.cmsg_level); if (cmsg->cmsg_type == -1 || cmsg->cmsg_level != SOL_SOCKET) goto bad; /* * Some applications (e.g. pulseaudio) attempt to * send ancillary data even if the underlying protocol * doesn't support it which is not allowed in the * FreeBSD system call interface. 
*/ if (sa_family != AF_UNIX) continue; data = LINUX_CMSG_DATA(ptr_cmsg); datalen = linux_cmsg.cmsg_len - L_CMSG_HDRSZ; switch (cmsg->cmsg_type) { case SCM_RIGHTS: break; case SCM_CREDS: data = &cmcred; datalen = sizeof(cmcred); /* * The lower levels will fill in the structure */ bzero(data, datalen); break; } cmsg->cmsg_len = CMSG_LEN(datalen); error = ENOBUFS; if (!m_append(control, CMSG_HDRSZ, (c_caddr_t)cmsg)) goto bad; if (!m_append(control, datalen, (c_caddr_t)data)) goto bad; } while ((ptr_cmsg = LINUX_CMSG_NXTHDR(&linux_msg, ptr_cmsg))); if (m_length(control, NULL) == 0) { m_freem(control); control = NULL; } } msg.msg_iov = iov; msg.msg_flags = 0; error = linux_sendit(td, s, &msg, flags, control, UIO_USERSPACE); control = NULL; bad: m_freem(control); free(iov, M_IOV); if (cmsg) free(cmsg, M_LINUX); return (error); } int linux_sendmsg(struct thread *td, struct linux_sendmsg_args *args) { return (linux_sendmsg_common(td, args->s, PTRIN(args->msg), args->flags)); } int linux_sendmmsg(struct thread *td, struct linux_sendmmsg_args *args) { struct l_mmsghdr *msg; l_uint retval; int error, datagrams; if (args->vlen > UIO_MAXIOV) args->vlen = UIO_MAXIOV; msg = PTRIN(args->msg); datagrams = 0; while (datagrams < args->vlen) { error = linux_sendmsg_common(td, args->s, &msg->msg_hdr, args->flags); if (error != 0) break; retval = td->td_retval[0]; error = copyout(&retval, &msg->msg_len, sizeof(msg->msg_len)); if (error != 0) break; ++msg; ++datagrams; } if (error == 0) td->td_retval[0] = datagrams; return (error); } static int linux_recvmsg_common(struct thread *td, l_int s, struct l_msghdr *msghdr, l_uint flags, struct msghdr *msg) { struct cmsghdr *cm; struct cmsgcred *cmcred; struct l_cmsghdr *linux_cmsg = NULL; struct l_ucred linux_ucred; socklen_t datalen, outlen; struct l_msghdr linux_msg; struct iovec *iov, *uiov; struct mbuf *control = NULL; struct mbuf **controlp; struct timeval *ftmvl; l_timeval ltmvl; caddr_t outbuf; void *data; int error, i, fd, fds, *fdp; 
error = copyin(msghdr, &linux_msg, sizeof(linux_msg)); if (error != 0) return (error); error = linux_to_bsd_msghdr(msg, &linux_msg); if (error != 0) return (error); #ifdef COMPAT_LINUX32 error = linux32_copyiniov(PTRIN(msg->msg_iov), msg->msg_iovlen, &iov, EMSGSIZE); #else error = copyiniov(msg->msg_iov, msg->msg_iovlen, &iov, EMSGSIZE); #endif if (error != 0) return (error); if (msg->msg_name) { error = linux_to_bsd_sockaddr((struct sockaddr *)msg->msg_name, msg->msg_namelen); if (error != 0) goto bad; } uiov = msg->msg_iov; msg->msg_iov = iov; controlp = (msg->msg_control != NULL) ? &control : NULL; error = kern_recvit(td, s, msg, UIO_USERSPACE, controlp); msg->msg_iov = uiov; if (error != 0) goto bad; error = bsd_to_linux_msghdr(msg, &linux_msg); if (error != 0) goto bad; if (linux_msg.msg_name) { error = bsd_to_linux_sockaddr((struct sockaddr *) PTRIN(linux_msg.msg_name)); if (error != 0) goto bad; } if (linux_msg.msg_name && linux_msg.msg_namelen > 2) { error = linux_sa_put(PTRIN(linux_msg.msg_name)); if (error != 0) goto bad; } outbuf = PTRIN(linux_msg.msg_control); outlen = 0; if (control) { linux_cmsg = malloc(L_CMSG_HDRSZ, M_LINUX, M_WAITOK | M_ZERO); msg->msg_control = mtod(control, struct cmsghdr *); msg->msg_controllen = control->m_len; cm = CMSG_FIRSTHDR(msg); while (cm != NULL) { linux_cmsg->cmsg_type = bsd_to_linux_cmsg_type(cm->cmsg_type); linux_cmsg->cmsg_level = bsd_to_linux_sockopt_level(cm->cmsg_level); if (linux_cmsg->cmsg_type == -1 || cm->cmsg_level != SOL_SOCKET) { error = EINVAL; goto bad; } data = CMSG_DATA(cm); datalen = (caddr_t)cm + cm->cmsg_len - (caddr_t)data; switch (cm->cmsg_type) { case SCM_RIGHTS: if (flags & LINUX_MSG_CMSG_CLOEXEC) { fds = datalen / sizeof(int); fdp = data; for (i = 0; i < fds; i++) { fd = *fdp++; (void)kern_fcntl(td, fd, F_SETFD, FD_CLOEXEC); } } break; case SCM_CREDS: /* * Currently LOCAL_CREDS is never in * effect for Linux so no need to worry * about sockcred */ if (datalen != sizeof(*cmcred)) { error = 
EMSGSIZE; goto bad; } cmcred = (struct cmsgcred *)data; bzero(&linux_ucred, sizeof(linux_ucred)); linux_ucred.pid = cmcred->cmcred_pid; linux_ucred.uid = cmcred->cmcred_uid; linux_ucred.gid = cmcred->cmcred_gid; data = &linux_ucred; datalen = sizeof(linux_ucred); break; case SCM_TIMESTAMP: if (datalen != sizeof(struct timeval)) { error = EMSGSIZE; goto bad; } ftmvl = (struct timeval *)data; ltmvl.tv_sec = ftmvl->tv_sec; ltmvl.tv_usec = ftmvl->tv_usec; data = &ltmvl; datalen = sizeof(ltmvl); break; } if (outlen + LINUX_CMSG_LEN(datalen) > linux_msg.msg_controllen) { if (outlen == 0) { error = EMSGSIZE; goto bad; } else { linux_msg.msg_flags |= LINUX_MSG_CTRUNC; goto out; } } linux_cmsg->cmsg_len = LINUX_CMSG_LEN(datalen); error = copyout(linux_cmsg, outbuf, L_CMSG_HDRSZ); if (error) goto bad; outbuf += L_CMSG_HDRSZ; error = copyout(data, outbuf, datalen); if (error) goto bad; outbuf += LINUX_CMSG_ALIGN(datalen); outlen += LINUX_CMSG_LEN(datalen); cm = CMSG_NXTHDR(msg, cm); } } out: linux_msg.msg_controllen = outlen; error = copyout(&linux_msg, msghdr, sizeof(linux_msg)); bad: free(iov, M_IOV); m_freem(control); free(linux_cmsg, M_LINUX); return (error); } int linux_recvmsg(struct thread *td, struct linux_recvmsg_args *args) { struct msghdr bsd_msg; return (linux_recvmsg_common(td, args->s, PTRIN(args->msg), args->flags, &bsd_msg)); } int linux_recvmmsg(struct thread *td, struct linux_recvmmsg_args *args) { struct l_mmsghdr *msg; struct msghdr bsd_msg; struct l_timespec lts; struct timespec ts, tts; l_uint retval; int error, datagrams; if (args->timeout) { error = copyin(args->timeout, &lts, sizeof(struct l_timespec)); if (error != 0) return (error); error = linux_to_native_timespec(&ts, &lts); if (error != 0) return (error); getnanotime(&tts); timespecadd(&tts, &ts); } msg = PTRIN(args->msg); datagrams = 0; while (datagrams < args->vlen) { error = linux_recvmsg_common(td, args->s, &msg->msg_hdr, args->flags & ~LINUX_MSG_WAITFORONE, &bsd_msg); if (error != 0) break; retval 
= td->td_retval[0]; error = copyout(&retval, &msg->msg_len, sizeof(msg->msg_len)); if (error != 0) break; ++msg; ++datagrams; /* * MSG_WAITFORONE turns on MSG_DONTWAIT after one packet. */ if (args->flags & LINUX_MSG_WAITFORONE) args->flags |= LINUX_MSG_DONTWAIT; /* * See BUGS section of recvmmsg(2). */ if (args->timeout) { getnanotime(&ts); timespecsub(&ts, &tts); if (!timespecisset(&ts) || ts.tv_sec > 0) break; } /* Out of band data, return right away. */ if (bsd_msg.msg_flags & MSG_OOB) break; } if (error == 0) td->td_retval[0] = datagrams; return (error); } int linux_shutdown(struct thread *td, struct linux_shutdown_args *args) { struct shutdown_args /* { int s; int how; } */ bsd_args; bsd_args.s = args->s; bsd_args.how = args->how; return (sys_shutdown(td, &bsd_args)); } int linux_setsockopt(struct thread *td, struct linux_setsockopt_args *args) { struct setsockopt_args /* { int s; int level; int name; caddr_t val; int valsize; } */ bsd_args; l_timeval linux_tv; struct timeval tv; int error, name; bsd_args.s = args->s; bsd_args.level = linux_to_bsd_sockopt_level(args->level); switch (bsd_args.level) { case SOL_SOCKET: name = linux_to_bsd_so_sockopt(args->optname); switch (name) { case SO_RCVTIMEO: /* FALLTHROUGH */ case SO_SNDTIMEO: error = copyin(PTRIN(args->optval), &linux_tv, sizeof(linux_tv)); if (error) return (error); tv.tv_sec = linux_tv.tv_sec; tv.tv_usec = linux_tv.tv_usec; return (kern_setsockopt(td, args->s, bsd_args.level, name, &tv, UIO_SYSSPACE, sizeof(tv))); /* NOTREACHED */ break; default: break; } break; case IPPROTO_IP: name = linux_to_bsd_ip_sockopt(args->optname); break; case IPPROTO_IPV6: name = linux_to_bsd_ip6_sockopt(args->optname); break; case IPPROTO_TCP: name = linux_to_bsd_tcp_sockopt(args->optname); break; default: name = -1; break; } if (name == -1) return (ENOPROTOOPT); bsd_args.name = name; bsd_args.val = PTRIN(args->optval); bsd_args.valsize = args->optlen; if (name == IPV6_NEXTHOP) { linux_to_bsd_sockaddr((struct sockaddr 
*)bsd_args.val, bsd_args.valsize); error = sys_setsockopt(td, &bsd_args); bsd_to_linux_sockaddr((struct sockaddr *)bsd_args.val); } else error = sys_setsockopt(td, &bsd_args); return (error); } int linux_getsockopt(struct thread *td, struct linux_getsockopt_args *args) { struct getsockopt_args /* { int s; int level; int name; caddr_t val; int *avalsize; } */ bsd_args; l_timeval linux_tv; struct timeval tv; socklen_t tv_len, xulen, len; struct xucred xu; struct l_ucred lxu; int error, name, newval; bsd_args.s = args->s; bsd_args.level = linux_to_bsd_sockopt_level(args->level); switch (bsd_args.level) { case SOL_SOCKET: name = linux_to_bsd_so_sockopt(args->optname); switch (name) { case SO_RCVTIMEO: /* FALLTHROUGH */ case SO_SNDTIMEO: tv_len = sizeof(tv); error = kern_getsockopt(td, args->s, bsd_args.level, name, &tv, UIO_SYSSPACE, &tv_len); if (error) return (error); linux_tv.tv_sec = tv.tv_sec; linux_tv.tv_usec = tv.tv_usec; return (copyout(&linux_tv, PTRIN(args->optval), sizeof(linux_tv))); /* NOTREACHED */ break; case LOCAL_PEERCRED: if (args->optlen != sizeof(lxu)) return (EINVAL); xulen = sizeof(xu); error = kern_getsockopt(td, args->s, bsd_args.level, name, &xu, UIO_SYSSPACE, &xulen); if (error) return (error); /* * XXX Use 0 for pid as the FreeBSD does not cache peer pid. 
*/ lxu.pid = 0; lxu.uid = xu.cr_uid; lxu.gid = xu.cr_gid; return (copyout(&lxu, PTRIN(args->optval), sizeof(lxu))); /* NOTREACHED */ break; case SO_ERROR: len = sizeof(newval); error = kern_getsockopt(td, args->s, bsd_args.level, name, &newval, UIO_SYSSPACE, &len); if (error) return (error); newval = -SV_ABI_ERRNO(td->td_proc, newval); return (copyout(&newval, PTRIN(args->optval), len)); /* NOTREACHED */ default: break; } break; case IPPROTO_IP: name = linux_to_bsd_ip_sockopt(args->optname); break; case IPPROTO_IPV6: name = linux_to_bsd_ip6_sockopt(args->optname); break; case IPPROTO_TCP: name = linux_to_bsd_tcp_sockopt(args->optname); break; default: name = -1; break; } if (name == -1) return (EINVAL); bsd_args.name = name; bsd_args.val = PTRIN(args->optval); bsd_args.avalsize = PTRIN(args->optlen); if (name == IPV6_NEXTHOP) { error = sys_getsockopt(td, &bsd_args); bsd_to_linux_sockaddr((struct sockaddr *)bsd_args.val); } else error = sys_getsockopt(td, &bsd_args); return (error); } #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32)) /* Argument list sizes for linux_socketcall */ #define LINUX_AL(x) ((x) * sizeof(l_ulong)) static const unsigned char lxs_args[] = { LINUX_AL(0) /* unused*/, LINUX_AL(3) /* socket */, LINUX_AL(3) /* bind */, LINUX_AL(3) /* connect */, LINUX_AL(2) /* listen */, LINUX_AL(3) /* accept */, LINUX_AL(3) /* getsockname */, LINUX_AL(3) /* getpeername */, LINUX_AL(4) /* socketpair */, LINUX_AL(4) /* send */, LINUX_AL(4) /* recv */, LINUX_AL(6) /* sendto */, LINUX_AL(6) /* recvfrom */, LINUX_AL(2) /* shutdown */, LINUX_AL(5) /* setsockopt */, LINUX_AL(5) /* getsockopt */, LINUX_AL(3) /* sendmsg */, LINUX_AL(3) /* recvmsg */, LINUX_AL(4) /* accept4 */, LINUX_AL(5) /* recvmmsg */, LINUX_AL(4) /* sendmmsg */ }; #define LINUX_AL_SIZE (nitems(lxs_args) - 1) int linux_socketcall(struct thread *td, struct linux_socketcall_args *args) { l_ulong a[6]; void *arg; int error; if (args->what < LINUX_SOCKET || args->what > LINUX_AL_SIZE) 
return (EINVAL); error = copyin(PTRIN(args->args), a, lxs_args[args->what]); if (error) return (error); arg = a; switch (args->what) { case LINUX_SOCKET: return (linux_socket(td, arg)); case LINUX_BIND: return (linux_bind(td, arg)); case LINUX_CONNECT: return (linux_connect(td, arg)); case LINUX_LISTEN: return (linux_listen(td, arg)); case LINUX_ACCEPT: return (linux_accept(td, arg)); case LINUX_GETSOCKNAME: return (linux_getsockname(td, arg)); case LINUX_GETPEERNAME: return (linux_getpeername(td, arg)); case LINUX_SOCKETPAIR: return (linux_socketpair(td, arg)); case LINUX_SEND: return (linux_send(td, arg)); case LINUX_RECV: return (linux_recv(td, arg)); case LINUX_SENDTO: return (linux_sendto(td, arg)); case LINUX_RECVFROM: return (linux_recvfrom(td, arg)); case LINUX_SHUTDOWN: return (linux_shutdown(td, arg)); case LINUX_SETSOCKOPT: return (linux_setsockopt(td, arg)); case LINUX_GETSOCKOPT: return (linux_getsockopt(td, arg)); case LINUX_SENDMSG: return (linux_sendmsg(td, arg)); case LINUX_RECVMSG: return (linux_recvmsg(td, arg)); case LINUX_ACCEPT4: return (linux_accept4(td, arg)); case LINUX_RECVMMSG: return (linux_recvmmsg(td, arg)); case LINUX_SENDMMSG: return (linux_sendmmsg(td, arg)); } uprintf("LINUX: 'socket' typ=%d not implemented\n", args->what); return (ENOSYS); } #endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */ Index: head/sys/kern/kern_sendfile.c =================================================================== --- head/sys/kern/kern_sendfile.c (revision 306173) +++ head/sys/kern/kern_sendfile.c (revision 306174) @@ -1,1016 +1,1016 @@ /*- * Copyright (c) 2013-2015 Gleb Smirnoff * Copyright (c) 1998, David Greenman. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Structure describing a single sendfile(2) I/O, which may consist of * several underlying pager I/Os. * * The syscall context allocates the structure and initializes 'nios' * to 1. As sendfile_swapin() runs through pages and starts asynchronous * paging operations, it increments 'nios'. * * Every I/O completion calls sendfile_iodone(), which decrements the 'nios', * and the syscall also calls sendfile_iodone() after allocating all mbufs, * linking them and sending to socket. 
Whoever reaches zero 'nios' is * responsible to * call pru_ready on the socket, to notify it of readyness * of the data. */ struct sf_io { volatile u_int nios; u_int error; int npages; struct file *sock_fp; struct mbuf *m; vm_page_t pa[]; }; /* * Structure used to track requests with SF_SYNC flag. */ struct sendfile_sync { struct mtx mtx; struct cv cv; unsigned count; }; counter_u64_t sfstat[sizeof(struct sfstat) / sizeof(uint64_t)]; static void sfstat_init(const void *unused) { COUNTER_ARRAY_ALLOC(sfstat, sizeof(struct sfstat) / sizeof(uint64_t), M_WAITOK); } SYSINIT(sfstat, SI_SUB_MBUF, SI_ORDER_FIRST, sfstat_init, NULL); static int sfstat_sysctl(SYSCTL_HANDLER_ARGS) { struct sfstat s; COUNTER_ARRAY_COPY(sfstat, &s, sizeof(s) / sizeof(uint64_t)); if (req->newptr) COUNTER_ARRAY_ZERO(sfstat, sizeof(s) / sizeof(uint64_t)); return (SYSCTL_OUT(req, &s, sizeof(s))); } SYSCTL_PROC(_kern_ipc, OID_AUTO, sfstat, CTLTYPE_OPAQUE | CTLFLAG_RW, NULL, 0, sfstat_sysctl, "I", "sendfile statistics"); /* * Detach mapped page and release resources back to the system. Called * by mbuf(9) code when last reference to a page is freed. */ void sf_ext_free(void *arg1, void *arg2) { struct sf_buf *sf = arg1; struct sendfile_sync *sfs = arg2; vm_page_t pg = sf_buf_page(sf); sf_buf_free(sf); vm_page_lock(pg); /* * Check for the object going away on us. This can * happen since we don't hold a reference to it. * If so, we're responsible for freeing the page. */ if (vm_page_unwire(pg, PQ_INACTIVE) && pg->object == NULL) vm_page_free(pg); vm_page_unlock(pg); if (sfs != NULL) { mtx_lock(&sfs->mtx); KASSERT(sfs->count > 0, ("Sendfile sync botchup count == 0")); if (--sfs->count == 0) cv_signal(&sfs->cv); mtx_unlock(&sfs->mtx); } } /* * Same as above, but forces the page to be detached from the object * and go into free pool. 
*/ void sf_ext_free_nocache(void *arg1, void *arg2) { struct sf_buf *sf = arg1; struct sendfile_sync *sfs = arg2; vm_page_t pg = sf_buf_page(sf); sf_buf_free(sf); vm_page_lock(pg); if (vm_page_unwire(pg, PQ_NONE)) { vm_object_t obj; /* Try to free the page, but only if it is cheap to. */ if ((obj = pg->object) == NULL) vm_page_free(pg); else if (!vm_page_xbusied(pg) && VM_OBJECT_TRYWLOCK(obj)) { vm_page_free(pg); VM_OBJECT_WUNLOCK(obj); } else vm_page_deactivate(pg); } vm_page_unlock(pg); if (sfs != NULL) { mtx_lock(&sfs->mtx); KASSERT(sfs->count > 0, ("Sendfile sync botchup count == 0")); if (--sfs->count == 0) cv_signal(&sfs->cv); mtx_unlock(&sfs->mtx); } } /* * Helper function to calculate how much data to put into page i of n. * Only first and last pages are special. */ static inline off_t xfsize(int i, int n, off_t off, off_t len) { if (i == 0) return (omin(PAGE_SIZE - (off & PAGE_MASK), len)); if (i == n - 1 && ((off + len) & PAGE_MASK) > 0) return ((off + len) & PAGE_MASK); return (PAGE_SIZE); } /* * Helper function to get offset within object for i page. */ static inline vm_offset_t vmoff(int i, off_t off) { if (i == 0) return ((vm_offset_t)off); return (trunc_page(off + i * PAGE_SIZE)); } /* * Helper function used when allocation of a page or sf_buf failed. * Pretend as if we don't have enough space, subtract xfsize() of * all pages that failed. */ static inline void fixspace(int old, int new, off_t off, int *space) { KASSERT(old > new, ("%s: old %d new %d", __func__, old, new)); /* Subtract last one. */ *space -= xfsize(old - 1, old, off, *space); old--; if (new == old) /* There was only one page. */ return; /* Subtract first one. */ if (new == 0) { *space -= xfsize(0, old, off, *space); new++; } /* Rest of pages are full sized. */ *space -= (old - new) * PAGE_SIZE; KASSERT(*space >= 0, ("%s: space went backwards", __func__)); } /* * I/O completion callback. 
*/ static void sendfile_iodone(void *arg, vm_page_t *pg, int count, int error) { struct sf_io *sfio = arg; struct socket *so; for (int i = 0; i < count; i++) vm_page_xunbusy(pg[i]); if (error) sfio->error = error; if (!refcount_release(&sfio->nios)) return; so = sfio->sock_fp->f_data; if (sfio->error) { struct mbuf *m; /* * I/O operation failed. The state of data in the socket * is now inconsistent, and all what we can do is to tear * it down. Protocol abort method would tear down protocol * state, free all ready mbufs and detach not ready ones. * We will free the mbufs corresponding to this I/O manually. * * The socket would be marked with EIO and made available * for read, so that application receives EIO on next * syscall and eventually closes the socket. */ so->so_proto->pr_usrreqs->pru_abort(so); so->so_error = EIO; m = sfio->m; for (int i = 0; i < sfio->npages; i++) m = m_free(m); } else { CURVNET_SET(so->so_vnet); (void )(so->so_proto->pr_usrreqs->pru_ready)(so, sfio->m, sfio->npages); CURVNET_RESTORE(); } /* XXXGL: curthread */ fdrop(sfio->sock_fp, curthread); free(sfio, M_TEMP); } /* * Iterate through pages vector and request paging for non-valid pages. */ static int sendfile_swapin(vm_object_t obj, struct sf_io *sfio, off_t off, off_t len, int npages, int rhpages, int flags) { vm_page_t *pa = sfio->pa; int nios; nios = 0; flags = (flags & SF_NODISKIO) ? VM_ALLOC_NOWAIT : 0; /* * First grab all the pages and wire them. Note that we grab * only required pages. Readahead pages are dealt with later. */ VM_OBJECT_WLOCK(obj); for (int i = 0; i < npages; i++) { pa[i] = vm_page_grab(obj, OFF_TO_IDX(vmoff(i, off)), VM_ALLOC_WIRED | VM_ALLOC_NORMAL | flags); if (pa[i] == NULL) { npages = i; rhpages = 0; break; } } for (int i = 0; i < npages;) { int j, a, count, rv; /* Skip valid pages. 
*/ if (vm_page_is_valid(pa[i], vmoff(i, off) & PAGE_MASK, xfsize(i, npages, off, len))) { vm_page_xunbusy(pa[i]); SFSTAT_INC(sf_pages_valid); i++; continue; } /* * Now 'i' points to first invalid page, iterate further * to make 'j' point at first valid after a bunch of * invalid ones. */ for (j = i + 1; j < npages; j++) if (vm_page_is_valid(pa[j], vmoff(j, off) & PAGE_MASK, xfsize(j, npages, off, len))) { SFSTAT_INC(sf_pages_valid); break; } /* * Now we got region of invalid pages between 'i' and 'j'. * Check that they belong to pager. They may not be there, * which is a regular situation for shmem pager. For vnode * pager this happens only in case of sparse file. * * Important feature of vm_pager_has_page() is the hint * stored in 'a', about how many pages we can pagein after * this page in a single I/O. */ while (!vm_pager_has_page(obj, OFF_TO_IDX(vmoff(i, off)), NULL, &a) && i < j) { pmap_zero_page(pa[i]); pa[i]->valid = VM_PAGE_BITS_ALL; pa[i]->dirty = 0; vm_page_xunbusy(pa[i]); i++; } if (i == j) continue; /* * We want to pagein as many pages as possible, limited only * by the 'a' hint and actual request. * * We should not pagein into already valid page, thus if * 'j' didn't reach last page, trim by that page. * * When the pagein fulfils the request, also specify readahead. */ if (j < npages) a = min(a, j - i - 1); count = min(a + 1, npages - i); refcount_acquire(&sfio->nios); rv = vm_pager_get_pages_async(obj, pa + i, count, NULL, i + count == npages ? 
&rhpages : NULL, &sendfile_iodone, sfio); KASSERT(rv == VM_PAGER_OK, ("%s: pager fail obj %p page %p", __func__, obj, pa[i])); SFSTAT_INC(sf_iocnt); SFSTAT_ADD(sf_pages_read, count); if (i + count == npages) SFSTAT_ADD(sf_rhpages_read, rhpages); #ifdef INVARIANTS for (j = i; j < i + count && j < npages; j++) KASSERT(pa[j] == vm_page_lookup(obj, OFF_TO_IDX(vmoff(j, off))), ("pa[j] %p lookup %p\n", pa[j], vm_page_lookup(obj, OFF_TO_IDX(vmoff(j, off))))); #endif i += count; nios++; } VM_OBJECT_WUNLOCK(obj); if (nios == 0 && npages != 0) SFSTAT_INC(sf_noiocnt); return (nios); } static int sendfile_getobj(struct thread *td, struct file *fp, vm_object_t *obj_res, struct vnode **vp_res, struct shmfd **shmfd_res, off_t *obj_size, int *bsize) { struct vattr va; vm_object_t obj; struct vnode *vp; struct shmfd *shmfd; int error; vp = *vp_res = NULL; obj = NULL; shmfd = *shmfd_res = NULL; *bsize = 0; /* * The file descriptor must be a regular file and have a * backing VM object. */ if (fp->f_type == DTYPE_VNODE) { vp = fp->f_vnode; vn_lock(vp, LK_SHARED | LK_RETRY); if (vp->v_type != VREG) { error = EINVAL; goto out; } *bsize = vp->v_mount->mnt_stat.f_iosize; error = VOP_GETATTR(vp, &va, td->td_ucred); if (error != 0) goto out; *obj_size = va.va_size; obj = vp->v_object; if (obj == NULL) { error = EINVAL; goto out; } } else if (fp->f_type == DTYPE_SHM) { error = 0; shmfd = fp->f_data; obj = shmfd->shm_object; *obj_size = shmfd->shm_size; } else { error = EINVAL; goto out; } VM_OBJECT_WLOCK(obj); if ((obj->flags & OBJ_DEAD) != 0) { VM_OBJECT_WUNLOCK(obj); error = EBADF; goto out; } /* * Temporarily increase the backing VM object's reference * count so that a forced reclamation of its vnode does not * immediately destroy it. 
*/ vm_object_reference_locked(obj); VM_OBJECT_WUNLOCK(obj); *obj_res = obj; *vp_res = vp; *shmfd_res = shmfd; out: if (vp != NULL) VOP_UNLOCK(vp, 0); return (error); } static int sendfile_getsock(struct thread *td, int s, struct file **sock_fp, struct socket **so) { cap_rights_t rights; int error; *sock_fp = NULL; *so = NULL; /* * The socket must be a stream socket and connected. */ error = getsock_cap(td, s, cap_rights_init(&rights, CAP_SEND), - sock_fp, NULL); + sock_fp, NULL, NULL); if (error != 0) return (error); *so = (*sock_fp)->f_data; if ((*so)->so_type != SOCK_STREAM) return (EINVAL); if (((*so)->so_state & SS_ISCONNECTED) == 0) return (ENOTCONN); return (0); } int vn_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio, struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags, struct thread *td) { struct file *sock_fp; struct vnode *vp; struct vm_object *obj; struct socket *so; struct mbuf *m, *mh, *mhtail; struct sf_buf *sf; struct shmfd *shmfd; struct sendfile_sync *sfs; struct vattr va; off_t off, sbytes, rem, obj_size; int error, softerr, bsize, hdrlen; obj = NULL; so = NULL; m = mh = NULL; sfs = NULL; hdrlen = sbytes = 0; softerr = 0; error = sendfile_getobj(td, fp, &obj, &vp, &shmfd, &obj_size, &bsize); if (error != 0) return (error); error = sendfile_getsock(td, sockfd, &sock_fp, &so); if (error != 0) goto out; #ifdef MAC error = mac_socket_check_send(td->td_ucred, so); if (error != 0) goto out; #endif SFSTAT_INC(sf_syscalls); SFSTAT_ADD(sf_rhpages_requested, SF_READAHEAD(flags)); if (flags & SF_SYNC) { sfs = malloc(sizeof *sfs, M_TEMP, M_WAITOK | M_ZERO); mtx_init(&sfs->mtx, "sendfile", NULL, MTX_DEF); cv_init(&sfs->cv, "sendfile"); } rem = nbytes ? omin(nbytes, obj_size - offset) : obj_size - offset; /* * Protect against multiple writers to the socket. * * XXXRW: Historically this has assumed non-interruptibility, so now * we implement that, but possibly shouldn't. 
*/ (void)sblock(&so->so_snd, SBL_WAIT | SBL_NOINTR); /* * Loop through the pages of the file, starting with the requested * offset. Get a file page (do I/O if necessary), map the file page * into an sf_buf, attach an mbuf header to the sf_buf, and queue * it on the socket. * This is done in two loops. The inner loop turns as many pages * as it can, up to available socket buffer space, without blocking * into mbufs to have it bulk delivered into the socket send buffer. * The outer loop checks the state and available space of the socket * and takes care of the overall progress. */ for (off = offset; rem > 0; ) { struct sf_io *sfio; vm_page_t *pa; struct mbuf *mtail; int nios, space, npages, rhpages; mtail = NULL; /* * Check the socket state for ongoing connection, * no errors and space in socket buffer. * If space is low allow for the remainder of the * file to be processed if it fits the socket buffer. * Otherwise block in waiting for sufficient space * to proceed, or if the socket is nonblocking, return * to userland with EAGAIN while reporting how far * we've come. * We wait until the socket buffer has significant free * space to do bulk sends. This makes good use of file * system read ahead and allows packet segmentation * offloading hardware to take over lots of work. If * we were not careful here we would send off only one * sfbuf at a time. */ SOCKBUF_LOCK(&so->so_snd); if (so->so_snd.sb_lowat < so->so_snd.sb_hiwat / 2) so->so_snd.sb_lowat = so->so_snd.sb_hiwat / 2; retry_space: if (so->so_snd.sb_state & SBS_CANTSENDMORE) { error = EPIPE; SOCKBUF_UNLOCK(&so->so_snd); goto done; } else if (so->so_error) { error = so->so_error; so->so_error = 0; SOCKBUF_UNLOCK(&so->so_snd); goto done; } space = sbspace(&so->so_snd); if (space < rem && (space <= 0 || space < so->so_snd.sb_lowat)) { if (so->so_state & SS_NBIO) { SOCKBUF_UNLOCK(&so->so_snd); error = EAGAIN; goto done; } /* * sbwait drops the lock while sleeping. 
* When we loop back to retry_space the * state may have changed and we retest * for it. */ error = sbwait(&so->so_snd); /* * An error from sbwait usually indicates that we've * been interrupted by a signal. If we've sent anything * then return bytes sent, otherwise return the error. */ if (error != 0) { SOCKBUF_UNLOCK(&so->so_snd); goto done; } goto retry_space; } SOCKBUF_UNLOCK(&so->so_snd); /* * At the beginning of the first loop check if any headers * are specified and copy them into mbufs. Reduce space in * the socket buffer by the size of the header mbuf chain. * Clear hdr_uio here and hdrlen at the end of the first loop. */ if (hdr_uio != NULL && hdr_uio->uio_resid > 0) { hdr_uio->uio_td = td; hdr_uio->uio_rw = UIO_WRITE; hdr_uio->uio_resid = min(hdr_uio->uio_resid, space); mh = m_uiotombuf(hdr_uio, M_WAITOK, 0, 0, 0); hdrlen = m_length(mh, &mhtail); space -= hdrlen; hdr_uio = NULL; } if (vp != NULL) { error = vn_lock(vp, LK_SHARED); if (error != 0) goto done; error = VOP_GETATTR(vp, &va, td->td_ucred); if (error != 0 || off >= va.va_size) { VOP_UNLOCK(vp, 0); goto done; } if (va.va_size != obj_size) { if (nbytes == 0) rem += va.va_size - obj_size; else if (offset + nbytes > va.va_size) rem -= (offset + nbytes - va.va_size); obj_size = va.va_size; } } if (space > rem) space = rem; npages = howmany(space + (off & PAGE_MASK), PAGE_SIZE); /* * Calculate maximum allowed number of pages for readahead * at this iteration. First, we allow readahead up to "rem". * If application wants more, let it be, but there is no * reason to go above MAXPHYS. Also check against "obj_size", * since vm_pager_has_page() can hint beyond EOF. 
*/ rhpages = howmany(rem + (off & PAGE_MASK), PAGE_SIZE) - npages; rhpages += SF_READAHEAD(flags); rhpages = min(howmany(MAXPHYS, PAGE_SIZE), rhpages); rhpages = min(howmany(obj_size - trunc_page(off), PAGE_SIZE) - npages, rhpages); sfio = malloc(sizeof(struct sf_io) + npages * sizeof(vm_page_t), M_TEMP, M_WAITOK); refcount_init(&sfio->nios, 1); sfio->error = 0; nios = sendfile_swapin(obj, sfio, off, space, npages, rhpages, flags); /* * Loop and construct maximum sized mbuf chain to be bulk * dumped into socket buffer. */ pa = sfio->pa; for (int i = 0; i < npages; i++) { struct mbuf *m0; /* * If a page wasn't grabbed successfully, then * trim the array. Can happen only with SF_NODISKIO. */ if (pa[i] == NULL) { SFSTAT_INC(sf_busy); fixspace(npages, i, off, &space); npages = i; softerr = EBUSY; break; } /* * Get a sendfile buf. When allocating the * first buffer for mbuf chain, we usually * wait as long as necessary, but this wait * can be interrupted. For consequent * buffers, do not sleep, since several * threads might exhaust the buffers and then * deadlock. */ sf = sf_buf_alloc(pa[i], m != NULL ? SFB_NOWAIT : SFB_CATCH); if (sf == NULL) { SFSTAT_INC(sf_allocfail); for (int j = i; j < npages; j++) { vm_page_lock(pa[j]); vm_page_unwire(pa[j], PQ_INACTIVE); vm_page_unlock(pa[j]); } if (m == NULL) softerr = ENOBUFS; fixspace(npages, i, off, &space); npages = i; break; } m0 = m_get(M_WAITOK, MT_DATA); m0->m_ext.ext_buf = (char *)sf_buf_kva(sf); m0->m_ext.ext_size = PAGE_SIZE; m0->m_ext.ext_arg1 = sf; m0->m_ext.ext_arg2 = sfs; /* * SF_NOCACHE sets the page as being freed upon send. * However, we ignore it for the last page in 'space', * if the page is truncated, and we got more data to * send (rem > space), or if we have readahead * configured (rhpages > 0). 
*/ if ((flags & SF_NOCACHE) == 0 || (i == npages - 1 && ((off + space) & PAGE_MASK) && (rem > space || rhpages > 0))) m0->m_ext.ext_type = EXT_SFBUF; else m0->m_ext.ext_type = EXT_SFBUF_NOCACHE; m0->m_ext.ext_flags = EXT_FLAG_EMBREF; m0->m_ext.ext_count = 1; m0->m_flags |= (M_EXT | M_RDONLY); if (nios) m0->m_flags |= M_NOTREADY; m0->m_data = (char *)sf_buf_kva(sf) + (vmoff(i, off) & PAGE_MASK); m0->m_len = xfsize(i, npages, off, space); if (i == 0) sfio->m = m0; /* Append to mbuf chain. */ if (mtail != NULL) mtail->m_next = m0; else m = m0; mtail = m0; if (sfs != NULL) { mtx_lock(&sfs->mtx); sfs->count++; mtx_unlock(&sfs->mtx); } } if (vp != NULL) VOP_UNLOCK(vp, 0); /* Keep track of bytes processed. */ off += space; rem -= space; /* Prepend header, if any. */ if (hdrlen) { mhtail->m_next = m; m = mh; mh = NULL; } if (m == NULL) { KASSERT(softerr, ("%s: m NULL, no error", __func__)); error = softerr; free(sfio, M_TEMP); goto done; } /* Add the buffer chain to the socket buffer. */ KASSERT(m_length(m, NULL) == space + hdrlen, ("%s: mlen %u space %d hdrlen %d", __func__, m_length(m, NULL), space, hdrlen)); CURVNET_SET(so->so_vnet); if (nios == 0) { /* * If sendfile_swapin() didn't initiate any I/Os, * which happens if all data is cached in VM, then * we can send data right now without the * PRUS_NOTREADY flag. */ free(sfio, M_TEMP); error = (*so->so_proto->pr_usrreqs->pru_send) (so, 0, m, NULL, NULL, td); } else { sfio->sock_fp = sock_fp; sfio->npages = npages; fhold(sock_fp); error = (*so->so_proto->pr_usrreqs->pru_send) (so, PRUS_NOTREADY, m, NULL, NULL, td); sendfile_iodone(sfio, NULL, 0, 0); } CURVNET_RESTORE(); m = NULL; /* pru_send always consumes */ if (error) goto done; sbytes += space + hdrlen; if (hdrlen) hdrlen = 0; if (softerr) { error = softerr; goto done; } } /* * Send trailers. Wimp out and use writev(2). 
*/
	if (trl_uio != NULL) {
		sbunlock(&so->so_snd);
		error = kern_writev(td, sockfd, trl_uio);
		if (error == 0)
			sbytes += td->td_retval[0];
		goto out;
	}

done:
	sbunlock(&so->so_snd);
out:
	/*
	 * If there was no error we have to clear td->td_retval[0]
	 * because it may have been set by writev.
	 */
	if (error == 0) {
		td->td_retval[0] = 0;
	}
	if (sent != NULL) {
		(*sent) = sbytes;
	}
	if (obj != NULL)
		vm_object_deallocate(obj);
	if (so)
		fdrop(sock_fp, td);
	if (m)
		m_freem(m);
	if (mh)
		m_freem(mh);

	if (sfs != NULL) {
		mtx_lock(&sfs->mtx);
		if (sfs->count != 0)
			cv_wait(&sfs->cv, &sfs->mtx);
		KASSERT(sfs->count == 0, ("sendfile sync still busy"));
		cv_destroy(&sfs->cv);
		mtx_destroy(&sfs->mtx);
		free(sfs, M_TEMP);
	}

	if (error == ERESTART)
		error = EINTR;

	return (error);
}

/*
 * Common body of sys_sendfile() and freebsd4_sendfile().
 *
 * Copies in the optional header/trailer iovec arrays described by uap->hdtr,
 * looks up uap->fd with CAP_PREAD rights, hands the transfer to
 * fo_sendfile() and, if uap->sbytes is non-NULL, copies the byte count back
 * out.  A non-zero 'compat' selects the pre-5.0 convention in which nbytes
 * also counted the header (only compiled in under COMPAT_FREEBSD4).
 *
 * Returns 0 or an errno value.
 */
static int
sendfile(struct thread *td, struct sendfile_args *uap, int compat)
{
	struct sf_hdtr hdtr;
	struct uio *hdr_uio, *trl_uio;
	struct file *fp;
	cap_rights_t rights;
	off_t sbytes;
	int error;

	/*
	 * File offset must be positive.  If it goes beyond EOF
	 * we send only the header/trailer and no payload data.
	 */
	if (uap->offset < 0)
		return (EINVAL);

	/* Both start NULL so the common exit can free them unconditionally. */
	hdr_uio = trl_uio = NULL;

	if (uap->hdtr != NULL) {
		error = copyin(uap->hdtr, &hdtr, sizeof(hdtr));
		if (error != 0)
			goto out;
		if (hdtr.headers != NULL) {
			error = copyinuio(hdtr.headers, hdtr.hdr_cnt,
			    &hdr_uio);
			if (error != 0)
				goto out;
#ifdef COMPAT_FREEBSD4
			/*
			 * In FreeBSD < 5.0 the nbytes to send also included
			 * the header.  If compat is specified subtract the
			 * header size from nbytes.
			 */
			if (compat) {
				if (uap->nbytes > hdr_uio->uio_resid)
					uap->nbytes -= hdr_uio->uio_resid;
				else
					uap->nbytes = 0;
			}
#endif
		}
		if (hdtr.trailers != NULL) {
			error = copyinuio(hdtr.trailers, hdtr.trl_cnt,
			    &trl_uio);
			if (error != 0)
				goto out;
		}
	}

	AUDIT_ARG_FD(uap->fd);

	/*
	 * sendfile(2) can start at any offset within a file so we require
	 * CAP_READ+CAP_SEEK = CAP_PREAD.
	 */
	if ((error = fget_read(td, uap->fd,
	    cap_rights_init(&rights, CAP_PREAD), &fp)) != 0) {
		goto out;
	}

	error = fo_sendfile(fp, uap->s, hdr_uio, trl_uio, uap->offset,
	    uap->nbytes, &sbytes, uap->flags, td);
	fdrop(fp, td);

	/*
	 * NOTE(review): the copyout() result is discarded, so a bad
	 * uap->sbytes pointer does not change the returned error.
	 */
	if (uap->sbytes != NULL)
		copyout(&sbytes, uap->sbytes, sizeof(off_t));

out:
	free(hdr_uio, M_IOV);
	free(trl_uio, M_IOV);
	return (error);
}

/*
 * sendfile(2)
 *
 * int sendfile(int fd, int s, off_t offset, size_t nbytes,
 *	 struct sf_hdtr *hdtr, off_t *sbytes, int flags)
 *
 * Send a file specified by 'fd' and starting at 'offset' to a socket
 * specified by 's'. Send only 'nbytes' of the file or until EOF if nbytes ==
 * 0.  Optionally add a header and/or trailer to the socket output.  If
 * specified, write the total number of bytes sent into *sbytes.
 */
int
sys_sendfile(struct thread *td, struct sendfile_args *uap)
{

	return (sendfile(td, uap, 0));
}

#ifdef COMPAT_FREEBSD4
/*
 * FreeBSD 4 compatible entry point: repacks the old argument structure
 * field by field and calls sendfile() with compat set.
 */
int
freebsd4_sendfile(struct thread *td, struct freebsd4_sendfile_args *uap)
{
	struct sendfile_args args;

	args.fd = uap->fd;
	args.s = uap->s;
	args.offset = uap->offset;
	args.nbytes = uap->nbytes;
	args.hdtr = uap->hdtr;
	args.sbytes = uap->sbytes;
	args.flags = uap->flags;

	return (sendfile(td, &args, 1));
}
#endif /* COMPAT_FREEBSD4 */
Index: head/sys/kern/uipc_syscalls.c
===================================================================
--- head/sys/kern/uipc_syscalls.c (revision 306173)
+++ head/sys/kern/uipc_syscalls.c (revision 306174)
@@ -1,1575 +1,1582 @@
/*-
 * Copyright (c) 1982, 1986, 1989, 1990, 1993
 * The Regents of the University of California. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 * 2.
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)uipc_syscalls.c 8.4 (Berkeley) 2/21/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_capsicum.h" #include "opt_inet.h" #include "opt_inet6.h" #include "opt_compat.h" #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #ifdef COMPAT_FREEBSD32 #include #endif #include #include #include /* * Flags for accept1() and kern_accept4(), in addition to SOCK_CLOEXEC * and SOCK_NONBLOCK. 
*/ #define ACCEPT4_INHERIT 0x1 #define ACCEPT4_COMPAT 0x2 static int sendit(struct thread *td, int s, struct msghdr *mp, int flags); static int recvit(struct thread *td, int s, struct msghdr *mp, void *namelenp); static int accept1(struct thread *td, int s, struct sockaddr *uname, socklen_t *anamelen, int flags); static int getsockname1(struct thread *td, struct getsockname_args *uap, int compat); static int getpeername1(struct thread *td, struct getpeername_args *uap, int compat); static int sockargs(struct mbuf **, char *, socklen_t, int); /* * Convert a user file descriptor to a kernel file entry and check if required * capability rights are present. + * If required copy of current set of capability rights is returned. * A reference on the file entry is held upon returning. */ int getsock_cap(struct thread *td, int fd, cap_rights_t *rightsp, - struct file **fpp, u_int *fflagp) + struct file **fpp, u_int *fflagp, struct filecaps *havecapsp) { struct file *fp; int error; - error = fget_unlocked(td->td_proc->p_fd, fd, rightsp, &fp, NULL); + error = fget_cap(td, fd, rightsp, &fp, havecapsp); if (error != 0) return (error); if (fp->f_type != DTYPE_SOCKET) { fdrop(fp, td); + if (havecapsp != NULL) + filecaps_free(havecapsp); return (ENOTSOCK); } if (fflagp != NULL) *fflagp = fp->f_flag; *fpp = fp; return (0); } /* * System call interface to the socket abstraction. 
*/ #if defined(COMPAT_43) #define COMPAT_OLDSOCK #endif int sys_socket(struct thread *td, struct socket_args *uap) { struct socket *so; struct file *fp; int fd, error, type, oflag, fflag; AUDIT_ARG_SOCKET(uap->domain, uap->type, uap->protocol); type = uap->type; oflag = 0; fflag = 0; if ((type & SOCK_CLOEXEC) != 0) { type &= ~SOCK_CLOEXEC; oflag |= O_CLOEXEC; } if ((type & SOCK_NONBLOCK) != 0) { type &= ~SOCK_NONBLOCK; fflag |= FNONBLOCK; } #ifdef MAC error = mac_socket_check_create(td->td_ucred, uap->domain, type, uap->protocol); if (error != 0) return (error); #endif error = falloc(td, &fp, &fd, oflag); if (error != 0) return (error); /* An extra reference on `fp' has been held for us by falloc(). */ error = socreate(uap->domain, &so, type, uap->protocol, td->td_ucred, td); if (error != 0) { fdclose(td, fp, fd); } else { finit(fp, FREAD | FWRITE | fflag, DTYPE_SOCKET, so, &socketops); if ((fflag & FNONBLOCK) != 0) (void) fo_ioctl(fp, FIONBIO, &fflag, td->td_ucred, td); td->td_retval[0] = fd; } fdrop(fp, td); return (error); } int sys_bind(struct thread *td, struct bind_args *uap) { struct sockaddr *sa; int error; error = getsockaddr(&sa, uap->name, uap->namelen); if (error == 0) { error = kern_bindat(td, AT_FDCWD, uap->s, sa); free(sa, M_SONAME); } return (error); } int kern_bindat(struct thread *td, int dirfd, int fd, struct sockaddr *sa) { struct socket *so; struct file *fp; cap_rights_t rights; int error; AUDIT_ARG_FD(fd); AUDIT_ARG_SOCKADDR(td, dirfd, sa); error = getsock_cap(td, fd, cap_rights_init(&rights, CAP_BIND), - &fp, NULL); + &fp, NULL, NULL); if (error != 0) return (error); so = fp->f_data; #ifdef KTRACE if (KTRPOINT(td, KTR_STRUCT)) ktrsockaddr(sa); #endif #ifdef MAC error = mac_socket_check_bind(td->td_ucred, so, sa); if (error == 0) { #endif if (dirfd == AT_FDCWD) error = sobind(so, sa, td); else error = sobindat(dirfd, so, sa, td); #ifdef MAC } #endif fdrop(fp, td); return (error); } int sys_bindat(struct thread *td, struct bindat_args *uap) { 
struct sockaddr *sa; int error; error = getsockaddr(&sa, uap->name, uap->namelen); if (error == 0) { error = kern_bindat(td, uap->fd, uap->s, sa); free(sa, M_SONAME); } return (error); } int sys_listen(struct thread *td, struct listen_args *uap) { struct socket *so; struct file *fp; cap_rights_t rights; int error; AUDIT_ARG_FD(uap->s); error = getsock_cap(td, uap->s, cap_rights_init(&rights, CAP_LISTEN), - &fp, NULL); + &fp, NULL, NULL); if (error == 0) { so = fp->f_data; #ifdef MAC error = mac_socket_check_listen(td->td_ucred, so); if (error == 0) #endif error = solisten(so, uap->backlog, td); fdrop(fp, td); } return(error); } /* * accept1() */ static int accept1(td, s, uname, anamelen, flags) struct thread *td; int s; struct sockaddr *uname; socklen_t *anamelen; int flags; { struct sockaddr *name; socklen_t namelen; struct file *fp; int error; if (uname == NULL) return (kern_accept4(td, s, NULL, NULL, flags, NULL)); error = copyin(anamelen, &namelen, sizeof (namelen)); if (error != 0) return (error); error = kern_accept4(td, s, &name, &namelen, flags, &fp); if (error != 0) return (error); if (error == 0 && uname != NULL) { #ifdef COMPAT_OLDSOCK if (flags & ACCEPT4_COMPAT) ((struct osockaddr *)name)->sa_family = name->sa_family; #endif error = copyout(name, uname, namelen); } if (error == 0) error = copyout(&namelen, anamelen, sizeof(namelen)); if (error != 0) fdclose(td, fp, td->td_retval[0]); fdrop(fp, td); free(name, M_SONAME); return (error); } int kern_accept(struct thread *td, int s, struct sockaddr **name, socklen_t *namelen, struct file **fp) { return (kern_accept4(td, s, name, namelen, ACCEPT4_INHERIT, fp)); } int kern_accept4(struct thread *td, int s, struct sockaddr **name, socklen_t *namelen, int flags, struct file **fp) { struct file *headfp, *nfp = NULL; struct sockaddr *sa = NULL; struct socket *head, *so; + struct filecaps fcaps; cap_rights_t rights; u_int fflag; pid_t pgid; int error, fd, tmp; if (name != NULL) *name = NULL; AUDIT_ARG_FD(s); error 
= getsock_cap(td, s, cap_rights_init(&rights, CAP_ACCEPT), - &headfp, &fflag); + &headfp, &fflag, &fcaps); if (error != 0) return (error); head = headfp->f_data; if ((head->so_options & SO_ACCEPTCONN) == 0) { error = EINVAL; goto done; } #ifdef MAC error = mac_socket_check_accept(td->td_ucred, head); if (error != 0) goto done; #endif - error = falloc(td, &nfp, &fd, (flags & SOCK_CLOEXEC) ? O_CLOEXEC : 0); + error = falloc_caps(td, &nfp, &fd, + (flags & SOCK_CLOEXEC) ? O_CLOEXEC : 0, &fcaps); if (error != 0) goto done; ACCEPT_LOCK(); if ((head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->so_comp)) { ACCEPT_UNLOCK(); error = EWOULDBLOCK; goto noconnection; } while (TAILQ_EMPTY(&head->so_comp) && head->so_error == 0) { if (head->so_rcv.sb_state & SBS_CANTRCVMORE) { head->so_error = ECONNABORTED; break; } error = msleep(&head->so_timeo, &accept_mtx, PSOCK | PCATCH, "accept", 0); if (error != 0) { ACCEPT_UNLOCK(); goto noconnection; } } if (head->so_error) { error = head->so_error; head->so_error = 0; ACCEPT_UNLOCK(); goto noconnection; } so = TAILQ_FIRST(&head->so_comp); KASSERT(!(so->so_qstate & SQ_INCOMP), ("accept1: so SQ_INCOMP")); KASSERT(so->so_qstate & SQ_COMP, ("accept1: so not SQ_COMP")); /* * Before changing the flags on the socket, we have to bump the * reference count. Otherwise, if the protocol calls sofree(), * the socket will be released due to a zero refcount. */ SOCK_LOCK(so); /* soref() and so_state update */ soref(so); /* file descriptor reference */ TAILQ_REMOVE(&head->so_comp, so, so_list); head->so_qlen--; if (flags & ACCEPT4_INHERIT) so->so_state |= (head->so_state & SS_NBIO); else so->so_state |= (flags & SOCK_NONBLOCK) ? SS_NBIO : 0; so->so_qstate &= ~SQ_COMP; so->so_head = NULL; SOCK_UNLOCK(so); ACCEPT_UNLOCK(); /* An extra reference on `nfp' has been held for us by falloc(). 
*/ td->td_retval[0] = fd; /* connection has been removed from the listen queue */ KNOTE_UNLOCKED(&head->so_rcv.sb_sel.si_note, 0); if (flags & ACCEPT4_INHERIT) { pgid = fgetown(&head->so_sigio); if (pgid != 0) fsetown(pgid, &so->so_sigio); } else { fflag &= ~(FNONBLOCK | FASYNC); if (flags & SOCK_NONBLOCK) fflag |= FNONBLOCK; } finit(nfp, fflag, DTYPE_SOCKET, so, &socketops); /* Sync socket nonblocking/async state with file flags */ tmp = fflag & FNONBLOCK; (void) fo_ioctl(nfp, FIONBIO, &tmp, td->td_ucred, td); tmp = fflag & FASYNC; (void) fo_ioctl(nfp, FIOASYNC, &tmp, td->td_ucred, td); sa = NULL; error = soaccept(so, &sa); if (error != 0) goto noconnection; if (sa == NULL) { if (name) *namelen = 0; goto done; } AUDIT_ARG_SOCKADDR(td, AT_FDCWD, sa); if (name) { /* check sa_len before it is destroyed */ if (*namelen > sa->sa_len) *namelen = sa->sa_len; #ifdef KTRACE if (KTRPOINT(td, KTR_STRUCT)) ktrsockaddr(sa); #endif *name = sa; sa = NULL; } noconnection: free(sa, M_SONAME); /* * close the new descriptor, assuming someone hasn't ripped it * out from under us. */ if (error != 0) fdclose(td, nfp, fd); /* * Release explicitly held references before returning. We return * a reference on nfp to the caller on success if they request it. 
*/ done: + if (nfp == NULL) + filecaps_free(&fcaps); if (fp != NULL) { if (error == 0) { *fp = nfp; nfp = NULL; } else *fp = NULL; } if (nfp != NULL) fdrop(nfp, td); fdrop(headfp, td); return (error); } int sys_accept(td, uap) struct thread *td; struct accept_args *uap; { return (accept1(td, uap->s, uap->name, uap->anamelen, ACCEPT4_INHERIT)); } int sys_accept4(td, uap) struct thread *td; struct accept4_args *uap; { if (uap->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK)) return (EINVAL); return (accept1(td, uap->s, uap->name, uap->anamelen, uap->flags)); } #ifdef COMPAT_OLDSOCK int oaccept(td, uap) struct thread *td; struct accept_args *uap; { return (accept1(td, uap->s, uap->name, uap->anamelen, ACCEPT4_INHERIT | ACCEPT4_COMPAT)); } #endif /* COMPAT_OLDSOCK */ int sys_connect(struct thread *td, struct connect_args *uap) { struct sockaddr *sa; int error; error = getsockaddr(&sa, uap->name, uap->namelen); if (error == 0) { error = kern_connectat(td, AT_FDCWD, uap->s, sa); free(sa, M_SONAME); } return (error); } int kern_connectat(struct thread *td, int dirfd, int fd, struct sockaddr *sa) { struct socket *so; struct file *fp; cap_rights_t rights; int error, interrupted = 0; AUDIT_ARG_FD(fd); AUDIT_ARG_SOCKADDR(td, dirfd, sa); error = getsock_cap(td, fd, cap_rights_init(&rights, CAP_CONNECT), - &fp, NULL); + &fp, NULL, NULL); if (error != 0) return (error); so = fp->f_data; if (so->so_state & SS_ISCONNECTING) { error = EALREADY; goto done1; } #ifdef KTRACE if (KTRPOINT(td, KTR_STRUCT)) ktrsockaddr(sa); #endif #ifdef MAC error = mac_socket_check_connect(td->td_ucred, so, sa); if (error != 0) goto bad; #endif if (dirfd == AT_FDCWD) error = soconnect(so, sa, td); else error = soconnectat(dirfd, so, sa, td); if (error != 0) goto bad; if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING)) { error = EINPROGRESS; goto done1; } SOCK_LOCK(so); while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) { error = msleep(&so->so_timeo, SOCK_MTX(so), PSOCK | PCATCH, 
"connec", 0); if (error != 0) { if (error == EINTR || error == ERESTART) interrupted = 1; break; } } if (error == 0) { error = so->so_error; so->so_error = 0; } SOCK_UNLOCK(so); bad: if (!interrupted) so->so_state &= ~SS_ISCONNECTING; if (error == ERESTART) error = EINTR; done1: fdrop(fp, td); return (error); } int sys_connectat(struct thread *td, struct connectat_args *uap) { struct sockaddr *sa; int error; error = getsockaddr(&sa, uap->name, uap->namelen); if (error == 0) { error = kern_connectat(td, uap->fd, uap->s, sa); free(sa, M_SONAME); } return (error); } int kern_socketpair(struct thread *td, int domain, int type, int protocol, int *rsv) { struct file *fp1, *fp2; struct socket *so1, *so2; int fd, error, oflag, fflag; AUDIT_ARG_SOCKET(domain, type, protocol); oflag = 0; fflag = 0; if ((type & SOCK_CLOEXEC) != 0) { type &= ~SOCK_CLOEXEC; oflag |= O_CLOEXEC; } if ((type & SOCK_NONBLOCK) != 0) { type &= ~SOCK_NONBLOCK; fflag |= FNONBLOCK; } #ifdef MAC /* We might want to have a separate check for socket pairs. */ error = mac_socket_check_create(td->td_ucred, domain, type, protocol); if (error != 0) return (error); #endif error = socreate(domain, &so1, type, protocol, td->td_ucred, td); if (error != 0) return (error); error = socreate(domain, &so2, type, protocol, td->td_ucred, td); if (error != 0) goto free1; /* On success extra reference to `fp1' and 'fp2' is set by falloc. */ error = falloc(td, &fp1, &fd, oflag); if (error != 0) goto free2; rsv[0] = fd; fp1->f_data = so1; /* so1 already has ref count */ error = falloc(td, &fp2, &fd, oflag); if (error != 0) goto free3; fp2->f_data = so2; /* so2 already has ref count */ rsv[1] = fd; error = soconnect2(so1, so2); if (error != 0) goto free4; if (type == SOCK_DGRAM) { /* * Datagram socket connection is asymmetric. 
*/ error = soconnect2(so2, so1); if (error != 0) goto free4; } finit(fp1, FREAD | FWRITE | fflag, DTYPE_SOCKET, fp1->f_data, &socketops); finit(fp2, FREAD | FWRITE | fflag, DTYPE_SOCKET, fp2->f_data, &socketops); if ((fflag & FNONBLOCK) != 0) { (void) fo_ioctl(fp1, FIONBIO, &fflag, td->td_ucred, td); (void) fo_ioctl(fp2, FIONBIO, &fflag, td->td_ucred, td); } fdrop(fp1, td); fdrop(fp2, td); return (0); free4: fdclose(td, fp2, rsv[1]); fdrop(fp2, td); free3: fdclose(td, fp1, rsv[0]); fdrop(fp1, td); free2: if (so2 != NULL) (void)soclose(so2); free1: if (so1 != NULL) (void)soclose(so1); return (error); } int sys_socketpair(struct thread *td, struct socketpair_args *uap) { int error, sv[2]; error = kern_socketpair(td, uap->domain, uap->type, uap->protocol, sv); if (error != 0) return (error); error = copyout(sv, uap->rsv, 2 * sizeof(int)); if (error != 0) { (void)kern_close(td, sv[0]); (void)kern_close(td, sv[1]); } return (error); } static int sendit(struct thread *td, int s, struct msghdr *mp, int flags) { struct mbuf *control; struct sockaddr *to; int error; #ifdef CAPABILITY_MODE if (IN_CAPABILITY_MODE(td) && (mp->msg_name != NULL)) return (ECAPMODE); #endif if (mp->msg_name != NULL) { error = getsockaddr(&to, mp->msg_name, mp->msg_namelen); if (error != 0) { to = NULL; goto bad; } mp->msg_name = to; } else { to = NULL; } if (mp->msg_control) { if (mp->msg_controllen < sizeof(struct cmsghdr) #ifdef COMPAT_OLDSOCK && mp->msg_flags != MSG_COMPAT #endif ) { error = EINVAL; goto bad; } error = sockargs(&control, mp->msg_control, mp->msg_controllen, MT_CONTROL); if (error != 0) goto bad; #ifdef COMPAT_OLDSOCK if (mp->msg_flags == MSG_COMPAT) { struct cmsghdr *cm; M_PREPEND(control, sizeof(*cm), M_WAITOK); cm = mtod(control, struct cmsghdr *); cm->cmsg_len = control->m_len; cm->cmsg_level = SOL_SOCKET; cm->cmsg_type = SCM_RIGHTS; } #endif } else { control = NULL; } error = kern_sendit(td, s, mp, flags, control, UIO_USERSPACE); bad: free(to, M_SONAME); return (error); } 
int kern_sendit(struct thread *td, int s, struct msghdr *mp, int flags, struct mbuf *control, enum uio_seg segflg) { struct file *fp; struct uio auio; struct iovec *iov; struct socket *so; cap_rights_t rights; #ifdef KTRACE struct uio *ktruio = NULL; #endif ssize_t len; int i, error; AUDIT_ARG_FD(s); cap_rights_init(&rights, CAP_SEND); if (mp->msg_name != NULL) { AUDIT_ARG_SOCKADDR(td, AT_FDCWD, mp->msg_name); cap_rights_set(&rights, CAP_CONNECT); } - error = getsock_cap(td, s, &rights, &fp, NULL); + error = getsock_cap(td, s, &rights, &fp, NULL, NULL); if (error != 0) return (error); so = (struct socket *)fp->f_data; #ifdef KTRACE if (mp->msg_name != NULL && KTRPOINT(td, KTR_STRUCT)) ktrsockaddr(mp->msg_name); #endif #ifdef MAC if (mp->msg_name != NULL) { error = mac_socket_check_connect(td->td_ucred, so, mp->msg_name); if (error != 0) goto bad; } error = mac_socket_check_send(td->td_ucred, so); if (error != 0) goto bad; #endif auio.uio_iov = mp->msg_iov; auio.uio_iovcnt = mp->msg_iovlen; auio.uio_segflg = segflg; auio.uio_rw = UIO_WRITE; auio.uio_td = td; auio.uio_offset = 0; /* XXX */ auio.uio_resid = 0; iov = mp->msg_iov; for (i = 0; i < mp->msg_iovlen; i++, iov++) { if ((auio.uio_resid += iov->iov_len) < 0) { error = EINVAL; goto bad; } } #ifdef KTRACE if (KTRPOINT(td, KTR_GENIO)) ktruio = cloneuio(&auio); #endif len = auio.uio_resid; error = sosend(so, mp->msg_name, &auio, 0, control, flags, td); if (error != 0) { if (auio.uio_resid != len && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) error = 0; /* Generation of SIGPIPE can be controlled per socket */ if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE) && !(flags & MSG_NOSIGNAL)) { PROC_LOCK(td->td_proc); tdsignal(td, SIGPIPE); PROC_UNLOCK(td->td_proc); } } if (error == 0) td->td_retval[0] = len - auio.uio_resid; #ifdef KTRACE if (ktruio != NULL) { ktruio->uio_resid = td->td_retval[0]; ktrgenio(s, UIO_WRITE, ktruio, error); } #endif bad: fdrop(fp, td); return (error); } int 
sys_sendto(struct thread *td, struct sendto_args *uap) { struct msghdr msg; struct iovec aiov; msg.msg_name = uap->to; msg.msg_namelen = uap->tolen; msg.msg_iov = &aiov; msg.msg_iovlen = 1; msg.msg_control = 0; #ifdef COMPAT_OLDSOCK msg.msg_flags = 0; #endif aiov.iov_base = uap->buf; aiov.iov_len = uap->len; return (sendit(td, uap->s, &msg, uap->flags)); } #ifdef COMPAT_OLDSOCK int osend(struct thread *td, struct osend_args *uap) { struct msghdr msg; struct iovec aiov; msg.msg_name = 0; msg.msg_namelen = 0; msg.msg_iov = &aiov; msg.msg_iovlen = 1; aiov.iov_base = uap->buf; aiov.iov_len = uap->len; msg.msg_control = 0; msg.msg_flags = 0; return (sendit(td, uap->s, &msg, uap->flags)); } int osendmsg(struct thread *td, struct osendmsg_args *uap) { struct msghdr msg; struct iovec *iov; int error; error = copyin(uap->msg, &msg, sizeof (struct omsghdr)); if (error != 0) return (error); error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE); if (error != 0) return (error); msg.msg_iov = iov; msg.msg_flags = MSG_COMPAT; error = sendit(td, uap->s, &msg, uap->flags); free(iov, M_IOV); return (error); } #endif int sys_sendmsg(struct thread *td, struct sendmsg_args *uap) { struct msghdr msg; struct iovec *iov; int error; error = copyin(uap->msg, &msg, sizeof (msg)); if (error != 0) return (error); error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE); if (error != 0) return (error); msg.msg_iov = iov; #ifdef COMPAT_OLDSOCK msg.msg_flags = 0; #endif error = sendit(td, uap->s, &msg, uap->flags); free(iov, M_IOV); return (error); } int kern_recvit(struct thread *td, int s, struct msghdr *mp, enum uio_seg fromseg, struct mbuf **controlp) { struct uio auio; struct iovec *iov; struct mbuf *m, *control = NULL; caddr_t ctlbuf; struct file *fp; struct socket *so; struct sockaddr *fromsa = NULL; cap_rights_t rights; #ifdef KTRACE struct uio *ktruio = NULL; #endif ssize_t len; int error, i; if (controlp != NULL) *controlp = NULL; AUDIT_ARG_FD(s); error = getsock_cap(td, 
s, cap_rights_init(&rights, CAP_RECV), - &fp, NULL); + &fp, NULL, NULL); if (error != 0) return (error); so = fp->f_data; #ifdef MAC error = mac_socket_check_receive(td->td_ucred, so); if (error != 0) { fdrop(fp, td); return (error); } #endif auio.uio_iov = mp->msg_iov; auio.uio_iovcnt = mp->msg_iovlen; auio.uio_segflg = UIO_USERSPACE; auio.uio_rw = UIO_READ; auio.uio_td = td; auio.uio_offset = 0; /* XXX */ auio.uio_resid = 0; iov = mp->msg_iov; for (i = 0; i < mp->msg_iovlen; i++, iov++) { if ((auio.uio_resid += iov->iov_len) < 0) { fdrop(fp, td); return (EINVAL); } } #ifdef KTRACE if (KTRPOINT(td, KTR_GENIO)) ktruio = cloneuio(&auio); #endif len = auio.uio_resid; error = soreceive(so, &fromsa, &auio, NULL, (mp->msg_control || controlp) ? &control : NULL, &mp->msg_flags); if (error != 0) { if (auio.uio_resid != len && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) error = 0; } if (fromsa != NULL) AUDIT_ARG_SOCKADDR(td, AT_FDCWD, fromsa); #ifdef KTRACE if (ktruio != NULL) { ktruio->uio_resid = len - auio.uio_resid; ktrgenio(s, UIO_READ, ktruio, error); } #endif if (error != 0) goto out; td->td_retval[0] = len - auio.uio_resid; if (mp->msg_name) { len = mp->msg_namelen; if (len <= 0 || fromsa == NULL) len = 0; else { /* save sa_len before it is destroyed by MSG_COMPAT */ len = MIN(len, fromsa->sa_len); #ifdef COMPAT_OLDSOCK if (mp->msg_flags & MSG_COMPAT) ((struct osockaddr *)fromsa)->sa_family = fromsa->sa_family; #endif if (fromseg == UIO_USERSPACE) { error = copyout(fromsa, mp->msg_name, (unsigned)len); if (error != 0) goto out; } else bcopy(fromsa, mp->msg_name, len); } mp->msg_namelen = len; } if (mp->msg_control && controlp == NULL) { #ifdef COMPAT_OLDSOCK /* * We assume that old recvmsg calls won't receive access * rights and other control info, esp. as control info * is always optional and those options didn't exist in 4.3. * If we receive rights, trim the cmsghdr; anything else * is tossed. 
*/ if (control && mp->msg_flags & MSG_COMPAT) { if (mtod(control, struct cmsghdr *)->cmsg_level != SOL_SOCKET || mtod(control, struct cmsghdr *)->cmsg_type != SCM_RIGHTS) { mp->msg_controllen = 0; goto out; } control->m_len -= sizeof (struct cmsghdr); control->m_data += sizeof (struct cmsghdr); } #endif len = mp->msg_controllen; m = control; mp->msg_controllen = 0; ctlbuf = mp->msg_control; while (m && len > 0) { unsigned int tocopy; if (len >= m->m_len) tocopy = m->m_len; else { mp->msg_flags |= MSG_CTRUNC; tocopy = len; } if ((error = copyout(mtod(m, caddr_t), ctlbuf, tocopy)) != 0) goto out; ctlbuf += tocopy; len -= tocopy; m = m->m_next; } mp->msg_controllen = ctlbuf - (caddr_t)mp->msg_control; } out: fdrop(fp, td); #ifdef KTRACE if (fromsa && KTRPOINT(td, KTR_STRUCT)) ktrsockaddr(fromsa); #endif free(fromsa, M_SONAME); if (error == 0 && controlp != NULL) *controlp = control; else if (control) m_freem(control); return (error); } static int recvit(struct thread *td, int s, struct msghdr *mp, void *namelenp) { int error; error = kern_recvit(td, s, mp, UIO_USERSPACE, NULL); if (error != 0) return (error); if (namelenp != NULL) { error = copyout(&mp->msg_namelen, namelenp, sizeof (socklen_t)); #ifdef COMPAT_OLDSOCK if (mp->msg_flags & MSG_COMPAT) error = 0; /* old recvfrom didn't check */ #endif } return (error); } int sys_recvfrom(struct thread *td, struct recvfrom_args *uap) { struct msghdr msg; struct iovec aiov; int error; if (uap->fromlenaddr) { error = copyin(uap->fromlenaddr, &msg.msg_namelen, sizeof (msg.msg_namelen)); if (error != 0) goto done2; } else { msg.msg_namelen = 0; } msg.msg_name = uap->from; msg.msg_iov = &aiov; msg.msg_iovlen = 1; aiov.iov_base = uap->buf; aiov.iov_len = uap->len; msg.msg_control = 0; msg.msg_flags = uap->flags; error = recvit(td, uap->s, &msg, uap->fromlenaddr); done2: return (error); } #ifdef COMPAT_OLDSOCK int orecvfrom(struct thread *td, struct recvfrom_args *uap) { uap->flags |= MSG_COMPAT; return (sys_recvfrom(td, uap)); 
} #endif #ifdef COMPAT_OLDSOCK int orecv(struct thread *td, struct orecv_args *uap) { struct msghdr msg; struct iovec aiov; msg.msg_name = 0; msg.msg_namelen = 0; msg.msg_iov = &aiov; msg.msg_iovlen = 1; aiov.iov_base = uap->buf; aiov.iov_len = uap->len; msg.msg_control = 0; msg.msg_flags = uap->flags; return (recvit(td, uap->s, &msg, NULL)); } /* * Old recvmsg. This code takes advantage of the fact that the old msghdr * overlays the new one, missing only the flags, and with the (old) access * rights where the control fields are now. */ int orecvmsg(struct thread *td, struct orecvmsg_args *uap) { struct msghdr msg; struct iovec *iov; int error; error = copyin(uap->msg, &msg, sizeof (struct omsghdr)); if (error != 0) return (error); error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE); if (error != 0) return (error); msg.msg_flags = uap->flags | MSG_COMPAT; msg.msg_iov = iov; error = recvit(td, uap->s, &msg, &uap->msg->msg_namelen); if (msg.msg_controllen && error == 0) error = copyout(&msg.msg_controllen, &uap->msg->msg_accrightslen, sizeof (int)); free(iov, M_IOV); return (error); } #endif int sys_recvmsg(struct thread *td, struct recvmsg_args *uap) { struct msghdr msg; struct iovec *uiov, *iov; int error; error = copyin(uap->msg, &msg, sizeof (msg)); if (error != 0) return (error); error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE); if (error != 0) return (error); msg.msg_flags = uap->flags; #ifdef COMPAT_OLDSOCK msg.msg_flags &= ~MSG_COMPAT; #endif uiov = msg.msg_iov; msg.msg_iov = iov; error = recvit(td, uap->s, &msg, NULL); if (error == 0) { msg.msg_iov = uiov; error = copyout(&msg, uap->msg, sizeof(msg)); } free(iov, M_IOV); return (error); } int sys_shutdown(struct thread *td, struct shutdown_args *uap) { struct socket *so; struct file *fp; cap_rights_t rights; int error; AUDIT_ARG_FD(uap->s); error = getsock_cap(td, uap->s, cap_rights_init(&rights, CAP_SHUTDOWN), - &fp, NULL); + &fp, NULL, NULL); if (error == 0) { so = fp->f_data; 
error = soshutdown(so, uap->how); /* * Previous versions did not return ENOTCONN, but 0 in * case the socket was not connected. Some important * programs like syslogd up to r279016, 2015-02-19, * still depend on this behavior. */ if (error == ENOTCONN && td->td_proc->p_osrel < P_OSREL_SHUTDOWN_ENOTCONN) error = 0; fdrop(fp, td); } return (error); } int sys_setsockopt(struct thread *td, struct setsockopt_args *uap) { return (kern_setsockopt(td, uap->s, uap->level, uap->name, uap->val, UIO_USERSPACE, uap->valsize)); } int kern_setsockopt(struct thread *td, int s, int level, int name, void *val, enum uio_seg valseg, socklen_t valsize) { struct socket *so; struct file *fp; struct sockopt sopt; cap_rights_t rights; int error; if (val == NULL && valsize != 0) return (EFAULT); if ((int)valsize < 0) return (EINVAL); sopt.sopt_dir = SOPT_SET; sopt.sopt_level = level; sopt.sopt_name = name; sopt.sopt_val = val; sopt.sopt_valsize = valsize; switch (valseg) { case UIO_USERSPACE: sopt.sopt_td = td; break; case UIO_SYSSPACE: sopt.sopt_td = NULL; break; default: panic("kern_setsockopt called with bad valseg"); } AUDIT_ARG_FD(s); error = getsock_cap(td, s, cap_rights_init(&rights, CAP_SETSOCKOPT), - &fp, NULL); + &fp, NULL, NULL); if (error == 0) { so = fp->f_data; error = sosetopt(so, &sopt); fdrop(fp, td); } return(error); } int sys_getsockopt(struct thread *td, struct getsockopt_args *uap) { socklen_t valsize; int error; if (uap->val) { error = copyin(uap->avalsize, &valsize, sizeof (valsize)); if (error != 0) return (error); } error = kern_getsockopt(td, uap->s, uap->level, uap->name, uap->val, UIO_USERSPACE, &valsize); if (error == 0) error = copyout(&valsize, uap->avalsize, sizeof (valsize)); return (error); } /* * Kernel version of getsockopt. * optval can be a userland or userspace. optlen is always a kernel pointer. 
*/ int kern_getsockopt(struct thread *td, int s, int level, int name, void *val, enum uio_seg valseg, socklen_t *valsize) { struct socket *so; struct file *fp; struct sockopt sopt; cap_rights_t rights; int error; if (val == NULL) *valsize = 0; if ((int)*valsize < 0) return (EINVAL); sopt.sopt_dir = SOPT_GET; sopt.sopt_level = level; sopt.sopt_name = name; sopt.sopt_val = val; sopt.sopt_valsize = (size_t)*valsize; /* checked non-negative above */ switch (valseg) { case UIO_USERSPACE: sopt.sopt_td = td; break; case UIO_SYSSPACE: sopt.sopt_td = NULL; break; default: panic("kern_getsockopt called with bad valseg"); } AUDIT_ARG_FD(s); error = getsock_cap(td, s, cap_rights_init(&rights, CAP_GETSOCKOPT), - &fp, NULL); + &fp, NULL, NULL); if (error == 0) { so = fp->f_data; error = sogetopt(so, &sopt); *valsize = sopt.sopt_valsize; fdrop(fp, td); } return (error); } /* * getsockname1() - Get socket name. */ static int getsockname1(struct thread *td, struct getsockname_args *uap, int compat) { struct sockaddr *sa; socklen_t len; int error; error = copyin(uap->alen, &len, sizeof(len)); if (error != 0) return (error); error = kern_getsockname(td, uap->fdes, &sa, &len); if (error != 0) return (error); if (len != 0) { #ifdef COMPAT_OLDSOCK if (compat) ((struct osockaddr *)sa)->sa_family = sa->sa_family; #endif error = copyout(sa, uap->asa, (u_int)len); } free(sa, M_SONAME); if (error == 0) error = copyout(&len, uap->alen, sizeof(len)); return (error); } int kern_getsockname(struct thread *td, int fd, struct sockaddr **sa, socklen_t *alen) { struct socket *so; struct file *fp; cap_rights_t rights; socklen_t len; int error; AUDIT_ARG_FD(fd); error = getsock_cap(td, fd, cap_rights_init(&rights, CAP_GETSOCKNAME), - &fp, NULL); + &fp, NULL, NULL); if (error != 0) return (error); so = fp->f_data; *sa = NULL; CURVNET_SET(so->so_vnet); error = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, sa); CURVNET_RESTORE(); if (error != 0) goto bad; if (*sa == NULL) len = 0; else len = MIN(*alen, 
(*sa)->sa_len); *alen = len; #ifdef KTRACE if (KTRPOINT(td, KTR_STRUCT)) ktrsockaddr(*sa); #endif bad: fdrop(fp, td); if (error != 0 && *sa != NULL) { free(*sa, M_SONAME); *sa = NULL; } return (error); } int sys_getsockname(struct thread *td, struct getsockname_args *uap) { return (getsockname1(td, uap, 0)); } #ifdef COMPAT_OLDSOCK int ogetsockname(struct thread *td, struct getsockname_args *uap) { return (getsockname1(td, uap, 1)); } #endif /* COMPAT_OLDSOCK */ /* * getpeername1() - Get name of peer for connected socket. */ static int getpeername1(struct thread *td, struct getpeername_args *uap, int compat) { struct sockaddr *sa; socklen_t len; int error; error = copyin(uap->alen, &len, sizeof (len)); if (error != 0) return (error); error = kern_getpeername(td, uap->fdes, &sa, &len); if (error != 0) return (error); if (len != 0) { #ifdef COMPAT_OLDSOCK if (compat) ((struct osockaddr *)sa)->sa_family = sa->sa_family; #endif error = copyout(sa, uap->asa, (u_int)len); } free(sa, M_SONAME); if (error == 0) error = copyout(&len, uap->alen, sizeof(len)); return (error); } int kern_getpeername(struct thread *td, int fd, struct sockaddr **sa, socklen_t *alen) { struct socket *so; struct file *fp; cap_rights_t rights; socklen_t len; int error; AUDIT_ARG_FD(fd); error = getsock_cap(td, fd, cap_rights_init(&rights, CAP_GETPEERNAME), - &fp, NULL); + &fp, NULL, NULL); if (error != 0) return (error); so = fp->f_data; if ((so->so_state & (SS_ISCONNECTED|SS_ISCONFIRMING)) == 0) { error = ENOTCONN; goto done; } *sa = NULL; CURVNET_SET(so->so_vnet); error = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so, sa); CURVNET_RESTORE(); if (error != 0) goto bad; if (*sa == NULL) len = 0; else len = MIN(*alen, (*sa)->sa_len); *alen = len; #ifdef KTRACE if (KTRPOINT(td, KTR_STRUCT)) ktrsockaddr(*sa); #endif bad: if (error != 0 && *sa != NULL) { free(*sa, M_SONAME); *sa = NULL; } done: fdrop(fp, td); return (error); } int sys_getpeername(struct thread *td, struct getpeername_args *uap) { return 
(getpeername1(td, uap, 0)); } #ifdef COMPAT_OLDSOCK int ogetpeername(struct thread *td, struct ogetpeername_args *uap) { /* XXX uap should have type `getpeername_args *' to begin with. */ return (getpeername1(td, (struct getpeername_args *)uap, 1)); } #endif /* COMPAT_OLDSOCK */ static int sockargs(struct mbuf **mp, char *buf, socklen_t buflen, int type) { struct sockaddr *sa; struct mbuf *m; int error; if (buflen > MLEN) { #ifdef COMPAT_OLDSOCK if (type == MT_SONAME && buflen <= 112) buflen = MLEN; /* unix domain compat. hack */ else #endif if (buflen > MCLBYTES) return (EINVAL); } m = m_get2(buflen, M_WAITOK, type, 0); m->m_len = buflen; error = copyin(buf, mtod(m, void *), buflen); if (error != 0) (void) m_free(m); else { *mp = m; if (type == MT_SONAME) { sa = mtod(m, struct sockaddr *); #if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN if (sa->sa_family == 0 && sa->sa_len < AF_MAX) sa->sa_family = sa->sa_len; #endif sa->sa_len = buflen; } } return (error); } int getsockaddr(struct sockaddr **namp, caddr_t uaddr, size_t len) { struct sockaddr *sa; int error; if (len > SOCK_MAXADDRLEN) return (ENAMETOOLONG); if (len < offsetof(struct sockaddr, sa_data[0])) return (EINVAL); sa = malloc(len, M_SONAME, M_WAITOK); error = copyin(uaddr, sa, len); if (error != 0) { free(sa, M_SONAME); } else { #if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN if (sa->sa_family == 0 && sa->sa_len < AF_MAX) sa->sa_family = sa->sa_len; #endif sa->sa_len = len; *namp = sa; } return (error); } Index: head/sys/netinet/sctp_syscalls.c =================================================================== --- head/sys/netinet/sctp_syscalls.c (revision 306173) +++ head/sys/netinet/sctp_syscalls.c (revision 306174) @@ -1,596 +1,596 @@ /*- * Copyright (c) 1982, 1986, 1989, 1990, 1993 * The Regents of the University of California. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* */ #include __FBSDID("$FreeBSD$"); #include "opt_capsicum.h" #include "opt_inet.h" #include "opt_inet6.h" #include "opt_sctp.h" #include "opt_compat.h" #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #ifdef COMPAT_FREEBSD32 #include #endif #include #include #include #include #include static struct syscall_helper_data sctp_syscalls[] = { SYSCALL_INIT_HELPER(sctp_peeloff), SYSCALL_INIT_HELPER(sctp_generic_sendmsg), SYSCALL_INIT_HELPER(sctp_generic_sendmsg_iov), SYSCALL_INIT_HELPER(sctp_generic_recvmsg), SYSCALL_INIT_LAST }; static void sctp_syscalls_init(void *unused __unused) { int error; error = syscall_helper_register(sctp_syscalls, SY_THR_STATIC); KASSERT((error == 0), ("%s: syscall_helper_register failed for sctp syscalls", __func__)); #ifdef COMPAT_FREEBSD32 error = syscall32_helper_register(sctp_syscalls, SY_THR_STATIC); KASSERT((error == 0), ("%s: syscall32_helper_register failed for sctp syscalls", __func__)); #endif } SYSINIT(sctp_syscalls, SI_SUB_SYSCALLS, SI_ORDER_ANY, sctp_syscalls_init, NULL); /* * SCTP syscalls. * Functionality only compiled in if SCTP is defined in the kernel Makefile, * otherwise all return EOPNOTSUPP. * XXX: We should make this loadable one day. 
*/ int sys_sctp_peeloff(td, uap) struct thread *td; struct sctp_peeloff_args /* { int sd; caddr_t name; } */ *uap; { #if (defined(INET) || defined(INET6)) && defined(SCTP) struct file *nfp = NULL; struct socket *head, *so; cap_rights_t rights; u_int fflag; int error, fd; AUDIT_ARG_FD(uap->sd); error = fgetsock(td, uap->sd, cap_rights_init(&rights, CAP_PEELOFF), &head, &fflag); if (error != 0) goto done2; if (head->so_proto->pr_protocol != IPPROTO_SCTP) { error = EOPNOTSUPP; goto done; } error = sctp_can_peel_off(head, (sctp_assoc_t)uap->name); if (error != 0) goto done; /* * At this point we know we do have an assoc to pull, * so we proceed to get the fd set up. This may block, * but that is ok. */ error = falloc(td, &nfp, &fd, 0); if (error != 0) goto done; td->td_retval[0] = fd; CURVNET_SET(head->so_vnet); so = sonewconn(head, SS_ISCONNECTED); if (so == NULL) { error = ENOMEM; goto noconnection; } /* * Before changing the flags on the socket, we have to bump the * reference count. Otherwise, if the protocol calls sofree(), * the socket will be released due to a zero refcount. */ SOCK_LOCK(so); soref(so); /* file descriptor reference */ SOCK_UNLOCK(so); ACCEPT_LOCK(); TAILQ_REMOVE(&head->so_comp, so, so_list); head->so_qlen--; so->so_state |= (head->so_state & SS_NBIO); so->so_state &= ~SS_NOFDREF; so->so_qstate &= ~SQ_COMP; so->so_head = NULL; ACCEPT_UNLOCK(); finit(nfp, fflag, DTYPE_SOCKET, so, &socketops); error = sctp_do_peeloff(head, so, (sctp_assoc_t)uap->name); if (error != 0) goto noconnection; if (head->so_sigio != NULL) fsetown(fgetown(&head->so_sigio), &so->so_sigio); noconnection: /* * close the new descriptor, assuming someone hasn't ripped it * out from under us. */ if (error != 0) fdclose(td, nfp, fd); /* * Release explicitly held references before returning.
*/ CURVNET_RESTORE(); done: if (nfp != NULL) fdrop(nfp, td); fputsock(head); done2: return (error); #else /* SCTP */ return (EOPNOTSUPP); #endif /* SCTP */ } int sys_sctp_generic_sendmsg (td, uap) struct thread *td; struct sctp_generic_sendmsg_args /* { int sd, caddr_t msg, int mlen, caddr_t to, __socklen_t tolen, struct sctp_sndrcvinfo *sinfo, int flags } */ *uap; { #if (defined(INET) || defined(INET6)) && defined(SCTP) struct sctp_sndrcvinfo sinfo, *u_sinfo = NULL; struct socket *so; struct file *fp = NULL; struct sockaddr *to = NULL; #ifdef KTRACE struct uio *ktruio = NULL; #endif struct uio auio; struct iovec iov[1]; cap_rights_t rights; int error = 0, len; if (uap->sinfo != NULL) { error = copyin(uap->sinfo, &sinfo, sizeof (sinfo)); if (error != 0) return (error); u_sinfo = &sinfo; } cap_rights_init(&rights, CAP_SEND); if (uap->tolen != 0) { error = getsockaddr(&to, uap->to, uap->tolen); if (error != 0) { to = NULL; goto sctp_bad2; } cap_rights_set(&rights, CAP_CONNECT); } AUDIT_ARG_FD(uap->sd); - error = getsock_cap(td, uap->sd, &rights, &fp, NULL); + error = getsock_cap(td, uap->sd, &rights, &fp, NULL, NULL); if (error != 0) goto sctp_bad; #ifdef KTRACE if (to && (KTRPOINT(td, KTR_STRUCT))) ktrsockaddr(to); #endif iov[0].iov_base = uap->msg; iov[0].iov_len = uap->mlen; so = (struct socket *)fp->f_data; if (so->so_proto->pr_protocol != IPPROTO_SCTP) { error = EOPNOTSUPP; goto sctp_bad; } #ifdef MAC error = mac_socket_check_send(td->td_ucred, so); if (error != 0) goto sctp_bad; #endif /* MAC */ auio.uio_iov = iov; auio.uio_iovcnt = 1; auio.uio_segflg = UIO_USERSPACE; auio.uio_rw = UIO_WRITE; auio.uio_td = td; auio.uio_offset = 0; /* XXX */ auio.uio_resid = 0; #ifdef KTRACE if (KTRPOINT(td, KTR_GENIO)) ktruio = cloneuio(&auio); #endif /* KTRACE */ len = auio.uio_resid = uap->mlen; CURVNET_SET(so->so_vnet); error = sctp_lower_sosend(so, to, &auio, (struct mbuf *)NULL, (struct mbuf *)NULL, uap->flags, u_sinfo, td); CURVNET_RESTORE(); if (error != 0) { if 
(auio.uio_resid != len && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) error = 0; /* Generation of SIGPIPE can be controlled per socket. */ if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE) && !(uap->flags & MSG_NOSIGNAL)) { PROC_LOCK(td->td_proc); tdsignal(td, SIGPIPE); PROC_UNLOCK(td->td_proc); } } if (error == 0) td->td_retval[0] = len - auio.uio_resid; #ifdef KTRACE if (ktruio != NULL) { ktruio->uio_resid = td->td_retval[0]; ktrgenio(uap->sd, UIO_WRITE, ktruio, error); } #endif /* KTRACE */ sctp_bad: if (fp != NULL) fdrop(fp, td); sctp_bad2: free(to, M_SONAME); return (error); #else /* SCTP */ return (EOPNOTSUPP); #endif /* SCTP */ } int sys_sctp_generic_sendmsg_iov(td, uap) struct thread *td; struct sctp_generic_sendmsg_iov_args /* { int sd, struct iovec *iov, int iovlen, caddr_t to, __socklen_t tolen, struct sctp_sndrcvinfo *sinfo, int flags } */ *uap; { #if (defined(INET) || defined(INET6)) && defined(SCTP) struct sctp_sndrcvinfo sinfo, *u_sinfo = NULL; struct socket *so; struct file *fp = NULL; struct sockaddr *to = NULL; #ifdef KTRACE struct uio *ktruio = NULL; #endif struct uio auio; struct iovec *iov, *tiov; cap_rights_t rights; ssize_t len; int error, i; if (uap->sinfo != NULL) { error = copyin(uap->sinfo, &sinfo, sizeof (sinfo)); if (error != 0) return (error); u_sinfo = &sinfo; } cap_rights_init(&rights, CAP_SEND); if (uap->tolen != 0) { error = getsockaddr(&to, uap->to, uap->tolen); if (error != 0) { to = NULL; goto sctp_bad2; } cap_rights_set(&rights, CAP_CONNECT); } AUDIT_ARG_FD(uap->sd); - error = getsock_cap(td, uap->sd, &rights, &fp, NULL); + error = getsock_cap(td, uap->sd, &rights, &fp, NULL, NULL); if (error != 0) goto sctp_bad1; #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) error = freebsd32_copyiniov((struct iovec32 *)uap->iov, uap->iovlen, &iov, EMSGSIZE); else #endif error = copyiniov(uap->iov, uap->iovlen, &iov, EMSGSIZE); if (error != 0) goto sctp_bad1; #ifdef KTRACE if (to && (KTRPOINT(td, KTR_STRUCT))) 
ktrsockaddr(to); #endif so = (struct socket *)fp->f_data; if (so->so_proto->pr_protocol != IPPROTO_SCTP) { error = EOPNOTSUPP; goto sctp_bad; } #ifdef MAC error = mac_socket_check_send(td->td_ucred, so); if (error != 0) goto sctp_bad; #endif /* MAC */ auio.uio_iov = iov; auio.uio_iovcnt = uap->iovlen; auio.uio_segflg = UIO_USERSPACE; auio.uio_rw = UIO_WRITE; auio.uio_td = td; auio.uio_offset = 0; /* XXX */ auio.uio_resid = 0; tiov = iov; for (i = 0; i < uap->iovlen; i++, tiov++) { if ((auio.uio_resid += tiov->iov_len) < 0) { error = EINVAL; goto sctp_bad; } } #ifdef KTRACE if (KTRPOINT(td, KTR_GENIO)) ktruio = cloneuio(&auio); #endif /* KTRACE */ len = auio.uio_resid; CURVNET_SET(so->so_vnet); error = sctp_lower_sosend(so, to, &auio, (struct mbuf *)NULL, (struct mbuf *)NULL, uap->flags, u_sinfo, td); CURVNET_RESTORE(); if (error != 0) { if (auio.uio_resid != len && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) error = 0; /* Generation of SIGPIPE can be controlled per socket */ if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE) && !(uap->flags & MSG_NOSIGNAL)) { PROC_LOCK(td->td_proc); tdsignal(td, SIGPIPE); PROC_UNLOCK(td->td_proc); } } if (error == 0) td->td_retval[0] = len - auio.uio_resid; #ifdef KTRACE if (ktruio != NULL) { ktruio->uio_resid = td->td_retval[0]; ktrgenio(uap->sd, UIO_WRITE, ktruio, error); } #endif /* KTRACE */ sctp_bad: free(iov, M_IOV); sctp_bad1: if (fp != NULL) fdrop(fp, td); sctp_bad2: free(to, M_SONAME); return (error); #else /* SCTP */ return (EOPNOTSUPP); #endif /* SCTP */ } int sys_sctp_generic_recvmsg(td, uap) struct thread *td; struct sctp_generic_recvmsg_args /* { int sd, struct iovec *iov, int iovlen, struct sockaddr *from, __socklen_t *fromlenaddr, struct sctp_sndrcvinfo *sinfo, int *msg_flags } */ *uap; { #if (defined(INET) || defined(INET6)) && defined(SCTP) uint8_t sockbufstore[256]; struct uio auio; struct iovec *iov, *tiov; struct sctp_sndrcvinfo sinfo; struct socket *so; struct file *fp = NULL; struct sockaddr
*fromsa; cap_rights_t rights; #ifdef KTRACE struct uio *ktruio = NULL; #endif ssize_t len; int error, fromlen, i, msg_flags; AUDIT_ARG_FD(uap->sd); error = getsock_cap(td, uap->sd, cap_rights_init(&rights, CAP_RECV), - &fp, NULL); + &fp, NULL, NULL); if (error != 0) return (error); #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) error = freebsd32_copyiniov((struct iovec32 *)uap->iov, uap->iovlen, &iov, EMSGSIZE); else #endif error = copyiniov(uap->iov, uap->iovlen, &iov, EMSGSIZE); if (error != 0) goto out1; so = fp->f_data; if (so->so_proto->pr_protocol != IPPROTO_SCTP) { error = EOPNOTSUPP; goto out; } #ifdef MAC error = mac_socket_check_receive(td->td_ucred, so); if (error != 0) goto out; #endif /* MAC */ if (uap->fromlenaddr != NULL) { error = copyin(uap->fromlenaddr, &fromlen, sizeof (fromlen)); if (error != 0) goto out; } else { fromlen = 0; } if (uap->msg_flags) { error = copyin(uap->msg_flags, &msg_flags, sizeof (int)); if (error != 0) goto out; } else { msg_flags = 0; } auio.uio_iov = iov; auio.uio_iovcnt = uap->iovlen; auio.uio_segflg = UIO_USERSPACE; auio.uio_rw = UIO_READ; auio.uio_td = td; auio.uio_offset = 0; /* XXX */ auio.uio_resid = 0; tiov = iov; for (i = 0; i < uap->iovlen; i++, tiov++) { if ((auio.uio_resid += tiov->iov_len) < 0) { error = EINVAL; goto out; } } len = auio.uio_resid; fromsa = (struct sockaddr *)sockbufstore; #ifdef KTRACE if (KTRPOINT(td, KTR_GENIO)) ktruio = cloneuio(&auio); #endif /* KTRACE */ memset(&sinfo, 0, sizeof(struct sctp_sndrcvinfo)); CURVNET_SET(so->so_vnet); error = sctp_sorecvmsg(so, &auio, (struct mbuf **)NULL, fromsa, fromlen, &msg_flags, (struct sctp_sndrcvinfo *)&sinfo, 1); CURVNET_RESTORE(); if (error != 0) { if (auio.uio_resid != len && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) error = 0; } else { if (uap->sinfo) error = copyout(&sinfo, uap->sinfo, sizeof (sinfo)); } #ifdef KTRACE if (ktruio != NULL) { ktruio->uio_resid = len - auio.uio_resid; ktrgenio(uap->sd, UIO_READ, ktruio, error); }
#endif /* KTRACE */ if (error != 0) goto out; td->td_retval[0] = len - auio.uio_resid; if (fromlen && uap->from) { len = fromlen; if (len <= 0 || fromsa == NULL) len = 0; else { len = MIN(len, fromsa->sa_len); error = copyout(fromsa, uap->from, (size_t)len); if (error != 0) goto out; } error = copyout(&len, uap->fromlenaddr, sizeof (socklen_t)); if (error != 0) goto out; } #ifdef KTRACE if (KTRPOINT(td, KTR_STRUCT)) ktrsockaddr(fromsa); #endif if (uap->msg_flags) { error = copyout(&msg_flags, uap->msg_flags, sizeof (int)); if (error != 0) goto out; } out: free(iov, M_IOV); out1: if (fp != NULL) fdrop(fp, td); return (error); #else /* SCTP */ return (EOPNOTSUPP); #endif /* SCTP */ } Index: head/sys/sys/socketvar.h =================================================================== --- head/sys/sys/socketvar.h (revision 306173) +++ head/sys/sys/socketvar.h (revision 306174) @@ -1,422 +1,423 @@ /*- * Copyright (c) 1982, 1986, 1990, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)socketvar.h 8.3 (Berkeley) 2/19/95 * * $FreeBSD$ */ #ifndef _SYS_SOCKETVAR_H_ #define _SYS_SOCKETVAR_H_ #include /* for TAILQ macros */ #include /* for struct selinfo */ #include #include #include #include #include #include #ifdef _KERNEL #include #include #endif struct vnet; /* * Kernel structure per socket. * Contains send and receive buffer queues, * handle on protocol and pointer to protocol * private data and error information. */ typedef u_quad_t so_gen_t; struct socket; /*- * Locking key to struct socket: * (a) constant after allocation, no locking required. * (b) locked by SOCK_LOCK(so). * (c) locked by SOCKBUF_LOCK(&so->so_rcv). * (e) locked by ACCEPT_LOCK(). * (f) not locked since integer reads/writes are atomic. * (g) used only as a sleep/wakeup address, no value. * (h) locked by global mutex so_global_mtx. */ struct socket { int so_count; /* (b) reference count */ short so_type; /* (a) generic type, see socket.h */ short so_options; /* from socket call, see socket.h */ short so_linger; /* time to linger while closing */ short so_state; /* (b) internal state flags SS_* */ int so_qstate; /* (e) internal state flags SQ_* */ void *so_pcb; /* protocol control block */ struct vnet *so_vnet; /* (a) network stack instance */ struct protosw *so_proto; /* (a) protocol handle */ /* * Variables for connection queuing. * Socket where accepts occur is so_head in all subsidiary sockets. * If so_head is 0, socket is not related to an accept. 
* For head socket so_incomp queues partially completed connections, * while so_comp is a queue of connections ready to be accepted. * If a connection is aborted and it has so_head set, then * it has to be pulled out of either so_incomp or so_comp. * We allow connections to queue up based on current queue lengths * and limit on number of queued connections for this socket. */ struct socket *so_head; /* (e) back pointer to listen socket */ TAILQ_HEAD(, socket) so_incomp; /* (e) queue of partial unaccepted connections */ TAILQ_HEAD(, socket) so_comp; /* (e) queue of complete unaccepted connections */ TAILQ_ENTRY(socket) so_list; /* (e) list of unaccepted connections */ u_int so_qlen; /* (e) number of unaccepted connections */ u_int so_incqlen; /* (e) number of unaccepted incomplete connections */ u_int so_qlimit; /* (e) max number queued connections */ short so_timeo; /* (g) connection timeout */ u_short so_error; /* (f) error affecting connection */ struct sigio *so_sigio; /* [sg] information for async I/O or out of band data (SIGURG) */ u_long so_oobmark; /* (c) chars to oob mark */ struct sockbuf so_rcv, so_snd; struct ucred *so_cred; /* (a) user credentials */ struct label *so_label; /* (b) MAC label for socket */ struct label *so_peerlabel; /* (b) cached MAC label for peer */ /* NB: generation count must not be first. */ so_gen_t so_gencnt; /* (h) generation count */ void *so_emuldata; /* (b) private data for emulators */ struct so_accf { struct accept_filter *so_accept_filter; void *so_accept_filter_arg; /* saved filter args */ char *so_accept_filter_str; /* saved user args */ } *so_accf; struct osd osd; /* Object Specific extensions */ /* * so_fibnum, so_user_cookie and friends can be used to attach * some user-specified metadata to a socket, which then can be * used by the kernel for various actions. * so_user_cookie is used by ipfw/dummynet. 
*/ int so_fibnum; /* routing domain for this socket */ uint32_t so_user_cookie; void *so_pspare[2]; /* packet pacing / general use */ int so_ispare[2]; /* packet pacing / general use */ }; /* * Global accept mutex to serialize access to accept queues and * fields associated with multiple sockets. This allows us to * avoid defining a lock order between listen and accept sockets * until such time as it proves to be a good idea. */ extern struct mtx accept_mtx; #define ACCEPT_LOCK_ASSERT() mtx_assert(&accept_mtx, MA_OWNED) #define ACCEPT_UNLOCK_ASSERT() mtx_assert(&accept_mtx, MA_NOTOWNED) #define ACCEPT_LOCK() mtx_lock(&accept_mtx) #define ACCEPT_UNLOCK() mtx_unlock(&accept_mtx) /* * Per-socket mutex: we reuse the receive socket buffer mutex for space * efficiency. This decision should probably be revisited as we optimize * locking for the socket code. */ #define SOCK_MTX(_so) SOCKBUF_MTX(&(_so)->so_rcv) #define SOCK_LOCK(_so) SOCKBUF_LOCK(&(_so)->so_rcv) #define SOCK_OWNED(_so) SOCKBUF_OWNED(&(_so)->so_rcv) #define SOCK_UNLOCK(_so) SOCKBUF_UNLOCK(&(_so)->so_rcv) #define SOCK_LOCK_ASSERT(_so) SOCKBUF_LOCK_ASSERT(&(_so)->so_rcv) /* * Socket state bits stored in so_qstate. */ #define SQ_INCOMP 0x0800 /* unaccepted, incomplete connection */ #define SQ_COMP 0x1000 /* unaccepted, complete connection */ /* * Externalized form of struct socket used by the sysctl(3) interface. */ struct xsocket { size_t xso_len; /* length of this structure */ struct socket *xso_so; /* makes a convenient handle sometimes */ short so_type; short so_options; short so_linger; short so_state; caddr_t so_pcb; /* another convenient handle */ int xso_protocol; int xso_family; u_int so_qlen; u_int so_incqlen; u_int so_qlimit; short so_timeo; u_short so_error; pid_t so_pgid; u_long so_oobmark; struct xsockbuf so_rcv, so_snd; uid_t so_uid; /* XXX */ }; #ifdef _KERNEL /* * Macros for sockets and socket buffering. */ /* * Flags to sblock(). 
*/ #define SBL_WAIT 0x00000001 /* Wait if not immediately available. */ #define SBL_NOINTR 0x00000002 /* Force non-interruptible sleep. */ #define SBL_VALID (SBL_WAIT | SBL_NOINTR) /* * Do we need to notify the other side when I/O is possible? */ #define sb_notify(sb) (((sb)->sb_flags & (SB_WAIT | SB_SEL | SB_ASYNC | \ SB_UPCALL | SB_AIO | SB_KNOTE)) != 0) /* do we have to send all at once on a socket? */ #define sosendallatonce(so) \ ((so)->so_proto->pr_flags & PR_ATOMIC) /* can we read something from so? */ #define soreadabledata(so) \ (sbavail(&(so)->so_rcv) >= (so)->so_rcv.sb_lowat || \ !TAILQ_EMPTY(&(so)->so_comp) || (so)->so_error) #define soreadable(so) \ (soreadabledata(so) || ((so)->so_rcv.sb_state & SBS_CANTRCVMORE)) /* can we write something to so? */ #define sowriteable(so) \ ((sbspace(&(so)->so_snd) >= (so)->so_snd.sb_lowat && \ (((so)->so_state&SS_ISCONNECTED) || \ ((so)->so_proto->pr_flags&PR_CONNREQUIRED)==0)) || \ ((so)->so_snd.sb_state & SBS_CANTSENDMORE) || \ (so)->so_error) /* * soref()/sorele() ref-count the socket structure. Note that you must * still explicitly close the socket, but the last ref count will free * the structure. */ #define soref(so) do { \ SOCK_LOCK_ASSERT(so); \ ++(so)->so_count; \ } while (0) #define sorele(so) do { \ ACCEPT_LOCK_ASSERT(); \ SOCK_LOCK_ASSERT(so); \ if ((so)->so_count <= 0) \ panic("sorele"); \ if (--(so)->so_count == 0) \ sofree(so); \ else { \ SOCK_UNLOCK(so); \ ACCEPT_UNLOCK(); \ } \ } while (0) /* * In sorwakeup() and sowwakeup(), acquire the socket buffer lock to * avoid a non-atomic test-and-wakeup. However, sowakeup is * responsible for releasing the lock if it is called. We unlock only * if we don't call into sowakeup. If any code is introduced that * directly invokes the underlying sowakeup() primitives, it must * maintain the same semantics. 
*/ #define sorwakeup_locked(so) do { \ SOCKBUF_LOCK_ASSERT(&(so)->so_rcv); \ if (sb_notify(&(so)->so_rcv)) \ sowakeup((so), &(so)->so_rcv); \ else \ SOCKBUF_UNLOCK(&(so)->so_rcv); \ } while (0) #define sorwakeup(so) do { \ SOCKBUF_LOCK(&(so)->so_rcv); \ sorwakeup_locked(so); \ } while (0) #define sowwakeup_locked(so) do { \ SOCKBUF_LOCK_ASSERT(&(so)->so_snd); \ if (sb_notify(&(so)->so_snd)) \ sowakeup((so), &(so)->so_snd); \ else \ SOCKBUF_UNLOCK(&(so)->so_snd); \ } while (0) #define sowwakeup(so) do { \ SOCKBUF_LOCK(&(so)->so_snd); \ sowwakeup_locked(so); \ } while (0) struct accept_filter { char accf_name[16]; int (*accf_callback) (struct socket *so, void *arg, int waitflag); void * (*accf_create) (struct socket *so, char *arg); void (*accf_destroy) (struct socket *so); SLIST_ENTRY(accept_filter) accf_next; }; #ifdef MALLOC_DECLARE MALLOC_DECLARE(M_ACCF); MALLOC_DECLARE(M_PCB); MALLOC_DECLARE(M_SONAME); #endif /* * Socket specific helper hook point identifiers * Do not leave holes in the sequence, hook registration is a loop. */ #define HHOOK_SOCKET_OPT 0 #define HHOOK_SOCKET_CREATE 1 #define HHOOK_SOCKET_RCV 2 #define HHOOK_SOCKET_SND 3 #define HHOOK_FILT_SOREAD 4 #define HHOOK_FILT_SOWRITE 5 #define HHOOK_SOCKET_CLOSE 6 #define HHOOK_SOCKET_LAST HHOOK_SOCKET_CLOSE struct socket_hhook_data { struct socket *so; struct mbuf *m; void *hctx; /* hook point specific data*/ int status; }; extern int maxsockets; extern u_long sb_max; extern so_gen_t so_gencnt; struct file; +struct filecaps; struct filedesc; struct mbuf; struct sockaddr; struct ucred; struct uio; /* 'which' values for socket upcalls. */ #define SO_RCV 1 #define SO_SND 2 /* Return values for socket upcalls. 
*/ #define SU_OK 0 #define SU_ISCONNECTED 1 /* * From uipc_socket and friends */ int getsockaddr(struct sockaddr **namp, caddr_t uaddr, size_t len); int getsock_cap(struct thread *td, int fd, cap_rights_t *rightsp, - struct file **fpp, u_int *fflagp); + struct file **fpp, u_int *fflagp, struct filecaps *havecaps); void soabort(struct socket *so); int soaccept(struct socket *so, struct sockaddr **nam); void soaio_enqueue(struct task *task); void soaio_rcv(void *context, int pending); void soaio_snd(void *context, int pending); int socheckuid(struct socket *so, uid_t uid); int sobind(struct socket *so, struct sockaddr *nam, struct thread *td); int sobindat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td); int soclose(struct socket *so); int soconnect(struct socket *so, struct sockaddr *nam, struct thread *td); int soconnectat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td); int soconnect2(struct socket *so1, struct socket *so2); int socreate(int dom, struct socket **aso, int type, int proto, struct ucred *cred, struct thread *td); int sodisconnect(struct socket *so); struct sockaddr *sodupsockaddr(const struct sockaddr *sa, int mflags); void sofree(struct socket *so); void sohasoutofband(struct socket *so); int solisten(struct socket *so, int backlog, struct thread *td); void solisten_proto(struct socket *so, int backlog); int solisten_proto_check(struct socket *so); struct socket * sonewconn(struct socket *head, int connstatus); int sopoll(struct socket *so, int events, struct ucred *active_cred, struct thread *td); int sopoll_generic(struct socket *so, int events, struct ucred *active_cred, struct thread *td); int soreceive(struct socket *so, struct sockaddr **paddr, struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp); int soreceive_stream(struct socket *so, struct sockaddr **paddr, struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp); int soreceive_dgram(struct socket *so, struct 
sockaddr **paddr, struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp); int soreceive_generic(struct socket *so, struct sockaddr **paddr, struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp); int soreserve(struct socket *so, u_long sndcc, u_long rcvcc); void sorflush(struct socket *so); int sosend(struct socket *so, struct sockaddr *addr, struct uio *uio, struct mbuf *top, struct mbuf *control, int flags, struct thread *td); int sosend_dgram(struct socket *so, struct sockaddr *addr, struct uio *uio, struct mbuf *top, struct mbuf *control, int flags, struct thread *td); int sosend_generic(struct socket *so, struct sockaddr *addr, struct uio *uio, struct mbuf *top, struct mbuf *control, int flags, struct thread *td); int soshutdown(struct socket *so, int how); void sotoxsocket(struct socket *so, struct xsocket *xso); void soupcall_clear(struct socket *so, int which); void soupcall_set(struct socket *so, int which, int (*func)(struct socket *, void *, int), void *arg); void sowakeup(struct socket *so, struct sockbuf *sb); void sowakeup_aio(struct socket *so, struct sockbuf *sb); int selsocket(struct socket *so, int events, struct timeval *tv, struct thread *td); /* * Accept filter functions (duh). */ int accept_filt_add(struct accept_filter *filt); int accept_filt_del(char *name); struct accept_filter *accept_filt_get(char *name); #ifdef ACCEPT_FILTER_MOD #ifdef SYSCTL_DECL SYSCTL_DECL(_net_inet_accf); #endif int accept_filt_generic_mod_event(module_t mod, int event, void *data); #endif #endif /* _KERNEL */ #endif /* !_SYS_SOCKETVAR_H_ */