diff --git a/lib/libsys/getsockopt.2 b/lib/libsys/getsockopt.2 index a74bf3a4685e..ca826528ff2f 100644 --- a/lib/libsys/getsockopt.2 +++ b/lib/libsys/getsockopt.2 @@ -1,622 +1,682 @@ .\" Copyright (c) 1983, 1991, 1993 .\" The Regents of the University of California. All rights reserved. .\" .\" Redistribution and use in source and binary forms, with or without .\" modification, are permitted provided that the following conditions .\" are met: .\" 1. Redistributions of source code must retain the above copyright .\" notice, this list of conditions and the following disclaimer. .\" 2. Redistributions in binary form must reproduce the above copyright .\" notice, this list of conditions and the following disclaimer in the .\" documentation and/or other materials provided with the distribution. .\" 3. Neither the name of the University nor the names of its contributors .\" may be used to endorse or promote products derived from this software .\" without specific prior written permission. .\" .\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE .\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE .\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE. .\" -.Dd February 8, 2021 +.Dd July 8, 2024 .Dt GETSOCKOPT 2 .Os .Sh NAME .Nm getsockopt , .Nm setsockopt .Nd get and set options on sockets .Sh LIBRARY .Lb libc .Sh SYNOPSIS .In sys/types.h .In sys/socket.h .Ft int .Fn getsockopt "int s" "int level" "int optname" "void * restrict optval" "socklen_t * restrict optlen" .Ft int .Fn setsockopt "int s" "int level" "int optname" "const void *optval" "socklen_t optlen" .Sh DESCRIPTION The .Fn getsockopt and .Fn setsockopt system calls manipulate the .Em options associated with a socket. Options may exist at multiple protocol levels; they are always present at the uppermost .Dq socket level. .Pp When manipulating socket options the level at which the option resides and the name of the option must be specified. To manipulate options at the socket level, .Fa level is specified as .Dv SOL_SOCKET . To manipulate options at any other level the protocol number of the appropriate protocol controlling the option is supplied. For example, to indicate that an option is to be interpreted by the .Tn TCP protocol, .Fa level should be set to the protocol number of .Tn TCP ; see .Xr getprotoent 3 . .Pp The .Fa optval and .Fa optlen arguments are used to access option values for .Fn setsockopt . For .Fn getsockopt they identify a buffer in which the value for the requested option(s) are to be returned. For .Fn getsockopt , .Fa optlen is a value-result argument, initially containing the size of the buffer pointed to by .Fa optval , and modified on return to indicate the actual size of the value returned. If no option value is to be supplied or returned, .Fa optval may be NULL. .Pp The .Fa optname argument and any specified options are passed uninterpreted to the appropriate protocol module for interpretation. The include file .In sys/socket.h contains definitions for socket level options, described below. Options at other protocol levels vary in format and name; consult the appropriate entries in section 4 of the manual. .Pp Most socket-level options utilize an .Vt int argument for .Fa optval . For .Fn setsockopt , the argument should be non-zero to enable a boolean option, or zero if the option is to be disabled. .Dv SO_LINGER uses a .Vt "struct linger" argument, defined in .In sys/socket.h , which specifies the desired state of the option and the linger interval (see below). .Dv SO_SNDTIMEO and .Dv SO_RCVTIMEO use a .Vt "struct timeval" argument, defined in .In sys/time.h . .Pp The following options are recognized at the socket level. For protocol-specific options, see protocol manual pages, e.g. .Xr ip 4 or .Xr tcp 4 . Except as noted, each may be examined with .Fn getsockopt and set with .Fn setsockopt . .Bl -column SO_ACCEPTFILTER -offset indent .It Dv SO_DEBUG Ta "enables recording of debugging information" .It Dv SO_REUSEADDR Ta "enables local address reuse" .It Dv SO_REUSEPORT Ta "enables duplicate address and port bindings" .It Dv SO_REUSEPORT_LB Ta "enables duplicate address and port bindings with load balancing" .It Dv SO_KEEPALIVE Ta "enables keep connections alive" .It Dv SO_DONTROUTE Ta "enables routing bypass for outgoing messages" .It Dv SO_LINGER Ta "linger on close if data present" .It Dv SO_BROADCAST Ta "enables permission to transmit broadcast messages" .It Dv SO_OOBINLINE Ta "enables reception of out-of-band data in band" .It Dv SO_SNDBUF Ta "set buffer size for output" .It Dv SO_RCVBUF Ta "set buffer size for input" .It Dv SO_SNDLOWAT Ta "set minimum count for output" .It Dv SO_RCVLOWAT Ta "set minimum count for input" .It Dv SO_SNDTIMEO Ta "set timeout value for output" .It Dv SO_RCVTIMEO Ta "set timeout value for input" .It Dv SO_ACCEPTFILTER Ta "set accept filter on listening socket" .It Dv SO_NOSIGPIPE Ta controls generation of .Dv SIGPIPE for the socket .It Dv SO_TIMESTAMP Ta "enables reception of a timestamp with datagrams" .It Dv SO_BINTIME Ta "enables reception of a timestamp with datagrams" .It Dv SO_ACCEPTCONN Ta "get listening status of the socket (get only)" .It Dv SO_DOMAIN Ta "get the domain of the socket (get only)" .It Dv SO_TYPE Ta "get the type of the socket (get only)" .It Dv SO_PROTOCOL Ta "get the protocol number for the socket (get only)" .It Dv SO_PROTOTYPE Ta "SunOS alias for the Linux SO_PROTOCOL (get only)" .It Dv SO_ERROR Ta "get and clear error on the socket (get only)" .It Dv SO_RERROR Ta "enables receive error reporting" .It Dv SO_SETFIB Ta "set the associated FIB (routing table) for the socket (set only)" .El .Pp The following options are recognized in .Fx : .Bl -column SO_LISTENINCQLEN -offset indent .It Dv SO_LABEL Ta "get MAC label of the socket (get only)" .It Dv SO_PEERLABEL Ta "get socket's peer's MAC label (get only)" .It Dv SO_LISTENQLIMIT Ta "get backlog limit of the socket (get only)" .It Dv SO_LISTENQLEN Ta "get complete queue length of the socket (get only)" .It Dv SO_LISTENINCQLEN Ta "get incomplete queue length of the socket (get only)" .It Dv SO_USER_COOKIE Ta "set the 'so_user_cookie' value for the socket (uint32_t, set only)" .It Dv SO_TS_CLOCK Ta "set specific format of timestamp returned by SO_TIMESTAMP" .It Dv SO_MAX_PACING_RATE Ta "set the maximum transmit rate in bytes per second for the socket" .It Dv SO_NO_OFFLOAD Ta "disables protocol offloads" .It Dv SO_NO_DDP Ta "disables direct data placement offload" +.It Dv SO_SPLICE Ta "splice two sockets together" .El .Pp .Dv SO_DEBUG enables debugging in the underlying protocol modules. .Pp .Dv SO_REUSEADDR indicates that the rules used in validating addresses supplied in a .Xr bind 2 system call should allow reuse of local addresses. .Pp .Dv SO_REUSEPORT allows completely duplicate bindings by multiple processes if they all set .Dv SO_REUSEPORT before binding the port. This option permits multiple instances of a program to each receive UDP/IP multicast or broadcast datagrams destined for the bound port. .Pp .Dv SO_REUSEPORT_LB allows completely duplicate bindings by multiple sockets if they all set .Dv SO_REUSEPORT_LB before binding the port. Incoming TCP and UDP connections are distributed among the participating listening sockets based on a hash function of local port number, and foreign IP address and port number. A maximum of 256 sockets can be bound to the same load-balancing group. .Pp .Dv SO_KEEPALIVE enables the periodic transmission of messages on a connected socket. Should the connected party fail to respond to these messages, the connection is considered broken and processes using the socket are notified via a .Dv SIGPIPE signal when attempting to send data. .Pp .Dv SO_DONTROUTE indicates that outgoing messages should bypass the standard routing facilities. Instead, messages are directed to the appropriate network interface according to the network portion of the destination address. .Pp .Dv SO_LINGER controls the action taken when unsent messages are queued on socket and a .Xr close 2 is performed. If the socket promises reliable delivery of data and .Dv SO_LINGER is set, the system will block the process on the .Xr close 2 attempt until it is able to transmit the data or until it decides it is unable to deliver the information (a timeout period, termed the linger interval, is specified in seconds in the .Fn setsockopt system call when .Dv SO_LINGER is requested). If .Dv SO_LINGER is disabled and a .Xr close 2 is issued, the system will process the close in a manner that allows the process to continue as quickly as possible. .Pp The option .Dv SO_BROADCAST requests permission to send broadcast datagrams on the socket. Broadcast was a privileged operation in earlier versions of the system. .Pp With protocols that support out-of-band data, the .Dv SO_OOBINLINE option requests that out-of-band data be placed in the normal data input queue as received; it will then be accessible with .Xr recv 2 or .Xr read 2 calls without the .Dv MSG_OOB flag. Some protocols always behave as if this option is set. .Pp .Dv SO_SNDBUF and .Dv SO_RCVBUF are options to adjust the normal buffer sizes allocated for output and input buffers, respectively. The buffer size may be increased for high-volume connections, or may be decreased to limit the possible backlog of incoming data. The system places an absolute maximum on these values, which is accessible through the .Xr sysctl 3 MIB variable .Dq Li kern.ipc.maxsockbuf . .Pp .Dv SO_SNDLOWAT is an option to set the minimum count for output operations. Most output operations process all of the data supplied by the call, delivering data to the protocol for transmission and blocking as necessary for flow control. Nonblocking output operations will process as much data as permitted subject to flow control without blocking, but will process no data if flow control does not allow the smaller of the low water mark value or the entire request to be processed. A .Xr select 2 operation testing the ability to write to a socket will return true only if the low water mark amount could be processed. The default value for .Dv SO_SNDLOWAT is set to a convenient size for network efficiency, often 1024. .Pp .Dv SO_RCVLOWAT is an option to set the minimum count for input operations. In general, receive calls will block until any (non-zero) amount of data is received, then return with the smaller of the amount available or the amount requested. The default value for .Dv SO_RCVLOWAT is 1. If .Dv SO_RCVLOWAT is set to a larger value, blocking receive calls normally wait until they have received the smaller of the low water mark value or the requested amount. Receive calls may still return less than the low water mark if an error occurs, a signal is caught, or the type of data next in the receive queue is different from that which was returned. .Pp .Dv SO_SNDTIMEO is an option to set a timeout value for output operations. It accepts a .Vt "struct timeval" argument with the number of seconds and microseconds used to limit waits for output operations to complete. If a send operation has blocked for this much time, it returns with a partial count or with the error .Er EWOULDBLOCK if no data were sent. In the current implementation, this timer is restarted each time additional data are delivered to the protocol, implying that the limit applies to output portions ranging in size from the low water mark to the high water mark for output. .Pp .Dv SO_RCVTIMEO is an option to set a timeout value for input operations. It accepts a .Vt "struct timeval" argument with the number of seconds and microseconds used to limit waits for input operations to complete. In the current implementation, this timer is restarted each time additional data are received by the protocol, and thus the limit is in effect an inactivity timer. If a receive operation has been blocked for this much time without receiving additional data, it returns with a short count or with the error .Er EWOULDBLOCK if no data were received. .Pp .Dv SO_SETFIB can be used to over-ride the default FIB (routing table) for the given socket. The value must be from 0 to one less than the number returned from the sysctl .Em net.fibs . .Pp .Dv SO_USER_COOKIE can be used to set the uint32_t so_user_cookie field in the socket. The value is an uint32_t, and can be used in the kernel code that manipulates traffic related to the socket. The default value for the field is 0. As an example, the value can be used as the skipto target or pipe number in .Nm ipfw/dummynet . .Pp .Dv SO_ACCEPTFILTER places an .Xr accept_filter 9 on the socket, which will filter incoming connections on a listening stream socket before being presented for .Xr accept 2 . Once more, .Xr listen 2 must be called on the socket before trying to install the filter on it, or else the .Fn setsockopt system call will fail. .Bd -literal struct accept_filter_arg { char af_name[16]; char af_arg[256-16]; }; .Ed .Pp The .Fa optval argument should point to a .Fa struct accept_filter_arg that will select and configure the .Xr accept_filter 9 . The .Fa af_name argument should be filled with the name of the accept filter that the application wishes to place on the listening socket. The optional argument .Fa af_arg can be passed to the accept filter specified by .Fa af_name to provide additional configuration options at attach time. Passing in an .Fa optval of NULL will remove the filter. .Pp The .Dv SO_NOSIGPIPE option controls generation of the .Dv SIGPIPE signal normally sent when writing to a connected socket where the other end has been closed returns with the error .Er EPIPE . .Pp If the .Dv SO_TIMESTAMP or .Dv SO_BINTIME option is enabled on a .Dv SOCK_DGRAM socket, the .Xr recvmsg 2 call may return a timestamp corresponding to when the datagram was received. However, it may not, for example due to a resource shortage. The .Va msg_control field in the .Vt msghdr structure points to a buffer that contains a .Vt cmsghdr structure followed by a .Vt "struct timeval" for .Dv SO_TIMESTAMP and .Vt "struct bintime" for .Dv SO_BINTIME . The .Vt cmsghdr fields have the following values for TIMESTAMP by default: .Bd -literal cmsg_len = CMSG_LEN(sizeof(struct timeval)); cmsg_level = SOL_SOCKET; cmsg_type = SCM_TIMESTAMP; .Ed .Pp and for .Dv SO_BINTIME : .Bd -literal cmsg_len = CMSG_LEN(sizeof(struct bintime)); cmsg_level = SOL_SOCKET; cmsg_type = SCM_BINTIME; .Ed .Pp Additional timestamp types are available by following .Dv SO_TIMESTAMP with .Dv SO_TS_CLOCK , which requests a specific timestamp format to be returned instead of .Dv SCM_TIMESTAMP when .Dv SO_TIMESTAMP is enabled. These .Dv SO_TS_CLOCK values are recognized in .Fx : .Bl -column SO_TS_CLOCK -offset indent .It Dv SO_TS_REALTIME_MICRO Ta "realtime (SCM_TIMESTAMP, struct timeval), default" .It Dv SO_TS_BINTIME Ta "realtime (SCM_BINTIME, struct bintime)" .It Dv SO_TS_REALTIME Ta "realtime (SCM_REALTIME, struct timespec)" .It Dv SO_TS_MONOTONIC Ta "monotonic time (SCM_MONOTONIC, struct timespec)" .El .Pp .Dv SO_ACCEPTCONN , .Dv SO_TYPE , .Dv SO_PROTOCOL (and its alias .Dv SO_PROTOTYPE ) and .Dv SO_ERROR are options used only with .Fn getsockopt . .Dv SO_ACCEPTCONN returns whether the socket is currently accepting connections, that is, whether or not the .Xr listen 2 system call was invoked on the socket. .Dv SO_TYPE returns the type of the socket, such as .Dv SOCK_STREAM ; it is useful for servers that inherit sockets on startup. .Dv SO_PROTOCOL returns the protocol number for the socket, for .Dv AF_INET and .Dv AF_INET6 address families. .Dv SO_ERROR returns any pending error on the socket and clears the error status. It may be used to check for asynchronous errors on connected datagram sockets or for other asynchronous errors. .Dv SO_RERROR indicates that receive buffer overflows should be handled as errors. Historically receive buffer overflows have been ignored and programs could not tell if they missed messages or messages had been truncated because of overflows. Since programs historically do not expect to get receive overflow errors, this behavior is not the default. .Pp .Dv SO_LABEL returns the MAC label of the socket. .Dv SO_PEERLABEL returns the MAC label of the socket's peer. Note that your kernel must be compiled with MAC support. See .Xr mac 3 for more information. .Pp .Dv SO_LISTENQLIMIT returns the maximal number of queued connections, as set by .Xr listen 2 . .Dv SO_LISTENQLEN returns the number of unaccepted complete connections. .Dv SO_LISTENINCQLEN returns the number of unaccepted incomplete connections. .Pp .Dv SO_MAX_PACING_RATE instruct the socket and underlying network adapter layers to limit the transfer rate to the given unsigned 32-bit value in bytes per second. .Pp .Dv SO_NO_OFFLOAD disables support for protocol offloads. At present, this prevents TCP sockets from using TCP offload engines. .Dv SO_NO_DDP disables support for a specific TCP offload known as direct data placement (DDP). DDP is an offload supported by Chelsio network adapters that permits reassembled TCP data streams to be received via zero-copy in user-supplied buffers using .Xr aio_read 2 . +.Pp +.Dv SO_SPLICE , +when passed to +.Fn setsockopt , +splices two sockets together using the following +.Fa optval : +.Bd -literal +struct so_splice { + int sp_fd; + off_t sp_max; + struct timeval sp_idle; +}; +.Ed +.Pp +Data received on +.Fa s +will automatically be transmitted from the socket specified in +.Fa sp_fd +without any intervention by userspace. +Splicing is a one-way operation; a given pair of sockets may be +spliced in one or both directions. +Currently only connected +.Xr tcp 4 +sockets may be spliced together. +If +.Fa sp_max +is greater than zero, the socket pair will automatically be unspliced +once that number of bytes have been transmitted. +If +.Fa sp_idle +is non-zero, the socket pair will automatically be unspliced once the +specified amount of time has elapsed since the initial call to +.Fn setsockopt . +If +.Fa sp_fd +is -1, the socket will be unspliced immediately. +.Pp +When passed to +.Fn getsockopt , +the +.Dv SO_SPLICE +option returns a 64-bit integer containing the number of bytes transmitted by +the most recent splice. +That is, while the socket is spliced, the value returned will be the number +of bytes spliced so far. +When unsplicing, this value is saved and is returned until the socket is closed +or spliced again. +For example, if a splice transmits 100 bytes and is then unspliced, a subsequent +.Nm getsockopt +call will return 100 until the socket is spliced again. .Sh RETURN VALUES .Rv -std .Sh ERRORS The .Fn getsockopt and .Fn setsockopt system calls succeed unless: .Bl -tag -width Er .It Bq Er EBADF The argument .Fa s is not a valid descriptor. .It Bq Er ENOTSOCK The argument .Fa s is a file, not a socket. .It Bq Er ENOPROTOOPT The option is unknown at the level indicated. .It Bq Er EFAULT The address pointed to by .Fa optval is not in a valid part of the process address space. For .Fn getsockopt , this error may also be returned if .Fa optlen is not in a valid part of the process address space. .It Bq Er EINVAL Installing an .Xr accept_filter 9 on a non-listening socket was attempted. .It Bq Er ENOMEM A memory allocation failed that was required to service the request. .El .Pp The .Fn setsockopt system call may also return the following error: .Bl -tag -width Er .It Bq Er ENOBUFS Insufficient resources were available in the system to perform the operation. .El .Sh SEE ALSO .Xr ioctl 2 , .Xr listen 2 , .Xr recvmsg 2 , .Xr socket 2 , .Xr getprotoent 3 , .Xr mac 3 , .Xr sysctl 3 , .Xr ip 4 , .Xr ip6 4 , .Xr sctp 4 , .Xr tcp 4 , .Xr protocols 5 , .Xr sysctl 8 , .Xr accept_filter 9 , .Xr bintime 9 .Sh HISTORY The .Fn getsockopt and .Fn setsockopt system calls appeared in .Bx 4.2 . +The +.Dv SO_SPLICE +option originated in +.Ox 4.9 +and first appeared in +.Fx 15.0 . +The +.Fx +implementation aims to be source-compatible. .Sh BUGS Several of the socket options should be handled at lower levels of the system. diff --git a/sys/kern/uipc_sockbuf.c b/sys/kern/uipc_sockbuf.c index 6d3050596f23..95c7241d5f13 100644 --- a/sys/kern/uipc_sockbuf.c +++ b/sys/kern/uipc_sockbuf.c @@ -1,1876 +1,1906 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1988, 1990, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include "opt_kern_tls.h" #include "opt_param.h" #include #include /* for aio_swake proto */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Function pointer set by the AIO routines so that the socket buffer code * can call back into the AIO module if it is loaded. */ void (*aio_swake)(struct socket *, struct sockbuf *); /* * Primitive routines for operating on socket buffers */ #define BUF_MAX_ADJ(_sz) (((u_quad_t)(_sz)) * MCLBYTES / (MSIZE + MCLBYTES)) u_long sb_max = SB_MAX; u_long sb_max_adj = BUF_MAX_ADJ(SB_MAX); static u_long sb_efficiency = 8; /* parameter for sbreserve() */ #ifdef KERN_TLS static void sbcompress_ktls_rx(struct sockbuf *sb, struct mbuf *m, struct mbuf *n); #endif static struct mbuf *sbcut_internal(struct sockbuf *sb, int len); static void sbflush_internal(struct sockbuf *sb); /* * Our own version of m_clrprotoflags(), that can preserve M_NOTREADY. */ static void sbm_clrprotoflags(struct mbuf *m, int flags) { int mask; mask = ~M_PROTOFLAGS; if (flags & PRUS_NOTREADY) mask |= M_NOTREADY; while (m) { m->m_flags &= mask; m = m->m_next; } } /* * Compress M_NOTREADY mbufs after they have been readied by sbready(). * * sbcompress() skips M_NOTREADY mbufs since the data is not available to * be copied at the time of sbcompress(). This function combines small * mbufs similar to sbcompress() once mbufs are ready. 'm0' is the first * mbuf sbready() marked ready, and 'end' is the first mbuf still not * ready. */ static void sbready_compress(struct sockbuf *sb, struct mbuf *m0, struct mbuf *end) { struct mbuf *m, *n; int ext_size; SOCKBUF_LOCK_ASSERT(sb); if ((sb->sb_flags & SB_NOCOALESCE) != 0) return; for (m = m0; m != end; m = m->m_next) { MPASS((m->m_flags & M_NOTREADY) == 0); /* * NB: In sbcompress(), 'n' is the last mbuf in the * socket buffer and 'm' is the new mbuf being copied * into the trailing space of 'n'. Here, the roles * are reversed and 'n' is the next mbuf after 'm' * that is being copied into the trailing space of * 'm'. */ n = m->m_next; #ifdef KERN_TLS /* Try to coalesce adjacent ktls mbuf hdr/trailers. */ if ((n != NULL) && (n != end) && (m->m_flags & M_EOR) == 0 && (m->m_flags & M_EXTPG) && (n->m_flags & M_EXTPG) && !mbuf_has_tls_session(m) && !mbuf_has_tls_session(n)) { int hdr_len, trail_len; hdr_len = n->m_epg_hdrlen; trail_len = m->m_epg_trllen; if (trail_len != 0 && hdr_len != 0 && trail_len + hdr_len <= MBUF_PEXT_TRAIL_LEN) { /* copy n's header to m's trailer */ memcpy(&m->m_epg_trail[trail_len], n->m_epg_hdr, hdr_len); m->m_epg_trllen += hdr_len; m->m_len += hdr_len; n->m_epg_hdrlen = 0; n->m_len -= hdr_len; } } #endif /* Compress small unmapped mbufs into plain mbufs. */ if ((m->m_flags & M_EXTPG) && m->m_len <= MLEN && !mbuf_has_tls_session(m)) { ext_size = m->m_ext.ext_size; if (mb_unmapped_compress(m) == 0) sb->sb_mbcnt -= ext_size; } while ((n != NULL) && (n != end) && (m->m_flags & M_EOR) == 0 && M_WRITABLE(m) && (m->m_flags & M_EXTPG) == 0 && !mbuf_has_tls_session(n) && !mbuf_has_tls_session(m) && n->m_len <= MCLBYTES / 4 && /* XXX: Don't copy too much */ n->m_len <= M_TRAILINGSPACE(m) && m->m_type == n->m_type) { KASSERT(sb->sb_lastrecord != n, ("%s: merging start of record (%p) into previous mbuf (%p)", __func__, n, m)); m_copydata(n, 0, n->m_len, mtodo(m, m->m_len)); m->m_len += n->m_len; m->m_next = n->m_next; m->m_flags |= n->m_flags & M_EOR; if (sb->sb_mbtail == n) sb->sb_mbtail = m; sb->sb_mbcnt -= MSIZE; if (n->m_flags & M_EXT) sb->sb_mbcnt -= n->m_ext.ext_size; m_free(n); n = m->m_next; } } SBLASTRECORDCHK(sb); SBLASTMBUFCHK(sb); } /* * Mark ready "count" units of I/O starting with "m". Most mbufs * count as a single unit of I/O except for M_EXTPG mbufs which * are backed by multiple pages. */ int sbready(struct sockbuf *sb, struct mbuf *m0, int count) { struct mbuf *m; u_int blocker; SOCKBUF_LOCK_ASSERT(sb); KASSERT(sb->sb_fnrdy != NULL, ("%s: sb %p NULL fnrdy", __func__, sb)); KASSERT(count > 0, ("%s: invalid count %d", __func__, count)); m = m0; blocker = (sb->sb_fnrdy == m) ? M_BLOCKED : 0; while (count > 0) { KASSERT(m->m_flags & M_NOTREADY, ("%s: m %p !M_NOTREADY", __func__, m)); if ((m->m_flags & M_EXTPG) != 0 && m->m_epg_npgs != 0) { if (count < m->m_epg_nrdy) { m->m_epg_nrdy -= count; count = 0; break; } count -= m->m_epg_nrdy; m->m_epg_nrdy = 0; } else count--; m->m_flags &= ~(M_NOTREADY | blocker); if (blocker) sb->sb_acc += m->m_len; m = m->m_next; } /* * If the first mbuf is still not fully ready because only * some of its backing pages were readied, no further progress * can be made. */ if (m0 == m) { MPASS(m->m_flags & M_NOTREADY); return (EINPROGRESS); } if (!blocker) { sbready_compress(sb, m0, m); return (EINPROGRESS); } /* This one was blocking all the queue. */ for (; m && (m->m_flags & M_NOTREADY) == 0; m = m->m_next) { KASSERT(m->m_flags & M_BLOCKED, ("%s: m %p !M_BLOCKED", __func__, m)); m->m_flags &= ~M_BLOCKED; sb->sb_acc += m->m_len; } sb->sb_fnrdy = m; sbready_compress(sb, m0, m); return (0); } /* * Adjust sockbuf state reflecting allocation of m. */ void sballoc(struct sockbuf *sb, struct mbuf *m) { SOCKBUF_LOCK_ASSERT(sb); sb->sb_ccc += m->m_len; if (sb->sb_fnrdy == NULL) { if (m->m_flags & M_NOTREADY) sb->sb_fnrdy = m; else sb->sb_acc += m->m_len; } else m->m_flags |= M_BLOCKED; if (m->m_type != MT_DATA && m->m_type != MT_OOBDATA) sb->sb_ctl += m->m_len; sb->sb_mbcnt += MSIZE; if (m->m_flags & M_EXT) sb->sb_mbcnt += m->m_ext.ext_size; } /* * Adjust sockbuf state reflecting freeing of m. */ void sbfree(struct sockbuf *sb, struct mbuf *m) { #if 0 /* XXX: not yet: soclose() call path comes here w/o lock. */ SOCKBUF_LOCK_ASSERT(sb); #endif sb->sb_ccc -= m->m_len; if (!(m->m_flags & M_NOTAVAIL)) sb->sb_acc -= m->m_len; if (m == sb->sb_fnrdy) { struct mbuf *n; KASSERT(m->m_flags & M_NOTREADY, ("%s: m %p !M_NOTREADY", __func__, m)); n = m->m_next; while (n != NULL && !(n->m_flags & M_NOTREADY)) { n->m_flags &= ~M_BLOCKED; sb->sb_acc += n->m_len; n = n->m_next; } sb->sb_fnrdy = n; } if (m->m_type != MT_DATA && m->m_type != MT_OOBDATA) sb->sb_ctl -= m->m_len; sb->sb_mbcnt -= MSIZE; if (m->m_flags & M_EXT) sb->sb_mbcnt -= m->m_ext.ext_size; if (sb->sb_sndptr == m) { sb->sb_sndptr = NULL; sb->sb_sndptroff = 0; } if (sb->sb_sndptroff != 0) sb->sb_sndptroff -= m->m_len; } #ifdef KERN_TLS /* * Similar to sballoc/sbfree but does not adjust state associated with * the sb_mb chain such as sb_fnrdy or sb_sndptr*. Also assumes mbufs * are not ready. */ void sballoc_ktls_rx(struct sockbuf *sb, struct mbuf *m) { SOCKBUF_LOCK_ASSERT(sb); sb->sb_ccc += m->m_len; sb->sb_tlscc += m->m_len; sb->sb_mbcnt += MSIZE; if (m->m_flags & M_EXT) sb->sb_mbcnt += m->m_ext.ext_size; } void sbfree_ktls_rx(struct sockbuf *sb, struct mbuf *m) { #if 0 /* XXX: not yet: soclose() call path comes here w/o lock. */ SOCKBUF_LOCK_ASSERT(sb); #endif sb->sb_ccc -= m->m_len; sb->sb_tlscc -= m->m_len; sb->sb_mbcnt -= MSIZE; if (m->m_flags & M_EXT) sb->sb_mbcnt -= m->m_ext.ext_size; } #endif /* * Socantsendmore indicates that no more data will be sent on the socket; it * would normally be applied to a socket when the user informs the system * that no more data is to be sent, by the protocol code (in case * PRU_SHUTDOWN). Socantrcvmore indicates that no more data will be * received, and will normally be applied to the socket by a protocol when it * detects that the peer will send no more data. Data queued for reading in * the socket may yet be read. */ void socantsendmore_locked(struct socket *so) { SOCK_SENDBUF_LOCK_ASSERT(so); so->so_snd.sb_state |= SBS_CANTSENDMORE; sowwakeup_locked(so); SOCK_SENDBUF_UNLOCK_ASSERT(so); } void socantsendmore(struct socket *so) { SOCK_SENDBUF_LOCK(so); socantsendmore_locked(so); SOCK_SENDBUF_UNLOCK_ASSERT(so); } void socantrcvmore_locked(struct socket *so) { SOCK_RECVBUF_LOCK_ASSERT(so); so->so_rcv.sb_state |= SBS_CANTRCVMORE; #ifdef KERN_TLS if (so->so_rcv.sb_flags & SB_TLS_RX) ktls_check_rx(&so->so_rcv); #endif sorwakeup_locked(so); SOCK_RECVBUF_UNLOCK_ASSERT(so); } void socantrcvmore(struct socket *so) { SOCK_RECVBUF_LOCK(so); socantrcvmore_locked(so); SOCK_RECVBUF_UNLOCK_ASSERT(so); } void soroverflow_locked(struct socket *so) { SOCK_RECVBUF_LOCK_ASSERT(so); if (so->so_options & SO_RERROR) { so->so_rerror = ENOBUFS; sorwakeup_locked(so); } else SOCK_RECVBUF_UNLOCK(so); SOCK_RECVBUF_UNLOCK_ASSERT(so); } void soroverflow(struct socket *so) { SOCK_RECVBUF_LOCK(so); soroverflow_locked(so); SOCK_RECVBUF_UNLOCK_ASSERT(so); } /* * Wait for data to arrive at/drain from a socket buffer. */ int sbwait(struct socket *so, sb_which which) { struct sockbuf *sb; SOCK_BUF_LOCK_ASSERT(so, which); sb = sobuf(so, which); sb->sb_flags |= SB_WAIT; return (msleep_sbt(&sb->sb_acc, soeventmtx(so, which), (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH, "sbwait", sb->sb_timeo, 0, 0)); } /* * Wakeup processes waiting on a socket buffer. Do asynchronous notification * via SIGIO if the socket has the SS_ASYNC flag set. * * Called with the socket buffer lock held; will release the lock by the end * of the function. This allows the caller to acquire the socket buffer lock * while testing for the need for various sorts of wakeup and hold it through * to the point where it's no longer required. We currently hold the lock * through calls out to other subsystems (with the exception of kqueue), and * then release it to avoid lock order issues. It's not clear that's * correct. */ static __always_inline void sowakeup(struct socket *so, const sb_which which) { struct sockbuf *sb; int ret; SOCK_BUF_LOCK_ASSERT(so, which); sb = sobuf(so, which); selwakeuppri(sb->sb_sel, PSOCK); if (!SEL_WAITING(sb->sb_sel)) sb->sb_flags &= ~SB_SEL; if (sb->sb_flags & SB_WAIT) { sb->sb_flags &= ~SB_WAIT; wakeup(&sb->sb_acc); } KNOTE_LOCKED(&sb->sb_sel->si_note, 0); if (sb->sb_upcall != NULL) { ret = sb->sb_upcall(so, sb->sb_upcallarg, M_NOWAIT); if (ret == SU_ISCONNECTED) { KASSERT(sb == &so->so_rcv, ("SO_SND upcall returned SU_ISCONNECTED")); soupcall_clear(so, SO_RCV); } } else ret = SU_OK; if (sb->sb_flags & SB_AIO) sowakeup_aio(so, which); SOCK_BUF_UNLOCK(so, which); if (ret == SU_ISCONNECTED) soisconnected(so); if ((so->so_state & SS_ASYNC) && so->so_sigio != NULL) pgsigio(&so->so_sigio, SIGIO, 0); SOCK_BUF_UNLOCK_ASSERT(so, which); } +static void +splice_push(struct socket *so) +{ + struct so_splice *sp; + + SOCK_RECVBUF_LOCK_ASSERT(so); + + sp = so->so_splice; + mtx_lock(&sp->mtx); + SOCK_RECVBUF_UNLOCK(so); + so_splice_dispatch(sp); +} + +static void +splice_pull(struct socket *so) +{ + struct so_splice *sp; + + SOCK_SENDBUF_LOCK_ASSERT(so); + + sp = so->so_splice_back; + mtx_lock(&sp->mtx); + SOCK_SENDBUF_UNLOCK(so); + so_splice_dispatch(sp); +} + /* * Do we need to notify the other side when I/O is possible? */ static __always_inline bool sb_notify(const struct sockbuf *sb) { return ((sb->sb_flags & (SB_WAIT | SB_SEL | SB_ASYNC | SB_UPCALL | SB_AIO | SB_KNOTE)) != 0); } void sorwakeup_locked(struct socket *so) { SOCK_RECVBUF_LOCK_ASSERT(so); - if (sb_notify(&so->so_rcv)) + if (so->so_rcv.sb_flags & SB_SPLICED) + splice_push(so); + else if (sb_notify(&so->so_rcv)) sowakeup(so, SO_RCV); else SOCK_RECVBUF_UNLOCK(so); } void sowwakeup_locked(struct socket *so) { SOCK_SENDBUF_LOCK_ASSERT(so); - if (sb_notify(&so->so_snd)) + if (so->so_snd.sb_flags & SB_SPLICED) + splice_pull(so); + else if (sb_notify(&so->so_snd)) sowakeup(so, SO_SND); else SOCK_SENDBUF_UNLOCK(so); } /* * Socket buffer (struct sockbuf) utility routines. * * Each socket contains two socket buffers: one for sending data and one for * receiving data. Each buffer contains a queue of mbufs, information about * the number of mbufs and amount of data in the queue, and other fields * allowing select() statements and notification on data availability to be * implemented. * * Data stored in a socket buffer is maintained as a list of records. Each * record is a list of mbufs chained together with the m_next field. Records * are chained together with the m_nextpkt field. The upper level routine * soreceive() expects the following conventions to be observed when placing * information in the receive buffer: * * 1. If the protocol requires each message be preceded by the sender's name, * then a record containing that name must be present before any * associated data (mbuf's must be of type MT_SONAME). * 2. If the protocol supports the exchange of ``access rights'' (really just * additional data associated with the message), and there are ``rights'' * to be received, then a record containing this data should be present * (mbuf's must be of type MT_RIGHTS). * 3. If a name or rights record exists, then it must be followed by a data * record, perhaps of zero length. * * Before using a new socket structure it is first necessary to reserve * buffer space to the socket, by calling sbreserve(). This should commit * some of the available buffer space in the system buffer pool for the * socket (currently, it does nothing but enforce limits). The space should * be released by calling sbrelease() when the socket is destroyed. */ int soreserve(struct socket *so, u_long sndcc, u_long rcvcc) { struct thread *td = curthread; SOCK_SENDBUF_LOCK(so); SOCK_RECVBUF_LOCK(so); if (sbreserve_locked(so, SO_SND, sndcc, td) == 0) goto bad; if (sbreserve_locked(so, SO_RCV, rcvcc, td) == 0) goto bad2; if (so->so_rcv.sb_lowat == 0) so->so_rcv.sb_lowat = 1; if (so->so_snd.sb_lowat == 0) so->so_snd.sb_lowat = MCLBYTES; if (so->so_snd.sb_lowat > so->so_snd.sb_hiwat) so->so_snd.sb_lowat = so->so_snd.sb_hiwat; SOCK_RECVBUF_UNLOCK(so); SOCK_SENDBUF_UNLOCK(so); return (0); bad2: sbrelease_locked(so, SO_SND); bad: SOCK_RECVBUF_UNLOCK(so); SOCK_SENDBUF_UNLOCK(so); return (ENOBUFS); } static int sysctl_handle_sb_max(SYSCTL_HANDLER_ARGS) { int error = 0; u_long tmp_sb_max = sb_max; error = sysctl_handle_long(oidp, &tmp_sb_max, arg2, req); if (error || !req->newptr) return (error); if (tmp_sb_max < MSIZE + MCLBYTES) return (EINVAL); sb_max = tmp_sb_max; sb_max_adj = BUF_MAX_ADJ(sb_max); return (0); } /* * Allot mbufs to a sockbuf. Attempt to scale mbmax so that mbcnt doesn't * become limiting if buffering efficiency is near the normal case. */ bool sbreserve_locked_limit(struct socket *so, sb_which which, u_long cc, u_long buf_max, struct thread *td) { struct sockbuf *sb = sobuf(so, which); rlim_t sbsize_limit; SOCK_BUF_LOCK_ASSERT(so, which); /* * When a thread is passed, we take into account the thread's socket * buffer size limit. The caller will generally pass curthread, but * in the TCP input path, NULL will be passed to indicate that no * appropriate thread resource limits are available. In that case, * we don't apply a process limit. */ if (cc > BUF_MAX_ADJ(buf_max)) return (false); if (td != NULL) { sbsize_limit = lim_cur(td, RLIMIT_SBSIZE); } else sbsize_limit = RLIM_INFINITY; if (!chgsbsize(so->so_cred->cr_uidinfo, &sb->sb_hiwat, cc, sbsize_limit)) return (false); sb->sb_mbmax = min(cc * sb_efficiency, buf_max); if (sb->sb_lowat > sb->sb_hiwat) sb->sb_lowat = sb->sb_hiwat; return (true); } bool sbreserve_locked(struct socket *so, sb_which which, u_long cc, struct thread *td) { return (sbreserve_locked_limit(so, which, cc, sb_max, td)); } int sbsetopt(struct socket *so, struct sockopt *sopt) { struct sockbuf *sb; sb_which wh; short *flags; u_int cc, *hiwat, *lowat; int error, optval; error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error != 0) return (error); /* * Values < 1 make no sense for any of these options, * so disallow them. */ if (optval < 1) return (EINVAL); cc = optval; sb = NULL; SOCK_LOCK(so); if (SOLISTENING(so)) { switch (sopt->sopt_name) { case SO_SNDLOWAT: case SO_SNDBUF: lowat = &so->sol_sbsnd_lowat; hiwat = &so->sol_sbsnd_hiwat; flags = &so->sol_sbsnd_flags; break; case SO_RCVLOWAT: case SO_RCVBUF: lowat = &so->sol_sbrcv_lowat; hiwat = &so->sol_sbrcv_hiwat; flags = &so->sol_sbrcv_flags; break; } } else { switch (sopt->sopt_name) { case SO_SNDLOWAT: case SO_SNDBUF: sb = &so->so_snd; wh = SO_SND; break; case SO_RCVLOWAT: case SO_RCVBUF: sb = &so->so_rcv; wh = SO_RCV; break; } flags = &sb->sb_flags; hiwat = &sb->sb_hiwat; lowat = &sb->sb_lowat; SOCK_BUF_LOCK(so, wh); } error = 0; switch (sopt->sopt_name) { case SO_SNDBUF: case SO_RCVBUF: if (SOLISTENING(so)) { if (cc > sb_max_adj) { error = ENOBUFS; break; } *hiwat = cc; if (*lowat > *hiwat) *lowat = *hiwat; } else { if (!sbreserve_locked(so, wh, cc, curthread)) error = ENOBUFS; } if (error == 0) *flags &= ~SB_AUTOSIZE; break; case SO_SNDLOWAT: case SO_RCVLOWAT: /* * Make sure the low-water is never greater than the * high-water. */ *lowat = (cc > *hiwat) ? *hiwat : cc; break; } if (!SOLISTENING(so)) SOCK_BUF_UNLOCK(so, wh); SOCK_UNLOCK(so); return (error); } /* * Free mbufs held by a socket, and reserved mbuf space. */ static void sbrelease_internal(struct socket *so, sb_which which) { struct sockbuf *sb = sobuf(so, which); sbflush_internal(sb); (void)chgsbsize(so->so_cred->cr_uidinfo, &sb->sb_hiwat, 0, RLIM_INFINITY); sb->sb_mbmax = 0; } void sbrelease_locked(struct socket *so, sb_which which) { SOCK_BUF_LOCK_ASSERT(so, which); sbrelease_internal(so, which); } void sbrelease(struct socket *so, sb_which which) { SOCK_BUF_LOCK(so, which); sbrelease_locked(so, which); SOCK_BUF_UNLOCK(so, which); } void sbdestroy(struct socket *so, sb_which which) { #ifdef KERN_TLS struct sockbuf *sb = sobuf(so, which); if (sb->sb_tls_info != NULL) ktls_free(sb->sb_tls_info); sb->sb_tls_info = NULL; #endif sbrelease_internal(so, which); } /* * Routines to add and remove data from an mbuf queue. * * The routines sbappend() or sbappendrecord() are normally called to append * new mbufs to a socket buffer, after checking that adequate space is * available, comparing the function sbspace() with the amount of data to be * added. sbappendrecord() differs from sbappend() in that data supplied is * treated as the beginning of a new record. To place a sender's address, * optional access rights, and data in a socket receive buffer, * sbappendaddr() should be used. To place access rights and data in a * socket receive buffer, sbappendrights() should be used. In either case, * the new data begins a new record. Note that unlike sbappend() and * sbappendrecord(), these routines check for the caller that there will be * enough space to store the data. Each fails if there is not enough space, * or if it cannot find mbufs to store additional information in. * * Reliable protocols may use the socket send buffer to hold data awaiting * acknowledgement. Data is normally copied from a socket send buffer in a * protocol with m_copy for output to a peer, and then removing the data from * the socket buffer with sbdrop() or sbdroprecord() when the data is * acknowledged by the peer. */ #ifdef SOCKBUF_DEBUG void sblastrecordchk(struct sockbuf *sb, const char *file, int line) { struct mbuf *m = sb->sb_mb; SOCKBUF_LOCK_ASSERT(sb); while (m && m->m_nextpkt) m = m->m_nextpkt; if (m != sb->sb_lastrecord) { printf("%s: sb_mb %p sb_lastrecord %p last %p\n", __func__, sb->sb_mb, sb->sb_lastrecord, m); printf("packet chain:\n"); for (m = sb->sb_mb; m != NULL; m = m->m_nextpkt) printf("\t%p\n", m); panic("%s from %s:%u", __func__, file, line); } } void sblastmbufchk(struct sockbuf *sb, const char *file, int line) { struct mbuf *m = sb->sb_mb; struct mbuf *n; SOCKBUF_LOCK_ASSERT(sb); while (m && m->m_nextpkt) m = m->m_nextpkt; while (m && m->m_next) m = m->m_next; if (m != sb->sb_mbtail) { printf("%s: sb_mb %p sb_mbtail %p last %p\n", __func__, sb->sb_mb, sb->sb_mbtail, m); printf("packet tree:\n"); for (m = sb->sb_mb; m != NULL; m = m->m_nextpkt) { printf("\t"); for (n = m; n != NULL; n = n->m_next) printf("%p ", n); printf("\n"); } panic("%s from %s:%u", __func__, file, line); } #ifdef KERN_TLS m = sb->sb_mtls; while (m && m->m_next) m = m->m_next; if (m != sb->sb_mtlstail) { printf("%s: sb_mtls %p sb_mtlstail %p last %p\n", __func__, sb->sb_mtls, sb->sb_mtlstail, m); printf("TLS packet tree:\n"); printf("\t"); for (m = sb->sb_mtls; m != NULL; m = m->m_next) { printf("%p ", m); } printf("\n"); panic("%s from %s:%u", __func__, file, line); } #endif } #endif /* SOCKBUF_DEBUG */ #define SBLINKRECORD(sb, m0) do { \ SOCKBUF_LOCK_ASSERT(sb); \ if ((sb)->sb_lastrecord != NULL) \ (sb)->sb_lastrecord->m_nextpkt = (m0); \ else \ (sb)->sb_mb = (m0); \ (sb)->sb_lastrecord = (m0); \ } while (/*CONSTCOND*/0) /* * Append mbuf chain m to the last record in the socket buffer sb. The * additional space associated the mbuf chain is recorded in sb. Empty mbufs * are discarded and mbufs are compacted where possible. */ void sbappend_locked(struct sockbuf *sb, struct mbuf *m, int flags) { struct mbuf *n; SOCKBUF_LOCK_ASSERT(sb); if (m == NULL) return; kmsan_check_mbuf(m, "sbappend"); sbm_clrprotoflags(m, flags); SBLASTRECORDCHK(sb); n = sb->sb_mb; if (n) { while (n->m_nextpkt) n = n->m_nextpkt; do { if (n->m_flags & M_EOR) { sbappendrecord_locked(sb, m); /* XXXXXX!!!! */ return; } } while (n->m_next && (n = n->m_next)); } else { /* * XXX Would like to simply use sb_mbtail here, but * XXX I need to verify that I won't miss an EOR that * XXX way. */ if ((n = sb->sb_lastrecord) != NULL) { do { if (n->m_flags & M_EOR) { sbappendrecord_locked(sb, m); /* XXXXXX!!!! */ return; } } while (n->m_next && (n = n->m_next)); } else { /* * If this is the first record in the socket buffer, * it's also the last record. */ sb->sb_lastrecord = m; } } sbcompress(sb, m, n); SBLASTRECORDCHK(sb); } /* * Append mbuf chain m to the last record in the socket buffer sb. The * additional space associated the mbuf chain is recorded in sb. Empty mbufs * are discarded and mbufs are compacted where possible. */ void sbappend(struct sockbuf *sb, struct mbuf *m, int flags) { SOCKBUF_LOCK(sb); sbappend_locked(sb, m, flags); SOCKBUF_UNLOCK(sb); } #ifdef KERN_TLS /* * Append an mbuf containing encrypted TLS data. The data * is marked M_NOTREADY until it has been decrypted and * stored as a TLS record. */ static void sbappend_ktls_rx(struct sockbuf *sb, struct mbuf *m) { struct ifnet *ifp; struct mbuf *n; int flags; ifp = NULL; flags = M_NOTREADY; SBLASTMBUFCHK(sb); /* Mbuf chain must start with a packet header. */ MPASS((m->m_flags & M_PKTHDR) != 0); /* Remove all packet headers and mbuf tags to get a pure data chain. */ for (n = m; n != NULL; n = n->m_next) { if (n->m_flags & M_PKTHDR) { ifp = m->m_pkthdr.leaf_rcvif; if ((n->m_pkthdr.csum_flags & CSUM_TLS_MASK) == CSUM_TLS_DECRYPTED) { /* Mark all mbufs in this packet decrypted. */ flags = M_NOTREADY | M_DECRYPTED; } else { flags = M_NOTREADY; } m_demote_pkthdr(n); } n->m_flags &= M_DEMOTEFLAGS; n->m_flags |= flags; MPASS((n->m_flags & M_NOTREADY) != 0); } sbcompress_ktls_rx(sb, m, sb->sb_mtlstail); ktls_check_rx(sb); /* Check for incoming packet route changes: */ if (ifp != NULL && sb->sb_tls_info->rx_ifp != NULL && sb->sb_tls_info->rx_ifp != ifp) ktls_input_ifp_mismatch(sb, ifp); } #endif /* * This version of sbappend() should only be used when the caller absolutely * knows that there will never be more than one record in the socket buffer, * that is, a stream protocol (such as TCP). */ void sbappendstream_locked(struct sockbuf *sb, struct mbuf *m, int flags) { SOCKBUF_LOCK_ASSERT(sb); KASSERT(m->m_nextpkt == NULL,("sbappendstream 0")); kmsan_check_mbuf(m, "sbappend"); #ifdef KERN_TLS /* * Decrypted TLS records are appended as records via * sbappendrecord(). TCP passes encrypted TLS records to this * function which must be scheduled for decryption. */ if (sb->sb_flags & SB_TLS_RX) { sbappend_ktls_rx(sb, m); return; } #endif KASSERT(sb->sb_mb == sb->sb_lastrecord,("sbappendstream 1")); SBLASTMBUFCHK(sb); #ifdef KERN_TLS if (sb->sb_tls_info != NULL) ktls_seq(sb, m); #endif /* Remove all packet headers and mbuf tags to get a pure data chain. */ m_demote(m, 1, flags & PRUS_NOTREADY ? M_NOTREADY : 0); sbcompress(sb, m, sb->sb_mbtail); sb->sb_lastrecord = sb->sb_mb; SBLASTRECORDCHK(sb); } /* * This version of sbappend() should only be used when the caller absolutely * knows that there will never be more than one record in the socket buffer, * that is, a stream protocol (such as TCP). */ void sbappendstream(struct sockbuf *sb, struct mbuf *m, int flags) { SOCKBUF_LOCK(sb); sbappendstream_locked(sb, m, flags); SOCKBUF_UNLOCK(sb); } #ifdef SOCKBUF_DEBUG void sbcheck(struct sockbuf *sb, const char *file, int line) { struct mbuf *m, *n, *fnrdy; u_long acc, ccc, mbcnt; #ifdef KERN_TLS u_long tlscc; #endif SOCKBUF_LOCK_ASSERT(sb); acc = ccc = mbcnt = 0; fnrdy = NULL; for (m = sb->sb_mb; m; m = n) { n = m->m_nextpkt; for (; m; m = m->m_next) { if (m->m_len == 0) { printf("sb %p empty mbuf %p\n", sb, m); goto fail; } if ((m->m_flags & M_NOTREADY) && fnrdy == NULL) { if (m != sb->sb_fnrdy) { printf("sb %p: fnrdy %p != m %p\n", sb, sb->sb_fnrdy, m); goto fail; } fnrdy = m; } if (fnrdy) { if (!(m->m_flags & M_NOTAVAIL)) { printf("sb %p: fnrdy %p, m %p is avail\n", sb, sb->sb_fnrdy, m); goto fail; } } else acc += m->m_len; ccc += m->m_len; mbcnt += MSIZE; if (m->m_flags & M_EXT) /*XXX*/ /* pretty sure this is bogus */ mbcnt += m->m_ext.ext_size; } } #ifdef KERN_TLS /* * Account for mbufs "detached" by ktls_detach_record() while * they are decrypted by ktls_decrypt(). tlsdcc gives a count * of the detached bytes that are included in ccc. The mbufs * and clusters are not included in the socket buffer * accounting. */ ccc += sb->sb_tlsdcc; tlscc = 0; for (m = sb->sb_mtls; m; m = m->m_next) { if (m->m_nextpkt != NULL) { printf("sb %p TLS mbuf %p with nextpkt\n", sb, m); goto fail; } if ((m->m_flags & M_NOTREADY) == 0) { printf("sb %p TLS mbuf %p ready\n", sb, m); goto fail; } tlscc += m->m_len; ccc += m->m_len; mbcnt += MSIZE; if (m->m_flags & M_EXT) /*XXX*/ /* pretty sure this is bogus */ mbcnt += m->m_ext.ext_size; } if (sb->sb_tlscc != tlscc) { printf("tlscc %ld/%u dcc %u\n", tlscc, sb->sb_tlscc, sb->sb_tlsdcc); goto fail; } #endif if (acc != sb->sb_acc || ccc != sb->sb_ccc || mbcnt != sb->sb_mbcnt) { printf("acc %ld/%u ccc %ld/%u mbcnt %ld/%u\n", acc, sb->sb_acc, ccc, sb->sb_ccc, mbcnt, sb->sb_mbcnt); #ifdef KERN_TLS printf("tlscc %ld/%u dcc %u\n", tlscc, sb->sb_tlscc, sb->sb_tlsdcc); #endif goto fail; } return; fail: panic("%s from %s:%u", __func__, file, line); } #endif /* * As above, except the mbuf chain begins a new record. */ void sbappendrecord_locked(struct sockbuf *sb, struct mbuf *m0) { struct mbuf *m; SOCKBUF_LOCK_ASSERT(sb); if (m0 == NULL) return; kmsan_check_mbuf(m0, "sbappend"); m_clrprotoflags(m0); /* * Put the first mbuf on the queue. Note this permits zero length * records. */ sballoc(sb, m0); SBLASTRECORDCHK(sb); SBLINKRECORD(sb, m0); sb->sb_mbtail = m0; m = m0->m_next; m0->m_next = 0; if (m && (m0->m_flags & M_EOR)) { m0->m_flags &= ~M_EOR; m->m_flags |= M_EOR; } /* always call sbcompress() so it can do SBLASTMBUFCHK() */ sbcompress(sb, m, m0); } /* * As above, except the mbuf chain begins a new record. */ void sbappendrecord(struct sockbuf *sb, struct mbuf *m0) { SOCKBUF_LOCK(sb); sbappendrecord_locked(sb, m0); SOCKBUF_UNLOCK(sb); } /* Helper routine that appends data, control, and address to a sockbuf. */ static int sbappendaddr_locked_internal(struct sockbuf *sb, const struct sockaddr *asa, struct mbuf *m0, struct mbuf *control, struct mbuf *ctrl_last) { struct mbuf *m, *n, *nlast; if (m0 != NULL) kmsan_check_mbuf(m0, "sbappend"); if (control != NULL) kmsan_check_mbuf(control, "sbappend"); #if MSIZE <= 256 if (asa->sa_len > MLEN) return (0); #endif m = m_get(M_NOWAIT, MT_SONAME); if (m == NULL) return (0); m->m_len = asa->sa_len; bcopy(asa, mtod(m, caddr_t), asa->sa_len); if (m0) { M_ASSERT_NO_SND_TAG(m0); m_clrprotoflags(m0); m_tag_delete_chain(m0, NULL); /* * Clear some persistent info from pkthdr. * We don't use m_demote(), because some netgraph consumers * expect M_PKTHDR presence. */ m0->m_pkthdr.rcvif = NULL; m0->m_pkthdr.flowid = 0; m0->m_pkthdr.csum_flags = 0; m0->m_pkthdr.fibnum = 0; m0->m_pkthdr.rsstype = 0; } if (ctrl_last) ctrl_last->m_next = m0; /* concatenate data to control */ else control = m0; m->m_next = control; for (n = m; n->m_next != NULL; n = n->m_next) sballoc(sb, n); sballoc(sb, n); nlast = n; SBLINKRECORD(sb, m); sb->sb_mbtail = nlast; SBLASTMBUFCHK(sb); SBLASTRECORDCHK(sb); return (1); } /* * Append address and data, and optionally, control (ancillary) data to the * receive queue of a socket. If present, m0 must include a packet header * with total length. Returns 0 if no space in sockbuf or insufficient * mbufs. */ int sbappendaddr_locked(struct sockbuf *sb, const struct sockaddr *asa, struct mbuf *m0, struct mbuf *control) { struct mbuf *ctrl_last; int space = asa->sa_len; SOCKBUF_LOCK_ASSERT(sb); if (m0 && (m0->m_flags & M_PKTHDR) == 0) panic("sbappendaddr_locked"); if (m0) space += m0->m_pkthdr.len; space += m_length(control, &ctrl_last); if (space > sbspace(sb)) return (0); return (sbappendaddr_locked_internal(sb, asa, m0, control, ctrl_last)); } /* * Append address and data, and optionally, control (ancillary) data to the * receive queue of a socket. If present, m0 must include a packet header * with total length. Returns 0 if insufficient mbufs. Does not validate space * on the receiving sockbuf. */ int sbappendaddr_nospacecheck_locked(struct sockbuf *sb, const struct sockaddr *asa, struct mbuf *m0, struct mbuf *control) { struct mbuf *ctrl_last; SOCKBUF_LOCK_ASSERT(sb); ctrl_last = (control == NULL) ? NULL : m_last(control); return (sbappendaddr_locked_internal(sb, asa, m0, control, ctrl_last)); } /* * Append address and data, and optionally, control (ancillary) data to the * receive queue of a socket. If present, m0 must include a packet header * with total length. Returns 0 if no space in sockbuf or insufficient * mbufs. */ int sbappendaddr(struct sockbuf *sb, const struct sockaddr *asa, struct mbuf *m0, struct mbuf *control) { int retval; SOCKBUF_LOCK(sb); retval = sbappendaddr_locked(sb, asa, m0, control); SOCKBUF_UNLOCK(sb); return (retval); } void sbappendcontrol_locked(struct sockbuf *sb, struct mbuf *m0, struct mbuf *control, int flags) { struct mbuf *m, *mlast; if (m0 != NULL) kmsan_check_mbuf(m0, "sbappend"); kmsan_check_mbuf(control, "sbappend"); sbm_clrprotoflags(m0, flags); m_last(control)->m_next = m0; SBLASTRECORDCHK(sb); for (m = control; m->m_next; m = m->m_next) sballoc(sb, m); sballoc(sb, m); mlast = m; SBLINKRECORD(sb, control); sb->sb_mbtail = mlast; SBLASTMBUFCHK(sb); SBLASTRECORDCHK(sb); } void sbappendcontrol(struct sockbuf *sb, struct mbuf *m0, struct mbuf *control, int flags) { SOCKBUF_LOCK(sb); sbappendcontrol_locked(sb, m0, control, flags); SOCKBUF_UNLOCK(sb); } /* * Append the data in mbuf chain (m) into the socket buffer sb following mbuf * (n). If (n) is NULL, the buffer is presumed empty. * * When the data is compressed, mbufs in the chain may be handled in one of * three ways: * * (1) The mbuf may simply be dropped, if it contributes nothing (no data, no * record boundary, and no change in data type). * * (2) The mbuf may be coalesced -- i.e., data in the mbuf may be copied into * an mbuf already in the socket buffer. This can occur if an * appropriate mbuf exists, there is room, both mbufs are not marked as * not ready, and no merging of data types will occur. * * (3) The mbuf may be appended to the end of the existing mbuf chain. * * If any of the new mbufs is marked as M_EOR, mark the last mbuf appended as * end-of-record. */ void sbcompress(struct sockbuf *sb, struct mbuf *m, struct mbuf *n) { int eor = 0; struct mbuf *o; SOCKBUF_LOCK_ASSERT(sb); while (m) { eor |= m->m_flags & M_EOR; if (m->m_len == 0 && (eor == 0 || (((o = m->m_next) || (o = n)) && o->m_type == m->m_type))) { if (sb->sb_lastrecord == m) sb->sb_lastrecord = m->m_next; m = m_free(m); continue; } if (n && (n->m_flags & M_EOR) == 0 && M_WRITABLE(n) && ((sb->sb_flags & SB_NOCOALESCE) == 0) && !(m->m_flags & M_NOTREADY) && !(n->m_flags & (M_NOTREADY | M_EXTPG)) && !mbuf_has_tls_session(m) && !mbuf_has_tls_session(n) && m->m_len <= MCLBYTES / 4 && /* XXX: Don't copy too much */ m->m_len <= M_TRAILINGSPACE(n) && n->m_type == m->m_type) { m_copydata(m, 0, m->m_len, mtodo(n, n->m_len)); n->m_len += m->m_len; sb->sb_ccc += m->m_len; if (sb->sb_fnrdy == NULL) sb->sb_acc += m->m_len; if (m->m_type != MT_DATA && m->m_type != MT_OOBDATA) /* XXX: Probably don't need.*/ sb->sb_ctl += m->m_len; m = m_free(m); continue; } if (m->m_len <= MLEN && (m->m_flags & M_EXTPG) && (m->m_flags & M_NOTREADY) == 0 && !mbuf_has_tls_session(m)) (void)mb_unmapped_compress(m); if (n) n->m_next = m; else sb->sb_mb = m; sb->sb_mbtail = m; sballoc(sb, m); n = m; m->m_flags &= ~M_EOR; m = m->m_next; n->m_next = 0; } if (eor) { KASSERT(n != NULL, ("sbcompress: eor && n == NULL")); n->m_flags |= eor; } SBLASTMBUFCHK(sb); } #ifdef KERN_TLS /* * A version of sbcompress() for encrypted TLS RX mbufs. These mbufs * are appended to the 'sb_mtls' chain instead of 'sb_mb' and are also * a bit simpler (no EOR markers, always MT_DATA, etc.). */ static void sbcompress_ktls_rx(struct sockbuf *sb, struct mbuf *m, struct mbuf *n) { SOCKBUF_LOCK_ASSERT(sb); while (m) { KASSERT((m->m_flags & M_EOR) == 0, ("TLS RX mbuf %p with EOR", m)); KASSERT(m->m_type == MT_DATA, ("TLS RX mbuf %p is not MT_DATA", m)); KASSERT((m->m_flags & M_NOTREADY) != 0, ("TLS RX mbuf %p ready", m)); KASSERT((m->m_flags & M_EXTPG) == 0, ("TLS RX mbuf %p unmapped", m)); if (m->m_len == 0) { m = m_free(m); continue; } /* * Even though both 'n' and 'm' are NOTREADY, it's ok * to coalesce the data. */ if (n && M_WRITABLE(n) && ((sb->sb_flags & SB_NOCOALESCE) == 0) && !((m->m_flags ^ n->m_flags) & M_DECRYPTED) && !(n->m_flags & M_EXTPG) && m->m_len <= MCLBYTES / 4 && /* XXX: Don't copy too much */ m->m_len <= M_TRAILINGSPACE(n)) { m_copydata(m, 0, m->m_len, mtodo(n, n->m_len)); n->m_len += m->m_len; sb->sb_ccc += m->m_len; sb->sb_tlscc += m->m_len; m = m_free(m); continue; } if (n) n->m_next = m; else sb->sb_mtls = m; sb->sb_mtlstail = m; sballoc_ktls_rx(sb, m); n = m; m = m->m_next; n->m_next = NULL; } SBLASTMBUFCHK(sb); } #endif /* * Free all mbufs in a sockbuf. Check that all resources are reclaimed. */ static void sbflush_internal(struct sockbuf *sb) { while (sb->sb_mbcnt || sb->sb_tlsdcc) { /* * Don't call sbcut(sb, 0) if the leading mbuf is non-empty: * we would loop forever. Panic instead. */ if (sb->sb_ccc == 0 && (sb->sb_mb == NULL || sb->sb_mb->m_len)) break; m_freem(sbcut_internal(sb, (int)sb->sb_ccc)); } KASSERT(sb->sb_ccc == 0 && sb->sb_mb == 0 && sb->sb_mbcnt == 0, ("%s: ccc %u mb %p mbcnt %u", __func__, sb->sb_ccc, (void *)sb->sb_mb, sb->sb_mbcnt)); } void sbflush_locked(struct sockbuf *sb) { SOCKBUF_LOCK_ASSERT(sb); sbflush_internal(sb); } void sbflush(struct sockbuf *sb) { SOCKBUF_LOCK(sb); sbflush_locked(sb); SOCKBUF_UNLOCK(sb); } /* * Cut data from (the front of) a sockbuf. */ static struct mbuf * sbcut_internal(struct sockbuf *sb, int len) { struct mbuf *m, *next, *mfree; bool is_tls; KASSERT(len >= 0, ("%s: len is %d but it is supposed to be >= 0", __func__, len)); KASSERT(len <= sb->sb_ccc, ("%s: len: %d is > ccc: %u", __func__, len, sb->sb_ccc)); next = (m = sb->sb_mb) ? m->m_nextpkt : 0; is_tls = false; mfree = NULL; while (len > 0) { if (m == NULL) { #ifdef KERN_TLS if (next == NULL && !is_tls) { if (sb->sb_tlsdcc != 0) { MPASS(len >= sb->sb_tlsdcc); len -= sb->sb_tlsdcc; sb->sb_ccc -= sb->sb_tlsdcc; sb->sb_tlsdcc = 0; if (len == 0) break; } next = sb->sb_mtls; is_tls = true; } #endif KASSERT(next, ("%s: no next, len %d", __func__, len)); m = next; next = m->m_nextpkt; } if (m->m_len > len) { KASSERT(!(m->m_flags & M_NOTAVAIL), ("%s: m %p M_NOTAVAIL", __func__, m)); m->m_len -= len; m->m_data += len; sb->sb_ccc -= len; sb->sb_acc -= len; if (sb->sb_sndptroff != 0) sb->sb_sndptroff -= len; if (m->m_type != MT_DATA && m->m_type != MT_OOBDATA) sb->sb_ctl -= len; break; } len -= m->m_len; #ifdef KERN_TLS if (is_tls) sbfree_ktls_rx(sb, m); else #endif sbfree(sb, m); /* * Do not put M_NOTREADY buffers to the free list, they * are referenced from outside. */ if (m->m_flags & M_NOTREADY && !is_tls) m = m->m_next; else { struct mbuf *n; n = m->m_next; m->m_next = mfree; mfree = m; m = n; } } /* * Free any zero-length mbufs from the buffer. * For SOCK_DGRAM sockets such mbufs represent empty records. * XXX: For SOCK_STREAM sockets such mbufs can appear in the buffer, * when sosend_generic() needs to send only control data. */ while (m && m->m_len == 0) { struct mbuf *n; sbfree(sb, m); n = m->m_next; m->m_next = mfree; mfree = m; m = n; } #ifdef KERN_TLS if (is_tls) { sb->sb_mb = NULL; sb->sb_mtls = m; if (m == NULL) sb->sb_mtlstail = NULL; } else #endif if (m) { sb->sb_mb = m; m->m_nextpkt = next; } else sb->sb_mb = next; /* * First part is an inline SB_EMPTY_FIXUP(). Second part makes sure * sb_lastrecord is up-to-date if we dropped part of the last record. */ m = sb->sb_mb; if (m == NULL) { sb->sb_mbtail = NULL; sb->sb_lastrecord = NULL; } else if (m->m_nextpkt == NULL) { sb->sb_lastrecord = m; } return (mfree); } /* * Drop data from (the front of) a sockbuf. */ void sbdrop_locked(struct sockbuf *sb, int len) { SOCKBUF_LOCK_ASSERT(sb); m_freem(sbcut_internal(sb, len)); } /* * Drop data from (the front of) a sockbuf, * and return it to caller. */ struct mbuf * sbcut_locked(struct sockbuf *sb, int len) { SOCKBUF_LOCK_ASSERT(sb); return (sbcut_internal(sb, len)); } void sbdrop(struct sockbuf *sb, int len) { struct mbuf *mfree; SOCKBUF_LOCK(sb); mfree = sbcut_internal(sb, len); SOCKBUF_UNLOCK(sb); m_freem(mfree); } struct mbuf * sbsndptr_noadv(struct sockbuf *sb, uint32_t off, uint32_t *moff) { struct mbuf *m; KASSERT(sb->sb_mb != NULL, ("%s: sb_mb is NULL", __func__)); if (sb->sb_sndptr == NULL || sb->sb_sndptroff > off) { *moff = off; if (sb->sb_sndptr == NULL) { sb->sb_sndptr = sb->sb_mb; sb->sb_sndptroff = 0; } return (sb->sb_mb); } else { m = sb->sb_sndptr; off -= sb->sb_sndptroff; } *moff = off; return (m); } void sbsndptr_adv(struct sockbuf *sb, struct mbuf *mb, uint32_t len) { /* * A small copy was done, advance forward the sb_sbsndptr to cover * it. */ struct mbuf *m; if (mb != sb->sb_sndptr) { /* Did not copyout at the same mbuf */ return; } m = mb; while (m && (len > 0)) { if (len >= m->m_len) { len -= m->m_len; if (m->m_next) { sb->sb_sndptroff += m->m_len; sb->sb_sndptr = m->m_next; } m = m->m_next; } else { len = 0; } } } /* * Return the first mbuf and the mbuf data offset for the provided * send offset without changing the "sb_sndptroff" field. */ struct mbuf * sbsndmbuf(struct sockbuf *sb, u_int off, u_int *moff) { struct mbuf *m; KASSERT(sb->sb_mb != NULL, ("%s: sb_mb is NULL", __func__)); /* * If the "off" is below the stored offset, which happens on * retransmits, just use "sb_mb": */ if (sb->sb_sndptr == NULL || sb->sb_sndptroff > off) { m = sb->sb_mb; } else { m = sb->sb_sndptr; off -= sb->sb_sndptroff; } while (off > 0 && m != NULL) { if (off < m->m_len) break; off -= m->m_len; m = m->m_next; } *moff = off; return (m); } /* * Drop a record off the front of a sockbuf and move the next record to the * front. */ void sbdroprecord_locked(struct sockbuf *sb) { struct mbuf *m; SOCKBUF_LOCK_ASSERT(sb); m = sb->sb_mb; if (m) { sb->sb_mb = m->m_nextpkt; do { sbfree(sb, m); m = m_free(m); } while (m); } SB_EMPTY_FIXUP(sb); } /* * Drop a record off the front of a sockbuf and move the next record to the * front. */ void sbdroprecord(struct sockbuf *sb) { SOCKBUF_LOCK(sb); sbdroprecord_locked(sb); SOCKBUF_UNLOCK(sb); } /* * Create a "control" mbuf containing the specified data with the specified * type for presentation on a socket buffer. */ struct mbuf * sbcreatecontrol(const void *p, u_int size, int type, int level, int wait) { struct cmsghdr *cp; struct mbuf *m; MBUF_CHECKSLEEP(wait); if (wait == M_NOWAIT) { if (CMSG_SPACE(size) > MCLBYTES) return (NULL); } else KASSERT(CMSG_SPACE(size) <= MCLBYTES, ("%s: passed CMSG_SPACE(%u) > MCLBYTES", __func__, size)); if (CMSG_SPACE(size) > MLEN) m = m_getcl(wait, MT_CONTROL, 0); else m = m_get(wait, MT_CONTROL); if (m == NULL) return (NULL); KASSERT(CMSG_SPACE(size) <= M_TRAILINGSPACE(m), ("sbcreatecontrol: short mbuf")); /* * Don't leave the padding between the msg header and the * cmsg data and the padding after the cmsg data un-initialized. */ cp = mtod(m, struct cmsghdr *); bzero(cp, CMSG_SPACE(size)); if (p != NULL) (void)memcpy(CMSG_DATA(cp), p, size); m->m_len = CMSG_SPACE(size); cp->cmsg_len = CMSG_LEN(size); cp->cmsg_level = level; cp->cmsg_type = type; return (m); } /* * This does the same for socket buffers that sotoxsocket does for sockets: * generate an user-format data structure describing the socket buffer. Note * that the xsockbuf structure, since it is always embedded in a socket, does * not include a self pointer nor a length. We make this entry point public * in case some other mechanism needs it. */ void sbtoxsockbuf(struct sockbuf *sb, struct xsockbuf *xsb) { xsb->sb_cc = sb->sb_ccc; xsb->sb_hiwat = sb->sb_hiwat; xsb->sb_mbcnt = sb->sb_mbcnt; xsb->sb_mbmax = sb->sb_mbmax; xsb->sb_lowat = sb->sb_lowat; xsb->sb_flags = sb->sb_flags; xsb->sb_timeo = sb->sb_timeo; } /* This takes the place of kern.maxsockbuf, which moved to kern.ipc. */ static int dummy; SYSCTL_INT(_kern, KERN_DUMMY, dummy, CTLFLAG_RW | CTLFLAG_SKIP, &dummy, 0, ""); SYSCTL_OID(_kern_ipc, KIPC_MAXSOCKBUF, maxsockbuf, CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, &sb_max, 0, sysctl_handle_sb_max, "LU", "Maximum socket buffer size"); SYSCTL_ULONG(_kern_ipc, KIPC_SOCKBUF_WASTE, sockbuf_waste_factor, CTLFLAG_RW, &sb_efficiency, 0, "Socket buffer size waste factor"); diff --git a/sys/kern/uipc_socket.c b/sys/kern/uipc_socket.c index 42c43539b484..071530925892 100644 --- a/sys/kern/uipc_socket.c +++ b/sys/kern/uipc_socket.c @@ -1,4363 +1,5057 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1988, 1990, 1993 * The Regents of the University of California. * Copyright (c) 2004 The FreeBSD Foundation * Copyright (c) 2004-2008 Robert N. M. Watson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Comments on the socket life cycle: * * soalloc() sets of socket layer state for a socket, called only by * socreate() and sonewconn(). Socket layer private. * * sodealloc() tears down socket layer state for a socket, called only by * sofree() and sonewconn(). Socket layer private. * * pru_attach() associates protocol layer state with an allocated socket; * called only once, may fail, aborting socket allocation. This is called * from socreate() and sonewconn(). Socket layer private. * * pru_detach() disassociates protocol layer state from an attached socket, * and will be called exactly once for sockets in which pru_attach() has * been successfully called. If pru_attach() returned an error, * pru_detach() will not be called. Socket layer private. * * pru_abort() and pru_close() notify the protocol layer that the last * consumer of a socket is starting to tear down the socket, and that the * protocol should terminate the connection. Historically, pru_abort() also * detached protocol state from the socket state, but this is no longer the * case. * * socreate() creates a socket and attaches protocol state. This is a public * interface that may be used by socket layer consumers to create new * sockets. * * sonewconn() creates a socket and attaches protocol state. This is a * public interface that may be used by protocols to create new sockets when * a new connection is received and will be available for accept() on a * listen socket. * * soclose() destroys a socket after possibly waiting for it to disconnect. * This is a public interface that socket consumers should use to close and * release a socket when done with it. * * soabort() destroys a socket without waiting for it to disconnect (used * only for incoming connections that are already partially or fully * connected). This is used internally by the socket layer when clearing * listen socket queues (due to overflow or close on the listen socket), but * is also a public interface protocols may use to abort connections in * their incomplete listen queues should they no longer be required. Sockets * placed in completed connection listen queues should not be aborted for * reasons described in the comment above the soclose() implementation. This * is not a general purpose close routine, and except in the specific * circumstances described here, should not be used. * * sofree() will free a socket and its protocol state if all references on * the socket have been released, and is the public interface to attempt to * free a socket when a reference is removed. This is a socket layer private * interface. * * NOTE: In addition to socreate() and soclose(), which provide a single * socket reference to the consumer to be managed as required, there are two * calls to explicitly manage socket references, soref(), and sorele(). * Currently, these are generally required only when transitioning a socket * from a listen queue to a file descriptor, in order to prevent garbage * collection of the socket at an untimely moment. For a number of reasons, * these interfaces are not preferred, and should be avoided. * * NOTE: With regard to VNETs the general rule is that callers do not set * curvnet. Exceptions to this rule include soabort(), sodisconnect(), * sofree(), sorele(), sonewconn() and sorflush(), which are usually called * from a pre-set VNET context. sopoll() currently does not need a VNET * context to be set. */ #include #include "opt_inet.h" #include "opt_inet6.h" #include "opt_kern_tls.h" #include "opt_ktrace.h" #include "opt_sctp.h" #include #include #include #include #include #include #include #include #include #include #include #include /* for struct knote */ #include #include #include +#include #include #include #include #include #include #include #include #include #include #include #include +#include #include +#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef COMPAT_FREEBSD32 #include #include #include #endif +static int soreceive_generic_locked(struct socket *so, + struct sockaddr **psa, struct uio *uio, struct mbuf **mp, + struct mbuf **controlp, int *flagsp); static int soreceive_rcvoob(struct socket *so, struct uio *uio, int flags); +static int soreceive_stream_locked(struct socket *so, struct sockbuf *sb, + struct sockaddr **psa, struct uio *uio, struct mbuf **mp, + struct mbuf **controlp, int flags); +static int sosend_generic_locked(struct socket *so, struct sockaddr *addr, + struct uio *uio, struct mbuf *top, struct mbuf *control, + int flags, struct thread *td); static void so_rdknl_lock(void *); static void so_rdknl_unlock(void *); static void so_rdknl_assert_lock(void *, int); static void so_wrknl_lock(void *); static void so_wrknl_unlock(void *); static void so_wrknl_assert_lock(void *, int); static void filt_sordetach(struct knote *kn); static int filt_soread(struct knote *kn, long hint); static void filt_sowdetach(struct knote *kn); static int filt_sowrite(struct knote *kn, long hint); static int filt_soempty(struct knote *kn, long hint); fo_kqfilter_t soo_kqfilter; static struct filterops soread_filtops = { .f_isfd = 1, .f_detach = filt_sordetach, .f_event = filt_soread, }; static struct filterops sowrite_filtops = { .f_isfd = 1, .f_detach = filt_sowdetach, .f_event = filt_sowrite, }; static struct filterops soempty_filtops = { .f_isfd = 1, .f_detach = filt_sowdetach, .f_event = filt_soempty, }; so_gen_t so_gencnt; /* generation count for sockets */ MALLOC_DEFINE(M_SONAME, "soname", "socket name"); MALLOC_DEFINE(M_PCB, "pcb", "protocol control block"); #define VNET_SO_ASSERT(so) \ VNET_ASSERT(curvnet != NULL, \ ("%s:%d curvnet is NULL, so=%p", __func__, __LINE__, (so))); #ifdef SOCKET_HHOOK VNET_DEFINE(struct hhook_head *, socket_hhh[HHOOK_SOCKET_LAST + 1]); #define V_socket_hhh VNET(socket_hhh) static inline int hhook_run_socket(struct socket *, void *, int32_t); #endif +#ifdef COMPAT_FREEBSD32 +#ifdef __amd64__ +/* off_t has 4-byte alignment on i386 but not on other 32-bit platforms. */ +#define __splice32_packed __packed +#else +#define __splice32_packed +#endif +struct splice32 { + int32_t sp_fd; + int64_t sp_max; + struct timeval32 sp_idle; +} __splice32_packed; +#undef __splice32_packed +#endif + /* * Limit on the number of connections in the listen queue waiting * for accept(2). * NB: The original sysctl somaxconn is still available but hidden * to prevent confusion about the actual purpose of this number. */ static u_int somaxconn = SOMAXCONN; static int sysctl_somaxconn(SYSCTL_HANDLER_ARGS) { int error; int val; val = somaxconn; error = sysctl_handle_int(oidp, &val, 0, req); if (error || !req->newptr ) return (error); /* * The purpose of the UINT_MAX / 3 limit, is so that the formula * 3 * so_qlimit / 2 * below, will not overflow. */ if (val < 1 || val > UINT_MAX / 3) return (EINVAL); somaxconn = val; return (0); } SYSCTL_PROC(_kern_ipc, OID_AUTO, soacceptqueue, CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, sizeof(int), sysctl_somaxconn, "I", "Maximum listen socket pending connection accept queue size"); SYSCTL_PROC(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_SKIP | CTLFLAG_MPSAFE, 0, sizeof(int), sysctl_somaxconn, "I", "Maximum listen socket pending connection accept queue size (compat)"); static int numopensockets; SYSCTL_INT(_kern_ipc, OID_AUTO, numopensockets, CTLFLAG_RD, &numopensockets, 0, "Number of open sockets"); /* * so_global_mtx protects so_gencnt, numopensockets, and the per-socket * so_gencnt field. */ static struct mtx so_global_mtx; MTX_SYSINIT(so_global_mtx, &so_global_mtx, "so_glabel", MTX_DEF); /* * General IPC sysctl name space, used by sockets and a variety of other IPC * types. */ SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "IPC"); /* * Initialize the socket subsystem and set up the socket * memory allocator. */ static uma_zone_t socket_zone; int maxsockets; static void socket_zone_change(void *tag) { maxsockets = uma_zone_set_max(socket_zone, maxsockets); } +static int splice_init_state; +static struct sx splice_init_lock; +SX_SYSINIT(splice_init_lock, &splice_init_lock, "splice_init"); + +static SYSCTL_NODE(_kern_ipc, OID_AUTO, splice, CTLFLAG_RW, 0, + "Settings relating to the SO_SPLICE socket option"); + +static bool splice_receive_stream = true; +SYSCTL_BOOL(_kern_ipc_splice, OID_AUTO, receive_stream, CTLFLAG_RWTUN, + &splice_receive_stream, 0, + "Use soreceive_stream() for stream splices"); + +static uma_zone_t splice_zone; +static struct proc *splice_proc; +struct splice_wq { + struct mtx mtx; + STAILQ_HEAD(, so_splice) head; + bool running; +} __aligned(CACHE_LINE_SIZE); +static struct splice_wq *splice_wq; +static uint32_t splice_index = 0; + +static void so_splice_timeout(void *arg, int pending); +static void so_splice_xfer(struct so_splice *s); +static int so_unsplice(struct socket *so, bool timeout); + +static void +splice_work_thread(void *ctx) +{ + struct splice_wq *wq = ctx; + struct so_splice *s, *s_temp; + STAILQ_HEAD(, so_splice) local_head; + int cpu; + + cpu = wq - splice_wq; + if (bootverbose) + printf("starting so_splice worker thread for CPU %d\n", cpu); + + for (;;) { + mtx_lock(&wq->mtx); + while (STAILQ_EMPTY(&wq->head)) { + wq->running = false; + mtx_sleep(wq, &wq->mtx, 0, "-", 0); + wq->running = true; + } + STAILQ_INIT(&local_head); + STAILQ_CONCAT(&local_head, &wq->head); + STAILQ_INIT(&wq->head); + mtx_unlock(&wq->mtx); + STAILQ_FOREACH_SAFE(s, &local_head, next, s_temp) { + mtx_lock(&s->mtx); + CURVNET_SET(s->src->so_vnet); + so_splice_xfer(s); + CURVNET_RESTORE(); + } + } +} + +static void +so_splice_dispatch_async(struct so_splice *sp) +{ + struct splice_wq *wq; + bool running; + + wq = &splice_wq[sp->wq_index]; + mtx_lock(&wq->mtx); + STAILQ_INSERT_TAIL(&wq->head, sp, next); + running = wq->running; + mtx_unlock(&wq->mtx); + if (!running) + wakeup(wq); +} + +void +so_splice_dispatch(struct so_splice *sp) +{ + mtx_assert(&sp->mtx, MA_OWNED); + + if (sp->state != SPLICE_IDLE) { + mtx_unlock(&sp->mtx); + } else { + sp->state = SPLICE_QUEUED; + mtx_unlock(&sp->mtx); + so_splice_dispatch_async(sp); + } +} + +static int +splice_zinit(void *mem, int size __unused, int flags __unused) +{ + struct so_splice *s; + + s = (struct so_splice *)mem; + mtx_init(&s->mtx, "so_splice", NULL, MTX_DEF); + return (0); +} + +static void +splice_zfini(void *mem, int size) +{ + struct so_splice *s; + + s = (struct so_splice *)mem; + mtx_destroy(&s->mtx); +} + +static int +splice_init(void) +{ + struct thread *td; + int error, i, state; + + state = atomic_load_acq_int(&splice_init_state); + if (__predict_true(state > 0)) + return (0); + if (state < 0) + return (ENXIO); + sx_xlock(&splice_init_lock); + if (splice_init_state != 0) { + sx_xunlock(&splice_init_lock); + return (0); + } + + splice_zone = uma_zcreate("splice", sizeof(struct so_splice), NULL, + NULL, splice_zinit, splice_zfini, UMA_ALIGN_CACHE, 0); + + splice_wq = mallocarray(mp_maxid + 1, sizeof(*splice_wq), M_TEMP, + M_WAITOK | M_ZERO); + + /* + * Initialize the workqueues to run the splice work. We create a + * work queue for each CPU. + */ + CPU_FOREACH(i) { + STAILQ_INIT(&splice_wq[i].head); + mtx_init(&splice_wq[i].mtx, "splice work queue", NULL, MTX_DEF); + } + + /* Start kthreads for each workqueue. */ + error = 0; + CPU_FOREACH(i) { + error = kproc_kthread_add(splice_work_thread, &splice_wq[i], + &splice_proc, &td, 0, 0, "so_splice", "thr_%d", i); + if (error) { + printf("Can't add so_splice thread %d error %d\n", + i, error); + break; + } + + /* + * It's possible to create loops with SO_SPLICE; ensure that + * worker threads aren't able to starve the system too easily. + */ + thread_lock(td); + sched_prio(td, PUSER); + thread_unlock(td); + } + + splice_init_state = error != 0 ? -1 : 1; + sx_xunlock(&splice_init_lock); + + return (error); +} + +/* + * Lock a pair of socket's I/O locks for splicing. Avoid blocking while holding + * one lock in order to avoid potential deadlocks in case there is some other + * code path which acquires more than one I/O lock at a time. + */ +static void +splice_lock_pair(struct socket *so_src, struct socket *so_dst) +{ + int error; + + for (;;) { + error = SOCK_IO_SEND_LOCK(so_dst, SBL_WAIT | SBL_NOINTR); + KASSERT(error == 0, + ("%s: failed to lock send I/O lock: %d", __func__, error)); + error = SOCK_IO_RECV_LOCK(so_src, 0); + KASSERT(error == 0 || error == EWOULDBLOCK, + ("%s: failed to lock recv I/O lock: %d", __func__, error)); + if (error == 0) + break; + SOCK_IO_SEND_UNLOCK(so_dst); + + error = SOCK_IO_RECV_LOCK(so_src, SBL_WAIT | SBL_NOINTR); + KASSERT(error == 0, + ("%s: failed to lock recv I/O lock: %d", __func__, error)); + error = SOCK_IO_SEND_LOCK(so_dst, 0); + KASSERT(error == 0 || error == EWOULDBLOCK, + ("%s: failed to lock send I/O lock: %d", __func__, error)); + if (error == 0) + break; + SOCK_IO_RECV_UNLOCK(so_src); + } +} + +static void +splice_unlock_pair(struct socket *so_src, struct socket *so_dst) +{ + SOCK_IO_RECV_UNLOCK(so_src); + SOCK_IO_SEND_UNLOCK(so_dst); +} + +/* + * Move data from the source to the sink. Assumes that both of the relevant + * socket I/O locks are held. + */ +static int +so_splice_xfer_data(struct socket *so_src, struct socket *so_dst, off_t max, + ssize_t *lenp) +{ + struct uio uio; + struct mbuf *m; + struct sockbuf *sb_src, *sb_dst; + ssize_t len; + long space; + int error, flags; + + SOCK_IO_RECV_ASSERT_LOCKED(so_src); + SOCK_IO_SEND_ASSERT_LOCKED(so_dst); + + error = 0; + m = NULL; + memset(&uio, 0, sizeof(uio)); + + sb_src = &so_src->so_rcv; + sb_dst = &so_dst->so_snd; + + space = sbspace(sb_dst); + if (space < 0) + space = 0; + len = MIN(max, MIN(space, sbavail(sb_src))); + if (len == 0) { + SOCK_RECVBUF_LOCK(so_src); + if ((sb_src->sb_state & SBS_CANTRCVMORE) != 0) + error = EPIPE; + SOCK_RECVBUF_UNLOCK(so_src); + } else { + flags = MSG_DONTWAIT; + uio.uio_resid = len; + if (splice_receive_stream && sb_src->sb_tls_info == NULL) { + error = soreceive_stream_locked(so_src, sb_src, NULL, + &uio, &m, NULL, flags); + } else { + error = soreceive_generic_locked(so_src, NULL, + &uio, &m, NULL, &flags); + } + if (error != 0 && m != NULL) { + m_freem(m); + m = NULL; + } + } + if (m != NULL) { + len -= uio.uio_resid; + error = sosend_generic_locked(so_dst, NULL, NULL, m, NULL, + MSG_DONTWAIT, curthread); + } else if (error == 0) { + len = 0; + SOCK_SENDBUF_LOCK(so_dst); + if ((sb_dst->sb_state & SBS_CANTSENDMORE) != 0) + error = EPIPE; + SOCK_SENDBUF_UNLOCK(so_dst); + } + if (error == 0) + *lenp = len; + return (error); +} + +/* + * Transfer data from the source to the sink. + * + * If "direct" is true, the transfer is done in the context of whichever thread + * is operating on one of the socket buffers. We do not know which locks are + * held, so we can only trylock the socket buffers; if this fails, we fall back + * to the worker thread, which invokes this routine with "direct" set to false. + */ +static void +so_splice_xfer(struct so_splice *sp) +{ + struct socket *so_src, *so_dst; + off_t max; + ssize_t len; + int error; + + mtx_assert(&sp->mtx, MA_OWNED); + KASSERT(sp->state == SPLICE_QUEUED || sp->state == SPLICE_CLOSING, + ("so_splice_xfer: invalid state %d", sp->state)); + KASSERT(sp->max != 0, ("so_splice_xfer: max == 0")); + + if (sp->state == SPLICE_CLOSING) { + /* Userspace asked us to close the splice. */ + goto closing; + } + + sp->state = SPLICE_RUNNING; + so_src = sp->src; + so_dst = sp->dst; + max = sp->max > 0 ? sp->max - so_src->so_splice_sent : OFF_MAX; + if (max < 0) + max = 0; + + /* + * Lock the sockets in order to block userspace from doing anything + * sneaky. If an error occurs or one of the sockets can no longer + * transfer data, we will automatically unsplice. + */ + mtx_unlock(&sp->mtx); + splice_lock_pair(so_src, so_dst); + + error = so_splice_xfer_data(so_src, so_dst, max, &len); + + mtx_lock(&sp->mtx); + + /* + * Update our stats while still holding the socket locks. This + * synchronizes with getsockopt(SO_SPLICE), see the comment there. + */ + if (error == 0) { + KASSERT(len >= 0, ("%s: len %zd < 0", __func__, len)); + so_src->so_splice_sent += len; + } + splice_unlock_pair(so_src, so_dst); + + switch (sp->state) { + case SPLICE_CLOSING: +closing: + sp->state = SPLICE_CLOSED; + wakeup(sp); + mtx_unlock(&sp->mtx); + break; + case SPLICE_RUNNING: + if (error != 0 || + (sp->max > 0 && so_src->so_splice_sent >= sp->max)) { + sp->state = SPLICE_EXCEPTION; + soref(so_src); + mtx_unlock(&sp->mtx); + (void)so_unsplice(so_src, false); + sorele(so_src); + } else { + /* + * Locklessly check for additional bytes in the source's + * receive buffer and queue more work if possible. We + * may end up queuing needless work, but that's ok, and + * if we race with a thread inserting more data into the + * buffer and observe sbavail() == 0, the splice mutex + * ensures that splice_push() will queue more work for + * us. + */ + if (sbavail(&so_src->so_rcv) > 0 && + sbspace(&so_dst->so_snd) > 0) { + sp->state = SPLICE_QUEUED; + mtx_unlock(&sp->mtx); + so_splice_dispatch_async(sp); + } else { + sp->state = SPLICE_IDLE; + mtx_unlock(&sp->mtx); + } + } + break; + default: + __assert_unreachable(); + } +} + static void socket_init(void *tag) { socket_zone = uma_zcreate("socket", sizeof(struct socket), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); maxsockets = uma_zone_set_max(socket_zone, maxsockets); uma_zone_set_warning(socket_zone, "kern.ipc.maxsockets limit reached"); EVENTHANDLER_REGISTER(maxsockets_change, socket_zone_change, NULL, EVENTHANDLER_PRI_FIRST); } SYSINIT(socket, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, socket_init, NULL); #ifdef SOCKET_HHOOK static void socket_hhook_register(int subtype) { if (hhook_head_register(HHOOK_TYPE_SOCKET, subtype, &V_socket_hhh[subtype], HHOOK_NOWAIT|HHOOK_HEADISINVNET) != 0) printf("%s: WARNING: unable to register hook\n", __func__); } static void socket_hhook_deregister(int subtype) { if (hhook_head_deregister(V_socket_hhh[subtype]) != 0) printf("%s: WARNING: unable to deregister hook\n", __func__); } static void socket_vnet_init(const void *unused __unused) { int i; /* We expect a contiguous range */ for (i = 0; i <= HHOOK_SOCKET_LAST; i++) socket_hhook_register(i); } VNET_SYSINIT(socket_vnet_init, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, socket_vnet_init, NULL); static void socket_vnet_uninit(const void *unused __unused) { int i; for (i = 0; i <= HHOOK_SOCKET_LAST; i++) socket_hhook_deregister(i); } VNET_SYSUNINIT(socket_vnet_uninit, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, socket_vnet_uninit, NULL); #endif /* SOCKET_HHOOK */ /* * Initialise maxsockets. This SYSINIT must be run after * tunable_mbinit(). */ static void init_maxsockets(void *ignored) { TUNABLE_INT_FETCH("kern.ipc.maxsockets", &maxsockets); maxsockets = imax(maxsockets, maxfiles); } SYSINIT(param, SI_SUB_TUNABLES, SI_ORDER_ANY, init_maxsockets, NULL); /* * Sysctl to get and set the maximum global sockets limit. Notify protocols * of the change so that they can update their dependent limits as required. */ static int sysctl_maxsockets(SYSCTL_HANDLER_ARGS) { int error, newmaxsockets; newmaxsockets = maxsockets; error = sysctl_handle_int(oidp, &newmaxsockets, 0, req); if (error == 0 && req->newptr && newmaxsockets != maxsockets) { if (newmaxsockets > maxsockets && newmaxsockets <= maxfiles) { maxsockets = newmaxsockets; EVENTHANDLER_INVOKE(maxsockets_change); } else error = EINVAL; } return (error); } SYSCTL_PROC(_kern_ipc, OID_AUTO, maxsockets, CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, &maxsockets, 0, sysctl_maxsockets, "IU", "Maximum number of sockets available"); /* * Socket operation routines. These routines are called by the routines in * sys_socket.c or from a system process, and implement the semantics of * socket operations by switching out to the protocol specific routines. */ /* * Get a socket structure from our zone, and initialize it. Note that it * would probably be better to allocate socket and PCB at the same time, but * I'm not convinced that all the protocols can be easily modified to do * this. * * soalloc() returns a socket with a ref count of 0. */ static struct socket * soalloc(struct vnet *vnet) { struct socket *so; so = uma_zalloc(socket_zone, M_NOWAIT | M_ZERO); if (so == NULL) return (NULL); #ifdef MAC if (mac_socket_init(so, M_NOWAIT) != 0) { uma_zfree(socket_zone, so); return (NULL); } #endif if (khelp_init_osd(HELPER_CLASS_SOCKET, &so->osd)) { uma_zfree(socket_zone, so); return (NULL); } /* * The socket locking protocol allows to lock 2 sockets at a time, * however, the first one must be a listening socket. WITNESS lacks * a feature to change class of an existing lock, so we use DUPOK. */ mtx_init(&so->so_lock, "socket", NULL, MTX_DEF | MTX_DUPOK); mtx_init(&so->so_snd_mtx, "so_snd", NULL, MTX_DEF); mtx_init(&so->so_rcv_mtx, "so_rcv", NULL, MTX_DEF); so->so_rcv.sb_sel = &so->so_rdsel; so->so_snd.sb_sel = &so->so_wrsel; sx_init(&so->so_snd_sx, "so_snd_sx"); sx_init(&so->so_rcv_sx, "so_rcv_sx"); TAILQ_INIT(&so->so_snd.sb_aiojobq); TAILQ_INIT(&so->so_rcv.sb_aiojobq); TASK_INIT(&so->so_snd.sb_aiotask, 0, soaio_snd, so); TASK_INIT(&so->so_rcv.sb_aiotask, 0, soaio_rcv, so); #ifdef VIMAGE VNET_ASSERT(vnet != NULL, ("%s:%d vnet is NULL, so=%p", __func__, __LINE__, so)); so->so_vnet = vnet; #endif #ifdef SOCKET_HHOOK /* We shouldn't need the so_global_mtx */ if (hhook_run_socket(so, NULL, HHOOK_SOCKET_CREATE)) { /* Do we need more comprehensive error returns? */ uma_zfree(socket_zone, so); return (NULL); } #endif mtx_lock(&so_global_mtx); so->so_gencnt = ++so_gencnt; ++numopensockets; #ifdef VIMAGE vnet->vnet_sockcnt++; #endif mtx_unlock(&so_global_mtx); return (so); } /* * Free the storage associated with a socket at the socket layer, tear down * locks, labels, etc. All protocol state is assumed already to have been * torn down (and possibly never set up) by the caller. */ void sodealloc(struct socket *so) { KASSERT(so->so_count == 0, ("sodealloc(): so_count %d", so->so_count)); KASSERT(so->so_pcb == NULL, ("sodealloc(): so_pcb != NULL")); mtx_lock(&so_global_mtx); so->so_gencnt = ++so_gencnt; --numopensockets; /* Could be below, but faster here. */ #ifdef VIMAGE VNET_ASSERT(so->so_vnet != NULL, ("%s:%d so_vnet is NULL, so=%p", __func__, __LINE__, so)); so->so_vnet->vnet_sockcnt--; #endif mtx_unlock(&so_global_mtx); #ifdef MAC mac_socket_destroy(so); #endif #ifdef SOCKET_HHOOK hhook_run_socket(so, NULL, HHOOK_SOCKET_CLOSE); #endif khelp_destroy_osd(&so->osd); if (SOLISTENING(so)) { if (so->sol_accept_filter != NULL) accept_filt_setopt(so, NULL); } else { if (so->so_rcv.sb_hiwat) (void)chgsbsize(so->so_cred->cr_uidinfo, &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY); if (so->so_snd.sb_hiwat) (void)chgsbsize(so->so_cred->cr_uidinfo, &so->so_snd.sb_hiwat, 0, RLIM_INFINITY); sx_destroy(&so->so_snd_sx); sx_destroy(&so->so_rcv_sx); mtx_destroy(&so->so_snd_mtx); mtx_destroy(&so->so_rcv_mtx); } crfree(so->so_cred); mtx_destroy(&so->so_lock); uma_zfree(socket_zone, so); } /* * socreate returns a socket with a ref count of 1 and a file descriptor * reference. The socket should be closed with soclose(). */ int socreate(int dom, struct socket **aso, int type, int proto, struct ucred *cred, struct thread *td) { struct protosw *prp; struct socket *so; int error; /* * XXX: divert(4) historically abused PF_INET. Keep this compatibility * shim until all applications have been updated. */ if (__predict_false(dom == PF_INET && type == SOCK_RAW && proto == IPPROTO_DIVERT)) { dom = PF_DIVERT; printf("%s uses obsolete way to create divert(4) socket\n", td->td_proc->p_comm); } prp = pffindproto(dom, type, proto); if (prp == NULL) { /* No support for domain. */ if (pffinddomain(dom) == NULL) return (EAFNOSUPPORT); /* No support for socket type. */ if (proto == 0 && type != 0) return (EPROTOTYPE); return (EPROTONOSUPPORT); } MPASS(prp->pr_attach); if ((prp->pr_flags & PR_CAPATTACH) == 0) { if (CAP_TRACING(td)) ktrcapfail(CAPFAIL_PROTO, &proto); if (IN_CAPABILITY_MODE(td)) return (ECAPMODE); } if (prison_check_af(cred, prp->pr_domain->dom_family) != 0) return (EPROTONOSUPPORT); so = soalloc(CRED_TO_VNET(cred)); if (so == NULL) return (ENOBUFS); so->so_type = type; so->so_cred = crhold(cred); if ((prp->pr_domain->dom_family == PF_INET) || (prp->pr_domain->dom_family == PF_INET6) || (prp->pr_domain->dom_family == PF_ROUTE)) so->so_fibnum = td->td_proc->p_fibnum; else so->so_fibnum = 0; so->so_proto = prp; #ifdef MAC mac_socket_create(cred, so); #endif knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock, so_rdknl_assert_lock); knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock, so_wrknl_assert_lock); if ((prp->pr_flags & PR_SOCKBUF) == 0) { so->so_snd.sb_mtx = &so->so_snd_mtx; so->so_rcv.sb_mtx = &so->so_rcv_mtx; } /* * Auto-sizing of socket buffers is managed by the protocols and * the appropriate flags must be set in the pru_attach function. */ CURVNET_SET(so->so_vnet); error = prp->pr_attach(so, proto, td); CURVNET_RESTORE(); if (error) { sodealloc(so); return (error); } soref(so); *aso = so; return (0); } #ifdef REGRESSION static int regression_sonewconn_earlytest = 1; SYSCTL_INT(_regression, OID_AUTO, sonewconn_earlytest, CTLFLAG_RW, ®ression_sonewconn_earlytest, 0, "Perform early sonewconn limit test"); #endif static int sooverprio = LOG_DEBUG; SYSCTL_INT(_kern_ipc, OID_AUTO, sooverprio, CTLFLAG_RW, &sooverprio, 0, "Log priority for listen socket overflows: 0..7 or -1 to disable"); static struct timeval overinterval = { 60, 0 }; SYSCTL_TIMEVAL_SEC(_kern_ipc, OID_AUTO, sooverinterval, CTLFLAG_RW, &overinterval, "Delay in seconds between warnings for listen socket overflows"); /* * When an attempt at a new connection is noted on a socket which supports * accept(2), the protocol has two options: * 1) Call legacy sonewconn() function, which would call protocol attach * method, same as used for socket(2). * 2) Call solisten_clone(), do attach that is specific to a cloned connection, * and then call solisten_enqueue(). * * Note: the ref count on the socket is 0 on return. */ struct socket * solisten_clone(struct socket *head) { struct sbuf descrsb; struct socket *so; int len, overcount; u_int qlen; const char localprefix[] = "local:"; char descrbuf[SUNPATHLEN + sizeof(localprefix)]; #if defined(INET6) char addrbuf[INET6_ADDRSTRLEN]; #elif defined(INET) char addrbuf[INET_ADDRSTRLEN]; #endif bool dolog, over; SOLISTEN_LOCK(head); over = (head->sol_qlen > 3 * head->sol_qlimit / 2); #ifdef REGRESSION if (regression_sonewconn_earlytest && over) { #else if (over) { #endif head->sol_overcount++; dolog = (sooverprio >= 0) && !!ratecheck(&head->sol_lastover, &overinterval); /* * If we're going to log, copy the overflow count and queue * length from the listen socket before dropping the lock. * Also, reset the overflow count. */ if (dolog) { overcount = head->sol_overcount; head->sol_overcount = 0; qlen = head->sol_qlen; } SOLISTEN_UNLOCK(head); if (dolog) { /* * Try to print something descriptive about the * socket for the error message. */ sbuf_new(&descrsb, descrbuf, sizeof(descrbuf), SBUF_FIXEDLEN); switch (head->so_proto->pr_domain->dom_family) { #if defined(INET) || defined(INET6) #ifdef INET case AF_INET: #endif #ifdef INET6 case AF_INET6: if (head->so_proto->pr_domain->dom_family == AF_INET6 || (sotoinpcb(head)->inp_inc.inc_flags & INC_ISIPV6)) { ip6_sprintf(addrbuf, &sotoinpcb(head)->inp_inc.inc6_laddr); sbuf_printf(&descrsb, "[%s]", addrbuf); } else #endif { #ifdef INET inet_ntoa_r( sotoinpcb(head)->inp_inc.inc_laddr, addrbuf); sbuf_cat(&descrsb, addrbuf); #endif } sbuf_printf(&descrsb, ":%hu (proto %u)", ntohs(sotoinpcb(head)->inp_inc.inc_lport), head->so_proto->pr_protocol); break; #endif /* INET || INET6 */ case AF_UNIX: sbuf_cat(&descrsb, localprefix); if (sotounpcb(head)->unp_addr != NULL) len = sotounpcb(head)->unp_addr->sun_len - offsetof(struct sockaddr_un, sun_path); else len = 0; if (len > 0) sbuf_bcat(&descrsb, sotounpcb(head)->unp_addr->sun_path, len); else sbuf_cat(&descrsb, "(unknown)"); break; } /* * If we can't print something more specific, at least * print the domain name. */ if (sbuf_finish(&descrsb) != 0 || sbuf_len(&descrsb) <= 0) { sbuf_clear(&descrsb); sbuf_cat(&descrsb, head->so_proto->pr_domain->dom_name ?: "unknown"); sbuf_finish(&descrsb); } KASSERT(sbuf_len(&descrsb) > 0, ("%s: sbuf creation failed", __func__)); /* * Preserve the historic listen queue overflow log * message, that starts with "sonewconn:". It has * been known to sysadmins for years and also test * sys/kern/sonewconn_overflow checks for it. */ if (head->so_cred == 0) { log(LOG_PRI(sooverprio), "sonewconn: pcb %p (%s): " "Listen queue overflow: %i already in " "queue awaiting acceptance (%d " "occurrences)\n", head->so_pcb, sbuf_data(&descrsb), qlen, overcount); } else { log(LOG_PRI(sooverprio), "sonewconn: pcb %p (%s): " "Listen queue overflow: " "%i already in queue awaiting acceptance " "(%d occurrences), euid %d, rgid %d, jail %s\n", head->so_pcb, sbuf_data(&descrsb), qlen, overcount, head->so_cred->cr_uid, head->so_cred->cr_rgid, head->so_cred->cr_prison ? head->so_cred->cr_prison->pr_name : "not_jailed"); } sbuf_delete(&descrsb); overcount = 0; } return (NULL); } SOLISTEN_UNLOCK(head); VNET_ASSERT(head->so_vnet != NULL, ("%s: so %p vnet is NULL", __func__, head)); so = soalloc(head->so_vnet); if (so == NULL) { log(LOG_DEBUG, "%s: pcb %p: New socket allocation failure: " "limit reached or out of memory\n", __func__, head->so_pcb); return (NULL); } so->so_listen = head; so->so_type = head->so_type; /* * POSIX is ambiguous on what options an accept(2)ed socket should * inherit from the listener. Words "create a new socket" may be * interpreted as not inheriting anything. Best programming practice * for application developers is to not rely on such inheritance. * FreeBSD had historically inherited all so_options excluding * SO_ACCEPTCONN, which virtually means all SOL_SOCKET level options, * including those completely irrelevant to a new born socket. For * compatibility with older versions we will inherit a list of * meaningful options. * The crucial bit to inherit is SO_ACCEPTFILTER. We need it present * in the child socket for soisconnected() promoting socket from the * incomplete queue to complete. It will be cleared before the child * gets available to accept(2). */ so->so_options = head->so_options & (SO_ACCEPTFILTER | SO_KEEPALIVE | SO_DONTROUTE | SO_LINGER | SO_OOBINLINE | SO_NOSIGPIPE); so->so_linger = head->so_linger; so->so_state = head->so_state; so->so_fibnum = head->so_fibnum; so->so_proto = head->so_proto; so->so_cred = crhold(head->so_cred); #ifdef SOCKET_HHOOK if (V_socket_hhh[HHOOK_SOCKET_NEWCONN]->hhh_nhooks > 0) { if (hhook_run_socket(so, head, HHOOK_SOCKET_NEWCONN)) { sodealloc(so); log(LOG_DEBUG, "%s: hhook run failed\n", __func__); return (NULL); } } #endif #ifdef MAC mac_socket_newconn(head, so); #endif knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock, so_rdknl_assert_lock); knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock, so_wrknl_assert_lock); VNET_SO_ASSERT(head); if (soreserve(so, head->sol_sbsnd_hiwat, head->sol_sbrcv_hiwat)) { sodealloc(so); log(LOG_DEBUG, "%s: pcb %p: soreserve() failed\n", __func__, head->so_pcb); return (NULL); } so->so_rcv.sb_lowat = head->sol_sbrcv_lowat; so->so_snd.sb_lowat = head->sol_sbsnd_lowat; so->so_rcv.sb_timeo = head->sol_sbrcv_timeo; so->so_snd.sb_timeo = head->sol_sbsnd_timeo; so->so_rcv.sb_flags = head->sol_sbrcv_flags & SB_AUTOSIZE; so->so_snd.sb_flags = head->sol_sbsnd_flags & SB_AUTOSIZE; if ((so->so_proto->pr_flags & PR_SOCKBUF) == 0) { so->so_snd.sb_mtx = &so->so_snd_mtx; so->so_rcv.sb_mtx = &so->so_rcv_mtx; } return (so); } /* Connstatus may be 0 or SS_ISCONNECTED. */ struct socket * sonewconn(struct socket *head, int connstatus) { struct socket *so; if ((so = solisten_clone(head)) == NULL) return (NULL); if (so->so_proto->pr_attach(so, 0, NULL) != 0) { sodealloc(so); log(LOG_DEBUG, "%s: pcb %p: pr_attach() failed\n", __func__, head->so_pcb); return (NULL); } (void)solisten_enqueue(so, connstatus); return (so); } /* * Enqueue socket cloned by solisten_clone() to the listen queue of the * listener it has been cloned from. * * Return 'true' if socket landed on complete queue, otherwise 'false'. */ bool solisten_enqueue(struct socket *so, int connstatus) { struct socket *head = so->so_listen; MPASS(refcount_load(&so->so_count) == 0); refcount_init(&so->so_count, 1); SOLISTEN_LOCK(head); if (head->sol_accept_filter != NULL) connstatus = 0; so->so_state |= connstatus; soref(head); /* A socket on (in)complete queue refs head. */ if (connstatus) { TAILQ_INSERT_TAIL(&head->sol_comp, so, so_list); so->so_qstate = SQ_COMP; head->sol_qlen++; solisten_wakeup(head); /* unlocks */ return (true); } else { /* * Keep removing sockets from the head until there's room for * us to insert on the tail. In pre-locking revisions, this * was a simple if(), but as we could be racing with other * threads and soabort() requires dropping locks, we must * loop waiting for the condition to be true. */ while (head->sol_incqlen > head->sol_qlimit) { struct socket *sp; sp = TAILQ_FIRST(&head->sol_incomp); TAILQ_REMOVE(&head->sol_incomp, sp, so_list); head->sol_incqlen--; SOCK_LOCK(sp); sp->so_qstate = SQ_NONE; sp->so_listen = NULL; SOCK_UNLOCK(sp); sorele_locked(head); /* does SOLISTEN_UNLOCK, head stays */ soabort(sp); SOLISTEN_LOCK(head); } TAILQ_INSERT_TAIL(&head->sol_incomp, so, so_list); so->so_qstate = SQ_INCOMP; head->sol_incqlen++; SOLISTEN_UNLOCK(head); return (false); } } #if defined(SCTP) || defined(SCTP_SUPPORT) /* * Socket part of sctp_peeloff(). Detach a new socket from an * association. The new socket is returned with a reference. * * XXXGL: reduce copy-paste with solisten_clone(). */ struct socket * sopeeloff(struct socket *head) { struct socket *so; VNET_ASSERT(head->so_vnet != NULL, ("%s:%d so_vnet is NULL, head=%p", __func__, __LINE__, head)); so = soalloc(head->so_vnet); if (so == NULL) { log(LOG_DEBUG, "%s: pcb %p: New socket allocation failure: " "limit reached or out of memory\n", __func__, head->so_pcb); return (NULL); } so->so_type = head->so_type; so->so_options = head->so_options; so->so_linger = head->so_linger; so->so_state = (head->so_state & SS_NBIO) | SS_ISCONNECTED; so->so_fibnum = head->so_fibnum; so->so_proto = head->so_proto; so->so_cred = crhold(head->so_cred); #ifdef MAC mac_socket_newconn(head, so); #endif knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock, so_rdknl_assert_lock); knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock, so_wrknl_assert_lock); VNET_SO_ASSERT(head); if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat)) { sodealloc(so); log(LOG_DEBUG, "%s: pcb %p: soreserve() failed\n", __func__, head->so_pcb); return (NULL); } if ((*so->so_proto->pr_attach)(so, 0, NULL)) { sodealloc(so); log(LOG_DEBUG, "%s: pcb %p: pru_attach() failed\n", __func__, head->so_pcb); return (NULL); } so->so_rcv.sb_lowat = head->so_rcv.sb_lowat; so->so_snd.sb_lowat = head->so_snd.sb_lowat; so->so_rcv.sb_timeo = head->so_rcv.sb_timeo; so->so_snd.sb_timeo = head->so_snd.sb_timeo; so->so_rcv.sb_flags |= head->so_rcv.sb_flags & SB_AUTOSIZE; so->so_snd.sb_flags |= head->so_snd.sb_flags & SB_AUTOSIZE; if ((so->so_proto->pr_flags & PR_SOCKBUF) == 0) { so->so_snd.sb_mtx = &so->so_snd_mtx; so->so_rcv.sb_mtx = &so->so_rcv_mtx; } soref(so); return (so); } #endif /* SCTP */ int sobind(struct socket *so, struct sockaddr *nam, struct thread *td) { int error; CURVNET_SET(so->so_vnet); error = so->so_proto->pr_bind(so, nam, td); CURVNET_RESTORE(); return (error); } int sobindat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td) { int error; CURVNET_SET(so->so_vnet); error = so->so_proto->pr_bindat(fd, so, nam, td); CURVNET_RESTORE(); return (error); } /* * solisten() transitions a socket from a non-listening state to a listening * state, but can also be used to update the listen queue depth on an * existing listen socket. The protocol will call back into the sockets * layer using solisten_proto_check() and solisten_proto() to check and set * socket-layer listen state. Call backs are used so that the protocol can * acquire both protocol and socket layer locks in whatever order is required * by the protocol. * * Protocol implementors are advised to hold the socket lock across the * socket-layer test and set to avoid races at the socket layer. */ int solisten(struct socket *so, int backlog, struct thread *td) { int error; CURVNET_SET(so->so_vnet); error = so->so_proto->pr_listen(so, backlog, td); CURVNET_RESTORE(); return (error); } /* * Prepare for a call to solisten_proto(). Acquire all socket buffer locks in * order to interlock with socket I/O. */ int solisten_proto_check(struct socket *so) { SOCK_LOCK_ASSERT(so); if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) != 0) return (EINVAL); /* * Sleeping is not permitted here, so simply fail if userspace is * attempting to transmit or receive on the socket. This kind of * transient failure is not ideal, but it should occur only if userspace * is misusing the socket interfaces. */ if (!sx_try_xlock(&so->so_snd_sx)) return (EAGAIN); if (!sx_try_xlock(&so->so_rcv_sx)) { sx_xunlock(&so->so_snd_sx); return (EAGAIN); } mtx_lock(&so->so_snd_mtx); mtx_lock(&so->so_rcv_mtx); /* Interlock with soo_aio_queue() and KTLS. */ if (!SOLISTENING(so)) { bool ktls; #ifdef KERN_TLS ktls = so->so_snd.sb_tls_info != NULL || so->so_rcv.sb_tls_info != NULL; #else ktls = false; #endif if (ktls || (so->so_snd.sb_flags & (SB_AIO | SB_AIO_RUNNING)) != 0 || (so->so_rcv.sb_flags & (SB_AIO | SB_AIO_RUNNING)) != 0) { solisten_proto_abort(so); return (EINVAL); } } return (0); } /* * Undo the setup done by solisten_proto_check(). */ void solisten_proto_abort(struct socket *so) { mtx_unlock(&so->so_snd_mtx); mtx_unlock(&so->so_rcv_mtx); sx_xunlock(&so->so_snd_sx); sx_xunlock(&so->so_rcv_sx); } void solisten_proto(struct socket *so, int backlog) { int sbrcv_lowat, sbsnd_lowat; u_int sbrcv_hiwat, sbsnd_hiwat; short sbrcv_flags, sbsnd_flags; sbintime_t sbrcv_timeo, sbsnd_timeo; SOCK_LOCK_ASSERT(so); KASSERT((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) == 0, ("%s: bad socket state %p", __func__, so)); if (SOLISTENING(so)) goto listening; /* * Change this socket to listening state. */ sbrcv_lowat = so->so_rcv.sb_lowat; sbsnd_lowat = so->so_snd.sb_lowat; sbrcv_hiwat = so->so_rcv.sb_hiwat; sbsnd_hiwat = so->so_snd.sb_hiwat; sbrcv_flags = so->so_rcv.sb_flags; sbsnd_flags = so->so_snd.sb_flags; sbrcv_timeo = so->so_rcv.sb_timeo; sbsnd_timeo = so->so_snd.sb_timeo; if (!(so->so_proto->pr_flags & PR_SOCKBUF)) { sbdestroy(so, SO_SND); sbdestroy(so, SO_RCV); } #ifdef INVARIANTS bzero(&so->so_rcv, sizeof(struct socket) - offsetof(struct socket, so_rcv)); #endif so->sol_sbrcv_lowat = sbrcv_lowat; so->sol_sbsnd_lowat = sbsnd_lowat; so->sol_sbrcv_hiwat = sbrcv_hiwat; so->sol_sbsnd_hiwat = sbsnd_hiwat; so->sol_sbrcv_flags = sbrcv_flags; so->sol_sbsnd_flags = sbsnd_flags; so->sol_sbrcv_timeo = sbrcv_timeo; so->sol_sbsnd_timeo = sbsnd_timeo; so->sol_qlen = so->sol_incqlen = 0; TAILQ_INIT(&so->sol_incomp); TAILQ_INIT(&so->sol_comp); so->sol_accept_filter = NULL; so->sol_accept_filter_arg = NULL; so->sol_accept_filter_str = NULL; so->sol_upcall = NULL; so->sol_upcallarg = NULL; so->so_options |= SO_ACCEPTCONN; listening: if (backlog < 0 || backlog > somaxconn) backlog = somaxconn; so->sol_qlimit = backlog; mtx_unlock(&so->so_snd_mtx); mtx_unlock(&so->so_rcv_mtx); sx_xunlock(&so->so_snd_sx); sx_xunlock(&so->so_rcv_sx); } /* * Wakeup listeners/subsystems once we have a complete connection. * Enters with lock, returns unlocked. */ void solisten_wakeup(struct socket *sol) { if (sol->sol_upcall != NULL) (void )sol->sol_upcall(sol, sol->sol_upcallarg, M_NOWAIT); else { selwakeuppri(&sol->so_rdsel, PSOCK); KNOTE_LOCKED(&sol->so_rdsel.si_note, 0); } SOLISTEN_UNLOCK(sol); wakeup_one(&sol->sol_comp); if ((sol->so_state & SS_ASYNC) && sol->so_sigio != NULL) pgsigio(&sol->so_sigio, SIGIO, 0); } /* * Return single connection off a listening socket queue. Main consumer of * the function is kern_accept4(). Some modules, that do their own accept * management also use the function. The socket reference held by the * listen queue is handed to the caller. * * Listening socket must be locked on entry and is returned unlocked on * return. * The flags argument is set of accept4(2) flags and ACCEPT4_INHERIT. */ int solisten_dequeue(struct socket *head, struct socket **ret, int flags) { struct socket *so; int error; SOLISTEN_LOCK_ASSERT(head); while (!(head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->sol_comp) && head->so_error == 0) { error = msleep(&head->sol_comp, SOCK_MTX(head), PSOCK | PCATCH, "accept", 0); if (error != 0) { SOLISTEN_UNLOCK(head); return (error); } } if (head->so_error) { error = head->so_error; head->so_error = 0; } else if ((head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->sol_comp)) error = EWOULDBLOCK; else error = 0; if (error) { SOLISTEN_UNLOCK(head); return (error); } so = TAILQ_FIRST(&head->sol_comp); SOCK_LOCK(so); KASSERT(so->so_qstate == SQ_COMP, ("%s: so %p not SQ_COMP", __func__, so)); head->sol_qlen--; so->so_qstate = SQ_NONE; so->so_listen = NULL; TAILQ_REMOVE(&head->sol_comp, so, so_list); if (flags & ACCEPT4_INHERIT) so->so_state |= (head->so_state & SS_NBIO); else so->so_state |= (flags & SOCK_NONBLOCK) ? SS_NBIO : 0; SOCK_UNLOCK(so); sorele_locked(head); *ret = so; return (0); } +static struct so_splice * +so_splice_alloc(off_t max) +{ + struct so_splice *sp; + + sp = uma_zalloc(splice_zone, M_WAITOK); + sp->src = NULL; + sp->dst = NULL; + sp->max = max > 0 ? max : -1; + do { + sp->wq_index = atomic_fetchadd_32(&splice_index, 1) % + (mp_maxid + 1); + } while (CPU_ABSENT(sp->wq_index)); + sp->state = SPLICE_IDLE; + TIMEOUT_TASK_INIT(taskqueue_thread, &sp->timeout, 0, so_splice_timeout, + sp); + return (sp); +} + +static void +so_splice_free(struct so_splice *sp) +{ + KASSERT(sp->state == SPLICE_CLOSED, + ("so_splice_free: sp %p not closed", sp)); + uma_zfree(splice_zone, sp); +} + +static void +so_splice_timeout(void *arg, int pending __unused) +{ + struct so_splice *sp; + + sp = arg; + (void)so_unsplice(sp->src, true); +} + +/* + * Splice the output from so to the input of so2. + */ +static int +so_splice(struct socket *so, struct socket *so2, struct splice *splice) +{ + struct so_splice *sp; + int error; + + if (splice->sp_max < 0) + return (EINVAL); + /* Handle only TCP for now; TODO: other streaming protos */ + if (so->so_proto->pr_protocol != IPPROTO_TCP || + so2->so_proto->pr_protocol != IPPROTO_TCP) + return (EPROTONOSUPPORT); + if (so->so_vnet != so2->so_vnet) + return (EINVAL); + + /* so_splice_xfer() assumes that we're using these implementations. */ + KASSERT(so->so_proto->pr_sosend == sosend_generic, + ("so_splice: sosend not sosend_generic")); + KASSERT(so2->so_proto->pr_soreceive == soreceive_generic || + so2->so_proto->pr_soreceive == soreceive_stream, + ("so_splice: soreceive not soreceive_generic/stream")); + + sp = so_splice_alloc(splice->sp_max); + so->so_splice_sent = 0; + sp->src = so; + sp->dst = so2; + + error = 0; + SOCK_LOCK(so); + if (SOLISTENING(so)) + error = EINVAL; + else if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) == 0) + error = ENOTCONN; + else if (so->so_splice != NULL) + error = EBUSY; + if (error != 0) { + SOCK_UNLOCK(so); + uma_zfree(splice_zone, sp); + return (error); + } + soref(so); + so->so_splice = sp; + SOCK_RECVBUF_LOCK(so); + so->so_rcv.sb_flags |= SB_SPLICED; + SOCK_RECVBUF_UNLOCK(so); + SOCK_UNLOCK(so); + + error = 0; + SOCK_LOCK(so2); + if (SOLISTENING(so2)) + error = EINVAL; + else if ((so2->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) == 0) + error = ENOTCONN; + else if (so2->so_splice_back != NULL) + error = EBUSY; + if (error != 0) { + SOCK_UNLOCK(so2); + SOCK_LOCK(so); + so->so_splice = NULL; + SOCK_RECVBUF_LOCK(so); + so->so_rcv.sb_flags &= ~SB_SPLICED; + SOCK_RECVBUF_UNLOCK(so); + SOCK_UNLOCK(so); + sorele(so); + uma_zfree(splice_zone, sp); + return (error); + } + soref(so2); + so2->so_splice_back = sp; + SOCK_SENDBUF_LOCK(so2); + so2->so_snd.sb_flags |= SB_SPLICED; + mtx_lock(&sp->mtx); + SOCK_SENDBUF_UNLOCK(so2); + SOCK_UNLOCK(so2); + + if (splice->sp_idle.tv_sec != 0 || splice->sp_idle.tv_usec != 0) { + taskqueue_enqueue_timeout_sbt(taskqueue_thread, &sp->timeout, + tvtosbt(splice->sp_idle), 0, C_PREL(4)); + } + + /* + * Transfer any data already present in the socket buffer. + */ + sp->state = SPLICE_QUEUED; + so_splice_xfer(sp); + return (0); +} + +static int +so_unsplice(struct socket *so, bool timeout) +{ + struct socket *so2; + struct so_splice *sp; + bool drain; + + /* + * First unset SB_SPLICED and hide the splice structure so that + * wakeup routines will stop enqueuing work. This also ensures that + * a only a single thread will proceed with the unsplice. + */ + SOCK_LOCK(so); + if (SOLISTENING(so)) { + SOCK_UNLOCK(so); + return (EINVAL); + } + SOCK_RECVBUF_LOCK(so); + if ((so->so_rcv.sb_flags & SB_SPLICED) == 0) { + SOCK_RECVBUF_UNLOCK(so); + SOCK_UNLOCK(so); + return (ENOTCONN); + } + so->so_rcv.sb_flags &= ~SB_SPLICED; + sp = so->so_splice; + so->so_splice = NULL; + SOCK_RECVBUF_UNLOCK(so); + SOCK_UNLOCK(so); + + so2 = sp->dst; + SOCK_LOCK(so2); + KASSERT(!SOLISTENING(so2), ("%s: so2 is listening", __func__)); + SOCK_SENDBUF_LOCK(so2); + KASSERT((so2->so_snd.sb_flags & SB_SPLICED) != 0, + ("%s: so2 is not spliced", __func__)); + KASSERT(so2->so_splice_back == sp, + ("%s: so_splice_back != sp", __func__)); + so2->so_snd.sb_flags &= ~SB_SPLICED; + so2->so_splice_back = NULL; + SOCK_SENDBUF_UNLOCK(so2); + SOCK_UNLOCK(so2); + + /* + * No new work is being enqueued. The worker thread might be + * splicing data right now, in which case we want to wait for it to + * finish before proceeding. + */ + mtx_lock(&sp->mtx); + switch (sp->state) { + case SPLICE_QUEUED: + case SPLICE_RUNNING: + sp->state = SPLICE_CLOSING; + while (sp->state == SPLICE_CLOSING) + msleep(sp, &sp->mtx, PSOCK, "unsplice", 0); + break; + case SPLICE_IDLE: + case SPLICE_EXCEPTION: + sp->state = SPLICE_CLOSED; + break; + default: + __assert_unreachable(); + } + if (!timeout) { + drain = taskqueue_cancel_timeout(taskqueue_thread, &sp->timeout, + NULL) != 0; + } else { + drain = false; + } + mtx_unlock(&sp->mtx); + if (drain) + taskqueue_drain_timeout(taskqueue_thread, &sp->timeout); + + /* + * Now we hold the sole reference to the splice structure. + * Clean up: signal userspace and release socket references. + */ + sorwakeup(so); + CURVNET_SET(so->so_vnet); + sorele(so); + sowwakeup(so2); + sorele(so2); + CURVNET_RESTORE(); + so_splice_free(sp); + return (0); +} + /* * Free socket upon release of the very last reference. */ static void sofree(struct socket *so) { struct protosw *pr = so->so_proto; SOCK_LOCK_ASSERT(so); KASSERT(refcount_load(&so->so_count) == 0, ("%s: so %p has references", __func__, so)); KASSERT(SOLISTENING(so) || so->so_qstate == SQ_NONE, ("%s: so %p is on listen queue", __func__, so)); + KASSERT(SOLISTENING(so) || (so->so_rcv.sb_flags & SB_SPLICED) == 0, + ("%s: so %p rcvbuf is spliced", __func__, so)); + KASSERT(SOLISTENING(so) || (so->so_snd.sb_flags & SB_SPLICED) == 0, + ("%s: so %p sndbuf is spliced", __func__, so)); + KASSERT(so->so_splice == NULL && so->so_splice_back == NULL, + ("%s: so %p has spliced data", __func__, so)); SOCK_UNLOCK(so); if (so->so_dtor != NULL) so->so_dtor(so); VNET_SO_ASSERT(so); if (pr->pr_detach != NULL) pr->pr_detach(so); /* * From this point on, we assume that no other references to this * socket exist anywhere else in the stack. Therefore, no locks need * to be acquired or held. */ if (!(pr->pr_flags & PR_SOCKBUF) && !SOLISTENING(so)) { sbdestroy(so, SO_SND); sbdestroy(so, SO_RCV); } seldrain(&so->so_rdsel); seldrain(&so->so_wrsel); knlist_destroy(&so->so_rdsel.si_note); knlist_destroy(&so->so_wrsel.si_note); sodealloc(so); } /* * Release a reference on a socket while holding the socket lock. * Unlocks the socket lock before returning. */ void sorele_locked(struct socket *so) { SOCK_LOCK_ASSERT(so); if (refcount_release(&so->so_count)) sofree(so); else SOCK_UNLOCK(so); } /* * Close a socket on last file table reference removal. Initiate disconnect * if connected. Free socket when disconnect complete. * * This function will sorele() the socket. Note that soclose() may be called * prior to the ref count reaching zero. The actual socket structure will * not be freed until the ref count reaches zero. */ int soclose(struct socket *so) { struct accept_queue lqueue; int error = 0; bool listening, last __diagused; CURVNET_SET(so->so_vnet); funsetown(&so->so_sigio); if (so->so_state & SS_ISCONNECTED) { if ((so->so_state & SS_ISDISCONNECTING) == 0) { error = sodisconnect(so); if (error) { if (error == ENOTCONN) error = 0; goto drop; } } if ((so->so_options & SO_LINGER) != 0 && so->so_linger != 0) { if ((so->so_state & SS_ISDISCONNECTING) && (so->so_state & SS_NBIO)) goto drop; while (so->so_state & SS_ISCONNECTED) { error = tsleep(&so->so_timeo, PSOCK | PCATCH, "soclos", so->so_linger * hz); if (error) break; } } } drop: if (so->so_proto->pr_close != NULL) so->so_proto->pr_close(so); SOCK_LOCK(so); if ((listening = SOLISTENING(so))) { struct socket *sp; TAILQ_INIT(&lqueue); TAILQ_SWAP(&lqueue, &so->sol_incomp, socket, so_list); TAILQ_CONCAT(&lqueue, &so->sol_comp, so_list); so->sol_qlen = so->sol_incqlen = 0; TAILQ_FOREACH(sp, &lqueue, so_list) { SOCK_LOCK(sp); sp->so_qstate = SQ_NONE; sp->so_listen = NULL; SOCK_UNLOCK(sp); last = refcount_release(&so->so_count); KASSERT(!last, ("%s: released last reference for %p", __func__, so)); } } sorele_locked(so); if (listening) { struct socket *sp, *tsp; TAILQ_FOREACH_SAFE(sp, &lqueue, so_list, tsp) soabort(sp); } CURVNET_RESTORE(); return (error); } /* * soabort() is used to abruptly tear down a connection, such as when a * resource limit is reached (listen queue depth exceeded), or if a listen * socket is closed while there are sockets waiting to be accepted. * * This interface is tricky, because it is called on an unreferenced socket, * and must be called only by a thread that has actually removed the socket * from the listen queue it was on. Likely this thread holds the last * reference on the socket and soabort() will proceed with sofree(). But * it might be not the last, as the sockets on the listen queues are seen * from the protocol side. * * This interface will call into the protocol code, so must not be called * with any socket locks held. Protocols do call it while holding their own * recursible protocol mutexes, but this is something that should be subject * to review in the future. * * Usually socket should have a single reference left, but this is not a * requirement. In the past, when we have had named references for file * descriptor and protocol, we asserted that none of them are being held. */ void soabort(struct socket *so) { VNET_SO_ASSERT(so); if (so->so_proto->pr_abort != NULL) so->so_proto->pr_abort(so); SOCK_LOCK(so); sorele_locked(so); } int soaccept(struct socket *so, struct sockaddr *sa) { #ifdef INVARIANTS u_char len = sa->sa_len; #endif int error; CURVNET_SET(so->so_vnet); error = so->so_proto->pr_accept(so, sa); KASSERT(sa->sa_len <= len, ("%s: protocol %p sockaddr overflow", __func__, so->so_proto)); CURVNET_RESTORE(); return (error); } int sopeeraddr(struct socket *so, struct sockaddr *sa) { #ifdef INVARIANTS u_char len = sa->sa_len; #endif int error; CURVNET_SET(so->so_vnet); error = so->so_proto->pr_peeraddr(so, sa); KASSERT(sa->sa_len <= len, ("%s: protocol %p sockaddr overflow", __func__, so->so_proto)); CURVNET_RESTORE(); return (error); } int sosockaddr(struct socket *so, struct sockaddr *sa) { #ifdef INVARIANTS u_char len = sa->sa_len; #endif int error; CURVNET_SET(so->so_vnet); error = so->so_proto->pr_sockaddr(so, sa); KASSERT(sa->sa_len <= len, ("%s: protocol %p sockaddr overflow", __func__, so->so_proto)); CURVNET_RESTORE(); return (error); } int soconnect(struct socket *so, struct sockaddr *nam, struct thread *td) { return (soconnectat(AT_FDCWD, so, nam, td)); } int soconnectat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td) { int error; CURVNET_SET(so->so_vnet); /* * If protocol is connection-based, can only connect once. * Otherwise, if connected, try to disconnect first. This allows * user to disconnect by connecting to, e.g., a null address. * * Note, this check is racy and may need to be re-evaluated at the * protocol layer. */ if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) && ((so->so_proto->pr_flags & PR_CONNREQUIRED) || (error = sodisconnect(so)))) { error = EISCONN; } else { /* * Prevent accumulated error from previous connection from * biting us. */ so->so_error = 0; if (fd == AT_FDCWD) { error = so->so_proto->pr_connect(so, nam, td); } else { error = so->so_proto->pr_connectat(fd, so, nam, td); } } CURVNET_RESTORE(); return (error); } int soconnect2(struct socket *so1, struct socket *so2) { int error; CURVNET_SET(so1->so_vnet); error = so1->so_proto->pr_connect2(so1, so2); CURVNET_RESTORE(); return (error); } int sodisconnect(struct socket *so) { int error; if ((so->so_state & SS_ISCONNECTED) == 0) return (ENOTCONN); if (so->so_state & SS_ISDISCONNECTING) return (EALREADY); VNET_SO_ASSERT(so); error = so->so_proto->pr_disconnect(so); return (error); } int sosend_dgram(struct socket *so, struct sockaddr *addr, struct uio *uio, struct mbuf *top, struct mbuf *control, int flags, struct thread *td) { long space; ssize_t resid; int clen = 0, error, dontroute; KASSERT(so->so_type == SOCK_DGRAM, ("sosend_dgram: !SOCK_DGRAM")); KASSERT(so->so_proto->pr_flags & PR_ATOMIC, ("sosend_dgram: !PR_ATOMIC")); if (uio != NULL) resid = uio->uio_resid; else resid = top->m_pkthdr.len; /* * In theory resid should be unsigned. However, space must be * signed, as it might be less than 0 if we over-committed, and we * must use a signed comparison of space and resid. On the other * hand, a negative resid causes us to loop sending 0-length * segments to the protocol. */ if (resid < 0) { error = EINVAL; goto out; } dontroute = (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0; if (td != NULL) td->td_ru.ru_msgsnd++; if (control != NULL) clen = control->m_len; SOCKBUF_LOCK(&so->so_snd); if (so->so_snd.sb_state & SBS_CANTSENDMORE) { SOCKBUF_UNLOCK(&so->so_snd); error = EPIPE; goto out; } if (so->so_error) { error = so->so_error; so->so_error = 0; SOCKBUF_UNLOCK(&so->so_snd); goto out; } if ((so->so_state & SS_ISCONNECTED) == 0) { /* * `sendto' and `sendmsg' is allowed on a connection-based * socket if it supports implied connect. Return ENOTCONN if * not connected and no address is supplied. */ if ((so->so_proto->pr_flags & PR_CONNREQUIRED) && (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) { if (!(resid == 0 && clen != 0)) { SOCKBUF_UNLOCK(&so->so_snd); error = ENOTCONN; goto out; } } else if (addr == NULL) { if (so->so_proto->pr_flags & PR_CONNREQUIRED) error = ENOTCONN; else error = EDESTADDRREQ; SOCKBUF_UNLOCK(&so->so_snd); goto out; } } /* * Do we need MSG_OOB support in SOCK_DGRAM? Signs here may be a * problem and need fixing. */ space = sbspace(&so->so_snd); if (flags & MSG_OOB) space += 1024; space -= clen; SOCKBUF_UNLOCK(&so->so_snd); if (resid > space) { error = EMSGSIZE; goto out; } if (uio == NULL) { resid = 0; if (flags & MSG_EOR) top->m_flags |= M_EOR; } else { /* * Copy the data from userland into a mbuf chain. * If no data is to be copied in, a single empty mbuf * is returned. */ top = m_uiotombuf(uio, M_WAITOK, space, max_hdr, (M_PKTHDR | ((flags & MSG_EOR) ? M_EOR : 0))); if (top == NULL) { error = EFAULT; /* only possible error */ goto out; } space -= resid - uio->uio_resid; resid = uio->uio_resid; } KASSERT(resid == 0, ("sosend_dgram: resid != 0")); /* * XXXRW: Frobbing SO_DONTROUTE here is even worse without sblock * than with. */ if (dontroute) { SOCK_LOCK(so); so->so_options |= SO_DONTROUTE; SOCK_UNLOCK(so); } /* * XXX all the SBS_CANTSENDMORE checks previously done could be out * of date. We could have received a reset packet in an interrupt or * maybe we slept while doing page faults in uiomove() etc. We could * probably recheck again inside the locking protection here, but * there are probably other places that this also happens. We must * rethink this. */ VNET_SO_ASSERT(so); error = so->so_proto->pr_send(so, (flags & MSG_OOB) ? PRUS_OOB : /* * If the user set MSG_EOF, the protocol understands this flag and * nothing left to send then use PRU_SEND_EOF instead of PRU_SEND. */ ((flags & MSG_EOF) && (so->so_proto->pr_flags & PR_IMPLOPCL) && (resid <= 0)) ? PRUS_EOF : /* If there is more to send set PRUS_MORETOCOME */ (flags & MSG_MORETOCOME) || (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0, top, addr, control, td); if (dontroute) { SOCK_LOCK(so); so->so_options &= ~SO_DONTROUTE; SOCK_UNLOCK(so); } clen = 0; control = NULL; top = NULL; out: if (top != NULL) m_freem(top); if (control != NULL) m_freem(control); return (error); } /* * Send on a socket. If send must go all at once and message is larger than * send buffering, then hard error. Lock against other senders. If must go * all at once and not enough room now, then inform user that this would * block and do nothing. Otherwise, if nonblocking, send as much as * possible. The data to be sent is described by "uio" if nonzero, otherwise * by the mbuf chain "top" (which must be null if uio is not). Data provided * in mbuf chain must be small enough to send all at once. * * Returns nonzero on error, timeout or signal; callers must check for short * counts if EINTR/ERESTART are returned. Data and control buffers are freed * on return. */ static int sosend_generic_locked(struct socket *so, struct sockaddr *addr, struct uio *uio, struct mbuf *top, struct mbuf *control, int flags, struct thread *td) { long space; ssize_t resid; int clen = 0, error, dontroute; int atomic = sosendallatonce(so) || top; int pr_send_flag; #ifdef KERN_TLS struct ktls_session *tls; int tls_enq_cnt, tls_send_flag; uint8_t tls_rtype; tls = NULL; tls_rtype = TLS_RLTYPE_APP; #endif SOCK_IO_SEND_ASSERT_LOCKED(so); if (uio != NULL) resid = uio->uio_resid; else if ((top->m_flags & M_PKTHDR) != 0) resid = top->m_pkthdr.len; else resid = m_length(top, NULL); /* * In theory resid should be unsigned. However, space must be * signed, as it might be less than 0 if we over-committed, and we * must use a signed comparison of space and resid. On the other * hand, a negative resid causes us to loop sending 0-length * segments to the protocol. * * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM * type sockets since that's an error. */ if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) { error = EINVAL; goto out; } dontroute = (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 && (so->so_proto->pr_flags & PR_ATOMIC); if (td != NULL) td->td_ru.ru_msgsnd++; if (control != NULL) clen = control->m_len; #ifdef KERN_TLS tls_send_flag = 0; tls = ktls_hold(so->so_snd.sb_tls_info); if (tls != NULL) { if (tls->mode == TCP_TLS_MODE_SW) tls_send_flag = PRUS_NOTREADY; if (control != NULL) { struct cmsghdr *cm = mtod(control, struct cmsghdr *); if (clen >= sizeof(*cm) && cm->cmsg_type == TLS_SET_RECORD_TYPE) { tls_rtype = *((uint8_t *)CMSG_DATA(cm)); clen = 0; m_freem(control); control = NULL; atomic = 1; } } if (resid == 0 && !ktls_permit_empty_frames(tls)) { error = EINVAL; goto out; } } #endif restart: do { SOCKBUF_LOCK(&so->so_snd); if (so->so_snd.sb_state & SBS_CANTSENDMORE) { SOCKBUF_UNLOCK(&so->so_snd); error = EPIPE; goto out; } if (so->so_error) { error = so->so_error; so->so_error = 0; SOCKBUF_UNLOCK(&so->so_snd); goto out; } if ((so->so_state & SS_ISCONNECTED) == 0) { /* * `sendto' and `sendmsg' is allowed on a connection- * based socket if it supports implied connect. * Return ENOTCONN if not connected and no address is * supplied. */ if ((so->so_proto->pr_flags & PR_CONNREQUIRED) && (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) { if (!(resid == 0 && clen != 0)) { SOCKBUF_UNLOCK(&so->so_snd); error = ENOTCONN; goto out; } } else if (addr == NULL) { SOCKBUF_UNLOCK(&so->so_snd); if (so->so_proto->pr_flags & PR_CONNREQUIRED) error = ENOTCONN; else error = EDESTADDRREQ; goto out; } } space = sbspace(&so->so_snd); if (flags & MSG_OOB) space += 1024; if ((atomic && resid > so->so_snd.sb_hiwat) || clen > so->so_snd.sb_hiwat) { SOCKBUF_UNLOCK(&so->so_snd); error = EMSGSIZE; goto out; } if (space < resid + clen && (atomic || space < so->so_snd.sb_lowat || space < clen)) { if ((so->so_state & SS_NBIO) || (flags & (MSG_NBIO | MSG_DONTWAIT)) != 0) { SOCKBUF_UNLOCK(&so->so_snd); error = EWOULDBLOCK; goto out; } error = sbwait(so, SO_SND); SOCKBUF_UNLOCK(&so->so_snd); if (error) goto out; goto restart; } SOCKBUF_UNLOCK(&so->so_snd); space -= clen; do { if (uio == NULL) { resid = 0; if (flags & MSG_EOR) top->m_flags |= M_EOR; #ifdef KERN_TLS if (tls != NULL) { ktls_frame(top, tls, &tls_enq_cnt, tls_rtype); tls_rtype = TLS_RLTYPE_APP; } #endif } else { /* * Copy the data from userland into a mbuf * chain. If resid is 0, which can happen * only if we have control to send, then * a single empty mbuf is returned. This * is a workaround to prevent protocol send * methods to panic. */ #ifdef KERN_TLS if (tls != NULL) { top = m_uiotombuf(uio, M_WAITOK, space, tls->params.max_frame_len, M_EXTPG | ((flags & MSG_EOR) ? M_EOR : 0)); if (top != NULL) { ktls_frame(top, tls, &tls_enq_cnt, tls_rtype); } tls_rtype = TLS_RLTYPE_APP; } else #endif top = m_uiotombuf(uio, M_WAITOK, space, (atomic ? max_hdr : 0), (atomic ? M_PKTHDR : 0) | ((flags & MSG_EOR) ? M_EOR : 0)); if (top == NULL) { error = EFAULT; /* only possible error */ goto out; } space -= resid - uio->uio_resid; resid = uio->uio_resid; } if (dontroute) { SOCK_LOCK(so); so->so_options |= SO_DONTROUTE; SOCK_UNLOCK(so); } /* * XXX all the SBS_CANTSENDMORE checks previously * done could be out of date. We could have received * a reset packet in an interrupt or maybe we slept * while doing page faults in uiomove() etc. We * could probably recheck again inside the locking * protection here, but there are probably other * places that this also happens. We must rethink * this. */ VNET_SO_ASSERT(so); pr_send_flag = (flags & MSG_OOB) ? PRUS_OOB : /* * If the user set MSG_EOF, the protocol understands * this flag and nothing left to send then use * PRU_SEND_EOF instead of PRU_SEND. */ ((flags & MSG_EOF) && (so->so_proto->pr_flags & PR_IMPLOPCL) && (resid <= 0)) ? PRUS_EOF : /* If there is more to send set PRUS_MORETOCOME. */ (flags & MSG_MORETOCOME) || (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0; #ifdef KERN_TLS pr_send_flag |= tls_send_flag; #endif error = so->so_proto->pr_send(so, pr_send_flag, top, addr, control, td); if (dontroute) { SOCK_LOCK(so); so->so_options &= ~SO_DONTROUTE; SOCK_UNLOCK(so); } #ifdef KERN_TLS if (tls != NULL && tls->mode == TCP_TLS_MODE_SW) { if (error != 0) { m_freem(top); top = NULL; } else { soref(so); ktls_enqueue(top, so, tls_enq_cnt); } } #endif clen = 0; control = NULL; top = NULL; if (error) goto out; } while (resid && space > 0); } while (resid); out: #ifdef KERN_TLS if (tls != NULL) ktls_free(tls); #endif if (top != NULL) m_freem(top); if (control != NULL) m_freem(control); return (error); } int sosend_generic(struct socket *so, struct sockaddr *addr, struct uio *uio, struct mbuf *top, struct mbuf *control, int flags, struct thread *td) { int error; error = SOCK_IO_SEND_LOCK(so, SBLOCKWAIT(flags)); if (error) return (error); error = sosend_generic_locked(so, addr, uio, top, control, flags, td); SOCK_IO_SEND_UNLOCK(so); return (error); } /* * Send to a socket from a kernel thread. * * XXXGL: in almost all cases uio is NULL and the mbuf is supplied. * Exception is nfs/bootp_subr.c. It is arguable that the VNET context needs * to be set at all. This function should just boil down to a static inline * calling the protocol method. */ int sosend(struct socket *so, struct sockaddr *addr, struct uio *uio, struct mbuf *top, struct mbuf *control, int flags, struct thread *td) { int error; CURVNET_SET(so->so_vnet); error = so->so_proto->pr_sosend(so, addr, uio, top, control, flags, td); CURVNET_RESTORE(); return (error); } /* * send(2), write(2) or aio_write(2) on a socket. */ int sousrsend(struct socket *so, struct sockaddr *addr, struct uio *uio, struct mbuf *control, int flags, struct proc *userproc) { struct thread *td; ssize_t len; int error; td = uio->uio_td; len = uio->uio_resid; CURVNET_SET(so->so_vnet); error = so->so_proto->pr_sosend(so, addr, uio, NULL, control, flags, td); CURVNET_RESTORE(); if (error != 0) { /* * Clear transient errors for stream protocols if they made * some progress. Make exclusion for aio(4) that would * schedule a new write in case of EWOULDBLOCK and clear * error itself. See soaio_process_job(). */ if (uio->uio_resid != len && (so->so_proto->pr_flags & PR_ATOMIC) == 0 && userproc == NULL && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) error = 0; /* Generation of SIGPIPE can be controlled per socket. */ if (error == EPIPE && (so->so_options & SO_NOSIGPIPE) == 0 && (flags & MSG_NOSIGNAL) == 0) { if (userproc != NULL) { /* aio(4) job */ PROC_LOCK(userproc); kern_psignal(userproc, SIGPIPE); PROC_UNLOCK(userproc); } else { PROC_LOCK(td->td_proc); tdsignal(td, SIGPIPE); PROC_UNLOCK(td->td_proc); } } } return (error); } /* * The part of soreceive() that implements reading non-inline out-of-band * data from a socket. For more complete comments, see soreceive(), from * which this code originated. * * Note that soreceive_rcvoob(), unlike the remainder of soreceive(), is * unable to return an mbuf chain to the caller. */ static int soreceive_rcvoob(struct socket *so, struct uio *uio, int flags) { struct protosw *pr = so->so_proto; struct mbuf *m; int error; KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0")); VNET_SO_ASSERT(so); m = m_get(M_WAITOK, MT_DATA); error = pr->pr_rcvoob(so, m, flags & MSG_PEEK); if (error) goto bad; do { error = uiomove(mtod(m, void *), (int) min(uio->uio_resid, m->m_len), uio); m = m_free(m); } while (uio->uio_resid && error == 0 && m); bad: if (m != NULL) m_freem(m); return (error); } /* * Following replacement or removal of the first mbuf on the first mbuf chain * of a socket buffer, push necessary state changes back into the socket * buffer so that other consumers see the values consistently. 'nextrecord' * is the callers locally stored value of the original value of * sb->sb_mb->m_nextpkt which must be restored when the lead mbuf changes. * NOTE: 'nextrecord' may be NULL. */ static __inline void sockbuf_pushsync(struct sockbuf *sb, struct mbuf *nextrecord) { SOCKBUF_LOCK_ASSERT(sb); /* * First, update for the new value of nextrecord. If necessary, make * it the first record. */ if (sb->sb_mb != NULL) sb->sb_mb->m_nextpkt = nextrecord; else sb->sb_mb = nextrecord; /* * Now update any dependent socket buffer fields to reflect the new * state. This is an expanded inline of SB_EMPTY_FIXUP(), with the * addition of a second clause that takes care of the case where * sb_mb has been updated, but remains the last record. */ if (sb->sb_mb == NULL) { sb->sb_mbtail = NULL; sb->sb_lastrecord = NULL; } else if (sb->sb_mb->m_nextpkt == NULL) sb->sb_lastrecord = sb->sb_mb; } /* * Implement receive operations on a socket. We depend on the way that * records are added to the sockbuf by sbappend. In particular, each record * (mbufs linked through m_next) must begin with an address if the protocol * so specifies, followed by an optional mbuf or mbufs containing ancillary * data, and then zero or more mbufs of data. In order to allow parallelism * between network receive and copying to user space, as well as avoid * sleeping with a mutex held, we release the socket buffer mutex during the * user space copy. Although the sockbuf is locked, new data may still be * appended, and thus we must maintain consistency of the sockbuf during that * time. * * The caller may receive the data as a single mbuf chain by supplying an * mbuf **mp0 for use in returning the chain. The uio is then used only for * the count in uio_resid. */ static int soreceive_generic_locked(struct socket *so, struct sockaddr **psa, struct uio *uio, struct mbuf **mp, struct mbuf **controlp, int *flagsp) { struct mbuf *m; int flags, error, offset; ssize_t len; struct protosw *pr = so->so_proto; struct mbuf *nextrecord; int moff, type = 0; ssize_t orig_resid = uio->uio_resid; bool report_real_len = false; SOCK_IO_RECV_ASSERT_LOCKED(so); error = 0; if (flagsp != NULL) { report_real_len = *flagsp & MSG_TRUNC; *flagsp &= ~MSG_TRUNC; flags = *flagsp &~ MSG_EOR; } else flags = 0; restart: SOCKBUF_LOCK(&so->so_rcv); m = so->so_rcv.sb_mb; /* * If we have less data than requested, block awaiting more (subject * to any timeout) if: * 1. the current count is less than the low water mark, or * 2. MSG_DONTWAIT is not set */ if (m == NULL || (((flags & MSG_DONTWAIT) == 0 && sbavail(&so->so_rcv) < uio->uio_resid) && sbavail(&so->so_rcv) < so->so_rcv.sb_lowat && m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) { KASSERT(m != NULL || !sbavail(&so->so_rcv), ("receive: m == %p sbavail == %u", m, sbavail(&so->so_rcv))); if (so->so_error || so->so_rerror) { if (m != NULL) goto dontblock; if (so->so_error) error = so->so_error; else error = so->so_rerror; if ((flags & MSG_PEEK) == 0) { if (so->so_error) so->so_error = 0; else so->so_rerror = 0; } SOCKBUF_UNLOCK(&so->so_rcv); goto release; } SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { if (m != NULL) goto dontblock; #ifdef KERN_TLS else if (so->so_rcv.sb_tlsdcc == 0 && so->so_rcv.sb_tlscc == 0) { #else else { #endif SOCKBUF_UNLOCK(&so->so_rcv); goto release; } } for (; m != NULL; m = m->m_next) if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) { m = so->so_rcv.sb_mb; goto dontblock; } if ((so->so_state & (SS_ISCONNECTING | SS_ISCONNECTED | SS_ISDISCONNECTING | SS_ISDISCONNECTED)) == 0 && (so->so_proto->pr_flags & PR_CONNREQUIRED) != 0) { SOCKBUF_UNLOCK(&so->so_rcv); error = ENOTCONN; goto release; } if (uio->uio_resid == 0 && !report_real_len) { SOCKBUF_UNLOCK(&so->so_rcv); goto release; } if ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO))) { SOCKBUF_UNLOCK(&so->so_rcv); error = EWOULDBLOCK; goto release; } SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); error = sbwait(so, SO_RCV); SOCKBUF_UNLOCK(&so->so_rcv); if (error) goto release; goto restart; } dontblock: /* * From this point onward, we maintain 'nextrecord' as a cache of the * pointer to the next record in the socket buffer. We must keep the * various socket buffer pointers and local stack versions of the * pointers in sync, pushing out modifications before dropping the * socket buffer mutex, and re-reading them when picking it up. * * Otherwise, we will race with the network stack appending new data * or records onto the socket buffer by using inconsistent/stale * versions of the field, possibly resulting in socket buffer * corruption. * * By holding the high-level sblock(), we prevent simultaneous * readers from pulling off the front of the socket buffer. */ SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (uio->uio_td) uio->uio_td->td_ru.ru_msgrcv++; KASSERT(m == so->so_rcv.sb_mb, ("soreceive: m != so->so_rcv.sb_mb")); SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); nextrecord = m->m_nextpkt; if (pr->pr_flags & PR_ADDR) { KASSERT(m->m_type == MT_SONAME, ("m->m_type == %d", m->m_type)); orig_resid = 0; if (psa != NULL) *psa = sodupsockaddr(mtod(m, struct sockaddr *), M_NOWAIT); if (flags & MSG_PEEK) { m = m->m_next; } else { sbfree(&so->so_rcv, m); so->so_rcv.sb_mb = m_free(m); m = so->so_rcv.sb_mb; sockbuf_pushsync(&so->so_rcv, nextrecord); } } /* * Process one or more MT_CONTROL mbufs present before any data mbufs * in the first mbuf chain on the socket buffer. If MSG_PEEK, we * just copy the data; if !MSG_PEEK, we call into the protocol to * perform externalization (or freeing if controlp == NULL). */ if (m != NULL && m->m_type == MT_CONTROL) { struct mbuf *cm = NULL, *cmn; struct mbuf **cme = &cm; #ifdef KERN_TLS struct cmsghdr *cmsg; struct tls_get_record tgr; /* * For MSG_TLSAPPDATA, check for an alert record. * If found, return ENXIO without removing * it from the receive queue. This allows a subsequent * call without MSG_TLSAPPDATA to receive it. * Note that, for TLS, there should only be a single * control mbuf with the TLS_GET_RECORD message in it. */ if (flags & MSG_TLSAPPDATA) { cmsg = mtod(m, struct cmsghdr *); if (cmsg->cmsg_type == TLS_GET_RECORD && cmsg->cmsg_len == CMSG_LEN(sizeof(tgr))) { memcpy(&tgr, CMSG_DATA(cmsg), sizeof(tgr)); if (__predict_false(tgr.tls_type == TLS_RLTYPE_ALERT)) { SOCKBUF_UNLOCK(&so->so_rcv); error = ENXIO; goto release; } } } #endif do { if (flags & MSG_PEEK) { if (controlp != NULL) { *controlp = m_copym(m, 0, m->m_len, M_NOWAIT); controlp = &(*controlp)->m_next; } m = m->m_next; } else { sbfree(&so->so_rcv, m); so->so_rcv.sb_mb = m->m_next; m->m_next = NULL; *cme = m; cme = &(*cme)->m_next; m = so->so_rcv.sb_mb; } } while (m != NULL && m->m_type == MT_CONTROL); if ((flags & MSG_PEEK) == 0) sockbuf_pushsync(&so->so_rcv, nextrecord); while (cm != NULL) { cmn = cm->m_next; cm->m_next = NULL; if (pr->pr_domain->dom_externalize != NULL) { SOCKBUF_UNLOCK(&so->so_rcv); VNET_SO_ASSERT(so); error = (*pr->pr_domain->dom_externalize) (cm, controlp, flags); SOCKBUF_LOCK(&so->so_rcv); } else if (controlp != NULL) *controlp = cm; else m_freem(cm); if (controlp != NULL) { while (*controlp != NULL) controlp = &(*controlp)->m_next; } cm = cmn; } if (m != NULL) nextrecord = so->so_rcv.sb_mb->m_nextpkt; else nextrecord = so->so_rcv.sb_mb; orig_resid = 0; } if (m != NULL) { if ((flags & MSG_PEEK) == 0) { KASSERT(m->m_nextpkt == nextrecord, ("soreceive: post-control, nextrecord !sync")); if (nextrecord == NULL) { KASSERT(so->so_rcv.sb_mb == m, ("soreceive: post-control, sb_mb!=m")); KASSERT(so->so_rcv.sb_lastrecord == m, ("soreceive: post-control, lastrecord!=m")); } } type = m->m_type; if (type == MT_OOBDATA) flags |= MSG_OOB; } else { if ((flags & MSG_PEEK) == 0) { KASSERT(so->so_rcv.sb_mb == nextrecord, ("soreceive: sb_mb != nextrecord")); if (so->so_rcv.sb_mb == NULL) { KASSERT(so->so_rcv.sb_lastrecord == NULL, ("soreceive: sb_lastercord != NULL")); } } } SOCKBUF_LOCK_ASSERT(&so->so_rcv); SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); /* * Now continue to read any data mbufs off of the head of the socket * buffer until the read request is satisfied. Note that 'type' is * used to store the type of any mbuf reads that have happened so far * such that soreceive() can stop reading if the type changes, which * causes soreceive() to return only one of regular data and inline * out-of-band data in a single socket receive operation. */ moff = 0; offset = 0; while (m != NULL && !(m->m_flags & M_NOTAVAIL) && uio->uio_resid > 0 && error == 0) { /* * If the type of mbuf has changed since the last mbuf * examined ('type'), end the receive operation. */ SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (m->m_type == MT_OOBDATA || m->m_type == MT_CONTROL) { if (type != m->m_type) break; } else if (type == MT_OOBDATA) break; else KASSERT(m->m_type == MT_DATA, ("m->m_type == %d", m->m_type)); so->so_rcv.sb_state &= ~SBS_RCVATMARK; len = uio->uio_resid; if (so->so_oobmark && len > so->so_oobmark - offset) len = so->so_oobmark - offset; if (len > m->m_len - moff) len = m->m_len - moff; /* * If mp is set, just pass back the mbufs. Otherwise copy * them out via the uio, then free. Sockbuf must be * consistent here (points to current mbuf, it points to next * record) when we drop priority; we must note any additions * to the sockbuf when we block interrupts again. */ if (mp == NULL) { SOCKBUF_LOCK_ASSERT(&so->so_rcv); SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); SOCKBUF_UNLOCK(&so->so_rcv); if ((m->m_flags & M_EXTPG) != 0) error = m_unmapped_uiomove(m, moff, uio, (int)len); else error = uiomove(mtod(m, char *) + moff, (int)len, uio); SOCKBUF_LOCK(&so->so_rcv); if (error) { /* * The MT_SONAME mbuf has already been removed * from the record, so it is necessary to * remove the data mbufs, if any, to preserve * the invariant in the case of PR_ADDR that * requires MT_SONAME mbufs at the head of * each record. */ if (pr->pr_flags & PR_ATOMIC && ((flags & MSG_PEEK) == 0)) (void)sbdroprecord_locked(&so->so_rcv); SOCKBUF_UNLOCK(&so->so_rcv); goto release; } } else uio->uio_resid -= len; SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (len == m->m_len - moff) { if (m->m_flags & M_EOR) flags |= MSG_EOR; if (flags & MSG_PEEK) { m = m->m_next; moff = 0; } else { nextrecord = m->m_nextpkt; sbfree(&so->so_rcv, m); if (mp != NULL) { m->m_nextpkt = NULL; *mp = m; mp = &m->m_next; so->so_rcv.sb_mb = m = m->m_next; *mp = NULL; } else { so->so_rcv.sb_mb = m_free(m); m = so->so_rcv.sb_mb; } sockbuf_pushsync(&so->so_rcv, nextrecord); SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); } } else { if (flags & MSG_PEEK) moff += len; else { if (mp != NULL) { if (flags & MSG_DONTWAIT) { *mp = m_copym(m, 0, len, M_NOWAIT); if (*mp == NULL) { /* * m_copym() couldn't * allocate an mbuf. * Adjust uio_resid back * (it was adjusted * down by len bytes, * which we didn't end * up "copying" over). */ uio->uio_resid += len; break; } } else { SOCKBUF_UNLOCK(&so->so_rcv); *mp = m_copym(m, 0, len, M_WAITOK); SOCKBUF_LOCK(&so->so_rcv); } } sbcut_locked(&so->so_rcv, len); } } SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (so->so_oobmark) { if ((flags & MSG_PEEK) == 0) { so->so_oobmark -= len; if (so->so_oobmark == 0) { so->so_rcv.sb_state |= SBS_RCVATMARK; break; } } else { offset += len; if (offset == so->so_oobmark) break; } } if (flags & MSG_EOR) break; /* * If the MSG_WAITALL flag is set (for non-atomic socket), we * must not quit until "uio->uio_resid == 0" or an error * termination. If a signal/timeout occurs, return with a * short count but without error. Keep sockbuf locked * against other readers. */ while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 && !sosendallatonce(so) && nextrecord == NULL) { SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (so->so_error || so->so_rerror || so->so_rcv.sb_state & SBS_CANTRCVMORE) break; /* * Notify the protocol that some data has been * drained before blocking. */ if (pr->pr_flags & PR_WANTRCVD) { SOCKBUF_UNLOCK(&so->so_rcv); VNET_SO_ASSERT(so); pr->pr_rcvd(so, flags); SOCKBUF_LOCK(&so->so_rcv); if (__predict_false(so->so_rcv.sb_mb == NULL && (so->so_error || so->so_rerror || so->so_rcv.sb_state & SBS_CANTRCVMORE))) break; } SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); /* * We could receive some data while was notifying * the protocol. Skip blocking in this case. */ if (so->so_rcv.sb_mb == NULL) { error = sbwait(so, SO_RCV); if (error) { SOCKBUF_UNLOCK(&so->so_rcv); goto release; } } m = so->so_rcv.sb_mb; if (m != NULL) nextrecord = m->m_nextpkt; } } SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (m != NULL && pr->pr_flags & PR_ATOMIC) { if (report_real_len) uio->uio_resid -= m_length(m, NULL) - moff; flags |= MSG_TRUNC; if ((flags & MSG_PEEK) == 0) (void) sbdroprecord_locked(&so->so_rcv); } if ((flags & MSG_PEEK) == 0) { if (m == NULL) { /* * First part is an inline SB_EMPTY_FIXUP(). Second * part makes sure sb_lastrecord is up-to-date if * there is still data in the socket buffer. */ so->so_rcv.sb_mb = nextrecord; if (so->so_rcv.sb_mb == NULL) { so->so_rcv.sb_mbtail = NULL; so->so_rcv.sb_lastrecord = NULL; } else if (nextrecord->m_nextpkt == NULL) so->so_rcv.sb_lastrecord = nextrecord; } SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); /* * If soreceive() is being done from the socket callback, * then don't need to generate ACK to peer to update window, * since ACK will be generated on return to TCP. */ if (!(flags & MSG_SOCALLBCK) && (pr->pr_flags & PR_WANTRCVD)) { SOCKBUF_UNLOCK(&so->so_rcv); VNET_SO_ASSERT(so); pr->pr_rcvd(so, flags); SOCKBUF_LOCK(&so->so_rcv); } } SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (orig_resid == uio->uio_resid && orig_resid && (flags & MSG_EOR) == 0 && (so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) { SOCKBUF_UNLOCK(&so->so_rcv); goto restart; } SOCKBUF_UNLOCK(&so->so_rcv); if (flagsp != NULL) *flagsp |= flags; release: return (error); } int soreceive_generic(struct socket *so, struct sockaddr **psa, struct uio *uio, struct mbuf **mp, struct mbuf **controlp, int *flagsp) { int error, flags; if (psa != NULL) *psa = NULL; if (controlp != NULL) *controlp = NULL; if (flagsp != NULL) { flags = *flagsp; if ((flags & MSG_OOB) != 0) return (soreceive_rcvoob(so, uio, flags)); } else { flags = 0; } if (mp != NULL) *mp = NULL; error = SOCK_IO_RECV_LOCK(so, SBLOCKWAIT(flags)); if (error) return (error); error = soreceive_generic_locked(so, psa, uio, mp, controlp, flagsp); SOCK_IO_RECV_UNLOCK(so); return (error); } /* * Optimized version of soreceive() for stream (TCP) sockets. */ static int soreceive_stream_locked(struct socket *so, struct sockbuf *sb, struct sockaddr **psa, struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int flags) { int len = 0, error = 0, oresid; struct mbuf *m, *n = NULL; SOCK_IO_RECV_ASSERT_LOCKED(so); /* Easy one, no space to copyout anything. */ if (uio->uio_resid == 0) return (EINVAL); oresid = uio->uio_resid; SOCKBUF_LOCK(sb); /* We will never ever get anything unless we are or were connected. */ if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) { error = ENOTCONN; goto out; } restart: SOCKBUF_LOCK_ASSERT(&so->so_rcv); /* Abort if socket has reported problems. */ if (so->so_error) { if (sbavail(sb) > 0) goto deliver; if (oresid > uio->uio_resid) goto out; error = so->so_error; if (!(flags & MSG_PEEK)) so->so_error = 0; goto out; } /* Door is closed. Deliver what is left, if any. */ if (sb->sb_state & SBS_CANTRCVMORE) { if (sbavail(sb) > 0) goto deliver; else goto out; } /* Socket buffer is empty and we shall not block. */ if (sbavail(sb) == 0 && ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)))) { error = EAGAIN; goto out; } /* Socket buffer got some data that we shall deliver now. */ if (sbavail(sb) > 0 && !(flags & MSG_WAITALL) && ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)) || sbavail(sb) >= sb->sb_lowat || sbavail(sb) >= uio->uio_resid || sbavail(sb) >= sb->sb_hiwat) ) { goto deliver; } /* On MSG_WAITALL we must wait until all data or error arrives. */ if ((flags & MSG_WAITALL) && (sbavail(sb) >= uio->uio_resid || sbavail(sb) >= sb->sb_hiwat)) goto deliver; /* * Wait and block until (more) data comes in. * NB: Drops the sockbuf lock during wait. */ error = sbwait(so, SO_RCV); if (error) goto out; goto restart; deliver: SOCKBUF_LOCK_ASSERT(&so->so_rcv); KASSERT(sbavail(sb) > 0, ("%s: sockbuf empty", __func__)); KASSERT(sb->sb_mb != NULL, ("%s: sb_mb == NULL", __func__)); /* Statistics. */ if (uio->uio_td) uio->uio_td->td_ru.ru_msgrcv++; /* Fill uio until full or current end of socket buffer is reached. */ len = min(uio->uio_resid, sbavail(sb)); if (mp0 != NULL) { /* Dequeue as many mbufs as possible. */ if (!(flags & MSG_PEEK) && len >= sb->sb_mb->m_len) { if (*mp0 == NULL) *mp0 = sb->sb_mb; else m_cat(*mp0, sb->sb_mb); for (m = sb->sb_mb; m != NULL && m->m_len <= len; m = m->m_next) { KASSERT(!(m->m_flags & M_NOTAVAIL), ("%s: m %p not available", __func__, m)); len -= m->m_len; uio->uio_resid -= m->m_len; sbfree(sb, m); n = m; } n->m_next = NULL; sb->sb_mb = m; sb->sb_lastrecord = sb->sb_mb; if (sb->sb_mb == NULL) SB_EMPTY_FIXUP(sb); } /* Copy the remainder. */ if (len > 0) { KASSERT(sb->sb_mb != NULL, ("%s: len > 0 && sb->sb_mb empty", __func__)); m = m_copym(sb->sb_mb, 0, len, M_NOWAIT); if (m == NULL) len = 0; /* Don't flush data from sockbuf. */ else uio->uio_resid -= len; if (*mp0 != NULL) m_cat(*mp0, m); else *mp0 = m; if (*mp0 == NULL) { error = ENOBUFS; goto out; } } } else { /* NB: Must unlock socket buffer as uiomove may sleep. */ SOCKBUF_UNLOCK(sb); error = m_mbuftouio(uio, sb->sb_mb, len); SOCKBUF_LOCK(sb); if (error) goto out; } SBLASTRECORDCHK(sb); SBLASTMBUFCHK(sb); /* * Remove the delivered data from the socket buffer unless we * were only peeking. */ if (!(flags & MSG_PEEK)) { if (len > 0) sbdrop_locked(sb, len); /* Notify protocol that we drained some data. */ if ((so->so_proto->pr_flags & PR_WANTRCVD) && (((flags & MSG_WAITALL) && uio->uio_resid > 0) || !(flags & MSG_SOCALLBCK))) { SOCKBUF_UNLOCK(sb); VNET_SO_ASSERT(so); so->so_proto->pr_rcvd(so, flags); SOCKBUF_LOCK(sb); } } /* * For MSG_WAITALL we may have to loop again and wait for * more data to come in. */ if ((flags & MSG_WAITALL) && uio->uio_resid > 0) goto restart; out: SBLASTRECORDCHK(sb); SBLASTMBUFCHK(sb); SOCKBUF_UNLOCK(sb); return (error); } int soreceive_stream(struct socket *so, struct sockaddr **psa, struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp) { struct sockbuf *sb; int error, flags; sb = &so->so_rcv; /* We only do stream sockets. */ if (so->so_type != SOCK_STREAM) return (EINVAL); if (psa != NULL) *psa = NULL; if (flagsp != NULL) flags = *flagsp & ~MSG_EOR; else flags = 0; if (controlp != NULL) *controlp = NULL; if (flags & MSG_OOB) return (soreceive_rcvoob(so, uio, flags)); if (mp0 != NULL) *mp0 = NULL; #ifdef KERN_TLS /* * KTLS store TLS records as records with a control message to * describe the framing. * * We check once here before acquiring locks to optimize the * common case. */ if (sb->sb_tls_info != NULL) return (soreceive_generic(so, psa, uio, mp0, controlp, flagsp)); #endif /* * Prevent other threads from reading from the socket. This lock may be * dropped in order to sleep waiting for data to arrive. */ error = SOCK_IO_RECV_LOCK(so, SBLOCKWAIT(flags)); if (error) return (error); #ifdef KERN_TLS if (__predict_false(sb->sb_tls_info != NULL)) { SOCK_IO_RECV_UNLOCK(so); return (soreceive_generic(so, psa, uio, mp0, controlp, flagsp)); } #endif error = soreceive_stream_locked(so, sb, psa, uio, mp0, controlp, flags); SOCK_IO_RECV_UNLOCK(so); return (error); } /* * Optimized version of soreceive() for simple datagram cases from userspace. * Unlike in the stream case, we're able to drop a datagram if copyout() * fails, and because we handle datagrams atomically, we don't need to use a * sleep lock to prevent I/O interlacing. */ int soreceive_dgram(struct socket *so, struct sockaddr **psa, struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp) { struct mbuf *m, *m2; int flags, error; ssize_t len; struct protosw *pr = so->so_proto; struct mbuf *nextrecord; if (psa != NULL) *psa = NULL; if (controlp != NULL) *controlp = NULL; if (flagsp != NULL) flags = *flagsp &~ MSG_EOR; else flags = 0; /* * For any complicated cases, fall back to the full * soreceive_generic(). */ if (mp0 != NULL || (flags & (MSG_PEEK | MSG_OOB | MSG_TRUNC))) return (soreceive_generic(so, psa, uio, mp0, controlp, flagsp)); /* * Enforce restrictions on use. */ KASSERT((pr->pr_flags & PR_WANTRCVD) == 0, ("soreceive_dgram: wantrcvd")); KASSERT(pr->pr_flags & PR_ATOMIC, ("soreceive_dgram: !atomic")); KASSERT((so->so_rcv.sb_state & SBS_RCVATMARK) == 0, ("soreceive_dgram: SBS_RCVATMARK")); KASSERT((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0, ("soreceive_dgram: P_CONNREQUIRED")); /* * Loop blocking while waiting for a datagram. */ SOCKBUF_LOCK(&so->so_rcv); while ((m = so->so_rcv.sb_mb) == NULL) { KASSERT(sbavail(&so->so_rcv) == 0, ("soreceive_dgram: sb_mb NULL but sbavail %u", sbavail(&so->so_rcv))); if (so->so_error) { error = so->so_error; so->so_error = 0; SOCKBUF_UNLOCK(&so->so_rcv); return (error); } if (so->so_rcv.sb_state & SBS_CANTRCVMORE || uio->uio_resid == 0) { SOCKBUF_UNLOCK(&so->so_rcv); return (0); } if ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO))) { SOCKBUF_UNLOCK(&so->so_rcv); return (EWOULDBLOCK); } SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); error = sbwait(so, SO_RCV); if (error) { SOCKBUF_UNLOCK(&so->so_rcv); return (error); } } SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (uio->uio_td) uio->uio_td->td_ru.ru_msgrcv++; SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); nextrecord = m->m_nextpkt; if (nextrecord == NULL) { KASSERT(so->so_rcv.sb_lastrecord == m, ("soreceive_dgram: lastrecord != m")); } KASSERT(so->so_rcv.sb_mb->m_nextpkt == nextrecord, ("soreceive_dgram: m_nextpkt != nextrecord")); /* * Pull 'm' and its chain off the front of the packet queue. */ so->so_rcv.sb_mb = NULL; sockbuf_pushsync(&so->so_rcv, nextrecord); /* * Walk 'm's chain and free that many bytes from the socket buffer. */ for (m2 = m; m2 != NULL; m2 = m2->m_next) sbfree(&so->so_rcv, m2); /* * Do a few last checks before we let go of the lock. */ SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); SOCKBUF_UNLOCK(&so->so_rcv); if (pr->pr_flags & PR_ADDR) { KASSERT(m->m_type == MT_SONAME, ("m->m_type == %d", m->m_type)); if (psa != NULL) *psa = sodupsockaddr(mtod(m, struct sockaddr *), M_WAITOK); m = m_free(m); } KASSERT(m, ("%s: no data or control after soname", __func__)); /* * Packet to copyout() is now in 'm' and it is disconnected from the * queue. * * Process one or more MT_CONTROL mbufs present before any data mbufs * in the first mbuf chain on the socket buffer. We call into the * protocol to perform externalization (or freeing if controlp == * NULL). In some cases there can be only MT_CONTROL mbufs without * MT_DATA mbufs. */ if (m->m_type == MT_CONTROL) { struct mbuf *cm = NULL, *cmn; struct mbuf **cme = &cm; do { m2 = m->m_next; m->m_next = NULL; *cme = m; cme = &(*cme)->m_next; m = m2; } while (m != NULL && m->m_type == MT_CONTROL); while (cm != NULL) { cmn = cm->m_next; cm->m_next = NULL; if (pr->pr_domain->dom_externalize != NULL) { error = (*pr->pr_domain->dom_externalize) (cm, controlp, flags); } else if (controlp != NULL) *controlp = cm; else m_freem(cm); if (controlp != NULL) { while (*controlp != NULL) controlp = &(*controlp)->m_next; } cm = cmn; } } KASSERT(m == NULL || m->m_type == MT_DATA, ("soreceive_dgram: !data")); while (m != NULL && uio->uio_resid > 0) { len = uio->uio_resid; if (len > m->m_len) len = m->m_len; error = uiomove(mtod(m, char *), (int)len, uio); if (error) { m_freem(m); return (error); } if (len == m->m_len) m = m_free(m); else { m->m_data += len; m->m_len -= len; } } if (m != NULL) { flags |= MSG_TRUNC; m_freem(m); } if (flagsp != NULL) *flagsp |= flags; return (0); } int soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp) { int error; CURVNET_SET(so->so_vnet); error = so->so_proto->pr_soreceive(so, psa, uio, mp0, controlp, flagsp); CURVNET_RESTORE(); return (error); } int soshutdown(struct socket *so, enum shutdown_how how) { int error; CURVNET_SET(so->so_vnet); error = so->so_proto->pr_shutdown(so, how); CURVNET_RESTORE(); return (error); } /* * Used by several pr_shutdown implementations that use generic socket buffers. */ void sorflush(struct socket *so) { int error; VNET_SO_ASSERT(so); /* * Dislodge threads currently blocked in receive and wait to acquire * a lock against other simultaneous readers before clearing the * socket buffer. Don't let our acquire be interrupted by a signal * despite any existing socket disposition on interruptable waiting. * * The SOCK_IO_RECV_LOCK() is important here as there some pr_soreceive * methods that read the top of the socket buffer without acquisition * of the socket buffer mutex, assuming that top of the buffer * exclusively belongs to the read(2) syscall. This is handy when * performing MSG_PEEK. */ socantrcvmore(so); error = SOCK_IO_RECV_LOCK(so, SBL_WAIT | SBL_NOINTR); if (error != 0) { KASSERT(SOLISTENING(so), ("%s: soiolock(%p) failed", __func__, so)); return; } sbrelease(so, SO_RCV); SOCK_IO_RECV_UNLOCK(so); } #ifdef SOCKET_HHOOK /* * Wrapper for Socket established helper hook. * Parameters: socket, context of the hook point, hook id. */ static inline int hhook_run_socket(struct socket *so, void *hctx, int32_t h_id) { struct socket_hhook_data hhook_data = { .so = so, .hctx = hctx, .m = NULL, .status = 0 }; CURVNET_SET(so->so_vnet); HHOOKS_RUN_IF(V_socket_hhh[h_id], &hhook_data, &so->osd); CURVNET_RESTORE(); /* Ugly but needed, since hhooks return void for now */ return (hhook_data.status); } #endif /* * Perhaps this routine, and sooptcopyout(), below, ought to come in an * additional variant to handle the case where the option value needs to be * some kind of integer, but not a specific size. In addition to their use * here, these functions are also called by the protocol-level pr_ctloutput() * routines. */ int sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen) { size_t valsize; /* * If the user gives us more than we wanted, we ignore it, but if we * don't get the minimum length the caller wants, we return EINVAL. * On success, sopt->sopt_valsize is set to however much we actually * retrieved. */ if ((valsize = sopt->sopt_valsize) < minlen) return EINVAL; if (valsize > len) sopt->sopt_valsize = valsize = len; if (sopt->sopt_td != NULL) return (copyin(sopt->sopt_val, buf, valsize)); bcopy(sopt->sopt_val, buf, valsize); return (0); } /* * Kernel version of setsockopt(2). * * XXX: optlen is size_t, not socklen_t */ int so_setsockopt(struct socket *so, int level, int optname, void *optval, size_t optlen) { struct sockopt sopt; sopt.sopt_level = level; sopt.sopt_name = optname; sopt.sopt_dir = SOPT_SET; sopt.sopt_val = optval; sopt.sopt_valsize = optlen; sopt.sopt_td = NULL; return (sosetopt(so, &sopt)); } int sosetopt(struct socket *so, struct sockopt *sopt) { int error, optval; struct linger l; struct timeval tv; sbintime_t val, *valp; uint32_t val32; #ifdef MAC struct mac extmac; #endif CURVNET_SET(so->so_vnet); error = 0; if (sopt->sopt_level != SOL_SOCKET) { if (so->so_proto->pr_ctloutput != NULL) error = (*so->so_proto->pr_ctloutput)(so, sopt); else error = ENOPROTOOPT; } else { switch (sopt->sopt_name) { case SO_ACCEPTFILTER: error = accept_filt_setopt(so, sopt); if (error) goto bad; break; case SO_LINGER: error = sooptcopyin(sopt, &l, sizeof l, sizeof l); if (error) goto bad; if (l.l_linger < 0 || l.l_linger > USHRT_MAX || l.l_linger > (INT_MAX / hz)) { error = EDOM; goto bad; } SOCK_LOCK(so); so->so_linger = l.l_linger; if (l.l_onoff) so->so_options |= SO_LINGER; else so->so_options &= ~SO_LINGER; SOCK_UNLOCK(so); break; case SO_DEBUG: case SO_KEEPALIVE: case SO_DONTROUTE: case SO_USELOOPBACK: case SO_BROADCAST: case SO_REUSEADDR: case SO_REUSEPORT: case SO_REUSEPORT_LB: case SO_OOBINLINE: case SO_TIMESTAMP: case SO_BINTIME: case SO_NOSIGPIPE: case SO_NO_DDP: case SO_NO_OFFLOAD: case SO_RERROR: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) goto bad; SOCK_LOCK(so); if (optval) so->so_options |= sopt->sopt_name; else so->so_options &= ~sopt->sopt_name; SOCK_UNLOCK(so); break; case SO_SETFIB: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) goto bad; if (optval < 0 || optval >= rt_numfibs) { error = EINVAL; goto bad; } if (((so->so_proto->pr_domain->dom_family == PF_INET) || (so->so_proto->pr_domain->dom_family == PF_INET6) || (so->so_proto->pr_domain->dom_family == PF_ROUTE))) so->so_fibnum = optval; else so->so_fibnum = 0; break; case SO_USER_COOKIE: error = sooptcopyin(sopt, &val32, sizeof val32, sizeof val32); if (error) goto bad; so->so_user_cookie = val32; break; case SO_SNDBUF: case SO_RCVBUF: case SO_SNDLOWAT: case SO_RCVLOWAT: error = so->so_proto->pr_setsbopt(so, sopt); if (error) goto bad; break; case SO_SNDTIMEO: case SO_RCVTIMEO: #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) { struct timeval32 tv32; error = sooptcopyin(sopt, &tv32, sizeof tv32, sizeof tv32); CP(tv32, tv, tv_sec); CP(tv32, tv, tv_usec); } else #endif error = sooptcopyin(sopt, &tv, sizeof tv, sizeof tv); if (error) goto bad; if (tv.tv_sec < 0 || tv.tv_usec < 0 || tv.tv_usec >= 1000000) { error = EDOM; goto bad; } if (tv.tv_sec > INT32_MAX) val = SBT_MAX; else val = tvtosbt(tv); SOCK_LOCK(so); valp = sopt->sopt_name == SO_SNDTIMEO ? (SOLISTENING(so) ? &so->sol_sbsnd_timeo : &so->so_snd.sb_timeo) : (SOLISTENING(so) ? &so->sol_sbrcv_timeo : &so->so_rcv.sb_timeo); *valp = val; SOCK_UNLOCK(so); break; case SO_LABEL: #ifdef MAC error = sooptcopyin(sopt, &extmac, sizeof extmac, sizeof extmac); if (error) goto bad; error = mac_setsockopt_label(sopt->sopt_td->td_ucred, so, &extmac); #else error = EOPNOTSUPP; #endif break; case SO_TS_CLOCK: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) goto bad; if (optval < 0 || optval > SO_TS_CLOCK_MAX) { error = EINVAL; goto bad; } so->so_ts_clock = optval; break; case SO_MAX_PACING_RATE: error = sooptcopyin(sopt, &val32, sizeof(val32), sizeof(val32)); if (error) goto bad; so->so_max_pacing_rate = val32; break; + case SO_SPLICE: { + struct splice splice; + +#ifdef COMPAT_FREEBSD32 + if (SV_CURPROC_FLAG(SV_ILP32)) { + struct splice32 splice32; + + error = sooptcopyin(sopt, &splice32, + sizeof(splice32), sizeof(splice32)); + if (error == 0) { + splice.sp_fd = splice32.sp_fd; + splice.sp_max = splice32.sp_max; + CP(splice32.sp_idle, splice.sp_idle, + tv_sec); + CP(splice32.sp_idle, splice.sp_idle, + tv_usec); + } + } else +#endif + { + error = sooptcopyin(sopt, &splice, + sizeof(splice), sizeof(splice)); + } + if (error) + goto bad; + ktrsplice(&splice); + + error = splice_init(); + if (error != 0) + goto bad; + + if (splice.sp_fd >= 0) { + struct file *fp; + struct socket *so2; + + if (!cap_rights_contains(sopt->sopt_rights, + &cap_recv_rights)) { + error = ENOTCAPABLE; + goto bad; + } + error = getsock(sopt->sopt_td, splice.sp_fd, + &cap_send_rights, &fp); + if (error != 0) + goto bad; + so2 = fp->f_data; + + error = so_splice(so, so2, &splice); + fdrop(fp, sopt->sopt_td); + } else { + error = so_unsplice(so, false); + } + break; + } default: #ifdef SOCKET_HHOOK if (V_socket_hhh[HHOOK_SOCKET_OPT]->hhh_nhooks > 0) error = hhook_run_socket(so, sopt, HHOOK_SOCKET_OPT); else #endif error = ENOPROTOOPT; break; } if (error == 0 && so->so_proto->pr_ctloutput != NULL) (void)(*so->so_proto->pr_ctloutput)(so, sopt); } bad: CURVNET_RESTORE(); return (error); } /* * Helper routine for getsockopt. */ int sooptcopyout(struct sockopt *sopt, const void *buf, size_t len) { int error; size_t valsize; error = 0; /* * Documented get behavior is that we always return a value, possibly * truncated to fit in the user's buffer. Traditional behavior is * that we always tell the user precisely how much we copied, rather * than something useful like the total amount we had available for * her. Note that this interface is not idempotent; the entire * answer must be generated ahead of time. */ valsize = min(len, sopt->sopt_valsize); sopt->sopt_valsize = valsize; if (sopt->sopt_val != NULL) { if (sopt->sopt_td != NULL) error = copyout(buf, sopt->sopt_val, valsize); else bcopy(buf, sopt->sopt_val, valsize); } return (error); } int sogetopt(struct socket *so, struct sockopt *sopt) { int error, optval; struct linger l; struct timeval tv; #ifdef MAC struct mac extmac; #endif CURVNET_SET(so->so_vnet); error = 0; if (sopt->sopt_level != SOL_SOCKET) { if (so->so_proto->pr_ctloutput != NULL) error = (*so->so_proto->pr_ctloutput)(so, sopt); else error = ENOPROTOOPT; CURVNET_RESTORE(); return (error); } else { switch (sopt->sopt_name) { case SO_ACCEPTFILTER: error = accept_filt_getopt(so, sopt); break; case SO_LINGER: SOCK_LOCK(so); l.l_onoff = so->so_options & SO_LINGER; l.l_linger = so->so_linger; SOCK_UNLOCK(so); error = sooptcopyout(sopt, &l, sizeof l); break; case SO_USELOOPBACK: case SO_DONTROUTE: case SO_DEBUG: case SO_KEEPALIVE: case SO_REUSEADDR: case SO_REUSEPORT: case SO_REUSEPORT_LB: case SO_BROADCAST: case SO_OOBINLINE: case SO_ACCEPTCONN: case SO_TIMESTAMP: case SO_BINTIME: case SO_NOSIGPIPE: case SO_NO_DDP: case SO_NO_OFFLOAD: case SO_RERROR: optval = so->so_options & sopt->sopt_name; integer: error = sooptcopyout(sopt, &optval, sizeof optval); break; case SO_DOMAIN: optval = so->so_proto->pr_domain->dom_family; goto integer; case SO_TYPE: optval = so->so_type; goto integer; case SO_PROTOCOL: optval = so->so_proto->pr_protocol; goto integer; case SO_ERROR: SOCK_LOCK(so); if (so->so_error) { optval = so->so_error; so->so_error = 0; } else { optval = so->so_rerror; so->so_rerror = 0; } SOCK_UNLOCK(so); goto integer; case SO_SNDBUF: optval = SOLISTENING(so) ? so->sol_sbsnd_hiwat : so->so_snd.sb_hiwat; goto integer; case SO_RCVBUF: optval = SOLISTENING(so) ? so->sol_sbrcv_hiwat : so->so_rcv.sb_hiwat; goto integer; case SO_SNDLOWAT: optval = SOLISTENING(so) ? so->sol_sbsnd_lowat : so->so_snd.sb_lowat; goto integer; case SO_RCVLOWAT: optval = SOLISTENING(so) ? so->sol_sbrcv_lowat : so->so_rcv.sb_lowat; goto integer; case SO_SNDTIMEO: case SO_RCVTIMEO: SOCK_LOCK(so); tv = sbttotv(sopt->sopt_name == SO_SNDTIMEO ? (SOLISTENING(so) ? so->sol_sbsnd_timeo : so->so_snd.sb_timeo) : (SOLISTENING(so) ? so->sol_sbrcv_timeo : so->so_rcv.sb_timeo)); SOCK_UNLOCK(so); #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) { struct timeval32 tv32; CP(tv, tv32, tv_sec); CP(tv, tv32, tv_usec); error = sooptcopyout(sopt, &tv32, sizeof tv32); } else #endif error = sooptcopyout(sopt, &tv, sizeof tv); break; case SO_LABEL: #ifdef MAC error = sooptcopyin(sopt, &extmac, sizeof(extmac), sizeof(extmac)); if (error) goto bad; error = mac_getsockopt_label(sopt->sopt_td->td_ucred, so, &extmac); if (error) goto bad; /* Don't copy out extmac, it is unchanged. */ #else error = EOPNOTSUPP; #endif break; case SO_PEERLABEL: #ifdef MAC error = sooptcopyin(sopt, &extmac, sizeof(extmac), sizeof(extmac)); if (error) goto bad; error = mac_getsockopt_peerlabel( sopt->sopt_td->td_ucred, so, &extmac); if (error) goto bad; /* Don't copy out extmac, it is unchanged. */ #else error = EOPNOTSUPP; #endif break; case SO_LISTENQLIMIT: optval = SOLISTENING(so) ? so->sol_qlimit : 0; goto integer; case SO_LISTENQLEN: optval = SOLISTENING(so) ? so->sol_qlen : 0; goto integer; case SO_LISTENINCQLEN: optval = SOLISTENING(so) ? so->sol_incqlen : 0; goto integer; case SO_TS_CLOCK: optval = so->so_ts_clock; goto integer; case SO_MAX_PACING_RATE: optval = so->so_max_pacing_rate; goto integer; + case SO_SPLICE: { + off_t n; + + /* + * Acquire the I/O lock to serialize with + * so_splice_xfer(). This is not required for + * correctness, but makes testing simpler: once a byte + * has been transmitted to the sink and observed (e.g., + * by reading from the socket to which the sink is + * connected), a subsequent getsockopt(SO_SPLICE) will + * return an up-to-date value. + */ + error = SOCK_IO_RECV_LOCK(so, SBL_WAIT); + if (error != 0) + goto bad; + SOCK_LOCK(so); + if (SOLISTENING(so)) { + n = 0; + } else { + n = so->so_splice_sent; + } + SOCK_UNLOCK(so); + SOCK_IO_RECV_UNLOCK(so); + error = sooptcopyout(sopt, &n, sizeof(n)); + break; + } + default: #ifdef SOCKET_HHOOK if (V_socket_hhh[HHOOK_SOCKET_OPT]->hhh_nhooks > 0) error = hhook_run_socket(so, sopt, HHOOK_SOCKET_OPT); else #endif error = ENOPROTOOPT; break; } } -#ifdef MAC bad: -#endif CURVNET_RESTORE(); return (error); } int soopt_getm(struct sockopt *sopt, struct mbuf **mp) { struct mbuf *m, *m_prev; int sopt_size = sopt->sopt_valsize; MGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT, MT_DATA); if (m == NULL) return ENOBUFS; if (sopt_size > MLEN) { MCLGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT); if ((m->m_flags & M_EXT) == 0) { m_free(m); return ENOBUFS; } m->m_len = min(MCLBYTES, sopt_size); } else { m->m_len = min(MLEN, sopt_size); } sopt_size -= m->m_len; *mp = m; m_prev = m; while (sopt_size) { MGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT, MT_DATA); if (m == NULL) { m_freem(*mp); return ENOBUFS; } if (sopt_size > MLEN) { MCLGET(m, sopt->sopt_td != NULL ? M_WAITOK : M_NOWAIT); if ((m->m_flags & M_EXT) == 0) { m_freem(m); m_freem(*mp); return ENOBUFS; } m->m_len = min(MCLBYTES, sopt_size); } else { m->m_len = min(MLEN, sopt_size); } sopt_size -= m->m_len; m_prev->m_next = m; m_prev = m; } return (0); } int soopt_mcopyin(struct sockopt *sopt, struct mbuf *m) { struct mbuf *m0 = m; if (sopt->sopt_val == NULL) return (0); while (m != NULL && sopt->sopt_valsize >= m->m_len) { if (sopt->sopt_td != NULL) { int error; error = copyin(sopt->sopt_val, mtod(m, char *), m->m_len); if (error != 0) { m_freem(m0); return(error); } } else bcopy(sopt->sopt_val, mtod(m, char *), m->m_len); sopt->sopt_valsize -= m->m_len; sopt->sopt_val = (char *)sopt->sopt_val + m->m_len; m = m->m_next; } if (m != NULL) /* should be allocated enoughly at ip6_sooptmcopyin() */ panic("ip6_sooptmcopyin"); return (0); } int soopt_mcopyout(struct sockopt *sopt, struct mbuf *m) { struct mbuf *m0 = m; size_t valsize = 0; if (sopt->sopt_val == NULL) return (0); while (m != NULL && sopt->sopt_valsize >= m->m_len) { if (sopt->sopt_td != NULL) { int error; error = copyout(mtod(m, char *), sopt->sopt_val, m->m_len); if (error != 0) { m_freem(m0); return(error); } } else bcopy(mtod(m, char *), sopt->sopt_val, m->m_len); sopt->sopt_valsize -= m->m_len; sopt->sopt_val = (char *)sopt->sopt_val + m->m_len; valsize += m->m_len; m = m->m_next; } if (m != NULL) { /* enough soopt buffer should be given from user-land */ m_freem(m0); return(EINVAL); } sopt->sopt_valsize = valsize; return (0); } /* * sohasoutofband(): protocol notifies socket layer of the arrival of new * out-of-band data, which will then notify socket consumers. */ void sohasoutofband(struct socket *so) { if (so->so_sigio != NULL) pgsigio(&so->so_sigio, SIGURG, 0); selwakeuppri(&so->so_rdsel, PSOCK); } int sopoll(struct socket *so, int events, struct ucred *active_cred, struct thread *td) { /* * We do not need to set or assert curvnet as long as everyone uses * sopoll_generic(). */ return (so->so_proto->pr_sopoll(so, events, active_cred, td)); } int sopoll_generic(struct socket *so, int events, struct ucred *active_cred, struct thread *td) { int revents; SOCK_LOCK(so); if (SOLISTENING(so)) { if (!(events & (POLLIN | POLLRDNORM))) revents = 0; else if (!TAILQ_EMPTY(&so->sol_comp)) revents = events & (POLLIN | POLLRDNORM); else if ((events & POLLINIGNEOF) == 0 && so->so_error) revents = (events & (POLLIN | POLLRDNORM)) | POLLHUP; else { selrecord(td, &so->so_rdsel); revents = 0; } } else { revents = 0; SOCK_SENDBUF_LOCK(so); SOCK_RECVBUF_LOCK(so); if (events & (POLLIN | POLLRDNORM)) - if (soreadabledata(so)) + if (soreadabledata(so) && !isspliced(so)) revents |= events & (POLLIN | POLLRDNORM); if (events & (POLLOUT | POLLWRNORM)) - if (sowriteable(so)) + if (sowriteable(so) && !issplicedback(so)) revents |= events & (POLLOUT | POLLWRNORM); if (events & (POLLPRI | POLLRDBAND)) if (so->so_oobmark || (so->so_rcv.sb_state & SBS_RCVATMARK)) revents |= events & (POLLPRI | POLLRDBAND); if ((events & POLLINIGNEOF) == 0) { if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { revents |= events & (POLLIN | POLLRDNORM); if (so->so_snd.sb_state & SBS_CANTSENDMORE) revents |= POLLHUP; } } if (so->so_rcv.sb_state & SBS_CANTRCVMORE) revents |= events & POLLRDHUP; if (revents == 0) { if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND | POLLRDHUP)) { selrecord(td, &so->so_rdsel); so->so_rcv.sb_flags |= SB_SEL; } if (events & (POLLOUT | POLLWRNORM)) { selrecord(td, &so->so_wrsel); so->so_snd.sb_flags |= SB_SEL; } } SOCK_RECVBUF_UNLOCK(so); SOCK_SENDBUF_UNLOCK(so); } SOCK_UNLOCK(so); return (revents); } int soo_kqfilter(struct file *fp, struct knote *kn) { struct socket *so = kn->kn_fp->f_data; struct sockbuf *sb; sb_which which; struct knlist *knl; switch (kn->kn_filter) { case EVFILT_READ: kn->kn_fop = &soread_filtops; knl = &so->so_rdsel.si_note; sb = &so->so_rcv; which = SO_RCV; break; case EVFILT_WRITE: kn->kn_fop = &sowrite_filtops; knl = &so->so_wrsel.si_note; sb = &so->so_snd; which = SO_SND; break; case EVFILT_EMPTY: kn->kn_fop = &soempty_filtops; knl = &so->so_wrsel.si_note; sb = &so->so_snd; which = SO_SND; break; default: return (EINVAL); } SOCK_LOCK(so); if (SOLISTENING(so)) { knlist_add(knl, kn, 1); } else { SOCK_BUF_LOCK(so, which); knlist_add(knl, kn, 1); sb->sb_flags |= SB_KNOTE; SOCK_BUF_UNLOCK(so, which); } SOCK_UNLOCK(so); return (0); } static void filt_sordetach(struct knote *kn) { struct socket *so = kn->kn_fp->f_data; so_rdknl_lock(so); knlist_remove(&so->so_rdsel.si_note, kn, 1); if (!SOLISTENING(so) && knlist_empty(&so->so_rdsel.si_note)) so->so_rcv.sb_flags &= ~SB_KNOTE; so_rdknl_unlock(so); } /*ARGSUSED*/ static int filt_soread(struct knote *kn, long hint) { struct socket *so; so = kn->kn_fp->f_data; if (SOLISTENING(so)) { SOCK_LOCK_ASSERT(so); kn->kn_data = so->sol_qlen; if (so->so_error) { kn->kn_flags |= EV_EOF; kn->kn_fflags = so->so_error; return (1); } return (!TAILQ_EMPTY(&so->sol_comp)); } + if ((so->so_rcv.sb_flags & SB_SPLICED) != 0) + return (0); + SOCK_RECVBUF_LOCK_ASSERT(so); kn->kn_data = sbavail(&so->so_rcv) - so->so_rcv.sb_ctl; if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { kn->kn_flags |= EV_EOF; kn->kn_fflags = so->so_error; return (1); } else if (so->so_error || so->so_rerror) return (1); if (kn->kn_sfflags & NOTE_LOWAT) { if (kn->kn_data >= kn->kn_sdata) return (1); } else if (sbavail(&so->so_rcv) >= so->so_rcv.sb_lowat) return (1); #ifdef SOCKET_HHOOK /* This hook returning non-zero indicates an event, not error */ return (hhook_run_socket(so, NULL, HHOOK_FILT_SOREAD)); #else return (0); #endif } static void filt_sowdetach(struct knote *kn) { struct socket *so = kn->kn_fp->f_data; so_wrknl_lock(so); knlist_remove(&so->so_wrsel.si_note, kn, 1); if (!SOLISTENING(so) && knlist_empty(&so->so_wrsel.si_note)) so->so_snd.sb_flags &= ~SB_KNOTE; so_wrknl_unlock(so); } /*ARGSUSED*/ static int filt_sowrite(struct knote *kn, long hint) { struct socket *so; so = kn->kn_fp->f_data; if (SOLISTENING(so)) return (0); SOCK_SENDBUF_LOCK_ASSERT(so); kn->kn_data = sbspace(&so->so_snd); #ifdef SOCKET_HHOOK hhook_run_socket(so, kn, HHOOK_FILT_SOWRITE); #endif if (so->so_snd.sb_state & SBS_CANTSENDMORE) { kn->kn_flags |= EV_EOF; kn->kn_fflags = so->so_error; return (1); } else if (so->so_error) /* temporary udp error */ return (1); else if (((so->so_state & SS_ISCONNECTED) == 0) && (so->so_proto->pr_flags & PR_CONNREQUIRED)) return (0); else if (kn->kn_sfflags & NOTE_LOWAT) return (kn->kn_data >= kn->kn_sdata); else return (kn->kn_data >= so->so_snd.sb_lowat); } static int filt_soempty(struct knote *kn, long hint) { struct socket *so; so = kn->kn_fp->f_data; if (SOLISTENING(so)) return (1); SOCK_SENDBUF_LOCK_ASSERT(so); kn->kn_data = sbused(&so->so_snd); if (kn->kn_data == 0) return (1); else return (0); } int socheckuid(struct socket *so, uid_t uid) { if (so == NULL) return (EPERM); if (so->so_cred->cr_uid != uid) return (EPERM); return (0); } /* * These functions are used by protocols to notify the socket layer (and its * consumers) of state changes in the sockets driven by protocol-side events. */ /* * Procedures to manipulate state flags of socket and do appropriate wakeups. * * Normal sequence from the active (originating) side is that * soisconnecting() is called during processing of connect() call, resulting * in an eventual call to soisconnected() if/when the connection is * established. When the connection is torn down soisdisconnecting() is * called during processing of disconnect() call, and soisdisconnected() is * called when the connection to the peer is totally severed. The semantics * of these routines are such that connectionless protocols can call * soisconnected() and soisdisconnected() only, bypassing the in-progress * calls when setting up a ``connection'' takes no time. * * From the passive side, a socket is created with two queues of sockets: * so_incomp for connections in progress and so_comp for connections already * made and awaiting user acceptance. As a protocol is preparing incoming * connections, it creates a socket structure queued on so_incomp by calling * sonewconn(). When the connection is established, soisconnected() is * called, and transfers the socket structure to so_comp, making it available * to accept(). * * If a socket is closed with sockets on either so_incomp or so_comp, these * sockets are dropped. * * If higher-level protocols are implemented in the kernel, the wakeups done * here will sometimes cause software-interrupt process scheduling. */ void soisconnecting(struct socket *so) { SOCK_LOCK(so); so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING); so->so_state |= SS_ISCONNECTING; SOCK_UNLOCK(so); } void soisconnected(struct socket *so) { bool last __diagused; SOCK_LOCK(so); so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING); so->so_state |= SS_ISCONNECTED; if (so->so_qstate == SQ_INCOMP) { struct socket *head = so->so_listen; int ret; KASSERT(head, ("%s: so %p on incomp of NULL", __func__, so)); /* * Promoting a socket from incomplete queue to complete, we * need to go through reverse order of locking. We first do * trylock, and if that doesn't succeed, we go the hard way * leaving a reference and rechecking consistency after proper * locking. */ if (__predict_false(SOLISTEN_TRYLOCK(head) == 0)) { soref(head); SOCK_UNLOCK(so); SOLISTEN_LOCK(head); SOCK_LOCK(so); if (__predict_false(head != so->so_listen)) { /* * The socket went off the listen queue, * should be lost race to close(2) of sol. * The socket is about to soabort(). */ SOCK_UNLOCK(so); sorele_locked(head); return; } last = refcount_release(&head->so_count); KASSERT(!last, ("%s: released last reference for %p", __func__, head)); } again: if ((so->so_options & SO_ACCEPTFILTER) == 0) { TAILQ_REMOVE(&head->sol_incomp, so, so_list); head->sol_incqlen--; TAILQ_INSERT_TAIL(&head->sol_comp, so, so_list); head->sol_qlen++; so->so_qstate = SQ_COMP; SOCK_UNLOCK(so); solisten_wakeup(head); /* unlocks */ } else { SOCK_RECVBUF_LOCK(so); soupcall_set(so, SO_RCV, head->sol_accept_filter->accf_callback, head->sol_accept_filter_arg); so->so_options &= ~SO_ACCEPTFILTER; ret = head->sol_accept_filter->accf_callback(so, head->sol_accept_filter_arg, M_NOWAIT); if (ret == SU_ISCONNECTED) { soupcall_clear(so, SO_RCV); SOCK_RECVBUF_UNLOCK(so); goto again; } SOCK_RECVBUF_UNLOCK(so); SOCK_UNLOCK(so); SOLISTEN_UNLOCK(head); } return; } SOCK_UNLOCK(so); wakeup(&so->so_timeo); sorwakeup(so); sowwakeup(so); } void soisdisconnecting(struct socket *so) { SOCK_LOCK(so); so->so_state &= ~SS_ISCONNECTING; so->so_state |= SS_ISDISCONNECTING; if (!SOLISTENING(so)) { SOCK_RECVBUF_LOCK(so); socantrcvmore_locked(so); SOCK_SENDBUF_LOCK(so); socantsendmore_locked(so); } SOCK_UNLOCK(so); wakeup(&so->so_timeo); } void soisdisconnected(struct socket *so) { SOCK_LOCK(so); /* * There is at least one reader of so_state that does not * acquire socket lock, namely soreceive_generic(). Ensure * that it never sees all flags that track connection status * cleared, by ordering the update with a barrier semantic of * our release thread fence. */ so->so_state |= SS_ISDISCONNECTED; atomic_thread_fence_rel(); so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING); if (!SOLISTENING(so)) { SOCK_UNLOCK(so); SOCK_RECVBUF_LOCK(so); socantrcvmore_locked(so); SOCK_SENDBUF_LOCK(so); sbdrop_locked(&so->so_snd, sbused(&so->so_snd)); socantsendmore_locked(so); } else SOCK_UNLOCK(so); wakeup(&so->so_timeo); } int soiolock(struct socket *so, struct sx *sx, int flags) { int error; KASSERT((flags & SBL_VALID) == flags, ("soiolock: invalid flags %#x", flags)); if ((flags & SBL_WAIT) != 0) { if ((flags & SBL_NOINTR) != 0) { sx_xlock(sx); } else { error = sx_xlock_sig(sx); if (error != 0) return (error); } } else if (!sx_try_xlock(sx)) { return (EWOULDBLOCK); } if (__predict_false(SOLISTENING(so))) { sx_xunlock(sx); return (ENOTCONN); } return (0); } void soiounlock(struct sx *sx) { sx_xunlock(sx); } /* * Make a copy of a sockaddr in a malloced buffer of type M_SONAME. */ struct sockaddr * sodupsockaddr(const struct sockaddr *sa, int mflags) { struct sockaddr *sa2; sa2 = malloc(sa->sa_len, M_SONAME, mflags); if (sa2) bcopy(sa, sa2, sa->sa_len); return sa2; } /* * Register per-socket destructor. */ void sodtor_set(struct socket *so, so_dtor_t *func) { SOCK_LOCK_ASSERT(so); so->so_dtor = func; } /* * Register per-socket buffer upcalls. */ void soupcall_set(struct socket *so, sb_which which, so_upcall_t func, void *arg) { struct sockbuf *sb; KASSERT(!SOLISTENING(so), ("%s: so %p listening", __func__, so)); switch (which) { case SO_RCV: sb = &so->so_rcv; break; case SO_SND: sb = &so->so_snd; break; } SOCK_BUF_LOCK_ASSERT(so, which); sb->sb_upcall = func; sb->sb_upcallarg = arg; sb->sb_flags |= SB_UPCALL; } void soupcall_clear(struct socket *so, sb_which which) { struct sockbuf *sb; KASSERT(!SOLISTENING(so), ("%s: so %p listening", __func__, so)); switch (which) { case SO_RCV: sb = &so->so_rcv; break; case SO_SND: sb = &so->so_snd; break; } SOCK_BUF_LOCK_ASSERT(so, which); KASSERT(sb->sb_upcall != NULL, ("%s: so %p no upcall to clear", __func__, so)); sb->sb_upcall = NULL; sb->sb_upcallarg = NULL; sb->sb_flags &= ~SB_UPCALL; } void solisten_upcall_set(struct socket *so, so_upcall_t func, void *arg) { SOLISTEN_LOCK_ASSERT(so); so->sol_upcall = func; so->sol_upcallarg = arg; } static void so_rdknl_lock(void *arg) { struct socket *so = arg; retry: if (SOLISTENING(so)) { SOLISTEN_LOCK(so); } else { SOCK_RECVBUF_LOCK(so); if (__predict_false(SOLISTENING(so))) { SOCK_RECVBUF_UNLOCK(so); goto retry; } } } static void so_rdknl_unlock(void *arg) { struct socket *so = arg; if (SOLISTENING(so)) SOLISTEN_UNLOCK(so); else SOCK_RECVBUF_UNLOCK(so); } static void so_rdknl_assert_lock(void *arg, int what) { struct socket *so = arg; if (what == LA_LOCKED) { if (SOLISTENING(so)) SOLISTEN_LOCK_ASSERT(so); else SOCK_RECVBUF_LOCK_ASSERT(so); } else { if (SOLISTENING(so)) SOLISTEN_UNLOCK_ASSERT(so); else SOCK_RECVBUF_UNLOCK_ASSERT(so); } } static void so_wrknl_lock(void *arg) { struct socket *so = arg; retry: if (SOLISTENING(so)) { SOLISTEN_LOCK(so); } else { SOCK_SENDBUF_LOCK(so); if (__predict_false(SOLISTENING(so))) { SOCK_SENDBUF_UNLOCK(so); goto retry; } } } static void so_wrknl_unlock(void *arg) { struct socket *so = arg; if (SOLISTENING(so)) SOLISTEN_UNLOCK(so); else SOCK_SENDBUF_UNLOCK(so); } static void so_wrknl_assert_lock(void *arg, int what) { struct socket *so = arg; if (what == LA_LOCKED) { if (SOLISTENING(so)) SOLISTEN_LOCK_ASSERT(so); else SOCK_SENDBUF_LOCK_ASSERT(so); } else { if (SOLISTENING(so)) SOLISTEN_UNLOCK_ASSERT(so); else SOCK_SENDBUF_UNLOCK_ASSERT(so); } } /* * Create an external-format (``xsocket'') structure using the information in * the kernel-format socket structure pointed to by so. This is done to * reduce the spew of irrelevant information over this interface, to isolate * user code from changes in the kernel structure, and potentially to provide * information-hiding if we decide that some of this information should be * hidden from users. */ void sotoxsocket(struct socket *so, struct xsocket *xso) { bzero(xso, sizeof(*xso)); xso->xso_len = sizeof *xso; xso->xso_so = (uintptr_t)so; xso->so_type = so->so_type; xso->so_options = so->so_options; xso->so_linger = so->so_linger; xso->so_state = so->so_state; xso->so_pcb = (uintptr_t)so->so_pcb; xso->xso_protocol = so->so_proto->pr_protocol; xso->xso_family = so->so_proto->pr_domain->dom_family; xso->so_timeo = so->so_timeo; xso->so_error = so->so_error; xso->so_uid = so->so_cred->cr_uid; xso->so_pgid = so->so_sigio ? so->so_sigio->sio_pgid : 0; SOCK_LOCK(so); if (SOLISTENING(so)) { xso->so_qlen = so->sol_qlen; xso->so_incqlen = so->sol_incqlen; xso->so_qlimit = so->sol_qlimit; xso->so_oobmark = 0; } else { xso->so_state |= so->so_qstate; xso->so_qlen = xso->so_incqlen = xso->so_qlimit = 0; xso->so_oobmark = so->so_oobmark; sbtoxsockbuf(&so->so_snd, &xso->so_snd); sbtoxsockbuf(&so->so_rcv, &xso->so_rcv); + if ((so->so_rcv.sb_flags & SB_SPLICED) != 0) + xso->so_splice_so = (uintptr_t)so->so_splice->dst; } SOCK_UNLOCK(so); } int so_options_get(const struct socket *so) { return (so->so_options); } void so_options_set(struct socket *so, int val) { so->so_options = val; } int so_error_get(const struct socket *so) { return (so->so_error); } void so_error_set(struct socket *so, int val) { so->so_error = val; } diff --git a/sys/sys/ktrace.h b/sys/sys/ktrace.h index 263f25945bff..966af1744058 100644 --- a/sys/sys/ktrace.h +++ b/sys/sys/ktrace.h @@ -1,369 +1,371 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1988, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #ifndef _SYS_KTRACE_H_ #define _SYS_KTRACE_H_ #include #include #include #include #include /* * operations to ktrace system call (KTROP(op)) */ #define KTROP_SET 0 /* set trace points */ #define KTROP_CLEAR 1 /* clear trace points */ #define KTROP_CLEARFILE 2 /* stop all tracing to file */ #define KTROP(o) ((o)&3) /* macro to extract operation */ /* * flags (ORed in with operation) */ #define KTRFLAG_DESCEND 4 /* perform op on all children too */ /* * ktrace record header */ struct ktr_header_v0 { int ktr_len; /* length of buf */ short ktr_type; /* trace record type */ pid_t ktr_pid; /* process id */ char ktr_comm[MAXCOMLEN + 1];/* command name */ struct timeval ktr_time; /* timestamp */ long ktr_tid; /* thread id */ }; struct ktr_header { int ktr_len; /* length of buf */ short ktr_type; /* trace record type */ short ktr_version; /* ktr_header version */ pid_t ktr_pid; /* process id */ char ktr_comm[MAXCOMLEN + 1];/* command name */ struct timespec ktr_time; /* timestamp */ /* XXX: make ktr_tid an lwpid_t on next ABI break */ long ktr_tid; /* thread id */ int ktr_cpu; /* cpu id */ }; #define KTR_VERSION0 0 #define KTR_VERSION1 1 #define KTR_OFFSET_V0 sizeof(struct ktr_header_v0) - \ sizeof(struct ktr_header) /* * Test for kernel trace point (MP SAFE). * * KTRCHECK() just checks that the type is enabled and is only for * internal use in the ktrace subsystem. KTRPOINT() checks against * ktrace recursion as well as checking that the type is enabled and * is the public interface. */ #define KTRCHECK(td, type) ((td)->td_proc->p_traceflag & (1 << type)) #define KTRPOINT(td, type) (__predict_false(KTRCHECK((td), (type)))) #define KTRCHECKDRAIN(td) (!(STAILQ_EMPTY(&(td)->td_proc->p_ktr))) #define KTRUSERRET(td) do { \ if (__predict_false(KTRCHECKDRAIN(td))) \ ktruserret(td); \ } while (0) /* * ktrace record types */ /* * KTR_SYSCALL - system call record */ #define KTR_SYSCALL 1 struct ktr_syscall { short ktr_code; /* syscall number */ short ktr_narg; /* number of arguments */ /* * followed by ktr_narg register_t */ register_t ktr_args[1]; }; /* * KTR_SYSRET - return from system call record */ #define KTR_SYSRET 2 struct ktr_sysret { short ktr_code; short ktr_eosys; int ktr_error; register_t ktr_retval; }; /* * KTR_NAMEI - namei record */ #define KTR_NAMEI 3 /* record contains pathname */ /* * KTR_GENIO - trace generic process i/o */ #define KTR_GENIO 4 struct ktr_genio { int ktr_fd; enum uio_rw ktr_rw; /* * followed by data successfully read/written */ }; /* * KTR_PSIG - trace processed signal */ #define KTR_PSIG 5 struct ktr_psig { int signo; sig_t action; int code; sigset_t mask; }; /* * KTR_CSW - trace context switches */ #define KTR_CSW 6 struct ktr_csw_old { int out; /* 1 if switch out, 0 if switch in */ int user; /* 1 if usermode (ivcsw), 0 if kernel (vcsw) */ }; struct ktr_csw { int out; /* 1 if switch out, 0 if switch in */ int user; /* 1 if usermode (ivcsw), 0 if kernel (vcsw) */ char wmesg[8]; }; /* * KTR_USER - data coming from userland */ #define KTR_USER_MAXLEN 2048 /* maximum length of passed data */ #define KTR_USER 7 /* * KTR_STRUCT - misc. structs */ #define KTR_STRUCT 8 /* * record contains null-terminated struct name followed by * struct contents */ struct sockaddr; struct stat; struct sysentvec; /* * KTR_SYSCTL - name of a sysctl MIB */ #define KTR_SYSCTL 9 /* record contains null-terminated MIB name */ /* * KTR_PROCCTOR - trace process creation (multiple ABI support) */ #define KTR_PROCCTOR 10 struct ktr_proc_ctor { u_int sv_flags; /* struct sysentvec sv_flags copy */ }; /* * KTR_PROCDTOR - trace process destruction (multiple ABI support) */ #define KTR_PROCDTOR 11 /* * KTR_CAPFAIL - trace capability check failures */ #define KTR_CAPFAIL 12 enum ktr_cap_violation { CAPFAIL_NOTCAPABLE, /* insufficient capabilities in cap_check() */ CAPFAIL_INCREASE, /* attempt to increase rights on a capability */ CAPFAIL_SYSCALL, /* disallowed system call */ CAPFAIL_SIGNAL, /* sent signal to process other than self */ CAPFAIL_PROTO, /* disallowed protocol */ CAPFAIL_SOCKADDR, /* restricted address lookup */ CAPFAIL_NAMEI, /* restricted namei lookup */ CAPFAIL_CPUSET, /* restricted CPU set modification */ }; union ktr_cap_data { cap_rights_t cap_rights[2]; #define cap_needed cap_rights[0] #define cap_held cap_rights[1] int cap_int; struct sockaddr cap_sockaddr; char cap_path[MAXPATHLEN]; }; struct ktr_cap_fail { enum ktr_cap_violation cap_type; short cap_code; u_int cap_svflags; union ktr_cap_data cap_data; }; /* * KTR_FAULT - page fault record */ #define KTR_FAULT 13 struct ktr_fault { vm_offset_t vaddr; int type; }; /* * KTR_FAULTEND - end of page fault record */ #define KTR_FAULTEND 14 struct ktr_faultend { int result; }; /* * KTR_STRUCT_ARRAY - array of misc. structs */ #define KTR_STRUCT_ARRAY 15 struct ktr_struct_array { size_t struct_size; /* * Followed by null-terminated structure name and then payload * contents. */ }; /* * KTR_DROP - If this bit is set in ktr_type, then at least one event * between the previous record and this record was dropped. */ #define KTR_DROP 0x8000 /* * KTR_VERSIONED - If this bit is set in ktr_type, then the kernel * exposes the new struct ktr_header (versioned), otherwise the old * struct ktr_header_v0 is exposed. */ #define KTR_VERSIONED 0x4000 #define KTR_TYPE (KTR_DROP | KTR_VERSIONED) /* * kernel trace points (in p_traceflag) */ #define KTRFAC_MASK 0x00ffffff #define KTRFAC_SYSCALL (1<sa_len) #define ktrstat(s) \ ktrstruct("stat", (s), sizeof(struct stat)) #define ktrstat_error(s, error) \ ktrstruct_error("stat", (s), sizeof(struct stat), error) #define ktrcpuset(s, l) \ ktrstruct("cpuset_t", (s), l) +#define ktrsplice(s) \ + ktrstruct("splice", (s), sizeof(struct splice)) extern u_int ktr_geniosize; #ifdef KTRACE extern int ktr_filesize_limit_signal; #define __ktrace_used #else #define ktr_filesize_limit_signal 0 #define __ktrace_used __unused #endif #else #include __BEGIN_DECLS int ktrace(const char *, int, int, pid_t); int utrace(const void *, size_t); __END_DECLS #endif #endif diff --git a/sys/sys/sockbuf.h b/sys/sys/sockbuf.h index a2e327bbb5df..fd503d021282 100644 --- a/sys/sys/sockbuf.h +++ b/sys/sys/sockbuf.h @@ -1,324 +1,325 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1990, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #ifndef _SYS_SOCKBUF_H_ #define _SYS_SOCKBUF_H_ /* * Constants for sb_flags field of struct sockbuf/xsockbuf. */ #define SB_TLS_RX 0x01 /* using KTLS on RX */ #define SB_TLS_RX_RUNNING 0x02 /* KTLS RX operation running */ #define SB_WAIT 0x04 /* someone is waiting for data/space */ #define SB_SEL 0x08 /* someone is selecting */ #define SB_ASYNC 0x10 /* ASYNC I/O, need signals */ #define SB_UPCALL 0x20 /* someone wants an upcall */ #define SB_NOINTR 0x40 /* operations not interruptible */ #define SB_AIO 0x80 /* AIO operations queued */ #define SB_KNOTE 0x100 /* kernel note attached */ #define SB_NOCOALESCE 0x200 /* don't coalesce new data into existing mbufs */ #define SB_IN_TOE 0x400 /* socket buffer is in the middle of an operation */ #define SB_AUTOSIZE 0x800 /* automatically size socket buffer */ #define SB_STOP 0x1000 /* backpressure indicator */ #define SB_AIO_RUNNING 0x2000 /* AIO operation running */ -#define SB_UNUSED 0x4000 /* previously used for SB_TLS_IFNET */ +#define SB_SPLICED 0x4000 /* socket buffer is spliced; + previously used for SB_TLS_IFNET */ #define SB_TLS_RX_RESYNC 0x8000 /* KTLS RX lost HW sync */ #define SBS_CANTSENDMORE 0x0010 /* can't send more data to peer */ #define SBS_CANTRCVMORE 0x0020 /* can't receive more data from peer */ #define SBS_RCVATMARK 0x0040 /* at mark on input */ #if defined(_KERNEL) || defined(_WANT_SOCKET) #include #include #include #include #define SB_MAX (2*1024*1024) /* default for max chars in sockbuf */ struct ktls_session; struct mbuf; struct sockaddr; struct socket; struct sockopt; struct thread; struct selinfo; /* * Socket buffer * * A buffer starts with the fields that are accessed by I/O multiplexing * APIs like select(2), kevent(2) or AIO and thus are shared between different * buffer implementations. They are protected by the SOCK_RECVBUF_LOCK() * or SOCK_SENDBUF_LOCK() of the owning socket. * * XXX: sb_acc, sb_ccc and sb_mbcnt shall become implementation specific * methods. * * Protocol specific implementations follow in a union. */ struct sockbuf { struct selinfo *sb_sel; /* process selecting read/write */ short sb_state; /* socket state on sockbuf */ short sb_flags; /* flags, see above */ u_int sb_acc; /* available chars in buffer */ u_int sb_ccc; /* claimed chars in buffer */ u_int sb_mbcnt; /* chars of mbufs used */ u_int sb_ctl; /* non-data chars in buffer */ u_int sb_hiwat; /* max actual char count */ u_int sb_lowat; /* low water mark */ u_int sb_mbmax; /* max chars of mbufs to use */ sbintime_t sb_timeo; /* timeout for read/write */ int (*sb_upcall)(struct socket *, void *, int); void *sb_upcallarg; TAILQ_HEAD(, kaiocb) sb_aiojobq; /* pending AIO ops */ struct task sb_aiotask; /* AIO task */ union { /* * Classic BSD one-size-fits-all socket buffer, capable of * doing streams and datagrams. The stream part is able * to perform special features: * - not ready data (sendfile) * - TLS */ struct { /* compat: sockbuf lock pointer */ struct mtx *sb_mtx; /* first and last mbufs in the chain */ struct mbuf *sb_mb; struct mbuf *sb_mbtail; /* first mbuf of last record in socket buffer */ struct mbuf *sb_lastrecord; /* pointer to data to send next (TCP */ struct mbuf *sb_sndptr; /* pointer to first not ready buffer */ struct mbuf *sb_fnrdy; /* byte offset of ptr into chain, used with sb_sndptr */ u_int sb_sndptroff; /* TLS */ u_int sb_tlscc; /* TLS chain characters */ u_int sb_tlsdcc; /* characters being decrypted */ struct mbuf *sb_mtls; /* TLS mbuf chain */ struct mbuf *sb_mtlstail; /* last mbuf in TLS chain */ uint64_t sb_tls_seqno; /* TLS seqno */ /* TLS state, locked by sockbuf and sock I/O mutexes. */ struct ktls_session *sb_tls_info; }; /* * PF_UNIX/SOCK_DGRAM * * Local protocol, thus we should buffer on the receive side * only. However, in one to many configuration we don't want * a single receive buffer to be shared. So we would link * send buffers onto receive buffer. All the fields are locked * by the receive buffer lock. */ struct { /* * For receive buffer: own queue of this buffer for * unconnected sends. For send buffer: queue lended * to the peer receive buffer, to isolate ourselves * from other senders. */ STAILQ_HEAD(, mbuf) uxdg_mb; /* For receive buffer: datagram seen via MSG_PEEK. */ struct mbuf *uxdg_peeked; /* * For receive buffer: queue of send buffers of * connected peers. For send buffer: linkage on * connected peer receive buffer queue. */ union { TAILQ_HEAD(, sockbuf) uxdg_conns; TAILQ_ENTRY(sockbuf) uxdg_clist; }; /* Counters for this buffer uxdg_mb chain + peeked. */ u_int uxdg_cc; u_int uxdg_ctl; u_int uxdg_mbcnt; }; /* * Netlink socket. */ struct { TAILQ_HEAD(, nl_buf) nl_queue; }; }; }; #endif /* defined(_KERNEL) || defined(_WANT_SOCKET) */ #ifdef _KERNEL /* 'which' values for KPIs that operate on one buffer of a socket. */ typedef enum { SO_RCV, SO_SND } sb_which; /* * Per-socket buffer mutex used to protect most fields in the socket buffer. * These make use of the mutex pointer embedded in struct sockbuf, which * currently just references mutexes in the containing socket. The * SOCK_SENDBUF_LOCK() etc. macros can be used instead of or in combination with * these locking macros. */ #define SOCKBUF_MTX(_sb) ((_sb)->sb_mtx) #define SOCKBUF_LOCK(_sb) mtx_lock(SOCKBUF_MTX(_sb)) #define SOCKBUF_OWNED(_sb) mtx_owned(SOCKBUF_MTX(_sb)) #define SOCKBUF_UNLOCK(_sb) mtx_unlock(SOCKBUF_MTX(_sb)) #define SOCKBUF_LOCK_ASSERT(_sb) mtx_assert(SOCKBUF_MTX(_sb), MA_OWNED) #define SOCKBUF_UNLOCK_ASSERT(_sb) mtx_assert(SOCKBUF_MTX(_sb), MA_NOTOWNED) /* * Socket buffer private mbuf(9) flags. */ #define M_NOTREADY M_PROTO1 /* m_data not populated yet */ #define M_BLOCKED M_PROTO2 /* M_NOTREADY in front of m */ #define M_NOTAVAIL (M_NOTREADY | M_BLOCKED) void sbappend(struct sockbuf *sb, struct mbuf *m, int flags); void sbappend_locked(struct sockbuf *sb, struct mbuf *m, int flags); void sbappendstream(struct sockbuf *sb, struct mbuf *m, int flags); void sbappendstream_locked(struct sockbuf *sb, struct mbuf *m, int flags); int sbappendaddr(struct sockbuf *sb, const struct sockaddr *asa, struct mbuf *m0, struct mbuf *control); int sbappendaddr_locked(struct sockbuf *sb, const struct sockaddr *asa, struct mbuf *m0, struct mbuf *control); int sbappendaddr_nospacecheck_locked(struct sockbuf *sb, const struct sockaddr *asa, struct mbuf *m0, struct mbuf *control); void sbappendcontrol(struct sockbuf *sb, struct mbuf *m0, struct mbuf *control, int flags); void sbappendcontrol_locked(struct sockbuf *sb, struct mbuf *m0, struct mbuf *control, int flags); void sbappendrecord(struct sockbuf *sb, struct mbuf *m0); void sbappendrecord_locked(struct sockbuf *sb, struct mbuf *m0); void sbcompress(struct sockbuf *sb, struct mbuf *m, struct mbuf *n); struct mbuf * sbcreatecontrol(const void *p, u_int size, int type, int level, int wait); void sbdestroy(struct socket *, sb_which); void sbdrop(struct sockbuf *sb, int len); void sbdrop_locked(struct sockbuf *sb, int len); struct mbuf * sbcut_locked(struct sockbuf *sb, int len); void sbdroprecord(struct sockbuf *sb); void sbdroprecord_locked(struct sockbuf *sb); void sbflush(struct sockbuf *sb); void sbflush_locked(struct sockbuf *sb); void sbrelease(struct socket *, sb_which); void sbrelease_locked(struct socket *, sb_which); int sbsetopt(struct socket *so, struct sockopt *); bool sbreserve_locked(struct socket *so, sb_which which, u_long cc, struct thread *td); bool sbreserve_locked_limit(struct socket *so, sb_which which, u_long cc, u_long buf_max, struct thread *td); void sbsndptr_adv(struct sockbuf *sb, struct mbuf *mb, u_int len); struct mbuf * sbsndptr_noadv(struct sockbuf *sb, u_int off, u_int *moff); struct mbuf * sbsndmbuf(struct sockbuf *sb, u_int off, u_int *moff); int sbwait(struct socket *, sb_which); void sballoc(struct sockbuf *, struct mbuf *); void sbfree(struct sockbuf *, struct mbuf *); void sballoc_ktls_rx(struct sockbuf *sb, struct mbuf *m); void sbfree_ktls_rx(struct sockbuf *sb, struct mbuf *m); int sbready(struct sockbuf *, struct mbuf *, int); /* * Return how much data is available to be taken out of socket * buffer right now. */ static inline u_int sbavail(struct sockbuf *sb) { #if 0 SOCKBUF_LOCK_ASSERT(sb); #endif return (sb->sb_acc); } /* * Return how much data sits there in the socket buffer * It might be that some data is not yet ready to be read. */ static inline u_int sbused(struct sockbuf *sb) { #if 0 SOCKBUF_LOCK_ASSERT(sb); #endif return (sb->sb_ccc); } /* * How much space is there in a socket buffer (so->so_snd or so->so_rcv)? * This is problematical if the fields are unsigned, as the space might * still be negative (ccc > hiwat or mbcnt > mbmax). */ static inline long sbspace(struct sockbuf *sb) { int bleft, mleft; /* size should match sockbuf fields */ #if 0 SOCKBUF_LOCK_ASSERT(sb); #endif if (sb->sb_flags & SB_STOP) return(0); bleft = sb->sb_hiwat - sb->sb_ccc; mleft = sb->sb_mbmax - sb->sb_mbcnt; return ((bleft < mleft) ? bleft : mleft); } #define SB_EMPTY_FIXUP(sb) do { \ if ((sb)->sb_mb == NULL) { \ (sb)->sb_mbtail = NULL; \ (sb)->sb_lastrecord = NULL; \ } \ } while (/*CONSTCOND*/0) #ifdef SOCKBUF_DEBUG void sblastrecordchk(struct sockbuf *, const char *, int); void sblastmbufchk(struct sockbuf *, const char *, int); void sbcheck(struct sockbuf *, const char *, int); #define SBLASTRECORDCHK(sb) sblastrecordchk((sb), __FILE__, __LINE__) #define SBLASTMBUFCHK(sb) sblastmbufchk((sb), __FILE__, __LINE__) #define SBCHECK(sb) sbcheck((sb), __FILE__, __LINE__) #else #define SBLASTRECORDCHK(sb) do {} while (0) #define SBLASTMBUFCHK(sb) do {} while (0) #define SBCHECK(sb) do {} while (0) #endif /* SOCKBUF_DEBUG */ #endif /* _KERNEL */ #endif /* _SYS_SOCKBUF_H_ */ diff --git a/sys/sys/socket.h b/sys/sys/socket.h index e45e46da77b3..a1b220ce4bae 100644 --- a/sys/sys/socket.h +++ b/sys/sys/socket.h @@ -1,729 +1,741 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1985, 1986, 1988, 1993, 1994 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #ifndef _SYS_SOCKET_H_ #define _SYS_SOCKET_H_ #include #include #include +#include #include /* * Definitions related to sockets: types, address families, options. */ /* * Data types. */ #if __BSD_VISIBLE #ifndef _GID_T_DECLARED typedef __gid_t gid_t; #define _GID_T_DECLARED #endif #ifndef _OFF_T_DECLARED typedef __off_t off_t; #define _OFF_T_DECLARED #endif #ifndef _PID_T_DECLARED typedef __pid_t pid_t; #define _PID_T_DECLARED #endif #endif #ifndef _SA_FAMILY_T_DECLARED typedef __sa_family_t sa_family_t; #define _SA_FAMILY_T_DECLARED #endif #ifndef _SOCKLEN_T_DECLARED typedef __socklen_t socklen_t; #define _SOCKLEN_T_DECLARED #endif #ifndef _SSIZE_T_DECLARED typedef __ssize_t ssize_t; #define _SSIZE_T_DECLARED #endif #if __BSD_VISIBLE #ifndef _UID_T_DECLARED typedef __uid_t uid_t; #define _UID_T_DECLARED #endif #endif #ifndef _UINT32_T_DECLARED typedef __uint32_t uint32_t; #define _UINT32_T_DECLARED #endif #ifndef _UINTPTR_T_DECLARED typedef __uintptr_t uintptr_t; #define _UINTPTR_T_DECLARED #endif /* * Types */ #define SOCK_STREAM 1 /* stream socket */ #define SOCK_DGRAM 2 /* datagram socket */ #define SOCK_RAW 3 /* raw-protocol interface */ #if __BSD_VISIBLE #define SOCK_RDM 4 /* reliably-delivered message */ #endif #define SOCK_SEQPACKET 5 /* sequenced packet stream */ #if __BSD_VISIBLE /* * Creation flags, OR'ed into socket() and socketpair() type argument. */ #define SOCK_CLOEXEC 0x10000000 #define SOCK_NONBLOCK 0x20000000 #ifdef _KERNEL /* * Flags for accept1(), kern_accept4() and solisten_dequeue, in addition * to SOCK_CLOEXEC and SOCK_NONBLOCK. */ #define ACCEPT4_INHERIT 0x1 #define ACCEPT4_COMPAT 0x2 #endif /* _KERNEL */ #endif /* __BSD_VISIBLE */ /* * Option flags per-socket. */ #define SO_DEBUG 0x00000001 /* turn on debugging info recording */ #define SO_ACCEPTCONN 0x00000002 /* socket has had listen() */ #define SO_REUSEADDR 0x00000004 /* allow local address reuse */ #define SO_KEEPALIVE 0x00000008 /* keep connections alive */ #define SO_DONTROUTE 0x00000010 /* just use interface addresses */ #define SO_BROADCAST 0x00000020 /* permit sending of broadcast msgs */ #if __BSD_VISIBLE #define SO_USELOOPBACK 0x00000040 /* bypass hardware when possible */ #endif #define SO_LINGER 0x00000080 /* linger on close if data present */ #define SO_OOBINLINE 0x00000100 /* leave received OOB data in line */ #if __BSD_VISIBLE #define SO_REUSEPORT 0x00000200 /* allow local address & port reuse */ #define SO_TIMESTAMP 0x00000400 /* timestamp received dgram traffic */ #define SO_NOSIGPIPE 0x00000800 /* no SIGPIPE from EPIPE */ #define SO_ACCEPTFILTER 0x00001000 /* there is an accept filter */ #define SO_BINTIME 0x00002000 /* timestamp received dgram traffic */ #endif #define SO_NO_OFFLOAD 0x00004000 /* socket cannot be offloaded */ #define SO_NO_DDP 0x00008000 /* disable direct data placement */ #define SO_REUSEPORT_LB 0x00010000 /* reuse with load balancing */ #define SO_RERROR 0x00020000 /* keep track of receive errors */ /* * Additional options, not kept in so_options. */ #define SO_SNDBUF 0x1001 /* send buffer size */ #define SO_RCVBUF 0x1002 /* receive buffer size */ #define SO_SNDLOWAT 0x1003 /* send low-water mark */ #define SO_RCVLOWAT 0x1004 /* receive low-water mark */ #define SO_SNDTIMEO 0x1005 /* send timeout */ #define SO_RCVTIMEO 0x1006 /* receive timeout */ #define SO_ERROR 0x1007 /* get error status and clear */ #define SO_TYPE 0x1008 /* get socket type */ #if __BSD_VISIBLE #define SO_LABEL 0x1009 /* socket's MAC label */ #define SO_PEERLABEL 0x1010 /* socket's peer's MAC label */ #define SO_LISTENQLIMIT 0x1011 /* socket's backlog limit */ #define SO_LISTENQLEN 0x1012 /* socket's complete queue length */ #define SO_LISTENINCQLEN 0x1013 /* socket's incomplete queue length */ #define SO_SETFIB 0x1014 /* use this FIB to route */ #define SO_USER_COOKIE 0x1015 /* user cookie (dummynet etc.) */ #define SO_PROTOCOL 0x1016 /* get socket protocol (Linux name) */ #define SO_PROTOTYPE SO_PROTOCOL /* alias for SO_PROTOCOL (SunOS name) */ #define SO_TS_CLOCK 0x1017 /* clock type used for SO_TIMESTAMP */ #define SO_MAX_PACING_RATE 0x1018 /* socket's max TX pacing rate (Linux name) */ #define SO_DOMAIN 0x1019 /* get socket domain */ #endif #if __BSD_VISIBLE +#define SO_SPLICE 0x1023 /* splice data to other socket */ #define SO_TS_REALTIME_MICRO 0 /* microsecond resolution, realtime */ #define SO_TS_BINTIME 1 /* sub-nanosecond resolution, realtime */ #define SO_TS_REALTIME 2 /* nanosecond resolution, realtime */ #define SO_TS_MONOTONIC 3 /* nanosecond resolution, monotonic */ #define SO_TS_DEFAULT SO_TS_REALTIME_MICRO #define SO_TS_CLOCK_MAX SO_TS_MONOTONIC #endif /* * Space reserved for new socket options added by third-party vendors. * This range applies to all socket option levels. New socket options * in FreeBSD should always use an option value less than SO_VENDOR. */ #if __BSD_VISIBLE #define SO_VENDOR 0x80000000 #endif /* * Structure used for manipulating linger option. */ struct linger { int l_onoff; /* option on/off */ int l_linger; /* linger time */ }; #if __BSD_VISIBLE struct accept_filter_arg { char af_name[16]; char af_arg[256-16]; }; #endif /* * Level number for (get/set)sockopt() to apply to socket itself. */ #define SOL_SOCKET 0xffff /* options for socket level */ /* * Address families. */ #define AF_UNSPEC 0 /* unspecified */ #if __BSD_VISIBLE #define AF_LOCAL AF_UNIX /* local to host (pipes, portals) */ #endif #define AF_UNIX 1 /* standardized name for AF_LOCAL */ #define AF_INET 2 /* internetwork: UDP, TCP, etc. */ #if __BSD_VISIBLE #define AF_IMPLINK 3 /* arpanet imp addresses */ #define AF_PUP 4 /* pup protocols: e.g. BSP */ #define AF_CHAOS 5 /* mit CHAOS protocols */ #define AF_NETBIOS 6 /* SMB protocols */ #define AF_ISO 7 /* ISO protocols */ #define AF_OSI AF_ISO #define AF_ECMA 8 /* European computer manufacturers */ #define AF_DATAKIT 9 /* datakit protocols */ #define AF_CCITT 10 /* CCITT protocols, X.25 etc */ #define AF_SNA 11 /* IBM SNA */ #define AF_DECnet 12 /* DECnet */ #define AF_DLI 13 /* DEC Direct data link interface */ #define AF_LAT 14 /* LAT */ #define AF_HYLINK 15 /* NSC Hyperchannel */ #define AF_APPLETALK 16 /* Apple Talk */ #define AF_ROUTE 17 /* Internal Routing Protocol */ #define AF_LINK 18 /* Link layer interface */ #define pseudo_AF_XTP 19 /* eXpress Transfer Protocol (no AF) */ #define AF_COIP 20 /* connection-oriented IP, aka ST II */ #define AF_CNT 21 /* Computer Network Technology */ #define pseudo_AF_RTIP 22 /* Help Identify RTIP packets */ #define AF_IPX 23 /* Novell Internet Protocol */ #define AF_SIP 24 /* Simple Internet Protocol */ #define pseudo_AF_PIP 25 /* Help Identify PIP packets */ #define AF_ISDN 26 /* Integrated Services Digital Network*/ #define AF_E164 AF_ISDN /* CCITT E.164 recommendation */ #define pseudo_AF_KEY 27 /* Internal key-management function */ #endif #define AF_INET6 28 /* IPv6 */ #if __BSD_VISIBLE #define AF_NATM 29 /* native ATM access */ #define AF_ATM 30 /* ATM */ #define pseudo_AF_HDRCMPLT 31 /* Used by BPF to not rewrite headers * in interface output routine */ #define AF_NETGRAPH 32 /* Netgraph sockets */ #define AF_SLOW 33 /* 802.3ad slow protocol */ #define AF_SCLUSTER 34 /* Sitara cluster protocol */ #define AF_ARP 35 #define AF_BLUETOOTH 36 /* Bluetooth sockets */ #define AF_IEEE80211 37 /* IEEE 802.11 protocol */ #define AF_NETLINK 38 /* Netlink protocol */ #define AF_INET_SDP 40 /* OFED Socket Direct Protocol ipv4 */ #define AF_INET6_SDP 42 /* OFED Socket Direct Protocol ipv6 */ #define AF_HYPERV 43 /* HyperV sockets */ #define AF_DIVERT 44 /* divert(4) */ #define AF_MAX 44 /* * When allocating a new AF_ constant, please only allocate * even numbered constants for FreeBSD until 134 as odd numbered AF_ * constants 39-133 are now reserved for vendors. */ #define AF_VENDOR00 39 #define AF_VENDOR01 41 #define AF_VENDOR03 45 #define AF_VENDOR04 47 #define AF_VENDOR05 49 #define AF_VENDOR06 51 #define AF_VENDOR07 53 #define AF_VENDOR08 55 #define AF_VENDOR09 57 #define AF_VENDOR10 59 #define AF_VENDOR11 61 #define AF_VENDOR12 63 #define AF_VENDOR13 65 #define AF_VENDOR14 67 #define AF_VENDOR15 69 #define AF_VENDOR16 71 #define AF_VENDOR17 73 #define AF_VENDOR18 75 #define AF_VENDOR19 77 #define AF_VENDOR20 79 #define AF_VENDOR21 81 #define AF_VENDOR22 83 #define AF_VENDOR23 85 #define AF_VENDOR24 87 #define AF_VENDOR25 89 #define AF_VENDOR26 91 #define AF_VENDOR27 93 #define AF_VENDOR28 95 #define AF_VENDOR29 97 #define AF_VENDOR30 99 #define AF_VENDOR31 101 #define AF_VENDOR32 103 #define AF_VENDOR33 105 #define AF_VENDOR34 107 #define AF_VENDOR35 109 #define AF_VENDOR36 111 #define AF_VENDOR37 113 #define AF_VENDOR38 115 #define AF_VENDOR39 117 #define AF_VENDOR40 119 #define AF_VENDOR41 121 #define AF_VENDOR42 123 #define AF_VENDOR43 125 #define AF_VENDOR44 127 #define AF_VENDOR45 129 #define AF_VENDOR46 131 #define AF_VENDOR47 133 #endif /* * Structure used by kernel to store most * addresses. */ struct sockaddr { unsigned char sa_len; /* total length */ sa_family_t sa_family; /* address family */ char sa_data[14]; /* actually longer; address value */ }; #if __BSD_VISIBLE #define SOCK_MAXADDRLEN 255 /* longest possible addresses */ /* * Structure used by kernel to pass protocol * information in raw sockets. */ struct sockproto { unsigned short sp_family; /* address family */ unsigned short sp_protocol; /* protocol */ }; #endif #include #if __BSD_VISIBLE /* * Protocol families, same as address families for now. */ #define PF_UNSPEC AF_UNSPEC #define PF_LOCAL AF_LOCAL #define PF_UNIX PF_LOCAL /* backward compatibility */ #define PF_INET AF_INET #define PF_IMPLINK AF_IMPLINK #define PF_PUP AF_PUP #define PF_CHAOS AF_CHAOS #define PF_NETBIOS AF_NETBIOS #define PF_ISO AF_ISO #define PF_OSI AF_ISO #define PF_ECMA AF_ECMA #define PF_DATAKIT AF_DATAKIT #define PF_CCITT AF_CCITT #define PF_SNA AF_SNA #define PF_DECnet AF_DECnet #define PF_DLI AF_DLI #define PF_LAT AF_LAT #define PF_HYLINK AF_HYLINK #define PF_APPLETALK AF_APPLETALK #define PF_ROUTE AF_ROUTE #define PF_LINK AF_LINK #define PF_XTP pseudo_AF_XTP /* really just proto family, no AF */ #define PF_COIP AF_COIP #define PF_CNT AF_CNT #define PF_SIP AF_SIP #define PF_IPX AF_IPX #define PF_RTIP pseudo_AF_RTIP /* same format as AF_INET */ #define PF_PIP pseudo_AF_PIP #define PF_ISDN AF_ISDN #define PF_KEY pseudo_AF_KEY #define PF_INET6 AF_INET6 #define PF_NATM AF_NATM #define PF_ATM AF_ATM #define PF_NETGRAPH AF_NETGRAPH #define PF_SLOW AF_SLOW #define PF_SCLUSTER AF_SCLUSTER #define PF_ARP AF_ARP #define PF_BLUETOOTH AF_BLUETOOTH #define PF_IEEE80211 AF_IEEE80211 #define PF_NETLINK AF_NETLINK #define PF_INET_SDP AF_INET_SDP #define PF_INET6_SDP AF_INET6_SDP #define PF_DIVERT AF_DIVERT #define PF_MAX AF_MAX /* * Definitions for network related sysctl, CTL_NET. * * Second level is protocol family. * Third level is protocol number. * * Further levels are defined by the individual families. */ /* * PF_ROUTE - Routing table * * Three additional levels are defined: * Fourth: address family, 0 is wildcard * Fifth: type of info, defined below * Sixth: flag(s) to mask with for NET_RT_FLAGS */ #define NET_RT_DUMP 1 /* dump; may limit to a.f. */ #define NET_RT_FLAGS 2 /* by flags, e.g. RESOLVING */ #define NET_RT_IFLIST 3 /* survey interface list */ #define NET_RT_IFMALIST 4 /* return multicast address list */ #define NET_RT_IFLISTL 5 /* Survey interface list, using 'l'en * versions of msghdr structs. */ #define NET_RT_NHOP 6 /* dump routing nexthops */ #define NET_RT_NHGRP 7 /* dump routing nexthop groups */ #endif /* __BSD_VISIBLE */ /* * Maximum queue length specifiable by listen. */ #define SOMAXCONN 128 /* * Message header for recvmsg and sendmsg calls. * Used value-result for recvmsg, value only for sendmsg. */ struct msghdr { void *msg_name; /* optional address */ socklen_t msg_namelen; /* size of address */ struct iovec *msg_iov; /* scatter/gather array */ int msg_iovlen; /* # elements in msg_iov */ void *msg_control; /* ancillary data, see below */ socklen_t msg_controllen; /* ancillary data buffer len */ int msg_flags; /* flags on received message */ }; #define MSG_OOB 0x00000001 /* process out-of-band data */ #define MSG_PEEK 0x00000002 /* peek at incoming message */ #define MSG_DONTROUTE 0x00000004 /* send without using routing tables */ #define MSG_EOR 0x00000008 /* data completes record */ #define MSG_TRUNC 0x00000010 /* data discarded before delivery */ #define MSG_CTRUNC 0x00000020 /* control data lost before delivery */ #define MSG_WAITALL 0x00000040 /* wait for full request or error */ #if __BSD_VISIBLE #define MSG_DONTWAIT 0x00000080 /* this message should be nonblocking */ #define MSG_EOF 0x00000100 /* data completes connection */ /* 0x00000200 unused */ /* 0x00000400 unused */ /* 0x00000800 unused */ /* 0x00001000 unused */ #define MSG_NOTIFICATION 0x00002000 /* SCTP notification */ #define MSG_NBIO 0x00004000 /* FIONBIO mode, used by fifofs */ #define MSG_COMPAT 0x00008000 /* used in sendit() */ #endif #ifdef _KERNEL #define MSG_SOCALLBCK 0x00010000 /* for use by socket callbacks - soreceive (TCP) */ #endif #if __POSIX_VISIBLE >= 200809 #define MSG_NOSIGNAL 0x00020000 /* do not generate SIGPIPE on EOF */ #endif #if __BSD_VISIBLE #define MSG_CMSG_CLOEXEC 0x00040000 /* make received fds close-on-exec */ #define MSG_WAITFORONE 0x00080000 /* for recvmmsg() */ #endif #ifdef _KERNEL #define MSG_MORETOCOME 0x00100000 /* additional data pending */ #define MSG_TLSAPPDATA 0x00200000 /* do not soreceive() alert rec. (TLS) */ #endif /* * Header for ancillary data objects in msg_control buffer. * Used for additional information with/about a datagram * not expressible by flags. The format is a sequence * of message elements headed by cmsghdr structures. */ struct cmsghdr { socklen_t cmsg_len; /* data byte count, including hdr */ int cmsg_level; /* originating protocol */ int cmsg_type; /* protocol-specific type */ /* followed by u_char cmsg_data[]; */ }; #if __BSD_VISIBLE /* * While we may have more groups than this, the cmsgcred struct must * be able to fit in an mbuf and we have historically supported a * maximum of 16 groups. */ #define CMGROUP_MAX 16 /* * Credentials structure, used to verify the identity of a peer * process that has sent us a message. This is allocated by the * peer process but filled in by the kernel. This prevents the * peer from lying about its identity. (Note that cmcred_groups[0] * is the effective GID.) */ struct cmsgcred { pid_t cmcred_pid; /* PID of sending process */ uid_t cmcred_uid; /* real UID of sending process */ uid_t cmcred_euid; /* effective UID of sending process */ gid_t cmcred_gid; /* real GID of sending process */ short cmcred_ngroups; /* number or groups */ gid_t cmcred_groups[CMGROUP_MAX]; /* groups */ }; /* * Socket credentials (LOCAL_CREDS). */ struct sockcred { uid_t sc_uid; /* real user id */ uid_t sc_euid; /* effective user id */ gid_t sc_gid; /* real group id */ gid_t sc_egid; /* effective group id */ int sc_ngroups; /* number of supplemental groups */ gid_t sc_groups[1]; /* variable length */ }; /* * Compute size of a sockcred structure with groups. */ #define SOCKCREDSIZE(ngrps) \ (sizeof(struct sockcred) + (sizeof(gid_t) * ((ngrps) - 1))) /* * Socket credentials (LOCAL_CREDS_PERSISTENT). */ struct sockcred2 { int sc_version; /* version of this structure */ pid_t sc_pid; /* PID of sending process */ uid_t sc_uid; /* real user id */ uid_t sc_euid; /* effective user id */ gid_t sc_gid; /* real group id */ gid_t sc_egid; /* effective group id */ int sc_ngroups; /* number of supplemental groups */ gid_t sc_groups[1]; /* variable length */ }; #define SOCKCRED2SIZE(ngrps) \ (sizeof(struct sockcred2) + (sizeof(gid_t) * ((ngrps) - 1))) #endif /* __BSD_VISIBLE */ /* given pointer to struct cmsghdr, return pointer to data */ #define CMSG_DATA(cmsg) ((unsigned char *)(cmsg) + \ _ALIGN(sizeof(struct cmsghdr))) /* given pointer to struct cmsghdr, return pointer to next cmsghdr */ #define CMSG_NXTHDR(mhdr, cmsg) \ ((char *)(cmsg) == (char *)0 ? CMSG_FIRSTHDR(mhdr) : \ ((char *)(cmsg) + _ALIGN(((struct cmsghdr *)(cmsg))->cmsg_len) + \ _ALIGN(sizeof(struct cmsghdr)) > \ (char *)(mhdr)->msg_control + (mhdr)->msg_controllen) ? \ (struct cmsghdr *)0 : \ (struct cmsghdr *)(void *)((char *)(cmsg) + \ _ALIGN(((struct cmsghdr *)(cmsg))->cmsg_len))) /* * RFC 2292 requires to check msg_controllen, in case that the kernel returns * an empty list for some reasons. */ #define CMSG_FIRSTHDR(mhdr) \ ((mhdr)->msg_controllen >= sizeof(struct cmsghdr) ? \ (struct cmsghdr *)(mhdr)->msg_control : \ (struct cmsghdr *)0) #if __BSD_VISIBLE /* RFC 2292 additions */ #define CMSG_SPACE(l) (_ALIGN(sizeof(struct cmsghdr)) + _ALIGN(l)) #define CMSG_LEN(l) (_ALIGN(sizeof(struct cmsghdr)) + (l)) #endif #ifdef _KERNEL #define CMSG_ALIGN(n) _ALIGN(n) #endif /* "Socket"-level control message types: */ #define SCM_RIGHTS 0x01 /* access rights (array of int) */ #if __BSD_VISIBLE #define SCM_TIMESTAMP 0x02 /* timestamp (struct timeval) */ #define SCM_CREDS 0x03 /* process creds (struct cmsgcred) */ #define SCM_BINTIME 0x04 /* timestamp (struct bintime) */ #define SCM_REALTIME 0x05 /* timestamp (struct timespec) */ #define SCM_MONOTONIC 0x06 /* timestamp (struct timespec) */ #define SCM_TIME_INFO 0x07 /* timestamp info */ #define SCM_CREDS2 0x08 /* process creds (struct sockcred2) */ struct sock_timestamp_info { __uint32_t st_info_flags; __uint32_t st_info_pad0; __uint64_t st_info_rsv[7]; }; #define ST_INFO_HW 0x0001 /* SCM_TIMESTAMP was hw */ #define ST_INFO_HW_HPREC 0x0002 /* SCM_TIMESTAMP was hw-assisted on entrance */ #endif #if __BSD_VISIBLE /* * 4.3 compat sockaddr, move to compat file later */ struct osockaddr { unsigned short sa_family; /* address family */ char sa_data[14]; /* up to 14 bytes of direct address */ }; /* * 4.3-compat message header (move to compat file later). */ struct omsghdr { char *msg_name; /* optional address */ int msg_namelen; /* size of address */ struct iovec *msg_iov; /* scatter/gather array */ int msg_iovlen; /* # elements in msg_iov */ char *msg_accrights; /* access rights sent/received */ int msg_accrightslen; }; #endif /* * howto arguments for shutdown(2), specified by Posix.1g. */ enum shutdown_how { SHUT_RD = 0, /* shut down the reading side */ #define SHUT_RD SHUT_RD SHUT_WR, /* shut down the writing side */ #define SHUT_WR SHUT_WR SHUT_RDWR /* shut down both sides */ #define SHUT_RDWR SHUT_RDWR }; #if __BSD_VISIBLE /* * sendfile(2) header/trailer struct */ struct sf_hdtr { struct iovec *headers; /* pointer to an array of header struct iovec's */ int hdr_cnt; /* number of header iovec's */ struct iovec *trailers; /* pointer to an array of trailer struct iovec's */ int trl_cnt; /* number of trailer iovec's */ }; /* * Sendfile-specific flag(s) */ #define SF_NODISKIO 0x00000001 #define SF_MNOWAIT 0x00000002 /* obsolete */ #define SF_SYNC 0x00000004 #define SF_USER_READAHEAD 0x00000008 #define SF_NOCACHE 0x00000010 #define SF_FLAGS(rh, flags) (((rh) << 16) | (flags)) #ifdef _KERNEL #define SF_READAHEAD(flags) ((flags) >> 16) #endif /* _KERNEL */ /* * Sendmmsg/recvmmsg specific structure(s) */ struct mmsghdr { struct msghdr msg_hdr; /* message header */ ssize_t msg_len; /* message length */ }; + +/* + * Structure used for manipulating splice option. + */ +struct splice { + int sp_fd; /* drain socket file descriptor */ + off_t sp_max; /* if set, maximum bytes to splice */ + struct timeval sp_idle; /* idle timeout */ +}; + #endif /* __BSD_VISIBLE */ #if defined(_FORTIFY_SOURCE) && _FORTIFY_SOURCE > 0 #include #endif #ifndef _KERNEL #include __BEGIN_DECLS int accept(int, struct sockaddr * __restrict, socklen_t * __restrict); int bind(int, const struct sockaddr *, socklen_t); int connect(int, const struct sockaddr *, socklen_t); #if __BSD_VISIBLE int accept4(int, struct sockaddr * __restrict, socklen_t * __restrict, int); int bindat(int, int, const struct sockaddr *, socklen_t); int connectat(int, int, const struct sockaddr *, socklen_t); #endif int getpeername(int, struct sockaddr * __restrict, socklen_t * __restrict); int getsockname(int, struct sockaddr * __restrict, socklen_t * __restrict); int getsockopt(int, int, int, void * __restrict, socklen_t * __restrict); int listen(int, int); ssize_t recv(int, void *, size_t, int); ssize_t recvfrom(int, void *, size_t, int, struct sockaddr * __restrict, socklen_t * __restrict); ssize_t recvmsg(int, struct msghdr *, int); #if __BSD_VISIBLE struct timespec; ssize_t recvmmsg(int, struct mmsghdr * __restrict, size_t, int, const struct timespec * __restrict); #endif ssize_t send(int, const void *, size_t, int); ssize_t sendto(int, const void *, size_t, int, const struct sockaddr *, socklen_t); ssize_t sendmsg(int, const struct msghdr *, int); #if __BSD_VISIBLE int sendfile(int, int, off_t, size_t, struct sf_hdtr *, off_t *, int); ssize_t sendmmsg(int, struct mmsghdr * __restrict, size_t, int); int setfib(int); #endif int setsockopt(int, int, int, const void *, socklen_t); int shutdown(int, int); int sockatmark(int); int socket(int, int, int); int socketpair(int, int, int, int *); __END_DECLS #endif /* !_KERNEL */ #ifdef _KERNEL struct socket; int so_options_get(const struct socket *); void so_options_set(struct socket *, int); int so_error_get(const struct socket *); void so_error_set(struct socket *, int); #endif /* _KERNEL */ #endif /* !_SYS_SOCKET_H_ */ diff --git a/sys/sys/socketvar.h b/sys/sys/socketvar.h index 2d050707606d..40464f555816 100644 --- a/sys/sys/socketvar.h +++ b/sys/sys/socketvar.h @@ -1,605 +1,650 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1990, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #ifndef _SYS_SOCKETVAR_H_ #define _SYS_SOCKETVAR_H_ /* * Socket generation count type. Also used in xinpcb, xtcpcb, xunpcb. */ typedef uint64_t so_gen_t; #if defined(_KERNEL) || defined(_WANT_SOCKET) #include /* for TAILQ macros */ #include /* for struct selinfo */ #include #include #include #include #include +#include #ifdef _KERNEL #include #include +#else +#include #endif struct vnet; /* * Kernel structure per socket. * Contains send and receive buffer queues, * handle on protocol and pointer to protocol * private data and error information. */ typedef int so_upcall_t(struct socket *, void *, int); typedef void so_dtor_t(struct socket *); struct socket; enum socket_qstate { SQ_NONE = 0, SQ_INCOMP = 0x0800, /* on sol_incomp */ SQ_COMP = 0x1000, /* on sol_comp */ }; + +struct so_splice { + struct socket *src; + struct socket *dst; + off_t max; /* maximum bytes to splice, or -1 */ + struct mtx mtx; + unsigned int wq_index; + enum so_splice_state { + SPLICE_IDLE, /* waiting for work to arrive */ + SPLICE_QUEUED, /* a wakeup has queued some work */ + SPLICE_RUNNING, /* currently transferring data */ + SPLICE_CLOSING, /* waiting for work to drain */ + SPLICE_CLOSED, /* unsplicing, terminal state */ + SPLICE_EXCEPTION, /* I/O error or limit, implicit unsplice */ + } state; + struct timeout_task timeout; + STAILQ_ENTRY(so_splice) next; +}; + /*- * Locking key to struct socket: * (a) constant after allocation, no locking required. * (b) locked by SOCK_LOCK(so). * (cr) locked by SOCK_RECVBUF_LOCK(so) * (cs) locked by SOCK_SENDBUF_LOCK(so) * (e) locked by SOLISTEN_LOCK() of corresponding listening socket. * (f) not locked since integer reads/writes are atomic. * (g) used only as a sleep/wakeup address, no value. * (h) locked by global mutex so_global_mtx. + * (ir,is) locked by recv or send I/O locks. * (k) locked by KTLS workqueue mutex */ TAILQ_HEAD(accept_queue, socket); struct socket { struct mtx so_lock; volatile u_int so_count; /* (b / refcount) */ struct selinfo so_rdsel; /* (b/cr) for so_rcv/so_comp */ struct selinfo so_wrsel; /* (b/cs) for so_snd */ int so_options; /* (b) from socket call, see socket.h */ short so_type; /* (a) generic type, see socket.h */ short so_state; /* (b) internal state flags SS_* */ void *so_pcb; /* protocol control block */ struct vnet *so_vnet; /* (a) network stack instance */ struct protosw *so_proto; /* (a) protocol handle */ short so_linger; /* time to linger close(2) */ short so_timeo; /* (g) connection timeout */ u_short so_error; /* (f) error affecting connection */ u_short so_rerror; /* (f) error affecting connection */ struct sigio *so_sigio; /* [sg] information for async I/O or out of band data (SIGURG) */ struct ucred *so_cred; /* (a) user credentials */ struct label *so_label; /* (b) MAC label for socket */ /* NB: generation count must not be first. */ so_gen_t so_gencnt; /* (h) generation count */ void *so_emuldata; /* (b) private data for emulators */ so_dtor_t *so_dtor; /* (b) optional destructor */ struct osd osd; /* Object Specific extensions */ /* * so_fibnum, so_user_cookie and friends can be used to attach * some user-specified metadata to a socket, which then can be * used by the kernel for various actions. * so_user_cookie is used by ipfw/dummynet. */ int so_fibnum; /* routing domain for this socket */ uint32_t so_user_cookie; int so_ts_clock; /* type of the clock used for timestamps */ uint32_t so_max_pacing_rate; /* (f) TX rate limit in bytes/s */ + struct so_splice *so_splice; /* (b) splice state for sink */ + struct so_splice *so_splice_back; /* (b) splice state for source */ + off_t so_splice_sent; /* (ir) splice bytes sent so far */ /* * Mutexes to prevent interleaving of socket I/O. These have to be * outside of the socket buffers in order to interlock with listen(2). */ struct sx so_snd_sx __aligned(CACHE_LINE_SIZE); struct mtx so_snd_mtx; struct sx so_rcv_sx __aligned(CACHE_LINE_SIZE); struct mtx so_rcv_mtx; union { /* Regular (data flow) socket. */ struct { /* (cr, cs) Receive and send buffers. */ struct sockbuf so_rcv, so_snd; /* (e) Our place on accept queue. */ TAILQ_ENTRY(socket) so_list; struct socket *so_listen; /* (b) */ enum socket_qstate so_qstate; /* (b) */ /* (b) cached MAC label for peer */ struct label *so_peerlabel; u_long so_oobmark; /* chars to oob mark */ /* (k) Our place on KTLS RX work queue. */ STAILQ_ENTRY(socket) so_ktls_rx_list; }; /* * Listening socket, where accepts occur, is so_listen in all * subsidiary sockets. If so_listen is NULL, socket is not * related to an accept. For a listening socket itself * sol_incomp queues partially completed connections, while * sol_comp is a queue of connections ready to be accepted. * If a connection is aborted and it has so_listen set, then * it has to be pulled out of either sol_incomp or sol_comp. * We allow connections to queue up based on current queue * lengths and limit on number of queued connections for this * socket. */ struct { /* (e) queue of partial unaccepted connections */ struct accept_queue sol_incomp; /* (e) queue of complete unaccepted connections */ struct accept_queue sol_comp; u_int sol_qlen; /* (e) sol_comp length */ u_int sol_incqlen; /* (e) sol_incomp length */ u_int sol_qlimit; /* (e) queue limit */ /* accept_filter(9) optional data */ struct accept_filter *sol_accept_filter; void *sol_accept_filter_arg; /* saved filter args */ char *sol_accept_filter_str; /* saved user args */ /* Optional upcall, for kernel socket. */ so_upcall_t *sol_upcall; /* (e) */ void *sol_upcallarg; /* (e) */ /* Socket buffer parameters, to be copied to * dataflow sockets, accepted from this one. */ int sol_sbrcv_lowat; int sol_sbsnd_lowat; u_int sol_sbrcv_hiwat; u_int sol_sbsnd_hiwat; short sol_sbrcv_flags; short sol_sbsnd_flags; sbintime_t sol_sbrcv_timeo; sbintime_t sol_sbsnd_timeo; /* Information tracking listen queue overflows. */ struct timeval sol_lastover; /* (e) */ int sol_overcount; /* (e) */ }; }; }; #endif /* defined(_KERNEL) || defined(_WANT_SOCKET) */ /* * Socket state bits. * * Historically, these bits were all kept in the so_state field. * They are now split into separate, lock-specific fields. * so_state maintains basic socket state protected by the socket lock. * so_qstate holds information about the socket accept queues. * Each socket buffer also has a state field holding information * relevant to that socket buffer (can't send, rcv). * Many fields will be read without locks to improve performance and avoid * lock order issues. However, this approach must be used with caution. */ #define SS_ISCONNECTED 0x0002 /* socket connected to a peer */ #define SS_ISCONNECTING 0x0004 /* in process of connecting to peer */ #define SS_ISDISCONNECTING 0x0008 /* in process of disconnecting */ #define SS_NBIO 0x0100 /* non-blocking ops */ #define SS_ASYNC 0x0200 /* async i/o notify */ /* was SS_ISCONFIRMING 0x0400 */ #define SS_ISDISCONNECTED 0x2000 /* socket disconnected from peer */ #ifdef _KERNEL #define SOCK_MTX(so) (&(so)->so_lock) #define SOCK_LOCK(so) mtx_lock(&(so)->so_lock) #define SOCK_OWNED(so) mtx_owned(&(so)->so_lock) #define SOCK_UNLOCK(so) mtx_unlock(&(so)->so_lock) #define SOCK_LOCK_ASSERT(so) mtx_assert(&(so)->so_lock, MA_OWNED) #define SOCK_UNLOCK_ASSERT(so) mtx_assert(&(so)->so_lock, MA_NOTOWNED) #define SOLISTENING(sol) (((sol)->so_options & SO_ACCEPTCONN) != 0) #define SOLISTEN_LOCK(sol) do { \ mtx_lock(&(sol)->so_lock); \ KASSERT(SOLISTENING(sol), \ ("%s: %p not listening", __func__, (sol))); \ } while (0) #define SOLISTEN_TRYLOCK(sol) mtx_trylock(&(sol)->so_lock) #define SOLISTEN_UNLOCK(sol) do { \ KASSERT(SOLISTENING(sol), \ ("%s: %p not listening", __func__, (sol))); \ mtx_unlock(&(sol)->so_lock); \ } while (0) #define SOLISTEN_LOCK_ASSERT(sol) do { \ mtx_assert(&(sol)->so_lock, MA_OWNED); \ KASSERT(SOLISTENING(sol), \ ("%s: %p not listening", __func__, (sol))); \ } while (0) #define SOLISTEN_UNLOCK_ASSERT(sol) do { \ mtx_assert(&(sol)->so_lock, MA_NOTOWNED); \ KASSERT(SOLISTENING(sol), \ ("%s: %p not listening", __func__, (sol))); \ } while (0) /* * Socket buffer locks. These are strongly preferred over SOCKBUF_LOCK(sb) * macros, as we are moving towards protocol specific socket buffers. */ #define SOCK_RECVBUF_MTX(so) \ (&(so)->so_rcv_mtx) #define SOCK_RECVBUF_LOCK(so) \ mtx_lock(SOCK_RECVBUF_MTX(so)) #define SOCK_RECVBUF_UNLOCK(so) \ mtx_unlock(SOCK_RECVBUF_MTX(so)) #define SOCK_RECVBUF_LOCK_ASSERT(so) \ mtx_assert(SOCK_RECVBUF_MTX(so), MA_OWNED) #define SOCK_RECVBUF_UNLOCK_ASSERT(so) \ mtx_assert(SOCK_RECVBUF_MTX(so), MA_NOTOWNED) #define SOCK_SENDBUF_MTX(so) \ (&(so)->so_snd_mtx) #define SOCK_SENDBUF_LOCK(so) \ mtx_lock(SOCK_SENDBUF_MTX(so)) #define SOCK_SENDBUF_UNLOCK(so) \ mtx_unlock(SOCK_SENDBUF_MTX(so)) #define SOCK_SENDBUF_LOCK_ASSERT(so) \ mtx_assert(SOCK_SENDBUF_MTX(so), MA_OWNED) #define SOCK_SENDBUF_UNLOCK_ASSERT(so) \ mtx_assert(SOCK_SENDBUF_MTX(so), MA_NOTOWNED) #define SOCK_BUF_LOCK(so, which) \ mtx_lock(soeventmtx(so, which)) #define SOCK_BUF_UNLOCK(so, which) \ mtx_unlock(soeventmtx(so, which)) #define SOCK_BUF_LOCK_ASSERT(so, which) \ mtx_assert(soeventmtx(so, which), MA_OWNED) #define SOCK_BUF_UNLOCK_ASSERT(so, which) \ mtx_assert(soeventmtx(so, which), MA_NOTOWNED) static inline struct sockbuf * sobuf(struct socket *so, const sb_which which) { return (which == SO_RCV ? &so->so_rcv : &so->so_snd); } static inline struct mtx * soeventmtx(struct socket *so, const sb_which which) { return (which == SO_RCV ? SOCK_RECVBUF_MTX(so) : SOCK_SENDBUF_MTX(so)); } /* * Macros for sockets and socket buffering. */ + +#define isspliced(so) ((so->so_splice != NULL && \ + so->so_splice->src != NULL)) +#define issplicedback(so) ((so->so_splice_back != NULL && \ + so->so_splice_back->dst != NULL)) /* * Flags to soiolock(). */ #define SBL_WAIT 0x00000001 /* Wait if not immediately available. */ #define SBL_NOINTR 0x00000002 /* Force non-interruptible sleep. */ #define SBL_VALID (SBL_WAIT | SBL_NOINTR) #define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT) #define SOCK_IO_SEND_LOCK(so, flags) \ soiolock((so), &(so)->so_snd_sx, (flags)) #define SOCK_IO_SEND_UNLOCK(so) \ soiounlock(&(so)->so_snd_sx) #define SOCK_IO_SEND_ASSERT_LOCKED(so) \ sx_assert(&(so)->so_snd_sx, SA_LOCKED) #define SOCK_IO_RECV_LOCK(so, flags) \ soiolock((so), &(so)->so_rcv_sx, (flags)) #define SOCK_IO_RECV_UNLOCK(so) \ soiounlock(&(so)->so_rcv_sx) #define SOCK_IO_RECV_ASSERT_LOCKED(so) \ sx_assert(&(so)->so_rcv_sx, SA_LOCKED) /* do we have to send all at once on a socket? */ #define sosendallatonce(so) \ ((so)->so_proto->pr_flags & PR_ATOMIC) /* can we read something from so? */ #define soreadabledata(so) \ (sbavail(&(so)->so_rcv) >= (so)->so_rcv.sb_lowat || \ (so)->so_error || (so)->so_rerror) -#define soreadable(so) \ +#define _soreadable(so) \ (soreadabledata(so) || ((so)->so_rcv.sb_state & SBS_CANTRCVMORE)) +static inline bool +soreadable(struct socket *so) +{ + if (isspliced(so)) + return (false); + return (_soreadable(so)); +} + /* can we write something to so? */ #define sowriteable(so) \ ((sbspace(&(so)->so_snd) >= (so)->so_snd.sb_lowat && \ (((so)->so_state&SS_ISCONNECTED) || \ ((so)->so_proto->pr_flags&PR_CONNREQUIRED)==0)) || \ ((so)->so_snd.sb_state & SBS_CANTSENDMORE) || \ (so)->so_error) /* * soref()/sorele() ref-count the socket structure. * soref() may be called without owning socket lock, but in that case a * caller must own something that holds socket, and so_count must be not 0. * Note that you must still explicitly close the socket, but the last ref * count will free the structure. */ #define soref(so) refcount_acquire(&(so)->so_count) #define sorele(so) do { \ SOCK_UNLOCK_ASSERT(so); \ if (!refcount_release_if_not_last(&(so)->so_count)) { \ SOCK_LOCK(so); \ sorele_locked(so); \ } \ } while (0) /* * In sorwakeup() and sowwakeup(), acquire the socket buffer lock to * avoid a non-atomic test-and-wakeup. However, sowakeup is * responsible for releasing the lock if it is called. We unlock only * if we don't call into sowakeup. If any code is introduced that * directly invokes the underlying sowakeup() primitives, it must * maintain the same semantics. */ #define sorwakeup(so) do { \ SOCK_RECVBUF_LOCK(so); \ sorwakeup_locked(so); \ } while (0) #define sowwakeup(so) do { \ SOCK_SENDBUF_LOCK(so); \ sowwakeup_locked(so); \ } while (0) struct accept_filter { char accf_name[16]; int (*accf_callback) (struct socket *so, void *arg, int waitflag); void * (*accf_create) (struct socket *so, char *arg); void (*accf_destroy) (struct socket *so); SLIST_ENTRY(accept_filter) accf_next; }; #define ACCEPT_FILTER_DEFINE(modname, filtname, cb, create, destroy, ver) \ static struct accept_filter modname##_filter = { \ .accf_name = filtname, \ .accf_callback = cb, \ .accf_create = create, \ .accf_destroy = destroy, \ }; \ static moduledata_t modname##_mod = { \ .name = __XSTRING(modname), \ .evhand = accept_filt_generic_mod_event, \ .priv = &modname##_filter, \ }; \ DECLARE_MODULE(modname, modname##_mod, SI_SUB_DRIVERS, \ SI_ORDER_MIDDLE); \ MODULE_VERSION(modname, ver) #ifdef MALLOC_DECLARE MALLOC_DECLARE(M_ACCF); MALLOC_DECLARE(M_PCB); MALLOC_DECLARE(M_SONAME); #endif /* * Socket specific helper hook point identifiers * Do not leave holes in the sequence, hook registration is a loop. */ #define HHOOK_SOCKET_OPT 0 #define HHOOK_SOCKET_CREATE 1 #define HHOOK_SOCKET_RCV 2 #define HHOOK_SOCKET_SND 3 #define HHOOK_FILT_SOREAD 4 #define HHOOK_FILT_SOWRITE 5 #define HHOOK_SOCKET_CLOSE 6 #define HHOOK_SOCKET_NEWCONN 7 #define HHOOK_SOCKET_LAST HHOOK_SOCKET_NEWCONN struct socket_hhook_data { struct socket *so; struct mbuf *m; void *hctx; /* hook point specific data*/ int status; }; extern int maxsockets; extern u_long sb_max; extern so_gen_t so_gencnt; struct file; struct filecaps; struct filedesc; struct mbuf; struct sockaddr; struct ucred; struct uio; enum shutdown_how; /* Return values for socket upcalls. */ #define SU_OK 0 #define SU_ISCONNECTED 1 /* * From uipc_socket and friends */ int getsockaddr(struct sockaddr **namp, const struct sockaddr *uaddr, size_t len); int getsock_cap(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp, struct filecaps *havecaps); int getsock(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp); void soabort(struct socket *so); int soaccept(struct socket *so, struct sockaddr *sa); int sopeeraddr(struct socket *so, struct sockaddr *sa); int sosockaddr(struct socket *so, struct sockaddr *sa); void soaio_enqueue(struct task *task); void soaio_rcv(void *context, int pending); void soaio_snd(void *context, int pending); int socheckuid(struct socket *so, uid_t uid); int sobind(struct socket *so, struct sockaddr *nam, struct thread *td); int sobindat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td); int soclose(struct socket *so); int soconnect(struct socket *so, struct sockaddr *nam, struct thread *td); int soconnectat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td); int soconnect2(struct socket *so1, struct socket *so2); int socreate(int dom, struct socket **aso, int type, int proto, struct ucred *cred, struct thread *td); int sodisconnect(struct socket *so); void sodtor_set(struct socket *, so_dtor_t *); struct sockaddr *sodupsockaddr(const struct sockaddr *sa, int mflags); void sohasoutofband(struct socket *so); int solisten(struct socket *so, int backlog, struct thread *td); void solisten_proto(struct socket *so, int backlog); void solisten_proto_abort(struct socket *so); int solisten_proto_check(struct socket *so); bool solisten_enqueue(struct socket *, int); int solisten_dequeue(struct socket *, struct socket **, int); struct socket * solisten_clone(struct socket *); struct socket * sonewconn(struct socket *head, int connstatus); struct socket * sopeeloff(struct socket *); int sopoll(struct socket *so, int events, struct ucred *active_cred, struct thread *td); int sopoll_generic(struct socket *so, int events, struct ucred *active_cred, struct thread *td); int soreceive(struct socket *so, struct sockaddr **paddr, struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp); int soreceive_stream(struct socket *so, struct sockaddr **paddr, struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp); int soreceive_dgram(struct socket *so, struct sockaddr **paddr, struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp); int soreceive_generic(struct socket *so, struct sockaddr **paddr, struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp); void sorele_locked(struct socket *so); void sodealloc(struct socket *); int soreserve(struct socket *so, u_long sndcc, u_long rcvcc); void sorflush(struct socket *so); int sosend(struct socket *so, struct sockaddr *addr, struct uio *uio, struct mbuf *top, struct mbuf *control, int flags, struct thread *td); int sousrsend(struct socket *so, struct sockaddr *addr, struct uio *uio, struct mbuf *control, int flags, struct proc *); int sosend_dgram(struct socket *so, struct sockaddr *addr, struct uio *uio, struct mbuf *top, struct mbuf *control, int flags, struct thread *td); int sosend_generic(struct socket *so, struct sockaddr *addr, struct uio *uio, struct mbuf *top, struct mbuf *control, int flags, struct thread *td); int soshutdown(struct socket *so, enum shutdown_how); void soupcall_clear(struct socket *, sb_which); void soupcall_set(struct socket *, sb_which, so_upcall_t, void *); void solisten_upcall_set(struct socket *, so_upcall_t, void *); void sorwakeup_locked(struct socket *); void sowwakeup_locked(struct socket *); void sowakeup_aio(struct socket *, sb_which); void solisten_wakeup(struct socket *); int selsocket(struct socket *so, int events, struct timeval *tv, struct thread *td); void soisconnected(struct socket *so); void soisconnecting(struct socket *so); void soisdisconnected(struct socket *so); void soisdisconnecting(struct socket *so); void socantrcvmore(struct socket *so); void socantrcvmore_locked(struct socket *so); void socantsendmore(struct socket *so); void socantsendmore_locked(struct socket *so); void soroverflow(struct socket *so); void soroverflow_locked(struct socket *so); int soiolock(struct socket *so, struct sx *sx, int flags); void soiounlock(struct sx *sx); +/* + * Socket splicing routines. + */ +void so_splice_dispatch(struct so_splice *sp); + /* * Accept filter functions (duh). */ int accept_filt_add(struct accept_filter *filt); int accept_filt_del(char *name); struct accept_filter *accept_filt_get(char *name); #ifdef ACCEPT_FILTER_MOD #ifdef SYSCTL_DECL SYSCTL_DECL(_net_inet_accf); #endif int accept_filt_generic_mod_event(module_t mod, int event, void *data); #endif #endif /* _KERNEL */ /* * Structure to export socket from kernel to utilities, via sysctl(3). */ struct xsocket { ksize_t xso_len; /* length of this structure */ kvaddr_t xso_so; /* kernel address of struct socket */ kvaddr_t so_pcb; /* kernel address of struct inpcb */ uint64_t so_oobmark; - int64_t so_spare64[8]; + kvaddr_t so_splice_so; /* kernel address of spliced socket */ + int64_t so_spare64[7]; int32_t xso_protocol; int32_t xso_family; uint32_t so_qlen; uint32_t so_incqlen; uint32_t so_qlimit; pid_t so_pgid; uid_t so_uid; int32_t so_spare32[8]; int16_t so_type; int16_t so_options; int16_t so_linger; int16_t so_state; int16_t so_timeo; uint16_t so_error; struct xsockbuf { uint32_t sb_cc; uint32_t sb_hiwat; uint32_t sb_mbcnt; uint32_t sb_spare0; /* was sb_mcnt */ uint32_t sb_spare1; /* was sb_ccnt */ uint32_t sb_mbmax; int32_t sb_lowat; int32_t sb_timeo; int16_t sb_flags; } so_rcv, so_snd; }; #ifdef _KERNEL void sotoxsocket(struct socket *so, struct xsocket *xso); void sbtoxsockbuf(struct sockbuf *sb, struct xsockbuf *xsb); #endif /* * Socket buffer state bits. Exported via libprocstat(3). */ #define SBS_CANTSENDMORE 0x0010 /* can't send more data to peer */ #define SBS_CANTRCVMORE 0x0020 /* can't receive more data from peer */ #define SBS_RCVATMARK 0x0040 /* at mark on input */ #endif /* !_SYS_SOCKETVAR_H_ */