diff --git a/lib/libc/sys/socket.2 b/lib/libc/sys/socket.2 index 8ced1f0ba930..1eceabbf6fd4 100644 --- a/lib/libc/sys/socket.2 +++ b/lib/libc/sys/socket.2 @@ -1,349 +1,351 @@ .\" Copyright (c) 1983, 1991, 1993 .\" The Regents of the University of California. All rights reserved. .\" .\" Redistribution and use in source and binary forms, with or without .\" modification, are permitted provided that the following conditions .\" are met: .\" 1. Redistributions of source code must retain the above copyright .\" notice, this list of conditions and the following disclaimer. .\" 2. Redistributions in binary form must reproduce the above copyright .\" notice, this list of conditions and the following disclaimer in the .\" documentation and/or other materials provided with the distribution. .\" 3. Neither the name of the University nor the names of its contributors .\" may be used to endorse or promote products derived from this software .\" without specific prior written permission. .\" .\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE .\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE .\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE. .\" .\" From: @(#)socket.2 8.1 (Berkeley) 6/4/93 .\" $FreeBSD$ .\" -.Dd August 26, 2022 +.Dd August 30, 2022 .Dt SOCKET 2 .Os .Sh NAME .Nm socket .Nd create an endpoint for communication .Sh LIBRARY .Lb libc .Sh SYNOPSIS .In sys/socket.h .Ft int .Fn socket "int domain" "int type" "int protocol" .Sh DESCRIPTION The .Fn socket system call creates an endpoint for communication and returns a descriptor. .Pp The .Fa domain argument specifies a communications domain within which communication will take place; this selects the protocol family which should be used. These families are defined in the include file .In sys/socket.h . The currently understood formats are: .Pp .Bd -literal -offset indent -compact PF_LOCAL Host-internal protocols (alias for PF_UNIX), PF_UNIX Host-internal protocols, PF_INET Internet version 4 protocols, PF_INET6 Internet version 6 protocols, +PF_DIVERT Firewall packet diversion/re-injection, PF_ROUTE Internal routing protocol, PF_KEY Internal key-management function, PF_NETGRAPH Netgraph sockets, PF_BLUETOOTH Bluetooth protocols, PF_INET_SDP OFED socket direct protocol (IPv4), AF_HYPERV HyperV sockets .Ed .Pp Each protocol family is connected to an address family, which has the same name except that the prefix is .Dq Dv AF_ in place of .Dq Dv PF_ . Other protocol families may be also defined, beginning with .Dq Dv PF_ , with corresponding address families. .Pp The socket has the indicated .Fa type , which specifies the semantics of communication. Currently defined types are: .Pp .Bd -literal -offset indent -compact SOCK_STREAM Stream socket, SOCK_DGRAM Datagram socket, SOCK_RAW Raw-protocol interface, SOCK_SEQPACKET Sequenced packet stream .Ed .Pp A .Dv SOCK_STREAM type provides sequenced, reliable, two-way connection based byte streams. An out-of-band data transmission mechanism may be supported. A .Dv SOCK_DGRAM socket supports datagrams (connectionless, unreliable messages of a fixed (typically small) maximum length). A .Dv SOCK_SEQPACKET socket may provide a sequenced, reliable, two-way connection-based data transmission path for datagrams of fixed maximum length; a consumer may be required to read an entire packet with each read system call. This facility may have protocol-specific properties. .Dv SOCK_RAW sockets provide access to internal network protocols and interfaces. The .Dv SOCK_RAW type is available only to the super-user and is described in .Xr ip 4 and .Xr ip6 4 . .Pp Additionally, the following flags are allowed in the .Fa type argument: .Pp .Bd -literal -offset indent -compact SOCK_CLOEXEC Set close-on-exec on the new descriptor, SOCK_NONBLOCK Set non-blocking mode on the new socket .Ed .Pp The .Fa protocol argument specifies a particular protocol to be used with the socket. Normally only a single protocol exists to support a particular socket type within a given protocol family. However, it is possible that many protocols may exist, in which case a particular protocol must be specified in this manner. The protocol number to use is particular to the .Dq "communication domain" in which communication is to take place; see .Xr protocols 5 . .Pp The .Fa protocol argument may be set to zero (0) to request the default implementation of a socket type for the protocol, if any. .Pp Sockets of type .Dv SOCK_STREAM are full-duplex byte streams, similar to pipes. A stream socket must be in a .Em connected state before any data may be sent or received on it. A connection to another socket is created with a .Xr connect 2 system call. Once connected, data may be transferred using .Xr read 2 and .Xr write 2 calls or some variant of the .Xr send 2 and .Xr recv 2 functions. (Some protocol families, such as the Internet family, support the notion of an .Dq implied connect , which permits data to be sent piggybacked onto a connect operation by using the .Xr sendto 2 system call.) When a session has been completed a .Xr close 2 may be performed. Out-of-band data may also be transmitted as described in .Xr send 2 and received as described in .Xr recv 2 . .Pp The communications protocols used to implement a .Dv SOCK_STREAM ensure that data is not lost or duplicated. If a piece of data for which the peer protocol has buffer space cannot be successfully transmitted within a reasonable length of time, then the connection is considered broken and calls will indicate an error with -1 returns and with .Er ETIMEDOUT as the specific code in the global variable .Va errno . The protocols optionally keep sockets .Dq warm by forcing transmissions roughly every minute in the absence of other activity. An error is then indicated if no response can be elicited on an otherwise idle connection for an extended period (e.g.\& 5 minutes). By default, a .Dv SIGPIPE signal is raised if a process sends on a broken stream, but this behavior may be inhibited via .Xr setsockopt 2 . .Pp .Dv SOCK_SEQPACKET sockets employ the same system calls as .Dv SOCK_STREAM sockets. The only difference is that .Xr read 2 calls will return only the amount of data requested, and any remaining in the arriving packet will be discarded. .Pp .Dv SOCK_DGRAM and .Dv SOCK_RAW sockets allow sending of datagrams to correspondents named in .Xr send 2 calls. Datagrams are generally received with .Xr recvfrom 2 , which returns the next datagram with its return address. .Pp An .Xr fcntl 2 system call can be used to specify a process group to receive a .Dv SIGURG signal when the out-of-band data arrives. It may also enable non-blocking I/O and asynchronous notification of I/O events via .Dv SIGIO . .Pp The operation of sockets is controlled by socket level .Em options . These options are defined in the file .In sys/socket.h . The .Xr setsockopt 2 and .Xr getsockopt 2 system calls are used to set and get options, respectively. .Sh RETURN VALUES A -1 is returned if an error occurs, otherwise the return value is a descriptor referencing the socket. .Sh ERRORS The .Fn socket system call fails if: .Bl -tag -width Er .It Bq Er EACCES Permission to create a socket of the specified type and/or protocol is denied. .It Bq Er EAFNOSUPPORT The address family (domain) is not supported or the specified domain is not supported by this protocol family. .It Bq Er EMFILE The per-process descriptor table is full. .It Bq Er ENFILE The system file table is full. .It Bq Er ENOBUFS Insufficient buffer space is available. The socket cannot be created until sufficient resources are freed. .It Bq Er EPERM User has insufficient privileges to carry out the requested operation. .It Bq Er EPROTONOSUPPORT The protocol type or the specified protocol is not supported within this domain. .It Bq Er EPROTOTYPE The socket type is not supported by the protocol. .El .Sh SEE ALSO .Xr accept 2 , .Xr bind 2 , .Xr connect 2 , +.Xr divert 4 , .Xr getpeername 2 , .Xr getsockname 2 , .Xr getsockopt 2 , .Xr ioctl 2 , .Xr ip 4 , .Xr ip6 4 , .Xr listen 2 , .Xr read 2 , .Xr recv 2 , .Xr select 2 , .Xr send 2 , .Xr shutdown 2 , .Xr socketpair 2 , .Xr write 2 , .Xr CMSG_DATA 3 , .Xr getprotoent 3 , .Xr netgraph 4 , .Xr protocols 5 .Rs .%T "An Introductory 4.3 BSD Interprocess Communication Tutorial" .%B PS1 .%N 7 .Re .Rs .%T "BSD Interprocess Communication Tutorial" .%B PS1 .%N 8 .Re .Sh STANDARDS The .Fn socket function conforms to .St -p1003.1-2008 . The .Tn POSIX standard specifies only the .Dv AF_INET , .Dv AF_INET6 , and .Dv AF_UNIX constants for address families, and requires the use of .Dv AF_* constants for the .Fa domain argument of .Fn socket . The .Dv SOCK_CLOEXEC flag is expected to conform to the next revision of the .Tn POSIX standard. The .Dv SOCK_RDM .Fa type , the .Dv PF_* constants, and other address families are .Fx extensions. .Sh HISTORY The .Fn socket system call appeared in .Bx 4.2 . diff --git a/share/examples/netgraph/ngctl b/share/examples/netgraph/ngctl index e7b7cd86b04f..8dc6b23815b7 100644 --- a/share/examples/netgraph/ngctl +++ b/share/examples/netgraph/ngctl @@ -1,173 +1,173 @@ # $FreeBSD$ # # This is an example that shows how to send ASCII formatted control # messages to a node using ngctl(8). # # What we will do here create a divert(4) tap. This simply dumps # out all packets diverted by some ipfw(8) divert rule to the console. # # Lines that begin with ``$'' (shell prompt) or ``+'' (ngctl prompt) # indicate user input # # First, start up ngctl in interactive mode: $ ngctl Available commands: connect Connects hook of the node at to debug Get/set debugging verbosity level help Show command summary or get more help on a specific command list Show information about all nodes mkpeer Create and connect a new node to the node at "path" msg Send a netgraph control message to the node at "path" name Assign name to the node at read Read and execute commands from a file rmhook Disconnect hook "hook" of the node at "path" show Show information about the node at shutdown Shutdown the node at status Get human readable status information from the node at types Show information about all installed node types quit Exit program + -# Now let's create a ng_ksocket(4) node, in the family PF_INET, -# of type SOCK_RAW, and protocol IPPROTO_DIVERT: +# Now let's create a ng_ksocket(4) node, in the family PF_DIVERT, +# of type SOCK_RAW: - + mkpeer ksocket foo inet/raw/divert + + mkpeer ksocket foo divert/raw/0 # Note that ``foo'' is the hook name on the socket node, which can be # anything. The ``inet/raw/divert'' is the hook name on the ksocket # node, which tells it what kind of socket to create. # Lets give our ksocket node a global name. How about ``fred'': + name foo fred # Note that we used ngctl's ``name'' command to do this. However, # the following manually constructed netgraph message would have # accomplished the exact same thing: + msg foo name { name="fred" } # Here we are using the ASCII <-> binary control message conversion # routines. ngctl does this for us automatically when we use the # ``msg'' command. # Now lets bind the socket associated with the ksocket node to a port # supplied by the system. We do this by sending the ksocket node a # ``bind'' control message. Again, ngctl does the conversion of the # control message from ASCII to binary behind the scenes. + msg fred: bind inet/192.168.1.1 # The ksocket accepts arbitrary sockaddr structures, but also has # special support for the PF_LOCAL and PF_INET protocol families. # That is why we can specify the struct sockaddr argument to the # ``bind'' command as ``inet/192.168.1.1'' (since we didn't specify # a port number, it's assumed to be zero). We could have also # relied on the generic sockaddr syntax and instead said this: + msg fred: bind { family=2 len=16 data=[ 2=192 168 1 1 ] } # This is what you would have to do for protocol families other # that PF_INET and PF_LOCAL, at least until special handling for # new ones is added. # The reason for the ``2=192'' is to skip the two byte IP port number, # which causes it to be set to zero, the default value for integral # types when parsing. Now since we didn't ask for a specific port # number, we need to do a ``getname'' to see what port number we got: + msg fred: getname Rec'd response "getname" (5) from "fred:": Args: inet/192.168.1.1:1029 # As soon as we sent the message, we got back a response. Here # ngctl is telling us that it received a control message with the # NGF_RESP (response) flag set, the response was to a prior ``getname'' # control message, that the originator was the node addressable # as ``fred:''. The message arguments field is then displayed to # us in its ASCII form. In this case, what we get back is a struct # sockaddr, and there we see that our port number is 1029. # So now let's add the ipfw divert rule for whatever packets we # want to see. How about anything from 192.168.1.129. + ^Z Suspended $ ipfw add 100 divert 1029 ip from 192.168.1.129 to any 00100 divert 1029 ip from 192.168.1.129 to any $ fg # Now watch what happens when we try to ping from that machine: + Rec'd data packet on hook "foo": 0000: 45 00 00 3c 57 00 00 00 20 01 bf ee c0 a8 01 81 E.. __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_kern_tls.h" #include "opt_sctp.h" #include #include #include #include #include #include #include #include #include #include #include #include /* for struct knote */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef COMPAT_FREEBSD32 #include #include #include #endif static int soreceive_rcvoob(struct socket *so, struct uio *uio, int flags); static void so_rdknl_lock(void *); static void so_rdknl_unlock(void *); static void so_rdknl_assert_lock(void *, int); static void so_wrknl_lock(void *); static void so_wrknl_unlock(void *); static void so_wrknl_assert_lock(void *, int); static void filt_sordetach(struct knote *kn); static int filt_soread(struct knote *kn, long hint); static void filt_sowdetach(struct knote *kn); static int filt_sowrite(struct knote *kn, long hint); static int filt_soempty(struct knote *kn, long hint); static int inline hhook_run_socket(struct socket *so, void *hctx, int32_t h_id); fo_kqfilter_t soo_kqfilter; static struct filterops soread_filtops = { .f_isfd = 1, .f_detach = filt_sordetach, .f_event = filt_soread, }; static struct filterops sowrite_filtops = { .f_isfd = 1, .f_detach = filt_sowdetach, .f_event = filt_sowrite, }; static struct filterops soempty_filtops = { .f_isfd = 1, .f_detach = filt_sowdetach, .f_event = filt_soempty, }; so_gen_t so_gencnt; /* generation count for sockets */ MALLOC_DEFINE(M_SONAME, "soname", "socket name"); MALLOC_DEFINE(M_PCB, "pcb", "protocol control block"); #define VNET_SO_ASSERT(so) \ VNET_ASSERT(curvnet != NULL, \ ("%s:%d curvnet is NULL, so=%p", __func__, __LINE__, (so))); VNET_DEFINE(struct hhook_head *, socket_hhh[HHOOK_SOCKET_LAST + 1]); #define V_socket_hhh VNET(socket_hhh) /* * Limit on the number of connections in the listen queue waiting * for accept(2). * NB: The original sysctl somaxconn is still available but hidden * to prevent confusion about the actual purpose of this number. */ static u_int somaxconn = SOMAXCONN; static int sysctl_somaxconn(SYSCTL_HANDLER_ARGS) { int error; int val; val = somaxconn; error = sysctl_handle_int(oidp, &val, 0, req); if (error || !req->newptr ) return (error); /* * The purpose of the UINT_MAX / 3 limit, is so that the formula * 3 * so_qlimit / 2 * below, will not overflow. */ if (val < 1 || val > UINT_MAX / 3) return (EINVAL); somaxconn = val; return (0); } SYSCTL_PROC(_kern_ipc, OID_AUTO, soacceptqueue, CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, sizeof(int), sysctl_somaxconn, "I", "Maximum listen socket pending connection accept queue size"); SYSCTL_PROC(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_SKIP | CTLFLAG_MPSAFE, 0, sizeof(int), sysctl_somaxconn, "I", "Maximum listen socket pending connection accept queue size (compat)"); static int numopensockets; SYSCTL_INT(_kern_ipc, OID_AUTO, numopensockets, CTLFLAG_RD, &numopensockets, 0, "Number of open sockets"); /* * accept_mtx locks down per-socket fields relating to accept queues. See * socketvar.h for an annotation of the protected fields of struct socket. */ struct mtx accept_mtx; MTX_SYSINIT(accept_mtx, &accept_mtx, "accept", MTX_DEF); /* * so_global_mtx protects so_gencnt, numopensockets, and the per-socket * so_gencnt field. */ static struct mtx so_global_mtx; MTX_SYSINIT(so_global_mtx, &so_global_mtx, "so_glabel", MTX_DEF); /* * General IPC sysctl name space, used by sockets and a variety of other IPC * types. */ SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "IPC"); /* * Initialize the socket subsystem and set up the socket * memory allocator. */ static uma_zone_t socket_zone; int maxsockets; static void socket_zone_change(void *tag) { maxsockets = uma_zone_set_max(socket_zone, maxsockets); } static void socket_hhook_register(int subtype) { if (hhook_head_register(HHOOK_TYPE_SOCKET, subtype, &V_socket_hhh[subtype], HHOOK_NOWAIT|HHOOK_HEADISINVNET) != 0) printf("%s: WARNING: unable to register hook\n", __func__); } static void socket_hhook_deregister(int subtype) { if (hhook_head_deregister(V_socket_hhh[subtype]) != 0) printf("%s: WARNING: unable to deregister hook\n", __func__); } static void socket_init(void *tag) { socket_zone = uma_zcreate("socket", sizeof(struct socket), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); maxsockets = uma_zone_set_max(socket_zone, maxsockets); uma_zone_set_warning(socket_zone, "kern.ipc.maxsockets limit reached"); EVENTHANDLER_REGISTER(maxsockets_change, socket_zone_change, NULL, EVENTHANDLER_PRI_FIRST); } SYSINIT(socket, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, socket_init, NULL); static void socket_vnet_init(const void *unused __unused) { int i; /* We expect a contiguous range */ for (i = 0; i <= HHOOK_SOCKET_LAST; i++) socket_hhook_register(i); } VNET_SYSINIT(socket_vnet_init, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, socket_vnet_init, NULL); static void socket_vnet_uninit(const void *unused __unused) { int i; for (i = 0; i <= HHOOK_SOCKET_LAST; i++) socket_hhook_deregister(i); } VNET_SYSUNINIT(socket_vnet_uninit, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, socket_vnet_uninit, NULL); /* * Initialise maxsockets. This SYSINIT must be run after * tunable_mbinit(). */ static void init_maxsockets(void *ignored) { TUNABLE_INT_FETCH("kern.ipc.maxsockets", &maxsockets); maxsockets = imax(maxsockets, maxfiles); } SYSINIT(param, SI_SUB_TUNABLES, SI_ORDER_ANY, init_maxsockets, NULL); /* * Sysctl to get and set the maximum global sockets limit. Notify protocols * of the change so that they can update their dependent limits as required. */ static int sysctl_maxsockets(SYSCTL_HANDLER_ARGS) { int error, newmaxsockets; newmaxsockets = maxsockets; error = sysctl_handle_int(oidp, &newmaxsockets, 0, req); if (error == 0 && req->newptr && newmaxsockets != maxsockets) { if (newmaxsockets > maxsockets && newmaxsockets <= maxfiles) { maxsockets = newmaxsockets; EVENTHANDLER_INVOKE(maxsockets_change); } else error = EINVAL; } return (error); } SYSCTL_PROC(_kern_ipc, OID_AUTO, maxsockets, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, &maxsockets, 0, sysctl_maxsockets, "IU", "Maximum number of sockets available"); /* * Socket operation routines. These routines are called by the routines in * sys_socket.c or from a system process, and implement the semantics of * socket operations by switching out to the protocol specific routines. */ /* * Get a socket structure from our zone, and initialize it. Note that it * would probably be better to allocate socket and PCB at the same time, but * I'm not convinced that all the protocols can be easily modified to do * this. * * soalloc() returns a socket with a ref count of 0. */ static struct socket * soalloc(struct vnet *vnet) { struct socket *so; so = uma_zalloc(socket_zone, M_NOWAIT | M_ZERO); if (so == NULL) return (NULL); #ifdef MAC if (mac_socket_init(so, M_NOWAIT) != 0) { uma_zfree(socket_zone, so); return (NULL); } #endif if (khelp_init_osd(HELPER_CLASS_SOCKET, &so->osd)) { uma_zfree(socket_zone, so); return (NULL); } /* * The socket locking protocol allows to lock 2 sockets at a time, * however, the first one must be a listening socket. WITNESS lacks * a feature to change class of an existing lock, so we use DUPOK. */ mtx_init(&so->so_lock, "socket", NULL, MTX_DEF | MTX_DUPOK); mtx_init(&so->so_snd_mtx, "so_snd", NULL, MTX_DEF); mtx_init(&so->so_rcv_mtx, "so_rcv", NULL, MTX_DEF); so->so_rcv.sb_sel = &so->so_rdsel; so->so_snd.sb_sel = &so->so_wrsel; sx_init(&so->so_snd_sx, "so_snd_sx"); sx_init(&so->so_rcv_sx, "so_rcv_sx"); TAILQ_INIT(&so->so_snd.sb_aiojobq); TAILQ_INIT(&so->so_rcv.sb_aiojobq); TASK_INIT(&so->so_snd.sb_aiotask, 0, soaio_snd, so); TASK_INIT(&so->so_rcv.sb_aiotask, 0, soaio_rcv, so); #ifdef VIMAGE VNET_ASSERT(vnet != NULL, ("%s:%d vnet is NULL, so=%p", __func__, __LINE__, so)); so->so_vnet = vnet; #endif /* We shouldn't need the so_global_mtx */ if (hhook_run_socket(so, NULL, HHOOK_SOCKET_CREATE)) { /* Do we need more comprehensive error returns? */ uma_zfree(socket_zone, so); return (NULL); } mtx_lock(&so_global_mtx); so->so_gencnt = ++so_gencnt; ++numopensockets; #ifdef VIMAGE vnet->vnet_sockcnt++; #endif mtx_unlock(&so_global_mtx); return (so); } /* * Free the storage associated with a socket at the socket layer, tear down * locks, labels, etc. All protocol state is assumed already to have been * torn down (and possibly never set up) by the caller. */ void sodealloc(struct socket *so) { KASSERT(so->so_count == 0, ("sodealloc(): so_count %d", so->so_count)); KASSERT(so->so_pcb == NULL, ("sodealloc(): so_pcb != NULL")); mtx_lock(&so_global_mtx); so->so_gencnt = ++so_gencnt; --numopensockets; /* Could be below, but faster here. */ #ifdef VIMAGE VNET_ASSERT(so->so_vnet != NULL, ("%s:%d so_vnet is NULL, so=%p", __func__, __LINE__, so)); so->so_vnet->vnet_sockcnt--; #endif mtx_unlock(&so_global_mtx); #ifdef MAC mac_socket_destroy(so); #endif hhook_run_socket(so, NULL, HHOOK_SOCKET_CLOSE); khelp_destroy_osd(&so->osd); if (SOLISTENING(so)) { if (so->sol_accept_filter != NULL) accept_filt_setopt(so, NULL); } else { if (so->so_rcv.sb_hiwat) (void)chgsbsize(so->so_cred->cr_uidinfo, &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY); if (so->so_snd.sb_hiwat) (void)chgsbsize(so->so_cred->cr_uidinfo, &so->so_snd.sb_hiwat, 0, RLIM_INFINITY); sx_destroy(&so->so_snd_sx); sx_destroy(&so->so_rcv_sx); mtx_destroy(&so->so_snd_mtx); mtx_destroy(&so->so_rcv_mtx); } crfree(so->so_cred); mtx_destroy(&so->so_lock); uma_zfree(socket_zone, so); } /* * socreate returns a socket with a ref count of 1 and a file descriptor * reference. The socket should be closed with soclose(). */ int socreate(int dom, struct socket **aso, int type, int proto, struct ucred *cred, struct thread *td) { struct protosw *prp; struct socket *so; int error; + /* + * XXX: divert(4) historically abused PF_INET. Keep this compatibility + * shim until all applications have been updated. + */ + if (__predict_false(dom == PF_INET && type == SOCK_RAW && + proto == IPPROTO_DIVERT)) { + dom = PF_DIVERT; + printf("%s uses obsolete way to create divert(4) socket\n", + td->td_proc->p_comm); + } + if (proto) prp = pffindproto(dom, proto, type); else prp = pffindtype(dom, type); if (prp == NULL) { /* No support for domain. */ if (pffinddomain(dom) == NULL) return (EAFNOSUPPORT); /* No support for socket type. */ if (proto == 0 && type != 0) return (EPROTOTYPE); return (EPROTONOSUPPORT); } MPASS(prp->pr_attach); if (IN_CAPABILITY_MODE(td) && (prp->pr_flags & PR_CAPATTACH) == 0) return (ECAPMODE); if (prison_check_af(cred, prp->pr_domain->dom_family) != 0) return (EPROTONOSUPPORT); if (prp->pr_type != type) return (EPROTOTYPE); so = soalloc(CRED_TO_VNET(cred)); if (so == NULL) return (ENOBUFS); so->so_type = type; so->so_cred = crhold(cred); if ((prp->pr_domain->dom_family == PF_INET) || (prp->pr_domain->dom_family == PF_INET6) || (prp->pr_domain->dom_family == PF_ROUTE)) so->so_fibnum = td->td_proc->p_fibnum; else so->so_fibnum = 0; so->so_proto = prp; #ifdef MAC mac_socket_create(cred, so); #endif knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock, so_rdknl_assert_lock); knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock, so_wrknl_assert_lock); if ((prp->pr_flags & PR_SOCKBUF) == 0) { so->so_snd.sb_mtx = &so->so_snd_mtx; so->so_rcv.sb_mtx = &so->so_rcv_mtx; } /* * Auto-sizing of socket buffers is managed by the protocols and * the appropriate flags must be set in the pru_attach function. */ CURVNET_SET(so->so_vnet); error = prp->pr_attach(so, proto, td); CURVNET_RESTORE(); if (error) { sodealloc(so); return (error); } soref(so); *aso = so; return (0); } #ifdef REGRESSION static int regression_sonewconn_earlytest = 1; SYSCTL_INT(_regression, OID_AUTO, sonewconn_earlytest, CTLFLAG_RW, ®ression_sonewconn_earlytest, 0, "Perform early sonewconn limit test"); #endif static struct timeval overinterval = { 60, 0 }; SYSCTL_TIMEVAL_SEC(_kern_ipc, OID_AUTO, sooverinterval, CTLFLAG_RW, &overinterval, "Delay in seconds between warnings for listen socket overflows"); /* * When an attempt at a new connection is noted on a socket which supports * accept(2), the protocol has two options: * 1) Call legacy sonewconn() function, which would call protocol attach * method, same as used for socket(2). * 2) Call solisten_clone(), do attach that is specific to a cloned connection, * and then call solisten_enqueue(). * * Note: the ref count on the socket is 0 on return. */ struct socket * solisten_clone(struct socket *head) { struct sbuf descrsb; struct socket *so; int len, overcount; u_int qlen; const char localprefix[] = "local:"; char descrbuf[SUNPATHLEN + sizeof(localprefix)]; #if defined(INET6) char addrbuf[INET6_ADDRSTRLEN]; #elif defined(INET) char addrbuf[INET_ADDRSTRLEN]; #endif bool dolog, over; SOLISTEN_LOCK(head); over = (head->sol_qlen > 3 * head->sol_qlimit / 2); #ifdef REGRESSION if (regression_sonewconn_earlytest && over) { #else if (over) { #endif head->sol_overcount++; dolog = !!ratecheck(&head->sol_lastover, &overinterval); /* * If we're going to log, copy the overflow count and queue * length from the listen socket before dropping the lock. * Also, reset the overflow count. */ if (dolog) { overcount = head->sol_overcount; head->sol_overcount = 0; qlen = head->sol_qlen; } SOLISTEN_UNLOCK(head); if (dolog) { /* * Try to print something descriptive about the * socket for the error message. */ sbuf_new(&descrsb, descrbuf, sizeof(descrbuf), SBUF_FIXEDLEN); switch (head->so_proto->pr_domain->dom_family) { #if defined(INET) || defined(INET6) #ifdef INET case AF_INET: #endif #ifdef INET6 case AF_INET6: if (head->so_proto->pr_domain->dom_family == AF_INET6 || (sotoinpcb(head)->inp_inc.inc_flags & INC_ISIPV6)) { ip6_sprintf(addrbuf, &sotoinpcb(head)->inp_inc.inc6_laddr); sbuf_printf(&descrsb, "[%s]", addrbuf); } else #endif { #ifdef INET inet_ntoa_r( sotoinpcb(head)->inp_inc.inc_laddr, addrbuf); sbuf_cat(&descrsb, addrbuf); #endif } sbuf_printf(&descrsb, ":%hu (proto %u)", ntohs(sotoinpcb(head)->inp_inc.inc_lport), head->so_proto->pr_protocol); break; #endif /* INET || INET6 */ case AF_UNIX: sbuf_cat(&descrsb, localprefix); if (sotounpcb(head)->unp_addr != NULL) len = sotounpcb(head)->unp_addr->sun_len - offsetof(struct sockaddr_un, sun_path); else len = 0; if (len > 0) sbuf_bcat(&descrsb, sotounpcb(head)->unp_addr->sun_path, len); else sbuf_cat(&descrsb, "(unknown)"); break; } /* * If we can't print something more specific, at least * print the domain name. */ if (sbuf_finish(&descrsb) != 0 || sbuf_len(&descrsb) <= 0) { sbuf_clear(&descrsb); sbuf_cat(&descrsb, head->so_proto->pr_domain->dom_name ?: "unknown"); sbuf_finish(&descrsb); } KASSERT(sbuf_len(&descrsb) > 0, ("%s: sbuf creation failed", __func__)); /* * Preserve the historic listen queue overflow log * message, that starts with "sonewconn:". It has * been known to sysadmins for years and also test * sys/kern/sonewconn_overflow checks for it. */ if (head->so_cred == 0) { log(LOG_DEBUG, "sonewconn: pcb %p (%s): " "Listen queue overflow: %i already in " "queue awaiting acceptance (%d " "occurrences)\n", head->so_pcb, sbuf_data(&descrsb), qlen, overcount); } else { log(LOG_DEBUG, "sonewconn: pcb %p (%s): " "Listen queue overflow: " "%i already in queue awaiting acceptance " "(%d occurrences), euid %d, rgid %d, jail %s\n", head->so_pcb, sbuf_data(&descrsb), qlen, overcount, head->so_cred->cr_uid, head->so_cred->cr_rgid, head->so_cred->cr_prison ? head->so_cred->cr_prison->pr_name : "not_jailed"); } sbuf_delete(&descrsb); overcount = 0; } return (NULL); } SOLISTEN_UNLOCK(head); VNET_ASSERT(head->so_vnet != NULL, ("%s: so %p vnet is NULL", __func__, head)); so = soalloc(head->so_vnet); if (so == NULL) { log(LOG_DEBUG, "%s: pcb %p: New socket allocation failure: " "limit reached or out of memory\n", __func__, head->so_pcb); return (NULL); } so->so_listen = head; so->so_type = head->so_type; so->so_options = head->so_options & ~SO_ACCEPTCONN; so->so_linger = head->so_linger; so->so_state = head->so_state; so->so_fibnum = head->so_fibnum; so->so_proto = head->so_proto; so->so_cred = crhold(head->so_cred); #ifdef MAC mac_socket_newconn(head, so); #endif knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock, so_rdknl_assert_lock); knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock, so_wrknl_assert_lock); VNET_SO_ASSERT(head); if (soreserve(so, head->sol_sbsnd_hiwat, head->sol_sbrcv_hiwat)) { sodealloc(so); log(LOG_DEBUG, "%s: pcb %p: soreserve() failed\n", __func__, head->so_pcb); return (NULL); } so->so_rcv.sb_lowat = head->sol_sbrcv_lowat; so->so_snd.sb_lowat = head->sol_sbsnd_lowat; so->so_rcv.sb_timeo = head->sol_sbrcv_timeo; so->so_snd.sb_timeo = head->sol_sbsnd_timeo; so->so_rcv.sb_flags = head->sol_sbrcv_flags & SB_AUTOSIZE; so->so_snd.sb_flags = head->sol_sbsnd_flags & SB_AUTOSIZE; if ((so->so_proto->pr_flags & PR_SOCKBUF) == 0) { so->so_snd.sb_mtx = &so->so_snd_mtx; so->so_rcv.sb_mtx = &so->so_rcv_mtx; } return (so); } /* Connstatus may be 0, or SS_ISCONFIRMING, or SS_ISCONNECTED. */ struct socket * sonewconn(struct socket *head, int connstatus) { struct socket *so; if ((so = solisten_clone(head)) == NULL) return (NULL); if (so->so_proto->pr_attach(so, 0, NULL) != 0) { sodealloc(so); log(LOG_DEBUG, "%s: pcb %p: pr_attach() failed\n", __func__, head->so_pcb); return (NULL); } solisten_enqueue(so, connstatus); return (so); } /* * Enqueue socket cloned by solisten_clone() to the listen queue of the * listener it has been cloned from. */ void solisten_enqueue(struct socket *so, int connstatus) { struct socket *head = so->so_listen; MPASS(refcount_load(&so->so_count) == 0); refcount_init(&so->so_count, 1); SOLISTEN_LOCK(head); if (head->sol_accept_filter != NULL) connstatus = 0; so->so_state |= connstatus; soref(head); /* A socket on (in)complete queue refs head. */ if (connstatus) { TAILQ_INSERT_TAIL(&head->sol_comp, so, so_list); so->so_qstate = SQ_COMP; head->sol_qlen++; solisten_wakeup(head); /* unlocks */ } else { /* * Keep removing sockets from the head until there's room for * us to insert on the tail. In pre-locking revisions, this * was a simple if(), but as we could be racing with other * threads and soabort() requires dropping locks, we must * loop waiting for the condition to be true. */ while (head->sol_incqlen > head->sol_qlimit) { struct socket *sp; sp = TAILQ_FIRST(&head->sol_incomp); TAILQ_REMOVE(&head->sol_incomp, sp, so_list); head->sol_incqlen--; SOCK_LOCK(sp); sp->so_qstate = SQ_NONE; sp->so_listen = NULL; SOCK_UNLOCK(sp); sorele_locked(head); /* does SOLISTEN_UNLOCK, head stays */ soabort(sp); SOLISTEN_LOCK(head); } TAILQ_INSERT_TAIL(&head->sol_incomp, so, so_list); so->so_qstate = SQ_INCOMP; head->sol_incqlen++; SOLISTEN_UNLOCK(head); } } #if defined(SCTP) || defined(SCTP_SUPPORT) /* * Socket part of sctp_peeloff(). Detach a new socket from an * association. The new socket is returned with a reference. * * XXXGL: reduce copy-paste with solisten_clone(). */ struct socket * sopeeloff(struct socket *head) { struct socket *so; VNET_ASSERT(head->so_vnet != NULL, ("%s:%d so_vnet is NULL, head=%p", __func__, __LINE__, head)); so = soalloc(head->so_vnet); if (so == NULL) { log(LOG_DEBUG, "%s: pcb %p: New socket allocation failure: " "limit reached or out of memory\n", __func__, head->so_pcb); return (NULL); } so->so_type = head->so_type; so->so_options = head->so_options; so->so_linger = head->so_linger; so->so_state = (head->so_state & SS_NBIO) | SS_ISCONNECTED; so->so_fibnum = head->so_fibnum; so->so_proto = head->so_proto; so->so_cred = crhold(head->so_cred); #ifdef MAC mac_socket_newconn(head, so); #endif knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock, so_rdknl_assert_lock); knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock, so_wrknl_assert_lock); VNET_SO_ASSERT(head); if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat)) { sodealloc(so); log(LOG_DEBUG, "%s: pcb %p: soreserve() failed\n", __func__, head->so_pcb); return (NULL); } if ((*so->so_proto->pr_attach)(so, 0, NULL)) { sodealloc(so); log(LOG_DEBUG, "%s: pcb %p: pru_attach() failed\n", __func__, head->so_pcb); return (NULL); } so->so_rcv.sb_lowat = head->so_rcv.sb_lowat; so->so_snd.sb_lowat = head->so_snd.sb_lowat; so->so_rcv.sb_timeo = head->so_rcv.sb_timeo; so->so_snd.sb_timeo = head->so_snd.sb_timeo; so->so_rcv.sb_flags |= head->so_rcv.sb_flags & SB_AUTOSIZE; so->so_snd.sb_flags |= head->so_snd.sb_flags & SB_AUTOSIZE; soref(so); return (so); } #endif /* SCTP */ int sobind(struct socket *so, struct sockaddr *nam, struct thread *td) { int error; CURVNET_SET(so->so_vnet); error = so->so_proto->pr_bind(so, nam, td); CURVNET_RESTORE(); return (error); } int sobindat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td) { int error; CURVNET_SET(so->so_vnet); error = so->so_proto->pr_bindat(fd, so, nam, td); CURVNET_RESTORE(); return (error); } /* * solisten() transitions a socket from a non-listening state to a listening * state, but can also be used to update the listen queue depth on an * existing listen socket. The protocol will call back into the sockets * layer using solisten_proto_check() and solisten_proto() to check and set * socket-layer listen state. Call backs are used so that the protocol can * acquire both protocol and socket layer locks in whatever order is required * by the protocol. * * Protocol implementors are advised to hold the socket lock across the * socket-layer test and set to avoid races at the socket layer. */ int solisten(struct socket *so, int backlog, struct thread *td) { int error; CURVNET_SET(so->so_vnet); error = so->so_proto->pr_listen(so, backlog, td); CURVNET_RESTORE(); return (error); } /* * Prepare for a call to solisten_proto(). Acquire all socket buffer locks in * order to interlock with socket I/O. */ int solisten_proto_check(struct socket *so) { SOCK_LOCK_ASSERT(so); if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) != 0) return (EINVAL); /* * Sleeping is not permitted here, so simply fail if userspace is * attempting to transmit or receive on the socket. This kind of * transient failure is not ideal, but it should occur only if userspace * is misusing the socket interfaces. */ if (!sx_try_xlock(&so->so_snd_sx)) return (EAGAIN); if (!sx_try_xlock(&so->so_rcv_sx)) { sx_xunlock(&so->so_snd_sx); return (EAGAIN); } mtx_lock(&so->so_snd_mtx); mtx_lock(&so->so_rcv_mtx); /* Interlock with soo_aio_queue(). */ if (!SOLISTENING(so) && ((so->so_snd.sb_flags & (SB_AIO | SB_AIO_RUNNING)) != 0 || (so->so_rcv.sb_flags & (SB_AIO | SB_AIO_RUNNING)) != 0)) { solisten_proto_abort(so); return (EINVAL); } return (0); } /* * Undo the setup done by solisten_proto_check(). */ void solisten_proto_abort(struct socket *so) { mtx_unlock(&so->so_snd_mtx); mtx_unlock(&so->so_rcv_mtx); sx_xunlock(&so->so_snd_sx); sx_xunlock(&so->so_rcv_sx); } void solisten_proto(struct socket *so, int backlog) { int sbrcv_lowat, sbsnd_lowat; u_int sbrcv_hiwat, sbsnd_hiwat; short sbrcv_flags, sbsnd_flags; sbintime_t sbrcv_timeo, sbsnd_timeo; SOCK_LOCK_ASSERT(so); KASSERT((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) == 0, ("%s: bad socket state %p", __func__, so)); if (SOLISTENING(so)) goto listening; /* * Change this socket to listening state. */ sbrcv_lowat = so->so_rcv.sb_lowat; sbsnd_lowat = so->so_snd.sb_lowat; sbrcv_hiwat = so->so_rcv.sb_hiwat; sbsnd_hiwat = so->so_snd.sb_hiwat; sbrcv_flags = so->so_rcv.sb_flags; sbsnd_flags = so->so_snd.sb_flags; sbrcv_timeo = so->so_rcv.sb_timeo; sbsnd_timeo = so->so_snd.sb_timeo; sbdestroy(so, SO_SND); sbdestroy(so, SO_RCV); #ifdef INVARIANTS bzero(&so->so_rcv, sizeof(struct socket) - offsetof(struct socket, so_rcv)); #endif so->sol_sbrcv_lowat = sbrcv_lowat; so->sol_sbsnd_lowat = sbsnd_lowat; so->sol_sbrcv_hiwat = sbrcv_hiwat; so->sol_sbsnd_hiwat = sbsnd_hiwat; so->sol_sbrcv_flags = sbrcv_flags; so->sol_sbsnd_flags = sbsnd_flags; so->sol_sbrcv_timeo = sbrcv_timeo; so->sol_sbsnd_timeo = sbsnd_timeo; so->sol_qlen = so->sol_incqlen = 0; TAILQ_INIT(&so->sol_incomp); TAILQ_INIT(&so->sol_comp); so->sol_accept_filter = NULL; so->sol_accept_filter_arg = NULL; so->sol_accept_filter_str = NULL; so->sol_upcall = NULL; so->sol_upcallarg = NULL; so->so_options |= SO_ACCEPTCONN; listening: if (backlog < 0 || backlog > somaxconn) backlog = somaxconn; so->sol_qlimit = backlog; mtx_unlock(&so->so_snd_mtx); mtx_unlock(&so->so_rcv_mtx); sx_xunlock(&so->so_snd_sx); sx_xunlock(&so->so_rcv_sx); } /* * Wakeup listeners/subsystems once we have a complete connection. * Enters with lock, returns unlocked. */ void solisten_wakeup(struct socket *sol) { if (sol->sol_upcall != NULL) (void )sol->sol_upcall(sol, sol->sol_upcallarg, M_NOWAIT); else { selwakeuppri(&sol->so_rdsel, PSOCK); KNOTE_LOCKED(&sol->so_rdsel.si_note, 0); } SOLISTEN_UNLOCK(sol); wakeup_one(&sol->sol_comp); if ((sol->so_state & SS_ASYNC) && sol->so_sigio != NULL) pgsigio(&sol->so_sigio, SIGIO, 0); } /* * Return single connection off a listening socket queue. Main consumer of * the function is kern_accept4(). Some modules, that do their own accept * management also use the function. The socket reference held by the * listen queue is handed to the caller. * * Listening socket must be locked on entry and is returned unlocked on * return. * The flags argument is set of accept4(2) flags and ACCEPT4_INHERIT. */ int solisten_dequeue(struct socket *head, struct socket **ret, int flags) { struct socket *so; int error; SOLISTEN_LOCK_ASSERT(head); while (!(head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->sol_comp) && head->so_error == 0) { error = msleep(&head->sol_comp, SOCK_MTX(head), PSOCK | PCATCH, "accept", 0); if (error != 0) { SOLISTEN_UNLOCK(head); return (error); } } if (head->so_error) { error = head->so_error; head->so_error = 0; } else if ((head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->sol_comp)) error = EWOULDBLOCK; else error = 0; if (error) { SOLISTEN_UNLOCK(head); return (error); } so = TAILQ_FIRST(&head->sol_comp); SOCK_LOCK(so); KASSERT(so->so_qstate == SQ_COMP, ("%s: so %p not SQ_COMP", __func__, so)); head->sol_qlen--; so->so_qstate = SQ_NONE; so->so_listen = NULL; TAILQ_REMOVE(&head->sol_comp, so, so_list); if (flags & ACCEPT4_INHERIT) so->so_state |= (head->so_state & SS_NBIO); else so->so_state |= (flags & SOCK_NONBLOCK) ? SS_NBIO : 0; SOCK_UNLOCK(so); sorele_locked(head); *ret = so; return (0); } /* * Free socket upon release of the very last reference. */ static void sofree(struct socket *so) { struct protosw *pr = so->so_proto; SOCK_LOCK_ASSERT(so); KASSERT(refcount_load(&so->so_count) == 0, ("%s: so %p has references", __func__, so)); KASSERT(SOLISTENING(so) || so->so_qstate == SQ_NONE, ("%s: so %p is on listen queue", __func__, so)); SOCK_UNLOCK(so); if (so->so_dtor != NULL) so->so_dtor(so); VNET_SO_ASSERT(so); if ((pr->pr_flags & PR_RIGHTS) && !SOLISTENING(so)) { MPASS(pr->pr_domain->dom_dispose != NULL); (*pr->pr_domain->dom_dispose)(so); } if (pr->pr_detach != NULL) pr->pr_detach(so); /* * From this point on, we assume that no other references to this * socket exist anywhere else in the stack. Therefore, no locks need * to be acquired or held. */ if (!(pr->pr_flags & PR_SOCKBUF) && !SOLISTENING(so)) { sbdestroy(so, SO_SND); sbdestroy(so, SO_RCV); } seldrain(&so->so_rdsel); seldrain(&so->so_wrsel); knlist_destroy(&so->so_rdsel.si_note); knlist_destroy(&so->so_wrsel.si_note); sodealloc(so); } /* * Release a reference on a socket while holding the socket lock. * Unlocks the socket lock before returning. */ void sorele_locked(struct socket *so) { SOCK_LOCK_ASSERT(so); if (refcount_release(&so->so_count)) sofree(so); else SOCK_UNLOCK(so); } /* * Close a socket on last file table reference removal. Initiate disconnect * if connected. Free socket when disconnect complete. * * This function will sorele() the socket. Note that soclose() may be called * prior to the ref count reaching zero. The actual socket structure will * not be freed until the ref count reaches zero. */ int soclose(struct socket *so) { struct accept_queue lqueue; int error = 0; bool listening, last __diagused; CURVNET_SET(so->so_vnet); funsetown(&so->so_sigio); if (so->so_state & SS_ISCONNECTED) { if ((so->so_state & SS_ISDISCONNECTING) == 0) { error = sodisconnect(so); if (error) { if (error == ENOTCONN) error = 0; goto drop; } } if ((so->so_options & SO_LINGER) != 0 && so->so_linger != 0) { if ((so->so_state & SS_ISDISCONNECTING) && (so->so_state & SS_NBIO)) goto drop; while (so->so_state & SS_ISCONNECTED) { error = tsleep(&so->so_timeo, PSOCK | PCATCH, "soclos", so->so_linger * hz); if (error) break; } } } drop: if (so->so_proto->pr_close != NULL) so->so_proto->pr_close(so); SOCK_LOCK(so); if ((listening = SOLISTENING(so))) { struct socket *sp; TAILQ_INIT(&lqueue); TAILQ_SWAP(&lqueue, &so->sol_incomp, socket, so_list); TAILQ_CONCAT(&lqueue, &so->sol_comp, so_list); so->sol_qlen = so->sol_incqlen = 0; TAILQ_FOREACH(sp, &lqueue, so_list) { SOCK_LOCK(sp); sp->so_qstate = SQ_NONE; sp->so_listen = NULL; SOCK_UNLOCK(sp); last = refcount_release(&so->so_count); KASSERT(!last, ("%s: released last reference for %p", __func__, so)); } } sorele_locked(so); if (listening) { struct socket *sp, *tsp; TAILQ_FOREACH_SAFE(sp, &lqueue, so_list, tsp) soabort(sp); } CURVNET_RESTORE(); return (error); } /* * soabort() is used to abruptly tear down a connection, such as when a * resource limit is reached (listen queue depth exceeded), or if a listen * socket is closed while there are sockets waiting to be accepted. * * This interface is tricky, because it is called on an unreferenced socket, * and must be called only by a thread that has actually removed the socket * from the listen queue it was on. Likely this thread holds the last * reference on the socket and soabort() will proceed with sofree(). But * it might be not the last, as the sockets on the listen queues are seen * from the protocol side. * * This interface will call into the protocol code, so must not be called * with any socket locks held. Protocols do call it while holding their own * recursible protocol mutexes, but this is something that should be subject * to review in the future. * * Usually socket should have a single reference left, but this is not a * requirement. In the past, when we have had named references for file * descriptor and protocol, we asserted that none of them are being held. */ void soabort(struct socket *so) { VNET_SO_ASSERT(so); if (so->so_proto->pr_abort != NULL) so->so_proto->pr_abort(so); SOCK_LOCK(so); sorele_locked(so); } int soaccept(struct socket *so, struct sockaddr **nam) { int error; CURVNET_SET(so->so_vnet); error = so->so_proto->pr_accept(so, nam); CURVNET_RESTORE(); return (error); } int soconnect(struct socket *so, struct sockaddr *nam, struct thread *td) { return (soconnectat(AT_FDCWD, so, nam, td)); } int soconnectat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td) { int error; CURVNET_SET(so->so_vnet); /* * If protocol is connection-based, can only connect once. * Otherwise, if connected, try to disconnect first. This allows * user to disconnect by connecting to, e.g., a null address. */ if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) && ((so->so_proto->pr_flags & PR_CONNREQUIRED) || (error = sodisconnect(so)))) { error = EISCONN; } else { /* * Prevent accumulated error from previous connection from * biting us. */ so->so_error = 0; if (fd == AT_FDCWD) { error = so->so_proto->pr_connect(so, nam, td); } else { error = so->so_proto->pr_connectat(fd, so, nam, td); } } CURVNET_RESTORE(); return (error); } int soconnect2(struct socket *so1, struct socket *so2) { int error; CURVNET_SET(so1->so_vnet); error = so1->so_proto->pr_connect2(so1, so2); CURVNET_RESTORE(); return (error); } int sodisconnect(struct socket *so) { int error; if ((so->so_state & SS_ISCONNECTED) == 0) return (ENOTCONN); if (so->so_state & SS_ISDISCONNECTING) return (EALREADY); VNET_SO_ASSERT(so); error = so->so_proto->pr_disconnect(so); return (error); } int sosend_dgram(struct socket *so, struct sockaddr *addr, struct uio *uio, struct mbuf *top, struct mbuf *control, int flags, struct thread *td) { long space; ssize_t resid; int clen = 0, error, dontroute; KASSERT(so->so_type == SOCK_DGRAM, ("sosend_dgram: !SOCK_DGRAM")); KASSERT(so->so_proto->pr_flags & PR_ATOMIC, ("sosend_dgram: !PR_ATOMIC")); if (uio != NULL) resid = uio->uio_resid; else resid = top->m_pkthdr.len; /* * In theory resid should be unsigned. However, space must be * signed, as it might be less than 0 if we over-committed, and we * must use a signed comparison of space and resid. On the other * hand, a negative resid causes us to loop sending 0-length * segments to the protocol. */ if (resid < 0) { error = EINVAL; goto out; } dontroute = (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0; if (td != NULL) td->td_ru.ru_msgsnd++; if (control != NULL) clen = control->m_len; SOCKBUF_LOCK(&so->so_snd); if (so->so_snd.sb_state & SBS_CANTSENDMORE) { SOCKBUF_UNLOCK(&so->so_snd); error = EPIPE; goto out; } if (so->so_error) { error = so->so_error; so->so_error = 0; SOCKBUF_UNLOCK(&so->so_snd); goto out; } if ((so->so_state & SS_ISCONNECTED) == 0) { /* * `sendto' and `sendmsg' is allowed on a connection-based * socket if it supports implied connect. Return ENOTCONN if * not connected and no address is supplied. */ if ((so->so_proto->pr_flags & PR_CONNREQUIRED) && (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) { if ((so->so_state & SS_ISCONFIRMING) == 0 && !(resid == 0 && clen != 0)) { SOCKBUF_UNLOCK(&so->so_snd); error = ENOTCONN; goto out; } } else if (addr == NULL) { if (so->so_proto->pr_flags & PR_CONNREQUIRED) error = ENOTCONN; else error = EDESTADDRREQ; SOCKBUF_UNLOCK(&so->so_snd); goto out; } } /* * Do we need MSG_OOB support in SOCK_DGRAM? Signs here may be a * problem and need fixing. */ space = sbspace(&so->so_snd); if (flags & MSG_OOB) space += 1024; space -= clen; SOCKBUF_UNLOCK(&so->so_snd); if (resid > space) { error = EMSGSIZE; goto out; } if (uio == NULL) { resid = 0; if (flags & MSG_EOR) top->m_flags |= M_EOR; } else { /* * Copy the data from userland into a mbuf chain. * If no data is to be copied in, a single empty mbuf * is returned. */ top = m_uiotombuf(uio, M_WAITOK, space, max_hdr, (M_PKTHDR | ((flags & MSG_EOR) ? M_EOR : 0))); if (top == NULL) { error = EFAULT; /* only possible error */ goto out; } space -= resid - uio->uio_resid; resid = uio->uio_resid; } KASSERT(resid == 0, ("sosend_dgram: resid != 0")); /* * XXXRW: Frobbing SO_DONTROUTE here is even worse without sblock * than with. */ if (dontroute) { SOCK_LOCK(so); so->so_options |= SO_DONTROUTE; SOCK_UNLOCK(so); } /* * XXX all the SBS_CANTSENDMORE checks previously done could be out * of date. We could have received a reset packet in an interrupt or * maybe we slept while doing page faults in uiomove() etc. We could * probably recheck again inside the locking protection here, but * there are probably other places that this also happens. We must * rethink this. */ VNET_SO_ASSERT(so); error = so->so_proto->pr_send(so, (flags & MSG_OOB) ? PRUS_OOB : /* * If the user set MSG_EOF, the protocol understands this flag and * nothing left to send then use PRU_SEND_EOF instead of PRU_SEND. */ ((flags & MSG_EOF) && (so->so_proto->pr_flags & PR_IMPLOPCL) && (resid <= 0)) ? PRUS_EOF : /* If there is more to send set PRUS_MORETOCOME */ (flags & MSG_MORETOCOME) || (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0, top, addr, control, td); if (dontroute) { SOCK_LOCK(so); so->so_options &= ~SO_DONTROUTE; SOCK_UNLOCK(so); } clen = 0; control = NULL; top = NULL; out: if (top != NULL) m_freem(top); if (control != NULL) m_freem(control); return (error); } /* * Send on a socket. If send must go all at once and message is larger than * send buffering, then hard error. Lock against other senders. If must go * all at once and not enough room now, then inform user that this would * block and do nothing. Otherwise, if nonblocking, send as much as * possible. The data to be sent is described by "uio" if nonzero, otherwise * by the mbuf chain "top" (which must be null if uio is not). Data provided * in mbuf chain must be small enough to send all at once. * * Returns nonzero on error, timeout or signal; callers must check for short * counts if EINTR/ERESTART are returned. Data and control buffers are freed * on return. */ int sosend_generic(struct socket *so, struct sockaddr *addr, struct uio *uio, struct mbuf *top, struct mbuf *control, int flags, struct thread *td) { long space; ssize_t resid; int clen = 0, error, dontroute; int atomic = sosendallatonce(so) || top; int pr_send_flag; #ifdef KERN_TLS struct ktls_session *tls; int tls_enq_cnt, tls_send_flag; uint8_t tls_rtype; tls = NULL; tls_rtype = TLS_RLTYPE_APP; #endif if (uio != NULL) resid = uio->uio_resid; else if ((top->m_flags & M_PKTHDR) != 0) resid = top->m_pkthdr.len; else resid = m_length(top, NULL); /* * In theory resid should be unsigned. However, space must be * signed, as it might be less than 0 if we over-committed, and we * must use a signed comparison of space and resid. On the other * hand, a negative resid causes us to loop sending 0-length * segments to the protocol. * * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM * type sockets since that's an error. */ if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) { error = EINVAL; goto out; } dontroute = (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 && (so->so_proto->pr_flags & PR_ATOMIC); if (td != NULL) td->td_ru.ru_msgsnd++; if (control != NULL) clen = control->m_len; error = SOCK_IO_SEND_LOCK(so, SBLOCKWAIT(flags)); if (error) goto out; #ifdef KERN_TLS tls_send_flag = 0; tls = ktls_hold(so->so_snd.sb_tls_info); if (tls != NULL) { if (tls->mode == TCP_TLS_MODE_SW) tls_send_flag = PRUS_NOTREADY; if (control != NULL) { struct cmsghdr *cm = mtod(control, struct cmsghdr *); if (clen >= sizeof(*cm) && cm->cmsg_type == TLS_SET_RECORD_TYPE) { tls_rtype = *((uint8_t *)CMSG_DATA(cm)); clen = 0; m_freem(control); control = NULL; atomic = 1; } } if (resid == 0 && !ktls_permit_empty_frames(tls)) { error = EINVAL; goto release; } } #endif restart: do { SOCKBUF_LOCK(&so->so_snd); if (so->so_snd.sb_state & SBS_CANTSENDMORE) { SOCKBUF_UNLOCK(&so->so_snd); error = EPIPE; goto release; } if (so->so_error) { error = so->so_error; so->so_error = 0; SOCKBUF_UNLOCK(&so->so_snd); goto release; } if ((so->so_state & SS_ISCONNECTED) == 0) { /* * `sendto' and `sendmsg' is allowed on a connection- * based socket if it supports implied connect. * Return ENOTCONN if not connected and no address is * supplied. */ if ((so->so_proto->pr_flags & PR_CONNREQUIRED) && (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) { if ((so->so_state & SS_ISCONFIRMING) == 0 && !(resid == 0 && clen != 0)) { SOCKBUF_UNLOCK(&so->so_snd); error = ENOTCONN; goto release; } } else if (addr == NULL) { SOCKBUF_UNLOCK(&so->so_snd); if (so->so_proto->pr_flags & PR_CONNREQUIRED) error = ENOTCONN; else error = EDESTADDRREQ; goto release; } } space = sbspace(&so->so_snd); if (flags & MSG_OOB) space += 1024; if ((atomic && resid > so->so_snd.sb_hiwat) || clen > so->so_snd.sb_hiwat) { SOCKBUF_UNLOCK(&so->so_snd); error = EMSGSIZE; goto release; } if (space < resid + clen && (atomic || space < so->so_snd.sb_lowat || space < clen)) { if ((so->so_state & SS_NBIO) || (flags & (MSG_NBIO | MSG_DONTWAIT)) != 0) { SOCKBUF_UNLOCK(&so->so_snd); error = EWOULDBLOCK; goto release; } error = sbwait(so, SO_SND); SOCKBUF_UNLOCK(&so->so_snd); if (error) goto release; goto restart; } SOCKBUF_UNLOCK(&so->so_snd); space -= clen; do { if (uio == NULL) { resid = 0; if (flags & MSG_EOR) top->m_flags |= M_EOR; #ifdef KERN_TLS if (tls != NULL) { ktls_frame(top, tls, &tls_enq_cnt, tls_rtype); tls_rtype = TLS_RLTYPE_APP; } #endif } else { /* * Copy the data from userland into a mbuf * chain. If resid is 0, which can happen * only if we have control to send, then * a single empty mbuf is returned. This * is a workaround to prevent protocol send * methods to panic. */ #ifdef KERN_TLS if (tls != NULL) { top = m_uiotombuf(uio, M_WAITOK, space, tls->params.max_frame_len, M_EXTPG | ((flags & MSG_EOR) ? M_EOR : 0)); if (top != NULL) { ktls_frame(top, tls, &tls_enq_cnt, tls_rtype); } tls_rtype = TLS_RLTYPE_APP; } else #endif top = m_uiotombuf(uio, M_WAITOK, space, (atomic ? max_hdr : 0), (atomic ? M_PKTHDR : 0) | ((flags & MSG_EOR) ? M_EOR : 0)); if (top == NULL) { error = EFAULT; /* only possible error */ goto release; } space -= resid - uio->uio_resid; resid = uio->uio_resid; } if (dontroute) { SOCK_LOCK(so); so->so_options |= SO_DONTROUTE; SOCK_UNLOCK(so); } /* * XXX all the SBS_CANTSENDMORE checks previously * done could be out of date. We could have received * a reset packet in an interrupt or maybe we slept * while doing page faults in uiomove() etc. We * could probably recheck again inside the locking * protection here, but there are probably other * places that this also happens. We must rethink * this. */ VNET_SO_ASSERT(so); pr_send_flag = (flags & MSG_OOB) ? PRUS_OOB : /* * If the user set MSG_EOF, the protocol understands * this flag and nothing left to send then use * PRU_SEND_EOF instead of PRU_SEND. */ ((flags & MSG_EOF) && (so->so_proto->pr_flags & PR_IMPLOPCL) && (resid <= 0)) ? PRUS_EOF : /* If there is more to send set PRUS_MORETOCOME. */ (flags & MSG_MORETOCOME) || (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0; #ifdef KERN_TLS pr_send_flag |= tls_send_flag; #endif error = so->so_proto->pr_send(so, pr_send_flag, top, addr, control, td); if (dontroute) { SOCK_LOCK(so); so->so_options &= ~SO_DONTROUTE; SOCK_UNLOCK(so); } #ifdef KERN_TLS if (tls != NULL && tls->mode == TCP_TLS_MODE_SW) { if (error != 0) { m_freem(top); top = NULL; } else { soref(so); ktls_enqueue(top, so, tls_enq_cnt); } } #endif clen = 0; control = NULL; top = NULL; if (error) goto release; } while (resid && space > 0); } while (resid); release: SOCK_IO_SEND_UNLOCK(so); out: #ifdef KERN_TLS if (tls != NULL) ktls_free(tls); #endif if (top != NULL) m_freem(top); if (control != NULL) m_freem(control); return (error); } int sosend(struct socket *so, struct sockaddr *addr, struct uio *uio, struct mbuf *top, struct mbuf *control, int flags, struct thread *td) { int error; CURVNET_SET(so->so_vnet); error = so->so_proto->pr_sosend(so, addr, uio, top, control, flags, td); CURVNET_RESTORE(); return (error); } /* * The part of soreceive() that implements reading non-inline out-of-band * data from a socket. For more complete comments, see soreceive(), from * which this code originated. * * Note that soreceive_rcvoob(), unlike the remainder of soreceive(), is * unable to return an mbuf chain to the caller. */ static int soreceive_rcvoob(struct socket *so, struct uio *uio, int flags) { struct protosw *pr = so->so_proto; struct mbuf *m; int error; KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0")); VNET_SO_ASSERT(so); m = m_get(M_WAITOK, MT_DATA); error = pr->pr_rcvoob(so, m, flags & MSG_PEEK); if (error) goto bad; do { error = uiomove(mtod(m, void *), (int) min(uio->uio_resid, m->m_len), uio); m = m_free(m); } while (uio->uio_resid && error == 0 && m); bad: if (m != NULL) m_freem(m); return (error); } /* * Following replacement or removal of the first mbuf on the first mbuf chain * of a socket buffer, push necessary state changes back into the socket * buffer so that other consumers see the values consistently. 'nextrecord' * is the callers locally stored value of the original value of * sb->sb_mb->m_nextpkt which must be restored when the lead mbuf changes. * NOTE: 'nextrecord' may be NULL. */ static __inline void sockbuf_pushsync(struct sockbuf *sb, struct mbuf *nextrecord) { SOCKBUF_LOCK_ASSERT(sb); /* * First, update for the new value of nextrecord. If necessary, make * it the first record. */ if (sb->sb_mb != NULL) sb->sb_mb->m_nextpkt = nextrecord; else sb->sb_mb = nextrecord; /* * Now update any dependent socket buffer fields to reflect the new * state. This is an expanded inline of SB_EMPTY_FIXUP(), with the * addition of a second clause that takes care of the case where * sb_mb has been updated, but remains the last record. */ if (sb->sb_mb == NULL) { sb->sb_mbtail = NULL; sb->sb_lastrecord = NULL; } else if (sb->sb_mb->m_nextpkt == NULL) sb->sb_lastrecord = sb->sb_mb; } /* * Implement receive operations on a socket. We depend on the way that * records are added to the sockbuf by sbappend. In particular, each record * (mbufs linked through m_next) must begin with an address if the protocol * so specifies, followed by an optional mbuf or mbufs containing ancillary * data, and then zero or more mbufs of data. In order to allow parallelism * between network receive and copying to user space, as well as avoid * sleeping with a mutex held, we release the socket buffer mutex during the * user space copy. Although the sockbuf is locked, new data may still be * appended, and thus we must maintain consistency of the sockbuf during that * time. * * The caller may receive the data as a single mbuf chain by supplying an * mbuf **mp0 for use in returning the chain. The uio is then used only for * the count in uio_resid. */ int soreceive_generic(struct socket *so, struct sockaddr **psa, struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp) { struct mbuf *m, **mp; int flags, error, offset; ssize_t len; struct protosw *pr = so->so_proto; struct mbuf *nextrecord; int moff, type = 0; ssize_t orig_resid = uio->uio_resid; bool report_real_len = false; mp = mp0; if (psa != NULL) *psa = NULL; if (controlp != NULL) *controlp = NULL; if (flagsp != NULL) { report_real_len = *flagsp & MSG_TRUNC; *flagsp &= ~MSG_TRUNC; flags = *flagsp &~ MSG_EOR; } else flags = 0; if (flags & MSG_OOB) return (soreceive_rcvoob(so, uio, flags)); if (mp != NULL) *mp = NULL; if ((pr->pr_flags & PR_WANTRCVD) && (so->so_state & SS_ISCONFIRMING) && uio->uio_resid) { VNET_SO_ASSERT(so); pr->pr_rcvd(so, 0); } error = SOCK_IO_RECV_LOCK(so, SBLOCKWAIT(flags)); if (error) return (error); restart: SOCKBUF_LOCK(&so->so_rcv); m = so->so_rcv.sb_mb; /* * If we have less data than requested, block awaiting more (subject * to any timeout) if: * 1. the current count is less than the low water mark, or * 2. MSG_DONTWAIT is not set */ if (m == NULL || (((flags & MSG_DONTWAIT) == 0 && sbavail(&so->so_rcv) < uio->uio_resid) && sbavail(&so->so_rcv) < so->so_rcv.sb_lowat && m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) { KASSERT(m != NULL || !sbavail(&so->so_rcv), ("receive: m == %p sbavail == %u", m, sbavail(&so->so_rcv))); if (so->so_error || so->so_rerror) { if (m != NULL) goto dontblock; if (so->so_error) error = so->so_error; else error = so->so_rerror; if ((flags & MSG_PEEK) == 0) { if (so->so_error) so->so_error = 0; else so->so_rerror = 0; } SOCKBUF_UNLOCK(&so->so_rcv); goto release; } SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { if (m != NULL) goto dontblock; #ifdef KERN_TLS else if (so->so_rcv.sb_tlsdcc == 0 && so->so_rcv.sb_tlscc == 0) { #else else { #endif SOCKBUF_UNLOCK(&so->so_rcv); goto release; } } for (; m != NULL; m = m->m_next) if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) { m = so->so_rcv.sb_mb; goto dontblock; } if ((so->so_state & (SS_ISCONNECTING | SS_ISCONNECTED | SS_ISDISCONNECTING | SS_ISDISCONNECTED)) == 0 && (so->so_proto->pr_flags & PR_CONNREQUIRED) != 0) { SOCKBUF_UNLOCK(&so->so_rcv); error = ENOTCONN; goto release; } if (uio->uio_resid == 0 && !report_real_len) { SOCKBUF_UNLOCK(&so->so_rcv); goto release; } if ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO))) { SOCKBUF_UNLOCK(&so->so_rcv); error = EWOULDBLOCK; goto release; } SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); error = sbwait(so, SO_RCV); SOCKBUF_UNLOCK(&so->so_rcv); if (error) goto release; goto restart; } dontblock: /* * From this point onward, we maintain 'nextrecord' as a cache of the * pointer to the next record in the socket buffer. We must keep the * various socket buffer pointers and local stack versions of the * pointers in sync, pushing out modifications before dropping the * socket buffer mutex, and re-reading them when picking it up. * * Otherwise, we will race with the network stack appending new data * or records onto the socket buffer by using inconsistent/stale * versions of the field, possibly resulting in socket buffer * corruption. * * By holding the high-level sblock(), we prevent simultaneous * readers from pulling off the front of the socket buffer. */ SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (uio->uio_td) uio->uio_td->td_ru.ru_msgrcv++; KASSERT(m == so->so_rcv.sb_mb, ("soreceive: m != so->so_rcv.sb_mb")); SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); nextrecord = m->m_nextpkt; if (pr->pr_flags & PR_ADDR) { KASSERT(m->m_type == MT_SONAME, ("m->m_type == %d", m->m_type)); orig_resid = 0; if (psa != NULL) *psa = sodupsockaddr(mtod(m, struct sockaddr *), M_NOWAIT); if (flags & MSG_PEEK) { m = m->m_next; } else { sbfree(&so->so_rcv, m); so->so_rcv.sb_mb = m_free(m); m = so->so_rcv.sb_mb; sockbuf_pushsync(&so->so_rcv, nextrecord); } } /* * Process one or more MT_CONTROL mbufs present before any data mbufs * in the first mbuf chain on the socket buffer. If MSG_PEEK, we * just copy the data; if !MSG_PEEK, we call into the protocol to * perform externalization (or freeing if controlp == NULL). */ if (m != NULL && m->m_type == MT_CONTROL) { struct mbuf *cm = NULL, *cmn; struct mbuf **cme = &cm; #ifdef KERN_TLS struct cmsghdr *cmsg; struct tls_get_record tgr; /* * For MSG_TLSAPPDATA, check for an alert record. * If found, return ENXIO without removing * it from the receive queue. This allows a subsequent * call without MSG_TLSAPPDATA to receive it. * Note that, for TLS, there should only be a single * control mbuf with the TLS_GET_RECORD message in it. */ if (flags & MSG_TLSAPPDATA) { cmsg = mtod(m, struct cmsghdr *); if (cmsg->cmsg_type == TLS_GET_RECORD && cmsg->cmsg_len == CMSG_LEN(sizeof(tgr))) { memcpy(&tgr, CMSG_DATA(cmsg), sizeof(tgr)); if (__predict_false(tgr.tls_type == TLS_RLTYPE_ALERT)) { SOCKBUF_UNLOCK(&so->so_rcv); error = ENXIO; goto release; } } } #endif do { if (flags & MSG_PEEK) { if (controlp != NULL) { *controlp = m_copym(m, 0, m->m_len, M_NOWAIT); controlp = &(*controlp)->m_next; } m = m->m_next; } else { sbfree(&so->so_rcv, m); so->so_rcv.sb_mb = m->m_next; m->m_next = NULL; *cme = m; cme = &(*cme)->m_next; m = so->so_rcv.sb_mb; } } while (m != NULL && m->m_type == MT_CONTROL); if ((flags & MSG_PEEK) == 0) sockbuf_pushsync(&so->so_rcv, nextrecord); while (cm != NULL) { cmn = cm->m_next; cm->m_next = NULL; if (pr->pr_domain->dom_externalize != NULL) { SOCKBUF_UNLOCK(&so->so_rcv); VNET_SO_ASSERT(so); error = (*pr->pr_domain->dom_externalize) (cm, controlp, flags); SOCKBUF_LOCK(&so->so_rcv); } else if (controlp != NULL) *controlp = cm; else m_freem(cm); if (controlp != NULL) { while (*controlp != NULL) controlp = &(*controlp)->m_next; } cm = cmn; } if (m != NULL) nextrecord = so->so_rcv.sb_mb->m_nextpkt; else nextrecord = so->so_rcv.sb_mb; orig_resid = 0; } if (m != NULL) { if ((flags & MSG_PEEK) == 0) { KASSERT(m->m_nextpkt == nextrecord, ("soreceive: post-control, nextrecord !sync")); if (nextrecord == NULL) { KASSERT(so->so_rcv.sb_mb == m, ("soreceive: post-control, sb_mb!=m")); KASSERT(so->so_rcv.sb_lastrecord == m, ("soreceive: post-control, lastrecord!=m")); } } type = m->m_type; if (type == MT_OOBDATA) flags |= MSG_OOB; } else { if ((flags & MSG_PEEK) == 0) { KASSERT(so->so_rcv.sb_mb == nextrecord, ("soreceive: sb_mb != nextrecord")); if (so->so_rcv.sb_mb == NULL) { KASSERT(so->so_rcv.sb_lastrecord == NULL, ("soreceive: sb_lastercord != NULL")); } } } SOCKBUF_LOCK_ASSERT(&so->so_rcv); SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); /* * Now continue to read any data mbufs off of the head of the socket * buffer until the read request is satisfied. Note that 'type' is * used to store the type of any mbuf reads that have happened so far * such that soreceive() can stop reading if the type changes, which * causes soreceive() to return only one of regular data and inline * out-of-band data in a single socket receive operation. */ moff = 0; offset = 0; while (m != NULL && !(m->m_flags & M_NOTAVAIL) && uio->uio_resid > 0 && error == 0) { /* * If the type of mbuf has changed since the last mbuf * examined ('type'), end the receive operation. */ SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (m->m_type == MT_OOBDATA || m->m_type == MT_CONTROL) { if (type != m->m_type) break; } else if (type == MT_OOBDATA) break; else KASSERT(m->m_type == MT_DATA, ("m->m_type == %d", m->m_type)); so->so_rcv.sb_state &= ~SBS_RCVATMARK; len = uio->uio_resid; if (so->so_oobmark && len > so->so_oobmark - offset) len = so->so_oobmark - offset; if (len > m->m_len - moff) len = m->m_len - moff; /* * If mp is set, just pass back the mbufs. Otherwise copy * them out via the uio, then free. Sockbuf must be * consistent here (points to current mbuf, it points to next * record) when we drop priority; we must note any additions * to the sockbuf when we block interrupts again. */ if (mp == NULL) { SOCKBUF_LOCK_ASSERT(&so->so_rcv); SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); SOCKBUF_UNLOCK(&so->so_rcv); if ((m->m_flags & M_EXTPG) != 0) error = m_unmapped_uiomove(m, moff, uio, (int)len); else error = uiomove(mtod(m, char *) + moff, (int)len, uio); SOCKBUF_LOCK(&so->so_rcv); if (error) { /* * The MT_SONAME mbuf has already been removed * from the record, so it is necessary to * remove the data mbufs, if any, to preserve * the invariant in the case of PR_ADDR that * requires MT_SONAME mbufs at the head of * each record. */ if (pr->pr_flags & PR_ATOMIC && ((flags & MSG_PEEK) == 0)) (void)sbdroprecord_locked(&so->so_rcv); SOCKBUF_UNLOCK(&so->so_rcv); goto release; } } else uio->uio_resid -= len; SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (len == m->m_len - moff) { if (m->m_flags & M_EOR) flags |= MSG_EOR; if (flags & MSG_PEEK) { m = m->m_next; moff = 0; } else { nextrecord = m->m_nextpkt; sbfree(&so->so_rcv, m); if (mp != NULL) { m->m_nextpkt = NULL; *mp = m; mp = &m->m_next; so->so_rcv.sb_mb = m = m->m_next; *mp = NULL; } else { so->so_rcv.sb_mb = m_free(m); m = so->so_rcv.sb_mb; } sockbuf_pushsync(&so->so_rcv, nextrecord); SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); } } else { if (flags & MSG_PEEK) moff += len; else { if (mp != NULL) { if (flags & MSG_DONTWAIT) { *mp = m_copym(m, 0, len, M_NOWAIT); if (*mp == NULL) { /* * m_copym() couldn't * allocate an mbuf. * Adjust uio_resid back * (it was adjusted * down by len bytes, * which we didn't end * up "copying" over). */ uio->uio_resid += len; break; } } else { SOCKBUF_UNLOCK(&so->so_rcv); *mp = m_copym(m, 0, len, M_WAITOK); SOCKBUF_LOCK(&so->so_rcv); } } sbcut_locked(&so->so_rcv, len); } } SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (so->so_oobmark) { if ((flags & MSG_PEEK) == 0) { so->so_oobmark -= len; if (so->so_oobmark == 0) { so->so_rcv.sb_state |= SBS_RCVATMARK; break; } } else { offset += len; if (offset == so->so_oobmark) break; } } if (flags & MSG_EOR) break; /* * If the MSG_WAITALL flag is set (for non-atomic socket), we * must not quit until "uio->uio_resid == 0" or an error * termination. If a signal/timeout occurs, return with a * short count but without error. Keep sockbuf locked * against other readers. */ while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 && !sosendallatonce(so) && nextrecord == NULL) { SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (so->so_error || so->so_rerror || so->so_rcv.sb_state & SBS_CANTRCVMORE) break; /* * Notify the protocol that some data has been * drained before blocking. */ if (pr->pr_flags & PR_WANTRCVD) { SOCKBUF_UNLOCK(&so->so_rcv); VNET_SO_ASSERT(so); pr->pr_rcvd(so, flags); SOCKBUF_LOCK(&so->so_rcv); } SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); /* * We could receive some data while was notifying * the protocol. Skip blocking in this case. */ if (so->so_rcv.sb_mb == NULL) { error = sbwait(so, SO_RCV); if (error) { SOCKBUF_UNLOCK(&so->so_rcv); goto release; } } m = so->so_rcv.sb_mb; if (m != NULL) nextrecord = m->m_nextpkt; } } SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (m != NULL && pr->pr_flags & PR_ATOMIC) { if (report_real_len) uio->uio_resid -= m_length(m, NULL) - moff; flags |= MSG_TRUNC; if ((flags & MSG_PEEK) == 0) (void) sbdroprecord_locked(&so->so_rcv); } if ((flags & MSG_PEEK) == 0) { if (m == NULL) { /* * First part is an inline SB_EMPTY_FIXUP(). Second * part makes sure sb_lastrecord is up-to-date if * there is still data in the socket buffer. */ so->so_rcv.sb_mb = nextrecord; if (so->so_rcv.sb_mb == NULL) { so->so_rcv.sb_mbtail = NULL; so->so_rcv.sb_lastrecord = NULL; } else if (nextrecord->m_nextpkt == NULL) so->so_rcv.sb_lastrecord = nextrecord; } SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); /* * If soreceive() is being done from the socket callback, * then don't need to generate ACK to peer to update window, * since ACK will be generated on return to TCP. */ if (!(flags & MSG_SOCALLBCK) && (pr->pr_flags & PR_WANTRCVD)) { SOCKBUF_UNLOCK(&so->so_rcv); VNET_SO_ASSERT(so); pr->pr_rcvd(so, flags); SOCKBUF_LOCK(&so->so_rcv); } } SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (orig_resid == uio->uio_resid && orig_resid && (flags & MSG_EOR) == 0 && (so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) { SOCKBUF_UNLOCK(&so->so_rcv); goto restart; } SOCKBUF_UNLOCK(&so->so_rcv); if (flagsp != NULL) *flagsp |= flags; release: SOCK_IO_RECV_UNLOCK(so); return (error); } /* * Optimized version of soreceive() for stream (TCP) sockets. */ int soreceive_stream(struct socket *so, struct sockaddr **psa, struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp) { int len = 0, error = 0, flags, oresid; struct sockbuf *sb; struct mbuf *m, *n = NULL; /* We only do stream sockets. */ if (so->so_type != SOCK_STREAM) return (EINVAL); if (psa != NULL) *psa = NULL; if (flagsp != NULL) flags = *flagsp &~ MSG_EOR; else flags = 0; if (controlp != NULL) *controlp = NULL; if (flags & MSG_OOB) return (soreceive_rcvoob(so, uio, flags)); if (mp0 != NULL) *mp0 = NULL; sb = &so->so_rcv; #ifdef KERN_TLS /* * KTLS store TLS records as records with a control message to * describe the framing. * * We check once here before acquiring locks to optimize the * common case. */ if (sb->sb_tls_info != NULL) return (soreceive_generic(so, psa, uio, mp0, controlp, flagsp)); #endif /* Prevent other readers from entering the socket. */ error = SOCK_IO_RECV_LOCK(so, SBLOCKWAIT(flags)); if (error) return (error); SOCKBUF_LOCK(sb); #ifdef KERN_TLS if (sb->sb_tls_info != NULL) { SOCKBUF_UNLOCK(sb); SOCK_IO_RECV_UNLOCK(so); return (soreceive_generic(so, psa, uio, mp0, controlp, flagsp)); } #endif /* Easy one, no space to copyout anything. */ if (uio->uio_resid == 0) { error = EINVAL; goto out; } oresid = uio->uio_resid; /* We will never ever get anything unless we are or were connected. */ if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) { error = ENOTCONN; goto out; } restart: SOCKBUF_LOCK_ASSERT(&so->so_rcv); /* Abort if socket has reported problems. */ if (so->so_error) { if (sbavail(sb) > 0) goto deliver; if (oresid > uio->uio_resid) goto out; error = so->so_error; if (!(flags & MSG_PEEK)) so->so_error = 0; goto out; } /* Door is closed. Deliver what is left, if any. */ if (sb->sb_state & SBS_CANTRCVMORE) { if (sbavail(sb) > 0) goto deliver; else goto out; } /* Socket buffer is empty and we shall not block. */ if (sbavail(sb) == 0 && ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)))) { error = EAGAIN; goto out; } /* Socket buffer got some data that we shall deliver now. */ if (sbavail(sb) > 0 && !(flags & MSG_WAITALL) && ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)) || sbavail(sb) >= sb->sb_lowat || sbavail(sb) >= uio->uio_resid || sbavail(sb) >= sb->sb_hiwat) ) { goto deliver; } /* On MSG_WAITALL we must wait until all data or error arrives. */ if ((flags & MSG_WAITALL) && (sbavail(sb) >= uio->uio_resid || sbavail(sb) >= sb->sb_hiwat)) goto deliver; /* * Wait and block until (more) data comes in. * NB: Drops the sockbuf lock during wait. */ error = sbwait(so, SO_RCV); if (error) goto out; goto restart; deliver: SOCKBUF_LOCK_ASSERT(&so->so_rcv); KASSERT(sbavail(sb) > 0, ("%s: sockbuf empty", __func__)); KASSERT(sb->sb_mb != NULL, ("%s: sb_mb == NULL", __func__)); /* Statistics. */ if (uio->uio_td) uio->uio_td->td_ru.ru_msgrcv++; /* Fill uio until full or current end of socket buffer is reached. */ len = min(uio->uio_resid, sbavail(sb)); if (mp0 != NULL) { /* Dequeue as many mbufs as possible. */ if (!(flags & MSG_PEEK) && len >= sb->sb_mb->m_len) { if (*mp0 == NULL) *mp0 = sb->sb_mb; else m_cat(*mp0, sb->sb_mb); for (m = sb->sb_mb; m != NULL && m->m_len <= len; m = m->m_next) { KASSERT(!(m->m_flags & M_NOTAVAIL), ("%s: m %p not available", __func__, m)); len -= m->m_len; uio->uio_resid -= m->m_len; sbfree(sb, m); n = m; } n->m_next = NULL; sb->sb_mb = m; sb->sb_lastrecord = sb->sb_mb; if (sb->sb_mb == NULL) SB_EMPTY_FIXUP(sb); } /* Copy the remainder. */ if (len > 0) { KASSERT(sb->sb_mb != NULL, ("%s: len > 0 && sb->sb_mb empty", __func__)); m = m_copym(sb->sb_mb, 0, len, M_NOWAIT); if (m == NULL) len = 0; /* Don't flush data from sockbuf. */ else uio->uio_resid -= len; if (*mp0 != NULL) m_cat(*mp0, m); else *mp0 = m; if (*mp0 == NULL) { error = ENOBUFS; goto out; } } } else { /* NB: Must unlock socket buffer as uiomove may sleep. */ SOCKBUF_UNLOCK(sb); error = m_mbuftouio(uio, sb->sb_mb, len); SOCKBUF_LOCK(sb); if (error) goto out; } SBLASTRECORDCHK(sb); SBLASTMBUFCHK(sb); /* * Remove the delivered data from the socket buffer unless we * were only peeking. */ if (!(flags & MSG_PEEK)) { if (len > 0) sbdrop_locked(sb, len); /* Notify protocol that we drained some data. */ if ((so->so_proto->pr_flags & PR_WANTRCVD) && (((flags & MSG_WAITALL) && uio->uio_resid > 0) || !(flags & MSG_SOCALLBCK))) { SOCKBUF_UNLOCK(sb); VNET_SO_ASSERT(so); so->so_proto->pr_rcvd(so, flags); SOCKBUF_LOCK(sb); } } /* * For MSG_WAITALL we may have to loop again and wait for * more data to come in. */ if ((flags & MSG_WAITALL) && uio->uio_resid > 0) goto restart; out: SBLASTRECORDCHK(sb); SBLASTMBUFCHK(sb); SOCKBUF_UNLOCK(sb); SOCK_IO_RECV_UNLOCK(so); return (error); } /* * Optimized version of soreceive() for simple datagram cases from userspace. * Unlike in the stream case, we're able to drop a datagram if copyout() * fails, and because we handle datagrams atomically, we don't need to use a * sleep lock to prevent I/O interlacing. */ int soreceive_dgram(struct socket *so, struct sockaddr **psa, struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp) { struct mbuf *m, *m2; int flags, error; ssize_t len; struct protosw *pr = so->so_proto; struct mbuf *nextrecord; if (psa != NULL) *psa = NULL; if (controlp != NULL) *controlp = NULL; if (flagsp != NULL) flags = *flagsp &~ MSG_EOR; else flags = 0; /* * For any complicated cases, fall back to the full * soreceive_generic(). */ if (mp0 != NULL || (flags & (MSG_PEEK | MSG_OOB | MSG_TRUNC))) return (soreceive_generic(so, psa, uio, mp0, controlp, flagsp)); /* * Enforce restrictions on use. */ KASSERT((pr->pr_flags & PR_WANTRCVD) == 0, ("soreceive_dgram: wantrcvd")); KASSERT(pr->pr_flags & PR_ATOMIC, ("soreceive_dgram: !atomic")); KASSERT((so->so_rcv.sb_state & SBS_RCVATMARK) == 0, ("soreceive_dgram: SBS_RCVATMARK")); KASSERT((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0, ("soreceive_dgram: P_CONNREQUIRED")); /* * Loop blocking while waiting for a datagram. */ SOCKBUF_LOCK(&so->so_rcv); while ((m = so->so_rcv.sb_mb) == NULL) { KASSERT(sbavail(&so->so_rcv) == 0, ("soreceive_dgram: sb_mb NULL but sbavail %u", sbavail(&so->so_rcv))); if (so->so_error) { error = so->so_error; so->so_error = 0; SOCKBUF_UNLOCK(&so->so_rcv); return (error); } if (so->so_rcv.sb_state & SBS_CANTRCVMORE || uio->uio_resid == 0) { SOCKBUF_UNLOCK(&so->so_rcv); return (0); } if ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO))) { SOCKBUF_UNLOCK(&so->so_rcv); return (EWOULDBLOCK); } SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); error = sbwait(so, SO_RCV); if (error) { SOCKBUF_UNLOCK(&so->so_rcv); return (error); } } SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (uio->uio_td) uio->uio_td->td_ru.ru_msgrcv++; SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); nextrecord = m->m_nextpkt; if (nextrecord == NULL) { KASSERT(so->so_rcv.sb_lastrecord == m, ("soreceive_dgram: lastrecord != m")); } KASSERT(so->so_rcv.sb_mb->m_nextpkt == nextrecord, ("soreceive_dgram: m_nextpkt != nextrecord")); /* * Pull 'm' and its chain off the front of the packet queue. */ so->so_rcv.sb_mb = NULL; sockbuf_pushsync(&so->so_rcv, nextrecord); /* * Walk 'm's chain and free that many bytes from the socket buffer. */ for (m2 = m; m2 != NULL; m2 = m2->m_next) sbfree(&so->so_rcv, m2); /* * Do a few last checks before we let go of the lock. */ SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); SOCKBUF_UNLOCK(&so->so_rcv); if (pr->pr_flags & PR_ADDR) { KASSERT(m->m_type == MT_SONAME, ("m->m_type == %d", m->m_type)); if (psa != NULL) *psa = sodupsockaddr(mtod(m, struct sockaddr *), M_NOWAIT); m = m_free(m); } if (m == NULL) { /* XXXRW: Can this happen? */ return (0); } /* * Packet to copyout() is now in 'm' and it is disconnected from the * queue. * * Process one or more MT_CONTROL mbufs present before any data mbufs * in the first mbuf chain on the socket buffer. We call into the * protocol to perform externalization (or freeing if controlp == * NULL). In some cases there can be only MT_CONTROL mbufs without * MT_DATA mbufs. */ if (m->m_type == MT_CONTROL) { struct mbuf *cm = NULL, *cmn; struct mbuf **cme = &cm; do { m2 = m->m_next; m->m_next = NULL; *cme = m; cme = &(*cme)->m_next; m = m2; } while (m != NULL && m->m_type == MT_CONTROL); while (cm != NULL) { cmn = cm->m_next; cm->m_next = NULL; if (pr->pr_domain->dom_externalize != NULL) { error = (*pr->pr_domain->dom_externalize) (cm, controlp, flags); } else if (controlp != NULL) *controlp = cm; else m_freem(cm); if (controlp != NULL) { while (*controlp != NULL) controlp = &(*controlp)->m_next; } cm = cmn; } } KASSERT(m == NULL || m->m_type == MT_DATA, ("soreceive_dgram: !data")); while (m != NULL && uio->uio_resid > 0) { len = uio->uio_resid; if (len > m->m_len) len = m->m_len; error = uiomove(mtod(m, char *), (int)len, uio); if (error) { m_freem(m); return (error); } if (len == m->m_len) m = m_free(m); else { m->m_data += len; m->m_len -= len; } } if (m != NULL) { flags |= MSG_TRUNC; m_freem(m); } if (flagsp != NULL) *flagsp |= flags; return (0); } int soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp) { int error; CURVNET_SET(so->so_vnet); error = so->so_proto->pr_soreceive(so, psa, uio, mp0, controlp, flagsp); CURVNET_RESTORE(); return (error); } int soshutdown(struct socket *so, int how) { struct protosw *pr; int error, soerror_enotconn; if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR)) return (EINVAL); soerror_enotconn = 0; SOCK_LOCK(so); if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) == 0) { /* * POSIX mandates us to return ENOTCONN when shutdown(2) is * invoked on a datagram sockets, however historically we would * actually tear socket down. This is known to be leveraged by * some applications to unblock process waiting in recvXXX(2) * by other process that it shares that socket with. Try to meet * both backward-compatibility and POSIX requirements by forcing * ENOTCONN but still asking protocol to perform pru_shutdown(). */ if (so->so_type != SOCK_DGRAM && !SOLISTENING(so)) { SOCK_UNLOCK(so); return (ENOTCONN); } soerror_enotconn = 1; } if (SOLISTENING(so)) { if (how != SHUT_WR) { so->so_error = ECONNABORTED; solisten_wakeup(so); /* unlocks so */ } else { SOCK_UNLOCK(so); } goto done; } SOCK_UNLOCK(so); CURVNET_SET(so->so_vnet); pr = so->so_proto; if (pr->pr_flush != NULL) pr->pr_flush(so, how); if (how != SHUT_WR) sorflush(so); if (how != SHUT_RD) { error = pr->pr_shutdown(so); wakeup(&so->so_timeo); CURVNET_RESTORE(); return ((error == 0 && soerror_enotconn) ? ENOTCONN : error); } wakeup(&so->so_timeo); CURVNET_RESTORE(); done: return (soerror_enotconn ? ENOTCONN : 0); } void sorflush(struct socket *so) { struct protosw *pr; int error; VNET_SO_ASSERT(so); /* * Dislodge threads currently blocked in receive and wait to acquire * a lock against other simultaneous readers before clearing the * socket buffer. Don't let our acquire be interrupted by a signal * despite any existing socket disposition on interruptable waiting. */ socantrcvmore(so); error = SOCK_IO_RECV_LOCK(so, SBL_WAIT | SBL_NOINTR); if (error != 0) { KASSERT(SOLISTENING(so), ("%s: soiolock(%p) failed", __func__, so)); return; } pr = so->so_proto; if (pr->pr_flags & PR_RIGHTS) { MPASS(pr->pr_domain->dom_dispose != NULL); (*pr->pr_domain->dom_dispose)(so); } else { sbrelease(so, SO_RCV); SOCK_IO_RECV_UNLOCK(so); } } /* * Wrapper for Socket established helper hook. * Parameters: socket, context of the hook point, hook id. */ static int inline hhook_run_socket(struct socket *so, void *hctx, int32_t h_id) { struct socket_hhook_data hhook_data = { .so = so, .hctx = hctx, .m = NULL, .status = 0 }; CURVNET_SET(so->so_vnet); HHOOKS_RUN_IF(V_socket_hhh[h_id], &hhook_data, &so->osd); CURVNET_RESTORE(); /* Ugly but needed, since hhooks return void for now */ return (hhook_data.status); } /* * Perhaps this routine, and sooptcopyout(), below, ought to come in an * additional variant to handle the case where the option value needs to be * some kind of integer, but not a specific size. In addition to their use * here, these functions are also called by the protocol-level pr_ctloutput() * routines. */ int sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen) { size_t valsize; /* * If the user gives us more than we wanted, we ignore it, but if we * don't get the minimum length the caller wants, we return EINVAL. * On success, sopt->sopt_valsize is set to however much we actually * retrieved. */ if ((valsize = sopt->sopt_valsize) < minlen) return EINVAL; if (valsize > len) sopt->sopt_valsize = valsize = len; if (sopt->sopt_td != NULL) return (copyin(sopt->sopt_val, buf, valsize)); bcopy(sopt->sopt_val, buf, valsize); return (0); } /* * Kernel version of setsockopt(2). * * XXX: optlen is size_t, not socklen_t */ int so_setsockopt(struct socket *so, int level, int optname, void *optval, size_t optlen) { struct sockopt sopt; sopt.sopt_level = level; sopt.sopt_name = optname; sopt.sopt_dir = SOPT_SET; sopt.sopt_val = optval; sopt.sopt_valsize = optlen; sopt.sopt_td = NULL; return (sosetopt(so, &sopt)); } int sosetopt(struct socket *so, struct sockopt *sopt) { int error, optval; struct linger l; struct timeval tv; sbintime_t val, *valp; uint32_t val32; #ifdef MAC struct mac extmac; #endif CURVNET_SET(so->so_vnet); error = 0; if (sopt->sopt_level != SOL_SOCKET) { if (so->so_proto->pr_ctloutput != NULL) error = (*so->so_proto->pr_ctloutput)(so, sopt); else error = ENOPROTOOPT; } else { switch (sopt->sopt_name) { case SO_ACCEPTFILTER: error = accept_filt_setopt(so, sopt); if (error) goto bad; break; case SO_LINGER: error = sooptcopyin(sopt, &l, sizeof l, sizeof l); if (error) goto bad; if (l.l_linger < 0 || l.l_linger > USHRT_MAX || l.l_linger > (INT_MAX / hz)) { error = EDOM; goto bad; } SOCK_LOCK(so); so->so_linger = l.l_linger; if (l.l_onoff) so->so_options |= SO_LINGER; else so->so_options &= ~SO_LINGER; SOCK_UNLOCK(so); break; case SO_DEBUG: case SO_KEEPALIVE: case SO_DONTROUTE: case SO_USELOOPBACK: case SO_BROADCAST: case SO_REUSEADDR: case SO_REUSEPORT: case SO_REUSEPORT_LB: case SO_OOBINLINE: case SO_TIMESTAMP: case SO_BINTIME: case SO_NOSIGPIPE: case SO_NO_DDP: case SO_NO_OFFLOAD: case SO_RERROR: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) goto bad; SOCK_LOCK(so); if (optval) so->so_options |= sopt->sopt_name; else so->so_options &= ~sopt->sopt_name; SOCK_UNLOCK(so); break; case SO_SETFIB: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) goto bad; if (optval < 0 || optval >= rt_numfibs) { error = EINVAL; goto bad; } if (((so->so_proto->pr_domain->dom_family == PF_INET) || (so->so_proto->pr_domain->dom_family == PF_INET6) || (so->so_proto->pr_domain->dom_family == PF_ROUTE))) so->so_fibnum = optval; else so->so_fibnum = 0; break; case SO_USER_COOKIE: error = sooptcopyin(sopt, &val32, sizeof val32, sizeof val32); if (error) goto bad; so->so_user_cookie = val32; break; case SO_SNDBUF: case SO_RCVBUF: case SO_SNDLOWAT: case SO_RCVLOWAT: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) goto bad; /* * Values < 1 make no sense for any of these options, * so disallow them. */ if (optval < 1) { error = EINVAL; goto bad; } error = sbsetopt(so, sopt->sopt_name, optval); break; case SO_SNDTIMEO: case SO_RCVTIMEO: #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) { struct timeval32 tv32; error = sooptcopyin(sopt, &tv32, sizeof tv32, sizeof tv32); CP(tv32, tv, tv_sec); CP(tv32, tv, tv_usec); } else #endif error = sooptcopyin(sopt, &tv, sizeof tv, sizeof tv); if (error) goto bad; if (tv.tv_sec < 0 || tv.tv_usec < 0 || tv.tv_usec >= 1000000) { error = EDOM; goto bad; } if (tv.tv_sec > INT32_MAX) val = SBT_MAX; else val = tvtosbt(tv); SOCK_LOCK(so); valp = sopt->sopt_name == SO_SNDTIMEO ? (SOLISTENING(so) ? &so->sol_sbsnd_timeo : &so->so_snd.sb_timeo) : (SOLISTENING(so) ? &so->sol_sbrcv_timeo : &so->so_rcv.sb_timeo); *valp = val; SOCK_UNLOCK(so); break; case SO_LABEL: #ifdef MAC error = sooptcopyin(sopt, &extmac, sizeof extmac, sizeof extmac); if (error) goto bad; error = mac_setsockopt_label(sopt->sopt_td->td_ucred, so, &extmac); #else error = EOPNOTSUPP; #endif break; case SO_TS_CLOCK: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) goto bad; if (optval < 0 || optval > SO_TS_CLOCK_MAX) { error = EINVAL; goto bad; } so->so_ts_clock = optval; break; case SO_MAX_PACING_RATE: error = sooptcopyin(sopt, &val32, sizeof(val32), sizeof(val32)); if (error) goto bad; so->so_max_pacing_rate = val32; break; default: if (V_socket_hhh[HHOOK_SOCKET_OPT]->hhh_nhooks > 0) error = hhook_run_socket(so, sopt, HHOOK_SOCKET_OPT); else error = ENOPROTOOPT; break; } if (error == 0 && so->so_proto->pr_ctloutput != NULL) (void)(*so->so_proto->pr_ctloutput)(so, sopt); } bad: CURVNET_RESTORE(); return (error); } /* * Helper routine for getsockopt. */ int sooptcopyout(struct sockopt *sopt, const void *buf, size_t len) { int error; size_t valsize; error = 0; /* * Documented get behavior is that we always return a value, possibly * truncated to fit in the user's buffer. Traditional behavior is * that we always tell the user precisely how much we copied, rather * than something useful like the total amount we had available for * her. Note that this interface is not idempotent; the entire * answer must be generated ahead of time. */ valsize = min(len, sopt->sopt_valsize); sopt->sopt_valsize = valsize; if (sopt->sopt_val != NULL) { if (sopt->sopt_td != NULL) error = copyout(buf, sopt->sopt_val, valsize); else bcopy(buf, sopt->sopt_val, valsize); } return (error); } int sogetopt(struct socket *so, struct sockopt *sopt) { int error, optval; struct linger l; struct timeval tv; #ifdef MAC struct mac extmac; #endif CURVNET_SET(so->so_vnet); error = 0; if (sopt->sopt_level != SOL_SOCKET) { if (so->so_proto->pr_ctloutput != NULL) error = (*so->so_proto->pr_ctloutput)(so, sopt); else error = ENOPROTOOPT; CURVNET_RESTORE(); return (error); } else { switch (sopt->sopt_name) { case SO_ACCEPTFILTER: error = accept_filt_getopt(so, sopt); break; case SO_LINGER: SOCK_LOCK(so); l.l_onoff = so->so_options & SO_LINGER; l.l_linger = so->so_linger; SOCK_UNLOCK(so); error = sooptcopyout(sopt, &l, sizeof l); break; case SO_USELOOPBACK: case SO_DONTROUTE: case SO_DEBUG: case SO_KEEPALIVE: case SO_REUSEADDR: case SO_REUSEPORT: case SO_REUSEPORT_LB: case SO_BROADCAST: case SO_OOBINLINE: case SO_ACCEPTCONN: case SO_TIMESTAMP: case SO_BINTIME: case SO_NOSIGPIPE: case SO_NO_DDP: case SO_NO_OFFLOAD: case SO_RERROR: optval = so->so_options & sopt->sopt_name; integer: error = sooptcopyout(sopt, &optval, sizeof optval); break; case SO_DOMAIN: optval = so->so_proto->pr_domain->dom_family; goto integer; case SO_TYPE: optval = so->so_type; goto integer; case SO_PROTOCOL: optval = so->so_proto->pr_protocol; goto integer; case SO_ERROR: SOCK_LOCK(so); if (so->so_error) { optval = so->so_error; so->so_error = 0; } else { optval = so->so_rerror; so->so_rerror = 0; } SOCK_UNLOCK(so); goto integer; case SO_SNDBUF: optval = SOLISTENING(so) ? so->sol_sbsnd_hiwat : so->so_snd.sb_hiwat; goto integer; case SO_RCVBUF: optval = SOLISTENING(so) ? so->sol_sbrcv_hiwat : so->so_rcv.sb_hiwat; goto integer; case SO_SNDLOWAT: optval = SOLISTENING(so) ? so->sol_sbsnd_lowat : so->so_snd.sb_lowat; goto integer; case SO_RCVLOWAT: optval = SOLISTENING(so) ? so->sol_sbrcv_lowat : so->so_rcv.sb_lowat; goto integer; case SO_SNDTIMEO: case SO_RCVTIMEO: SOCK_LOCK(so); tv = sbttotv(sopt->sopt_name == SO_SNDTIMEO ? (SOLISTENING(so) ? so->sol_sbsnd_timeo : so->so_snd.sb_timeo) : (SOLISTENING(so) ? so->sol_sbrcv_timeo : so->so_rcv.sb_timeo)); SOCK_UNLOCK(so); #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) { struct timeval32 tv32; CP(tv, tv32, tv_sec); CP(tv, tv32, tv_usec); error = sooptcopyout(sopt, &tv32, sizeof tv32); } else #endif error = sooptcopyout(sopt, &tv, sizeof tv); break; case SO_LABEL: #ifdef MAC error = sooptcopyin(sopt, &extmac, sizeof(extmac), sizeof(extmac)); if (error) goto bad; error = mac_getsockopt_label(sopt->sopt_td->td_ucred, so, &extmac); if (error) goto bad; error = sooptcopyout(sopt, &extmac, sizeof extmac); #else error = EOPNOTSUPP; #endif break; case SO_PEERLABEL: #ifdef MAC error = sooptcopyin(sopt, &extmac, sizeof(extmac), sizeof(extmac)); if (error) goto bad; error = mac_getsockopt_peerlabel( sopt->sopt_td->td_ucred, so, &extmac); if (error) goto bad; error = sooptcopyout(sopt, &extmac, sizeof extmac); #else error = EOPNOTSUPP; #endif break; case SO_LISTENQLIMIT: optval = SOLISTENING(so) ? so->sol_qlimit : 0; goto integer; case SO_LISTENQLEN: optval = SOLISTENING(so) ? so->sol_qlen : 0; goto integer; case SO_LISTENINCQLEN: optval = SOLISTENING(so) ? so->sol_incqlen : 0; goto integer; case SO_TS_CLOCK: optval = so->so_ts_clock; goto integer; case SO_MAX_PACING_RATE: optval = so->so_max_pacing_rate; goto integer; default: if (V_socket_hhh[HHOOK_SOCKET_OPT]->hhh_nhooks > 0) error = hhook_run_socket(so, sopt, HHOOK_SOCKET_OPT); else error = ENOPROTOOPT; break; } } #ifdef MAC bad: #endif CURVNET_RESTORE(); return (error); } int soopt_getm(struct sockopt *sopt, struct mbuf **mp) { struct mbuf *m, *m_prev; int sopt_size = sopt->sopt_valsize; MGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT, MT_DATA); if (m == NULL) return ENOBUFS; if (sopt_size > MLEN) { MCLGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT); if ((m->m_flags & M_EXT) == 0) { m_free(m); return ENOBUFS; } m->m_len = min(MCLBYTES, sopt_size); } else { m->m_len = min(MLEN, sopt_size); } sopt_size -= m->m_len; *mp = m; m_prev = m; while (sopt_size) { MGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT, MT_DATA); if (m == NULL) { m_freem(*mp); return ENOBUFS; } if (sopt_size > MLEN) { MCLGET(m, sopt->sopt_td != NULL ? M_WAITOK : M_NOWAIT); if ((m->m_flags & M_EXT) == 0) { m_freem(m); m_freem(*mp); return ENOBUFS; } m->m_len = min(MCLBYTES, sopt_size); } else { m->m_len = min(MLEN, sopt_size); } sopt_size -= m->m_len; m_prev->m_next = m; m_prev = m; } return (0); } int soopt_mcopyin(struct sockopt *sopt, struct mbuf *m) { struct mbuf *m0 = m; if (sopt->sopt_val == NULL) return (0); while (m != NULL && sopt->sopt_valsize >= m->m_len) { if (sopt->sopt_td != NULL) { int error; error = copyin(sopt->sopt_val, mtod(m, char *), m->m_len); if (error != 0) { m_freem(m0); return(error); } } else bcopy(sopt->sopt_val, mtod(m, char *), m->m_len); sopt->sopt_valsize -= m->m_len; sopt->sopt_val = (char *)sopt->sopt_val + m->m_len; m = m->m_next; } if (m != NULL) /* should be allocated enoughly at ip6_sooptmcopyin() */ panic("ip6_sooptmcopyin"); return (0); } int soopt_mcopyout(struct sockopt *sopt, struct mbuf *m) { struct mbuf *m0 = m; size_t valsize = 0; if (sopt->sopt_val == NULL) return (0); while (m != NULL && sopt->sopt_valsize >= m->m_len) { if (sopt->sopt_td != NULL) { int error; error = copyout(mtod(m, char *), sopt->sopt_val, m->m_len); if (error != 0) { m_freem(m0); return(error); } } else bcopy(mtod(m, char *), sopt->sopt_val, m->m_len); sopt->sopt_valsize -= m->m_len; sopt->sopt_val = (char *)sopt->sopt_val + m->m_len; valsize += m->m_len; m = m->m_next; } if (m != NULL) { /* enough soopt buffer should be given from user-land */ m_freem(m0); return(EINVAL); } sopt->sopt_valsize = valsize; return (0); } /* * sohasoutofband(): protocol notifies socket layer of the arrival of new * out-of-band data, which will then notify socket consumers. */ void sohasoutofband(struct socket *so) { if (so->so_sigio != NULL) pgsigio(&so->so_sigio, SIGURG, 0); selwakeuppri(&so->so_rdsel, PSOCK); } int sopoll(struct socket *so, int events, struct ucred *active_cred, struct thread *td) { /* * We do not need to set or assert curvnet as long as everyone uses * sopoll_generic(). */ return (so->so_proto->pr_sopoll(so, events, active_cred, td)); } int sopoll_generic(struct socket *so, int events, struct ucred *active_cred, struct thread *td) { int revents; SOCK_LOCK(so); if (SOLISTENING(so)) { if (!(events & (POLLIN | POLLRDNORM))) revents = 0; else if (!TAILQ_EMPTY(&so->sol_comp)) revents = events & (POLLIN | POLLRDNORM); else if ((events & POLLINIGNEOF) == 0 && so->so_error) revents = (events & (POLLIN | POLLRDNORM)) | POLLHUP; else { selrecord(td, &so->so_rdsel); revents = 0; } } else { revents = 0; SOCK_SENDBUF_LOCK(so); SOCK_RECVBUF_LOCK(so); if (events & (POLLIN | POLLRDNORM)) if (soreadabledata(so)) revents |= events & (POLLIN | POLLRDNORM); if (events & (POLLOUT | POLLWRNORM)) if (sowriteable(so)) revents |= events & (POLLOUT | POLLWRNORM); if (events & (POLLPRI | POLLRDBAND)) if (so->so_oobmark || (so->so_rcv.sb_state & SBS_RCVATMARK)) revents |= events & (POLLPRI | POLLRDBAND); if ((events & POLLINIGNEOF) == 0) { if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { revents |= events & (POLLIN | POLLRDNORM); if (so->so_snd.sb_state & SBS_CANTSENDMORE) revents |= POLLHUP; } } if (so->so_rcv.sb_state & SBS_CANTRCVMORE) revents |= events & POLLRDHUP; if (revents == 0) { if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND | POLLRDHUP)) { selrecord(td, &so->so_rdsel); so->so_rcv.sb_flags |= SB_SEL; } if (events & (POLLOUT | POLLWRNORM)) { selrecord(td, &so->so_wrsel); so->so_snd.sb_flags |= SB_SEL; } } SOCK_RECVBUF_UNLOCK(so); SOCK_SENDBUF_UNLOCK(so); } SOCK_UNLOCK(so); return (revents); } int soo_kqfilter(struct file *fp, struct knote *kn) { struct socket *so = kn->kn_fp->f_data; struct sockbuf *sb; sb_which which; struct knlist *knl; switch (kn->kn_filter) { case EVFILT_READ: kn->kn_fop = &soread_filtops; knl = &so->so_rdsel.si_note; sb = &so->so_rcv; which = SO_RCV; break; case EVFILT_WRITE: kn->kn_fop = &sowrite_filtops; knl = &so->so_wrsel.si_note; sb = &so->so_snd; which = SO_SND; break; case EVFILT_EMPTY: kn->kn_fop = &soempty_filtops; knl = &so->so_wrsel.si_note; sb = &so->so_snd; which = SO_SND; break; default: return (EINVAL); } SOCK_LOCK(so); if (SOLISTENING(so)) { knlist_add(knl, kn, 1); } else { SOCK_BUF_LOCK(so, which); knlist_add(knl, kn, 1); sb->sb_flags |= SB_KNOTE; SOCK_BUF_UNLOCK(so, which); } SOCK_UNLOCK(so); return (0); } static void filt_sordetach(struct knote *kn) { struct socket *so = kn->kn_fp->f_data; so_rdknl_lock(so); knlist_remove(&so->so_rdsel.si_note, kn, 1); if (!SOLISTENING(so) && knlist_empty(&so->so_rdsel.si_note)) so->so_rcv.sb_flags &= ~SB_KNOTE; so_rdknl_unlock(so); } /*ARGSUSED*/ static int filt_soread(struct knote *kn, long hint) { struct socket *so; so = kn->kn_fp->f_data; if (SOLISTENING(so)) { SOCK_LOCK_ASSERT(so); kn->kn_data = so->sol_qlen; if (so->so_error) { kn->kn_flags |= EV_EOF; kn->kn_fflags = so->so_error; return (1); } return (!TAILQ_EMPTY(&so->sol_comp)); } SOCK_RECVBUF_LOCK_ASSERT(so); kn->kn_data = sbavail(&so->so_rcv) - so->so_rcv.sb_ctl; if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { kn->kn_flags |= EV_EOF; kn->kn_fflags = so->so_error; return (1); } else if (so->so_error || so->so_rerror) return (1); if (kn->kn_sfflags & NOTE_LOWAT) { if (kn->kn_data >= kn->kn_sdata) return (1); } else if (sbavail(&so->so_rcv) >= so->so_rcv.sb_lowat) return (1); /* This hook returning non-zero indicates an event, not error */ return (hhook_run_socket(so, NULL, HHOOK_FILT_SOREAD)); } static void filt_sowdetach(struct knote *kn) { struct socket *so = kn->kn_fp->f_data; so_wrknl_lock(so); knlist_remove(&so->so_wrsel.si_note, kn, 1); if (!SOLISTENING(so) && knlist_empty(&so->so_wrsel.si_note)) so->so_snd.sb_flags &= ~SB_KNOTE; so_wrknl_unlock(so); } /*ARGSUSED*/ static int filt_sowrite(struct knote *kn, long hint) { struct socket *so; so = kn->kn_fp->f_data; if (SOLISTENING(so)) return (0); SOCK_SENDBUF_LOCK_ASSERT(so); kn->kn_data = sbspace(&so->so_snd); hhook_run_socket(so, kn, HHOOK_FILT_SOWRITE); if (so->so_snd.sb_state & SBS_CANTSENDMORE) { kn->kn_flags |= EV_EOF; kn->kn_fflags = so->so_error; return (1); } else if (so->so_error) /* temporary udp error */ return (1); else if (((so->so_state & SS_ISCONNECTED) == 0) && (so->so_proto->pr_flags & PR_CONNREQUIRED)) return (0); else if (kn->kn_sfflags & NOTE_LOWAT) return (kn->kn_data >= kn->kn_sdata); else return (kn->kn_data >= so->so_snd.sb_lowat); } static int filt_soempty(struct knote *kn, long hint) { struct socket *so; so = kn->kn_fp->f_data; if (SOLISTENING(so)) return (1); SOCK_SENDBUF_LOCK_ASSERT(so); kn->kn_data = sbused(&so->so_snd); if (kn->kn_data == 0) return (1); else return (0); } int socheckuid(struct socket *so, uid_t uid) { if (so == NULL) return (EPERM); if (so->so_cred->cr_uid != uid) return (EPERM); return (0); } /* * These functions are used by protocols to notify the socket layer (and its * consumers) of state changes in the sockets driven by protocol-side events. */ /* * Procedures to manipulate state flags of socket and do appropriate wakeups. * * Normal sequence from the active (originating) side is that * soisconnecting() is called during processing of connect() call, resulting * in an eventual call to soisconnected() if/when the connection is * established. When the connection is torn down soisdisconnecting() is * called during processing of disconnect() call, and soisdisconnected() is * called when the connection to the peer is totally severed. The semantics * of these routines are such that connectionless protocols can call * soisconnected() and soisdisconnected() only, bypassing the in-progress * calls when setting up a ``connection'' takes no time. * * From the passive side, a socket is created with two queues of sockets: * so_incomp for connections in progress and so_comp for connections already * made and awaiting user acceptance. As a protocol is preparing incoming * connections, it creates a socket structure queued on so_incomp by calling * sonewconn(). When the connection is established, soisconnected() is * called, and transfers the socket structure to so_comp, making it available * to accept(). * * If a socket is closed with sockets on either so_incomp or so_comp, these * sockets are dropped. * * If higher-level protocols are implemented in the kernel, the wakeups done * here will sometimes cause software-interrupt process scheduling. */ void soisconnecting(struct socket *so) { SOCK_LOCK(so); so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING); so->so_state |= SS_ISCONNECTING; SOCK_UNLOCK(so); } void soisconnected(struct socket *so) { bool last __diagused; SOCK_LOCK(so); so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING); so->so_state |= SS_ISCONNECTED; if (so->so_qstate == SQ_INCOMP) { struct socket *head = so->so_listen; int ret; KASSERT(head, ("%s: so %p on incomp of NULL", __func__, so)); /* * Promoting a socket from incomplete queue to complete, we * need to go through reverse order of locking. We first do * trylock, and if that doesn't succeed, we go the hard way * leaving a reference and rechecking consistency after proper * locking. */ if (__predict_false(SOLISTEN_TRYLOCK(head) == 0)) { soref(head); SOCK_UNLOCK(so); SOLISTEN_LOCK(head); SOCK_LOCK(so); if (__predict_false(head != so->so_listen)) { /* * The socket went off the listen queue, * should be lost race to close(2) of sol. * The socket is about to soabort(). */ SOCK_UNLOCK(so); sorele_locked(head); return; } last = refcount_release(&head->so_count); KASSERT(!last, ("%s: released last reference for %p", __func__, head)); } again: if ((so->so_options & SO_ACCEPTFILTER) == 0) { TAILQ_REMOVE(&head->sol_incomp, so, so_list); head->sol_incqlen--; TAILQ_INSERT_TAIL(&head->sol_comp, so, so_list); head->sol_qlen++; so->so_qstate = SQ_COMP; SOCK_UNLOCK(so); solisten_wakeup(head); /* unlocks */ } else { SOCK_RECVBUF_LOCK(so); soupcall_set(so, SO_RCV, head->sol_accept_filter->accf_callback, head->sol_accept_filter_arg); so->so_options &= ~SO_ACCEPTFILTER; ret = head->sol_accept_filter->accf_callback(so, head->sol_accept_filter_arg, M_NOWAIT); if (ret == SU_ISCONNECTED) { soupcall_clear(so, SO_RCV); SOCK_RECVBUF_UNLOCK(so); goto again; } SOCK_RECVBUF_UNLOCK(so); SOCK_UNLOCK(so); SOLISTEN_UNLOCK(head); } return; } SOCK_UNLOCK(so); wakeup(&so->so_timeo); sorwakeup(so); sowwakeup(so); } void soisdisconnecting(struct socket *so) { SOCK_LOCK(so); so->so_state &= ~SS_ISCONNECTING; so->so_state |= SS_ISDISCONNECTING; if (!SOLISTENING(so)) { SOCK_RECVBUF_LOCK(so); socantrcvmore_locked(so); SOCK_SENDBUF_LOCK(so); socantsendmore_locked(so); } SOCK_UNLOCK(so); wakeup(&so->so_timeo); } void soisdisconnected(struct socket *so) { SOCK_LOCK(so); /* * There is at least one reader of so_state that does not * acquire socket lock, namely soreceive_generic(). Ensure * that it never sees all flags that track connection status * cleared, by ordering the update with a barrier semantic of * our release thread fence. */ so->so_state |= SS_ISDISCONNECTED; atomic_thread_fence_rel(); so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING); if (!SOLISTENING(so)) { SOCK_UNLOCK(so); SOCK_RECVBUF_LOCK(so); socantrcvmore_locked(so); SOCK_SENDBUF_LOCK(so); sbdrop_locked(&so->so_snd, sbused(&so->so_snd)); socantsendmore_locked(so); } else SOCK_UNLOCK(so); wakeup(&so->so_timeo); } int soiolock(struct socket *so, struct sx *sx, int flags) { int error; KASSERT((flags & SBL_VALID) == flags, ("soiolock: invalid flags %#x", flags)); if ((flags & SBL_WAIT) != 0) { if ((flags & SBL_NOINTR) != 0) { sx_xlock(sx); } else { error = sx_xlock_sig(sx); if (error != 0) return (error); } } else if (!sx_try_xlock(sx)) { return (EWOULDBLOCK); } if (__predict_false(SOLISTENING(so))) { sx_xunlock(sx); return (ENOTCONN); } return (0); } void soiounlock(struct sx *sx) { sx_xunlock(sx); } /* * Make a copy of a sockaddr in a malloced buffer of type M_SONAME. */ struct sockaddr * sodupsockaddr(const struct sockaddr *sa, int mflags) { struct sockaddr *sa2; sa2 = malloc(sa->sa_len, M_SONAME, mflags); if (sa2) bcopy(sa, sa2, sa->sa_len); return sa2; } /* * Register per-socket destructor. */ void sodtor_set(struct socket *so, so_dtor_t *func) { SOCK_LOCK_ASSERT(so); so->so_dtor = func; } /* * Register per-socket buffer upcalls. */ void soupcall_set(struct socket *so, sb_which which, so_upcall_t func, void *arg) { struct sockbuf *sb; KASSERT(!SOLISTENING(so), ("%s: so %p listening", __func__, so)); switch (which) { case SO_RCV: sb = &so->so_rcv; break; case SO_SND: sb = &so->so_snd; break; } SOCK_BUF_LOCK_ASSERT(so, which); sb->sb_upcall = func; sb->sb_upcallarg = arg; sb->sb_flags |= SB_UPCALL; } void soupcall_clear(struct socket *so, sb_which which) { struct sockbuf *sb; KASSERT(!SOLISTENING(so), ("%s: so %p listening", __func__, so)); switch (which) { case SO_RCV: sb = &so->so_rcv; break; case SO_SND: sb = &so->so_snd; break; } SOCK_BUF_LOCK_ASSERT(so, which); KASSERT(sb->sb_upcall != NULL, ("%s: so %p no upcall to clear", __func__, so)); sb->sb_upcall = NULL; sb->sb_upcallarg = NULL; sb->sb_flags &= ~SB_UPCALL; } void solisten_upcall_set(struct socket *so, so_upcall_t func, void *arg) { SOLISTEN_LOCK_ASSERT(so); so->sol_upcall = func; so->sol_upcallarg = arg; } static void so_rdknl_lock(void *arg) { struct socket *so = arg; retry: if (SOLISTENING(so)) { SOLISTEN_LOCK(so); } else { SOCK_RECVBUF_LOCK(so); if (__predict_false(SOLISTENING(so))) { SOCK_RECVBUF_UNLOCK(so); goto retry; } } } static void so_rdknl_unlock(void *arg) { struct socket *so = arg; if (SOLISTENING(so)) SOLISTEN_UNLOCK(so); else SOCK_RECVBUF_UNLOCK(so); } static void so_rdknl_assert_lock(void *arg, int what) { struct socket *so = arg; if (what == LA_LOCKED) { if (SOLISTENING(so)) SOLISTEN_LOCK_ASSERT(so); else SOCK_RECVBUF_LOCK_ASSERT(so); } else { if (SOLISTENING(so)) SOLISTEN_UNLOCK_ASSERT(so); else SOCK_RECVBUF_UNLOCK_ASSERT(so); } } static void so_wrknl_lock(void *arg) { struct socket *so = arg; retry: if (SOLISTENING(so)) { SOLISTEN_LOCK(so); } else { SOCK_SENDBUF_LOCK(so); if (__predict_false(SOLISTENING(so))) { SOCK_SENDBUF_UNLOCK(so); goto retry; } } } static void so_wrknl_unlock(void *arg) { struct socket *so = arg; if (SOLISTENING(so)) SOLISTEN_UNLOCK(so); else SOCK_SENDBUF_UNLOCK(so); } static void so_wrknl_assert_lock(void *arg, int what) { struct socket *so = arg; if (what == LA_LOCKED) { if (SOLISTENING(so)) SOLISTEN_LOCK_ASSERT(so); else SOCK_SENDBUF_LOCK_ASSERT(so); } else { if (SOLISTENING(so)) SOLISTEN_UNLOCK_ASSERT(so); else SOCK_SENDBUF_UNLOCK_ASSERT(so); } } /* * Create an external-format (``xsocket'') structure using the information in * the kernel-format socket structure pointed to by so. This is done to * reduce the spew of irrelevant information over this interface, to isolate * user code from changes in the kernel structure, and potentially to provide * information-hiding if we decide that some of this information should be * hidden from users. */ void sotoxsocket(struct socket *so, struct xsocket *xso) { bzero(xso, sizeof(*xso)); xso->xso_len = sizeof *xso; xso->xso_so = (uintptr_t)so; xso->so_type = so->so_type; xso->so_options = so->so_options; xso->so_linger = so->so_linger; xso->so_state = so->so_state; xso->so_pcb = (uintptr_t)so->so_pcb; xso->xso_protocol = so->so_proto->pr_protocol; xso->xso_family = so->so_proto->pr_domain->dom_family; xso->so_timeo = so->so_timeo; xso->so_error = so->so_error; xso->so_uid = so->so_cred->cr_uid; xso->so_pgid = so->so_sigio ? so->so_sigio->sio_pgid : 0; if (SOLISTENING(so)) { xso->so_qlen = so->sol_qlen; xso->so_incqlen = so->sol_incqlen; xso->so_qlimit = so->sol_qlimit; xso->so_oobmark = 0; } else { xso->so_state |= so->so_qstate; xso->so_qlen = xso->so_incqlen = xso->so_qlimit = 0; xso->so_oobmark = so->so_oobmark; sbtoxsockbuf(&so->so_snd, &xso->so_snd); sbtoxsockbuf(&so->so_rcv, &xso->so_rcv); } } struct sockbuf * so_sockbuf_rcv(struct socket *so) { return (&so->so_rcv); } struct sockbuf * so_sockbuf_snd(struct socket *so) { return (&so->so_snd); } int so_state_get(const struct socket *so) { return (so->so_state); } void so_state_set(struct socket *so, int val) { so->so_state = val; } int so_options_get(const struct socket *so) { return (so->so_options); } void so_options_set(struct socket *so, int val) { so->so_options = val; } int so_error_get(const struct socket *so) { return (so->so_error); } void so_error_set(struct socket *so, int val) { so->so_error = val; } int so_linger_get(const struct socket *so) { return (so->so_linger); } void so_linger_set(struct socket *so, int val) { KASSERT(val >= 0 && val <= USHRT_MAX && val <= (INT_MAX / hz), ("%s: val %d out of range", __func__, val)); so->so_linger = val; } struct protosw * so_protosw_get(const struct socket *so) { return (so->so_proto); } void so_protosw_set(struct socket *so, struct protosw *val) { so->so_proto = val; } void so_sorwakeup(struct socket *so) { sorwakeup(so); } void so_sowwakeup(struct socket *so) { sowwakeup(so); } void so_sorwakeup_locked(struct socket *so) { sorwakeup_locked(so); } void so_sowwakeup_locked(struct socket *so) { sowwakeup_locked(so); } void so_lock(struct socket *so) { SOCK_LOCK(so); } void so_unlock(struct socket *so) { SOCK_UNLOCK(so); } diff --git a/sys/netgraph/ng_ksocket.c b/sys/netgraph/ng_ksocket.c index d4f41fe02205..ff5e7b4812bf 100644 --- a/sys/netgraph/ng_ksocket.c +++ b/sys/netgraph/ng_ksocket.c @@ -1,1270 +1,1270 @@ /* * ng_ksocket.c */ /*- * Copyright (c) 1996-1999 Whistle Communications, Inc. * All rights reserved. * * Subject to the following obligations and disclaimer of warranty, use and * redistribution of this software, in source or object code forms, with or * without modifications are expressly permitted by Whistle Communications; * provided, however, that: * 1. Any and all reproductions of the source or object code must include the * copyright notice above and the following disclaimer of warranties; and * 2. No rights are granted, in any manner or form, to use Whistle * Communications, Inc. trademarks, including the mark "WHISTLE * COMMUNICATIONS" on advertising, endorsements, or otherwise except as * such appears in the above copyright notice or in the software. * * THIS SOFTWARE IS BEING PROVIDED BY WHISTLE COMMUNICATIONS "AS IS", AND * TO THE MAXIMUM EXTENT PERMITTED BY LAW, WHISTLE COMMUNICATIONS MAKES NO * REPRESENTATIONS OR WARRANTIES, EXPRESS OR IMPLIED, REGARDING THIS SOFTWARE, * INCLUDING WITHOUT LIMITATION, ANY AND ALL IMPLIED WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT. * WHISTLE COMMUNICATIONS DOES NOT WARRANT, GUARANTEE, OR MAKE ANY * REPRESENTATIONS REGARDING THE USE OF, OR THE RESULTS OF THE USE OF THIS * SOFTWARE IN TERMS OF ITS CORRECTNESS, ACCURACY, RELIABILITY OR OTHERWISE. * IN NO EVENT SHALL WHISTLE COMMUNICATIONS BE LIABLE FOR ANY DAMAGES * RESULTING FROM OR ARISING OUT OF ANY USE OF THIS SOFTWARE, INCLUDING * WITHOUT LIMITATION, ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, * PUNITIVE, OR CONSEQUENTIAL DAMAGES, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES, LOSS OF USE, DATA OR PROFITS, HOWEVER CAUSED AND UNDER ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF WHISTLE COMMUNICATIONS IS ADVISED OF THE POSSIBILITY * OF SUCH DAMAGE. * * Author: Archie Cobbs * * $FreeBSD$ * $Whistle: ng_ksocket.c,v 1.1 1999/11/16 20:04:40 archie Exp $ */ /* * Kernel socket node type. This node type is basically a kernel-mode * version of a socket... kindof like the reverse of the socket node type. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef NG_SEPARATE_MALLOC static MALLOC_DEFINE(M_NETGRAPH_KSOCKET, "netgraph_ksock", "netgraph ksock node"); #else #define M_NETGRAPH_KSOCKET M_NETGRAPH #endif #define OFFSETOF(s, e) ((char *)&((s *)0)->e - (char *)((s *)0)) #define SADATA_OFFSET (OFFSETOF(struct sockaddr, sa_data)) /* Node private data */ struct ng_ksocket_private { node_p node; hook_p hook; struct socket *so; int fn_sent; /* FN call on incoming event was sent */ LIST_HEAD(, ng_ksocket_private) embryos; LIST_ENTRY(ng_ksocket_private) siblings; u_int32_t flags; u_int32_t response_token; ng_ID_t response_addr; }; typedef struct ng_ksocket_private *priv_p; /* Flags for priv_p */ #define KSF_CONNECTING 0x00000001 /* Waiting for connection complete */ #define KSF_ACCEPTING 0x00000002 /* Waiting for accept complete */ #define KSF_EOFSEEN 0x00000004 /* Have sent 0-length EOF mbuf */ #define KSF_CLONED 0x00000008 /* Cloned from an accepting socket */ #define KSF_EMBRYONIC 0x00000010 /* Cloned node with no hooks yet */ /* Netgraph node methods */ static ng_constructor_t ng_ksocket_constructor; static ng_rcvmsg_t ng_ksocket_rcvmsg; static ng_shutdown_t ng_ksocket_shutdown; static ng_newhook_t ng_ksocket_newhook; static ng_rcvdata_t ng_ksocket_rcvdata; static ng_connect_t ng_ksocket_connect; static ng_disconnect_t ng_ksocket_disconnect; /* Alias structure */ struct ng_ksocket_alias { const char *name; const int value; const int family; }; /* Protocol family aliases */ static const struct ng_ksocket_alias ng_ksocket_families[] = { { "local", PF_LOCAL }, { "inet", PF_INET }, { "inet6", PF_INET6 }, { "atm", PF_ATM }, + { "divert", PF_DIVERT }, { NULL, -1 }, }; /* Socket type aliases */ static const struct ng_ksocket_alias ng_ksocket_types[] = { { "stream", SOCK_STREAM }, { "dgram", SOCK_DGRAM }, { "raw", SOCK_RAW }, { "rdm", SOCK_RDM }, { "seqpacket", SOCK_SEQPACKET }, { NULL, -1 }, }; /* Protocol aliases */ static const struct ng_ksocket_alias ng_ksocket_protos[] = { { "ip", IPPROTO_IP, PF_INET }, { "raw", IPPROTO_RAW, PF_INET }, { "icmp", IPPROTO_ICMP, PF_INET }, { "igmp", IPPROTO_IGMP, PF_INET }, { "tcp", IPPROTO_TCP, PF_INET }, { "udp", IPPROTO_UDP, PF_INET }, { "gre", IPPROTO_GRE, PF_INET }, { "esp", IPPROTO_ESP, PF_INET }, { "ah", IPPROTO_AH, PF_INET }, { "swipe", IPPROTO_SWIPE, PF_INET }, { "encap", IPPROTO_ENCAP, PF_INET }, - { "divert", IPPROTO_DIVERT, PF_INET }, { "pim", IPPROTO_PIM, PF_INET }, { NULL, -1 }, }; /* Helper functions */ static int ng_ksocket_accept(priv_p); static int ng_ksocket_incoming(struct socket *so, void *arg, int waitflag); static int ng_ksocket_parse(const struct ng_ksocket_alias *aliases, const char *s, int family); static void ng_ksocket_incoming2(node_p node, hook_p hook, void *arg1, int arg2); /************************************************************************ STRUCT SOCKADDR PARSE TYPE ************************************************************************/ /* Get the length of the data portion of a generic struct sockaddr */ static int ng_parse_generic_sockdata_getLength(const struct ng_parse_type *type, const u_char *start, const u_char *buf) { const struct sockaddr *sa; sa = (const struct sockaddr *)(buf - SADATA_OFFSET); return (sa->sa_len < SADATA_OFFSET) ? 0 : sa->sa_len - SADATA_OFFSET; } /* Type for the variable length data portion of a generic struct sockaddr */ static const struct ng_parse_type ng_ksocket_generic_sockdata_type = { &ng_parse_bytearray_type, &ng_parse_generic_sockdata_getLength }; /* Type for a generic struct sockaddr */ static const struct ng_parse_struct_field ng_parse_generic_sockaddr_type_fields[] = { { "len", &ng_parse_uint8_type }, { "family", &ng_parse_uint8_type }, { "data", &ng_ksocket_generic_sockdata_type }, { NULL } }; static const struct ng_parse_type ng_ksocket_generic_sockaddr_type = { &ng_parse_struct_type, &ng_parse_generic_sockaddr_type_fields }; /* Convert a struct sockaddr from ASCII to binary. If its a protocol family that we specially handle, do that, otherwise defer to the generic parse type ng_ksocket_generic_sockaddr_type. */ static int ng_ksocket_sockaddr_parse(const struct ng_parse_type *type, const char *s, int *off, const u_char *const start, u_char *const buf, int *buflen) { struct sockaddr *const sa = (struct sockaddr *)buf; enum ng_parse_token tok; char fambuf[32]; int family, len; char *t; /* If next token is a left curly brace, use generic parse type */ if ((tok = ng_parse_get_token(s, off, &len)) == T_LBRACE) { return (*ng_ksocket_generic_sockaddr_type.supertype->parse) (&ng_ksocket_generic_sockaddr_type, s, off, start, buf, buflen); } /* Get socket address family followed by a slash */ while (isspace(s[*off])) (*off)++; if ((t = strchr(s + *off, '/')) == NULL) return (EINVAL); if ((len = t - (s + *off)) > sizeof(fambuf) - 1) return (EINVAL); strncpy(fambuf, s + *off, len); fambuf[len] = '\0'; *off += len + 1; if ((family = ng_ksocket_parse(ng_ksocket_families, fambuf, 0)) == -1) return (EINVAL); /* Set family */ if (*buflen < SADATA_OFFSET) return (ERANGE); sa->sa_family = family; /* Set family-specific data and length */ switch (sa->sa_family) { case PF_LOCAL: /* Get pathname */ { const int pathoff = OFFSETOF(struct sockaddr_un, sun_path); struct sockaddr_un *const sun = (struct sockaddr_un *)sa; int toklen, pathlen; char *path; if ((path = ng_get_string_token(s, off, &toklen, NULL)) == NULL) return (EINVAL); pathlen = strlen(path); if (pathlen > SOCK_MAXADDRLEN) { free(path, M_NETGRAPH_KSOCKET); return (E2BIG); } if (*buflen < pathoff + pathlen) { free(path, M_NETGRAPH_KSOCKET); return (ERANGE); } *off += toklen; bcopy(path, sun->sun_path, pathlen); sun->sun_len = pathoff + pathlen; free(path, M_NETGRAPH_KSOCKET); break; } case PF_INET: /* Get an IP address with optional port */ { struct sockaddr_in *const sin = (struct sockaddr_in *)sa; int i; /* Parse this: [:port] */ for (i = 0; i < 4; i++) { u_long val; char *eptr; val = strtoul(s + *off, &eptr, 10); if (val > 0xff || eptr == s + *off) return (EINVAL); *off += (eptr - (s + *off)); ((u_char *)&sin->sin_addr)[i] = (u_char)val; if (i < 3) { if (s[*off] != '.') return (EINVAL); (*off)++; } else if (s[*off] == ':') { (*off)++; val = strtoul(s + *off, &eptr, 10); if (val > 0xffff || eptr == s + *off) return (EINVAL); *off += (eptr - (s + *off)); sin->sin_port = htons(val); } else sin->sin_port = 0; } bzero(&sin->sin_zero, sizeof(sin->sin_zero)); sin->sin_len = sizeof(*sin); break; } #if 0 case PF_INET6: /* XXX implement this someday */ #endif default: return (EINVAL); } /* Done */ *buflen = sa->sa_len; return (0); } /* Convert a struct sockaddr from binary to ASCII */ static int ng_ksocket_sockaddr_unparse(const struct ng_parse_type *type, const u_char *data, int *off, char *cbuf, int cbuflen) { const struct sockaddr *sa = (const struct sockaddr *)(data + *off); int slen = 0; /* Output socket address, either in special or generic format */ switch (sa->sa_family) { case PF_LOCAL: { const int pathoff = OFFSETOF(struct sockaddr_un, sun_path); const struct sockaddr_un *sun = (const struct sockaddr_un *)sa; const int pathlen = sun->sun_len - pathoff; char pathbuf[SOCK_MAXADDRLEN + 1]; char *pathtoken; bcopy(sun->sun_path, pathbuf, pathlen); if ((pathtoken = ng_encode_string(pathbuf, pathlen)) == NULL) return (ENOMEM); slen += snprintf(cbuf, cbuflen, "local/%s", pathtoken); free(pathtoken, M_NETGRAPH_KSOCKET); if (slen >= cbuflen) return (ERANGE); *off += sun->sun_len; return (0); } case PF_INET: { const struct sockaddr_in *sin = (const struct sockaddr_in *)sa; slen += snprintf(cbuf, cbuflen, "inet/%d.%d.%d.%d", ((const u_char *)&sin->sin_addr)[0], ((const u_char *)&sin->sin_addr)[1], ((const u_char *)&sin->sin_addr)[2], ((const u_char *)&sin->sin_addr)[3]); if (sin->sin_port != 0) { slen += snprintf(cbuf + strlen(cbuf), cbuflen - strlen(cbuf), ":%d", (u_int)ntohs(sin->sin_port)); } if (slen >= cbuflen) return (ERANGE); *off += sizeof(*sin); return(0); } #if 0 case PF_INET6: /* XXX implement this someday */ #endif default: return (*ng_ksocket_generic_sockaddr_type.supertype->unparse) (&ng_ksocket_generic_sockaddr_type, data, off, cbuf, cbuflen); } } /* Parse type for struct sockaddr */ static const struct ng_parse_type ng_ksocket_sockaddr_type = { NULL, NULL, NULL, &ng_ksocket_sockaddr_parse, &ng_ksocket_sockaddr_unparse, NULL /* no such thing as a default struct sockaddr */ }; /************************************************************************ STRUCT NG_KSOCKET_SOCKOPT PARSE TYPE ************************************************************************/ /* Get length of the struct ng_ksocket_sockopt value field, which is the just the excess of the message argument portion over the length of the struct ng_ksocket_sockopt. */ static int ng_parse_sockoptval_getLength(const struct ng_parse_type *type, const u_char *start, const u_char *buf) { static const int offset = OFFSETOF(struct ng_ksocket_sockopt, value); const struct ng_ksocket_sockopt *sopt; const struct ng_mesg *msg; sopt = (const struct ng_ksocket_sockopt *)(buf - offset); msg = (const struct ng_mesg *)((const u_char *)sopt - sizeof(*msg)); return msg->header.arglen - sizeof(*sopt); } /* Parse type for the option value part of a struct ng_ksocket_sockopt XXX Eventually, we should handle the different socket options specially. XXX This would avoid byte order problems, eg an integer value of 1 is XXX going to be "[1]" for little endian or "[3=1]" for big endian. */ static const struct ng_parse_type ng_ksocket_sockoptval_type = { &ng_parse_bytearray_type, &ng_parse_sockoptval_getLength }; /* Parse type for struct ng_ksocket_sockopt */ static const struct ng_parse_struct_field ng_ksocket_sockopt_type_fields[] = NG_KSOCKET_SOCKOPT_INFO(&ng_ksocket_sockoptval_type); static const struct ng_parse_type ng_ksocket_sockopt_type = { &ng_parse_struct_type, &ng_ksocket_sockopt_type_fields }; /* Parse type for struct ng_ksocket_accept */ static const struct ng_parse_struct_field ng_ksocket_accept_type_fields[] = NGM_KSOCKET_ACCEPT_INFO; static const struct ng_parse_type ng_ksocket_accept_type = { &ng_parse_struct_type, &ng_ksocket_accept_type_fields }; /* List of commands and how to convert arguments to/from ASCII */ static const struct ng_cmdlist ng_ksocket_cmds[] = { { NGM_KSOCKET_COOKIE, NGM_KSOCKET_BIND, "bind", &ng_ksocket_sockaddr_type, NULL }, { NGM_KSOCKET_COOKIE, NGM_KSOCKET_LISTEN, "listen", &ng_parse_int32_type, NULL }, { NGM_KSOCKET_COOKIE, NGM_KSOCKET_ACCEPT, "accept", NULL, &ng_ksocket_accept_type }, { NGM_KSOCKET_COOKIE, NGM_KSOCKET_CONNECT, "connect", &ng_ksocket_sockaddr_type, &ng_parse_int32_type }, { NGM_KSOCKET_COOKIE, NGM_KSOCKET_GETNAME, "getname", NULL, &ng_ksocket_sockaddr_type }, { NGM_KSOCKET_COOKIE, NGM_KSOCKET_GETPEERNAME, "getpeername", NULL, &ng_ksocket_sockaddr_type }, { NGM_KSOCKET_COOKIE, NGM_KSOCKET_SETOPT, "setopt", &ng_ksocket_sockopt_type, NULL }, { NGM_KSOCKET_COOKIE, NGM_KSOCKET_GETOPT, "getopt", &ng_ksocket_sockopt_type, &ng_ksocket_sockopt_type }, { 0 } }; /* Node type descriptor */ static struct ng_type ng_ksocket_typestruct = { .version = NG_ABI_VERSION, .name = NG_KSOCKET_NODE_TYPE, .constructor = ng_ksocket_constructor, .rcvmsg = ng_ksocket_rcvmsg, .shutdown = ng_ksocket_shutdown, .newhook = ng_ksocket_newhook, .connect = ng_ksocket_connect, .rcvdata = ng_ksocket_rcvdata, .disconnect = ng_ksocket_disconnect, .cmdlist = ng_ksocket_cmds, }; NETGRAPH_INIT(ksocket, &ng_ksocket_typestruct); #define ERROUT(x) do { error = (x); goto done; } while (0) /************************************************************************ NETGRAPH NODE STUFF ************************************************************************/ /* * Node type constructor * The NODE part is assumed to be all set up. * There is already a reference to the node for us. */ static int ng_ksocket_constructor(node_p node) { priv_p priv; /* Allocate private structure */ priv = malloc(sizeof(*priv), M_NETGRAPH_KSOCKET, M_NOWAIT | M_ZERO); if (priv == NULL) return (ENOMEM); LIST_INIT(&priv->embryos); /* cross link them */ priv->node = node; NG_NODE_SET_PRIVATE(node, priv); /* Done */ return (0); } /* * Give our OK for a hook to be added. The hook name is of the * form "//" where the three components may * be decimal numbers or else aliases from the above lists. * * Connecting a hook amounts to opening the socket. Disconnecting * the hook closes the socket and destroys the node as well. */ static int ng_ksocket_newhook(node_p node, hook_p hook, const char *name0) { struct thread *td = curthread; /* XXX broken */ const priv_p priv = NG_NODE_PRIVATE(node); char *s1, *s2, name[NG_HOOKSIZ]; int family, type, protocol, error; /* Check if we're already connected */ if (priv->hook != NULL) return (EISCONN); if (priv->flags & KSF_CLONED) { if (priv->flags & KSF_EMBRYONIC) { /* Remove ourselves from our parent's embryo list */ LIST_REMOVE(priv, siblings); priv->flags &= ~KSF_EMBRYONIC; } } else { /* Extract family, type, and protocol from hook name */ snprintf(name, sizeof(name), "%s", name0); s1 = name; if ((s2 = strchr(s1, '/')) == NULL) return (EINVAL); *s2++ = '\0'; family = ng_ksocket_parse(ng_ksocket_families, s1, 0); if (family == -1) return (EINVAL); s1 = s2; if ((s2 = strchr(s1, '/')) == NULL) return (EINVAL); *s2++ = '\0'; type = ng_ksocket_parse(ng_ksocket_types, s1, 0); if (type == -1) return (EINVAL); s1 = s2; protocol = ng_ksocket_parse(ng_ksocket_protos, s1, family); if (protocol == -1) return (EINVAL); /* Create the socket */ error = socreate(family, &priv->so, type, protocol, td->td_ucred, td); if (error != 0) return (error); /* XXX call soreserve() ? */ } /* OK */ priv->hook = hook; /* * In case of misconfigured routing a packet may reenter * ksocket node recursively. Decouple stack to avoid possible * panics about sleeping with locks held. */ NG_HOOK_FORCE_QUEUE(hook); return(0); } static int ng_ksocket_connect(hook_p hook) { node_p node = NG_HOOK_NODE(hook); const priv_p priv = NG_NODE_PRIVATE(node); struct socket *const so = priv->so; /* Add our hook for incoming data and other events */ SOCKBUF_LOCK(&priv->so->so_rcv); soupcall_set(priv->so, SO_RCV, ng_ksocket_incoming, node); SOCKBUF_UNLOCK(&priv->so->so_rcv); SOCKBUF_LOCK(&priv->so->so_snd); soupcall_set(priv->so, SO_SND, ng_ksocket_incoming, node); SOCKBUF_UNLOCK(&priv->so->so_snd); SOCK_LOCK(priv->so); priv->so->so_state |= SS_NBIO; SOCK_UNLOCK(priv->so); /* * --Original comment-- * On a cloned socket we may have already received one or more * upcalls which we couldn't handle without a hook. Handle * those now. * We cannot call the upcall function directly * from here, because until this function has returned our * hook isn't connected. * * ---meta comment for -current --- * XXX This is dubius. * Upcalls between the time that the hook was * first created and now (on another processesor) will * be earlier on the queue than the request to finalise the hook. * By the time the hook is finalised, * The queued upcalls will have happened and the code * will have discarded them because of a lack of a hook. * (socket not open). * * This is a bad byproduct of the complicated way in which hooks * are now created (3 daisy chained async events). * * Since we are a netgraph operation * We know that we hold a lock on this node. This forces the * request we make below to be queued rather than implemented * immediately which will cause the upcall function to be called a bit * later. * However, as we will run any waiting queued operations immediately * after doing this one, if we have not finalised the other end * of the hook, those queued operations will fail. */ if (priv->flags & KSF_CLONED) { ng_send_fn(node, NULL, &ng_ksocket_incoming2, so, M_NOWAIT); } return (0); } /* * Receive a control message */ static int ng_ksocket_rcvmsg(node_p node, item_p item, hook_p lasthook) { struct thread *td = curthread; /* XXX broken */ const priv_p priv = NG_NODE_PRIVATE(node); struct socket *const so = priv->so; struct ng_mesg *resp = NULL; int error = 0; struct ng_mesg *msg; NGI_GET_MSG(item, msg); switch (msg->header.typecookie) { case NGM_KSOCKET_COOKIE: switch (msg->header.cmd) { case NGM_KSOCKET_BIND: { struct sockaddr *const sa = (struct sockaddr *)msg->data; /* Sanity check */ if (msg->header.arglen < SADATA_OFFSET || msg->header.arglen < sa->sa_len) ERROUT(EINVAL); if (so == NULL) ERROUT(ENXIO); /* Bind */ error = sobind(so, sa, td); break; } case NGM_KSOCKET_LISTEN: { /* Sanity check */ if (msg->header.arglen != sizeof(int32_t)) ERROUT(EINVAL); if (so == NULL) ERROUT(ENXIO); /* Listen */ so->so_state |= SS_NBIO; error = solisten(so, *((int32_t *)msg->data), td); break; } case NGM_KSOCKET_ACCEPT: { /* Sanity check */ if (msg->header.arglen != 0) ERROUT(EINVAL); if (so == NULL) ERROUT(ENXIO); /* Make sure the socket is capable of accepting */ if (!(so->so_options & SO_ACCEPTCONN)) ERROUT(EINVAL); if (priv->flags & KSF_ACCEPTING) ERROUT(EALREADY); /* * If a connection is already complete, take it. * Otherwise let the upcall function deal with * the connection when it comes in. */ error = ng_ksocket_accept(priv); if (error != 0 && error != EWOULDBLOCK) ERROUT(error); priv->response_token = msg->header.token; priv->response_addr = NGI_RETADDR(item); break; } case NGM_KSOCKET_CONNECT: { struct sockaddr *const sa = (struct sockaddr *)msg->data; /* Sanity check */ if (msg->header.arglen < SADATA_OFFSET || msg->header.arglen < sa->sa_len) ERROUT(EINVAL); if (so == NULL) ERROUT(ENXIO); /* Do connect */ if ((so->so_state & SS_ISCONNECTING) != 0) ERROUT(EALREADY); if ((error = soconnect(so, sa, td)) != 0) { so->so_state &= ~SS_ISCONNECTING; ERROUT(error); } if ((so->so_state & SS_ISCONNECTING) != 0) { /* We will notify the sender when we connect */ priv->response_token = msg->header.token; priv->response_addr = NGI_RETADDR(item); priv->flags |= KSF_CONNECTING; ERROUT(EINPROGRESS); } break; } case NGM_KSOCKET_GETNAME: case NGM_KSOCKET_GETPEERNAME: { int (*func)(struct socket *so, struct sockaddr **nam); struct sockaddr *sa = NULL; int len; /* Sanity check */ if (msg->header.arglen != 0) ERROUT(EINVAL); if (so == NULL) ERROUT(ENXIO); /* Get function */ if (msg->header.cmd == NGM_KSOCKET_GETPEERNAME) { if ((so->so_state & (SS_ISCONNECTED|SS_ISCONFIRMING)) == 0) ERROUT(ENOTCONN); func = so->so_proto->pr_peeraddr; } else func = so->so_proto->pr_sockaddr; /* Get local or peer address */ if ((error = (*func)(so, &sa)) != 0) goto bail; len = (sa == NULL) ? 0 : sa->sa_len; /* Send it back in a response */ NG_MKRESPONSE(resp, msg, len, M_NOWAIT); if (resp == NULL) { error = ENOMEM; goto bail; } bcopy(sa, resp->data, len); bail: /* Cleanup */ if (sa != NULL) free(sa, M_SONAME); break; } case NGM_KSOCKET_GETOPT: { struct ng_ksocket_sockopt *ksopt = (struct ng_ksocket_sockopt *)msg->data; struct sockopt sopt; /* Sanity check */ if (msg->header.arglen != sizeof(*ksopt)) ERROUT(EINVAL); if (so == NULL) ERROUT(ENXIO); /* Get response with room for option value */ NG_MKRESPONSE(resp, msg, sizeof(*ksopt) + NG_KSOCKET_MAX_OPTLEN, M_NOWAIT); if (resp == NULL) ERROUT(ENOMEM); /* Get socket option, and put value in the response */ sopt.sopt_dir = SOPT_GET; sopt.sopt_level = ksopt->level; sopt.sopt_name = ksopt->name; sopt.sopt_td = NULL; sopt.sopt_valsize = NG_KSOCKET_MAX_OPTLEN; ksopt = (struct ng_ksocket_sockopt *)resp->data; sopt.sopt_val = ksopt->value; if ((error = sogetopt(so, &sopt)) != 0) { NG_FREE_MSG(resp); break; } /* Set actual value length */ resp->header.arglen = sizeof(*ksopt) + sopt.sopt_valsize; break; } case NGM_KSOCKET_SETOPT: { struct ng_ksocket_sockopt *const ksopt = (struct ng_ksocket_sockopt *)msg->data; const int valsize = msg->header.arglen - sizeof(*ksopt); struct sockopt sopt; /* Sanity check */ if (valsize < 0) ERROUT(EINVAL); if (so == NULL) ERROUT(ENXIO); /* Set socket option */ sopt.sopt_dir = SOPT_SET; sopt.sopt_level = ksopt->level; sopt.sopt_name = ksopt->name; sopt.sopt_val = ksopt->value; sopt.sopt_valsize = valsize; sopt.sopt_td = NULL; error = sosetopt(so, &sopt); break; } default: error = EINVAL; break; } break; default: error = EINVAL; break; } done: NG_RESPOND_MSG(error, node, item, resp); NG_FREE_MSG(msg); return (error); } /* * Receive incoming data on our hook. Send it out the socket. */ static int ng_ksocket_rcvdata(hook_p hook, item_p item) { struct thread *td = curthread; /* XXX broken */ const node_p node = NG_HOOK_NODE(hook); const priv_p priv = NG_NODE_PRIVATE(node); struct socket *const so = priv->so; struct sockaddr *sa = NULL; int error; struct mbuf *m; #ifdef ALIGNED_POINTER struct mbuf *n; #endif /* ALIGNED_POINTER */ struct sa_tag *stag; /* Extract data */ NGI_GET_M(item, m); NG_FREE_ITEM(item); #ifdef ALIGNED_POINTER if (!ALIGNED_POINTER(mtod(m, caddr_t), uint32_t)) { n = m_defrag(m, M_NOWAIT); if (n == NULL) { m_freem(m); return (ENOBUFS); } m = n; } #endif /* ALIGNED_POINTER */ /* * Look if socket address is stored in packet tags. * If sockaddr is ours, or provided by a third party (zero id), * then we accept it. */ if (((stag = (struct sa_tag *)m_tag_locate(m, NGM_KSOCKET_COOKIE, NG_KSOCKET_TAG_SOCKADDR, NULL)) != NULL) && (stag->id == NG_NODE_ID(node) || stag->id == 0)) sa = &stag->sa; /* Reset specific mbuf flags to prevent addressing problems. */ m->m_flags &= ~(M_BCAST|M_MCAST); /* Send packet */ error = sosend(so, sa, 0, m, 0, 0, td); return (error); } /* * Destroy node */ static int ng_ksocket_shutdown(node_p node) { const priv_p priv = NG_NODE_PRIVATE(node); priv_p embryo; /* Close our socket (if any) */ if (priv->so != NULL) { SOCKBUF_LOCK(&priv->so->so_rcv); soupcall_clear(priv->so, SO_RCV); SOCKBUF_UNLOCK(&priv->so->so_rcv); SOCKBUF_LOCK(&priv->so->so_snd); soupcall_clear(priv->so, SO_SND); SOCKBUF_UNLOCK(&priv->so->so_snd); soclose(priv->so); priv->so = NULL; } /* If we are an embryo, take ourselves out of the parent's list */ if (priv->flags & KSF_EMBRYONIC) { LIST_REMOVE(priv, siblings); priv->flags &= ~KSF_EMBRYONIC; } /* Remove any embryonic children we have */ while (!LIST_EMPTY(&priv->embryos)) { embryo = LIST_FIRST(&priv->embryos); ng_rmnode_self(embryo->node); } /* Take down netgraph node */ bzero(priv, sizeof(*priv)); free(priv, M_NETGRAPH_KSOCKET); NG_NODE_SET_PRIVATE(node, NULL); NG_NODE_UNREF(node); /* let the node escape */ return (0); } /* * Hook disconnection */ static int ng_ksocket_disconnect(hook_p hook) { KASSERT(NG_NODE_NUMHOOKS(NG_HOOK_NODE(hook)) == 0, ("%s: numhooks=%d?", __func__, NG_NODE_NUMHOOKS(NG_HOOK_NODE(hook)))); if (NG_NODE_IS_VALID(NG_HOOK_NODE(hook))) ng_rmnode_self(NG_HOOK_NODE(hook)); return (0); } /************************************************************************ HELPER STUFF ************************************************************************/ /* * You should not "just call" a netgraph node function from an external * asynchronous event. This is because in doing so you are ignoring the * locking on the netgraph nodes. Instead call your function via ng_send_fn(). * This will call the function you chose, but will first do all the * locking rigmarole. Your function MAY only be called at some distant future * time (several millisecs away) so don't give it any arguments * that may be revoked soon (e.g. on your stack). * * To decouple stack, we use queue version of ng_send_fn(). */ static int ng_ksocket_incoming(struct socket *so, void *arg, int waitflag) { const node_p node = arg; const priv_p priv = NG_NODE_PRIVATE(node); int wait = ((waitflag & M_WAITOK) ? NG_WAITOK : 0) | NG_QUEUE; /* * Even if node is not locked, as soon as we are called, we assume * it exist and it's private area is valid. With some care we can * access it. Mark node that incoming event for it was sent to * avoid unneded queue trashing. */ if (atomic_cmpset_int(&priv->fn_sent, 0, 1) && ng_send_fn1(node, NULL, &ng_ksocket_incoming2, so, 0, wait)) { atomic_store_rel_int(&priv->fn_sent, 0); } return (SU_OK); } /* * When incoming data is appended to the socket, we get notified here. * This is also called whenever a significant event occurs for the socket. * Our original caller may have queued this even some time ago and * we cannot trust that he even still exists. The node however is being * held with a reference by the queueing code and guarantied to be valid. */ static void ng_ksocket_incoming2(node_p node, hook_p hook, void *arg1, int arg2) { struct socket *so = arg1; const priv_p priv = NG_NODE_PRIVATE(node); struct ng_mesg *response; int error; KASSERT(so == priv->so, ("%s: wrong socket", __func__)); /* Allow next incoming event to be queued. */ atomic_store_rel_int(&priv->fn_sent, 0); /* Check whether a pending connect operation has completed */ if (priv->flags & KSF_CONNECTING) { if ((error = so->so_error) != 0) { so->so_error = 0; so->so_state &= ~SS_ISCONNECTING; } if (!(so->so_state & SS_ISCONNECTING)) { NG_MKMESSAGE(response, NGM_KSOCKET_COOKIE, NGM_KSOCKET_CONNECT, sizeof(int32_t), M_NOWAIT); if (response != NULL) { response->header.flags |= NGF_RESP; response->header.token = priv->response_token; *(int32_t *)response->data = error; /* * send an async "response" message * to the node that set us up * (if it still exists) */ NG_SEND_MSG_ID(error, node, response, priv->response_addr, 0); } priv->flags &= ~KSF_CONNECTING; } } /* Check whether a pending accept operation has completed */ if (priv->flags & KSF_ACCEPTING) (void )ng_ksocket_accept(priv); /* * If we don't have a hook, we must handle data events later. When * the hook gets created and is connected, this upcall function * will be called again. */ if (priv->hook == NULL) return; /* Read and forward available mbufs. */ while (1) { struct uio uio; struct sockaddr *sa; struct mbuf *m; int flags; /* Try to get next packet from socket. */ uio.uio_td = NULL; uio.uio_resid = IP_MAXPACKET; flags = MSG_DONTWAIT; sa = NULL; if ((error = soreceive(so, (so->so_state & SS_ISCONNECTED) ? NULL : &sa, &uio, &m, NULL, &flags)) != 0) break; /* See if we got anything. */ if (flags & MSG_TRUNC) { m_freem(m); m = NULL; } if (m == NULL) { if (sa != NULL) free(sa, M_SONAME); break; } KASSERT(m->m_nextpkt == NULL, ("%s: nextpkt", __func__)); /* * Stream sockets do not have packet boundaries, so * we have to allocate a header mbuf and attach the * stream of data to it. */ if (so->so_type == SOCK_STREAM) { struct mbuf *mh; mh = m_gethdr(M_NOWAIT, MT_DATA); if (mh == NULL) { m_freem(m); if (sa != NULL) free(sa, M_SONAME); break; } mh->m_next = m; for (; m; m = m->m_next) mh->m_pkthdr.len += m->m_len; m = mh; } /* Put peer's socket address (if any) into a tag */ if (sa != NULL) { struct sa_tag *stag; stag = (struct sa_tag *)m_tag_alloc(NGM_KSOCKET_COOKIE, NG_KSOCKET_TAG_SOCKADDR, sizeof(ng_ID_t) + sa->sa_len, M_NOWAIT); if (stag == NULL) { free(sa, M_SONAME); goto sendit; } bcopy(sa, &stag->sa, sa->sa_len); free(sa, M_SONAME); stag->id = NG_NODE_ID(node); m_tag_prepend(m, &stag->tag); } sendit: /* Forward data with optional peer sockaddr as packet tag */ NG_SEND_DATA_ONLY(error, priv->hook, m); } /* * If the peer has closed the connection, forward a 0-length mbuf * to indicate end-of-file. */ if (so->so_rcv.sb_state & SBS_CANTRCVMORE && !(priv->flags & KSF_EOFSEEN)) { struct mbuf *m; m = m_gethdr(M_NOWAIT, MT_DATA); if (m != NULL) NG_SEND_DATA_ONLY(error, priv->hook, m); priv->flags |= KSF_EOFSEEN; } } static int ng_ksocket_accept(priv_p priv) { struct socket *const head = priv->so; struct socket *so; struct sockaddr *sa = NULL; struct ng_mesg *resp; struct ng_ksocket_accept *resp_data; node_p node; priv_p priv2; int len; int error; SOLISTEN_LOCK(head); error = solisten_dequeue(head, &so, SOCK_NONBLOCK); if (error == EWOULDBLOCK) { priv->flags |= KSF_ACCEPTING; return (error); } priv->flags &= ~KSF_ACCEPTING; if (error) return (error); if ((error = soaccept(so, &sa)) != 0) return (error); len = OFFSETOF(struct ng_ksocket_accept, addr); if (sa != NULL) len += sa->sa_len; NG_MKMESSAGE(resp, NGM_KSOCKET_COOKIE, NGM_KSOCKET_ACCEPT, len, M_NOWAIT); if (resp == NULL) { soclose(so); goto out; } resp->header.flags |= NGF_RESP; resp->header.token = priv->response_token; /* Clone a ksocket node to wrap the new socket */ error = ng_make_node_common(&ng_ksocket_typestruct, &node); if (error) { free(resp, M_NETGRAPH); soclose(so); goto out; } if (ng_ksocket_constructor(node) != 0) { NG_NODE_UNREF(node); free(resp, M_NETGRAPH); soclose(so); goto out; } priv2 = NG_NODE_PRIVATE(node); priv2->so = so; priv2->flags |= KSF_CLONED | KSF_EMBRYONIC; /* * Insert the cloned node into a list of embryonic children * on the parent node. When a hook is created on the cloned * node it will be removed from this list. When the parent * is destroyed it will destroy any embryonic children it has. */ LIST_INSERT_HEAD(&priv->embryos, priv2, siblings); SOCKBUF_LOCK(&so->so_rcv); soupcall_set(so, SO_RCV, ng_ksocket_incoming, node); SOCKBUF_UNLOCK(&so->so_rcv); SOCKBUF_LOCK(&so->so_snd); soupcall_set(so, SO_SND, ng_ksocket_incoming, node); SOCKBUF_UNLOCK(&so->so_snd); /* Fill in the response data and send it or return it to the caller */ resp_data = (struct ng_ksocket_accept *)resp->data; resp_data->nodeid = NG_NODE_ID(node); if (sa != NULL) bcopy(sa, &resp_data->addr, sa->sa_len); NG_SEND_MSG_ID(error, node, resp, priv->response_addr, 0); out: if (sa != NULL) free(sa, M_SONAME); return (0); } /* * Parse out either an integer value or an alias. */ static int ng_ksocket_parse(const struct ng_ksocket_alias *aliases, const char *s, int family) { int k, val; char *eptr; /* Try aliases */ for (k = 0; aliases[k].name != NULL; k++) { if (strcmp(s, aliases[k].name) == 0 && aliases[k].family == family) return aliases[k].value; } /* Try parsing as a number */ val = (int)strtoul(s, &eptr, 10); if (val < 0 || *eptr != '\0') return (-1); return (val); } diff --git a/sys/netinet/in_mcast.c b/sys/netinet/in_mcast.c index 3f25471f0858..87de83da7a6a 100644 --- a/sys/netinet/in_mcast.c +++ b/sys/netinet/in_mcast.c @@ -1,3040 +1,3032 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2007-2009 Bruce Simpson. * Copyright (c) 2005 Robert N. M. Watson. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * IPv4 multicast socket, group, and socket option processing module. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifndef KTR_IGMPV3 #define KTR_IGMPV3 KTR_INET #endif #ifndef __SOCKUNION_DECLARED union sockunion { struct sockaddr_storage ss; struct sockaddr sa; struct sockaddr_dl sdl; struct sockaddr_in sin; }; typedef union sockunion sockunion_t; #define __SOCKUNION_DECLARED #endif /* __SOCKUNION_DECLARED */ static MALLOC_DEFINE(M_INMFILTER, "in_mfilter", "IPv4 multicast PCB-layer source filter"); static MALLOC_DEFINE(M_IPMADDR, "in_multi", "IPv4 multicast group"); static MALLOC_DEFINE(M_IPMOPTS, "ip_moptions", "IPv4 multicast options"); static MALLOC_DEFINE(M_IPMSOURCE, "ip_msource", "IPv4 multicast IGMP-layer source filter"); /* * Locking: * * - Lock order is: Giant, IN_MULTI_LOCK, INP_WLOCK, * IN_MULTI_LIST_LOCK, IGMP_LOCK, IF_ADDR_LOCK. * - The IF_ADDR_LOCK is implicitly taken by inm_lookup() earlier, however * it can be taken by code in net/if.c also. * - ip_moptions and in_mfilter are covered by the INP_WLOCK. * * struct in_multi is covered by IN_MULTI_LIST_LOCK. There isn't strictly * any need for in_multi itself to be virtualized -- it is bound to an ifp * anyway no matter what happens. */ struct mtx in_multi_list_mtx; MTX_SYSINIT(in_multi_mtx, &in_multi_list_mtx, "in_multi_list_mtx", MTX_DEF); struct mtx in_multi_free_mtx; MTX_SYSINIT(in_multi_free_mtx, &in_multi_free_mtx, "in_multi_free_mtx", MTX_DEF); struct sx in_multi_sx; SX_SYSINIT(in_multi_sx, &in_multi_sx, "in_multi_sx"); int ifma_restart; /* * Functions with non-static linkage defined in this file should be * declared in in_var.h: * imo_multi_filter() * in_joingroup() * in_joingroup_locked() * in_leavegroup() * in_leavegroup_locked() * and ip_var.h: * inp_freemoptions() * inp_getmoptions() * inp_setmoptions() */ static void imf_commit(struct in_mfilter *); static int imf_get_source(struct in_mfilter *imf, const struct sockaddr_in *psin, struct in_msource **); static struct in_msource * imf_graft(struct in_mfilter *, const uint8_t, const struct sockaddr_in *); static void imf_leave(struct in_mfilter *); static int imf_prune(struct in_mfilter *, const struct sockaddr_in *); static void imf_purge(struct in_mfilter *); static void imf_rollback(struct in_mfilter *); static void imf_reap(struct in_mfilter *); static struct in_mfilter * imo_match_group(const struct ip_moptions *, const struct ifnet *, const struct sockaddr *); static struct in_msource * imo_match_source(struct in_mfilter *, const struct sockaddr *); static void ims_merge(struct ip_msource *ims, const struct in_msource *lims, const int rollback); static int in_getmulti(struct ifnet *, const struct in_addr *, struct in_multi **); static int inm_get_source(struct in_multi *inm, const in_addr_t haddr, const int noalloc, struct ip_msource **pims); #ifdef KTR static int inm_is_ifp_detached(const struct in_multi *); #endif static int inm_merge(struct in_multi *, /*const*/ struct in_mfilter *); static void inm_purge(struct in_multi *); static void inm_reap(struct in_multi *); static void inm_release(struct in_multi *); static struct ip_moptions * inp_findmoptions(struct inpcb *); static int inp_get_source_filters(struct inpcb *, struct sockopt *); static int inp_join_group(struct inpcb *, struct sockopt *); static int inp_leave_group(struct inpcb *, struct sockopt *); static struct ifnet * inp_lookup_mcast_ifp(const struct inpcb *, const struct sockaddr_in *, const struct in_addr); static int inp_block_unblock_source(struct inpcb *, struct sockopt *); static int inp_set_multicast_if(struct inpcb *, struct sockopt *); static int inp_set_source_filters(struct inpcb *, struct sockopt *); static int sysctl_ip_mcast_filters(SYSCTL_HANDLER_ARGS); static SYSCTL_NODE(_net_inet_ip, OID_AUTO, mcast, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "IPv4 multicast"); static u_long in_mcast_maxgrpsrc = IP_MAX_GROUP_SRC_FILTER; SYSCTL_ULONG(_net_inet_ip_mcast, OID_AUTO, maxgrpsrc, CTLFLAG_RWTUN, &in_mcast_maxgrpsrc, 0, "Max source filters per group"); static u_long in_mcast_maxsocksrc = IP_MAX_SOCK_SRC_FILTER; SYSCTL_ULONG(_net_inet_ip_mcast, OID_AUTO, maxsocksrc, CTLFLAG_RWTUN, &in_mcast_maxsocksrc, 0, "Max source filters per socket"); int in_mcast_loop = IP_DEFAULT_MULTICAST_LOOP; SYSCTL_INT(_net_inet_ip_mcast, OID_AUTO, loop, CTLFLAG_RWTUN, &in_mcast_loop, 0, "Loopback multicast datagrams by default"); static SYSCTL_NODE(_net_inet_ip_mcast, OID_AUTO, filters, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_ip_mcast_filters, "Per-interface stack-wide source filters"); #ifdef KTR /* * Inline function which wraps assertions for a valid ifp. * The ifnet layer will set the ifma's ifp pointer to NULL if the ifp * is detached. */ static int __inline inm_is_ifp_detached(const struct in_multi *inm) { struct ifnet *ifp; KASSERT(inm->inm_ifma != NULL, ("%s: no ifma", __func__)); ifp = inm->inm_ifma->ifma_ifp; if (ifp != NULL) { /* * Sanity check that netinet's notion of ifp is the * same as net's. */ KASSERT(inm->inm_ifp == ifp, ("%s: bad ifp", __func__)); } return (ifp == NULL); } #endif /* * Interface detach can happen in a taskqueue thread context, so we must use a * dedicated thread to avoid deadlocks when draining inm_release tasks. */ TASKQUEUE_DEFINE_THREAD(inm_free); static struct in_multi_head inm_free_list = SLIST_HEAD_INITIALIZER(); static void inm_release_task(void *arg __unused, int pending __unused); static struct task inm_free_task = TASK_INITIALIZER(0, inm_release_task, NULL); void inm_release_wait(void *arg __unused) { /* * Make sure all pending multicast addresses are freed before * the VNET or network device is destroyed: */ taskqueue_drain(taskqueue_inm_free, &inm_free_task); } #ifdef VIMAGE /* XXX-BZ FIXME, see D24914. */ VNET_SYSUNINIT(inm_release_wait, SI_SUB_PROTO_DOMAIN, SI_ORDER_FIRST, inm_release_wait, NULL); #endif void inm_release_list_deferred(struct in_multi_head *inmh) { if (SLIST_EMPTY(inmh)) return; mtx_lock(&in_multi_free_mtx); SLIST_CONCAT(&inm_free_list, inmh, in_multi, inm_nrele); mtx_unlock(&in_multi_free_mtx); taskqueue_enqueue(taskqueue_inm_free, &inm_free_task); } void inm_disconnect(struct in_multi *inm) { struct ifnet *ifp; struct ifmultiaddr *ifma, *ll_ifma; ifp = inm->inm_ifp; IF_ADDR_WLOCK_ASSERT(ifp); ifma = inm->inm_ifma; if_ref(ifp); if (ifma->ifma_flags & IFMA_F_ENQUEUED) { CK_STAILQ_REMOVE(&ifp->if_multiaddrs, ifma, ifmultiaddr, ifma_link); ifma->ifma_flags &= ~IFMA_F_ENQUEUED; } MCDPRINTF("removed ifma: %p from %s\n", ifma, ifp->if_xname); if ((ll_ifma = ifma->ifma_llifma) != NULL) { MPASS(ifma != ll_ifma); ifma->ifma_llifma = NULL; MPASS(ll_ifma->ifma_llifma == NULL); MPASS(ll_ifma->ifma_ifp == ifp); if (--ll_ifma->ifma_refcount == 0) { if (ll_ifma->ifma_flags & IFMA_F_ENQUEUED) { CK_STAILQ_REMOVE(&ifp->if_multiaddrs, ll_ifma, ifmultiaddr, ifma_link); ll_ifma->ifma_flags &= ~IFMA_F_ENQUEUED; } MCDPRINTF("removed ll_ifma: %p from %s\n", ll_ifma, ifp->if_xname); if_freemulti(ll_ifma); ifma_restart = true; } } } void inm_release_deferred(struct in_multi *inm) { struct in_multi_head tmp; IN_MULTI_LIST_LOCK_ASSERT(); MPASS(inm->inm_refcount > 0); if (--inm->inm_refcount == 0) { SLIST_INIT(&tmp); inm_disconnect(inm); inm->inm_ifma->ifma_protospec = NULL; SLIST_INSERT_HEAD(&tmp, inm, inm_nrele); inm_release_list_deferred(&tmp); } } static void inm_release_task(void *arg __unused, int pending __unused) { struct in_multi_head inm_free_tmp; struct in_multi *inm, *tinm; SLIST_INIT(&inm_free_tmp); mtx_lock(&in_multi_free_mtx); SLIST_CONCAT(&inm_free_tmp, &inm_free_list, in_multi, inm_nrele); mtx_unlock(&in_multi_free_mtx); IN_MULTI_LOCK(); SLIST_FOREACH_SAFE(inm, &inm_free_tmp, inm_nrele, tinm) { SLIST_REMOVE_HEAD(&inm_free_tmp, inm_nrele); MPASS(inm); inm_release(inm); } IN_MULTI_UNLOCK(); } /* * Initialize an in_mfilter structure to a known state at t0, t1 * with an empty source filter list. */ static __inline void imf_init(struct in_mfilter *imf, const int st0, const int st1) { memset(imf, 0, sizeof(struct in_mfilter)); RB_INIT(&imf->imf_sources); imf->imf_st[0] = st0; imf->imf_st[1] = st1; } struct in_mfilter * ip_mfilter_alloc(const int mflags, const int st0, const int st1) { struct in_mfilter *imf; imf = malloc(sizeof(*imf), M_INMFILTER, mflags); if (imf != NULL) imf_init(imf, st0, st1); return (imf); } void ip_mfilter_free(struct in_mfilter *imf) { imf_purge(imf); free(imf, M_INMFILTER); } /* * Function for looking up an in_multi record for an IPv4 multicast address * on a given interface. ifp must be valid. If no record found, return NULL. * The IN_MULTI_LIST_LOCK and IF_ADDR_LOCK on ifp must be held. */ struct in_multi * inm_lookup_locked(struct ifnet *ifp, const struct in_addr ina) { struct ifmultiaddr *ifma; struct in_multi *inm; IN_MULTI_LIST_LOCK_ASSERT(); IF_ADDR_LOCK_ASSERT(ifp); inm = NULL; CK_STAILQ_FOREACH(ifma, &((ifp)->if_multiaddrs), ifma_link) { if (ifma->ifma_addr->sa_family != AF_INET || ifma->ifma_protospec == NULL) continue; inm = (struct in_multi *)ifma->ifma_protospec; if (inm->inm_addr.s_addr == ina.s_addr) break; inm = NULL; } return (inm); } /* * Wrapper for inm_lookup_locked(). * The IF_ADDR_LOCK will be taken on ifp and released on return. */ struct in_multi * inm_lookup(struct ifnet *ifp, const struct in_addr ina) { struct epoch_tracker et; struct in_multi *inm; IN_MULTI_LIST_LOCK_ASSERT(); NET_EPOCH_ENTER(et); inm = inm_lookup_locked(ifp, ina); NET_EPOCH_EXIT(et); return (inm); } /* * Find an IPv4 multicast group entry for this ip_moptions instance * which matches the specified group, and optionally an interface. * Return its index into the array, or -1 if not found. */ static struct in_mfilter * imo_match_group(const struct ip_moptions *imo, const struct ifnet *ifp, const struct sockaddr *group) { const struct sockaddr_in *gsin; struct in_mfilter *imf; struct in_multi *inm; gsin = (const struct sockaddr_in *)group; IP_MFILTER_FOREACH(imf, &imo->imo_head) { inm = imf->imf_inm; if (inm == NULL) continue; if ((ifp == NULL || (inm->inm_ifp == ifp)) && in_hosteq(inm->inm_addr, gsin->sin_addr)) { break; } } return (imf); } /* * Find an IPv4 multicast source entry for this imo which matches * the given group index for this socket, and source address. * * NOTE: This does not check if the entry is in-mode, merely if * it exists, which may not be the desired behaviour. */ static struct in_msource * imo_match_source(struct in_mfilter *imf, const struct sockaddr *src) { struct ip_msource find; struct ip_msource *ims; const sockunion_t *psa; KASSERT(src->sa_family == AF_INET, ("%s: !AF_INET", __func__)); /* Source trees are keyed in host byte order. */ psa = (const sockunion_t *)src; find.ims_haddr = ntohl(psa->sin.sin_addr.s_addr); ims = RB_FIND(ip_msource_tree, &imf->imf_sources, &find); return ((struct in_msource *)ims); } /* * Perform filtering for multicast datagrams on a socket by group and source. * * Returns 0 if a datagram should be allowed through, or various error codes * if the socket was not a member of the group, or the source was muted, etc. */ int imo_multi_filter(const struct ip_moptions *imo, const struct ifnet *ifp, const struct sockaddr *group, const struct sockaddr *src) { struct in_mfilter *imf; struct in_msource *ims; int mode; KASSERT(ifp != NULL, ("%s: null ifp", __func__)); imf = imo_match_group(imo, ifp, group); if (imf == NULL) return (MCAST_NOTGMEMBER); /* * Check if the source was included in an (S,G) join. * Allow reception on exclusive memberships by default, * reject reception on inclusive memberships by default. * Exclude source only if an in-mode exclude filter exists. * Include source only if an in-mode include filter exists. * NOTE: We are comparing group state here at IGMP t1 (now) * with socket-layer t0 (since last downcall). */ mode = imf->imf_st[1]; ims = imo_match_source(imf, src); if ((ims == NULL && mode == MCAST_INCLUDE) || (ims != NULL && ims->imsl_st[0] != mode)) return (MCAST_NOTSMEMBER); return (MCAST_PASS); } /* * Find and return a reference to an in_multi record for (ifp, group), * and bump its reference count. * If one does not exist, try to allocate it, and update link-layer multicast * filters on ifp to listen for group. * Assumes the IN_MULTI lock is held across the call. * Return 0 if successful, otherwise return an appropriate error code. */ static int in_getmulti(struct ifnet *ifp, const struct in_addr *group, struct in_multi **pinm) { struct sockaddr_in gsin; struct ifmultiaddr *ifma; struct in_ifinfo *ii; struct in_multi *inm; int error; IN_MULTI_LOCK_ASSERT(); ii = (struct in_ifinfo *)ifp->if_afdata[AF_INET]; IN_MULTI_LIST_LOCK(); inm = inm_lookup(ifp, *group); if (inm != NULL) { /* * If we already joined this group, just bump the * refcount and return it. */ KASSERT(inm->inm_refcount >= 1, ("%s: bad refcount %d", __func__, inm->inm_refcount)); inm_acquire_locked(inm); *pinm = inm; } IN_MULTI_LIST_UNLOCK(); if (inm != NULL) return (0); memset(&gsin, 0, sizeof(gsin)); gsin.sin_family = AF_INET; gsin.sin_len = sizeof(struct sockaddr_in); gsin.sin_addr = *group; /* * Check if a link-layer group is already associated * with this network-layer group on the given ifnet. */ error = if_addmulti(ifp, (struct sockaddr *)&gsin, &ifma); if (error != 0) return (error); /* XXX ifma_protospec must be covered by IF_ADDR_LOCK */ IN_MULTI_LIST_LOCK(); IF_ADDR_WLOCK(ifp); /* * If something other than netinet is occupying the link-layer * group, print a meaningful error message and back out of * the allocation. * Otherwise, bump the refcount on the existing network-layer * group association and return it. */ if (ifma->ifma_protospec != NULL) { inm = (struct in_multi *)ifma->ifma_protospec; #ifdef INVARIANTS KASSERT(ifma->ifma_addr != NULL, ("%s: no ifma_addr", __func__)); KASSERT(ifma->ifma_addr->sa_family == AF_INET, ("%s: ifma not AF_INET", __func__)); KASSERT(inm != NULL, ("%s: no ifma_protospec", __func__)); if (inm->inm_ifma != ifma || inm->inm_ifp != ifp || !in_hosteq(inm->inm_addr, *group)) { char addrbuf[INET_ADDRSTRLEN]; panic("%s: ifma %p is inconsistent with %p (%s)", __func__, ifma, inm, inet_ntoa_r(*group, addrbuf)); } #endif inm_acquire_locked(inm); *pinm = inm; goto out_locked; } IF_ADDR_WLOCK_ASSERT(ifp); /* * A new in_multi record is needed; allocate and initialize it. * We DO NOT perform an IGMP join as the in_ layer may need to * push an initial source list down to IGMP to support SSM. * * The initial source filter state is INCLUDE, {} as per the RFC. */ inm = malloc(sizeof(*inm), M_IPMADDR, M_NOWAIT | M_ZERO); if (inm == NULL) { IF_ADDR_WUNLOCK(ifp); IN_MULTI_LIST_UNLOCK(); if_delmulti_ifma(ifma); return (ENOMEM); } inm->inm_addr = *group; inm->inm_ifp = ifp; inm->inm_igi = ii->ii_igmp; inm->inm_ifma = ifma; inm->inm_refcount = 1; inm->inm_state = IGMP_NOT_MEMBER; mbufq_init(&inm->inm_scq, IGMP_MAX_STATE_CHANGES); inm->inm_st[0].iss_fmode = MCAST_UNDEFINED; inm->inm_st[1].iss_fmode = MCAST_UNDEFINED; RB_INIT(&inm->inm_srcs); ifma->ifma_protospec = inm; *pinm = inm; out_locked: IF_ADDR_WUNLOCK(ifp); IN_MULTI_LIST_UNLOCK(); return (0); } /* * Drop a reference to an in_multi record. * * If the refcount drops to 0, free the in_multi record and * delete the underlying link-layer membership. */ static void inm_release(struct in_multi *inm) { struct ifmultiaddr *ifma; struct ifnet *ifp; CTR2(KTR_IGMPV3, "%s: refcount is %d", __func__, inm->inm_refcount); MPASS(inm->inm_refcount == 0); CTR2(KTR_IGMPV3, "%s: freeing inm %p", __func__, inm); ifma = inm->inm_ifma; ifp = inm->inm_ifp; /* XXX this access is not covered by IF_ADDR_LOCK */ CTR2(KTR_IGMPV3, "%s: purging ifma %p", __func__, ifma); if (ifp != NULL) { CURVNET_SET(ifp->if_vnet); inm_purge(inm); free(inm, M_IPMADDR); if_delmulti_ifma_flags(ifma, 1); CURVNET_RESTORE(); if_rele(ifp); } else { inm_purge(inm); free(inm, M_IPMADDR); if_delmulti_ifma_flags(ifma, 1); } } /* * Clear recorded source entries for a group. * Used by the IGMP code. Caller must hold the IN_MULTI lock. * FIXME: Should reap. */ void inm_clear_recorded(struct in_multi *inm) { struct ip_msource *ims; IN_MULTI_LIST_LOCK_ASSERT(); RB_FOREACH(ims, ip_msource_tree, &inm->inm_srcs) { if (ims->ims_stp) { ims->ims_stp = 0; --inm->inm_st[1].iss_rec; } } KASSERT(inm->inm_st[1].iss_rec == 0, ("%s: iss_rec %d not 0", __func__, inm->inm_st[1].iss_rec)); } /* * Record a source as pending for a Source-Group IGMPv3 query. * This lives here as it modifies the shared tree. * * inm is the group descriptor. * naddr is the address of the source to record in network-byte order. * * If the net.inet.igmp.sgalloc sysctl is non-zero, we will * lazy-allocate a source node in response to an SG query. * Otherwise, no allocation is performed. This saves some memory * with the trade-off that the source will not be reported to the * router if joined in the window between the query response and * the group actually being joined on the local host. * * VIMAGE: XXX: Currently the igmp_sgalloc feature has been removed. * This turns off the allocation of a recorded source entry if * the group has not been joined. * * Return 0 if the source didn't exist or was already marked as recorded. * Return 1 if the source was marked as recorded by this function. * Return <0 if any error occurred (negated errno code). */ int inm_record_source(struct in_multi *inm, const in_addr_t naddr) { struct ip_msource find; struct ip_msource *ims, *nims; IN_MULTI_LIST_LOCK_ASSERT(); find.ims_haddr = ntohl(naddr); ims = RB_FIND(ip_msource_tree, &inm->inm_srcs, &find); if (ims && ims->ims_stp) return (0); if (ims == NULL) { if (inm->inm_nsrc == in_mcast_maxgrpsrc) return (-ENOSPC); nims = malloc(sizeof(struct ip_msource), M_IPMSOURCE, M_NOWAIT | M_ZERO); if (nims == NULL) return (-ENOMEM); nims->ims_haddr = find.ims_haddr; RB_INSERT(ip_msource_tree, &inm->inm_srcs, nims); ++inm->inm_nsrc; ims = nims; } /* * Mark the source as recorded and update the recorded * source count. */ ++ims->ims_stp; ++inm->inm_st[1].iss_rec; return (1); } /* * Return a pointer to an in_msource owned by an in_mfilter, * given its source address. * Lazy-allocate if needed. If this is a new entry its filter state is * undefined at t0. * * imf is the filter set being modified. * haddr is the source address in *host* byte-order. * * SMPng: May be called with locks held; malloc must not block. */ static int imf_get_source(struct in_mfilter *imf, const struct sockaddr_in *psin, struct in_msource **plims) { struct ip_msource find; struct ip_msource *ims, *nims; struct in_msource *lims; int error; error = 0; ims = NULL; lims = NULL; /* key is host byte order */ find.ims_haddr = ntohl(psin->sin_addr.s_addr); ims = RB_FIND(ip_msource_tree, &imf->imf_sources, &find); lims = (struct in_msource *)ims; if (lims == NULL) { if (imf->imf_nsrc == in_mcast_maxsocksrc) return (ENOSPC); nims = malloc(sizeof(struct in_msource), M_INMFILTER, M_NOWAIT | M_ZERO); if (nims == NULL) return (ENOMEM); lims = (struct in_msource *)nims; lims->ims_haddr = find.ims_haddr; lims->imsl_st[0] = MCAST_UNDEFINED; RB_INSERT(ip_msource_tree, &imf->imf_sources, nims); ++imf->imf_nsrc; } *plims = lims; return (error); } /* * Graft a source entry into an existing socket-layer filter set, * maintaining any required invariants and checking allocations. * * The source is marked as being in the new filter mode at t1. * * Return the pointer to the new node, otherwise return NULL. */ static struct in_msource * imf_graft(struct in_mfilter *imf, const uint8_t st1, const struct sockaddr_in *psin) { struct ip_msource *nims; struct in_msource *lims; nims = malloc(sizeof(struct in_msource), M_INMFILTER, M_NOWAIT | M_ZERO); if (nims == NULL) return (NULL); lims = (struct in_msource *)nims; lims->ims_haddr = ntohl(psin->sin_addr.s_addr); lims->imsl_st[0] = MCAST_UNDEFINED; lims->imsl_st[1] = st1; RB_INSERT(ip_msource_tree, &imf->imf_sources, nims); ++imf->imf_nsrc; return (lims); } /* * Prune a source entry from an existing socket-layer filter set, * maintaining any required invariants and checking allocations. * * The source is marked as being left at t1, it is not freed. * * Return 0 if no error occurred, otherwise return an errno value. */ static int imf_prune(struct in_mfilter *imf, const struct sockaddr_in *psin) { struct ip_msource find; struct ip_msource *ims; struct in_msource *lims; /* key is host byte order */ find.ims_haddr = ntohl(psin->sin_addr.s_addr); ims = RB_FIND(ip_msource_tree, &imf->imf_sources, &find); if (ims == NULL) return (ENOENT); lims = (struct in_msource *)ims; lims->imsl_st[1] = MCAST_UNDEFINED; return (0); } /* * Revert socket-layer filter set deltas at t1 to t0 state. */ static void imf_rollback(struct in_mfilter *imf) { struct ip_msource *ims, *tims; struct in_msource *lims; RB_FOREACH_SAFE(ims, ip_msource_tree, &imf->imf_sources, tims) { lims = (struct in_msource *)ims; if (lims->imsl_st[0] == lims->imsl_st[1]) { /* no change at t1 */ continue; } else if (lims->imsl_st[0] != MCAST_UNDEFINED) { /* revert change to existing source at t1 */ lims->imsl_st[1] = lims->imsl_st[0]; } else { /* revert source added t1 */ CTR2(KTR_IGMPV3, "%s: free ims %p", __func__, ims); RB_REMOVE(ip_msource_tree, &imf->imf_sources, ims); free(ims, M_INMFILTER); imf->imf_nsrc--; } } imf->imf_st[1] = imf->imf_st[0]; } /* * Mark socket-layer filter set as INCLUDE {} at t1. */ static void imf_leave(struct in_mfilter *imf) { struct ip_msource *ims; struct in_msource *lims; RB_FOREACH(ims, ip_msource_tree, &imf->imf_sources) { lims = (struct in_msource *)ims; lims->imsl_st[1] = MCAST_UNDEFINED; } imf->imf_st[1] = MCAST_INCLUDE; } /* * Mark socket-layer filter set deltas as committed. */ static void imf_commit(struct in_mfilter *imf) { struct ip_msource *ims; struct in_msource *lims; RB_FOREACH(ims, ip_msource_tree, &imf->imf_sources) { lims = (struct in_msource *)ims; lims->imsl_st[0] = lims->imsl_st[1]; } imf->imf_st[0] = imf->imf_st[1]; } /* * Reap unreferenced sources from socket-layer filter set. */ static void imf_reap(struct in_mfilter *imf) { struct ip_msource *ims, *tims; struct in_msource *lims; RB_FOREACH_SAFE(ims, ip_msource_tree, &imf->imf_sources, tims) { lims = (struct in_msource *)ims; if ((lims->imsl_st[0] == MCAST_UNDEFINED) && (lims->imsl_st[1] == MCAST_UNDEFINED)) { CTR2(KTR_IGMPV3, "%s: free lims %p", __func__, ims); RB_REMOVE(ip_msource_tree, &imf->imf_sources, ims); free(ims, M_INMFILTER); imf->imf_nsrc--; } } } /* * Purge socket-layer filter set. */ static void imf_purge(struct in_mfilter *imf) { struct ip_msource *ims, *tims; RB_FOREACH_SAFE(ims, ip_msource_tree, &imf->imf_sources, tims) { CTR2(KTR_IGMPV3, "%s: free ims %p", __func__, ims); RB_REMOVE(ip_msource_tree, &imf->imf_sources, ims); free(ims, M_INMFILTER); imf->imf_nsrc--; } imf->imf_st[0] = imf->imf_st[1] = MCAST_UNDEFINED; KASSERT(RB_EMPTY(&imf->imf_sources), ("%s: imf_sources not empty", __func__)); } /* * Look up a source filter entry for a multicast group. * * inm is the group descriptor to work with. * haddr is the host-byte-order IPv4 address to look up. * noalloc may be non-zero to suppress allocation of sources. * *pims will be set to the address of the retrieved or allocated source. * * SMPng: NOTE: may be called with locks held. * Return 0 if successful, otherwise return a non-zero error code. */ static int inm_get_source(struct in_multi *inm, const in_addr_t haddr, const int noalloc, struct ip_msource **pims) { struct ip_msource find; struct ip_msource *ims, *nims; find.ims_haddr = haddr; ims = RB_FIND(ip_msource_tree, &inm->inm_srcs, &find); if (ims == NULL && !noalloc) { if (inm->inm_nsrc == in_mcast_maxgrpsrc) return (ENOSPC); nims = malloc(sizeof(struct ip_msource), M_IPMSOURCE, M_NOWAIT | M_ZERO); if (nims == NULL) return (ENOMEM); nims->ims_haddr = haddr; RB_INSERT(ip_msource_tree, &inm->inm_srcs, nims); ++inm->inm_nsrc; ims = nims; #ifdef KTR CTR3(KTR_IGMPV3, "%s: allocated 0x%08x as %p", __func__, haddr, ims); #endif } *pims = ims; return (0); } /* * Merge socket-layer source into IGMP-layer source. * If rollback is non-zero, perform the inverse of the merge. */ static void ims_merge(struct ip_msource *ims, const struct in_msource *lims, const int rollback) { int n = rollback ? -1 : 1; if (lims->imsl_st[0] == MCAST_EXCLUDE) { CTR3(KTR_IGMPV3, "%s: t1 ex -= %d on 0x%08x", __func__, n, ims->ims_haddr); ims->ims_st[1].ex -= n; } else if (lims->imsl_st[0] == MCAST_INCLUDE) { CTR3(KTR_IGMPV3, "%s: t1 in -= %d on 0x%08x", __func__, n, ims->ims_haddr); ims->ims_st[1].in -= n; } if (lims->imsl_st[1] == MCAST_EXCLUDE) { CTR3(KTR_IGMPV3, "%s: t1 ex += %d on 0x%08x", __func__, n, ims->ims_haddr); ims->ims_st[1].ex += n; } else if (lims->imsl_st[1] == MCAST_INCLUDE) { CTR3(KTR_IGMPV3, "%s: t1 in += %d on 0x%08x", __func__, n, ims->ims_haddr); ims->ims_st[1].in += n; } } /* * Atomically update the global in_multi state, when a membership's * filter list is being updated in any way. * * imf is the per-inpcb-membership group filter pointer. * A fake imf may be passed for in-kernel consumers. * * XXX This is a candidate for a set-symmetric-difference style loop * which would eliminate the repeated lookup from root of ims nodes, * as they share the same key space. * * If any error occurred this function will back out of refcounts * and return a non-zero value. */ static int inm_merge(struct in_multi *inm, /*const*/ struct in_mfilter *imf) { struct ip_msource *ims, *nims; struct in_msource *lims; int schanged, error; int nsrc0, nsrc1; schanged = 0; error = 0; nsrc1 = nsrc0 = 0; IN_MULTI_LIST_LOCK_ASSERT(); /* * Update the source filters first, as this may fail. * Maintain count of in-mode filters at t0, t1. These are * used to work out if we transition into ASM mode or not. * Maintain a count of source filters whose state was * actually modified by this operation. */ RB_FOREACH(ims, ip_msource_tree, &imf->imf_sources) { lims = (struct in_msource *)ims; if (lims->imsl_st[0] == imf->imf_st[0]) nsrc0++; if (lims->imsl_st[1] == imf->imf_st[1]) nsrc1++; if (lims->imsl_st[0] == lims->imsl_st[1]) continue; error = inm_get_source(inm, lims->ims_haddr, 0, &nims); ++schanged; if (error) break; ims_merge(nims, lims, 0); } if (error) { struct ip_msource *bims; RB_FOREACH_REVERSE_FROM(ims, ip_msource_tree, nims) { lims = (struct in_msource *)ims; if (lims->imsl_st[0] == lims->imsl_st[1]) continue; (void)inm_get_source(inm, lims->ims_haddr, 1, &bims); if (bims == NULL) continue; ims_merge(bims, lims, 1); } goto out_reap; } CTR3(KTR_IGMPV3, "%s: imf filters in-mode: %d at t0, %d at t1", __func__, nsrc0, nsrc1); /* Handle transition between INCLUDE {n} and INCLUDE {} on socket. */ if (imf->imf_st[0] == imf->imf_st[1] && imf->imf_st[1] == MCAST_INCLUDE) { if (nsrc1 == 0) { CTR1(KTR_IGMPV3, "%s: --in on inm at t1", __func__); --inm->inm_st[1].iss_in; } } /* Handle filter mode transition on socket. */ if (imf->imf_st[0] != imf->imf_st[1]) { CTR3(KTR_IGMPV3, "%s: imf transition %d to %d", __func__, imf->imf_st[0], imf->imf_st[1]); if (imf->imf_st[0] == MCAST_EXCLUDE) { CTR1(KTR_IGMPV3, "%s: --ex on inm at t1", __func__); --inm->inm_st[1].iss_ex; } else if (imf->imf_st[0] == MCAST_INCLUDE) { CTR1(KTR_IGMPV3, "%s: --in on inm at t1", __func__); --inm->inm_st[1].iss_in; } if (imf->imf_st[1] == MCAST_EXCLUDE) { CTR1(KTR_IGMPV3, "%s: ex++ on inm at t1", __func__); inm->inm_st[1].iss_ex++; } else if (imf->imf_st[1] == MCAST_INCLUDE && nsrc1 > 0) { CTR1(KTR_IGMPV3, "%s: in++ on inm at t1", __func__); inm->inm_st[1].iss_in++; } } /* * Track inm filter state in terms of listener counts. * If there are any exclusive listeners, stack-wide * membership is exclusive. * Otherwise, if only inclusive listeners, stack-wide is inclusive. * If no listeners remain, state is undefined at t1, * and the IGMP lifecycle for this group should finish. */ if (inm->inm_st[1].iss_ex > 0) { CTR1(KTR_IGMPV3, "%s: transition to EX", __func__); inm->inm_st[1].iss_fmode = MCAST_EXCLUDE; } else if (inm->inm_st[1].iss_in > 0) { CTR1(KTR_IGMPV3, "%s: transition to IN", __func__); inm->inm_st[1].iss_fmode = MCAST_INCLUDE; } else { CTR1(KTR_IGMPV3, "%s: transition to UNDEF", __func__); inm->inm_st[1].iss_fmode = MCAST_UNDEFINED; } /* Decrement ASM listener count on transition out of ASM mode. */ if (imf->imf_st[0] == MCAST_EXCLUDE && nsrc0 == 0) { if ((imf->imf_st[1] != MCAST_EXCLUDE) || (imf->imf_st[1] == MCAST_EXCLUDE && nsrc1 > 0)) { CTR1(KTR_IGMPV3, "%s: --asm on inm at t1", __func__); --inm->inm_st[1].iss_asm; } } /* Increment ASM listener count on transition to ASM mode. */ if (imf->imf_st[1] == MCAST_EXCLUDE && nsrc1 == 0) { CTR1(KTR_IGMPV3, "%s: asm++ on inm at t1", __func__); inm->inm_st[1].iss_asm++; } CTR3(KTR_IGMPV3, "%s: merged imf %p to inm %p", __func__, imf, inm); inm_print(inm); out_reap: if (schanged > 0) { CTR1(KTR_IGMPV3, "%s: sources changed; reaping", __func__); inm_reap(inm); } return (error); } /* * Mark an in_multi's filter set deltas as committed. * Called by IGMP after a state change has been enqueued. */ void inm_commit(struct in_multi *inm) { struct ip_msource *ims; CTR2(KTR_IGMPV3, "%s: commit inm %p", __func__, inm); CTR1(KTR_IGMPV3, "%s: pre commit:", __func__); inm_print(inm); RB_FOREACH(ims, ip_msource_tree, &inm->inm_srcs) { ims->ims_st[0] = ims->ims_st[1]; } inm->inm_st[0] = inm->inm_st[1]; } /* * Reap unreferenced nodes from an in_multi's filter set. */ static void inm_reap(struct in_multi *inm) { struct ip_msource *ims, *tims; RB_FOREACH_SAFE(ims, ip_msource_tree, &inm->inm_srcs, tims) { if (ims->ims_st[0].ex > 0 || ims->ims_st[0].in > 0 || ims->ims_st[1].ex > 0 || ims->ims_st[1].in > 0 || ims->ims_stp != 0) continue; CTR2(KTR_IGMPV3, "%s: free ims %p", __func__, ims); RB_REMOVE(ip_msource_tree, &inm->inm_srcs, ims); free(ims, M_IPMSOURCE); inm->inm_nsrc--; } } /* * Purge all source nodes from an in_multi's filter set. */ static void inm_purge(struct in_multi *inm) { struct ip_msource *ims, *tims; RB_FOREACH_SAFE(ims, ip_msource_tree, &inm->inm_srcs, tims) { CTR2(KTR_IGMPV3, "%s: free ims %p", __func__, ims); RB_REMOVE(ip_msource_tree, &inm->inm_srcs, ims); free(ims, M_IPMSOURCE); inm->inm_nsrc--; } } /* * Join a multicast group; unlocked entry point. * * SMPng: XXX: in_joingroup() is called from in_control() when Giant * is not held. Fortunately, ifp is unlikely to have been detached * at this point, so we assume it's OK to recurse. */ int in_joingroup(struct ifnet *ifp, const struct in_addr *gina, /*const*/ struct in_mfilter *imf, struct in_multi **pinm) { int error; IN_MULTI_LOCK(); error = in_joingroup_locked(ifp, gina, imf, pinm); IN_MULTI_UNLOCK(); return (error); } /* * Join a multicast group; real entry point. * * Only preserves atomicity at inm level. * NOTE: imf argument cannot be const due to sys/tree.h limitations. * * If the IGMP downcall fails, the group is not joined, and an error * code is returned. */ int in_joingroup_locked(struct ifnet *ifp, const struct in_addr *gina, /*const*/ struct in_mfilter *imf, struct in_multi **pinm) { struct in_mfilter timf; struct in_multi *inm; int error; IN_MULTI_LOCK_ASSERT(); IN_MULTI_LIST_UNLOCK_ASSERT(); CTR4(KTR_IGMPV3, "%s: join 0x%08x on %p(%s))", __func__, ntohl(gina->s_addr), ifp, ifp->if_xname); error = 0; inm = NULL; /* * If no imf was specified (i.e. kernel consumer), * fake one up and assume it is an ASM join. */ if (imf == NULL) { imf_init(&timf, MCAST_UNDEFINED, MCAST_EXCLUDE); imf = &timf; } error = in_getmulti(ifp, gina, &inm); if (error) { CTR1(KTR_IGMPV3, "%s: in_getmulti() failure", __func__); return (error); } IN_MULTI_LIST_LOCK(); CTR1(KTR_IGMPV3, "%s: merge inm state", __func__); error = inm_merge(inm, imf); if (error) { CTR1(KTR_IGMPV3, "%s: failed to merge inm state", __func__); goto out_inm_release; } CTR1(KTR_IGMPV3, "%s: doing igmp downcall", __func__); error = igmp_change_state(inm); if (error) { CTR1(KTR_IGMPV3, "%s: failed to update source", __func__); goto out_inm_release; } out_inm_release: if (error) { CTR2(KTR_IGMPV3, "%s: dropping ref on %p", __func__, inm); IF_ADDR_WLOCK(ifp); inm_release_deferred(inm); IF_ADDR_WUNLOCK(ifp); } else { *pinm = inm; } IN_MULTI_LIST_UNLOCK(); return (error); } /* * Leave a multicast group; unlocked entry point. */ int in_leavegroup(struct in_multi *inm, /*const*/ struct in_mfilter *imf) { int error; IN_MULTI_LOCK(); error = in_leavegroup_locked(inm, imf); IN_MULTI_UNLOCK(); return (error); } /* * Leave a multicast group; real entry point. * All source filters will be expunged. * * Only preserves atomicity at inm level. * * Holding the write lock for the INP which contains imf * is highly advisable. We can't assert for it as imf does not * contain a back-pointer to the owning inp. * * Note: This is not the same as inm_release(*) as this function also * makes a state change downcall into IGMP. */ int in_leavegroup_locked(struct in_multi *inm, /*const*/ struct in_mfilter *imf) { struct in_mfilter timf; int error; IN_MULTI_LOCK_ASSERT(); IN_MULTI_LIST_UNLOCK_ASSERT(); error = 0; CTR5(KTR_IGMPV3, "%s: leave inm %p, 0x%08x/%s, imf %p", __func__, inm, ntohl(inm->inm_addr.s_addr), (inm_is_ifp_detached(inm) ? "null" : inm->inm_ifp->if_xname), imf); /* * If no imf was specified (i.e. kernel consumer), * fake one up and assume it is an ASM join. */ if (imf == NULL) { imf_init(&timf, MCAST_EXCLUDE, MCAST_UNDEFINED); imf = &timf; } /* * Begin state merge transaction at IGMP layer. * * As this particular invocation should not cause any memory * to be allocated, and there is no opportunity to roll back * the transaction, it MUST NOT fail. */ CTR1(KTR_IGMPV3, "%s: merge inm state", __func__); IN_MULTI_LIST_LOCK(); error = inm_merge(inm, imf); KASSERT(error == 0, ("%s: failed to merge inm state", __func__)); CTR1(KTR_IGMPV3, "%s: doing igmp downcall", __func__); CURVNET_SET(inm->inm_ifp->if_vnet); error = igmp_change_state(inm); IF_ADDR_WLOCK(inm->inm_ifp); inm_release_deferred(inm); IF_ADDR_WUNLOCK(inm->inm_ifp); IN_MULTI_LIST_UNLOCK(); CURVNET_RESTORE(); if (error) CTR1(KTR_IGMPV3, "%s: failed igmp downcall", __func__); CTR2(KTR_IGMPV3, "%s: dropping ref on %p", __func__, inm); return (error); } /*#ifndef BURN_BRIDGES*/ /* * Block or unblock an ASM multicast source on an inpcb. * This implements the delta-based API described in RFC 3678. * * The delta-based API applies only to exclusive-mode memberships. * An IGMP downcall will be performed. * * SMPng: NOTE: Must take Giant as a join may create a new ifma. * * Return 0 if successful, otherwise return an appropriate error code. */ static int inp_block_unblock_source(struct inpcb *inp, struct sockopt *sopt) { struct epoch_tracker et; struct group_source_req gsr; sockunion_t *gsa, *ssa; struct ifnet *ifp; struct in_mfilter *imf; struct ip_moptions *imo; struct in_msource *ims; struct in_multi *inm; uint16_t fmode; int error, doblock; ifp = NULL; error = 0; doblock = 0; memset(&gsr, 0, sizeof(struct group_source_req)); gsa = (sockunion_t *)&gsr.gsr_group; ssa = (sockunion_t *)&gsr.gsr_source; switch (sopt->sopt_name) { case IP_BLOCK_SOURCE: case IP_UNBLOCK_SOURCE: { struct ip_mreq_source mreqs; error = sooptcopyin(sopt, &mreqs, sizeof(struct ip_mreq_source), sizeof(struct ip_mreq_source)); if (error) return (error); gsa->sin.sin_family = AF_INET; gsa->sin.sin_len = sizeof(struct sockaddr_in); gsa->sin.sin_addr = mreqs.imr_multiaddr; ssa->sin.sin_family = AF_INET; ssa->sin.sin_len = sizeof(struct sockaddr_in); ssa->sin.sin_addr = mreqs.imr_sourceaddr; if (!in_nullhost(mreqs.imr_interface)) { NET_EPOCH_ENTER(et); INADDR_TO_IFP(mreqs.imr_interface, ifp); /* XXXGL: ifref? */ NET_EPOCH_EXIT(et); } if (sopt->sopt_name == IP_BLOCK_SOURCE) doblock = 1; CTR3(KTR_IGMPV3, "%s: imr_interface = 0x%08x, ifp = %p", __func__, ntohl(mreqs.imr_interface.s_addr), ifp); break; } case MCAST_BLOCK_SOURCE: case MCAST_UNBLOCK_SOURCE: error = sooptcopyin(sopt, &gsr, sizeof(struct group_source_req), sizeof(struct group_source_req)); if (error) return (error); if (gsa->sin.sin_family != AF_INET || gsa->sin.sin_len != sizeof(struct sockaddr_in)) return (EINVAL); if (ssa->sin.sin_family != AF_INET || ssa->sin.sin_len != sizeof(struct sockaddr_in)) return (EINVAL); NET_EPOCH_ENTER(et); ifp = ifnet_byindex(gsr.gsr_interface); NET_EPOCH_EXIT(et); if (ifp == NULL) return (EADDRNOTAVAIL); if (sopt->sopt_name == MCAST_BLOCK_SOURCE) doblock = 1; break; default: CTR2(KTR_IGMPV3, "%s: unknown sopt_name %d", __func__, sopt->sopt_name); return (EOPNOTSUPP); break; } if (!IN_MULTICAST(ntohl(gsa->sin.sin_addr.s_addr))) return (EINVAL); IN_MULTI_LOCK(); /* * Check if we are actually a member of this group. */ imo = inp_findmoptions(inp); imf = imo_match_group(imo, ifp, &gsa->sa); if (imf == NULL) { error = EADDRNOTAVAIL; goto out_inp_locked; } inm = imf->imf_inm; /* * Attempting to use the delta-based API on an * non exclusive-mode membership is an error. */ fmode = imf->imf_st[0]; if (fmode != MCAST_EXCLUDE) { error = EINVAL; goto out_inp_locked; } /* * Deal with error cases up-front: * Asked to block, but already blocked; or * Asked to unblock, but nothing to unblock. * If adding a new block entry, allocate it. */ ims = imo_match_source(imf, &ssa->sa); if ((ims != NULL && doblock) || (ims == NULL && !doblock)) { CTR3(KTR_IGMPV3, "%s: source 0x%08x %spresent", __func__, ntohl(ssa->sin.sin_addr.s_addr), doblock ? "" : "not "); error = EADDRNOTAVAIL; goto out_inp_locked; } INP_WLOCK_ASSERT(inp); /* * Begin state merge transaction at socket layer. */ if (doblock) { CTR2(KTR_IGMPV3, "%s: %s source", __func__, "block"); ims = imf_graft(imf, fmode, &ssa->sin); if (ims == NULL) error = ENOMEM; } else { CTR2(KTR_IGMPV3, "%s: %s source", __func__, "allow"); error = imf_prune(imf, &ssa->sin); } if (error) { CTR1(KTR_IGMPV3, "%s: merge imf state failed", __func__); goto out_imf_rollback; } /* * Begin state merge transaction at IGMP layer. */ CTR1(KTR_IGMPV3, "%s: merge inm state", __func__); IN_MULTI_LIST_LOCK(); error = inm_merge(inm, imf); if (error) { CTR1(KTR_IGMPV3, "%s: failed to merge inm state", __func__); IN_MULTI_LIST_UNLOCK(); goto out_imf_rollback; } CTR1(KTR_IGMPV3, "%s: doing igmp downcall", __func__); error = igmp_change_state(inm); IN_MULTI_LIST_UNLOCK(); if (error) CTR1(KTR_IGMPV3, "%s: failed igmp downcall", __func__); out_imf_rollback: if (error) imf_rollback(imf); else imf_commit(imf); imf_reap(imf); out_inp_locked: INP_WUNLOCK(inp); IN_MULTI_UNLOCK(); return (error); } /* * Given an inpcb, return its multicast options structure pointer. Accepts * an unlocked inpcb pointer, but will return it locked. May sleep. * * SMPng: NOTE: Potentially calls malloc(M_WAITOK) with Giant held. * SMPng: NOTE: Returns with the INP write lock held. */ static struct ip_moptions * inp_findmoptions(struct inpcb *inp) { struct ip_moptions *imo; INP_WLOCK(inp); if (inp->inp_moptions != NULL) return (inp->inp_moptions); INP_WUNLOCK(inp); imo = malloc(sizeof(*imo), M_IPMOPTS, M_WAITOK); imo->imo_multicast_ifp = NULL; imo->imo_multicast_addr.s_addr = INADDR_ANY; imo->imo_multicast_vif = -1; imo->imo_multicast_ttl = IP_DEFAULT_MULTICAST_TTL; imo->imo_multicast_loop = in_mcast_loop; STAILQ_INIT(&imo->imo_head); INP_WLOCK(inp); if (inp->inp_moptions != NULL) { free(imo, M_IPMOPTS); return (inp->inp_moptions); } inp->inp_moptions = imo; return (imo); } void inp_freemoptions(struct ip_moptions *imo) { struct in_mfilter *imf; struct in_multi *inm; struct ifnet *ifp; if (imo == NULL) return; while ((imf = ip_mfilter_first(&imo->imo_head)) != NULL) { ip_mfilter_remove(&imo->imo_head, imf); imf_leave(imf); if ((inm = imf->imf_inm) != NULL) { if ((ifp = inm->inm_ifp) != NULL) { CURVNET_SET(ifp->if_vnet); (void)in_leavegroup(inm, imf); CURVNET_RESTORE(); } else { (void)in_leavegroup(inm, imf); } } ip_mfilter_free(imf); } free(imo, M_IPMOPTS); } /* * Atomically get source filters on a socket for an IPv4 multicast group. * Called with INP lock held; returns with lock released. */ static int inp_get_source_filters(struct inpcb *inp, struct sockopt *sopt) { struct epoch_tracker et; struct __msfilterreq msfr; sockunion_t *gsa; struct ifnet *ifp; struct ip_moptions *imo; struct in_mfilter *imf; struct ip_msource *ims; struct in_msource *lims; struct sockaddr_in *psin; struct sockaddr_storage *ptss; struct sockaddr_storage *tss; int error; size_t nsrcs, ncsrcs; INP_WLOCK_ASSERT(inp); imo = inp->inp_moptions; KASSERT(imo != NULL, ("%s: null ip_moptions", __func__)); INP_WUNLOCK(inp); error = sooptcopyin(sopt, &msfr, sizeof(struct __msfilterreq), sizeof(struct __msfilterreq)); if (error) return (error); NET_EPOCH_ENTER(et); ifp = ifnet_byindex(msfr.msfr_ifindex); NET_EPOCH_EXIT(et); /* XXXGL: unsafe ifnet pointer left */ if (ifp == NULL) return (EINVAL); INP_WLOCK(inp); /* * Lookup group on the socket. */ gsa = (sockunion_t *)&msfr.msfr_group; imf = imo_match_group(imo, ifp, &gsa->sa); if (imf == NULL) { INP_WUNLOCK(inp); return (EADDRNOTAVAIL); } /* * Ignore memberships which are in limbo. */ if (imf->imf_st[1] == MCAST_UNDEFINED) { INP_WUNLOCK(inp); return (EAGAIN); } msfr.msfr_fmode = imf->imf_st[1]; /* * If the user specified a buffer, copy out the source filter * entries to userland gracefully. * We only copy out the number of entries which userland * has asked for, but we always tell userland how big the * buffer really needs to be. */ if (msfr.msfr_nsrcs > in_mcast_maxsocksrc) msfr.msfr_nsrcs = in_mcast_maxsocksrc; tss = NULL; if (msfr.msfr_srcs != NULL && msfr.msfr_nsrcs > 0) { tss = malloc(sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs, M_TEMP, M_NOWAIT | M_ZERO); if (tss == NULL) { INP_WUNLOCK(inp); return (ENOBUFS); } } /* * Count number of sources in-mode at t0. * If buffer space exists and remains, copy out source entries. */ nsrcs = msfr.msfr_nsrcs; ncsrcs = 0; ptss = tss; RB_FOREACH(ims, ip_msource_tree, &imf->imf_sources) { lims = (struct in_msource *)ims; if (lims->imsl_st[0] == MCAST_UNDEFINED || lims->imsl_st[0] != imf->imf_st[0]) continue; ++ncsrcs; if (tss != NULL && nsrcs > 0) { psin = (struct sockaddr_in *)ptss; psin->sin_family = AF_INET; psin->sin_len = sizeof(struct sockaddr_in); psin->sin_addr.s_addr = htonl(lims->ims_haddr); psin->sin_port = 0; ++ptss; --nsrcs; } } INP_WUNLOCK(inp); if (tss != NULL) { error = copyout(tss, msfr.msfr_srcs, sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs); free(tss, M_TEMP); if (error) return (error); } msfr.msfr_nsrcs = ncsrcs; error = sooptcopyout(sopt, &msfr, sizeof(struct __msfilterreq)); return (error); } /* * Return the IP multicast options in response to user getsockopt(). */ int inp_getmoptions(struct inpcb *inp, struct sockopt *sopt) { struct ip_mreqn mreqn; struct ip_moptions *imo; struct ifnet *ifp; struct in_ifaddr *ia; int error, optval; u_char coptval; INP_WLOCK(inp); imo = inp->inp_moptions; - /* - * If socket is neither of type SOCK_RAW or SOCK_DGRAM, - * or is a divert socket, reject it. - */ - if (inp->inp_socket->so_proto->pr_protocol == IPPROTO_DIVERT || - (inp->inp_socket->so_proto->pr_type != SOCK_RAW && - inp->inp_socket->so_proto->pr_type != SOCK_DGRAM)) { + /* If socket is neither of type SOCK_RAW or SOCK_DGRAM reject it. */ + if (inp->inp_socket->so_proto->pr_type != SOCK_RAW && + inp->inp_socket->so_proto->pr_type != SOCK_DGRAM) { INP_WUNLOCK(inp); return (EOPNOTSUPP); } error = 0; switch (sopt->sopt_name) { case IP_MULTICAST_VIF: if (imo != NULL) optval = imo->imo_multicast_vif; else optval = -1; INP_WUNLOCK(inp); error = sooptcopyout(sopt, &optval, sizeof(int)); break; case IP_MULTICAST_IF: memset(&mreqn, 0, sizeof(struct ip_mreqn)); if (imo != NULL) { ifp = imo->imo_multicast_ifp; if (!in_nullhost(imo->imo_multicast_addr)) { mreqn.imr_address = imo->imo_multicast_addr; } else if (ifp != NULL) { struct epoch_tracker et; mreqn.imr_ifindex = ifp->if_index; NET_EPOCH_ENTER(et); IFP_TO_IA(ifp, ia); if (ia != NULL) mreqn.imr_address = IA_SIN(ia)->sin_addr; NET_EPOCH_EXIT(et); } } INP_WUNLOCK(inp); if (sopt->sopt_valsize == sizeof(struct ip_mreqn)) { error = sooptcopyout(sopt, &mreqn, sizeof(struct ip_mreqn)); } else { error = sooptcopyout(sopt, &mreqn.imr_address, sizeof(struct in_addr)); } break; case IP_MULTICAST_TTL: if (imo == NULL) optval = coptval = IP_DEFAULT_MULTICAST_TTL; else optval = coptval = imo->imo_multicast_ttl; INP_WUNLOCK(inp); if (sopt->sopt_valsize == sizeof(u_char)) error = sooptcopyout(sopt, &coptval, sizeof(u_char)); else error = sooptcopyout(sopt, &optval, sizeof(int)); break; case IP_MULTICAST_LOOP: if (imo == NULL) optval = coptval = IP_DEFAULT_MULTICAST_LOOP; else optval = coptval = imo->imo_multicast_loop; INP_WUNLOCK(inp); if (sopt->sopt_valsize == sizeof(u_char)) error = sooptcopyout(sopt, &coptval, sizeof(u_char)); else error = sooptcopyout(sopt, &optval, sizeof(int)); break; case IP_MSFILTER: if (imo == NULL) { error = EADDRNOTAVAIL; INP_WUNLOCK(inp); } else { error = inp_get_source_filters(inp, sopt); } break; default: INP_WUNLOCK(inp); error = ENOPROTOOPT; break; } INP_UNLOCK_ASSERT(inp); return (error); } /* * Look up the ifnet to use for a multicast group membership, * given the IPv4 address of an interface, and the IPv4 group address. * * This routine exists to support legacy multicast applications * which do not understand that multicast memberships are scoped to * specific physical links in the networking stack, or which need * to join link-scope groups before IPv4 addresses are configured. * * Use this socket's current FIB number for any required FIB lookup. * If ina is INADDR_ANY, look up the group address in the unicast FIB, * and use its ifp; usually, this points to the default next-hop. * * If the FIB lookup fails, attempt to use the first non-loopback * interface with multicast capability in the system as a * last resort. The legacy IPv4 ASM API requires that we do * this in order to allow groups to be joined when the routing * table has not yet been populated during boot. * * Returns NULL if no ifp could be found, otherwise return referenced ifp. * * FUTURE: Implement IPv4 source-address selection. */ static struct ifnet * inp_lookup_mcast_ifp(const struct inpcb *inp, const struct sockaddr_in *gsin, const struct in_addr ina) { struct ifnet *ifp; struct nhop_object *nh; NET_EPOCH_ASSERT(); KASSERT(inp != NULL, ("%s: inp must not be NULL", __func__)); KASSERT(gsin->sin_family == AF_INET, ("%s: not AF_INET", __func__)); KASSERT(IN_MULTICAST(ntohl(gsin->sin_addr.s_addr)), ("%s: not multicast", __func__)); ifp = NULL; if (!in_nullhost(ina)) { INADDR_TO_IFP(ina, ifp); if (ifp != NULL) if_ref(ifp); } else { nh = fib4_lookup(inp->inp_inc.inc_fibnum, gsin->sin_addr, 0, NHR_NONE, 0); if (nh != NULL) { ifp = nh->nh_ifp; if_ref(ifp); } else { struct in_ifaddr *ia; struct ifnet *mifp; mifp = NULL; CK_STAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { mifp = ia->ia_ifp; if (!(mifp->if_flags & IFF_LOOPBACK) && (mifp->if_flags & IFF_MULTICAST)) { ifp = mifp; if_ref(ifp); break; } } } } return (ifp); } /* * Join an IPv4 multicast group, possibly with a source. */ static int inp_join_group(struct inpcb *inp, struct sockopt *sopt) { struct group_source_req gsr; sockunion_t *gsa, *ssa; struct ifnet *ifp; struct in_mfilter *imf; struct ip_moptions *imo; struct in_multi *inm; struct in_msource *lims; struct epoch_tracker et; int error, is_new; ifp = NULL; lims = NULL; error = 0; memset(&gsr, 0, sizeof(struct group_source_req)); gsa = (sockunion_t *)&gsr.gsr_group; gsa->ss.ss_family = AF_UNSPEC; ssa = (sockunion_t *)&gsr.gsr_source; ssa->ss.ss_family = AF_UNSPEC; switch (sopt->sopt_name) { case IP_ADD_MEMBERSHIP: { struct ip_mreqn mreqn; if (sopt->sopt_valsize == sizeof(struct ip_mreqn)) error = sooptcopyin(sopt, &mreqn, sizeof(struct ip_mreqn), sizeof(struct ip_mreqn)); else error = sooptcopyin(sopt, &mreqn, sizeof(struct ip_mreq), sizeof(struct ip_mreq)); if (error) return (error); gsa->sin.sin_family = AF_INET; gsa->sin.sin_len = sizeof(struct sockaddr_in); gsa->sin.sin_addr = mreqn.imr_multiaddr; if (!IN_MULTICAST(ntohl(gsa->sin.sin_addr.s_addr))) return (EINVAL); NET_EPOCH_ENTER(et); if (sopt->sopt_valsize == sizeof(struct ip_mreqn) && mreqn.imr_ifindex != 0) ifp = ifnet_byindex_ref(mreqn.imr_ifindex); else ifp = inp_lookup_mcast_ifp(inp, &gsa->sin, mreqn.imr_address); NET_EPOCH_EXIT(et); break; } case IP_ADD_SOURCE_MEMBERSHIP: { struct ip_mreq_source mreqs; error = sooptcopyin(sopt, &mreqs, sizeof(struct ip_mreq_source), sizeof(struct ip_mreq_source)); if (error) return (error); gsa->sin.sin_family = ssa->sin.sin_family = AF_INET; gsa->sin.sin_len = ssa->sin.sin_len = sizeof(struct sockaddr_in); gsa->sin.sin_addr = mreqs.imr_multiaddr; if (!IN_MULTICAST(ntohl(gsa->sin.sin_addr.s_addr))) return (EINVAL); ssa->sin.sin_addr = mreqs.imr_sourceaddr; NET_EPOCH_ENTER(et); ifp = inp_lookup_mcast_ifp(inp, &gsa->sin, mreqs.imr_interface); NET_EPOCH_EXIT(et); CTR3(KTR_IGMPV3, "%s: imr_interface = 0x%08x, ifp = %p", __func__, ntohl(mreqs.imr_interface.s_addr), ifp); break; } case MCAST_JOIN_GROUP: case MCAST_JOIN_SOURCE_GROUP: if (sopt->sopt_name == MCAST_JOIN_GROUP) { error = sooptcopyin(sopt, &gsr, sizeof(struct group_req), sizeof(struct group_req)); } else if (sopt->sopt_name == MCAST_JOIN_SOURCE_GROUP) { error = sooptcopyin(sopt, &gsr, sizeof(struct group_source_req), sizeof(struct group_source_req)); } if (error) return (error); if (gsa->sin.sin_family != AF_INET || gsa->sin.sin_len != sizeof(struct sockaddr_in)) return (EINVAL); /* * Overwrite the port field if present, as the sockaddr * being copied in may be matched with a binary comparison. */ gsa->sin.sin_port = 0; if (sopt->sopt_name == MCAST_JOIN_SOURCE_GROUP) { if (ssa->sin.sin_family != AF_INET || ssa->sin.sin_len != sizeof(struct sockaddr_in)) return (EINVAL); ssa->sin.sin_port = 0; } if (!IN_MULTICAST(ntohl(gsa->sin.sin_addr.s_addr))) return (EINVAL); NET_EPOCH_ENTER(et); ifp = ifnet_byindex_ref(gsr.gsr_interface); NET_EPOCH_EXIT(et); if (ifp == NULL) return (EADDRNOTAVAIL); break; default: CTR2(KTR_IGMPV3, "%s: unknown sopt_name %d", __func__, sopt->sopt_name); return (EOPNOTSUPP); break; } if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) { if (ifp != NULL) if_rele(ifp); return (EADDRNOTAVAIL); } IN_MULTI_LOCK(); /* * Find the membership in the membership list. */ imo = inp_findmoptions(inp); imf = imo_match_group(imo, ifp, &gsa->sa); if (imf == NULL) { is_new = 1; inm = NULL; if (ip_mfilter_count(&imo->imo_head) >= IP_MAX_MEMBERSHIPS) { error = ENOMEM; goto out_inp_locked; } } else { is_new = 0; inm = imf->imf_inm; if (ssa->ss.ss_family != AF_UNSPEC) { /* * MCAST_JOIN_SOURCE_GROUP on an exclusive membership * is an error. On an existing inclusive membership, * it just adds the source to the filter list. */ if (imf->imf_st[1] != MCAST_INCLUDE) { error = EINVAL; goto out_inp_locked; } /* * Throw out duplicates. * * XXX FIXME: This makes a naive assumption that * even if entries exist for *ssa in this imf, * they will be rejected as dupes, even if they * are not valid in the current mode (in-mode). * * in_msource is transactioned just as for anything * else in SSM -- but note naive use of inm_graft() * below for allocating new filter entries. * * This is only an issue if someone mixes the * full-state SSM API with the delta-based API, * which is discouraged in the relevant RFCs. */ lims = imo_match_source(imf, &ssa->sa); if (lims != NULL /*&& lims->imsl_st[1] == MCAST_INCLUDE*/) { error = EADDRNOTAVAIL; goto out_inp_locked; } } else { /* * MCAST_JOIN_GROUP on an existing exclusive * membership is an error; return EADDRINUSE * to preserve 4.4BSD API idempotence, and * avoid tedious detour to code below. * NOTE: This is bending RFC 3678 a bit. * * On an existing inclusive membership, this is also * an error; if you want to change filter mode, * you must use the userland API setsourcefilter(). * XXX We don't reject this for imf in UNDEFINED * state at t1, because allocation of a filter * is atomic with allocation of a membership. */ error = EINVAL; if (imf->imf_st[1] == MCAST_EXCLUDE) error = EADDRINUSE; goto out_inp_locked; } } /* * Begin state merge transaction at socket layer. */ INP_WLOCK_ASSERT(inp); /* * Graft new source into filter list for this inpcb's * membership of the group. The in_multi may not have * been allocated yet if this is a new membership, however, * the in_mfilter slot will be allocated and must be initialized. * * Note: Grafting of exclusive mode filters doesn't happen * in this path. * XXX: Should check for non-NULL lims (node exists but may * not be in-mode) for interop with full-state API. */ if (ssa->ss.ss_family != AF_UNSPEC) { /* Membership starts in IN mode */ if (is_new) { CTR1(KTR_IGMPV3, "%s: new join w/source", __func__); imf = ip_mfilter_alloc(M_NOWAIT, MCAST_UNDEFINED, MCAST_INCLUDE); if (imf == NULL) { error = ENOMEM; goto out_inp_locked; } } else { CTR2(KTR_IGMPV3, "%s: %s source", __func__, "allow"); } lims = imf_graft(imf, MCAST_INCLUDE, &ssa->sin); if (lims == NULL) { CTR1(KTR_IGMPV3, "%s: merge imf state failed", __func__); error = ENOMEM; goto out_inp_locked; } } else { /* No address specified; Membership starts in EX mode */ if (is_new) { CTR1(KTR_IGMPV3, "%s: new join w/o source", __func__); imf = ip_mfilter_alloc(M_NOWAIT, MCAST_UNDEFINED, MCAST_EXCLUDE); if (imf == NULL) { error = ENOMEM; goto out_inp_locked; } } } /* * Begin state merge transaction at IGMP layer. */ if (is_new) { in_pcbref(inp); INP_WUNLOCK(inp); error = in_joingroup_locked(ifp, &gsa->sin.sin_addr, imf, &imf->imf_inm); INP_WLOCK(inp); if (in_pcbrele_wlocked(inp)) { error = ENXIO; goto out_inp_unlocked; } if (error) { CTR1(KTR_IGMPV3, "%s: in_joingroup_locked failed", __func__); goto out_inp_locked; } /* * NOTE: Refcount from in_joingroup_locked() * is protecting membership. */ ip_mfilter_insert(&imo->imo_head, imf); } else { CTR1(KTR_IGMPV3, "%s: merge inm state", __func__); IN_MULTI_LIST_LOCK(); error = inm_merge(inm, imf); if (error) { CTR1(KTR_IGMPV3, "%s: failed to merge inm state", __func__); IN_MULTI_LIST_UNLOCK(); imf_rollback(imf); imf_reap(imf); goto out_inp_locked; } CTR1(KTR_IGMPV3, "%s: doing igmp downcall", __func__); error = igmp_change_state(inm); IN_MULTI_LIST_UNLOCK(); if (error) { CTR1(KTR_IGMPV3, "%s: failed igmp downcall", __func__); imf_rollback(imf); imf_reap(imf); goto out_inp_locked; } } imf_commit(imf); imf = NULL; out_inp_locked: INP_WUNLOCK(inp); out_inp_unlocked: IN_MULTI_UNLOCK(); if (is_new && imf) { if (imf->imf_inm != NULL) { IN_MULTI_LIST_LOCK(); IF_ADDR_WLOCK(ifp); inm_release_deferred(imf->imf_inm); IF_ADDR_WUNLOCK(ifp); IN_MULTI_LIST_UNLOCK(); } ip_mfilter_free(imf); } if_rele(ifp); return (error); } /* * Leave an IPv4 multicast group on an inpcb, possibly with a source. */ static int inp_leave_group(struct inpcb *inp, struct sockopt *sopt) { struct epoch_tracker et; struct group_source_req gsr; struct ip_mreq_source mreqs; sockunion_t *gsa, *ssa; struct ifnet *ifp; struct in_mfilter *imf; struct ip_moptions *imo; struct in_msource *ims; struct in_multi *inm; int error; bool is_final; ifp = NULL; error = 0; is_final = true; memset(&gsr, 0, sizeof(struct group_source_req)); gsa = (sockunion_t *)&gsr.gsr_group; gsa->ss.ss_family = AF_UNSPEC; ssa = (sockunion_t *)&gsr.gsr_source; ssa->ss.ss_family = AF_UNSPEC; switch (sopt->sopt_name) { case IP_DROP_MEMBERSHIP: case IP_DROP_SOURCE_MEMBERSHIP: if (sopt->sopt_name == IP_DROP_MEMBERSHIP) { error = sooptcopyin(sopt, &mreqs, sizeof(struct ip_mreq), sizeof(struct ip_mreq)); /* * Swap interface and sourceaddr arguments, * as ip_mreq and ip_mreq_source are laid * out differently. */ mreqs.imr_interface = mreqs.imr_sourceaddr; mreqs.imr_sourceaddr.s_addr = INADDR_ANY; } else if (sopt->sopt_name == IP_DROP_SOURCE_MEMBERSHIP) { error = sooptcopyin(sopt, &mreqs, sizeof(struct ip_mreq_source), sizeof(struct ip_mreq_source)); } if (error) return (error); gsa->sin.sin_family = AF_INET; gsa->sin.sin_len = sizeof(struct sockaddr_in); gsa->sin.sin_addr = mreqs.imr_multiaddr; if (sopt->sopt_name == IP_DROP_SOURCE_MEMBERSHIP) { ssa->sin.sin_family = AF_INET; ssa->sin.sin_len = sizeof(struct sockaddr_in); ssa->sin.sin_addr = mreqs.imr_sourceaddr; } /* * Attempt to look up hinted ifp from interface address. * Fallthrough with null ifp iff lookup fails, to * preserve 4.4BSD mcast API idempotence. * XXX NOTE WELL: The RFC 3678 API is preferred because * using an IPv4 address as a key is racy. */ if (!in_nullhost(mreqs.imr_interface)) { NET_EPOCH_ENTER(et); INADDR_TO_IFP(mreqs.imr_interface, ifp); /* XXXGL ifref? */ NET_EPOCH_EXIT(et); } CTR3(KTR_IGMPV3, "%s: imr_interface = 0x%08x, ifp = %p", __func__, ntohl(mreqs.imr_interface.s_addr), ifp); break; case MCAST_LEAVE_GROUP: case MCAST_LEAVE_SOURCE_GROUP: if (sopt->sopt_name == MCAST_LEAVE_GROUP) { error = sooptcopyin(sopt, &gsr, sizeof(struct group_req), sizeof(struct group_req)); } else if (sopt->sopt_name == MCAST_LEAVE_SOURCE_GROUP) { error = sooptcopyin(sopt, &gsr, sizeof(struct group_source_req), sizeof(struct group_source_req)); } if (error) return (error); if (gsa->sin.sin_family != AF_INET || gsa->sin.sin_len != sizeof(struct sockaddr_in)) return (EINVAL); if (sopt->sopt_name == MCAST_LEAVE_SOURCE_GROUP) { if (ssa->sin.sin_family != AF_INET || ssa->sin.sin_len != sizeof(struct sockaddr_in)) return (EINVAL); } NET_EPOCH_ENTER(et); ifp = ifnet_byindex(gsr.gsr_interface); NET_EPOCH_EXIT(et); /* XXXGL: unsafe ifp */ if (ifp == NULL) return (EADDRNOTAVAIL); break; default: CTR2(KTR_IGMPV3, "%s: unknown sopt_name %d", __func__, sopt->sopt_name); return (EOPNOTSUPP); break; } if (!IN_MULTICAST(ntohl(gsa->sin.sin_addr.s_addr))) return (EINVAL); IN_MULTI_LOCK(); /* * Find the membership in the membership list. */ imo = inp_findmoptions(inp); imf = imo_match_group(imo, ifp, &gsa->sa); if (imf == NULL) { error = EADDRNOTAVAIL; goto out_inp_locked; } inm = imf->imf_inm; if (ssa->ss.ss_family != AF_UNSPEC) is_final = false; /* * Begin state merge transaction at socket layer. */ INP_WLOCK_ASSERT(inp); /* * If we were instructed only to leave a given source, do so. * MCAST_LEAVE_SOURCE_GROUP is only valid for inclusive memberships. */ if (is_final) { ip_mfilter_remove(&imo->imo_head, imf); imf_leave(imf); /* * Give up the multicast address record to which * the membership points. */ (void) in_leavegroup_locked(imf->imf_inm, imf); } else { if (imf->imf_st[0] == MCAST_EXCLUDE) { error = EADDRNOTAVAIL; goto out_inp_locked; } ims = imo_match_source(imf, &ssa->sa); if (ims == NULL) { CTR3(KTR_IGMPV3, "%s: source 0x%08x %spresent", __func__, ntohl(ssa->sin.sin_addr.s_addr), "not "); error = EADDRNOTAVAIL; goto out_inp_locked; } CTR2(KTR_IGMPV3, "%s: %s source", __func__, "block"); error = imf_prune(imf, &ssa->sin); if (error) { CTR1(KTR_IGMPV3, "%s: merge imf state failed", __func__); goto out_inp_locked; } } /* * Begin state merge transaction at IGMP layer. */ if (!is_final) { CTR1(KTR_IGMPV3, "%s: merge inm state", __func__); IN_MULTI_LIST_LOCK(); error = inm_merge(inm, imf); if (error) { CTR1(KTR_IGMPV3, "%s: failed to merge inm state", __func__); IN_MULTI_LIST_UNLOCK(); imf_rollback(imf); imf_reap(imf); goto out_inp_locked; } CTR1(KTR_IGMPV3, "%s: doing igmp downcall", __func__); error = igmp_change_state(inm); IN_MULTI_LIST_UNLOCK(); if (error) { CTR1(KTR_IGMPV3, "%s: failed igmp downcall", __func__); imf_rollback(imf); imf_reap(imf); goto out_inp_locked; } } imf_commit(imf); imf_reap(imf); out_inp_locked: INP_WUNLOCK(inp); if (is_final && imf) ip_mfilter_free(imf); IN_MULTI_UNLOCK(); return (error); } /* * Select the interface for transmitting IPv4 multicast datagrams. * * Either an instance of struct in_addr or an instance of struct ip_mreqn * may be passed to this socket option. An address of INADDR_ANY or an * interface index of 0 is used to remove a previous selection. * When no interface is selected, one is chosen for every send. */ static int inp_set_multicast_if(struct inpcb *inp, struct sockopt *sopt) { struct in_addr addr; struct ip_mreqn mreqn; struct ifnet *ifp; struct ip_moptions *imo; int error; if (sopt->sopt_valsize == sizeof(struct ip_mreqn)) { /* * An interface index was specified using the * Linux-derived ip_mreqn structure. */ error = sooptcopyin(sopt, &mreqn, sizeof(struct ip_mreqn), sizeof(struct ip_mreqn)); if (error) return (error); if (mreqn.imr_ifindex < 0) return (EINVAL); if (mreqn.imr_ifindex == 0) { ifp = NULL; } else { struct epoch_tracker et; NET_EPOCH_ENTER(et); ifp = ifnet_byindex(mreqn.imr_ifindex); NET_EPOCH_EXIT(et); /* XXXGL: unsafe ifp */ if (ifp == NULL) return (EADDRNOTAVAIL); } } else { /* * An interface was specified by IPv4 address. * This is the traditional BSD usage. */ error = sooptcopyin(sopt, &addr, sizeof(struct in_addr), sizeof(struct in_addr)); if (error) return (error); if (in_nullhost(addr)) { ifp = NULL; } else { struct epoch_tracker et; NET_EPOCH_ENTER(et); INADDR_TO_IFP(addr, ifp); /* XXXGL ifref? */ NET_EPOCH_EXIT(et); if (ifp == NULL) return (EADDRNOTAVAIL); } CTR3(KTR_IGMPV3, "%s: ifp = %p, addr = 0x%08x", __func__, ifp, ntohl(addr.s_addr)); } /* Reject interfaces which do not support multicast. */ if (ifp != NULL && (ifp->if_flags & IFF_MULTICAST) == 0) return (EOPNOTSUPP); imo = inp_findmoptions(inp); imo->imo_multicast_ifp = ifp; imo->imo_multicast_addr.s_addr = INADDR_ANY; INP_WUNLOCK(inp); return (0); } /* * Atomically set source filters on a socket for an IPv4 multicast group. * * SMPng: NOTE: Potentially calls malloc(M_WAITOK) with Giant held. */ static int inp_set_source_filters(struct inpcb *inp, struct sockopt *sopt) { struct epoch_tracker et; struct __msfilterreq msfr; sockunion_t *gsa; struct ifnet *ifp; struct in_mfilter *imf; struct ip_moptions *imo; struct in_multi *inm; int error; error = sooptcopyin(sopt, &msfr, sizeof(struct __msfilterreq), sizeof(struct __msfilterreq)); if (error) return (error); if (msfr.msfr_nsrcs > in_mcast_maxsocksrc) return (ENOBUFS); if ((msfr.msfr_fmode != MCAST_EXCLUDE && msfr.msfr_fmode != MCAST_INCLUDE)) return (EINVAL); if (msfr.msfr_group.ss_family != AF_INET || msfr.msfr_group.ss_len != sizeof(struct sockaddr_in)) return (EINVAL); gsa = (sockunion_t *)&msfr.msfr_group; if (!IN_MULTICAST(ntohl(gsa->sin.sin_addr.s_addr))) return (EINVAL); gsa->sin.sin_port = 0; /* ignore port */ NET_EPOCH_ENTER(et); ifp = ifnet_byindex(msfr.msfr_ifindex); NET_EPOCH_EXIT(et); /* XXXGL: unsafe ifp */ if (ifp == NULL) return (EADDRNOTAVAIL); IN_MULTI_LOCK(); /* * Take the INP write lock. * Check if this socket is a member of this group. */ imo = inp_findmoptions(inp); imf = imo_match_group(imo, ifp, &gsa->sa); if (imf == NULL) { error = EADDRNOTAVAIL; goto out_inp_locked; } inm = imf->imf_inm; /* * Begin state merge transaction at socket layer. */ INP_WLOCK_ASSERT(inp); imf->imf_st[1] = msfr.msfr_fmode; /* * Apply any new source filters, if present. * Make a copy of the user-space source vector so * that we may copy them with a single copyin. This * allows us to deal with page faults up-front. */ if (msfr.msfr_nsrcs > 0) { struct in_msource *lims; struct sockaddr_in *psin; struct sockaddr_storage *kss, *pkss; int i; INP_WUNLOCK(inp); CTR2(KTR_IGMPV3, "%s: loading %lu source list entries", __func__, (unsigned long)msfr.msfr_nsrcs); kss = malloc(sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs, M_TEMP, M_WAITOK); error = copyin(msfr.msfr_srcs, kss, sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs); if (error) { free(kss, M_TEMP); return (error); } INP_WLOCK(inp); /* * Mark all source filters as UNDEFINED at t1. * Restore new group filter mode, as imf_leave() * will set it to INCLUDE. */ imf_leave(imf); imf->imf_st[1] = msfr.msfr_fmode; /* * Update socket layer filters at t1, lazy-allocating * new entries. This saves a bunch of memory at the * cost of one RB_FIND() per source entry; duplicate * entries in the msfr_nsrcs vector are ignored. * If we encounter an error, rollback transaction. * * XXX This too could be replaced with a set-symmetric * difference like loop to avoid walking from root * every time, as the key space is common. */ for (i = 0, pkss = kss; i < msfr.msfr_nsrcs; i++, pkss++) { psin = (struct sockaddr_in *)pkss; if (psin->sin_family != AF_INET) { error = EAFNOSUPPORT; break; } if (psin->sin_len != sizeof(struct sockaddr_in)) { error = EINVAL; break; } error = imf_get_source(imf, psin, &lims); if (error) break; lims->imsl_st[1] = imf->imf_st[1]; } free(kss, M_TEMP); } if (error) goto out_imf_rollback; INP_WLOCK_ASSERT(inp); /* * Begin state merge transaction at IGMP layer. */ CTR1(KTR_IGMPV3, "%s: merge inm state", __func__); IN_MULTI_LIST_LOCK(); error = inm_merge(inm, imf); if (error) { CTR1(KTR_IGMPV3, "%s: failed to merge inm state", __func__); IN_MULTI_LIST_UNLOCK(); goto out_imf_rollback; } CTR1(KTR_IGMPV3, "%s: doing igmp downcall", __func__); error = igmp_change_state(inm); IN_MULTI_LIST_UNLOCK(); if (error) CTR1(KTR_IGMPV3, "%s: failed igmp downcall", __func__); out_imf_rollback: if (error) imf_rollback(imf); else imf_commit(imf); imf_reap(imf); out_inp_locked: INP_WUNLOCK(inp); IN_MULTI_UNLOCK(); return (error); } /* * Set the IP multicast options in response to user setsockopt(). * * Many of the socket options handled in this function duplicate the * functionality of socket options in the regular unicast API. However, * it is not possible to merge the duplicate code, because the idempotence * of the IPv4 multicast part of the BSD Sockets API must be preserved; * the effects of these options must be treated as separate and distinct. * * SMPng: XXX: Unlocked read of inp_socket believed OK. * FUTURE: The IP_MULTICAST_VIF option may be eliminated if MROUTING * is refactored to no longer use vifs. */ int inp_setmoptions(struct inpcb *inp, struct sockopt *sopt) { struct ip_moptions *imo; int error; error = 0; - /* - * If socket is neither of type SOCK_RAW or SOCK_DGRAM, - * or is a divert socket, reject it. - */ - if (inp->inp_socket->so_proto->pr_protocol == IPPROTO_DIVERT || - (inp->inp_socket->so_proto->pr_type != SOCK_RAW && - inp->inp_socket->so_proto->pr_type != SOCK_DGRAM)) + /* If socket is neither of type SOCK_RAW or SOCK_DGRAM, reject it. */ + if (inp->inp_socket->so_proto->pr_type != SOCK_RAW && + inp->inp_socket->so_proto->pr_type != SOCK_DGRAM) return (EOPNOTSUPP); switch (sopt->sopt_name) { case IP_MULTICAST_VIF: { int vifi; /* * Select a multicast VIF for transmission. * Only useful if multicast forwarding is active. */ if (legal_vif_num == NULL) { error = EOPNOTSUPP; break; } error = sooptcopyin(sopt, &vifi, sizeof(int), sizeof(int)); if (error) break; if (!legal_vif_num(vifi) && (vifi != -1)) { error = EINVAL; break; } imo = inp_findmoptions(inp); imo->imo_multicast_vif = vifi; INP_WUNLOCK(inp); break; } case IP_MULTICAST_IF: error = inp_set_multicast_if(inp, sopt); break; case IP_MULTICAST_TTL: { u_char ttl; /* * Set the IP time-to-live for outgoing multicast packets. * The original multicast API required a char argument, * which is inconsistent with the rest of the socket API. * We allow either a char or an int. */ if (sopt->sopt_valsize == sizeof(u_char)) { error = sooptcopyin(sopt, &ttl, sizeof(u_char), sizeof(u_char)); if (error) break; } else { u_int ittl; error = sooptcopyin(sopt, &ittl, sizeof(u_int), sizeof(u_int)); if (error) break; if (ittl > 255) { error = EINVAL; break; } ttl = (u_char)ittl; } imo = inp_findmoptions(inp); imo->imo_multicast_ttl = ttl; INP_WUNLOCK(inp); break; } case IP_MULTICAST_LOOP: { u_char loop; /* * Set the loopback flag for outgoing multicast packets. * Must be zero or one. The original multicast API required a * char argument, which is inconsistent with the rest * of the socket API. We allow either a char or an int. */ if (sopt->sopt_valsize == sizeof(u_char)) { error = sooptcopyin(sopt, &loop, sizeof(u_char), sizeof(u_char)); if (error) break; } else { u_int iloop; error = sooptcopyin(sopt, &iloop, sizeof(u_int), sizeof(u_int)); if (error) break; loop = (u_char)iloop; } imo = inp_findmoptions(inp); imo->imo_multicast_loop = !!loop; INP_WUNLOCK(inp); break; } case IP_ADD_MEMBERSHIP: case IP_ADD_SOURCE_MEMBERSHIP: case MCAST_JOIN_GROUP: case MCAST_JOIN_SOURCE_GROUP: error = inp_join_group(inp, sopt); break; case IP_DROP_MEMBERSHIP: case IP_DROP_SOURCE_MEMBERSHIP: case MCAST_LEAVE_GROUP: case MCAST_LEAVE_SOURCE_GROUP: error = inp_leave_group(inp, sopt); break; case IP_BLOCK_SOURCE: case IP_UNBLOCK_SOURCE: case MCAST_BLOCK_SOURCE: case MCAST_UNBLOCK_SOURCE: error = inp_block_unblock_source(inp, sopt); break; case IP_MSFILTER: error = inp_set_source_filters(inp, sopt); break; default: error = EOPNOTSUPP; break; } INP_UNLOCK_ASSERT(inp); return (error); } /* * Expose IGMP's multicast filter mode and source list(s) to userland, * keyed by (ifindex, group). * The filter mode is written out as a uint32_t, followed by * 0..n of struct in_addr. * For use by ifmcstat(8). * SMPng: NOTE: unlocked read of ifindex space. */ static int sysctl_ip_mcast_filters(SYSCTL_HANDLER_ARGS) { struct in_addr src, group; struct epoch_tracker et; struct ifnet *ifp; struct ifmultiaddr *ifma; struct in_multi *inm; struct ip_msource *ims; int *name; int retval; u_int namelen; uint32_t fmode, ifindex; name = (int *)arg1; namelen = arg2; if (req->newptr != NULL) return (EPERM); if (namelen != 2) return (EINVAL); group.s_addr = name[1]; if (!IN_MULTICAST(ntohl(group.s_addr))) { CTR2(KTR_IGMPV3, "%s: group 0x%08x is not multicast", __func__, ntohl(group.s_addr)); return (EINVAL); } ifindex = name[0]; NET_EPOCH_ENTER(et); ifp = ifnet_byindex(ifindex); if (ifp == NULL) { NET_EPOCH_EXIT(et); CTR2(KTR_IGMPV3, "%s: no ifp for ifindex %u", __func__, ifindex); return (ENOENT); } retval = sysctl_wire_old_buffer(req, sizeof(uint32_t) + (in_mcast_maxgrpsrc * sizeof(struct in_addr))); if (retval) { NET_EPOCH_EXIT(et); return (retval); } IN_MULTI_LIST_LOCK(); CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_INET || ifma->ifma_protospec == NULL) continue; inm = (struct in_multi *)ifma->ifma_protospec; if (!in_hosteq(inm->inm_addr, group)) continue; fmode = inm->inm_st[1].iss_fmode; retval = SYSCTL_OUT(req, &fmode, sizeof(uint32_t)); if (retval != 0) break; RB_FOREACH(ims, ip_msource_tree, &inm->inm_srcs) { CTR2(KTR_IGMPV3, "%s: visit node 0x%08x", __func__, ims->ims_haddr); /* * Only copy-out sources which are in-mode. */ if (fmode != ims_get_mode(inm, ims, 1)) { CTR1(KTR_IGMPV3, "%s: skip non-in-mode", __func__); continue; } src.s_addr = htonl(ims->ims_haddr); retval = SYSCTL_OUT(req, &src, sizeof(struct in_addr)); if (retval != 0) break; } } IN_MULTI_LIST_UNLOCK(); NET_EPOCH_EXIT(et); return (retval); } #if defined(KTR) && (KTR_COMPILE & KTR_IGMPV3) static const char *inm_modestrs[] = { [MCAST_UNDEFINED] = "un", [MCAST_INCLUDE] = "in", [MCAST_EXCLUDE] = "ex", }; _Static_assert(MCAST_UNDEFINED == 0 && MCAST_EXCLUDE + 1 == nitems(inm_modestrs), "inm_modestrs: no longer matches #defines"); static const char * inm_mode_str(const int mode) { if (mode >= MCAST_UNDEFINED && mode <= MCAST_EXCLUDE) return (inm_modestrs[mode]); return ("??"); } static const char *inm_statestrs[] = { [IGMP_NOT_MEMBER] = "not-member", [IGMP_SILENT_MEMBER] = "silent", [IGMP_REPORTING_MEMBER] = "reporting", [IGMP_IDLE_MEMBER] = "idle", [IGMP_LAZY_MEMBER] = "lazy", [IGMP_SLEEPING_MEMBER] = "sleeping", [IGMP_AWAKENING_MEMBER] = "awakening", [IGMP_G_QUERY_PENDING_MEMBER] = "query-pending", [IGMP_SG_QUERY_PENDING_MEMBER] = "sg-query-pending", [IGMP_LEAVING_MEMBER] = "leaving", }; _Static_assert(IGMP_NOT_MEMBER == 0 && IGMP_LEAVING_MEMBER + 1 == nitems(inm_statestrs), "inm_statetrs: no longer matches #defines"); static const char * inm_state_str(const int state) { if (state >= IGMP_NOT_MEMBER && state <= IGMP_LEAVING_MEMBER) return (inm_statestrs[state]); return ("??"); } /* * Dump an in_multi structure to the console. */ void inm_print(const struct in_multi *inm) { int t; char addrbuf[INET_ADDRSTRLEN]; if ((ktr_mask & KTR_IGMPV3) == 0) return; printf("%s: --- begin inm %p ---\n", __func__, inm); printf("addr %s ifp %p(%s) ifma %p\n", inet_ntoa_r(inm->inm_addr, addrbuf), inm->inm_ifp, inm->inm_ifp->if_xname, inm->inm_ifma); printf("timer %u state %s refcount %u scq.len %u\n", inm->inm_timer, inm_state_str(inm->inm_state), inm->inm_refcount, inm->inm_scq.mq_len); printf("igi %p nsrc %lu sctimer %u scrv %u\n", inm->inm_igi, inm->inm_nsrc, inm->inm_sctimer, inm->inm_scrv); for (t = 0; t < 2; t++) { printf("t%d: fmode %s asm %u ex %u in %u rec %u\n", t, inm_mode_str(inm->inm_st[t].iss_fmode), inm->inm_st[t].iss_asm, inm->inm_st[t].iss_ex, inm->inm_st[t].iss_in, inm->inm_st[t].iss_rec); } printf("%s: --- end inm %p ---\n", __func__, inm); } #else /* !KTR || !(KTR_COMPILE & KTR_IGMPV3) */ void inm_print(const struct in_multi *inm) { } #endif /* KTR && (KTR_COMPILE & KTR_IGMPV3) */ RB_GENERATE(ip_msource_tree, ip_msource, ims_link, ip_msource_cmp); diff --git a/sys/netinet/ip_divert.c b/sys/netinet/ip_divert.c index d14ec5190ad0..b09d7e1dda7a 100644 --- a/sys/netinet/ip_divert.c +++ b/sys/netinet/ip_divert.c @@ -1,795 +1,800 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_sctp.h" #ifndef INET -#error "IPDIVERT requires INET" +#error "IPDIVERT requires INET" /* XXX! */ #endif #include #include #include #include #include #include #include #include #include #include +#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #include #endif #if defined(SCTP) || defined(SCTP_SUPPORT) #include #endif #include /* * Divert sockets */ /* * Allocate enough space to hold a full IP packet */ #define DIVSNDQ (65536 + 100) #define DIVRCVQ (65536 + 100) /* * Divert sockets work in conjunction with ipfw or other packet filters, * see the divert(4) manpage for features. * Packets are selected by the packet filter and tagged with an * MTAG_IPFW_RULE tag carrying the 'divert port' number (as set by * the packet filter) and information on the matching filter rule for * subsequent reinjection. The divert_port is used to put the packet * on the corresponding divert socket, while the rule number is passed * up (at least partially) as the sin_port in the struct sockaddr. * * Packets written to the divert socket carry in sin_addr a * destination address, and in sin_port the number of the filter rule * after which to continue processing. * If the destination address is INADDR_ANY, the packet is treated as * as outgoing and sent to ip_output(); otherwise it is treated as * incoming and sent to ip_input(). * Further, sin_zero carries some information on the interface, * which can be used in the reinject -- see comments in the code. * * On reinjection, processing in ip_input() and ip_output() * will be exactly the same as for the original packet, except that * packet filter processing will start at the rule number after the one * written in the sin_port (ipfw does not allow a rule #0, so sin_port=0 * will apply the entire ruleset to the packet). */ /* Internal variables. */ VNET_DEFINE_STATIC(struct inpcbinfo, divcbinfo); #define V_divcbinfo VNET(divcbinfo) static u_long div_sendspace = DIVSNDQ; /* XXX sysctl ? */ static u_long div_recvspace = DIVRCVQ; /* XXX sysctl ? */ static int div_output_inbound(int fmaily, struct socket *so, struct mbuf *m, struct sockaddr_in *sin); static int div_output_outbound(int family, struct socket *so, struct mbuf *m); /* * Initialize divert connection block queue. */ INPCBSTORAGE_DEFINE(divcbstor, "divinp", "divcb", "div", "divhash"); static void div_init(void *arg __unused) { /* * XXX We don't use the hash list for divert IP, but it's easier to * allocate one-entry hash lists than it is to check all over the * place for hashbase == NULL. */ in_pcbinfo_init(&V_divcbinfo, &divcbstor, 1, 1); } VNET_SYSINIT(div_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, div_init, NULL); static void div_destroy(void *unused __unused) { in_pcbinfo_destroy(&V_divcbinfo); } VNET_SYSUNINIT(divert, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, div_destroy, NULL); static bool div_port_match(const struct inpcb *inp, void *v) { uint16_t nport = *(uint16_t *)v; return (inp->inp_lport == nport); } /* * Divert a packet by passing it up to the divert socket at port 'port'. */ static void divert_packet(struct mbuf *m, bool incoming) { #if defined(SCTP) || defined(SCTP_SUPPORT) struct ip *ip; #endif struct inpcb *inp; struct socket *sa; u_int16_t nport; struct sockaddr_in divsrc; struct inpcb_iterator inpi = INP_ITERATOR(&V_divcbinfo, INPLOOKUP_RLOCKPCB, div_port_match, &nport); struct m_tag *mtag; NET_EPOCH_ASSERT(); mtag = m_tag_locate(m, MTAG_IPFW_RULE, 0, NULL); if (mtag == NULL) { m_freem(m); return; } /* Assure header */ if (m->m_len < sizeof(struct ip) && (m = m_pullup(m, sizeof(struct ip))) == NULL) return; /* Delayed checksums are currently not compatible with divert. */ if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { in_delayed_cksum(m); m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; } #if defined(SCTP) || defined(SCTP_SUPPORT) if (m->m_pkthdr.csum_flags & CSUM_SCTP) { ip = mtod(m, struct ip *); sctp_delayed_cksum(m, (uint32_t)(ip->ip_hl << 2)); m->m_pkthdr.csum_flags &= ~CSUM_SCTP; } #endif #ifdef INET6 if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA_IPV6) { in6_delayed_cksum(m, m->m_pkthdr.len - sizeof(struct ip6_hdr), sizeof(struct ip6_hdr)); m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA_IPV6; } #if defined(SCTP) || defined(SCTP_SUPPORT) if (m->m_pkthdr.csum_flags & CSUM_SCTP_IPV6) { sctp_delayed_cksum(m, sizeof(struct ip6_hdr)); m->m_pkthdr.csum_flags &= ~CSUM_SCTP_IPV6; } #endif #endif /* INET6 */ bzero(&divsrc, sizeof(divsrc)); divsrc.sin_len = sizeof(divsrc); divsrc.sin_family = AF_INET; /* record matching rule, in host format */ divsrc.sin_port = ((struct ipfw_rule_ref *)(mtag+1))->rulenum; /* * Record receive interface address, if any. * But only for incoming packets. */ if (incoming) { struct ifaddr *ifa; struct ifnet *ifp; /* Sanity check */ M_ASSERTPKTHDR(m); /* Find IP address for receive interface */ ifp = m->m_pkthdr.rcvif; CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET) continue; divsrc.sin_addr = ((struct sockaddr_in *) ifa->ifa_addr)->sin_addr; break; } } /* * Record the incoming interface name whenever we have one. */ if (m->m_pkthdr.rcvif) { /* * Hide the actual interface name in there in the * sin_zero array. XXX This needs to be moved to a * different sockaddr type for divert, e.g. * sockaddr_div with multiple fields like * sockaddr_dl. Presently we have only 7 bytes * but that will do for now as most interfaces * are 4 or less + 2 or less bytes for unit. * There is probably a faster way of doing this, * possibly taking it from the sockaddr_dl on the iface. * This solves the problem of a P2P link and a LAN interface * having the same address, which can result in the wrong * interface being assigned to the packet when fed back * into the divert socket. Theoretically if the daemon saves * and re-uses the sockaddr_in as suggested in the man pages, * this iface name will come along for the ride. * (see div_output for the other half of this.) */ strlcpy(divsrc.sin_zero, m->m_pkthdr.rcvif->if_xname, sizeof(divsrc.sin_zero)); } /* Put packet on socket queue, if any */ sa = NULL; /* nport is inp_next's context. */ nport = htons((u_int16_t)(((struct ipfw_rule_ref *)(mtag+1))->info)); while ((inp = inp_next(&inpi)) != NULL) { sa = inp->inp_socket; SOCKBUF_LOCK(&sa->so_rcv); if (sbappendaddr_locked(&sa->so_rcv, (struct sockaddr *)&divsrc, m, NULL) == 0) { soroverflow_locked(sa); sa = NULL; /* force mbuf reclaim below */ } else sorwakeup_locked(sa); /* XXX why does only one socket match? */ INP_RUNLOCK(inp); break; } if (sa == NULL) { m_freem(m); KMOD_IPSTAT_INC(ips_noproto); KMOD_IPSTAT_DEC(ips_delivered); } } /* * Deliver packet back into the IP processing machinery. * * If no address specified, or address is 0.0.0.0, send to ip_output(); * otherwise, send to ip_input() and mark as having been received on * the interface with that address. */ static int div_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, struct mbuf *control, struct thread *td) { struct epoch_tracker et; struct sockaddr_in *sin = (struct sockaddr_in *)nam; const struct ip *ip; struct m_tag *mtag; struct ipfw_rule_ref *dt; int error, family; if (control) m_freem(control); /* Packet must have a header (but that's about it) */ if (m->m_len < sizeof (struct ip) && (m = m_pullup(m, sizeof (struct ip))) == NULL) { KMOD_IPSTAT_INC(ips_toosmall); m_freem(m); return (EINVAL); } if (sin != NULL) { if (sin->sin_family != AF_INET) { m_freem(m); return (EAFNOSUPPORT); } if (sin->sin_len != sizeof(*sin)) { m_freem(m); return (EINVAL); } } /* * An mbuf may hasn't come from userland, but we pretend * that it has. */ m->m_pkthdr.rcvif = NULL; m->m_nextpkt = NULL; M_SETFIB(m, so->so_fibnum); mtag = m_tag_locate(m, MTAG_IPFW_RULE, 0, NULL); if (mtag == NULL) { /* this should be normal */ mtag = m_tag_alloc(MTAG_IPFW_RULE, 0, sizeof(struct ipfw_rule_ref), M_NOWAIT | M_ZERO); if (mtag == NULL) { m_freem(m); return (ENOBUFS); } m_tag_prepend(m, mtag); } dt = (struct ipfw_rule_ref *)(mtag+1); /* Loopback avoidance and state recovery */ if (sin) { int i; /* set the starting point. We provide a non-zero slot, * but a non_matching chain_id to skip that info and use * the rulenum/rule_id. */ dt->slot = 1; /* dummy, chain_id is invalid */ dt->chain_id = 0; dt->rulenum = sin->sin_port+1; /* host format ? */ dt->rule_id = 0; /* XXX: broken for IPv6 */ /* * Find receive interface with the given name, stuffed * (if it exists) in the sin_zero[] field. * The name is user supplied data so don't trust its size * or that it is zero terminated. */ for (i = 0; i < sizeof(sin->sin_zero) && sin->sin_zero[i]; i++) ; if ( i > 0 && i < sizeof(sin->sin_zero)) m->m_pkthdr.rcvif = ifunit(sin->sin_zero); } ip = mtod(m, struct ip *); switch (ip->ip_v) { case IPVERSION: family = AF_INET; break; #ifdef INET6 case IPV6_VERSION >> 4: family = AF_INET6; break; #endif default: m_freem(m); return (EAFNOSUPPORT); } /* Reinject packet into the system as incoming or outgoing */ NET_EPOCH_ENTER(et); if (!sin || sin->sin_addr.s_addr == 0) { dt->info |= IPFW_IS_DIVERT | IPFW_INFO_OUT; error = div_output_outbound(family, so, m); } else { dt->info |= IPFW_IS_DIVERT | IPFW_INFO_IN; error = div_output_inbound(family, so, m, sin); } NET_EPOCH_EXIT(et); return (error); } /* * Sends mbuf @m to the wire via ip[6]_output(). * * Returns 0 on success or an errno value on failure. @m is always consumed. */ static int div_output_outbound(int family, struct socket *so, struct mbuf *m) { struct ip *const ip = mtod(m, struct ip *); struct mbuf *options; struct inpcb *inp; int error; inp = sotoinpcb(so); INP_RLOCK(inp); switch (family) { case AF_INET: /* * Don't allow both user specified and setsockopt * options, and don't allow packet length sizes that * will crash. */ if ((((ip->ip_hl << 2) != sizeof(struct ip)) && inp->inp_options != NULL) || ((u_short)ntohs(ip->ip_len) > m->m_pkthdr.len)) { INP_RUNLOCK(inp); m_freem(m); return (EINVAL); } break; #ifdef INET6 case AF_INET6: { struct ip6_hdr *const ip6 = mtod(m, struct ip6_hdr *); /* Don't allow packet length sizes that will crash */ if (((u_short)ntohs(ip6->ip6_plen) > m->m_pkthdr.len)) { INP_RUNLOCK(inp); m_freem(m); return (EINVAL); } break; } #endif } /* Send packet to output processing */ KMOD_IPSTAT_INC(ips_rawout); /* XXX */ #ifdef MAC mac_inpcb_create_mbuf(inp, m); #endif /* * Get ready to inject the packet into ip_output(). * Just in case socket options were specified on the * divert socket, we duplicate them. This is done * to avoid having to hold the PCB locks over the call * to ip_output(), as doing this results in a number of * lock ordering complexities. * * Note that we set the multicast options argument for * ip_output() to NULL since it should be invariant that * they are not present. */ KASSERT(inp->inp_moptions == NULL, ("multicast options set on a divert socket")); /* * XXXCSJP: It is unclear to me whether or not it makes * sense for divert sockets to have options. However, * for now we will duplicate them with the INP locks * held so we can use them in ip_output() without * requring a reference to the pcb. */ options = NULL; if (inp->inp_options != NULL) { options = m_dup(inp->inp_options, M_NOWAIT); if (options == NULL) { INP_RUNLOCK(inp); m_freem(m); return (ENOBUFS); } } INP_RUNLOCK(inp); error = 0; switch (family) { case AF_INET: error = ip_output(m, options, NULL, ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0) | IP_ALLOWBROADCAST | IP_RAWOUTPUT, NULL, NULL); break; #ifdef INET6 case AF_INET6: error = ip6_output(m, NULL, NULL, 0, NULL, NULL, NULL); break; #endif } if (options != NULL) m_freem(options); return (error); } /* * Schedules mbuf @m for local processing via IPv4/IPv6 netisr queue. * * Returns 0 on success or an errno value on failure. @m is always consumed. */ static int div_output_inbound(int family, struct socket *so, struct mbuf *m, struct sockaddr_in *sin) { const struct ip *ip; struct ifaddr *ifa; if (m->m_pkthdr.rcvif == NULL) { /* * No luck with the name, check by IP address. * Clear the port and the ifname to make sure * there are no distractions for ifa_ifwithaddr. */ /* XXX: broken for IPv6 */ bzero(sin->sin_zero, sizeof(sin->sin_zero)); sin->sin_port = 0; ifa = ifa_ifwithaddr((struct sockaddr *) sin); if (ifa == NULL) { m_freem(m); return (EADDRNOTAVAIL); } m->m_pkthdr.rcvif = ifa->ifa_ifp; } #ifdef MAC mac_socket_create_mbuf(so, m); #endif /* Send packet to input processing via netisr */ switch (family) { case AF_INET: ip = mtod(m, struct ip *); /* * Restore M_BCAST flag when destination address is * broadcast. It is expected by ip_tryforward(). */ if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) m->m_flags |= M_MCAST; else if (in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) m->m_flags |= M_BCAST; netisr_queue_src(NETISR_IP, (uintptr_t)so, m); break; #ifdef INET6 case AF_INET6: netisr_queue_src(NETISR_IPV6, (uintptr_t)so, m); break; #endif default: m_freem(m); return (EINVAL); } return (0); } static int div_attach(struct socket *so, int proto, struct thread *td) { struct inpcb *inp; int error; inp = sotoinpcb(so); KASSERT(inp == NULL, ("div_attach: inp != NULL")); if (td != NULL) { error = priv_check(td, PRIV_NETINET_DIVERT); if (error) return (error); } error = soreserve(so, div_sendspace, div_recvspace); if (error) return error; error = in_pcballoc(so, &V_divcbinfo); if (error) return error; inp = (struct inpcb *)so->so_pcb; inp->inp_ip_p = proto; inp->inp_flags |= INP_HDRINCL; INP_WUNLOCK(inp); return 0; } static void div_detach(struct socket *so) { struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("div_detach: inp == NULL")); INP_WLOCK(inp); in_pcbdetach(inp); in_pcbfree(inp); } static int div_bind(struct socket *so, struct sockaddr *nam, struct thread *td) { struct inpcb *inp; int error; inp = sotoinpcb(so); KASSERT(inp != NULL, ("div_bind: inp == NULL")); /* in_pcbbind assumes that nam is a sockaddr_in * and in_pcbbind requires a valid address. Since divert * sockets don't we need to make sure the address is * filled in properly. * XXX -- divert should not be abusing in_pcbind * and should probably have its own family. */ if (nam->sa_family != AF_INET) return EAFNOSUPPORT; if (nam->sa_len != sizeof(struct sockaddr_in)) return EINVAL; ((struct sockaddr_in *)nam)->sin_addr.s_addr = INADDR_ANY; INP_WLOCK(inp); INP_HASH_WLOCK(&V_divcbinfo); error = in_pcbbind(inp, nam, td->td_ucred); INP_HASH_WUNLOCK(&V_divcbinfo); INP_WUNLOCK(inp); return error; } static int div_shutdown(struct socket *so) { struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("div_shutdown: inp == NULL")); INP_WLOCK(inp); socantsendmore(so); INP_WUNLOCK(inp); return 0; } static int div_pcblist(SYSCTL_HANDLER_ARGS) { struct inpcb_iterator inpi = INP_ALL_ITERATOR(&V_divcbinfo, INPLOOKUP_RLOCKPCB); struct xinpgen xig; struct inpcb *inp; int error; if (req->newptr != 0) return EPERM; if (req->oldptr == 0) { int n; n = V_divcbinfo.ipi_count; n += imax(n / 8, 10); req->oldidx = 2 * (sizeof xig) + n * sizeof(struct xinpcb); return 0; } if ((error = sysctl_wire_old_buffer(req, 0)) != 0) return (error); bzero(&xig, sizeof(xig)); xig.xig_len = sizeof xig; xig.xig_count = V_divcbinfo.ipi_count; xig.xig_gen = V_divcbinfo.ipi_gencnt; xig.xig_sogen = so_gencnt; error = SYSCTL_OUT(req, &xig, sizeof xig); if (error) return error; while ((inp = inp_next(&inpi)) != NULL) { if (inp->inp_gencnt <= xig.xig_gen) { struct xinpcb xi; in_pcbtoxinpcb(inp, &xi); error = SYSCTL_OUT(req, &xi, sizeof xi); if (error) { INP_RUNLOCK(inp); break; } } } if (!error) { /* * Give the user an updated idea of our state. * If the generation differs from what we told * her before, she knows that something happened * while we were processing this request, and it * might be necessary to retry. */ xig.xig_gen = V_divcbinfo.ipi_gencnt; xig.xig_sogen = so_gencnt; xig.xig_count = V_divcbinfo.ipi_count; error = SYSCTL_OUT(req, &xig, sizeof xig); } return (error); } #ifdef SYSCTL_NODE static SYSCTL_NODE(_net_inet, IPPROTO_DIVERT, divert, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "IPDIVERT"); SYSCTL_PROC(_net_inet_divert, OID_AUTO, pcblist, CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, div_pcblist, "S,xinpcb", "List of active divert sockets"); #endif static struct protosw div_protosw = { .pr_type = SOCK_RAW, - .pr_protocol = IPPROTO_DIVERT, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_attach = div_attach, .pr_bind = div_bind, .pr_control = in_control, .pr_detach = div_detach, .pr_peeraddr = in_getpeeraddr, .pr_send = div_send, .pr_shutdown = div_shutdown, .pr_sockaddr = in_getsockaddr, .pr_sosetlabel = in_pcbsosetlabel }; +static struct domain divertdomain = { + .dom_family = PF_DIVERT, + .dom_name = "divert", + .dom_nprotosw = 1, + .dom_protosw = { &div_protosw }, +}; + static int div_modevent(module_t mod, int type, void *unused) { int err = 0; switch (type) { case MOD_LOAD: - /* - * Protocol will be initialized by pf_proto_register(). - */ - err = protosw_register(&inetdomain, &div_protosw); - if (err != 0) - return (err); + domain_add(&divertdomain); ip_divert_ptr = divert_packet; break; case MOD_QUIESCE: /* * IPDIVERT may normally not be unloaded because of the * potential race conditions. Tell kldunload we can't be * unloaded unless the unload is forced. */ err = EPERM; break; case MOD_UNLOAD: /* * Forced unload. * * Module ipdivert can only be unloaded if no sockets are * connected. Maybe this can be changed later to forcefully * disconnect any open sockets. * * XXXRW: Note that there is a slight race here, as a new * socket open request could be spinning on the lock and then * we destroy the lock. + * + * XXXGL: One more reason this code is incorrect is that it + * checks only the current vnet. */ INP_INFO_WLOCK(&V_divcbinfo); if (V_divcbinfo.ipi_count != 0) { err = EBUSY; INP_INFO_WUNLOCK(&V_divcbinfo); break; } ip_divert_ptr = NULL; - err = protosw_unregister(&div_protosw); + domain_remove(&divertdomain); INP_INFO_WUNLOCK(&V_divcbinfo); #ifndef VIMAGE div_destroy(NULL); #endif break; default: err = EOPNOTSUPP; break; } return err; } static moduledata_t ipdivertmod = { "ipdivert", div_modevent, 0 }; DECLARE_MODULE(ipdivert, ipdivertmod, SI_SUB_PROTO_FIREWALL, SI_ORDER_ANY); MODULE_DEPEND(ipdivert, ipfw, 3, 3, 3); MODULE_VERSION(ipdivert, 1); diff --git a/sys/netinet6/in6_mcast.c b/sys/netinet6/in6_mcast.c index d0f8186e75c7..a02e18656dc2 100644 --- a/sys/netinet6/in6_mcast.c +++ b/sys/netinet6/in6_mcast.c @@ -1,2930 +1,2922 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2009 Bruce Simpson. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * IPv6 multicast socket, group, and socket option processing module. * Normative references: RFC 2292, RFC 3492, RFC 3542, RFC 3678, RFC 3810. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifndef KTR_MLD #define KTR_MLD KTR_INET6 #endif #ifndef __SOCKUNION_DECLARED union sockunion { struct sockaddr_storage ss; struct sockaddr sa; struct sockaddr_dl sdl; struct sockaddr_in6 sin6; }; typedef union sockunion sockunion_t; #define __SOCKUNION_DECLARED #endif /* __SOCKUNION_DECLARED */ static MALLOC_DEFINE(M_IN6MFILTER, "in6_mfilter", "IPv6 multicast PCB-layer source filter"); MALLOC_DEFINE(M_IP6MADDR, "in6_multi", "IPv6 multicast group"); static MALLOC_DEFINE(M_IP6MOPTS, "ip6_moptions", "IPv6 multicast options"); static MALLOC_DEFINE(M_IP6MSOURCE, "ip6_msource", "IPv6 multicast MLD-layer source filter"); RB_GENERATE(ip6_msource_tree, ip6_msource, im6s_link, ip6_msource_cmp); /* * Locking: * - Lock order is: Giant, IN6_MULTI_LOCK, INP_WLOCK, * IN6_MULTI_LIST_LOCK, MLD_LOCK, IF_ADDR_LOCK. * - The IF_ADDR_LOCK is implicitly taken by in6m_lookup() earlier, however * it can be taken by code in net/if.c also. * - ip6_moptions and in6_mfilter are covered by the INP_WLOCK. * * struct in6_multi is covered by IN6_MULTI_LOCK. There isn't strictly * any need for in6_multi itself to be virtualized -- it is bound to an ifp * anyway no matter what happens. */ struct mtx in6_multi_list_mtx; MTX_SYSINIT(in6_multi_mtx, &in6_multi_list_mtx, "in6_multi_list_mtx", MTX_DEF); struct mtx in6_multi_free_mtx; MTX_SYSINIT(in6_multi_free_mtx, &in6_multi_free_mtx, "in6_multi_free_mtx", MTX_DEF); struct sx in6_multi_sx; SX_SYSINIT(in6_multi_sx, &in6_multi_sx, "in6_multi_sx"); static void im6f_commit(struct in6_mfilter *); static int im6f_get_source(struct in6_mfilter *imf, const struct sockaddr_in6 *psin, struct in6_msource **); static struct in6_msource * im6f_graft(struct in6_mfilter *, const uint8_t, const struct sockaddr_in6 *); static void im6f_leave(struct in6_mfilter *); static int im6f_prune(struct in6_mfilter *, const struct sockaddr_in6 *); static void im6f_purge(struct in6_mfilter *); static void im6f_rollback(struct in6_mfilter *); static void im6f_reap(struct in6_mfilter *); static struct in6_mfilter * im6o_match_group(const struct ip6_moptions *, const struct ifnet *, const struct sockaddr *); static struct in6_msource * im6o_match_source(struct in6_mfilter *, const struct sockaddr *); static void im6s_merge(struct ip6_msource *ims, const struct in6_msource *lims, const int rollback); static int in6_getmulti(struct ifnet *, const struct in6_addr *, struct in6_multi **); static int in6_joingroup_locked(struct ifnet *, const struct in6_addr *, struct in6_mfilter *, struct in6_multi **, int); static int in6m_get_source(struct in6_multi *inm, const struct in6_addr *addr, const int noalloc, struct ip6_msource **pims); #ifdef KTR static int in6m_is_ifp_detached(const struct in6_multi *); #endif static int in6m_merge(struct in6_multi *, /*const*/ struct in6_mfilter *); static void in6m_purge(struct in6_multi *); static void in6m_reap(struct in6_multi *); static struct ip6_moptions * in6p_findmoptions(struct inpcb *); static int in6p_get_source_filters(struct inpcb *, struct sockopt *); static int in6p_join_group(struct inpcb *, struct sockopt *); static int in6p_leave_group(struct inpcb *, struct sockopt *); static struct ifnet * in6p_lookup_mcast_ifp(const struct inpcb *, const struct sockaddr_in6 *); static int in6p_block_unblock_source(struct inpcb *, struct sockopt *); static int in6p_set_multicast_if(struct inpcb *, struct sockopt *); static int in6p_set_source_filters(struct inpcb *, struct sockopt *); static int sysctl_ip6_mcast_filters(SYSCTL_HANDLER_ARGS); SYSCTL_DECL(_net_inet6_ip6); /* XXX Not in any common header. */ static SYSCTL_NODE(_net_inet6_ip6, OID_AUTO, mcast, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "IPv6 multicast"); static u_long in6_mcast_maxgrpsrc = IPV6_MAX_GROUP_SRC_FILTER; SYSCTL_ULONG(_net_inet6_ip6_mcast, OID_AUTO, maxgrpsrc, CTLFLAG_RWTUN, &in6_mcast_maxgrpsrc, 0, "Max source filters per group"); static u_long in6_mcast_maxsocksrc = IPV6_MAX_SOCK_SRC_FILTER; SYSCTL_ULONG(_net_inet6_ip6_mcast, OID_AUTO, maxsocksrc, CTLFLAG_RWTUN, &in6_mcast_maxsocksrc, 0, "Max source filters per socket"); /* TODO Virtualize this switch. */ int in6_mcast_loop = IPV6_DEFAULT_MULTICAST_LOOP; SYSCTL_INT(_net_inet6_ip6_mcast, OID_AUTO, loop, CTLFLAG_RWTUN, &in6_mcast_loop, 0, "Loopback multicast datagrams by default"); static SYSCTL_NODE(_net_inet6_ip6_mcast, OID_AUTO, filters, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_ip6_mcast_filters, "Per-interface stack-wide source filters"); #ifdef KTR /* * Inline function which wraps assertions for a valid ifp. * The ifnet layer will set the ifma's ifp pointer to NULL if the ifp * is detached. */ static int __inline in6m_is_ifp_detached(const struct in6_multi *inm) { struct ifnet *ifp; KASSERT(inm->in6m_ifma != NULL, ("%s: no ifma", __func__)); ifp = inm->in6m_ifma->ifma_ifp; if (ifp != NULL) { /* * Sanity check that network-layer notion of ifp is the * same as that of link-layer. */ KASSERT(inm->in6m_ifp == ifp, ("%s: bad ifp", __func__)); } return (ifp == NULL); } #endif /* * Initialize an in6_mfilter structure to a known state at t0, t1 * with an empty source filter list. */ static __inline void im6f_init(struct in6_mfilter *imf, const int st0, const int st1) { memset(imf, 0, sizeof(struct in6_mfilter)); RB_INIT(&imf->im6f_sources); imf->im6f_st[0] = st0; imf->im6f_st[1] = st1; } struct in6_mfilter * ip6_mfilter_alloc(const int mflags, const int st0, const int st1) { struct in6_mfilter *imf; imf = malloc(sizeof(*imf), M_IN6MFILTER, mflags); if (imf != NULL) im6f_init(imf, st0, st1); return (imf); } void ip6_mfilter_free(struct in6_mfilter *imf) { im6f_purge(imf); free(imf, M_IN6MFILTER); } /* * Find an IPv6 multicast group entry for this ip6_moptions instance * which matches the specified group, and optionally an interface. * Return its index into the array, or -1 if not found. */ static struct in6_mfilter * im6o_match_group(const struct ip6_moptions *imo, const struct ifnet *ifp, const struct sockaddr *group) { const struct sockaddr_in6 *gsin6; struct in6_mfilter *imf; struct in6_multi *inm; gsin6 = (const struct sockaddr_in6 *)group; IP6_MFILTER_FOREACH(imf, &imo->im6o_head) { inm = imf->im6f_in6m; if (inm == NULL) continue; if ((ifp == NULL || (inm->in6m_ifp == ifp)) && IN6_ARE_ADDR_EQUAL(&inm->in6m_addr, &gsin6->sin6_addr)) { break; } } return (imf); } /* * Find an IPv6 multicast source entry for this imo which matches * the given group index for this socket, and source address. * * XXX TODO: The scope ID, if present in src, is stripped before * any comparison. We SHOULD enforce scope/zone checks where the source * filter entry has a link scope. * * NOTE: This does not check if the entry is in-mode, merely if * it exists, which may not be the desired behaviour. */ static struct in6_msource * im6o_match_source(struct in6_mfilter *imf, const struct sockaddr *src) { struct ip6_msource find; struct ip6_msource *ims; const sockunion_t *psa; KASSERT(src->sa_family == AF_INET6, ("%s: !AF_INET6", __func__)); psa = (const sockunion_t *)src; find.im6s_addr = psa->sin6.sin6_addr; in6_clearscope(&find.im6s_addr); /* XXX */ ims = RB_FIND(ip6_msource_tree, &imf->im6f_sources, &find); return ((struct in6_msource *)ims); } /* * Perform filtering for multicast datagrams on a socket by group and source. * * Returns 0 if a datagram should be allowed through, or various error codes * if the socket was not a member of the group, or the source was muted, etc. */ int im6o_mc_filter(const struct ip6_moptions *imo, const struct ifnet *ifp, const struct sockaddr *group, const struct sockaddr *src) { struct in6_mfilter *imf; struct in6_msource *ims; int mode; KASSERT(ifp != NULL, ("%s: null ifp", __func__)); imf = im6o_match_group(imo, ifp, group); if (imf == NULL) return (MCAST_NOTGMEMBER); /* * Check if the source was included in an (S,G) join. * Allow reception on exclusive memberships by default, * reject reception on inclusive memberships by default. * Exclude source only if an in-mode exclude filter exists. * Include source only if an in-mode include filter exists. * NOTE: We are comparing group state here at MLD t1 (now) * with socket-layer t0 (since last downcall). */ mode = imf->im6f_st[1]; ims = im6o_match_source(imf, src); if ((ims == NULL && mode == MCAST_INCLUDE) || (ims != NULL && ims->im6sl_st[0] != mode)) return (MCAST_NOTSMEMBER); return (MCAST_PASS); } /* * Find and return a reference to an in6_multi record for (ifp, group), * and bump its reference count. * If one does not exist, try to allocate it, and update link-layer multicast * filters on ifp to listen for group. * Assumes the IN6_MULTI lock is held across the call. * Return 0 if successful, otherwise return an appropriate error code. */ static int in6_getmulti(struct ifnet *ifp, const struct in6_addr *group, struct in6_multi **pinm) { struct epoch_tracker et; struct sockaddr_in6 gsin6; struct ifmultiaddr *ifma; struct in6_multi *inm; int error; error = 0; /* * XXX: Accesses to ifma_protospec must be covered by IF_ADDR_LOCK; * if_addmulti() takes this mutex itself, so we must drop and * re-acquire around the call. */ IN6_MULTI_LOCK_ASSERT(); IN6_MULTI_LIST_LOCK(); IF_ADDR_WLOCK(ifp); NET_EPOCH_ENTER(et); /* * Does ifp support IPv6 multicasts? */ if (ifp->if_afdata[AF_INET6] == NULL) error = ENODEV; else inm = in6m_lookup_locked(ifp, group); NET_EPOCH_EXIT(et); if (error != 0) goto out_locked; if (inm != NULL) { /* * If we already joined this group, just bump the * refcount and return it. */ KASSERT(inm->in6m_refcount >= 1, ("%s: bad refcount %d", __func__, inm->in6m_refcount)); in6m_acquire_locked(inm); *pinm = inm; goto out_locked; } memset(&gsin6, 0, sizeof(gsin6)); gsin6.sin6_family = AF_INET6; gsin6.sin6_len = sizeof(struct sockaddr_in6); gsin6.sin6_addr = *group; /* * Check if a link-layer group is already associated * with this network-layer group on the given ifnet. */ IN6_MULTI_LIST_UNLOCK(); IF_ADDR_WUNLOCK(ifp); error = if_addmulti(ifp, (struct sockaddr *)&gsin6, &ifma); if (error != 0) return (error); IN6_MULTI_LIST_LOCK(); IF_ADDR_WLOCK(ifp); /* * If something other than netinet6 is occupying the link-layer * group, print a meaningful error message and back out of * the allocation. * Otherwise, bump the refcount on the existing network-layer * group association and return it. */ if (ifma->ifma_protospec != NULL) { inm = (struct in6_multi *)ifma->ifma_protospec; #ifdef INVARIANTS KASSERT(ifma->ifma_addr != NULL, ("%s: no ifma_addr", __func__)); KASSERT(ifma->ifma_addr->sa_family == AF_INET6, ("%s: ifma not AF_INET6", __func__)); KASSERT(inm != NULL, ("%s: no ifma_protospec", __func__)); if (inm->in6m_ifma != ifma || inm->in6m_ifp != ifp || !IN6_ARE_ADDR_EQUAL(&inm->in6m_addr, group)) panic("%s: ifma %p is inconsistent with %p (%p)", __func__, ifma, inm, group); #endif in6m_acquire_locked(inm); *pinm = inm; goto out_locked; } IF_ADDR_WLOCK_ASSERT(ifp); /* * A new in6_multi record is needed; allocate and initialize it. * We DO NOT perform an MLD join as the in6_ layer may need to * push an initial source list down to MLD to support SSM. * * The initial source filter state is INCLUDE, {} as per the RFC. * Pending state-changes per group are subject to a bounds check. */ inm = malloc(sizeof(*inm), M_IP6MADDR, M_NOWAIT | M_ZERO); if (inm == NULL) { IN6_MULTI_LIST_UNLOCK(); IF_ADDR_WUNLOCK(ifp); if_delmulti_ifma(ifma); return (ENOMEM); } inm->in6m_addr = *group; inm->in6m_ifp = ifp; inm->in6m_mli = MLD_IFINFO(ifp); inm->in6m_ifma = ifma; inm->in6m_refcount = 1; inm->in6m_state = MLD_NOT_MEMBER; mbufq_init(&inm->in6m_scq, MLD_MAX_STATE_CHANGES); inm->in6m_st[0].iss_fmode = MCAST_UNDEFINED; inm->in6m_st[1].iss_fmode = MCAST_UNDEFINED; RB_INIT(&inm->in6m_srcs); ifma->ifma_protospec = inm; *pinm = inm; out_locked: IN6_MULTI_LIST_UNLOCK(); IF_ADDR_WUNLOCK(ifp); return (error); } /* * Drop a reference to an in6_multi record. * * If the refcount drops to 0, free the in6_multi record and * delete the underlying link-layer membership. */ static void in6m_release(struct in6_multi *inm) { struct ifmultiaddr *ifma; struct ifnet *ifp; CTR2(KTR_MLD, "%s: refcount is %d", __func__, inm->in6m_refcount); MPASS(inm->in6m_refcount == 0); CTR2(KTR_MLD, "%s: freeing inm %p", __func__, inm); ifma = inm->in6m_ifma; ifp = inm->in6m_ifp; MPASS(ifma->ifma_llifma == NULL); /* XXX this access is not covered by IF_ADDR_LOCK */ CTR2(KTR_MLD, "%s: purging ifma %p", __func__, ifma); KASSERT(ifma->ifma_protospec == NULL, ("%s: ifma_protospec != NULL", __func__)); if (ifp == NULL) ifp = ifma->ifma_ifp; if (ifp != NULL) { CURVNET_SET(ifp->if_vnet); in6m_purge(inm); free(inm, M_IP6MADDR); if_delmulti_ifma_flags(ifma, 1); CURVNET_RESTORE(); if_rele(ifp); } else { in6m_purge(inm); free(inm, M_IP6MADDR); if_delmulti_ifma_flags(ifma, 1); } } /* * Interface detach can happen in a taskqueue thread context, so we must use a * dedicated thread to avoid deadlocks when draining in6m_release tasks. */ TASKQUEUE_DEFINE_THREAD(in6m_free); static struct in6_multi_head in6m_free_list = SLIST_HEAD_INITIALIZER(); static void in6m_release_task(void *arg __unused, int pending __unused); static struct task in6m_free_task = TASK_INITIALIZER(0, in6m_release_task, NULL); void in6m_release_list_deferred(struct in6_multi_head *inmh) { if (SLIST_EMPTY(inmh)) return; mtx_lock(&in6_multi_free_mtx); SLIST_CONCAT(&in6m_free_list, inmh, in6_multi, in6m_nrele); mtx_unlock(&in6_multi_free_mtx); taskqueue_enqueue(taskqueue_in6m_free, &in6m_free_task); } void in6m_release_wait(void *arg __unused) { /* * Make sure all pending multicast addresses are freed before * the VNET or network device is destroyed: */ taskqueue_drain_all(taskqueue_in6m_free); } #ifdef VIMAGE /* XXX-BZ FIXME, see D24914. */ VNET_SYSUNINIT(in6m_release_wait, SI_SUB_PROTO_DOMAIN, SI_ORDER_FIRST, in6m_release_wait, NULL); #endif void in6m_disconnect_locked(struct in6_multi_head *inmh, struct in6_multi *inm) { struct ifnet *ifp; struct ifaddr *ifa; struct in6_ifaddr *ifa6; struct in6_multi_mship *imm, *imm_tmp; struct ifmultiaddr *ifma, *ll_ifma; IN6_MULTI_LIST_LOCK_ASSERT(); ifp = inm->in6m_ifp; if (ifp == NULL) return; /* already called */ inm->in6m_ifp = NULL; IF_ADDR_WLOCK_ASSERT(ifp); ifma = inm->in6m_ifma; if (ifma == NULL) return; if_ref(ifp); if (ifma->ifma_flags & IFMA_F_ENQUEUED) { CK_STAILQ_REMOVE(&ifp->if_multiaddrs, ifma, ifmultiaddr, ifma_link); ifma->ifma_flags &= ~IFMA_F_ENQUEUED; } MCDPRINTF("removed ifma: %p from %s\n", ifma, ifp->if_xname); if ((ll_ifma = ifma->ifma_llifma) != NULL) { MPASS(ifma != ll_ifma); ifma->ifma_llifma = NULL; MPASS(ll_ifma->ifma_llifma == NULL); MPASS(ll_ifma->ifma_ifp == ifp); if (--ll_ifma->ifma_refcount == 0) { if (ll_ifma->ifma_flags & IFMA_F_ENQUEUED) { CK_STAILQ_REMOVE(&ifp->if_multiaddrs, ll_ifma, ifmultiaddr, ifma_link); ll_ifma->ifma_flags &= ~IFMA_F_ENQUEUED; } MCDPRINTF("removed ll_ifma: %p from %s\n", ll_ifma, ifp->if_xname); if_freemulti(ll_ifma); } } CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; ifa6 = (void *)ifa; LIST_FOREACH_SAFE(imm, &ifa6->ia6_memberships, i6mm_chain, imm_tmp) { if (inm == imm->i6mm_maddr) { LIST_REMOVE(imm, i6mm_chain); free(imm, M_IP6MADDR); in6m_rele_locked(inmh, inm); } } } } static void in6m_release_task(void *arg __unused, int pending __unused) { struct in6_multi_head in6m_free_tmp; struct in6_multi *inm, *tinm; SLIST_INIT(&in6m_free_tmp); mtx_lock(&in6_multi_free_mtx); SLIST_CONCAT(&in6m_free_tmp, &in6m_free_list, in6_multi, in6m_nrele); mtx_unlock(&in6_multi_free_mtx); IN6_MULTI_LOCK(); SLIST_FOREACH_SAFE(inm, &in6m_free_tmp, in6m_nrele, tinm) { SLIST_REMOVE_HEAD(&in6m_free_tmp, in6m_nrele); in6m_release(inm); } IN6_MULTI_UNLOCK(); } /* * Clear recorded source entries for a group. * Used by the MLD code. Caller must hold the IN6_MULTI lock. * FIXME: Should reap. */ void in6m_clear_recorded(struct in6_multi *inm) { struct ip6_msource *ims; IN6_MULTI_LIST_LOCK_ASSERT(); RB_FOREACH(ims, ip6_msource_tree, &inm->in6m_srcs) { if (ims->im6s_stp) { ims->im6s_stp = 0; --inm->in6m_st[1].iss_rec; } } KASSERT(inm->in6m_st[1].iss_rec == 0, ("%s: iss_rec %d not 0", __func__, inm->in6m_st[1].iss_rec)); } /* * Record a source as pending for a Source-Group MLDv2 query. * This lives here as it modifies the shared tree. * * inm is the group descriptor. * naddr is the address of the source to record in network-byte order. * * If the net.inet6.mld.sgalloc sysctl is non-zero, we will * lazy-allocate a source node in response to an SG query. * Otherwise, no allocation is performed. This saves some memory * with the trade-off that the source will not be reported to the * router if joined in the window between the query response and * the group actually being joined on the local host. * * VIMAGE: XXX: Currently the mld_sgalloc feature has been removed. * This turns off the allocation of a recorded source entry if * the group has not been joined. * * Return 0 if the source didn't exist or was already marked as recorded. * Return 1 if the source was marked as recorded by this function. * Return <0 if any error occurred (negated errno code). */ int in6m_record_source(struct in6_multi *inm, const struct in6_addr *addr) { struct ip6_msource find; struct ip6_msource *ims, *nims; IN6_MULTI_LIST_LOCK_ASSERT(); find.im6s_addr = *addr; ims = RB_FIND(ip6_msource_tree, &inm->in6m_srcs, &find); if (ims && ims->im6s_stp) return (0); if (ims == NULL) { if (inm->in6m_nsrc == in6_mcast_maxgrpsrc) return (-ENOSPC); nims = malloc(sizeof(struct ip6_msource), M_IP6MSOURCE, M_NOWAIT | M_ZERO); if (nims == NULL) return (-ENOMEM); nims->im6s_addr = find.im6s_addr; RB_INSERT(ip6_msource_tree, &inm->in6m_srcs, nims); ++inm->in6m_nsrc; ims = nims; } /* * Mark the source as recorded and update the recorded * source count. */ ++ims->im6s_stp; ++inm->in6m_st[1].iss_rec; return (1); } /* * Return a pointer to an in6_msource owned by an in6_mfilter, * given its source address. * Lazy-allocate if needed. If this is a new entry its filter state is * undefined at t0. * * imf is the filter set being modified. * addr is the source address. * * SMPng: May be called with locks held; malloc must not block. */ static int im6f_get_source(struct in6_mfilter *imf, const struct sockaddr_in6 *psin, struct in6_msource **plims) { struct ip6_msource find; struct ip6_msource *ims, *nims; struct in6_msource *lims; int error; error = 0; ims = NULL; lims = NULL; find.im6s_addr = psin->sin6_addr; ims = RB_FIND(ip6_msource_tree, &imf->im6f_sources, &find); lims = (struct in6_msource *)ims; if (lims == NULL) { if (imf->im6f_nsrc == in6_mcast_maxsocksrc) return (ENOSPC); nims = malloc(sizeof(struct in6_msource), M_IN6MFILTER, M_NOWAIT | M_ZERO); if (nims == NULL) return (ENOMEM); lims = (struct in6_msource *)nims; lims->im6s_addr = find.im6s_addr; lims->im6sl_st[0] = MCAST_UNDEFINED; RB_INSERT(ip6_msource_tree, &imf->im6f_sources, nims); ++imf->im6f_nsrc; } *plims = lims; return (error); } /* * Graft a source entry into an existing socket-layer filter set, * maintaining any required invariants and checking allocations. * * The source is marked as being in the new filter mode at t1. * * Return the pointer to the new node, otherwise return NULL. */ static struct in6_msource * im6f_graft(struct in6_mfilter *imf, const uint8_t st1, const struct sockaddr_in6 *psin) { struct ip6_msource *nims; struct in6_msource *lims; nims = malloc(sizeof(struct in6_msource), M_IN6MFILTER, M_NOWAIT | M_ZERO); if (nims == NULL) return (NULL); lims = (struct in6_msource *)nims; lims->im6s_addr = psin->sin6_addr; lims->im6sl_st[0] = MCAST_UNDEFINED; lims->im6sl_st[1] = st1; RB_INSERT(ip6_msource_tree, &imf->im6f_sources, nims); ++imf->im6f_nsrc; return (lims); } /* * Prune a source entry from an existing socket-layer filter set, * maintaining any required invariants and checking allocations. * * The source is marked as being left at t1, it is not freed. * * Return 0 if no error occurred, otherwise return an errno value. */ static int im6f_prune(struct in6_mfilter *imf, const struct sockaddr_in6 *psin) { struct ip6_msource find; struct ip6_msource *ims; struct in6_msource *lims; find.im6s_addr = psin->sin6_addr; ims = RB_FIND(ip6_msource_tree, &imf->im6f_sources, &find); if (ims == NULL) return (ENOENT); lims = (struct in6_msource *)ims; lims->im6sl_st[1] = MCAST_UNDEFINED; return (0); } /* * Revert socket-layer filter set deltas at t1 to t0 state. */ static void im6f_rollback(struct in6_mfilter *imf) { struct ip6_msource *ims, *tims; struct in6_msource *lims; RB_FOREACH_SAFE(ims, ip6_msource_tree, &imf->im6f_sources, tims) { lims = (struct in6_msource *)ims; if (lims->im6sl_st[0] == lims->im6sl_st[1]) { /* no change at t1 */ continue; } else if (lims->im6sl_st[0] != MCAST_UNDEFINED) { /* revert change to existing source at t1 */ lims->im6sl_st[1] = lims->im6sl_st[0]; } else { /* revert source added t1 */ CTR2(KTR_MLD, "%s: free ims %p", __func__, ims); RB_REMOVE(ip6_msource_tree, &imf->im6f_sources, ims); free(ims, M_IN6MFILTER); imf->im6f_nsrc--; } } imf->im6f_st[1] = imf->im6f_st[0]; } /* * Mark socket-layer filter set as INCLUDE {} at t1. */ static void im6f_leave(struct in6_mfilter *imf) { struct ip6_msource *ims; struct in6_msource *lims; RB_FOREACH(ims, ip6_msource_tree, &imf->im6f_sources) { lims = (struct in6_msource *)ims; lims->im6sl_st[1] = MCAST_UNDEFINED; } imf->im6f_st[1] = MCAST_INCLUDE; } /* * Mark socket-layer filter set deltas as committed. */ static void im6f_commit(struct in6_mfilter *imf) { struct ip6_msource *ims; struct in6_msource *lims; RB_FOREACH(ims, ip6_msource_tree, &imf->im6f_sources) { lims = (struct in6_msource *)ims; lims->im6sl_st[0] = lims->im6sl_st[1]; } imf->im6f_st[0] = imf->im6f_st[1]; } /* * Reap unreferenced sources from socket-layer filter set. */ static void im6f_reap(struct in6_mfilter *imf) { struct ip6_msource *ims, *tims; struct in6_msource *lims; RB_FOREACH_SAFE(ims, ip6_msource_tree, &imf->im6f_sources, tims) { lims = (struct in6_msource *)ims; if ((lims->im6sl_st[0] == MCAST_UNDEFINED) && (lims->im6sl_st[1] == MCAST_UNDEFINED)) { CTR2(KTR_MLD, "%s: free lims %p", __func__, ims); RB_REMOVE(ip6_msource_tree, &imf->im6f_sources, ims); free(ims, M_IN6MFILTER); imf->im6f_nsrc--; } } } /* * Purge socket-layer filter set. */ static void im6f_purge(struct in6_mfilter *imf) { struct ip6_msource *ims, *tims; RB_FOREACH_SAFE(ims, ip6_msource_tree, &imf->im6f_sources, tims) { CTR2(KTR_MLD, "%s: free ims %p", __func__, ims); RB_REMOVE(ip6_msource_tree, &imf->im6f_sources, ims); free(ims, M_IN6MFILTER); imf->im6f_nsrc--; } imf->im6f_st[0] = imf->im6f_st[1] = MCAST_UNDEFINED; KASSERT(RB_EMPTY(&imf->im6f_sources), ("%s: im6f_sources not empty", __func__)); } /* * Look up a source filter entry for a multicast group. * * inm is the group descriptor to work with. * addr is the IPv6 address to look up. * noalloc may be non-zero to suppress allocation of sources. * *pims will be set to the address of the retrieved or allocated source. * * SMPng: NOTE: may be called with locks held. * Return 0 if successful, otherwise return a non-zero error code. */ static int in6m_get_source(struct in6_multi *inm, const struct in6_addr *addr, const int noalloc, struct ip6_msource **pims) { struct ip6_msource find; struct ip6_msource *ims, *nims; #ifdef KTR char ip6tbuf[INET6_ADDRSTRLEN]; #endif find.im6s_addr = *addr; ims = RB_FIND(ip6_msource_tree, &inm->in6m_srcs, &find); if (ims == NULL && !noalloc) { if (inm->in6m_nsrc == in6_mcast_maxgrpsrc) return (ENOSPC); nims = malloc(sizeof(struct ip6_msource), M_IP6MSOURCE, M_NOWAIT | M_ZERO); if (nims == NULL) return (ENOMEM); nims->im6s_addr = *addr; RB_INSERT(ip6_msource_tree, &inm->in6m_srcs, nims); ++inm->in6m_nsrc; ims = nims; CTR3(KTR_MLD, "%s: allocated %s as %p", __func__, ip6_sprintf(ip6tbuf, addr), ims); } *pims = ims; return (0); } /* * Merge socket-layer source into MLD-layer source. * If rollback is non-zero, perform the inverse of the merge. */ static void im6s_merge(struct ip6_msource *ims, const struct in6_msource *lims, const int rollback) { int n = rollback ? -1 : 1; #ifdef KTR char ip6tbuf[INET6_ADDRSTRLEN]; ip6_sprintf(ip6tbuf, &lims->im6s_addr); #endif if (lims->im6sl_st[0] == MCAST_EXCLUDE) { CTR3(KTR_MLD, "%s: t1 ex -= %d on %s", __func__, n, ip6tbuf); ims->im6s_st[1].ex -= n; } else if (lims->im6sl_st[0] == MCAST_INCLUDE) { CTR3(KTR_MLD, "%s: t1 in -= %d on %s", __func__, n, ip6tbuf); ims->im6s_st[1].in -= n; } if (lims->im6sl_st[1] == MCAST_EXCLUDE) { CTR3(KTR_MLD, "%s: t1 ex += %d on %s", __func__, n, ip6tbuf); ims->im6s_st[1].ex += n; } else if (lims->im6sl_st[1] == MCAST_INCLUDE) { CTR3(KTR_MLD, "%s: t1 in += %d on %s", __func__, n, ip6tbuf); ims->im6s_st[1].in += n; } } /* * Atomically update the global in6_multi state, when a membership's * filter list is being updated in any way. * * imf is the per-inpcb-membership group filter pointer. * A fake imf may be passed for in-kernel consumers. * * XXX This is a candidate for a set-symmetric-difference style loop * which would eliminate the repeated lookup from root of ims nodes, * as they share the same key space. * * If any error occurred this function will back out of refcounts * and return a non-zero value. */ static int in6m_merge(struct in6_multi *inm, /*const*/ struct in6_mfilter *imf) { struct ip6_msource *ims, *nims; struct in6_msource *lims; int schanged, error; int nsrc0, nsrc1; schanged = 0; error = 0; nsrc1 = nsrc0 = 0; IN6_MULTI_LIST_LOCK_ASSERT(); /* * Update the source filters first, as this may fail. * Maintain count of in-mode filters at t0, t1. These are * used to work out if we transition into ASM mode or not. * Maintain a count of source filters whose state was * actually modified by this operation. */ RB_FOREACH(ims, ip6_msource_tree, &imf->im6f_sources) { lims = (struct in6_msource *)ims; if (lims->im6sl_st[0] == imf->im6f_st[0]) nsrc0++; if (lims->im6sl_st[1] == imf->im6f_st[1]) nsrc1++; if (lims->im6sl_st[0] == lims->im6sl_st[1]) continue; error = in6m_get_source(inm, &lims->im6s_addr, 0, &nims); ++schanged; if (error) break; im6s_merge(nims, lims, 0); } if (error) { struct ip6_msource *bims; RB_FOREACH_REVERSE_FROM(ims, ip6_msource_tree, nims) { lims = (struct in6_msource *)ims; if (lims->im6sl_st[0] == lims->im6sl_st[1]) continue; (void)in6m_get_source(inm, &lims->im6s_addr, 1, &bims); if (bims == NULL) continue; im6s_merge(bims, lims, 1); } goto out_reap; } CTR3(KTR_MLD, "%s: imf filters in-mode: %d at t0, %d at t1", __func__, nsrc0, nsrc1); /* Handle transition between INCLUDE {n} and INCLUDE {} on socket. */ if (imf->im6f_st[0] == imf->im6f_st[1] && imf->im6f_st[1] == MCAST_INCLUDE) { if (nsrc1 == 0) { CTR1(KTR_MLD, "%s: --in on inm at t1", __func__); --inm->in6m_st[1].iss_in; } } /* Handle filter mode transition on socket. */ if (imf->im6f_st[0] != imf->im6f_st[1]) { CTR3(KTR_MLD, "%s: imf transition %d to %d", __func__, imf->im6f_st[0], imf->im6f_st[1]); if (imf->im6f_st[0] == MCAST_EXCLUDE) { CTR1(KTR_MLD, "%s: --ex on inm at t1", __func__); --inm->in6m_st[1].iss_ex; } else if (imf->im6f_st[0] == MCAST_INCLUDE) { CTR1(KTR_MLD, "%s: --in on inm at t1", __func__); --inm->in6m_st[1].iss_in; } if (imf->im6f_st[1] == MCAST_EXCLUDE) { CTR1(KTR_MLD, "%s: ex++ on inm at t1", __func__); inm->in6m_st[1].iss_ex++; } else if (imf->im6f_st[1] == MCAST_INCLUDE && nsrc1 > 0) { CTR1(KTR_MLD, "%s: in++ on inm at t1", __func__); inm->in6m_st[1].iss_in++; } } /* * Track inm filter state in terms of listener counts. * If there are any exclusive listeners, stack-wide * membership is exclusive. * Otherwise, if only inclusive listeners, stack-wide is inclusive. * If no listeners remain, state is undefined at t1, * and the MLD lifecycle for this group should finish. */ if (inm->in6m_st[1].iss_ex > 0) { CTR1(KTR_MLD, "%s: transition to EX", __func__); inm->in6m_st[1].iss_fmode = MCAST_EXCLUDE; } else if (inm->in6m_st[1].iss_in > 0) { CTR1(KTR_MLD, "%s: transition to IN", __func__); inm->in6m_st[1].iss_fmode = MCAST_INCLUDE; } else { CTR1(KTR_MLD, "%s: transition to UNDEF", __func__); inm->in6m_st[1].iss_fmode = MCAST_UNDEFINED; } /* Decrement ASM listener count on transition out of ASM mode. */ if (imf->im6f_st[0] == MCAST_EXCLUDE && nsrc0 == 0) { if ((imf->im6f_st[1] != MCAST_EXCLUDE) || (imf->im6f_st[1] == MCAST_EXCLUDE && nsrc1 > 0)) { CTR1(KTR_MLD, "%s: --asm on inm at t1", __func__); --inm->in6m_st[1].iss_asm; } } /* Increment ASM listener count on transition to ASM mode. */ if (imf->im6f_st[1] == MCAST_EXCLUDE && nsrc1 == 0) { CTR1(KTR_MLD, "%s: asm++ on inm at t1", __func__); inm->in6m_st[1].iss_asm++; } CTR3(KTR_MLD, "%s: merged imf %p to inm %p", __func__, imf, inm); in6m_print(inm); out_reap: if (schanged > 0) { CTR1(KTR_MLD, "%s: sources changed; reaping", __func__); in6m_reap(inm); } return (error); } /* * Mark an in6_multi's filter set deltas as committed. * Called by MLD after a state change has been enqueued. */ void in6m_commit(struct in6_multi *inm) { struct ip6_msource *ims; CTR2(KTR_MLD, "%s: commit inm %p", __func__, inm); CTR1(KTR_MLD, "%s: pre commit:", __func__); in6m_print(inm); RB_FOREACH(ims, ip6_msource_tree, &inm->in6m_srcs) { ims->im6s_st[0] = ims->im6s_st[1]; } inm->in6m_st[0] = inm->in6m_st[1]; } /* * Reap unreferenced nodes from an in6_multi's filter set. */ static void in6m_reap(struct in6_multi *inm) { struct ip6_msource *ims, *tims; RB_FOREACH_SAFE(ims, ip6_msource_tree, &inm->in6m_srcs, tims) { if (ims->im6s_st[0].ex > 0 || ims->im6s_st[0].in > 0 || ims->im6s_st[1].ex > 0 || ims->im6s_st[1].in > 0 || ims->im6s_stp != 0) continue; CTR2(KTR_MLD, "%s: free ims %p", __func__, ims); RB_REMOVE(ip6_msource_tree, &inm->in6m_srcs, ims); free(ims, M_IP6MSOURCE); inm->in6m_nsrc--; } } /* * Purge all source nodes from an in6_multi's filter set. */ static void in6m_purge(struct in6_multi *inm) { struct ip6_msource *ims, *tims; RB_FOREACH_SAFE(ims, ip6_msource_tree, &inm->in6m_srcs, tims) { CTR2(KTR_MLD, "%s: free ims %p", __func__, ims); RB_REMOVE(ip6_msource_tree, &inm->in6m_srcs, ims); free(ims, M_IP6MSOURCE); inm->in6m_nsrc--; } /* Free state-change requests that might be queued. */ mbufq_drain(&inm->in6m_scq); } /* * Join a multicast address w/o sources. * KAME compatibility entry point. * * SMPng: Assume no mc locks held by caller. */ int in6_joingroup(struct ifnet *ifp, const struct in6_addr *mcaddr, /*const*/ struct in6_mfilter *imf, struct in6_multi **pinm, const int delay) { int error; IN6_MULTI_LOCK(); error = in6_joingroup_locked(ifp, mcaddr, NULL, pinm, delay); IN6_MULTI_UNLOCK(); return (error); } /* * Join a multicast group; real entry point. * * Only preserves atomicity at inm level. * NOTE: imf argument cannot be const due to sys/tree.h limitations. * * If the MLD downcall fails, the group is not joined, and an error * code is returned. */ static int in6_joingroup_locked(struct ifnet *ifp, const struct in6_addr *mcaddr, /*const*/ struct in6_mfilter *imf, struct in6_multi **pinm, const int delay) { struct in6_multi_head inmh; struct in6_mfilter timf; struct in6_multi *inm; struct ifmultiaddr *ifma; int error; #ifdef KTR char ip6tbuf[INET6_ADDRSTRLEN]; #endif /* * Sanity: Check scope zone ID was set for ifp, if and * only if group is scoped to an interface. */ KASSERT(IN6_IS_ADDR_MULTICAST(mcaddr), ("%s: not a multicast address", __func__)); if (IN6_IS_ADDR_MC_LINKLOCAL(mcaddr) || IN6_IS_ADDR_MC_INTFACELOCAL(mcaddr)) { KASSERT(mcaddr->s6_addr16[1] != 0, ("%s: scope zone ID not set", __func__)); } IN6_MULTI_LOCK_ASSERT(); IN6_MULTI_LIST_UNLOCK_ASSERT(); CTR4(KTR_MLD, "%s: join %s on %p(%s))", __func__, ip6_sprintf(ip6tbuf, mcaddr), ifp, if_name(ifp)); error = 0; inm = NULL; /* * If no imf was specified (i.e. kernel consumer), * fake one up and assume it is an ASM join. */ if (imf == NULL) { im6f_init(&timf, MCAST_UNDEFINED, MCAST_EXCLUDE); imf = &timf; } error = in6_getmulti(ifp, mcaddr, &inm); if (error) { CTR1(KTR_MLD, "%s: in6_getmulti() failure", __func__); return (error); } IN6_MULTI_LIST_LOCK(); CTR1(KTR_MLD, "%s: merge inm state", __func__); error = in6m_merge(inm, imf); if (error) { CTR1(KTR_MLD, "%s: failed to merge inm state", __func__); goto out_in6m_release; } CTR1(KTR_MLD, "%s: doing mld downcall", __func__); error = mld_change_state(inm, delay); if (error) { CTR1(KTR_MLD, "%s: failed to update source", __func__); goto out_in6m_release; } out_in6m_release: SLIST_INIT(&inmh); if (error) { struct epoch_tracker et; CTR2(KTR_MLD, "%s: dropping ref on %p", __func__, inm); IF_ADDR_WLOCK(ifp); NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_protospec == inm) { ifma->ifma_protospec = NULL; break; } } in6m_disconnect_locked(&inmh, inm); in6m_rele_locked(&inmh, inm); NET_EPOCH_EXIT(et); IF_ADDR_WUNLOCK(ifp); } else { *pinm = inm; } IN6_MULTI_LIST_UNLOCK(); in6m_release_list_deferred(&inmh); return (error); } /* * Leave a multicast group; unlocked entry point. */ int in6_leavegroup(struct in6_multi *inm, /*const*/ struct in6_mfilter *imf) { int error; IN6_MULTI_LOCK(); error = in6_leavegroup_locked(inm, imf); IN6_MULTI_UNLOCK(); return (error); } /* * Leave a multicast group; real entry point. * All source filters will be expunged. * * Only preserves atomicity at inm level. * * Holding the write lock for the INP which contains imf * is highly advisable. We can't assert for it as imf does not * contain a back-pointer to the owning inp. * * Note: This is not the same as in6m_release(*) as this function also * makes a state change downcall into MLD. */ int in6_leavegroup_locked(struct in6_multi *inm, /*const*/ struct in6_mfilter *imf) { struct in6_multi_head inmh; struct in6_mfilter timf; struct ifnet *ifp; int error; #ifdef KTR char ip6tbuf[INET6_ADDRSTRLEN]; #endif error = 0; IN6_MULTI_LOCK_ASSERT(); CTR5(KTR_MLD, "%s: leave inm %p, %s/%s, imf %p", __func__, inm, ip6_sprintf(ip6tbuf, &inm->in6m_addr), (in6m_is_ifp_detached(inm) ? "null" : if_name(inm->in6m_ifp)), imf); /* * If no imf was specified (i.e. kernel consumer), * fake one up and assume it is an ASM join. */ if (imf == NULL) { im6f_init(&timf, MCAST_EXCLUDE, MCAST_UNDEFINED); imf = &timf; } /* * Begin state merge transaction at MLD layer. * * As this particular invocation should not cause any memory * to be allocated, and there is no opportunity to roll back * the transaction, it MUST NOT fail. */ ifp = inm->in6m_ifp; IN6_MULTI_LIST_LOCK(); CTR1(KTR_MLD, "%s: merge inm state", __func__); error = in6m_merge(inm, imf); KASSERT(error == 0, ("%s: failed to merge inm state", __func__)); CTR1(KTR_MLD, "%s: doing mld downcall", __func__); error = 0; if (ifp) error = mld_change_state(inm, 0); if (error) CTR1(KTR_MLD, "%s: failed mld downcall", __func__); CTR2(KTR_MLD, "%s: dropping ref on %p", __func__, inm); if (ifp) IF_ADDR_WLOCK(ifp); SLIST_INIT(&inmh); if (inm->in6m_refcount == 1) in6m_disconnect_locked(&inmh, inm); in6m_rele_locked(&inmh, inm); if (ifp) IF_ADDR_WUNLOCK(ifp); IN6_MULTI_LIST_UNLOCK(); in6m_release_list_deferred(&inmh); return (error); } /* * Block or unblock an ASM multicast source on an inpcb. * This implements the delta-based API described in RFC 3678. * * The delta-based API applies only to exclusive-mode memberships. * An MLD downcall will be performed. * * SMPng: NOTE: Must take Giant as a join may create a new ifma. * * Return 0 if successful, otherwise return an appropriate error code. */ static int in6p_block_unblock_source(struct inpcb *inp, struct sockopt *sopt) { struct group_source_req gsr; struct epoch_tracker et; sockunion_t *gsa, *ssa; struct ifnet *ifp; struct in6_mfilter *imf; struct ip6_moptions *imo; struct in6_msource *ims; struct in6_multi *inm; uint16_t fmode; int error, doblock; #ifdef KTR char ip6tbuf[INET6_ADDRSTRLEN]; #endif ifp = NULL; error = 0; doblock = 0; memset(&gsr, 0, sizeof(struct group_source_req)); gsa = (sockunion_t *)&gsr.gsr_group; ssa = (sockunion_t *)&gsr.gsr_source; switch (sopt->sopt_name) { case MCAST_BLOCK_SOURCE: case MCAST_UNBLOCK_SOURCE: error = sooptcopyin(sopt, &gsr, sizeof(struct group_source_req), sizeof(struct group_source_req)); if (error) return (error); if (gsa->sin6.sin6_family != AF_INET6 || gsa->sin6.sin6_len != sizeof(struct sockaddr_in6)) return (EINVAL); if (ssa->sin6.sin6_family != AF_INET6 || ssa->sin6.sin6_len != sizeof(struct sockaddr_in6)) return (EINVAL); /* * XXXGL: this function should use ifnet_byindex_ref, or * expand the epoch section all the way to where we put * the reference. */ NET_EPOCH_ENTER(et); ifp = ifnet_byindex(gsr.gsr_interface); NET_EPOCH_EXIT(et); if (ifp == NULL) return (EADDRNOTAVAIL); if (sopt->sopt_name == MCAST_BLOCK_SOURCE) doblock = 1; break; default: CTR2(KTR_MLD, "%s: unknown sopt_name %d", __func__, sopt->sopt_name); return (EOPNOTSUPP); break; } if (!IN6_IS_ADDR_MULTICAST(&gsa->sin6.sin6_addr)) return (EINVAL); (void)in6_setscope(&gsa->sin6.sin6_addr, ifp, NULL); /* * Check if we are actually a member of this group. */ imo = in6p_findmoptions(inp); imf = im6o_match_group(imo, ifp, &gsa->sa); if (imf == NULL) { error = EADDRNOTAVAIL; goto out_in6p_locked; } inm = imf->im6f_in6m; /* * Attempting to use the delta-based API on an * non exclusive-mode membership is an error. */ fmode = imf->im6f_st[0]; if (fmode != MCAST_EXCLUDE) { error = EINVAL; goto out_in6p_locked; } /* * Deal with error cases up-front: * Asked to block, but already blocked; or * Asked to unblock, but nothing to unblock. * If adding a new block entry, allocate it. */ ims = im6o_match_source(imf, &ssa->sa); if ((ims != NULL && doblock) || (ims == NULL && !doblock)) { CTR3(KTR_MLD, "%s: source %s %spresent", __func__, ip6_sprintf(ip6tbuf, &ssa->sin6.sin6_addr), doblock ? "" : "not "); error = EADDRNOTAVAIL; goto out_in6p_locked; } INP_WLOCK_ASSERT(inp); /* * Begin state merge transaction at socket layer. */ if (doblock) { CTR2(KTR_MLD, "%s: %s source", __func__, "block"); ims = im6f_graft(imf, fmode, &ssa->sin6); if (ims == NULL) error = ENOMEM; } else { CTR2(KTR_MLD, "%s: %s source", __func__, "allow"); error = im6f_prune(imf, &ssa->sin6); } if (error) { CTR1(KTR_MLD, "%s: merge imf state failed", __func__); goto out_im6f_rollback; } /* * Begin state merge transaction at MLD layer. */ IN6_MULTI_LIST_LOCK(); CTR1(KTR_MLD, "%s: merge inm state", __func__); error = in6m_merge(inm, imf); if (error) CTR1(KTR_MLD, "%s: failed to merge inm state", __func__); else { CTR1(KTR_MLD, "%s: doing mld downcall", __func__); error = mld_change_state(inm, 0); if (error) CTR1(KTR_MLD, "%s: failed mld downcall", __func__); } IN6_MULTI_LIST_UNLOCK(); out_im6f_rollback: if (error) im6f_rollback(imf); else im6f_commit(imf); im6f_reap(imf); out_in6p_locked: INP_WUNLOCK(inp); return (error); } /* * Given an inpcb, return its multicast options structure pointer. Accepts * an unlocked inpcb pointer, but will return it locked. May sleep. * * SMPng: NOTE: Potentially calls malloc(M_WAITOK) with Giant held. * SMPng: NOTE: Returns with the INP write lock held. */ static struct ip6_moptions * in6p_findmoptions(struct inpcb *inp) { struct ip6_moptions *imo; INP_WLOCK(inp); if (inp->in6p_moptions != NULL) return (inp->in6p_moptions); INP_WUNLOCK(inp); imo = malloc(sizeof(*imo), M_IP6MOPTS, M_WAITOK); imo->im6o_multicast_ifp = NULL; imo->im6o_multicast_hlim = V_ip6_defmcasthlim; imo->im6o_multicast_loop = in6_mcast_loop; STAILQ_INIT(&imo->im6o_head); INP_WLOCK(inp); if (inp->in6p_moptions != NULL) { free(imo, M_IP6MOPTS); return (inp->in6p_moptions); } inp->in6p_moptions = imo; return (imo); } /* * Discard the IPv6 multicast options (and source filters). * * SMPng: NOTE: assumes INP write lock is held. * * XXX can all be safely deferred to epoch_call * */ static void inp_gcmoptions(struct ip6_moptions *imo) { struct in6_mfilter *imf; struct in6_multi *inm; struct ifnet *ifp; while ((imf = ip6_mfilter_first(&imo->im6o_head)) != NULL) { ip6_mfilter_remove(&imo->im6o_head, imf); im6f_leave(imf); if ((inm = imf->im6f_in6m) != NULL) { if ((ifp = inm->in6m_ifp) != NULL) { CURVNET_SET(ifp->if_vnet); (void)in6_leavegroup(inm, imf); CURVNET_RESTORE(); } else { (void)in6_leavegroup(inm, imf); } } ip6_mfilter_free(imf); } free(imo, M_IP6MOPTS); } void ip6_freemoptions(struct ip6_moptions *imo) { if (imo == NULL) return; inp_gcmoptions(imo); } /* * Atomically get source filters on a socket for an IPv6 multicast group. * Called with INP lock held; returns with lock released. */ static int in6p_get_source_filters(struct inpcb *inp, struct sockopt *sopt) { struct epoch_tracker et; struct __msfilterreq msfr; sockunion_t *gsa; struct ifnet *ifp; struct ip6_moptions *imo; struct in6_mfilter *imf; struct ip6_msource *ims; struct in6_msource *lims; struct sockaddr_in6 *psin; struct sockaddr_storage *ptss; struct sockaddr_storage *tss; int error; size_t nsrcs, ncsrcs; INP_WLOCK_ASSERT(inp); imo = inp->in6p_moptions; KASSERT(imo != NULL, ("%s: null ip6_moptions", __func__)); INP_WUNLOCK(inp); error = sooptcopyin(sopt, &msfr, sizeof(struct __msfilterreq), sizeof(struct __msfilterreq)); if (error) return (error); if (msfr.msfr_group.ss_family != AF_INET6 || msfr.msfr_group.ss_len != sizeof(struct sockaddr_in6)) return (EINVAL); gsa = (sockunion_t *)&msfr.msfr_group; if (!IN6_IS_ADDR_MULTICAST(&gsa->sin6.sin6_addr)) return (EINVAL); /* * XXXGL: this function should use ifnet_byindex_ref, or expand the * epoch section all the way to where the interface is referenced. */ NET_EPOCH_ENTER(et); ifp = ifnet_byindex(msfr.msfr_ifindex); NET_EPOCH_EXIT(et); if (ifp == NULL) return (EADDRNOTAVAIL); (void)in6_setscope(&gsa->sin6.sin6_addr, ifp, NULL); INP_WLOCK(inp); /* * Lookup group on the socket. */ imf = im6o_match_group(imo, ifp, &gsa->sa); if (imf == NULL) { INP_WUNLOCK(inp); return (EADDRNOTAVAIL); } /* * Ignore memberships which are in limbo. */ if (imf->im6f_st[1] == MCAST_UNDEFINED) { INP_WUNLOCK(inp); return (EAGAIN); } msfr.msfr_fmode = imf->im6f_st[1]; /* * If the user specified a buffer, copy out the source filter * entries to userland gracefully. * We only copy out the number of entries which userland * has asked for, but we always tell userland how big the * buffer really needs to be. */ if (msfr.msfr_nsrcs > in6_mcast_maxsocksrc) msfr.msfr_nsrcs = in6_mcast_maxsocksrc; tss = NULL; if (msfr.msfr_srcs != NULL && msfr.msfr_nsrcs > 0) { tss = malloc(sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs, M_TEMP, M_NOWAIT | M_ZERO); if (tss == NULL) { INP_WUNLOCK(inp); return (ENOBUFS); } } /* * Count number of sources in-mode at t0. * If buffer space exists and remains, copy out source entries. */ nsrcs = msfr.msfr_nsrcs; ncsrcs = 0; ptss = tss; RB_FOREACH(ims, ip6_msource_tree, &imf->im6f_sources) { lims = (struct in6_msource *)ims; if (lims->im6sl_st[0] == MCAST_UNDEFINED || lims->im6sl_st[0] != imf->im6f_st[0]) continue; ++ncsrcs; if (tss != NULL && nsrcs > 0) { psin = (struct sockaddr_in6 *)ptss; psin->sin6_family = AF_INET6; psin->sin6_len = sizeof(struct sockaddr_in6); psin->sin6_addr = lims->im6s_addr; psin->sin6_port = 0; --nsrcs; ++ptss; } } INP_WUNLOCK(inp); if (tss != NULL) { error = copyout(tss, msfr.msfr_srcs, sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs); free(tss, M_TEMP); if (error) return (error); } msfr.msfr_nsrcs = ncsrcs; error = sooptcopyout(sopt, &msfr, sizeof(struct __msfilterreq)); return (error); } /* * Return the IP multicast options in response to user getsockopt(). */ int ip6_getmoptions(struct inpcb *inp, struct sockopt *sopt) { struct ip6_moptions *im6o; int error; u_int optval; INP_WLOCK(inp); im6o = inp->in6p_moptions; - /* - * If socket is neither of type SOCK_RAW or SOCK_DGRAM, - * or is a divert socket, reject it. - */ - if (inp->inp_socket->so_proto->pr_protocol == IPPROTO_DIVERT || - (inp->inp_socket->so_proto->pr_type != SOCK_RAW && - inp->inp_socket->so_proto->pr_type != SOCK_DGRAM)) { + /* If socket is neither of type SOCK_RAW or SOCK_DGRAM, reject it. */ + if (inp->inp_socket->so_proto->pr_type != SOCK_RAW && + inp->inp_socket->so_proto->pr_type != SOCK_DGRAM) { INP_WUNLOCK(inp); return (EOPNOTSUPP); } error = 0; switch (sopt->sopt_name) { case IPV6_MULTICAST_IF: if (im6o == NULL || im6o->im6o_multicast_ifp == NULL) { optval = 0; } else { optval = im6o->im6o_multicast_ifp->if_index; } INP_WUNLOCK(inp); error = sooptcopyout(sopt, &optval, sizeof(u_int)); break; case IPV6_MULTICAST_HOPS: if (im6o == NULL) optval = V_ip6_defmcasthlim; else optval = im6o->im6o_multicast_hlim; INP_WUNLOCK(inp); error = sooptcopyout(sopt, &optval, sizeof(u_int)); break; case IPV6_MULTICAST_LOOP: if (im6o == NULL) optval = in6_mcast_loop; /* XXX VIMAGE */ else optval = im6o->im6o_multicast_loop; INP_WUNLOCK(inp); error = sooptcopyout(sopt, &optval, sizeof(u_int)); break; case IPV6_MSFILTER: if (im6o == NULL) { error = EADDRNOTAVAIL; INP_WUNLOCK(inp); } else { error = in6p_get_source_filters(inp, sopt); } break; default: INP_WUNLOCK(inp); error = ENOPROTOOPT; break; } INP_UNLOCK_ASSERT(inp); return (error); } /* * Look up the ifnet to use for a multicast group membership, * given the address of an IPv6 group. * * This routine exists to support legacy IPv6 multicast applications. * * Use the socket's current FIB number for any required FIB lookup. Look up the * group address in the unicast FIB, and use its ifp; usually, this points to * the default next-hop. If the FIB lookup fails, return NULL. * * FUTURE: Support multiple forwarding tables for IPv6. * * Returns NULL if no ifp could be found. */ static struct ifnet * in6p_lookup_mcast_ifp(const struct inpcb *inp, const struct sockaddr_in6 *gsin6) { struct nhop_object *nh; struct in6_addr dst; uint32_t scopeid; uint32_t fibnum; KASSERT(gsin6->sin6_family == AF_INET6, ("%s: not AF_INET6 group", __func__)); in6_splitscope(&gsin6->sin6_addr, &dst, &scopeid); fibnum = inp->inp_inc.inc_fibnum; nh = fib6_lookup(fibnum, &dst, scopeid, 0, 0); return (nh ? nh->nh_ifp : NULL); } /* * Join an IPv6 multicast group, possibly with a source. * * FIXME: The KAME use of the unspecified address (::) * to join *all* multicast groups is currently unsupported. * * XXXGL: this function multiple times uses ifnet_byindex() without * proper protection - staying in epoch, or putting reference on ifnet. */ static int in6p_join_group(struct inpcb *inp, struct sockopt *sopt) { struct in6_multi_head inmh; struct group_source_req gsr; struct epoch_tracker et; sockunion_t *gsa, *ssa; struct ifnet *ifp; struct in6_mfilter *imf; struct ip6_moptions *imo; struct in6_multi *inm; struct in6_msource *lims; int error, is_new; SLIST_INIT(&inmh); ifp = NULL; lims = NULL; error = 0; memset(&gsr, 0, sizeof(struct group_source_req)); gsa = (sockunion_t *)&gsr.gsr_group; gsa->ss.ss_family = AF_UNSPEC; ssa = (sockunion_t *)&gsr.gsr_source; ssa->ss.ss_family = AF_UNSPEC; /* * Chew everything into struct group_source_req. * Overwrite the port field if present, as the sockaddr * being copied in may be matched with a binary comparison. * Ignore passed-in scope ID. */ switch (sopt->sopt_name) { case IPV6_JOIN_GROUP: { struct ipv6_mreq mreq; error = sooptcopyin(sopt, &mreq, sizeof(struct ipv6_mreq), sizeof(struct ipv6_mreq)); if (error) return (error); gsa->sin6.sin6_family = AF_INET6; gsa->sin6.sin6_len = sizeof(struct sockaddr_in6); gsa->sin6.sin6_addr = mreq.ipv6mr_multiaddr; if (mreq.ipv6mr_interface == 0) { ifp = in6p_lookup_mcast_ifp(inp, &gsa->sin6); } else { NET_EPOCH_ENTER(et); ifp = ifnet_byindex(mreq.ipv6mr_interface); NET_EPOCH_EXIT(et); if (ifp == NULL) return (EADDRNOTAVAIL); } CTR3(KTR_MLD, "%s: ipv6mr_interface = %d, ifp = %p", __func__, mreq.ipv6mr_interface, ifp); } break; case MCAST_JOIN_GROUP: case MCAST_JOIN_SOURCE_GROUP: if (sopt->sopt_name == MCAST_JOIN_GROUP) { error = sooptcopyin(sopt, &gsr, sizeof(struct group_req), sizeof(struct group_req)); } else if (sopt->sopt_name == MCAST_JOIN_SOURCE_GROUP) { error = sooptcopyin(sopt, &gsr, sizeof(struct group_source_req), sizeof(struct group_source_req)); } if (error) return (error); if (gsa->sin6.sin6_family != AF_INET6 || gsa->sin6.sin6_len != sizeof(struct sockaddr_in6)) return (EINVAL); if (sopt->sopt_name == MCAST_JOIN_SOURCE_GROUP) { if (ssa->sin6.sin6_family != AF_INET6 || ssa->sin6.sin6_len != sizeof(struct sockaddr_in6)) return (EINVAL); if (IN6_IS_ADDR_MULTICAST(&ssa->sin6.sin6_addr)) return (EINVAL); /* * TODO: Validate embedded scope ID in source * list entry against passed-in ifp, if and only * if source list filter entry is iface or node local. */ in6_clearscope(&ssa->sin6.sin6_addr); ssa->sin6.sin6_port = 0; ssa->sin6.sin6_scope_id = 0; } NET_EPOCH_ENTER(et); ifp = ifnet_byindex(gsr.gsr_interface); NET_EPOCH_EXIT(et); if (ifp == NULL) return (EADDRNOTAVAIL); break; default: CTR2(KTR_MLD, "%s: unknown sopt_name %d", __func__, sopt->sopt_name); return (EOPNOTSUPP); break; } if (!IN6_IS_ADDR_MULTICAST(&gsa->sin6.sin6_addr)) return (EINVAL); if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) return (EADDRNOTAVAIL); gsa->sin6.sin6_port = 0; gsa->sin6.sin6_scope_id = 0; /* * Always set the scope zone ID on memberships created from userland. * Use the passed-in ifp to do this. * XXX The in6_setscope() return value is meaningless. * XXX SCOPE6_LOCK() is taken by in6_setscope(). */ (void)in6_setscope(&gsa->sin6.sin6_addr, ifp, NULL); IN6_MULTI_LOCK(); /* * Find the membership in the membership list. */ imo = in6p_findmoptions(inp); imf = im6o_match_group(imo, ifp, &gsa->sa); if (imf == NULL) { is_new = 1; inm = NULL; if (ip6_mfilter_count(&imo->im6o_head) >= IPV6_MAX_MEMBERSHIPS) { error = ENOMEM; goto out_in6p_locked; } } else { is_new = 0; inm = imf->im6f_in6m; if (ssa->ss.ss_family != AF_UNSPEC) { /* * MCAST_JOIN_SOURCE_GROUP on an exclusive membership * is an error. On an existing inclusive membership, * it just adds the source to the filter list. */ if (imf->im6f_st[1] != MCAST_INCLUDE) { error = EINVAL; goto out_in6p_locked; } /* * Throw out duplicates. * * XXX FIXME: This makes a naive assumption that * even if entries exist for *ssa in this imf, * they will be rejected as dupes, even if they * are not valid in the current mode (in-mode). * * in6_msource is transactioned just as for anything * else in SSM -- but note naive use of in6m_graft() * below for allocating new filter entries. * * This is only an issue if someone mixes the * full-state SSM API with the delta-based API, * which is discouraged in the relevant RFCs. */ lims = im6o_match_source(imf, &ssa->sa); if (lims != NULL /*&& lims->im6sl_st[1] == MCAST_INCLUDE*/) { error = EADDRNOTAVAIL; goto out_in6p_locked; } } else { /* * MCAST_JOIN_GROUP alone, on any existing membership, * is rejected, to stop the same inpcb tying up * multiple refs to the in_multi. * On an existing inclusive membership, this is also * an error; if you want to change filter mode, * you must use the userland API setsourcefilter(). * XXX We don't reject this for imf in UNDEFINED * state at t1, because allocation of a filter * is atomic with allocation of a membership. */ error = EADDRINUSE; goto out_in6p_locked; } } /* * Begin state merge transaction at socket layer. */ INP_WLOCK_ASSERT(inp); /* * Graft new source into filter list for this inpcb's * membership of the group. The in6_multi may not have * been allocated yet if this is a new membership, however, * the in_mfilter slot will be allocated and must be initialized. * * Note: Grafting of exclusive mode filters doesn't happen * in this path. * XXX: Should check for non-NULL lims (node exists but may * not be in-mode) for interop with full-state API. */ if (ssa->ss.ss_family != AF_UNSPEC) { /* Membership starts in IN mode */ if (is_new) { CTR1(KTR_MLD, "%s: new join w/source", __func__); imf = ip6_mfilter_alloc(M_NOWAIT, MCAST_UNDEFINED, MCAST_INCLUDE); if (imf == NULL) { error = ENOMEM; goto out_in6p_locked; } } else { CTR2(KTR_MLD, "%s: %s source", __func__, "allow"); } lims = im6f_graft(imf, MCAST_INCLUDE, &ssa->sin6); if (lims == NULL) { CTR1(KTR_MLD, "%s: merge imf state failed", __func__); error = ENOMEM; goto out_in6p_locked; } } else { /* No address specified; Membership starts in EX mode */ if (is_new) { CTR1(KTR_MLD, "%s: new join w/o source", __func__); imf = ip6_mfilter_alloc(M_NOWAIT, MCAST_UNDEFINED, MCAST_EXCLUDE); if (imf == NULL) { error = ENOMEM; goto out_in6p_locked; } } } /* * Begin state merge transaction at MLD layer. */ if (is_new) { in_pcbref(inp); INP_WUNLOCK(inp); error = in6_joingroup_locked(ifp, &gsa->sin6.sin6_addr, imf, &imf->im6f_in6m, 0); INP_WLOCK(inp); if (in_pcbrele_wlocked(inp)) { error = ENXIO; goto out_in6p_unlocked; } if (error) { goto out_in6p_locked; } /* * NOTE: Refcount from in6_joingroup_locked() * is protecting membership. */ ip6_mfilter_insert(&imo->im6o_head, imf); } else { CTR1(KTR_MLD, "%s: merge inm state", __func__); IN6_MULTI_LIST_LOCK(); error = in6m_merge(inm, imf); if (error) { CTR1(KTR_MLD, "%s: failed to merge inm state", __func__); IN6_MULTI_LIST_UNLOCK(); im6f_rollback(imf); im6f_reap(imf); goto out_in6p_locked; } CTR1(KTR_MLD, "%s: doing mld downcall", __func__); error = mld_change_state(inm, 0); IN6_MULTI_LIST_UNLOCK(); if (error) { CTR1(KTR_MLD, "%s: failed mld downcall", __func__); im6f_rollback(imf); im6f_reap(imf); goto out_in6p_locked; } } im6f_commit(imf); imf = NULL; out_in6p_locked: INP_WUNLOCK(inp); out_in6p_unlocked: IN6_MULTI_UNLOCK(); if (is_new && imf) { if (imf->im6f_in6m != NULL) { struct in6_multi_head inmh; SLIST_INIT(&inmh); SLIST_INSERT_HEAD(&inmh, imf->im6f_in6m, in6m_defer); in6m_release_list_deferred(&inmh); } ip6_mfilter_free(imf); } return (error); } /* * Leave an IPv6 multicast group on an inpcb, possibly with a source. */ static int in6p_leave_group(struct inpcb *inp, struct sockopt *sopt) { struct ipv6_mreq mreq; struct group_source_req gsr; struct epoch_tracker et; sockunion_t *gsa, *ssa; struct ifnet *ifp; struct in6_mfilter *imf; struct ip6_moptions *imo; struct in6_msource *ims; struct in6_multi *inm; uint32_t ifindex; int error; bool is_final; #ifdef KTR char ip6tbuf[INET6_ADDRSTRLEN]; #endif ifp = NULL; ifindex = 0; error = 0; is_final = true; memset(&gsr, 0, sizeof(struct group_source_req)); gsa = (sockunion_t *)&gsr.gsr_group; gsa->ss.ss_family = AF_UNSPEC; ssa = (sockunion_t *)&gsr.gsr_source; ssa->ss.ss_family = AF_UNSPEC; /* * Chew everything passed in up into a struct group_source_req * as that is easier to process. * Note: Any embedded scope ID in the multicast group passed * in by userland is ignored, the interface index is the recommended * mechanism to specify an interface; see below. */ switch (sopt->sopt_name) { case IPV6_LEAVE_GROUP: error = sooptcopyin(sopt, &mreq, sizeof(struct ipv6_mreq), sizeof(struct ipv6_mreq)); if (error) return (error); gsa->sin6.sin6_family = AF_INET6; gsa->sin6.sin6_len = sizeof(struct sockaddr_in6); gsa->sin6.sin6_addr = mreq.ipv6mr_multiaddr; gsa->sin6.sin6_port = 0; gsa->sin6.sin6_scope_id = 0; ifindex = mreq.ipv6mr_interface; break; case MCAST_LEAVE_GROUP: case MCAST_LEAVE_SOURCE_GROUP: if (sopt->sopt_name == MCAST_LEAVE_GROUP) { error = sooptcopyin(sopt, &gsr, sizeof(struct group_req), sizeof(struct group_req)); } else if (sopt->sopt_name == MCAST_LEAVE_SOURCE_GROUP) { error = sooptcopyin(sopt, &gsr, sizeof(struct group_source_req), sizeof(struct group_source_req)); } if (error) return (error); if (gsa->sin6.sin6_family != AF_INET6 || gsa->sin6.sin6_len != sizeof(struct sockaddr_in6)) return (EINVAL); if (sopt->sopt_name == MCAST_LEAVE_SOURCE_GROUP) { if (ssa->sin6.sin6_family != AF_INET6 || ssa->sin6.sin6_len != sizeof(struct sockaddr_in6)) return (EINVAL); if (IN6_IS_ADDR_MULTICAST(&ssa->sin6.sin6_addr)) return (EINVAL); /* * TODO: Validate embedded scope ID in source * list entry against passed-in ifp, if and only * if source list filter entry is iface or node local. */ in6_clearscope(&ssa->sin6.sin6_addr); } gsa->sin6.sin6_port = 0; gsa->sin6.sin6_scope_id = 0; ifindex = gsr.gsr_interface; break; default: CTR2(KTR_MLD, "%s: unknown sopt_name %d", __func__, sopt->sopt_name); return (EOPNOTSUPP); break; } if (!IN6_IS_ADDR_MULTICAST(&gsa->sin6.sin6_addr)) return (EINVAL); /* * Validate interface index if provided. If no interface index * was provided separately, attempt to look the membership up * from the default scope as a last resort to disambiguate * the membership we are being asked to leave. * XXX SCOPE6 lock potentially taken here. */ if (ifindex != 0) { NET_EPOCH_ENTER(et); ifp = ifnet_byindex(ifindex); NET_EPOCH_EXIT(et); /* XXXGL: unsafe ifp */ if (ifp == NULL) return (EADDRNOTAVAIL); (void)in6_setscope(&gsa->sin6.sin6_addr, ifp, NULL); } else { error = sa6_embedscope(&gsa->sin6, V_ip6_use_defzone); if (error) return (EADDRNOTAVAIL); /* * Some badly behaved applications don't pass an ifindex * or a scope ID, which is an API violation. In this case, * perform a lookup as per a v6 join. * * XXX For now, stomp on zone ID for the corner case. * This is not the 'KAME way', but we need to see the ifp * directly until such time as this implementation is * refactored, assuming the scope IDs are the way to go. */ ifindex = ntohs(gsa->sin6.sin6_addr.s6_addr16[1]); if (ifindex == 0) { CTR2(KTR_MLD, "%s: warning: no ifindex, looking up " "ifp for group %s.", __func__, ip6_sprintf(ip6tbuf, &gsa->sin6.sin6_addr)); ifp = in6p_lookup_mcast_ifp(inp, &gsa->sin6); } else { NET_EPOCH_ENTER(et); ifp = ifnet_byindex(ifindex); NET_EPOCH_EXIT(et); /* XXXGL: unsafe ifp */ } if (ifp == NULL) return (EADDRNOTAVAIL); } CTR2(KTR_MLD, "%s: ifp = %p", __func__, ifp); KASSERT(ifp != NULL, ("%s: ifp did not resolve", __func__)); IN6_MULTI_LOCK(); /* * Find the membership in the membership list. */ imo = in6p_findmoptions(inp); imf = im6o_match_group(imo, ifp, &gsa->sa); if (imf == NULL) { error = EADDRNOTAVAIL; goto out_in6p_locked; } inm = imf->im6f_in6m; if (ssa->ss.ss_family != AF_UNSPEC) is_final = false; /* * Begin state merge transaction at socket layer. */ INP_WLOCK_ASSERT(inp); /* * If we were instructed only to leave a given source, do so. * MCAST_LEAVE_SOURCE_GROUP is only valid for inclusive memberships. */ if (is_final) { ip6_mfilter_remove(&imo->im6o_head, imf); im6f_leave(imf); /* * Give up the multicast address record to which * the membership points. */ (void)in6_leavegroup_locked(inm, imf); } else { if (imf->im6f_st[0] == MCAST_EXCLUDE) { error = EADDRNOTAVAIL; goto out_in6p_locked; } ims = im6o_match_source(imf, &ssa->sa); if (ims == NULL) { CTR3(KTR_MLD, "%s: source %p %spresent", __func__, ip6_sprintf(ip6tbuf, &ssa->sin6.sin6_addr), "not "); error = EADDRNOTAVAIL; goto out_in6p_locked; } CTR2(KTR_MLD, "%s: %s source", __func__, "block"); error = im6f_prune(imf, &ssa->sin6); if (error) { CTR1(KTR_MLD, "%s: merge imf state failed", __func__); goto out_in6p_locked; } } /* * Begin state merge transaction at MLD layer. */ if (!is_final) { CTR1(KTR_MLD, "%s: merge inm state", __func__); IN6_MULTI_LIST_LOCK(); error = in6m_merge(inm, imf); if (error) { CTR1(KTR_MLD, "%s: failed to merge inm state", __func__); IN6_MULTI_LIST_UNLOCK(); im6f_rollback(imf); im6f_reap(imf); goto out_in6p_locked; } CTR1(KTR_MLD, "%s: doing mld downcall", __func__); error = mld_change_state(inm, 0); IN6_MULTI_LIST_UNLOCK(); if (error) { CTR1(KTR_MLD, "%s: failed mld downcall", __func__); im6f_rollback(imf); im6f_reap(imf); goto out_in6p_locked; } } im6f_commit(imf); im6f_reap(imf); out_in6p_locked: INP_WUNLOCK(inp); if (is_final && imf) ip6_mfilter_free(imf); IN6_MULTI_UNLOCK(); return (error); } /* * Select the interface for transmitting IPv6 multicast datagrams. * * Either an instance of struct in6_addr or an instance of struct ipv6_mreqn * may be passed to this socket option. An address of in6addr_any or an * interface index of 0 is used to remove a previous selection. * When no interface is selected, one is chosen for every send. */ static int in6p_set_multicast_if(struct inpcb *inp, struct sockopt *sopt) { struct epoch_tracker et; struct ifnet *ifp; struct ip6_moptions *imo; u_int ifindex; int error; if (sopt->sopt_valsize != sizeof(u_int)) return (EINVAL); error = sooptcopyin(sopt, &ifindex, sizeof(u_int), sizeof(u_int)); if (error) return (error); NET_EPOCH_ENTER(et); if (ifindex == 0) ifp = NULL; else { ifp = ifnet_byindex(ifindex); if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) { NET_EPOCH_EXIT(et); return (EADDRNOTAVAIL); } } NET_EPOCH_EXIT(et); /* XXXGL: unsafe ifp */ imo = in6p_findmoptions(inp); imo->im6o_multicast_ifp = ifp; INP_WUNLOCK(inp); return (0); } /* * Atomically set source filters on a socket for an IPv6 multicast group. * * XXXGL: unsafely exits epoch with ifnet pointer */ static int in6p_set_source_filters(struct inpcb *inp, struct sockopt *sopt) { struct __msfilterreq msfr; struct epoch_tracker et; sockunion_t *gsa; struct ifnet *ifp; struct in6_mfilter *imf; struct ip6_moptions *imo; struct in6_multi *inm; int error; error = sooptcopyin(sopt, &msfr, sizeof(struct __msfilterreq), sizeof(struct __msfilterreq)); if (error) return (error); if (msfr.msfr_nsrcs > in6_mcast_maxsocksrc) return (ENOBUFS); if (msfr.msfr_fmode != MCAST_EXCLUDE && msfr.msfr_fmode != MCAST_INCLUDE) return (EINVAL); if (msfr.msfr_group.ss_family != AF_INET6 || msfr.msfr_group.ss_len != sizeof(struct sockaddr_in6)) return (EINVAL); gsa = (sockunion_t *)&msfr.msfr_group; if (!IN6_IS_ADDR_MULTICAST(&gsa->sin6.sin6_addr)) return (EINVAL); gsa->sin6.sin6_port = 0; /* ignore port */ NET_EPOCH_ENTER(et); ifp = ifnet_byindex(msfr.msfr_ifindex); NET_EPOCH_EXIT(et); if (ifp == NULL) return (EADDRNOTAVAIL); (void)in6_setscope(&gsa->sin6.sin6_addr, ifp, NULL); /* * Take the INP write lock. * Check if this socket is a member of this group. */ imo = in6p_findmoptions(inp); imf = im6o_match_group(imo, ifp, &gsa->sa); if (imf == NULL) { error = EADDRNOTAVAIL; goto out_in6p_locked; } inm = imf->im6f_in6m; /* * Begin state merge transaction at socket layer. */ INP_WLOCK_ASSERT(inp); imf->im6f_st[1] = msfr.msfr_fmode; /* * Apply any new source filters, if present. * Make a copy of the user-space source vector so * that we may copy them with a single copyin. This * allows us to deal with page faults up-front. */ if (msfr.msfr_nsrcs > 0) { struct in6_msource *lims; struct sockaddr_in6 *psin; struct sockaddr_storage *kss, *pkss; int i; INP_WUNLOCK(inp); CTR2(KTR_MLD, "%s: loading %lu source list entries", __func__, (unsigned long)msfr.msfr_nsrcs); kss = malloc(sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs, M_TEMP, M_WAITOK); error = copyin(msfr.msfr_srcs, kss, sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs); if (error) { free(kss, M_TEMP); return (error); } INP_WLOCK(inp); /* * Mark all source filters as UNDEFINED at t1. * Restore new group filter mode, as im6f_leave() * will set it to INCLUDE. */ im6f_leave(imf); imf->im6f_st[1] = msfr.msfr_fmode; /* * Update socket layer filters at t1, lazy-allocating * new entries. This saves a bunch of memory at the * cost of one RB_FIND() per source entry; duplicate * entries in the msfr_nsrcs vector are ignored. * If we encounter an error, rollback transaction. * * XXX This too could be replaced with a set-symmetric * difference like loop to avoid walking from root * every time, as the key space is common. */ for (i = 0, pkss = kss; i < msfr.msfr_nsrcs; i++, pkss++) { psin = (struct sockaddr_in6 *)pkss; if (psin->sin6_family != AF_INET6) { error = EAFNOSUPPORT; break; } if (psin->sin6_len != sizeof(struct sockaddr_in6)) { error = EINVAL; break; } if (IN6_IS_ADDR_MULTICAST(&psin->sin6_addr)) { error = EINVAL; break; } /* * TODO: Validate embedded scope ID in source * list entry against passed-in ifp, if and only * if source list filter entry is iface or node local. */ in6_clearscope(&psin->sin6_addr); error = im6f_get_source(imf, psin, &lims); if (error) break; lims->im6sl_st[1] = imf->im6f_st[1]; } free(kss, M_TEMP); } if (error) goto out_im6f_rollback; INP_WLOCK_ASSERT(inp); IN6_MULTI_LIST_LOCK(); /* * Begin state merge transaction at MLD layer. */ CTR1(KTR_MLD, "%s: merge inm state", __func__); error = in6m_merge(inm, imf); if (error) CTR1(KTR_MLD, "%s: failed to merge inm state", __func__); else { CTR1(KTR_MLD, "%s: doing mld downcall", __func__); error = mld_change_state(inm, 0); if (error) CTR1(KTR_MLD, "%s: failed mld downcall", __func__); } IN6_MULTI_LIST_UNLOCK(); out_im6f_rollback: if (error) im6f_rollback(imf); else im6f_commit(imf); im6f_reap(imf); out_in6p_locked: INP_WUNLOCK(inp); return (error); } /* * Set the IP multicast options in response to user setsockopt(). * * Many of the socket options handled in this function duplicate the * functionality of socket options in the regular unicast API. However, * it is not possible to merge the duplicate code, because the idempotence * of the IPv6 multicast part of the BSD Sockets API must be preserved; * the effects of these options must be treated as separate and distinct. * * SMPng: XXX: Unlocked read of inp_socket believed OK. */ int ip6_setmoptions(struct inpcb *inp, struct sockopt *sopt) { struct ip6_moptions *im6o; int error; error = 0; - /* - * If socket is neither of type SOCK_RAW or SOCK_DGRAM, - * or is a divert socket, reject it. - */ - if (inp->inp_socket->so_proto->pr_protocol == IPPROTO_DIVERT || - (inp->inp_socket->so_proto->pr_type != SOCK_RAW && - inp->inp_socket->so_proto->pr_type != SOCK_DGRAM)) + /* If socket is neither of type SOCK_RAW or SOCK_DGRAM, reject it. */ + if (inp->inp_socket->so_proto->pr_type != SOCK_RAW && + inp->inp_socket->so_proto->pr_type != SOCK_DGRAM) return (EOPNOTSUPP); switch (sopt->sopt_name) { case IPV6_MULTICAST_IF: error = in6p_set_multicast_if(inp, sopt); break; case IPV6_MULTICAST_HOPS: { int hlim; if (sopt->sopt_valsize != sizeof(int)) { error = EINVAL; break; } error = sooptcopyin(sopt, &hlim, sizeof(hlim), sizeof(int)); if (error) break; if (hlim < -1 || hlim > 255) { error = EINVAL; break; } else if (hlim == -1) { hlim = V_ip6_defmcasthlim; } im6o = in6p_findmoptions(inp); im6o->im6o_multicast_hlim = hlim; INP_WUNLOCK(inp); break; } case IPV6_MULTICAST_LOOP: { u_int loop; /* * Set the loopback flag for outgoing multicast packets. * Must be zero or one. */ if (sopt->sopt_valsize != sizeof(u_int)) { error = EINVAL; break; } error = sooptcopyin(sopt, &loop, sizeof(u_int), sizeof(u_int)); if (error) break; if (loop > 1) { error = EINVAL; break; } im6o = in6p_findmoptions(inp); im6o->im6o_multicast_loop = loop; INP_WUNLOCK(inp); break; } case IPV6_JOIN_GROUP: case MCAST_JOIN_GROUP: case MCAST_JOIN_SOURCE_GROUP: error = in6p_join_group(inp, sopt); break; case IPV6_LEAVE_GROUP: case MCAST_LEAVE_GROUP: case MCAST_LEAVE_SOURCE_GROUP: error = in6p_leave_group(inp, sopt); break; case MCAST_BLOCK_SOURCE: case MCAST_UNBLOCK_SOURCE: error = in6p_block_unblock_source(inp, sopt); break; case IPV6_MSFILTER: error = in6p_set_source_filters(inp, sopt); break; default: error = EOPNOTSUPP; break; } INP_UNLOCK_ASSERT(inp); return (error); } /* * Expose MLD's multicast filter mode and source list(s) to userland, * keyed by (ifindex, group). * The filter mode is written out as a uint32_t, followed by * 0..n of struct in6_addr. * For use by ifmcstat(8). * SMPng: NOTE: unlocked read of ifindex space. */ static int sysctl_ip6_mcast_filters(SYSCTL_HANDLER_ARGS) { struct in6_addr mcaddr; struct in6_addr src; struct epoch_tracker et; struct ifnet *ifp; struct ifmultiaddr *ifma; struct in6_multi *inm; struct ip6_msource *ims; int *name; int retval; u_int namelen; uint32_t fmode, ifindex; #ifdef KTR char ip6tbuf[INET6_ADDRSTRLEN]; #endif name = (int *)arg1; namelen = arg2; if (req->newptr != NULL) return (EPERM); /* int: ifindex + 4 * 32 bits of IPv6 address */ if (namelen != 5) return (EINVAL); memcpy(&mcaddr, &name[1], sizeof(struct in6_addr)); if (!IN6_IS_ADDR_MULTICAST(&mcaddr)) { CTR2(KTR_MLD, "%s: group %s is not multicast", __func__, ip6_sprintf(ip6tbuf, &mcaddr)); return (EINVAL); } ifindex = name[0]; NET_EPOCH_ENTER(et); ifp = ifnet_byindex(ifindex); if (ifp == NULL) { NET_EPOCH_EXIT(et); CTR2(KTR_MLD, "%s: no ifp for ifindex %u", __func__, ifindex); return (ENOENT); } /* * Internal MLD lookups require that scope/zone ID is set. */ (void)in6_setscope(&mcaddr, ifp, NULL); retval = sysctl_wire_old_buffer(req, sizeof(uint32_t) + (in6_mcast_maxgrpsrc * sizeof(struct in6_addr))); if (retval) { NET_EPOCH_EXIT(et); return (retval); } IN6_MULTI_LOCK(); IN6_MULTI_LIST_LOCK(); CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { inm = in6m_ifmultiaddr_get_inm(ifma); if (inm == NULL) continue; if (!IN6_ARE_ADDR_EQUAL(&inm->in6m_addr, &mcaddr)) continue; fmode = inm->in6m_st[1].iss_fmode; retval = SYSCTL_OUT(req, &fmode, sizeof(uint32_t)); if (retval != 0) break; RB_FOREACH(ims, ip6_msource_tree, &inm->in6m_srcs) { CTR2(KTR_MLD, "%s: visit node %p", __func__, ims); /* * Only copy-out sources which are in-mode. */ if (fmode != im6s_get_mode(inm, ims, 1)) { CTR1(KTR_MLD, "%s: skip non-in-mode", __func__); continue; } src = ims->im6s_addr; retval = SYSCTL_OUT(req, &src, sizeof(struct in6_addr)); if (retval != 0) break; } } IN6_MULTI_LIST_UNLOCK(); IN6_MULTI_UNLOCK(); NET_EPOCH_EXIT(et); return (retval); } #ifdef KTR static const char *in6m_modestrs[] = { "un", "in", "ex" }; static const char * in6m_mode_str(const int mode) { if (mode >= MCAST_UNDEFINED && mode <= MCAST_EXCLUDE) return (in6m_modestrs[mode]); return ("??"); } static const char *in6m_statestrs[] = { "not-member", "silent", "idle", "lazy", "sleeping", "awakening", "query-pending", "sg-query-pending", "leaving" }; static const char * in6m_state_str(const int state) { if (state >= MLD_NOT_MEMBER && state <= MLD_LEAVING_MEMBER) return (in6m_statestrs[state]); return ("??"); } /* * Dump an in6_multi structure to the console. */ void in6m_print(const struct in6_multi *inm) { int t; char ip6tbuf[INET6_ADDRSTRLEN]; if ((ktr_mask & KTR_MLD) == 0) return; printf("%s: --- begin in6m %p ---\n", __func__, inm); printf("addr %s ifp %p(%s) ifma %p\n", ip6_sprintf(ip6tbuf, &inm->in6m_addr), inm->in6m_ifp, if_name(inm->in6m_ifp), inm->in6m_ifma); printf("timer %u state %s refcount %u scq.len %u\n", inm->in6m_timer, in6m_state_str(inm->in6m_state), inm->in6m_refcount, mbufq_len(&inm->in6m_scq)); printf("mli %p nsrc %lu sctimer %u scrv %u\n", inm->in6m_mli, inm->in6m_nsrc, inm->in6m_sctimer, inm->in6m_scrv); for (t = 0; t < 2; t++) { printf("t%d: fmode %s asm %u ex %u in %u rec %u\n", t, in6m_mode_str(inm->in6m_st[t].iss_fmode), inm->in6m_st[t].iss_asm, inm->in6m_st[t].iss_ex, inm->in6m_st[t].iss_in, inm->in6m_st[t].iss_rec); } printf("%s: --- end in6m %p ---\n", __func__, inm); } #else /* !KTR */ void in6m_print(const struct in6_multi *inm) { } #endif /* KTR */ diff --git a/sys/sys/socket.h b/sys/sys/socket.h index 3ec0d3b1d06d..f81aba8f972d 100644 --- a/sys/sys/socket.h +++ b/sys/sys/socket.h @@ -1,753 +1,755 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1985, 1986, 1988, 1993, 1994 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)socket.h 8.4 (Berkeley) 2/21/94 * $FreeBSD$ */ #ifndef _SYS_SOCKET_H_ #define _SYS_SOCKET_H_ #include #include #include #include /* * Definitions related to sockets: types, address families, options. */ /* * Data types. */ #if __BSD_VISIBLE #ifndef _GID_T_DECLARED typedef __gid_t gid_t; #define _GID_T_DECLARED #endif #ifndef _OFF_T_DECLARED typedef __off_t off_t; #define _OFF_T_DECLARED #endif #ifndef _PID_T_DECLARED typedef __pid_t pid_t; #define _PID_T_DECLARED #endif #endif #ifndef _SA_FAMILY_T_DECLARED typedef __sa_family_t sa_family_t; #define _SA_FAMILY_T_DECLARED #endif #ifndef _SOCKLEN_T_DECLARED typedef __socklen_t socklen_t; #define _SOCKLEN_T_DECLARED #endif #ifndef _SSIZE_T_DECLARED typedef __ssize_t ssize_t; #define _SSIZE_T_DECLARED #endif #if __BSD_VISIBLE #ifndef _UID_T_DECLARED typedef __uid_t uid_t; #define _UID_T_DECLARED #endif #endif #ifndef _UINT32_T_DECLARED typedef __uint32_t uint32_t; #define _UINT32_T_DECLARED #endif #ifndef _UINTPTR_T_DECLARED typedef __uintptr_t uintptr_t; #define _UINTPTR_T_DECLARED #endif /* * Types */ #define SOCK_STREAM 1 /* stream socket */ #define SOCK_DGRAM 2 /* datagram socket */ #define SOCK_RAW 3 /* raw-protocol interface */ #if __BSD_VISIBLE #define SOCK_RDM 4 /* reliably-delivered message */ #endif #define SOCK_SEQPACKET 5 /* sequenced packet stream */ #if __BSD_VISIBLE /* * Creation flags, OR'ed into socket() and socketpair() type argument. */ #define SOCK_CLOEXEC 0x10000000 #define SOCK_NONBLOCK 0x20000000 #ifdef _KERNEL /* * Flags for accept1(), kern_accept4() and solisten_dequeue, in addition * to SOCK_CLOEXEC and SOCK_NONBLOCK. */ #define ACCEPT4_INHERIT 0x1 #define ACCEPT4_COMPAT 0x2 #endif /* _KERNEL */ #endif /* __BSD_VISIBLE */ /* * Option flags per-socket. */ #define SO_DEBUG 0x00000001 /* turn on debugging info recording */ #define SO_ACCEPTCONN 0x00000002 /* socket has had listen() */ #define SO_REUSEADDR 0x00000004 /* allow local address reuse */ #define SO_KEEPALIVE 0x00000008 /* keep connections alive */ #define SO_DONTROUTE 0x00000010 /* just use interface addresses */ #define SO_BROADCAST 0x00000020 /* permit sending of broadcast msgs */ #if __BSD_VISIBLE #define SO_USELOOPBACK 0x00000040 /* bypass hardware when possible */ #endif #define SO_LINGER 0x00000080 /* linger on close if data present */ #define SO_OOBINLINE 0x00000100 /* leave received OOB data in line */ #if __BSD_VISIBLE #define SO_REUSEPORT 0x00000200 /* allow local address & port reuse */ #define SO_TIMESTAMP 0x00000400 /* timestamp received dgram traffic */ #define SO_NOSIGPIPE 0x00000800 /* no SIGPIPE from EPIPE */ #define SO_ACCEPTFILTER 0x00001000 /* there is an accept filter */ #define SO_BINTIME 0x00002000 /* timestamp received dgram traffic */ #endif #define SO_NO_OFFLOAD 0x00004000 /* socket cannot be offloaded */ #define SO_NO_DDP 0x00008000 /* disable direct data placement */ #define SO_REUSEPORT_LB 0x00010000 /* reuse with load balancing */ #define SO_RERROR 0x00020000 /* keep track of receive errors */ /* * Additional options, not kept in so_options. */ #define SO_SNDBUF 0x1001 /* send buffer size */ #define SO_RCVBUF 0x1002 /* receive buffer size */ #define SO_SNDLOWAT 0x1003 /* send low-water mark */ #define SO_RCVLOWAT 0x1004 /* receive low-water mark */ #define SO_SNDTIMEO 0x1005 /* send timeout */ #define SO_RCVTIMEO 0x1006 /* receive timeout */ #define SO_ERROR 0x1007 /* get error status and clear */ #define SO_TYPE 0x1008 /* get socket type */ #if __BSD_VISIBLE #define SO_LABEL 0x1009 /* socket's MAC label */ #define SO_PEERLABEL 0x1010 /* socket's peer's MAC label */ #define SO_LISTENQLIMIT 0x1011 /* socket's backlog limit */ #define SO_LISTENQLEN 0x1012 /* socket's complete queue length */ #define SO_LISTENINCQLEN 0x1013 /* socket's incomplete queue length */ #define SO_SETFIB 0x1014 /* use this FIB to route */ #define SO_USER_COOKIE 0x1015 /* user cookie (dummynet etc.) */ #define SO_PROTOCOL 0x1016 /* get socket protocol (Linux name) */ #define SO_PROTOTYPE SO_PROTOCOL /* alias for SO_PROTOCOL (SunOS name) */ #define SO_TS_CLOCK 0x1017 /* clock type used for SO_TIMESTAMP */ #define SO_MAX_PACING_RATE 0x1018 /* socket's max TX pacing rate (Linux name) */ #define SO_DOMAIN 0x1019 /* get socket domain */ #endif #if __BSD_VISIBLE #define SO_TS_REALTIME_MICRO 0 /* microsecond resolution, realtime */ #define SO_TS_BINTIME 1 /* sub-nanosecond resolution, realtime */ #define SO_TS_REALTIME 2 /* nanosecond resolution, realtime */ #define SO_TS_MONOTONIC 3 /* nanosecond resolution, monotonic */ #define SO_TS_DEFAULT SO_TS_REALTIME_MICRO #define SO_TS_CLOCK_MAX SO_TS_MONOTONIC #endif /* * Space reserved for new socket options added by third-party vendors. * This range applies to all socket option levels. New socket options * in FreeBSD should always use an option value less than SO_VENDOR. */ #if __BSD_VISIBLE #define SO_VENDOR 0x80000000 #endif /* * Structure used for manipulating linger option. */ struct linger { int l_onoff; /* option on/off */ int l_linger; /* linger time */ }; #if __BSD_VISIBLE struct accept_filter_arg { char af_name[16]; char af_arg[256-16]; }; #endif /* * Level number for (get/set)sockopt() to apply to socket itself. */ #define SOL_SOCKET 0xffff /* options for socket level */ /* * Address families. */ #define AF_UNSPEC 0 /* unspecified */ #if __BSD_VISIBLE #define AF_LOCAL AF_UNIX /* local to host (pipes, portals) */ #endif #define AF_UNIX 1 /* standardized name for AF_LOCAL */ #define AF_INET 2 /* internetwork: UDP, TCP, etc. */ #if __BSD_VISIBLE #define AF_IMPLINK 3 /* arpanet imp addresses */ #define AF_PUP 4 /* pup protocols: e.g. BSP */ #define AF_CHAOS 5 /* mit CHAOS protocols */ #define AF_NETBIOS 6 /* SMB protocols */ #define AF_ISO 7 /* ISO protocols */ #define AF_OSI AF_ISO #define AF_ECMA 8 /* European computer manufacturers */ #define AF_DATAKIT 9 /* datakit protocols */ #define AF_CCITT 10 /* CCITT protocols, X.25 etc */ #define AF_SNA 11 /* IBM SNA */ #define AF_DECnet 12 /* DECnet */ #define AF_DLI 13 /* DEC Direct data link interface */ #define AF_LAT 14 /* LAT */ #define AF_HYLINK 15 /* NSC Hyperchannel */ #define AF_APPLETALK 16 /* Apple Talk */ #define AF_ROUTE 17 /* Internal Routing Protocol */ #define AF_LINK 18 /* Link layer interface */ #define pseudo_AF_XTP 19 /* eXpress Transfer Protocol (no AF) */ #define AF_COIP 20 /* connection-oriented IP, aka ST II */ #define AF_CNT 21 /* Computer Network Technology */ #define pseudo_AF_RTIP 22 /* Help Identify RTIP packets */ #define AF_IPX 23 /* Novell Internet Protocol */ #define AF_SIP 24 /* Simple Internet Protocol */ #define pseudo_AF_PIP 25 /* Help Identify PIP packets */ #define AF_ISDN 26 /* Integrated Services Digital Network*/ #define AF_E164 AF_ISDN /* CCITT E.164 recommendation */ #define pseudo_AF_KEY 27 /* Internal key-management function */ #endif #define AF_INET6 28 /* IPv6 */ #if __BSD_VISIBLE #define AF_NATM 29 /* native ATM access */ #define AF_ATM 30 /* ATM */ #define pseudo_AF_HDRCMPLT 31 /* Used by BPF to not rewrite headers * in interface output routine */ #define AF_NETGRAPH 32 /* Netgraph sockets */ #define AF_SLOW 33 /* 802.3ad slow protocol */ #define AF_SCLUSTER 34 /* Sitara cluster protocol */ #define AF_ARP 35 #define AF_BLUETOOTH 36 /* Bluetooth sockets */ #define AF_IEEE80211 37 /* IEEE 802.11 protocol */ #define AF_NETLINK 38 /* Netlink protocol */ #define AF_INET_SDP 40 /* OFED Socket Direct Protocol ipv4 */ #define AF_INET6_SDP 42 /* OFED Socket Direct Protocol ipv6 */ #define AF_HYPERV 43 /* HyperV sockets */ -#define AF_MAX 43 +#define AF_DIVERT 44 /* divert(4) */ +#define AF_MAX 44 /* * When allocating a new AF_ constant, please only allocate * even numbered constants for FreeBSD until 134 as odd numbered AF_ * constants 39-133 are now reserved for vendors. */ #define AF_VENDOR00 39 #define AF_VENDOR01 41 #define AF_VENDOR03 45 #define AF_VENDOR04 47 #define AF_VENDOR05 49 #define AF_VENDOR06 51 #define AF_VENDOR07 53 #define AF_VENDOR08 55 #define AF_VENDOR09 57 #define AF_VENDOR10 59 #define AF_VENDOR11 61 #define AF_VENDOR12 63 #define AF_VENDOR13 65 #define AF_VENDOR14 67 #define AF_VENDOR15 69 #define AF_VENDOR16 71 #define AF_VENDOR17 73 #define AF_VENDOR18 75 #define AF_VENDOR19 77 #define AF_VENDOR20 79 #define AF_VENDOR21 81 #define AF_VENDOR22 83 #define AF_VENDOR23 85 #define AF_VENDOR24 87 #define AF_VENDOR25 89 #define AF_VENDOR26 91 #define AF_VENDOR27 93 #define AF_VENDOR28 95 #define AF_VENDOR29 97 #define AF_VENDOR30 99 #define AF_VENDOR31 101 #define AF_VENDOR32 103 #define AF_VENDOR33 105 #define AF_VENDOR34 107 #define AF_VENDOR35 109 #define AF_VENDOR36 111 #define AF_VENDOR37 113 #define AF_VENDOR38 115 #define AF_VENDOR39 117 #define AF_VENDOR40 119 #define AF_VENDOR41 121 #define AF_VENDOR42 123 #define AF_VENDOR43 125 #define AF_VENDOR44 127 #define AF_VENDOR45 129 #define AF_VENDOR46 131 #define AF_VENDOR47 133 #endif /* * Structure used by kernel to store most * addresses. */ struct sockaddr { unsigned char sa_len; /* total length */ sa_family_t sa_family; /* address family */ char sa_data[14]; /* actually longer; address value */ }; #if __BSD_VISIBLE #define SOCK_MAXADDRLEN 255 /* longest possible addresses */ /* * Structure used by kernel to pass protocol * information in raw sockets. */ struct sockproto { unsigned short sp_family; /* address family */ unsigned short sp_protocol; /* protocol */ }; #endif #include #if __BSD_VISIBLE /* * Protocol families, same as address families for now. */ #define PF_UNSPEC AF_UNSPEC #define PF_LOCAL AF_LOCAL #define PF_UNIX PF_LOCAL /* backward compatibility */ #define PF_INET AF_INET #define PF_IMPLINK AF_IMPLINK #define PF_PUP AF_PUP #define PF_CHAOS AF_CHAOS #define PF_NETBIOS AF_NETBIOS #define PF_ISO AF_ISO #define PF_OSI AF_ISO #define PF_ECMA AF_ECMA #define PF_DATAKIT AF_DATAKIT #define PF_CCITT AF_CCITT #define PF_SNA AF_SNA #define PF_DECnet AF_DECnet #define PF_DLI AF_DLI #define PF_LAT AF_LAT #define PF_HYLINK AF_HYLINK #define PF_APPLETALK AF_APPLETALK #define PF_ROUTE AF_ROUTE #define PF_LINK AF_LINK #define PF_XTP pseudo_AF_XTP /* really just proto family, no AF */ #define PF_COIP AF_COIP #define PF_CNT AF_CNT #define PF_SIP AF_SIP #define PF_IPX AF_IPX #define PF_RTIP pseudo_AF_RTIP /* same format as AF_INET */ #define PF_PIP pseudo_AF_PIP #define PF_ISDN AF_ISDN #define PF_KEY pseudo_AF_KEY #define PF_INET6 AF_INET6 #define PF_NATM AF_NATM #define PF_ATM AF_ATM #define PF_NETGRAPH AF_NETGRAPH #define PF_SLOW AF_SLOW #define PF_SCLUSTER AF_SCLUSTER #define PF_ARP AF_ARP #define PF_BLUETOOTH AF_BLUETOOTH #define PF_IEEE80211 AF_IEEE80211 #define PF_NETLINK AF_NETLINK #define PF_INET_SDP AF_INET_SDP #define PF_INET6_SDP AF_INET6_SDP +#define PF_DIVERT AF_DIVERT #define PF_MAX AF_MAX /* * Definitions for network related sysctl, CTL_NET. * * Second level is protocol family. * Third level is protocol number. * * Further levels are defined by the individual families. */ /* * PF_ROUTE - Routing table * * Three additional levels are defined: * Fourth: address family, 0 is wildcard * Fifth: type of info, defined below * Sixth: flag(s) to mask with for NET_RT_FLAGS */ #define NET_RT_DUMP 1 /* dump; may limit to a.f. */ #define NET_RT_FLAGS 2 /* by flags, e.g. RESOLVING */ #define NET_RT_IFLIST 3 /* survey interface list */ #define NET_RT_IFMALIST 4 /* return multicast address list */ #define NET_RT_IFLISTL 5 /* Survey interface list, using 'l'en * versions of msghdr structs. */ #define NET_RT_NHOP 6 /* dump routing nexthops */ #define NET_RT_NHGRP 7 /* dump routing nexthop groups */ #endif /* __BSD_VISIBLE */ /* * Maximum queue length specifiable by listen. */ #define SOMAXCONN 128 /* * Message header for recvmsg and sendmsg calls. * Used value-result for recvmsg, value only for sendmsg. */ struct msghdr { void *msg_name; /* optional address */ socklen_t msg_namelen; /* size of address */ struct iovec *msg_iov; /* scatter/gather array */ int msg_iovlen; /* # elements in msg_iov */ void *msg_control; /* ancillary data, see below */ socklen_t msg_controllen; /* ancillary data buffer len */ int msg_flags; /* flags on received message */ }; #define MSG_OOB 0x00000001 /* process out-of-band data */ #define MSG_PEEK 0x00000002 /* peek at incoming message */ #define MSG_DONTROUTE 0x00000004 /* send without using routing tables */ #define MSG_EOR 0x00000008 /* data completes record */ #define MSG_TRUNC 0x00000010 /* data discarded before delivery */ #define MSG_CTRUNC 0x00000020 /* control data lost before delivery */ #define MSG_WAITALL 0x00000040 /* wait for full request or error */ #if __BSD_VISIBLE #define MSG_DONTWAIT 0x00000080 /* this message should be nonblocking */ #define MSG_EOF 0x00000100 /* data completes connection */ /* 0x00000200 unused */ /* 0x00000400 unused */ /* 0x00000800 unused */ /* 0x00001000 unused */ #define MSG_NOTIFICATION 0x00002000 /* SCTP notification */ #define MSG_NBIO 0x00004000 /* FIONBIO mode, used by fifofs */ #define MSG_COMPAT 0x00008000 /* used in sendit() */ #endif #ifdef _KERNEL #define MSG_SOCALLBCK 0x00010000 /* for use by socket callbacks - soreceive (TCP) */ #endif #if __POSIX_VISIBLE >= 200809 #define MSG_NOSIGNAL 0x00020000 /* do not generate SIGPIPE on EOF */ #endif #if __BSD_VISIBLE #define MSG_CMSG_CLOEXEC 0x00040000 /* make received fds close-on-exec */ #define MSG_WAITFORONE 0x00080000 /* for recvmmsg() */ #endif #ifdef _KERNEL #define MSG_MORETOCOME 0x00100000 /* additional data pending */ #define MSG_TLSAPPDATA 0x00200000 /* do not soreceive() alert rec. (TLS) */ #endif /* * Header for ancillary data objects in msg_control buffer. * Used for additional information with/about a datagram * not expressible by flags. The format is a sequence * of message elements headed by cmsghdr structures. */ struct cmsghdr { socklen_t cmsg_len; /* data byte count, including hdr */ int cmsg_level; /* originating protocol */ int cmsg_type; /* protocol-specific type */ /* followed by u_char cmsg_data[]; */ }; #if __BSD_VISIBLE /* * While we may have more groups than this, the cmsgcred struct must * be able to fit in an mbuf and we have historically supported a * maximum of 16 groups. */ #define CMGROUP_MAX 16 /* * Credentials structure, used to verify the identity of a peer * process that has sent us a message. This is allocated by the * peer process but filled in by the kernel. This prevents the * peer from lying about its identity. (Note that cmcred_groups[0] * is the effective GID.) */ struct cmsgcred { pid_t cmcred_pid; /* PID of sending process */ uid_t cmcred_uid; /* real UID of sending process */ uid_t cmcred_euid; /* effective UID of sending process */ gid_t cmcred_gid; /* real GID of sending process */ short cmcred_ngroups; /* number or groups */ gid_t cmcred_groups[CMGROUP_MAX]; /* groups */ }; /* * Socket credentials (LOCAL_CREDS). */ struct sockcred { uid_t sc_uid; /* real user id */ uid_t sc_euid; /* effective user id */ gid_t sc_gid; /* real group id */ gid_t sc_egid; /* effective group id */ int sc_ngroups; /* number of supplemental groups */ gid_t sc_groups[1]; /* variable length */ }; /* * Compute size of a sockcred structure with groups. */ #define SOCKCREDSIZE(ngrps) \ (sizeof(struct sockcred) + (sizeof(gid_t) * ((ngrps) - 1))) /* * Socket credentials (LOCAL_CREDS_PERSISTENT). */ struct sockcred2 { int sc_version; /* version of this structure */ pid_t sc_pid; /* PID of sending process */ uid_t sc_uid; /* real user id */ uid_t sc_euid; /* effective user id */ gid_t sc_gid; /* real group id */ gid_t sc_egid; /* effective group id */ int sc_ngroups; /* number of supplemental groups */ gid_t sc_groups[1]; /* variable length */ }; #define SOCKCRED2SIZE(ngrps) \ (sizeof(struct sockcred2) + (sizeof(gid_t) * ((ngrps) - 1))) #endif /* __BSD_VISIBLE */ /* given pointer to struct cmsghdr, return pointer to data */ #define CMSG_DATA(cmsg) ((unsigned char *)(cmsg) + \ _ALIGN(sizeof(struct cmsghdr))) /* given pointer to struct cmsghdr, return pointer to next cmsghdr */ #define CMSG_NXTHDR(mhdr, cmsg) \ ((char *)(cmsg) == (char *)0 ? CMSG_FIRSTHDR(mhdr) : \ ((char *)(cmsg) + _ALIGN(((struct cmsghdr *)(cmsg))->cmsg_len) + \ _ALIGN(sizeof(struct cmsghdr)) > \ (char *)(mhdr)->msg_control + (mhdr)->msg_controllen) ? \ (struct cmsghdr *)0 : \ (struct cmsghdr *)(void *)((char *)(cmsg) + \ _ALIGN(((struct cmsghdr *)(cmsg))->cmsg_len))) /* * RFC 2292 requires to check msg_controllen, in case that the kernel returns * an empty list for some reasons. */ #define CMSG_FIRSTHDR(mhdr) \ ((mhdr)->msg_controllen >= sizeof(struct cmsghdr) ? \ (struct cmsghdr *)(mhdr)->msg_control : \ (struct cmsghdr *)0) #if __BSD_VISIBLE /* RFC 2292 additions */ #define CMSG_SPACE(l) (_ALIGN(sizeof(struct cmsghdr)) + _ALIGN(l)) #define CMSG_LEN(l) (_ALIGN(sizeof(struct cmsghdr)) + (l)) #endif #ifdef _KERNEL #define CMSG_ALIGN(n) _ALIGN(n) #endif /* "Socket"-level control message types: */ #define SCM_RIGHTS 0x01 /* access rights (array of int) */ #if __BSD_VISIBLE #define SCM_TIMESTAMP 0x02 /* timestamp (struct timeval) */ #define SCM_CREDS 0x03 /* process creds (struct cmsgcred) */ #define SCM_BINTIME 0x04 /* timestamp (struct bintime) */ #define SCM_REALTIME 0x05 /* timestamp (struct timespec) */ #define SCM_MONOTONIC 0x06 /* timestamp (struct timespec) */ #define SCM_TIME_INFO 0x07 /* timestamp info */ #define SCM_CREDS2 0x08 /* process creds (struct sockcred2) */ struct sock_timestamp_info { __uint32_t st_info_flags; __uint32_t st_info_pad0; __uint64_t st_info_rsv[7]; }; #define ST_INFO_HW 0x0001 /* SCM_TIMESTAMP was hw */ #define ST_INFO_HW_HPREC 0x0002 /* SCM_TIMESTAMP was hw-assisted on entrance */ #endif #if __BSD_VISIBLE /* * 4.3 compat sockaddr, move to compat file later */ struct osockaddr { unsigned short sa_family; /* address family */ char sa_data[14]; /* up to 14 bytes of direct address */ }; /* * 4.3-compat message header (move to compat file later). */ struct omsghdr { char *msg_name; /* optional address */ int msg_namelen; /* size of address */ struct iovec *msg_iov; /* scatter/gather array */ int msg_iovlen; /* # elements in msg_iov */ char *msg_accrights; /* access rights sent/received */ int msg_accrightslen; }; #endif /* * howto arguments for shutdown(2), specified by Posix.1g. */ #define SHUT_RD 0 /* shut down the reading side */ #define SHUT_WR 1 /* shut down the writing side */ #define SHUT_RDWR 2 /* shut down both sides */ #if __BSD_VISIBLE /* for SCTP */ /* we cheat and use the SHUT_XX defines for these */ #define PRU_FLUSH_RD SHUT_RD #define PRU_FLUSH_WR SHUT_WR #define PRU_FLUSH_RDWR SHUT_RDWR #endif #if __BSD_VISIBLE /* * sendfile(2) header/trailer struct */ struct sf_hdtr { struct iovec *headers; /* pointer to an array of header struct iovec's */ int hdr_cnt; /* number of header iovec's */ struct iovec *trailers; /* pointer to an array of trailer struct iovec's */ int trl_cnt; /* number of trailer iovec's */ }; /* * Sendfile-specific flag(s) */ #define SF_NODISKIO 0x00000001 #define SF_MNOWAIT 0x00000002 /* obsolete */ #define SF_SYNC 0x00000004 #define SF_USER_READAHEAD 0x00000008 #define SF_NOCACHE 0x00000010 #define SF_FLAGS(rh, flags) (((rh) << 16) | (flags)) #ifdef _KERNEL #define SF_READAHEAD(flags) ((flags) >> 16) #endif /* _KERNEL */ /* * Sendmmsg/recvmmsg specific structure(s) */ struct mmsghdr { struct msghdr msg_hdr; /* message header */ ssize_t msg_len; /* message length */ }; #endif /* __BSD_VISIBLE */ #ifndef _KERNEL #include __BEGIN_DECLS int accept(int, struct sockaddr * __restrict, socklen_t * __restrict); int bind(int, const struct sockaddr *, socklen_t); int connect(int, const struct sockaddr *, socklen_t); #if __BSD_VISIBLE int accept4(int, struct sockaddr * __restrict, socklen_t * __restrict, int); int bindat(int, int, const struct sockaddr *, socklen_t); int connectat(int, int, const struct sockaddr *, socklen_t); #endif int getpeername(int, struct sockaddr * __restrict, socklen_t * __restrict); int getsockname(int, struct sockaddr * __restrict, socklen_t * __restrict); int getsockopt(int, int, int, void * __restrict, socklen_t * __restrict); int listen(int, int); ssize_t recv(int, void *, size_t, int); ssize_t recvfrom(int, void *, size_t, int, struct sockaddr * __restrict, socklen_t * __restrict); ssize_t recvmsg(int, struct msghdr *, int); #if __BSD_VISIBLE struct timespec; ssize_t recvmmsg(int, struct mmsghdr * __restrict, size_t, int, const struct timespec * __restrict); #endif ssize_t send(int, const void *, size_t, int); ssize_t sendto(int, const void *, size_t, int, const struct sockaddr *, socklen_t); ssize_t sendmsg(int, const struct msghdr *, int); #if __BSD_VISIBLE int sendfile(int, int, off_t, size_t, struct sf_hdtr *, off_t *, int); ssize_t sendmmsg(int, struct mmsghdr * __restrict, size_t, int); int setfib(int); #endif int setsockopt(int, int, int, const void *, socklen_t); int shutdown(int, int); int sockatmark(int); int socket(int, int, int); int socketpair(int, int, int, int *); __END_DECLS #endif /* !_KERNEL */ #ifdef _KERNEL struct socket; struct tcpcb *so_sototcpcb(struct socket *so); struct inpcb *so_sotoinpcb(struct socket *so); struct sockbuf *so_sockbuf_snd(struct socket *); struct sockbuf *so_sockbuf_rcv(struct socket *); int so_state_get(const struct socket *); void so_state_set(struct socket *, int); int so_options_get(const struct socket *); void so_options_set(struct socket *, int); int so_error_get(const struct socket *); void so_error_set(struct socket *, int); int so_linger_get(const struct socket *); void so_linger_set(struct socket *, int); struct protosw *so_protosw_get(const struct socket *); void so_protosw_set(struct socket *, struct protosw *); void so_sorwakeup_locked(struct socket *so); void so_sowwakeup_locked(struct socket *so); void so_sorwakeup(struct socket *so); void so_sowwakeup(struct socket *so); void so_lock(struct socket *so); void so_unlock(struct socket *so); #endif /* _KERNEL */ #endif /* !_SYS_SOCKET_H_ */ diff --git a/usr.bin/netstat/inet.c b/usr.bin/netstat/inet.c index b7dbcb3531b0..e848874d1695 100644 --- a/usr.bin/netstat/inet.c +++ b/usr.bin/netstat/inet.c @@ -1,1516 +1,1515 @@ /*- * Copyright (c) 1983, 1988, 1993, 1995 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #if 0 #ifndef lint static char sccsid[] = "@(#)inet.c 8.5 (Berkeley) 5/24/95"; #endif /* not lint */ #endif #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #define _WANT_SOCKET #include #include #include #include #include #include #include #include #ifdef INET6 #include #endif /* INET6 */ #include #include #include #include #include #include #include #include #include #define TCPSTATES #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "netstat.h" #include "nl_defs.h" #define max(a, b) (((a) > (b)) ? (a) : (b)) #ifdef INET static void inetprint(const char *, struct in_addr *, int, const char *, int, const int); #endif #ifdef INET6 static int udp_done, tcp_done, sdp_done; #endif /* INET6 */ static int pcblist_sysctl(int proto, const char *name, char **bufp) { const char *mibvar; char *buf; size_t len; switch (proto) { case IPPROTO_TCP: mibvar = "net.inet.tcp.pcblist"; break; case IPPROTO_UDP: mibvar = "net.inet.udp.pcblist"; break; - case IPPROTO_DIVERT: - mibvar = "net.inet.divert.pcblist"; - break; default: mibvar = "net.inet.raw.pcblist"; break; } if (strncmp(name, "sdp", 3) == 0) mibvar = "net.inet.sdp.pcblist"; + else if (strncmp(name, "divert", 6) == 0) + mibvar = "net.inet.divert.pcblist"; len = 0; if (sysctlbyname(mibvar, 0, &len, 0, 0) < 0) { if (errno != ENOENT) xo_warn("sysctl: %s", mibvar); return (0); } if ((buf = malloc(len)) == NULL) { xo_warnx("malloc %lu bytes", (u_long)len); return (0); } if (sysctlbyname(mibvar, buf, &len, 0, 0) < 0) { xo_warn("sysctl: %s", mibvar); free(buf); return (0); } *bufp = buf; return (1); } /* * Copied directly from uipc_socket2.c. We leave out some fields that are in * nested structures that aren't used to avoid extra work. */ static void sbtoxsockbuf(struct sockbuf *sb, struct xsockbuf *xsb) { xsb->sb_cc = sb->sb_ccc; xsb->sb_hiwat = sb->sb_hiwat; xsb->sb_mbcnt = sb->sb_mbcnt; xsb->sb_mbmax = sb->sb_mbmax; xsb->sb_lowat = sb->sb_lowat; xsb->sb_flags = sb->sb_flags; xsb->sb_timeo = sb->sb_timeo; } int sotoxsocket(struct socket *so, struct xsocket *xso) { struct protosw proto; struct domain domain; bzero(xso, sizeof *xso); xso->xso_len = sizeof *xso; xso->xso_so = (uintptr_t)so; xso->so_type = so->so_type; xso->so_options = so->so_options; xso->so_linger = so->so_linger; xso->so_state = so->so_state; xso->so_pcb = (uintptr_t)so->so_pcb; if (kread((uintptr_t)so->so_proto, &proto, sizeof(proto)) != 0) return (-1); xso->xso_protocol = proto.pr_protocol; if (kread((uintptr_t)proto.pr_domain, &domain, sizeof(domain)) != 0) return (-1); xso->xso_family = domain.dom_family; xso->so_timeo = so->so_timeo; xso->so_error = so->so_error; if ((so->so_options & SO_ACCEPTCONN) != 0) { xso->so_qlen = so->sol_qlen; xso->so_incqlen = so->sol_incqlen; xso->so_qlimit = so->sol_qlimit; } else { sbtoxsockbuf(&so->so_snd, &xso->so_snd); sbtoxsockbuf(&so->so_rcv, &xso->so_rcv); xso->so_oobmark = so->so_oobmark; } return (0); } /* * Print a summary of connections related to an Internet * protocol. For TCP, also give state of connection. * Listening processes (aflag) are suppressed unless the * -a (all) flag is specified. */ void protopr(u_long off, const char *name, int af1, int proto) { static int first = 1; int istcp; char *buf; const char *vchar; struct xtcpcb *tp; struct xinpcb *inp; struct xinpgen *xig, *oxig; struct xsocket *so; int fnamelen, cnamelen; istcp = 0; switch (proto) { case IPPROTO_TCP: #ifdef INET6 if (strncmp(name, "sdp", 3) != 0) { if (tcp_done != 0) return; else tcp_done = 1; } else { if (sdp_done != 0) return; else sdp_done = 1; } #endif istcp = 1; break; case IPPROTO_UDP: #ifdef INET6 if (udp_done != 0) return; else udp_done = 1; #endif break; } if (!pcblist_sysctl(proto, name, &buf)) return; if (cflag || Cflag) { fnamelen = strlen("Stack"); cnamelen = strlen("CC"); oxig = xig = (struct xinpgen *)buf; for (xig = (struct xinpgen*)((char *)xig + xig->xig_len); xig->xig_len > sizeof(struct xinpgen); xig = (struct xinpgen *)((char *)xig + xig->xig_len)) { if (istcp) { tp = (struct xtcpcb *)xig; inp = &tp->xt_inp; } else { continue; } if (so->xso_protocol != proto) continue; if (inp->inp_gencnt > oxig->xig_gen) continue; fnamelen = max(fnamelen, (int)strlen(tp->xt_stack)); cnamelen = max(cnamelen, (int)strlen(tp->xt_cc)); } } oxig = xig = (struct xinpgen *)buf; for (xig = (struct xinpgen *)((char *)xig + xig->xig_len); xig->xig_len > sizeof(struct xinpgen); xig = (struct xinpgen *)((char *)xig + xig->xig_len)) { if (istcp) { tp = (struct xtcpcb *)xig; inp = &tp->xt_inp; } else { inp = (struct xinpcb *)xig; } so = &inp->xi_socket; /* Ignore sockets for protocols other than the desired one. */ - if (so->xso_protocol != proto) + if (proto != 0 && so->xso_protocol != proto) continue; /* Ignore PCBs which were freed during copyout. */ if (inp->inp_gencnt > oxig->xig_gen) continue; if ((af1 == AF_INET && (inp->inp_vflag & INP_IPV4) == 0) #ifdef INET6 || (af1 == AF_INET6 && (inp->inp_vflag & INP_IPV6) == 0) #endif /* INET6 */ || (af1 == AF_UNSPEC && ((inp->inp_vflag & INP_IPV4) == 0 #ifdef INET6 && (inp->inp_vflag & INP_IPV6) == 0 #endif /* INET6 */ )) ) continue; if (!aflag && ( (istcp && tp->t_state == TCPS_LISTEN) || (af1 == AF_INET && inp->inp_laddr.s_addr == INADDR_ANY) #ifdef INET6 || (af1 == AF_INET6 && IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) #endif /* INET6 */ || (af1 == AF_UNSPEC && (((inp->inp_vflag & INP_IPV4) != 0 && inp->inp_laddr.s_addr == INADDR_ANY) #ifdef INET6 || ((inp->inp_vflag & INP_IPV6) != 0 && IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) #endif )) )) continue; if (first) { if (!Lflag) { xo_emit("Active Internet connections"); if (aflag) xo_emit(" (including servers)"); } else xo_emit( "Current listen queue sizes (qlen/incqlen/maxqlen)"); xo_emit("\n"); if (Aflag) xo_emit("{T:/%-*s} ", 2 * (int)sizeof(void *), "Tcpcb"); if (Lflag) xo_emit((Aflag && !Wflag) ? "{T:/%-5.5s} {T:/%-32.32s} {T:/%-18.18s}" : ((!Wflag || af1 == AF_INET) ? "{T:/%-5.5s} {T:/%-32.32s} {T:/%-22.22s}" : "{T:/%-5.5s} {T:/%-32.32s} {T:/%-45.45s}"), "Proto", "Listen", "Local Address"); else if (Tflag) xo_emit((Aflag && !Wflag) ? "{T:/%-5.5s} {T:/%-6.6s} {T:/%-6.6s} {T:/%-6.6s} {T:/%-18.18s} {T:/%s}" : ((!Wflag || af1 == AF_INET) ? "{T:/%-5.5s} {T:/%-6.6s} {T:/%-6.6s} {T:/%-6.6s} {T:/%-22.22s} {T:/%s}" : "{T:/%-5.5s} {T:/%-6.6s} {T:/%-6.6s} {T:/%-6.6s} {T:/%-45.45s} {T:/%s}"), "Proto", "Rexmit", "OOORcv", "0-win", "Local Address", "Foreign Address"); else { xo_emit((Aflag && !Wflag) ? "{T:/%-5.5s} {T:/%-6.6s} {T:/%-6.6s} {T:/%-18.18s} {T:/%-18.18s}" : ((!Wflag || af1 == AF_INET) ? "{T:/%-5.5s} {T:/%-6.6s} {T:/%-6.6s} {T:/%-22.22s} {T:/%-22.22s}" : "{T:/%-5.5s} {T:/%-6.6s} {T:/%-6.6s} {T:/%-45.45s} {T:/%-45.45s}"), "Proto", "Recv-Q", "Send-Q", "Local Address", "Foreign Address"); if (!xflag && !Rflag) xo_emit(" {T:/%-11.11s}", "(state)"); } if (xflag) { xo_emit("{T:/%-6.6s} {T:/%-6.6s} " "{T:/%-6.6s} {T:/%-6.6s} {T:/%-6.6s} " "{T:/%-6.6s} {T:/%-6.6s} {T:/%-6.6s}", "R-HIWA", "S-HIWA", "R-LOWA", "S-LOWA", "R-BCNT", "S-BCNT", "R-BMAX", "S-BMAX"); xo_emit(" {T:/%7.7s} {T:/%7.7s} {T:/%7.7s} " "{T:/%7.7s} {T:/%7.7s} {T:/%7.7s}", "rexmt", "persist", "keep", "2msl", "delack", "rcvtime"); } else if (Rflag) { xo_emit(" {T:/%8.8s} {T:/%5.5s}", "flowid", "ftype"); } if (cflag) { xo_emit(" {T:/%-*.*s}", fnamelen, fnamelen, "Stack"); } if (Cflag) xo_emit(" {T:/%-*.*s} {T:/%10.10s}" " {T:/%10.10s} {T:/%5.5s}" " {T:/%3.3s}", cnamelen, cnamelen, "CC", "cwin", "ssthresh", "MSS", "ECN"); if (Pflag) xo_emit(" {T:/%s}", "Log ID"); xo_emit("\n"); first = 0; } if (Lflag && so->so_qlimit == 0) continue; xo_open_instance("socket"); if (Aflag) { if (istcp) xo_emit("{q:address/%*lx} ", 2 * (int)sizeof(void *), (u_long)inp->inp_ppcb); else xo_emit("{q:address/%*lx} ", 2 * (int)sizeof(void *), (u_long)so->so_pcb); } #ifdef INET6 if ((inp->inp_vflag & INP_IPV6) != 0) vchar = ((inp->inp_vflag & INP_IPV4) != 0) ? "46" : "6"; else #endif vchar = ((inp->inp_vflag & INP_IPV4) != 0) ? "4" : ""; if (istcp && (tp->t_flags & TF_TOE) != 0) xo_emit("{:protocol/%-3.3s%-2.2s/%s%s} ", "toe", vchar); else xo_emit("{:protocol/%-3.3s%-2.2s/%s%s} ", name, vchar); if (Lflag) { char buf1[33]; snprintf(buf1, sizeof buf1, "%u/%u/%u", so->so_qlen, so->so_incqlen, so->so_qlimit); xo_emit("{:listen-queue-sizes/%-32.32s} ", buf1); } else if (Tflag) { if (istcp) xo_emit("{:sent-retransmit-packets/%6u} " "{:received-out-of-order-packets/%6u} " "{:sent-zero-window/%6u} ", tp->t_sndrexmitpack, tp->t_rcvoopack, tp->t_sndzerowin); else xo_emit("{P:/%21s}", ""); } else { xo_emit("{:receive-bytes-waiting/%6u} " "{:send-bytes-waiting/%6u} ", so->so_rcv.sb_cc, so->so_snd.sb_cc); } if (numeric_port) { #ifdef INET if (inp->inp_vflag & INP_IPV4) { inetprint("local", &inp->inp_laddr, (int)inp->inp_lport, name, 1, af1); if (!Lflag) inetprint("remote", &inp->inp_faddr, (int)inp->inp_fport, name, 1, af1); } #endif #if defined(INET) && defined(INET6) else #endif #ifdef INET6 if (inp->inp_vflag & INP_IPV6) { inet6print("local", &inp->in6p_laddr, (int)inp->inp_lport, name, 1); if (!Lflag) inet6print("remote", &inp->in6p_faddr, (int)inp->inp_fport, name, 1); } /* else nothing printed now */ #endif /* INET6 */ } else if (inp->inp_flags & INP_ANONPORT) { #ifdef INET if (inp->inp_vflag & INP_IPV4) { inetprint("local", &inp->inp_laddr, (int)inp->inp_lport, name, 1, af1); if (!Lflag) inetprint("remote", &inp->inp_faddr, (int)inp->inp_fport, name, 0, af1); } #endif #if defined(INET) && defined(INET6) else #endif #ifdef INET6 if (inp->inp_vflag & INP_IPV6) { inet6print("local", &inp->in6p_laddr, (int)inp->inp_lport, name, 1); if (!Lflag) inet6print("remote", &inp->in6p_faddr, (int)inp->inp_fport, name, 0); } /* else nothing printed now */ #endif /* INET6 */ } else { #ifdef INET if (inp->inp_vflag & INP_IPV4) { inetprint("local", &inp->inp_laddr, (int)inp->inp_lport, name, 0, af1); if (!Lflag) inetprint("remote", &inp->inp_faddr, (int)inp->inp_fport, name, inp->inp_lport != inp->inp_fport, af1); } #endif #if defined(INET) && defined(INET6) else #endif #ifdef INET6 if (inp->inp_vflag & INP_IPV6) { inet6print("local", &inp->in6p_laddr, (int)inp->inp_lport, name, 0); if (!Lflag) inet6print("remote", &inp->in6p_faddr, (int)inp->inp_fport, name, inp->inp_lport != inp->inp_fport); } /* else nothing printed now */ #endif /* INET6 */ } if (xflag) { xo_emit("{:receive-high-water/%6u} " "{:send-high-water/%6u} " "{:receive-low-water/%6u} {:send-low-water/%6u} " "{:receive-mbuf-bytes/%6u} {:send-mbuf-bytes/%6u} " "{:receive-mbuf-bytes-max/%6u} " "{:send-mbuf-bytes-max/%6u}", so->so_rcv.sb_hiwat, so->so_snd.sb_hiwat, so->so_rcv.sb_lowat, so->so_snd.sb_lowat, so->so_rcv.sb_mbcnt, so->so_snd.sb_mbcnt, so->so_rcv.sb_mbmax, so->so_snd.sb_mbmax); if (istcp) xo_emit(" {:retransmit-timer/%4d.%02d} " "{:persist-timer/%4d.%02d} " "{:keepalive-timer/%4d.%02d} " "{:msl2-timer/%4d.%02d} " "{:delay-ack-timer/%4d.%02d} " "{:inactivity-timer/%4d.%02d}", tp->tt_rexmt / 1000, (tp->tt_rexmt % 1000) / 10, tp->tt_persist / 1000, (tp->tt_persist % 1000) / 10, tp->tt_keep / 1000, (tp->tt_keep % 1000) / 10, tp->tt_2msl / 1000, (tp->tt_2msl % 1000) / 10, tp->tt_delack / 1000, (tp->tt_delack % 1000) / 10, tp->t_rcvtime / 1000, (tp->t_rcvtime % 1000) / 10); } if (istcp && !Lflag && !xflag && !Tflag && !Rflag) { if (tp->t_state < 0 || tp->t_state >= TCP_NSTATES) xo_emit("{:tcp-state/%-11d}", tp->t_state); else { xo_emit("{:tcp-state/%-11s}", tcpstates[tp->t_state]); #if defined(TF_NEEDSYN) && defined(TF_NEEDFIN) /* Show T/TCP `hidden state' */ if (tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) xo_emit("{:need-syn-or-fin/*}"); #endif /* defined(TF_NEEDSYN) && defined(TF_NEEDFIN) */ } } if (Rflag) { /* XXX: is this right Alfred */ xo_emit(" {:flow-id/%08x} {:flow-type/%5d}", inp->inp_flowid, inp->inp_flowtype); } if (istcp) { if (cflag) xo_emit(" {:stack/%-*.*s}", fnamelen, fnamelen, tp->xt_stack); if (Cflag) xo_emit(" {:cc/%-*.*s}" " {:snd-cwnd/%10lu}" " {:snd-ssthresh/%10lu}" " {:t-maxseg/%5u} {:ecn/%3s}", cnamelen, cnamelen, tp->xt_cc, tp->t_snd_cwnd, tp->t_snd_ssthresh, tp->t_maxseg, (tp->t_state >= TCPS_ESTABLISHED ? (tp->xt_ecn > 0 ? (tp->xt_ecn == 1 ? "ecn" : "ace") : "off") : "n/a")); if (Pflag) xo_emit(" {:log-id/%s}", tp->xt_logid[0] == '\0' ? "-" : tp->xt_logid); } xo_emit("\n"); xo_close_instance("socket"); } if (xig != oxig && xig->xig_gen != oxig->xig_gen) { if (oxig->xig_count > xig->xig_count) { xo_emit("Some {d:lost/%s} sockets may have been " "deleted.\n", name); } else if (oxig->xig_count < xig->xig_count) { xo_emit("Some {d:created/%s} sockets may have been " "created.\n", name); } else { xo_emit("Some {d:changed/%s} sockets may have been " "created or deleted.\n", name); } } free(buf); } /* * Dump TCP statistics structure. */ void tcp_stats(u_long off, const char *name, int af1 __unused, int proto __unused) { struct tcpstat tcpstat; uint64_t tcps_states[TCP_NSTATES]; #ifdef INET6 if (tcp_done != 0) return; else tcp_done = 1; #endif if (fetch_stats("net.inet.tcp.stats", off, &tcpstat, sizeof(tcpstat), kread_counters) != 0) return; if (fetch_stats_ro("net.inet.tcp.states", nl[N_TCPS_STATES].n_value, &tcps_states, sizeof(tcps_states), kread_counters) != 0) return; xo_open_container("tcp"); xo_emit("{T:/%s}:\n", name); #define p(f, m) if (tcpstat.f || sflag <= 1) \ xo_emit(m, (uintmax_t )tcpstat.f, plural(tcpstat.f)) #define p1a(f, m) if (tcpstat.f || sflag <= 1) \ xo_emit(m, (uintmax_t )tcpstat.f) #define p2(f1, f2, m) if (tcpstat.f1 || tcpstat.f2 || sflag <= 1) \ xo_emit(m, (uintmax_t )tcpstat.f1, plural(tcpstat.f1), \ (uintmax_t )tcpstat.f2, plural(tcpstat.f2)) #define p2a(f1, f2, m) if (tcpstat.f1 || tcpstat.f2 || sflag <= 1) \ xo_emit(m, (uintmax_t )tcpstat.f1, plural(tcpstat.f1), \ (uintmax_t )tcpstat.f2) #define p3(f, m) if (tcpstat.f || sflag <= 1) \ xo_emit(m, (uintmax_t )tcpstat.f, pluralies(tcpstat.f)) p(tcps_sndtotal, "\t{:sent-packets/%ju} {N:/packet%s sent}\n"); p2(tcps_sndpack,tcps_sndbyte, "\t\t{:sent-data-packets/%ju} " "{N:/data packet%s} ({:sent-data-bytes/%ju} {N:/byte%s})\n"); p2(tcps_sndrexmitpack, tcps_sndrexmitbyte, "\t\t" "{:sent-retransmitted-packets/%ju} {N:/data packet%s} " "({:sent-retransmitted-bytes/%ju} {N:/byte%s}) " "{N:retransmitted}\n"); p(tcps_sndrexmitbad, "\t\t" "{:sent-unnecessary-retransmitted-packets/%ju} " "{N:/data packet%s unnecessarily retransmitted}\n"); p(tcps_mturesent, "\t\t{:sent-resends-by-mtu-discovery/%ju} " "{N:/resend%s initiated by MTU discovery}\n"); p2a(tcps_sndacks, tcps_delack, "\t\t{:sent-ack-only-packets/%ju} " "{N:/ack-only packet%s/} ({:sent-packets-delayed/%ju} " "{N:delayed})\n"); p(tcps_sndurg, "\t\t{:sent-urg-only-packets/%ju} " "{N:/URG only packet%s}\n"); p(tcps_sndprobe, "\t\t{:sent-window-probe-packets/%ju} " "{N:/window probe packet%s}\n"); p(tcps_sndwinup, "\t\t{:sent-window-update-packets/%ju} " "{N:/window update packet%s}\n"); p(tcps_sndctrl, "\t\t{:sent-control-packets/%ju} " "{N:/control packet%s}\n"); p(tcps_rcvtotal, "\t{:received-packets/%ju} " "{N:/packet%s received}\n"); p2(tcps_rcvackpack, tcps_rcvackbyte, "\t\t" "{:received-ack-packets/%ju} {N:/ack%s} " "{N:(for} {:received-ack-bytes/%ju} {N:/byte%s})\n"); p(tcps_rcvdupack, "\t\t{:received-duplicate-acks/%ju} " "{N:/duplicate ack%s}\n"); p(tcps_tunneled_pkts, "\t\t{:received-udp-tunneled-pkts/%ju} " "{N:/UDP tunneled pkt%s}\n"); p(tcps_tunneled_errs, "\t\t{:received-bad-udp-tunneled-pkts/%ju} " "{N:/UDP tunneled pkt cnt with error%s}\n"); p(tcps_rcvacktoomuch, "\t\t{:received-acks-for-unsent-data/%ju} " "{N:/ack%s for unsent data}\n"); p2(tcps_rcvpack, tcps_rcvbyte, "\t\t" "{:received-in-sequence-packets/%ju} {N:/packet%s} " "({:received-in-sequence-bytes/%ju} {N:/byte%s}) " "{N:received in-sequence}\n"); p2(tcps_rcvduppack, tcps_rcvdupbyte, "\t\t" "{:received-completely-duplicate-packets/%ju} " "{N:/completely duplicate packet%s} " "({:received-completely-duplicate-bytes/%ju} {N:/byte%s})\n"); p(tcps_pawsdrop, "\t\t{:received-old-duplicate-packets/%ju} " "{N:/old duplicate packet%s}\n"); p2(tcps_rcvpartduppack, tcps_rcvpartdupbyte, "\t\t" "{:received-some-duplicate-packets/%ju} " "{N:/packet%s with some dup. data} " "({:received-some-duplicate-bytes/%ju} {N:/byte%s duped/})\n"); p2(tcps_rcvoopack, tcps_rcvoobyte, "\t\t{:received-out-of-order/%ju} " "{N:/out-of-order packet%s} " "({:received-out-of-order-bytes/%ju} {N:/byte%s})\n"); p2(tcps_rcvpackafterwin, tcps_rcvbyteafterwin, "\t\t" "{:received-after-window-packets/%ju} {N:/packet%s} " "({:received-after-window-bytes/%ju} {N:/byte%s}) " "{N:of data after window}\n"); p(tcps_rcvwinprobe, "\t\t{:received-window-probes/%ju} " "{N:/window probe%s}\n"); p(tcps_rcvwinupd, "\t\t{:receive-window-update-packets/%ju} " "{N:/window update packet%s}\n"); p(tcps_dsack_count, "\t\t{:received-with-dsack-packets/%ju} " "{N:/packet%s received with dsack}\n"); p(tcps_dsack_bytes, "\t\t{:received-with-dsack-bytes/%ju} " "{N:/dsack byte%s received (no TLP involved)}\n"); p(tcps_dsack_tlp_bytes, "\t\t{:received-with-dsack-bytes-tlp/%ju} " "{N:/dsack byte%s received (TLP responsible)}\n"); p(tcps_rcvafterclose, "\t\t{:received-after-close-packets/%ju} " "{N:/packet%s received after close}\n"); p(tcps_rcvbadsum, "\t\t{:discard-bad-checksum/%ju} " "{N:/discarded for bad checksum%s}\n"); p(tcps_rcvbadoff, "\t\t{:discard-bad-header-offset/%ju} " "{N:/discarded for bad header offset field%s}\n"); p1a(tcps_rcvshort, "\t\t{:discard-too-short/%ju} " "{N:discarded because packet too short}\n"); p1a(tcps_rcvreassfull, "\t\t{:discard-reassembly-queue-full/%ju} " "{N:discarded due to full reassembly queue}\n"); p(tcps_connattempt, "\t{:connection-requests/%ju} " "{N:/connection request%s}\n"); p(tcps_accepts, "\t{:connections-accepts/%ju} " "{N:/connection accept%s}\n"); p(tcps_badsyn, "\t{:bad-connection-attempts/%ju} " "{N:/bad connection attempt%s}\n"); p(tcps_listendrop, "\t{:listen-queue-overflows/%ju} " "{N:/listen queue overflow%s}\n"); p(tcps_badrst, "\t{:ignored-in-window-resets/%ju} " "{N:/ignored RSTs in the window%s}\n"); p(tcps_connects, "\t{:connections-established/%ju} " "{N:/connection%s established (including accepts)}\n"); p(tcps_usedrtt, "\t\t{:connections-hostcache-rtt/%ju} " "{N:/time%s used RTT from hostcache}\n"); p(tcps_usedrttvar, "\t\t{:connections-hostcache-rttvar/%ju} " "{N:/time%s used RTT variance from hostcache}\n"); p(tcps_usedssthresh, "\t\t{:connections-hostcache-ssthresh/%ju} " "{N:/time%s used slow-start threshold from hostcache}\n"); p2(tcps_closed, tcps_drops, "\t{:connections-closed/%ju} " "{N:/connection%s closed (including} " "{:connection-drops/%ju} {N:/drop%s})\n"); p(tcps_cachedrtt, "\t\t{:connections-updated-rtt-on-close/%ju} " "{N:/connection%s updated cached RTT on close}\n"); p(tcps_cachedrttvar, "\t\t" "{:connections-updated-variance-on-close/%ju} " "{N:/connection%s updated cached RTT variance on close}\n"); p(tcps_cachedssthresh, "\t\t" "{:connections-updated-ssthresh-on-close/%ju} " "{N:/connection%s updated cached ssthresh on close}\n"); p(tcps_conndrops, "\t{:embryonic-connections-dropped/%ju} " "{N:/embryonic connection%s dropped}\n"); p2(tcps_rttupdated, tcps_segstimed, "\t{:segments-updated-rtt/%ju} " "{N:/segment%s updated rtt (of} " "{:segment-update-attempts/%ju} {N:/attempt%s})\n"); p(tcps_rexmttimeo, "\t{:retransmit-timeouts/%ju} " "{N:/retransmit timeout%s}\n"); p(tcps_timeoutdrop, "\t\t" "{:connections-dropped-by-retransmit-timeout/%ju} " "{N:/connection%s dropped by rexmit timeout}\n"); p(tcps_persisttimeo, "\t{:persist-timeout/%ju} " "{N:/persist timeout%s}\n"); p(tcps_persistdrop, "\t\t" "{:connections-dropped-by-persist-timeout/%ju} " "{N:/connection%s dropped by persist timeout}\n"); p(tcps_finwait2_drops, "\t" "{:connections-dropped-by-finwait2-timeout/%ju} " "{N:/Connection%s (fin_wait_2) dropped because of timeout}\n"); p(tcps_keeptimeo, "\t{:keepalive-timeout/%ju} " "{N:/keepalive timeout%s}\n"); p(tcps_keepprobe, "\t\t{:keepalive-probes/%ju} " "{N:/keepalive probe%s sent}\n"); p(tcps_keepdrops, "\t\t{:connections-dropped-by-keepalives/%ju} " "{N:/connection%s dropped by keepalive}\n"); p(tcps_predack, "\t{:ack-header-predictions/%ju} " "{N:/correct ACK header prediction%s}\n"); p(tcps_preddat, "\t{:data-packet-header-predictions/%ju} " "{N:/correct data packet header prediction%s}\n"); xo_open_container("syncache"); p3(tcps_sc_added, "\t{:entries-added/%ju} " "{N:/syncache entr%s added}\n"); p1a(tcps_sc_retransmitted, "\t\t{:retransmitted/%ju} " "{N:/retransmitted}\n"); p1a(tcps_sc_dupsyn, "\t\t{:duplicates/%ju} {N:/dupsyn}\n"); p1a(tcps_sc_dropped, "\t\t{:dropped/%ju} {N:/dropped}\n"); p1a(tcps_sc_completed, "\t\t{:completed/%ju} {N:/completed}\n"); p1a(tcps_sc_bucketoverflow, "\t\t{:bucket-overflow/%ju} " "{N:/bucket overflow}\n"); p1a(tcps_sc_cacheoverflow, "\t\t{:cache-overflow/%ju} " "{N:/cache overflow}\n"); p1a(tcps_sc_reset, "\t\t{:reset/%ju} {N:/reset}\n"); p1a(tcps_sc_stale, "\t\t{:stale/%ju} {N:/stale}\n"); p1a(tcps_sc_aborted, "\t\t{:aborted/%ju} {N:/aborted}\n"); p1a(tcps_sc_badack, "\t\t{:bad-ack/%ju} {N:/badack}\n"); p1a(tcps_sc_unreach, "\t\t{:unreachable/%ju} {N:/unreach}\n"); p(tcps_sc_zonefail, "\t\t{:zone-failures/%ju} {N:/zone failure%s}\n"); p(tcps_sc_sendcookie, "\t{:sent-cookies/%ju} {N:/cookie%s sent}\n"); p(tcps_sc_recvcookie, "\t{:receivd-cookies/%ju} " "{N:/cookie%s received}\n"); xo_close_container("syncache"); xo_open_container("hostcache"); p3(tcps_hc_added, "\t{:entries-added/%ju} " "{N:/hostcache entr%s added}\n"); p1a(tcps_hc_bucketoverflow, "\t\t{:buffer-overflows/%ju} " "{N:/bucket overflow}\n"); xo_close_container("hostcache"); xo_open_container("sack"); p(tcps_sack_recovery_episode, "\t{:recovery-episodes/%ju} " "{N:/SACK recovery episode%s}\n"); p(tcps_sack_rexmits, "\t{:segment-retransmits/%ju} " "{N:/segment rexmit%s in SACK recovery episodes}\n"); p(tcps_sack_rexmit_bytes, "\t{:byte-retransmits/%ju} " "{N:/byte rexmit%s in SACK recovery episodes}\n"); p(tcps_sack_rcv_blocks, "\t{:received-blocks/%ju} " "{N:/SACK option%s (SACK blocks) received}\n"); p(tcps_sack_send_blocks, "\t{:sent-option-blocks/%ju} " "{N:/SACK option%s (SACK blocks) sent}\n"); p(tcps_sack_lostrexmt, "\t{:lost-retransmissions/%ju} " "{N:/SACK retransmission%s lost}\n"); p1a(tcps_sack_sboverflow, "\t{:scoreboard-overflows/%ju} " "{N:/SACK scoreboard overflow}\n"); xo_close_container("sack"); xo_open_container("ecn"); p(tcps_ecn_ce, "\t{:ce-packets/%ju} " "{N:/packet%s with ECN CE bit set}\n"); p(tcps_ecn_ect0, "\t{:ect0-packets/%ju} " "{N:/packet%s with ECN ECT(0) bit set}\n"); p(tcps_ecn_ect1, "\t{:ect1-packets/%ju} " "{N:/packet%s with ECN ECT(1) bit set}\n"); p(tcps_ecn_shs, "\t{:handshakes/%ju} " "{N:/successful ECN handshake%s}\n"); p(tcps_ecn_rcwnd, "\t{:congestion-reductions/%ju} " "{N:/time%s ECN reduced the congestion window}\n"); p(tcps_ace_nect, "\t{:ace-nonect-syn/%ju} " "{N:/ACE SYN packet%s with Non-ECT}\n"); p(tcps_ace_ect0, "\t{:ace-ect0-syn/%ju} " "{N:/ACE SYN packet%s with ECT0}\n"); p(tcps_ace_ect1, "\t{:ace-ect1-syn/%ju} " "{N:/ACE SYN packet%s with ECT1}\n"); p(tcps_ace_ce, "\t{:ace-ce-syn/%ju} " "{N:/ACE SYN packet%s with CE}\n"); xo_close_container("ecn"); xo_open_container("tcp-signature"); p(tcps_sig_rcvgoodsig, "\t{:received-good-signature/%ju} " "{N:/packet%s with matching signature received}\n"); p(tcps_sig_rcvbadsig, "\t{:received-bad-signature/%ju} " "{N:/packet%s with bad signature received}\n"); p(tcps_sig_err_buildsig, "\t{:failed-make-signature/%ju} " "{N:/time%s failed to make signature due to no SA}\n"); p(tcps_sig_err_sigopt, "\t{:no-signature-expected/%ju} " "{N:/time%s unexpected signature received}\n"); p(tcps_sig_err_nosigopt, "\t{:no-signature-provided/%ju} " "{N:/time%s no signature provided by segment}\n"); xo_close_container("tcp-signature"); xo_open_container("pmtud"); p(tcps_pmtud_blackhole_activated, "\t{:pmtud-activated/%ju} " "{N:/Path MTU discovery black hole detection activation%s}\n"); p(tcps_pmtud_blackhole_activated_min_mss, "\t{:pmtud-activated-min-mss/%ju} " "{N:/Path MTU discovery black hole detection min MSS activation%s}\n"); p(tcps_pmtud_blackhole_failed, "\t{:pmtud-failed/%ju} " "{N:/Path MTU discovery black hole detection failure%s}\n"); xo_close_container("pmtud"); xo_open_container("tw"); p(tcps_tw_responds, "\t{:tw_responds/%ju} " "{N:/time%s connection in TIME-WAIT responded with ACK}\n"); p(tcps_tw_recycles, "\t{:tw_recycles/%ju} " "{N:/time%s connection in TIME-WAIT was actively recycled}\n"); p(tcps_tw_resets, "\t{:tw_resets/%ju} " "{N:/time%s connection in TIME-WAIT responded with RST}\n"); xo_close_container("tw"); #undef p #undef p1a #undef p2 #undef p2a #undef p3 xo_open_container("TCP connection count by state"); xo_emit("{T:/TCP connection count by state}:\n"); for (int i = 0; i < TCP_NSTATES; i++) { /* * XXXGL: is there a way in libxo to use %s * in the "content string" of a format * string? I failed to do that, that's why * a temporary buffer is used to construct * format string for xo_emit(). */ char fmtbuf[80]; if (sflag > 1 && tcps_states[i] == 0) continue; snprintf(fmtbuf, sizeof(fmtbuf), "\t{:%s/%%ju} " "{Np:/connection ,connections} in %s state\n", tcpstates[i], tcpstates[i]); xo_emit(fmtbuf, (uintmax_t )tcps_states[i]); } xo_close_container("TCP connection count by state"); xo_close_container("tcp"); } /* * Dump UDP statistics structure. */ void udp_stats(u_long off, const char *name, int af1 __unused, int proto __unused) { struct udpstat udpstat; uint64_t delivered; #ifdef INET6 if (udp_done != 0) return; else udp_done = 1; #endif if (fetch_stats("net.inet.udp.stats", off, &udpstat, sizeof(udpstat), kread_counters) != 0) return; xo_open_container("udp"); xo_emit("{T:/%s}:\n", name); #define p(f, m) if (udpstat.f || sflag <= 1) \ xo_emit("\t" m, (uintmax_t)udpstat.f, plural(udpstat.f)) #define p1a(f, m) if (udpstat.f || sflag <= 1) \ xo_emit("\t" m, (uintmax_t)udpstat.f) p(udps_ipackets, "{:received-datagrams/%ju} " "{N:/datagram%s received}\n"); p1a(udps_hdrops, "{:dropped-incomplete-headers/%ju} " "{N:/with incomplete header}\n"); p1a(udps_badlen, "{:dropped-bad-data-length/%ju} " "{N:/with bad data length field}\n"); p1a(udps_badsum, "{:dropped-bad-checksum/%ju} " "{N:/with bad checksum}\n"); p1a(udps_nosum, "{:dropped-no-checksum/%ju} " "{N:/with no checksum}\n"); p1a(udps_noport, "{:dropped-no-socket/%ju} " "{N:/dropped due to no socket}\n"); p(udps_noportbcast, "{:dropped-broadcast-multicast/%ju} " "{N:/broadcast\\/multicast datagram%s undelivered}\n"); p1a(udps_fullsock, "{:dropped-full-socket-buffer/%ju} " "{N:/dropped due to full socket buffers}\n"); p1a(udpps_pcbhashmiss, "{:not-for-hashed-pcb/%ju} " "{N:/not for hashed pcb}\n"); delivered = udpstat.udps_ipackets - udpstat.udps_hdrops - udpstat.udps_badlen - udpstat.udps_badsum - udpstat.udps_noport - udpstat.udps_noportbcast - udpstat.udps_fullsock; if (delivered || sflag <= 1) xo_emit("\t{:delivered-packets/%ju} {N:/delivered}\n", (uint64_t)delivered); p(udps_opackets, "{:output-packets/%ju} {N:/datagram%s output}\n"); /* the next statistic is cumulative in udps_noportbcast */ p(udps_filtermcast, "{:multicast-source-filter-matches/%ju} " "{N:/time%s multicast source filter matched}\n"); #undef p #undef p1a xo_close_container("udp"); } /* * Dump CARP statistics structure. */ void carp_stats(u_long off, const char *name, int af1 __unused, int proto __unused) { struct carpstats carpstat; if (fetch_stats("net.inet.carp.stats", off, &carpstat, sizeof(carpstat), kread_counters) != 0) return; xo_open_container(name); xo_emit("{T:/%s}:\n", name); #define p(f, m) if (carpstat.f || sflag <= 1) \ xo_emit(m, (uintmax_t)carpstat.f, plural(carpstat.f)) #define p2(f, m) if (carpstat.f || sflag <= 1) \ xo_emit(m, (uintmax_t)carpstat.f) p(carps_ipackets, "\t{:received-inet-packets/%ju} " "{N:/packet%s received (IPv4)}\n"); p(carps_ipackets6, "\t{:received-inet6-packets/%ju} " "{N:/packet%s received (IPv6)}\n"); p(carps_badttl, "\t\t{:dropped-wrong-ttl/%ju} " "{N:/packet%s discarded for wrong TTL}\n"); p(carps_hdrops, "\t\t{:dropped-short-header/%ju} " "{N:/packet%s shorter than header}\n"); p(carps_badsum, "\t\t{:dropped-bad-checksum/%ju} " "{N:/discarded for bad checksum%s}\n"); p(carps_badver, "\t\t{:dropped-bad-version/%ju} " "{N:/discarded packet%s with a bad version}\n"); p2(carps_badlen, "\t\t{:dropped-short-packet/%ju} " "{N:/discarded because packet too short}\n"); p2(carps_badauth, "\t\t{:dropped-bad-authentication/%ju} " "{N:/discarded for bad authentication}\n"); p2(carps_badvhid, "\t\t{:dropped-bad-vhid/%ju} " "{N:/discarded for bad vhid}\n"); p2(carps_badaddrs, "\t\t{:dropped-bad-address-list/%ju} " "{N:/discarded because of a bad address list}\n"); p(carps_opackets, "\t{:sent-inet-packets/%ju} " "{N:/packet%s sent (IPv4)}\n"); p(carps_opackets6, "\t{:sent-inet6-packets/%ju} " "{N:/packet%s sent (IPv6)}\n"); p2(carps_onomem, "\t\t{:send-failed-memory-error/%ju} " "{N:/send failed due to mbuf memory error}\n"); #if notyet p(carps_ostates, "\t\t{:send-state-updates/%s} " "{N:/state update%s sent}\n"); #endif #undef p #undef p2 xo_close_container(name); } /* * Dump IP statistics structure. */ void ip_stats(u_long off, const char *name, int af1 __unused, int proto __unused) { struct ipstat ipstat; if (fetch_stats("net.inet.ip.stats", off, &ipstat, sizeof(ipstat), kread_counters) != 0) return; xo_open_container(name); xo_emit("{T:/%s}:\n", name); #define p(f, m) if (ipstat.f || sflag <= 1) \ xo_emit(m, (uintmax_t )ipstat.f, plural(ipstat.f)) #define p1a(f, m) if (ipstat.f || sflag <= 1) \ xo_emit(m, (uintmax_t )ipstat.f) p(ips_total, "\t{:received-packets/%ju} " "{N:/total packet%s received}\n"); p(ips_badsum, "\t{:dropped-bad-checksum/%ju} " "{N:/bad header checksum%s}\n"); p1a(ips_toosmall, "\t{:dropped-below-minimum-size/%ju} " "{N:/with size smaller than minimum}\n"); p1a(ips_tooshort, "\t{:dropped-short-packets/%ju} " "{N:/with data size < data length}\n"); p1a(ips_toolong, "\t{:dropped-too-long/%ju} " "{N:/with ip length > max ip packet size}\n"); p1a(ips_badhlen, "\t{:dropped-short-header-length/%ju} " "{N:/with header length < data size}\n"); p1a(ips_badlen, "\t{:dropped-short-data/%ju} " "{N:/with data length < header length}\n"); p1a(ips_badoptions, "\t{:dropped-bad-options/%ju} " "{N:/with bad options}\n"); p1a(ips_badvers, "\t{:dropped-bad-version/%ju} " "{N:/with incorrect version number}\n"); p(ips_fragments, "\t{:received-fragments/%ju} " "{N:/fragment%s received}\n"); p(ips_fragdropped, "\t{:dropped-fragments/%ju} " "{N:/fragment%s dropped (dup or out of space)}\n"); p(ips_fragtimeout, "\t{:dropped-fragments-after-timeout/%ju} " "{N:/fragment%s dropped after timeout}\n"); p(ips_reassembled, "\t{:reassembled-packets/%ju} " "{N:/packet%s reassembled ok}\n"); p(ips_delivered, "\t{:received-local-packets/%ju} " "{N:/packet%s for this host}\n"); p(ips_noproto, "\t{:dropped-unknown-protocol/%ju} " "{N:/packet%s for unknown\\/unsupported protocol}\n"); p(ips_forward, "\t{:forwarded-packets/%ju} " "{N:/packet%s forwarded}"); p(ips_fastforward, " ({:fast-forwarded-packets/%ju} " "{N:/packet%s fast forwarded})"); if (ipstat.ips_forward || sflag <= 1) xo_emit("\n"); p(ips_cantforward, "\t{:packets-cannot-forward/%ju} " "{N:/packet%s not forwardable}\n"); p(ips_notmember, "\t{:received-unknown-multicast-group/%ju} " "{N:/packet%s received for unknown multicast group}\n"); p(ips_redirectsent, "\t{:redirects-sent/%ju} " "{N:/redirect%s sent}\n"); p(ips_localout, "\t{:sent-packets/%ju} " "{N:/packet%s sent from this host}\n"); p(ips_rawout, "\t{:send-packets-fabricated-header/%ju} " "{N:/packet%s sent with fabricated ip header}\n"); p(ips_odropped, "\t{:discard-no-mbufs/%ju} " "{N:/output packet%s dropped due to no bufs, etc.}\n"); p(ips_noroute, "\t{:discard-no-route/%ju} " "{N:/output packet%s discarded due to no route}\n"); p(ips_fragmented, "\t{:sent-fragments/%ju} " "{N:/output datagram%s fragmented}\n"); p(ips_ofragments, "\t{:fragments-created/%ju} " "{N:/fragment%s created}\n"); p(ips_cantfrag, "\t{:discard-cannot-fragment/%ju} " "{N:/datagram%s that can't be fragmented}\n"); p(ips_nogif, "\t{:discard-tunnel-no-gif/%ju} " "{N:/tunneling packet%s that can't find gif}\n"); p(ips_badaddr, "\t{:discard-bad-address/%ju} " "{N:/datagram%s with bad address in header}\n"); #undef p #undef p1a xo_close_container(name); } /* * Dump ARP statistics structure. */ void arp_stats(u_long off, const char *name, int af1 __unused, int proto __unused) { struct arpstat arpstat; if (fetch_stats("net.link.ether.arp.stats", off, &arpstat, sizeof(arpstat), kread_counters) != 0) return; xo_open_container(name); xo_emit("{T:/%s}:\n", name); #define p(f, m) if (arpstat.f || sflag <= 1) \ xo_emit("\t" m, (uintmax_t)arpstat.f, plural(arpstat.f)) #define p2(f, m) if (arpstat.f || sflag <= 1) \ xo_emit("\t" m, (uintmax_t)arpstat.f, pluralies(arpstat.f)) p(txrequests, "{:sent-requests/%ju} {N:/ARP request%s sent}\n"); p(txerrors, "{:sent-failures/%ju} {N:/ARP request%s failed to sent}\n"); p2(txreplies, "{:sent-replies/%ju} {N:/ARP repl%s sent}\n"); p(rxrequests, "{:received-requests/%ju} " "{N:/ARP request%s received}\n"); p2(rxreplies, "{:received-replies/%ju} " "{N:/ARP repl%s received}\n"); p(received, "{:received-packets/%ju} " "{N:/ARP packet%s received}\n"); p(dropped, "{:dropped-no-entry/%ju} " "{N:/total packet%s dropped due to no ARP entry}\n"); p(timeouts, "{:entries-timeout/%ju} " "{N:/ARP entry%s timed out}\n"); p(dupips, "{:dropped-duplicate-address/%ju} " "{N:/Duplicate IP%s seen}\n"); #undef p #undef p2 xo_close_container(name); } static const char *icmpnames[ICMP_MAXTYPE + 1] = { "echo reply", /* RFC 792 */ "#1", "#2", "destination unreachable", /* RFC 792 */ "source quench", /* RFC 792 */ "routing redirect", /* RFC 792 */ "#6", "#7", "echo", /* RFC 792 */ "router advertisement", /* RFC 1256 */ "router solicitation", /* RFC 1256 */ "time exceeded", /* RFC 792 */ "parameter problem", /* RFC 792 */ "time stamp", /* RFC 792 */ "time stamp reply", /* RFC 792 */ "information request", /* RFC 792 */ "information request reply", /* RFC 792 */ "address mask request", /* RFC 950 */ "address mask reply", /* RFC 950 */ "#19", "#20", "#21", "#22", "#23", "#24", "#25", "#26", "#27", "#28", "#29", "icmp traceroute", /* RFC 1393 */ "datagram conversion error", /* RFC 1475 */ "mobile host redirect", "IPv6 where-are-you", "IPv6 i-am-here", "mobile registration req", "mobile registration reply", "domain name request", /* RFC 1788 */ "domain name reply", /* RFC 1788 */ "icmp SKIP", "icmp photuris", /* RFC 2521 */ }; /* * Dump ICMP statistics. */ void icmp_stats(u_long off, const char *name, int af1 __unused, int proto __unused) { struct icmpstat icmpstat; size_t len; int i, first; if (fetch_stats("net.inet.icmp.stats", off, &icmpstat, sizeof(icmpstat), kread_counters) != 0) return; xo_open_container(name); xo_emit("{T:/%s}:\n", name); #define p(f, m) if (icmpstat.f || sflag <= 1) \ xo_emit(m, icmpstat.f, plural(icmpstat.f)) #define p1a(f, m) if (icmpstat.f || sflag <= 1) \ xo_emit(m, icmpstat.f) #define p2(f, m) if (icmpstat.f || sflag <= 1) \ xo_emit(m, icmpstat.f, plurales(icmpstat.f)) p(icps_error, "\t{:icmp-calls/%lu} " "{N:/call%s to icmp_error}\n"); p(icps_oldicmp, "\t{:errors-not-from-message/%lu} " "{N:/error%s not generated in response to an icmp message}\n"); for (first = 1, i = 0; i < ICMP_MAXTYPE + 1; i++) { if (icmpstat.icps_outhist[i] != 0) { if (first) { xo_open_list("output-histogram"); xo_emit("\tOutput histogram:\n"); first = 0; } xo_open_instance("output-histogram"); if (icmpnames[i] != NULL) xo_emit("\t\t{k:name/%s}: {:count/%lu}\n", icmpnames[i], icmpstat.icps_outhist[i]); else xo_emit("\t\tunknown ICMP #{k:name/%d}: " "{:count/%lu}\n", i, icmpstat.icps_outhist[i]); xo_close_instance("output-histogram"); } } if (!first) xo_close_list("output-histogram"); p(icps_badcode, "\t{:dropped-bad-code/%lu} " "{N:/message%s with bad code fields}\n"); p(icps_tooshort, "\t{:dropped-too-short/%lu} " "{N:/message%s less than the minimum length}\n"); p(icps_checksum, "\t{:dropped-bad-checksum/%lu} " "{N:/message%s with bad checksum}\n"); p(icps_badlen, "\t{:dropped-bad-length/%lu} " "{N:/message%s with bad length}\n"); p1a(icps_bmcastecho, "\t{:dropped-multicast-echo/%lu} " "{N:/multicast echo requests ignored}\n"); p1a(icps_bmcasttstamp, "\t{:dropped-multicast-timestamp/%lu} " "{N:/multicast timestamp requests ignored}\n"); for (first = 1, i = 0; i < ICMP_MAXTYPE + 1; i++) { if (icmpstat.icps_inhist[i] != 0) { if (first) { xo_open_list("input-histogram"); xo_emit("\tInput histogram:\n"); first = 0; } xo_open_instance("input-histogram"); if (icmpnames[i] != NULL) xo_emit("\t\t{k:name/%s}: {:count/%lu}\n", icmpnames[i], icmpstat.icps_inhist[i]); else xo_emit( "\t\tunknown ICMP #{k:name/%d}: {:count/%lu}\n", i, icmpstat.icps_inhist[i]); xo_close_instance("input-histogram"); } } if (!first) xo_close_list("input-histogram"); p(icps_reflect, "\t{:sent-packets/%lu} " "{N:/message response%s generated}\n"); p2(icps_badaddr, "\t{:discard-invalid-return-address/%lu} " "{N:/invalid return address%s}\n"); p(icps_noroute, "\t{:discard-no-route/%lu} " "{N:/no return route%s}\n"); #undef p #undef p1a #undef p2 if (live) { len = sizeof i; if (sysctlbyname("net.inet.icmp.maskrepl", &i, &len, NULL, 0) < 0) return; xo_emit("\tICMP address mask responses are " "{q:icmp-address-responses/%sabled}\n", i ? "en" : "dis"); } xo_close_container(name); } /* * Dump IGMP statistics structure. */ void igmp_stats(u_long off, const char *name, int af1 __unused, int proto __unused) { struct igmpstat igmpstat; int error, zflag0; if (fetch_stats("net.inet.igmp.stats", 0, &igmpstat, sizeof(igmpstat), kread) != 0) return; /* * Reread net.inet.igmp.stats when zflag == 1. * This is because this MIB contains version number and * length of the structure which are not set when clearing * the counters. */ zflag0 = zflag; if (zflag) { zflag = 0; error = fetch_stats("net.inet.igmp.stats", 0, &igmpstat, sizeof(igmpstat), kread); zflag = zflag0; if (error) return; } if (igmpstat.igps_version != IGPS_VERSION_3) { xo_warnx("%s: version mismatch (%d != %d)", __func__, igmpstat.igps_version, IGPS_VERSION_3); return; } if (igmpstat.igps_len != IGPS_VERSION3_LEN) { xo_warnx("%s: size mismatch (%d != %d)", __func__, igmpstat.igps_len, IGPS_VERSION3_LEN); return; } xo_open_container(name); xo_emit("{T:/%s}:\n", name); #define p64(f, m) if (igmpstat.f || sflag <= 1) \ xo_emit(m, (uintmax_t) igmpstat.f, plural(igmpstat.f)) #define py64(f, m) if (igmpstat.f || sflag <= 1) \ xo_emit(m, (uintmax_t) igmpstat.f, pluralies(igmpstat.f)) p64(igps_rcv_total, "\t{:received-messages/%ju} " "{N:/message%s received}\n"); p64(igps_rcv_tooshort, "\t{:dropped-too-short/%ju} " "{N:/message%s received with too few bytes}\n"); p64(igps_rcv_badttl, "\t{:dropped-wrong-ttl/%ju} " "{N:/message%s received with wrong TTL}\n"); p64(igps_rcv_badsum, "\t{:dropped-bad-checksum/%ju} " "{N:/message%s received with bad checksum}\n"); py64(igps_rcv_v1v2_queries, "\t{:received-membership-queries/%ju} " "{N:/V1\\/V2 membership quer%s received}\n"); py64(igps_rcv_v3_queries, "\t{:received-v3-membership-queries/%ju} " "{N:/V3 membership quer%s received}\n"); py64(igps_rcv_badqueries, "\t{:dropped-membership-queries/%ju} " "{N:/membership quer%s received with invalid field(s)}\n"); py64(igps_rcv_gen_queries, "\t{:received-general-queries/%ju} " "{N:/general quer%s received}\n"); py64(igps_rcv_group_queries, "\t{:received-group-queries/%ju} " "{N:/group quer%s received}\n"); py64(igps_rcv_gsr_queries, "\t{:received-group-source-queries/%ju} " "{N:/group-source quer%s received}\n"); py64(igps_drop_gsr_queries, "\t{:dropped-group-source-queries/%ju} " "{N:/group-source quer%s dropped}\n"); p64(igps_rcv_reports, "\t{:received-membership-requests/%ju} " "{N:/membership report%s received}\n"); p64(igps_rcv_badreports, "\t{:dropped-membership-reports/%ju} " "{N:/membership report%s received with invalid field(s)}\n"); p64(igps_rcv_ourreports, "\t" "{:received-membership-reports-matching/%ju} " "{N:/membership report%s received for groups to which we belong}" "\n"); p64(igps_rcv_nora, "\t{:received-v3-reports-no-router-alert/%ju} " "{N:/V3 report%s received without Router Alert}\n"); p64(igps_snd_reports, "\t{:sent-membership-reports/%ju} " "{N:/membership report%s sent}\n"); #undef p64 #undef py64 xo_close_container(name); } /* * Dump PIM statistics structure. */ void pim_stats(u_long off __unused, const char *name, int af1 __unused, int proto __unused) { struct pimstat pimstat; if (fetch_stats("net.inet.pim.stats", off, &pimstat, sizeof(pimstat), kread_counters) != 0) return; xo_open_container(name); xo_emit("{T:/%s}:\n", name); #define p(f, m) if (pimstat.f || sflag <= 1) \ xo_emit(m, (uintmax_t)pimstat.f, plural(pimstat.f)) #define py(f, m) if (pimstat.f || sflag <= 1) \ xo_emit(m, (uintmax_t)pimstat.f, pimstat.f != 1 ? "ies" : "y") p(pims_rcv_total_msgs, "\t{:received-messages/%ju} " "{N:/message%s received}\n"); p(pims_rcv_total_bytes, "\t{:received-bytes/%ju} " "{N:/byte%s received}\n"); p(pims_rcv_tooshort, "\t{:dropped-too-short/%ju} " "{N:/message%s received with too few bytes}\n"); p(pims_rcv_badsum, "\t{:dropped-bad-checksum/%ju} " "{N:/message%s received with bad checksum}\n"); p(pims_rcv_badversion, "\t{:dropped-bad-version/%ju} " "{N:/message%s received with bad version}\n"); p(pims_rcv_registers_msgs, "\t{:received-data-register-messages/%ju} " "{N:/data register message%s received}\n"); p(pims_rcv_registers_bytes, "\t{:received-data-register-bytes/%ju} " "{N:/data register byte%s received}\n"); p(pims_rcv_registers_wrongiif, "\t" "{:received-data-register-wrong-interface/%ju} " "{N:/data register message%s received on wrong iif}\n"); p(pims_rcv_badregisters, "\t{:received-bad-registers/%ju} " "{N:/bad register%s received}\n"); p(pims_snd_registers_msgs, "\t{:sent-data-register-messages/%ju} " "{N:/data register message%s sent}\n"); p(pims_snd_registers_bytes, "\t{:sent-data-register-bytes/%ju} " "{N:/data register byte%s sent}\n"); #undef p #undef py xo_close_container(name); } #ifdef INET /* * Pretty print an Internet address (net address + port). */ static void inetprint(const char *container, struct in_addr *in, int port, const char *proto, int num_port, const int af1) { struct servent *sp = 0; char line[80], *cp; int width; size_t alen, plen; if (container) xo_open_container(container); if (Wflag) snprintf(line, sizeof(line), "%s.", inetname(in)); else snprintf(line, sizeof(line), "%.*s.", (Aflag && !num_port) ? 12 : 16, inetname(in)); alen = strlen(line); cp = line + alen; if (!num_port && port) sp = getservbyport((int)port, proto); if (sp || port == 0) snprintf(cp, sizeof(line) - alen, "%.15s ", sp ? sp->s_name : "*"); else snprintf(cp, sizeof(line) - alen, "%d ", ntohs((u_short)port)); width = (Aflag && !Wflag) ? 18 : ((!Wflag || af1 == AF_INET) ? 22 : 45); if (Wflag) xo_emit("{d:target/%-*s} ", width, line); else xo_emit("{d:target/%-*.*s} ", width, width, line); plen = strlen(cp) - 1; alen--; xo_emit("{e:address/%*.*s}{e:port/%*.*s}", alen, alen, line, plen, plen, cp); if (container) xo_close_container(container); } /* * Construct an Internet address representation. * If numeric_addr has been supplied, give * numeric value, otherwise try for symbolic name. */ char * inetname(struct in_addr *inp) { char *cp; static char line[MAXHOSTNAMELEN]; struct hostent *hp; cp = 0; if (!numeric_addr && inp->s_addr != INADDR_ANY) { hp = gethostbyaddr((char *)inp, sizeof (*inp), AF_INET); if (hp) { cp = hp->h_name; trimdomain(cp, strlen(cp)); } } if (inp->s_addr == INADDR_ANY) strcpy(line, "*"); else if (cp) { strlcpy(line, cp, sizeof(line)); } else { inp->s_addr = ntohl(inp->s_addr); #define C(x) ((u_int)((x) & 0xff)) snprintf(line, sizeof(line), "%u.%u.%u.%u", C(inp->s_addr >> 24), C(inp->s_addr >> 16), C(inp->s_addr >> 8), C(inp->s_addr)); } return (line); } #endif diff --git a/usr.bin/netstat/main.c b/usr.bin/netstat/main.c index 1a011b9d5488..d1b069f38f0c 100644 --- a/usr.bin/netstat/main.c +++ b/usr.bin/netstat/main.c @@ -1,913 +1,913 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1983, 1988, 1993 * Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #ifndef lint static char const copyright[] = "@(#) Copyright (c) 1983, 1988, 1993\n\ Regents of the University of California. All rights reserved.\n"; #endif /* not lint */ #if 0 #ifndef lint static char sccsid[] = "@(#)main.c 8.4 (Berkeley) 3/1/94"; #endif /* not lint */ #endif #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #ifdef NETGRAPH #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "netstat.h" #include "nl_defs.h" #include static struct protox { int pr_index; /* index into nlist of cb head */ int pr_sindex; /* index into nlist of stat block */ u_char pr_wanted; /* 1 if wanted, 0 otherwise */ void (*pr_cblocks)(u_long, const char *, int, int); /* control blocks printing routine */ void (*pr_stats)(u_long, const char *, int, int); /* statistics printing routine */ void (*pr_istats)(char *); /* per/if statistics printing routine */ const char *pr_name; /* well-known name */ int pr_usesysctl; /* non-zero if we use sysctl, not kvm */ int pr_protocol; } protox[] = { { N_TCBINFO, N_TCPSTAT, 1, protopr, tcp_stats, NULL, "tcp", 1, IPPROTO_TCP }, { N_UDBINFO, N_UDPSTAT, 1, protopr, udp_stats, NULL, "udp", 1, IPPROTO_UDP }, #ifdef SCTP { -1, N_SCTPSTAT, 1, sctp_protopr, sctp_stats, NULL, "sctp", 1, IPPROTO_SCTP }, #endif #ifdef SDP { -1, -1, 1, protopr, NULL, NULL, "sdp", 1, IPPROTO_TCP }, #endif { N_DIVCBINFO, -1, 1, protopr, - NULL, NULL, "divert", 1, IPPROTO_DIVERT }, + NULL, NULL, "divert", 1, 0 }, { N_RIPCBINFO, N_IPSTAT, 1, protopr, ip_stats, NULL, "ip", 1, IPPROTO_RAW }, { N_RIPCBINFO, N_ICMPSTAT, 1, protopr, icmp_stats, NULL, "icmp", 1, IPPROTO_ICMP }, { N_RIPCBINFO, N_IGMPSTAT, 1, protopr, igmp_stats, NULL, "igmp", 1, IPPROTO_IGMP }, #ifdef IPSEC { -1, N_IPSEC4STAT, 1, NULL, /* keep as compat */ ipsec_stats, NULL, "ipsec", 1, 0}, { -1, N_AHSTAT, 1, NULL, ah_stats, NULL, "ah", 1, 0}, { -1, N_ESPSTAT, 1, NULL, esp_stats, NULL, "esp", 1, 0}, { -1, N_IPCOMPSTAT, 1, NULL, ipcomp_stats, NULL, "ipcomp", 1, 0}, #endif { N_RIPCBINFO, N_PIMSTAT, 1, protopr, pim_stats, NULL, "pim", 1, IPPROTO_PIM }, { -1, N_CARPSTATS, 1, NULL, carp_stats, NULL, "carp", 1, 0 }, #ifdef PF { -1, N_PFSYNCSTATS, 1, NULL, pfsync_stats, NULL, "pfsync", 1, 0 }, #endif { -1, N_ARPSTAT, 1, NULL, arp_stats, NULL, "arp", 1, 0 }, { -1, -1, 0, NULL, NULL, NULL, NULL, 0, 0 } }; #ifdef INET6 static struct protox ip6protox[] = { { N_TCBINFO, N_TCPSTAT, 1, protopr, tcp_stats, NULL, "tcp", 1, IPPROTO_TCP }, { N_UDBINFO, N_UDPSTAT, 1, protopr, udp_stats, NULL, "udp", 1, IPPROTO_UDP }, { N_RIPCBINFO, N_IP6STAT, 1, protopr, ip6_stats, ip6_ifstats, "ip6", 1, IPPROTO_RAW }, { N_RIPCBINFO, N_ICMP6STAT, 1, protopr, icmp6_stats, icmp6_ifstats, "icmp6", 1, IPPROTO_ICMPV6 }, #ifdef SDP { -1, -1, 1, protopr, NULL, NULL, "sdp", 1, IPPROTO_TCP }, #endif #ifdef IPSEC { -1, N_IPSEC6STAT, 1, NULL, ipsec_stats, NULL, "ipsec6", 1, 0 }, #endif #ifdef notyet { -1, N_PIM6STAT, 1, NULL, pim6_stats, NULL, "pim6", 1, 0 }, #endif { -1, N_RIP6STAT, 1, NULL, rip6_stats, NULL, "rip6", 1, 0 }, { -1, -1, 0, NULL, NULL, NULL, NULL, 0, 0 } }; #endif /*INET6*/ #ifdef IPSEC static struct protox pfkeyprotox[] = { { -1, N_PFKEYSTAT, 1, NULL, pfkey_stats, NULL, "pfkey", 0, 0 }, { -1, -1, 0, NULL, NULL, NULL, NULL, 0, 0 } }; #endif #ifdef NETGRAPH static struct protox netgraphprotox[] = { { N_NGSOCKLIST, -1, 1, netgraphprotopr, NULL, NULL, "ctrl", 0, 0 }, { N_NGSOCKLIST, -1, 1, netgraphprotopr, NULL, NULL, "data", 0, 0 }, { -1, -1, 0, NULL, NULL, NULL, NULL, 0, 0 } }; #endif static struct protox *protoprotox[] = { protox, #ifdef INET6 ip6protox, #endif #ifdef IPSEC pfkeyprotox, #endif NULL }; static void printproto(struct protox *, const char *, bool *); static void usage(void); static struct protox *name2protox(const char *); static struct protox *knownname(const char *); static int kresolve_list(struct nlist *_nl); static kvm_t *kvmd; static char *nlistf = NULL, *memf = NULL; int Aflag; /* show addresses of protocol control block */ int aflag; /* show all sockets (including servers) */ static int Bflag; /* show information about bpf consumers */ int bflag; /* show i/f total bytes in/out */ int cflag; /* show TCP congestion control stack */ int Cflag; /* show congestion control algo and vars */ int dflag; /* show i/f dropped packets */ int gflag; /* show group (multicast) routing or stats */ int hflag; /* show counters in human readable format */ int iflag; /* show interfaces */ int Lflag; /* show size of listen queues */ int mflag; /* show memory stats */ int noutputs = 0; /* how much outputs before we exit */ int numeric_addr; /* show addresses numerically */ int numeric_port; /* show ports numerically */ int Oflag; /* show nhgrp objects*/ int oflag; /* show nexthop objects*/ int Pflag; /* show TCP log ID */ static int pflag; /* show given protocol */ static int Qflag; /* show netisr information */ int rflag; /* show routing tables (or routing stats) */ int Rflag; /* show flow / RSS statistics */ int sflag; /* show protocol statistics */ int Wflag; /* wide display */ int Tflag; /* TCP Information */ int xflag; /* extra information, includes all socket buffer info */ int zflag; /* zero stats */ int interval; /* repeat interval for i/f stats */ char *interface; /* desired i/f for stats, or NULL for all i/fs */ int unit; /* unit number for above */ static int af; /* address family */ int live; /* true if we are examining a live system */ int main(int argc, char *argv[]) { struct protox *tp = NULL; /* for printing cblocks & stats */ int ch; int fib = -1; char *endptr; bool first = true; af = AF_UNSPEC; argc = xo_parse_args(argc, argv); if (argc < 0) exit(EXIT_FAILURE); while ((ch = getopt(argc, argv, "46AaBbCcdF:f:ghI:iLlM:mN:nOoPp:Qq:RrSTsuWw:xz")) != -1) switch(ch) { case '4': #ifdef INET af = AF_INET; #else errx(1, "IPv4 support is not compiled in"); #endif break; case '6': #ifdef INET6 af = AF_INET6; #else errx(1, "IPv6 support is not compiled in"); #endif break; case 'A': Aflag = 1; break; case 'a': aflag = 1; break; case 'B': Bflag = 1; break; case 'b': bflag = 1; break; case 'c': cflag = 1; break; case 'C': Cflag = 1; break; case 'd': dflag = 1; break; case 'F': fib = strtol(optarg, &endptr, 0); if (*endptr != '\0' || (fib == 0 && (errno == EINVAL || errno == ERANGE))) xo_errx(1, "%s: invalid fib", optarg); break; case 'f': if (strcmp(optarg, "inet") == 0) af = AF_INET; #ifdef INET6 else if (strcmp(optarg, "inet6") == 0) af = AF_INET6; #endif #ifdef IPSEC else if (strcmp(optarg, "pfkey") == 0) af = PF_KEY; #endif else if (strcmp(optarg, "unix") == 0 || strcmp(optarg, "local") == 0) af = AF_UNIX; #ifdef NETGRAPH else if (strcmp(optarg, "ng") == 0 || strcmp(optarg, "netgraph") == 0) af = AF_NETGRAPH; #endif else if (strcmp(optarg, "link") == 0) af = AF_LINK; else { xo_errx(1, "%s: unknown address family", optarg); } break; case 'g': gflag = 1; break; case 'h': hflag = 1; break; case 'I': { char *cp; iflag = 1; for (cp = interface = optarg; isalpha(*cp); cp++) continue; unit = atoi(cp); break; } case 'i': iflag = 1; break; case 'L': Lflag = 1; break; case 'M': memf = optarg; break; case 'm': mflag = 1; break; case 'N': nlistf = optarg; break; case 'n': numeric_addr = numeric_port = 1; break; case 'o': oflag = 1; break; case 'O': Oflag = 1; break; case 'P': Pflag = 1; break; case 'p': if ((tp = name2protox(optarg)) == NULL) { xo_errx(1, "%s: unknown or uninstrumented " "protocol", optarg); } pflag = 1; break; case 'Q': Qflag = 1; break; case 'q': noutputs = atoi(optarg); if (noutputs != 0) noutputs++; break; case 'r': rflag = 1; break; case 'R': Rflag = 1; break; case 's': ++sflag; break; case 'S': numeric_addr = 1; break; case 'u': af = AF_UNIX; break; case 'W': case 'l': Wflag = 1; break; case 'w': interval = atoi(optarg); iflag = 1; break; case 'T': Tflag = 1; break; case 'x': xflag = 1; break; case 'z': zflag = 1; break; case '?': default: usage(); } argv += optind; argc -= optind; #define BACKWARD_COMPATIBILITY #ifdef BACKWARD_COMPATIBILITY if (*argv) { if (isdigit(**argv)) { interval = atoi(*argv); if (interval <= 0) usage(); ++argv; iflag = 1; } if (*argv) { nlistf = *argv; if (*++argv) memf = *argv; } } #endif /* * Discard setgid privileges if not the running kernel so that bad * guys can't print interesting stuff from kernel memory. */ live = (nlistf == NULL && memf == NULL); if (!live) { if (setgid(getgid()) != 0) xo_err(-1, "setgid"); /* Load all necessary kvm symbols */ kresolve_list(nl); } if (xflag && Tflag) xo_errx(1, "-x and -T are incompatible, pick one."); if (Bflag) { if (!live) usage(); bpf_stats(interface); xo_finish(); exit(0); } if (mflag) { if (!live) { if (kread(0, NULL, 0) == 0) mbpr(kvmd, nl[N_SFSTAT].n_value); } else mbpr(NULL, 0); xo_finish(); exit(0); } if (Qflag) { if (!live) { if (kread(0, NULL, 0) == 0) netisr_stats(); } else netisr_stats(); xo_finish(); exit(0); } #if 0 /* * Keep file descriptors open to avoid overhead * of open/close on each call to get* routines. */ sethostent(1); setnetent(1); #else /* * This does not make sense any more with DNS being default over * the files. Doing a setXXXXent(1) causes a tcp connection to be * used for the queries, which is slower. */ #endif if (iflag && !sflag) { xo_open_container("statistics"); intpr(NULL, af); xo_close_container("statistics"); xo_finish(); exit(0); } if (rflag) { xo_open_container("statistics"); if (sflag) { if (live) { kresolve_list(nl); } rt_stats(); } else routepr(fib, af); xo_close_container("statistics"); xo_finish(); exit(0); } if (oflag) { xo_open_container("statistics"); nhops_print(fib, af); xo_close_container("statistics"); xo_finish(); exit(0); } if (Oflag) { xo_open_container("statistics"); nhgrp_print(fib, af); xo_close_container("statistics"); xo_finish(); exit(0); } if (gflag) { xo_open_container("statistics"); if (sflag) { if (af == AF_INET || af == AF_UNSPEC) mrt_stats(); #ifdef INET6 if (af == AF_INET6 || af == AF_UNSPEC) mrt6_stats(); #endif } else { if (af == AF_INET || af == AF_UNSPEC) mroutepr(); #ifdef INET6 if (af == AF_INET6 || af == AF_UNSPEC) mroute6pr(); #endif } xo_close_container("statistics"); xo_finish(); exit(0); } if (tp) { xo_open_container("statistics"); printproto(tp, tp->pr_name, &first); if (!first) xo_close_list("socket"); xo_close_container("statistics"); xo_finish(); exit(0); } xo_open_container("statistics"); if (af == AF_INET || af == AF_UNSPEC) for (tp = protox; tp->pr_name; tp++) printproto(tp, tp->pr_name, &first); #ifdef INET6 if (af == AF_INET6 || af == AF_UNSPEC) for (tp = ip6protox; tp->pr_name; tp++) printproto(tp, tp->pr_name, &first); #endif /*INET6*/ #ifdef IPSEC if (af == PF_KEY || af == AF_UNSPEC) for (tp = pfkeyprotox; tp->pr_name; tp++) printproto(tp, tp->pr_name, &first); #endif /*IPSEC*/ #ifdef NETGRAPH if (af == AF_NETGRAPH || af == AF_UNSPEC) for (tp = netgraphprotox; tp->pr_name; tp++) printproto(tp, tp->pr_name, &first); #endif /* NETGRAPH */ if ((af == AF_UNIX || af == AF_UNSPEC) && !sflag) unixpr(nl[N_UNP_COUNT].n_value, nl[N_UNP_GENCNT].n_value, nl[N_UNP_DHEAD].n_value, nl[N_UNP_SHEAD].n_value, nl[N_UNP_SPHEAD].n_value, &first); if (!first) xo_close_list("socket"); xo_close_container("statistics"); xo_finish(); exit(0); } static int fetch_stats_internal(const char *sysctlname, u_long off, void *stats, size_t len, kreadfn_t kreadfn, int zero) { int error; if (live) { memset(stats, 0, len); if (zero) error = sysctlbyname(sysctlname, NULL, NULL, stats, len); else error = sysctlbyname(sysctlname, stats, &len, NULL, 0); if (error == -1 && errno != ENOENT) xo_warn("sysctl %s", sysctlname); } else { if (off == 0) return (1); error = kreadfn(off, stats, len); } return (error); } int fetch_stats(const char *sysctlname, u_long off, void *stats, size_t len, kreadfn_t kreadfn) { return (fetch_stats_internal(sysctlname, off, stats, len, kreadfn, zflag)); } int fetch_stats_ro(const char *sysctlname, u_long off, void *stats, size_t len, kreadfn_t kreadfn) { return (fetch_stats_internal(sysctlname, off, stats, len, kreadfn, 0)); } /* * Print out protocol statistics or control blocks (per sflag). * If the interface was not specifically requested, and the symbol * is not in the namelist, ignore this one. */ static void printproto(struct protox *tp, const char *name, bool *first) { void (*pr)(u_long, const char *, int, int); u_long off; bool doingdblocks = false; if (sflag) { if (iflag) { if (tp->pr_istats) intpr(tp->pr_istats, af); else if (pflag) xo_message("%s: no per-interface stats routine", tp->pr_name); return; } else { pr = tp->pr_stats; if (!pr) { if (pflag) xo_message("%s: no stats routine", tp->pr_name); return; } if (tp->pr_usesysctl && live) off = 0; else if (tp->pr_sindex < 0) { if (pflag) xo_message("%s: stats routine doesn't " "work on cores", tp->pr_name); return; } else off = nl[tp->pr_sindex].n_value; } } else { doingdblocks = true; pr = tp->pr_cblocks; if (!pr) { if (pflag) xo_message("%s: no PCB routine", tp->pr_name); return; } if (tp->pr_usesysctl && live) off = 0; else if (tp->pr_index < 0) { if (pflag) xo_message("%s: PCB routine doesn't work on " "cores", tp->pr_name); return; } else off = nl[tp->pr_index].n_value; } if (pr != NULL && (off || (live && tp->pr_usesysctl) || af != AF_UNSPEC)) { if (doingdblocks && *first) { xo_open_list("socket"); *first = false; } (*pr)(off, name, af, tp->pr_protocol); } } static int kvmd_init(void) { char errbuf[_POSIX2_LINE_MAX]; if (kvmd != NULL) return (0); kvmd = kvm_openfiles(nlistf, memf, NULL, O_RDONLY, errbuf); if (setgid(getgid()) != 0) xo_err(-1, "setgid"); if (kvmd == NULL) { xo_warnx("kvm not available: %s", errbuf); return (-1); } return (0); } /* * Resolve symbol list, return 0 on success. */ static int kresolve_list(struct nlist *_nl) { if ((kvmd == NULL) && (kvmd_init() != 0)) return (-1); if (_nl[0].n_type != 0) return (0); if (kvm_nlist(kvmd, _nl) < 0) { if (nlistf) xo_errx(1, "%s: kvm_nlist: %s", nlistf, kvm_geterr(kvmd)); else xo_errx(1, "kvm_nlist: %s", kvm_geterr(kvmd)); } return (0); } /* * Wrapper of kvm_dpcpu_setcpu(). */ void kset_dpcpu(u_int cpuid) { if ((kvmd == NULL) && (kvmd_init() != 0)) xo_errx(-1, "%s: kvm is not available", __func__); if (kvm_dpcpu_setcpu(kvmd, cpuid) < 0) xo_errx(-1, "%s: kvm_dpcpu_setcpu(%u): %s", __func__, cpuid, kvm_geterr(kvmd)); return; } /* * Read kernel memory, return 0 on success. */ int kread(u_long addr, void *buf, size_t size) { if (kvmd_init() < 0) return (-1); if (!buf) return (0); if (kvm_read(kvmd, addr, buf, size) != (ssize_t)size) { xo_warnx("%s", kvm_geterr(kvmd)); return (-1); } return (0); } /* * Read single counter(9). */ uint64_t kread_counter(u_long addr) { if (kvmd_init() < 0) return (-1); return (kvm_counter_u64_fetch(kvmd, addr)); } /* * Read an array of N counters in kernel memory into array of N uint64_t's. */ int kread_counters(u_long addr, void *buf, size_t size) { uint64_t *c; u_long *counters; size_t i, n; if (kvmd_init() < 0) return (-1); if (size % sizeof(uint64_t) != 0) { xo_warnx("kread_counters: invalid counter set size"); return (-1); } n = size / sizeof(uint64_t); if ((counters = malloc(n * sizeof(u_long))) == NULL) xo_err(-1, "malloc"); if (kread(addr, counters, n * sizeof(u_long)) < 0) { free(counters); return (-1); } c = buf; for (i = 0; i < n; i++) c[i] = kvm_counter_u64_fetch(kvmd, counters[i]); free(counters); return (0); } const char * plural(uintmax_t n) { return (n != 1 ? "s" : ""); } const char * plurales(uintmax_t n) { return (n != 1 ? "es" : ""); } const char * pluralies(uintmax_t n) { return (n != 1 ? "ies" : "y"); } /* * Find the protox for the given "well-known" name. */ static struct protox * knownname(const char *name) { struct protox **tpp, *tp; for (tpp = protoprotox; *tpp; tpp++) for (tp = *tpp; tp->pr_name; tp++) if (strcmp(tp->pr_name, name) == 0) return (tp); return (NULL); } /* * Find the protox corresponding to name. */ static struct protox * name2protox(const char *name) { struct protox *tp; char **alias; /* alias from p->aliases */ struct protoent *p; /* * Try to find the name in the list of "well-known" names. If that * fails, check if name is an alias for an Internet protocol. */ if ((tp = knownname(name)) != NULL) return (tp); setprotoent(1); /* make protocol lookup cheaper */ while ((p = getprotoent()) != NULL) { /* assert: name not same as p->name */ for (alias = p->p_aliases; *alias; alias++) if (strcmp(name, *alias) == 0) { endprotoent(); return (knownname(p->p_name)); } } endprotoent(); return (NULL); } static void usage(void) { (void)xo_error("%s\n%s\n%s\n%s\n%s\n%s\n%s\n%s\n%s\n%s\n%s\n%s\n", "usage: netstat [-46AaCcLnRSTWx] [-f protocol_family | -p protocol]\n" " [-M core] [-N system]", " netstat -i | -I interface [-46abdhnW] [-f address_family]\n" " [-M core] [-N system]", " netstat -w wait [-I interface] [-46d] [-M core] [-N system]\n" " [-q howmany]", " netstat -s [-46sz] [-f protocol_family | -p protocol]\n" " [-M core] [-N system]", " netstat -i | -I interface -s [-46s]\n" " [-f protocol_family | -p protocol] [-M core] [-N system]", " netstat -m [-M core] [-N system]", " netstat -B [-z] [-I interface]", " netstat -r [-46AnW] [-F fibnum] [-f address_family]\n" " [-M core] [-N system]", " netstat -rs [-s] [-M core] [-N system]", " netstat -g [-46W] [-f address_family] [-M core] [-N system]", " netstat -gs [-46s] [-f address_family] [-M core] [-N system]", " netstat -Q"); xo_finish(); exit(1); }