diff --git a/lib/libc/sys/socket.2 b/lib/libc/sys/socket.2
index 8ced1f0ba930..1eceabbf6fd4 100644
--- a/lib/libc/sys/socket.2
+++ b/lib/libc/sys/socket.2
@@ -1,349 +1,351 @@
 .\" Copyright (c) 1983, 1991, 1993
 .\"	The Regents of the University of California.  All rights reserved.
 .\"
 .\" Redistribution and use in source and binary forms, with or without
 .\" modification, are permitted provided that the following conditions
 .\" are met:
 .\" 1. Redistributions of source code must retain the above copyright
 .\"    notice, this list of conditions and the following disclaimer.
 .\" 2. Redistributions in binary form must reproduce the above copyright
 .\"    notice, this list of conditions and the following disclaimer in the
 .\"    documentation and/or other materials provided with the distribution.
 .\" 3. Neither the name of the University nor the names of its contributors
 .\"    may be used to endorse or promote products derived from this software
 .\"    without specific prior written permission.
 .\"
 .\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 .\" ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 .\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 .\" SUCH DAMAGE.
 .\"
 .\"     From: @(#)socket.2	8.1 (Berkeley) 6/4/93
 .\" $FreeBSD$
 .\"
-.Dd August 26, 2022
+.Dd August 30, 2022
 .Dt SOCKET 2
 .Os
 .Sh NAME
 .Nm socket
 .Nd create an endpoint for communication
 .Sh LIBRARY
 .Lb libc
 .Sh SYNOPSIS
 .In sys/socket.h
 .Ft int
 .Fn socket "int domain" "int type" "int protocol"
 .Sh DESCRIPTION
 The
 .Fn socket
 system call
 creates an endpoint for communication and returns a descriptor.
 .Pp
 The
 .Fa domain
 argument specifies a communications domain within which
 communication will take place; this selects the protocol family
 which should be used.
 These families are defined in the include file
 .In sys/socket.h .
 The currently understood formats are:
 .Pp
 .Bd -literal -offset indent -compact
 PF_LOCAL	Host-internal protocols (alias for PF_UNIX),
 PF_UNIX		Host-internal protocols,
 PF_INET		Internet version 4 protocols,
 PF_INET6	Internet version 6 protocols,
+PF_DIVERT	Firewall packet diversion/re-injection,
 PF_ROUTE	Internal routing protocol,
 PF_KEY		Internal key-management function,
 PF_NETGRAPH	Netgraph sockets,
 PF_BLUETOOTH	Bluetooth protocols,
 PF_INET_SDP	OFED socket direct protocol (IPv4),
 AF_HYPERV	HyperV sockets
 .Ed
 .Pp
 Each protocol family is connected to an address family, which has the
 same name except that the prefix is
 .Dq Dv AF_
 in place of
 .Dq Dv PF_ .
 Other protocol families may be also defined, beginning with
 .Dq Dv PF_ ,
 with corresponding address families.
 .Pp
 The socket has the indicated
 .Fa type ,
 which specifies the semantics of communication.
 Currently
 defined types are:
 .Pp
 .Bd -literal -offset indent -compact
 SOCK_STREAM	Stream socket,
 SOCK_DGRAM	Datagram socket,
 SOCK_RAW	Raw-protocol interface,
 SOCK_SEQPACKET	Sequenced packet stream
 .Ed
 .Pp
 A
 .Dv SOCK_STREAM
 type provides sequenced, reliable,
 two-way connection based byte streams.
 An out-of-band data transmission mechanism may be supported.
 A
 .Dv SOCK_DGRAM
 socket supports
 datagrams (connectionless, unreliable messages of
 a fixed (typically small) maximum length).
 A
 .Dv SOCK_SEQPACKET
 socket may provide a sequenced, reliable,
 two-way connection-based data transmission path for datagrams
 of fixed maximum length; a consumer may be required to read
 an entire packet with each read system call.
 This facility may have protocol-specific properties.
 .Dv SOCK_RAW
 sockets provide access to internal network protocols and interfaces.
 The
 .Dv SOCK_RAW
 type is available only to the super-user and is described in
 .Xr ip 4
 and
 .Xr ip6 4 .
 .Pp
 Additionally, the following flags are allowed in the
 .Fa type
 argument:
 .Pp
 .Bd -literal -offset indent -compact
 SOCK_CLOEXEC	Set close-on-exec on the new descriptor,
 SOCK_NONBLOCK	Set non-blocking mode on the new socket
 .Ed
 .Pp
 The
 .Fa protocol
 argument
 specifies a particular protocol to be used with the socket.
 Normally only a single protocol exists to support a particular
 socket type within a given protocol family.
 However, it is possible
 that many protocols may exist, in which case a particular protocol
 must be specified in this manner.
 The protocol number to use is
 particular to the
 .Dq "communication domain"
 in which communication
 is to take place; see
 .Xr protocols 5 .
 .Pp
 The
 .Fa protocol
 argument may be set to zero (0) to request the default
 implementation of a socket type for the protocol, if any.
 .Pp
 Sockets of type
 .Dv SOCK_STREAM
 are full-duplex byte streams, similar
 to pipes.
 A stream socket must be in a
 .Em connected
 state before any data may be sent or received
 on it.
 A connection to another socket is created with a
 .Xr connect 2
 system call.
 Once connected, data may be transferred using
 .Xr read 2
 and
 .Xr write 2
 calls or some variant of the
 .Xr send 2
 and
 .Xr recv 2
 functions.
 (Some protocol families, such as the Internet family,
 support the notion of an
 .Dq implied connect ,
 which permits data to be sent piggybacked onto a connect operation by
 using the
 .Xr sendto 2
 system call.)
 When a session has been completed a
 .Xr close 2
 may be performed.
 Out-of-band data may also be transmitted as described in
 .Xr send 2
 and received as described in
 .Xr recv 2 .
 .Pp
 The communications protocols used to implement a
 .Dv SOCK_STREAM
 ensure that data
 is not lost or duplicated.
 If a piece of data for which the
 peer protocol has buffer space cannot be successfully transmitted
 within a reasonable length of time, then
 the connection is considered broken and calls
 will indicate an error with
 -1 returns and with
 .Er ETIMEDOUT
 as the specific code
 in the global variable
 .Va errno .
 The protocols optionally keep sockets
 .Dq warm
 by forcing transmissions
 roughly every minute in the absence of other activity.
 An error is then indicated if no response can be
 elicited on an otherwise
 idle connection for an extended period (e.g.\& 5 minutes).
 By default, a
 .Dv SIGPIPE
 signal is raised if a process sends
 on a broken stream, but this behavior may be inhibited via
 .Xr setsockopt 2 .
 .Pp
 .Dv SOCK_SEQPACKET
 sockets employ the same system calls
 as
 .Dv SOCK_STREAM
 sockets.
 The only difference
 is that
 .Xr read 2
 calls will return only the amount of data requested,
 and any remaining in the arriving packet will be discarded.
 .Pp
 .Dv SOCK_DGRAM
 and
 .Dv SOCK_RAW
 sockets allow sending of datagrams to correspondents
 named in
 .Xr send 2
 calls.
 Datagrams are generally received with
 .Xr recvfrom 2 ,
 which returns the next datagram with its return address.
 .Pp
 An
 .Xr fcntl 2
 system call can be used to specify a process group to receive
 a
 .Dv SIGURG
 signal when the out-of-band data arrives.
 It may also enable non-blocking I/O
 and asynchronous notification of I/O events
 via
 .Dv SIGIO .
 .Pp
 The operation of sockets is controlled by socket level
 .Em options .
 These options are defined in the file
 .In sys/socket.h .
 The
 .Xr setsockopt 2
 and
 .Xr getsockopt 2
 system calls are used to set and get options, respectively.
 .Sh RETURN VALUES
 A -1 is returned if an error occurs, otherwise the return
 value is a descriptor referencing the socket.
 .Sh ERRORS
 The
 .Fn socket
 system call fails if:
 .Bl -tag -width Er
 .It Bq Er EACCES
 Permission to create a socket of the specified type and/or protocol
 is denied.
 .It Bq Er EAFNOSUPPORT
 The address family (domain) is not supported or the
 specified domain is not supported by this protocol family.
 .It Bq Er EMFILE
 The per-process descriptor table is full.
 .It Bq Er ENFILE
 The system file table is full.
 .It Bq Er ENOBUFS
 Insufficient buffer space is available.
 The socket cannot be created until sufficient resources are freed.
 .It Bq Er EPERM
 User has insufficient privileges to carry out the requested operation.
 .It Bq Er EPROTONOSUPPORT
 The protocol type or the specified protocol is not supported
 within this domain.
 .It Bq Er EPROTOTYPE
 The socket type is not supported by the protocol.
 .El
 .Sh SEE ALSO
 .Xr accept 2 ,
 .Xr bind 2 ,
 .Xr connect 2 ,
+.Xr divert 4 ,
 .Xr getpeername 2 ,
 .Xr getsockname 2 ,
 .Xr getsockopt 2 ,
 .Xr ioctl 2 ,
 .Xr ip 4 ,
 .Xr ip6 4 ,
 .Xr listen 2 ,
 .Xr read 2 ,
 .Xr recv 2 ,
 .Xr select 2 ,
 .Xr send 2 ,
 .Xr shutdown 2 ,
 .Xr socketpair 2 ,
 .Xr write 2 ,
 .Xr CMSG_DATA 3 ,
 .Xr getprotoent 3 ,
 .Xr netgraph 4 ,
 .Xr protocols 5
 .Rs
 .%T "An Introductory 4.3 BSD Interprocess Communication Tutorial"
 .%B PS1
 .%N 7
 .Re
 .Rs
 .%T "BSD Interprocess Communication Tutorial"
 .%B PS1
 .%N 8
 .Re
 .Sh STANDARDS
 The
 .Fn socket
 function conforms to
 .St -p1003.1-2008 .
 The
 .Tn POSIX
 standard specifies only the
 .Dv AF_INET ,
 .Dv AF_INET6 ,
 and
 .Dv AF_UNIX
 constants for address families, and requires the use of
 .Dv AF_*
 constants for the
 .Fa domain
 argument of
 .Fn socket .
 The
 .Dv SOCK_CLOEXEC
 flag is expected to conform to the next revision of the
 .Tn POSIX
 standard.
 The
 .Dv SOCK_RDM
 .Fa type ,
 the
 .Dv PF_*
 constants, and other address families are
 .Fx
 extensions.
 .Sh HISTORY
 The
 .Fn socket
 system call appeared in
 .Bx 4.2 .
diff --git a/share/examples/netgraph/ngctl b/share/examples/netgraph/ngctl
index e7b7cd86b04f..8dc6b23815b7 100644
--- a/share/examples/netgraph/ngctl
+++ b/share/examples/netgraph/ngctl
@@ -1,173 +1,173 @@
 # $FreeBSD$
 
 #
 # This is an example that shows how to send ASCII formatted control
 # messages to a node using ngctl(8).
 #
 # What we will do here create a divert(4) tap.  This simply dumps
 # out all packets diverted by some ipfw(8) divert rule to the console.
 #
 # Lines that begin with ``$'' (shell prompt) or ``+'' (ngctl prompt)
 # indicate user input
 #
 
 # First, start up ngctl in interactive mode:
 
     $ ngctl
     Available commands:
       connect    Connects hook <peerhook> of the node at <relpath> to <hook>
       debug      Get/set debugging verbosity level
       help       Show command summary or get more help on a specific command
       list       Show information about all nodes
       mkpeer     Create and connect a new node to the node at "path"
       msg        Send a netgraph control message to the node at "path"
       name       Assign name <name> to the node at <path>
       read       Read and execute commands from a file
       rmhook     Disconnect hook "hook" of the node at "path"
       show       Show information about the node at <path>
       shutdown   Shutdown the node at <path>
       status     Get human readable status information from the node at <path>
       types      Show information about all installed node types
       quit       Exit program
     +
 
-# Now let's create a ng_ksocket(4) node, in the family PF_INET,
-# of type SOCK_RAW, and protocol IPPROTO_DIVERT:
+# Now let's create a ng_ksocket(4) node, in the family PF_DIVERT,
+# of type SOCK_RAW:
 
-    + mkpeer ksocket foo inet/raw/divert
+    + mkpeer ksocket foo divert/raw/0
 
 # Note that ``foo'' is the hook name on the socket node, which can be
 # anything.  The ``inet/raw/divert'' is the hook name on the ksocket
 # node, which tells it what kind of socket to create.
 
 # Lets give our ksocket node a global name.  How about ``fred'':
 
     + name foo fred
 
 # Note that we used ngctl's ``name'' command to do this.  However,
 # the following manually constructed netgraph message would have
 # accomplished the exact same thing:
 
     + msg foo name { name="fred" }
 
 # Here we are using the ASCII <-> binary control message conversion
 # routines.  ngctl does this for us automatically when we use the
 # ``msg'' command.
 
 # Now lets bind the socket associated with the ksocket node to a port
 # supplied by the system.  We do this by sending the ksocket node a
 # ``bind'' control message.  Again, ngctl does the conversion of the
 # control message from ASCII to binary behind the scenes.
 
     + msg fred: bind inet/192.168.1.1
 
 # The ksocket accepts arbitrary sockaddr structures, but also has
 # special support for the PF_LOCAL and PF_INET protocol families.
 # That is why we can specify the struct sockaddr argument to the
 # ``bind'' command as ``inet/192.168.1.1'' (since we didn't specify
 # a port number, it's assumed to be zero).  We could have also
 # relied on the generic sockaddr syntax and instead said this:
 
     + msg fred: bind { family=2 len=16 data=[ 2=192 168 1 1 ] }
 
 # This is what you would have to do for protocol families other
 # that PF_INET and PF_LOCAL, at least until special handling for
 # new ones is added.
 
 # The reason for the ``2=192'' is to skip the two byte IP port number,
 # which causes it to be set to zero, the default value for integral
 # types when parsing.  Now since we didn't ask for a specific port
 # number, we need to do a ``getname'' to see what port number we got:
 
     + msg fred: getname
     Rec'd response "getname" (5) from "fred:":
     Args:   inet/192.168.1.1:1029
 
 # As soon as we sent the message, we got back a response.  Here
 # ngctl is telling us that it received a control message with the
 # NGF_RESP (response) flag set, the response was to a prior ``getname''
 # control message, that the originator was the node addressable
 # as ``fred:''.  The message arguments field is then displayed to
 # us in its ASCII form.  In this case, what we get back is a struct
 # sockaddr, and there we see that our port number is 1029.
 
 # So now let's add the ipfw divert rule for whatever packets we
 # want to see.  How about anything from 192.168.1.129.
 
     + ^Z
     Suspended
     $ ipfw add 100 divert 1029 ip from 192.168.1.129 to any
     00100 divert 1029 ip from 192.168.1.129 to any
     $ fg
 
 # Now watch what happens when we try to ping from that machine:
 
     + 
     Rec'd data packet on hook "foo":
     0000:  45 00 00 3c 57 00 00 00 20 01 bf ee c0 a8 01 81  E..<W... .......
     0010:  c0 a8 01 01 08 00 49 5c 03 00 01 00 61 62 63 64  ......I\....abcd
     0020:  65 66 67 68 69 6a 6b 6c 6d 6e 6f 70 71 72 73 74  efghijklmnopqrst
     0030:  75 76 77 61 62 63 64 65 66 67 68 69              uvwabcdefghi
     + 
     Rec'd data packet on hook "foo":
     0000:  45 00 00 3c 58 00 00 00 20 01 be ee c0 a8 01 81  E..<X... .......
     0010:  c0 a8 01 01 08 00 48 5c 03 00 02 00 61 62 63 64  ......H\....abcd
     0020:  65 66 67 68 69 6a 6b 6c 6d 6e 6f 70 71 72 73 74  efghijklmnopqrst
     0030:  75 76 77 61 62 63 64 65 66 67 68 69              uvwabcdefghi
     + 
     Rec'd data packet on hook "foo":
     0000:  45 00 00 3c 59 00 00 00 20 01 bd ee c0 a8 01 81  E..<Y... .......
     0010:  c0 a8 01 01 08 00 47 5c 03 00 03 00 61 62 63 64  ......G\....abcd
     0020:  65 66 67 68 69 6a 6b 6c 6d 6e 6f 70 71 72 73 74  efghijklmnopqrst
     0030:  75 76 77 61 62 63 64 65 66 67 68 69              uvwabcdefghi
     +
 
 # So we're seeing the output from the ksocket socket appear on the ``foo''
 # hook of ngctl's socket node.  Since the packets are getting diverted,
 # the 192.168.1.129 machine doesn't see any response from us.
 
 # Of course, any type of socket can be used, even TCP:
 
     + mkpeer ksocket bar inet/stream/tcp
     + msg bar connect inet/192.168.1.33:13
     ngctl: send msg: Operation now in progress
     + 
     Rec'd data packet on hook "foo":
     0000:  4d 6f 6e 20 4e 6f 76 20 32 39 20 31 37 3a 34 38  Mon Nov 29 17:48
     0010:  3a 33 37 20 31 39 39 39 0d 0a                    :37 1999..
     +
 
 # Or, UNIX domain:
 
     + mkpeer ksocket bar local/stream/0
     + msg bar bind local/"/tmp/bar.socket"
     + 
 
 # Here's an example of a more complicated ASCII control message argument.
 # If you look in /sys/netgraph/ng_message.h, you will see that a node
 # responds to a NGM_LISTHOOKS with a struct hooklist, which contains
 # an array of struct linkinfo:
 #
 #     /* Structure used for NGM_LISTHOOKS */
 #     struct linkinfo {
 #             char            ourhook[NG_HOOKSIZ];        /* hook name */
 #             char            peerhook[NG_HOOKSIZ];       /* peer hook */
 #             struct nodeinfo nodeinfo;
 #     };
 #
 #     struct hooklist {
 #             struct nodeinfo nodeinfo;               /* node information */
 #             struct linkinfo link[0];                /* info about each hook */
 #     };
 #
 # By sending a node the ``listhooks'' command using ngctl, we can see
 # this structure in ASCII form (lines wrapped for readability):
 
     + msg bar bind local/"/tmp/bar.socket"
     + msg bar listhooks
     Rec'd response "listhooks" (7) from "bar":
     Args:   { nodeinfo={ type="ksocket" id=9 hooks=1 }
 	    linkinfo=[ { ourhook="local/stream/0" peerhook="bar"
 	    nodeinfo={ name="ngctl1327" type="socket" id=8 hooks=1 } } ] }
 
 
diff --git a/share/man/man4/divert.4 b/share/man/man4/divert.4
index d8296995ca97..cfe1a31486c9 100644
--- a/share/man/man4/divert.4
+++ b/share/man/man4/divert.4
@@ -1,192 +1,200 @@
 .\" $FreeBSD$
 .\"
-.Dd December 17, 2004
+.Dd August 30, 2022
 .Dt DIVERT 4
 .Os
 .Sh NAME
 .Nm divert
 .Nd kernel packet diversion mechanism
 .Sh SYNOPSIS
 .In sys/types.h
 .In sys/socket.h
 .In netinet/in.h
 .Ft int
-.Fn socket PF_INET SOCK_RAW IPPROTO_DIVERT
+.Fn socket PF_DIVERT SOCK_RAW 0
 .Pp
 To enable support for divert sockets, place the following lines in the
 kernel configuration file:
 .Bd -ragged -offset indent
 .Cd "options IPFIREWALL"
 .Cd "options IPDIVERT"
 .Ed
 .Pp
 Alternatively, to load
 the driver
 as a module at boot time, add the following lines into the
 .Xr loader.conf 5
 file:
 .Bd -literal -offset indent
 ipfw_load="YES"
 ipdivert_load="YES"
 .Ed
 .Sh DESCRIPTION
-Divert sockets are similar to raw IP sockets, except that they
-can be bound to a specific
+Divert sockets allow to intercept and re-inject packets flowing through
+the
+.Xr ipfw 4
+firewall.
+A divert socket can be bound to a specific
 .Nm
 port via the
 .Xr bind 2
 system call.
-The IP address in the bind is ignored; only the port
-number is significant.
+The sockaddr argument shall be sockaddr_in with sin_port set to the
+desired value.
+Note that the
+.Nm
+port has nothing to do with TCP/UDP ports.
+It is just a cookie number, that allows to differentiate between different
+divert points in the
+.Xr ipfw 4
+ruleset.
 A divert socket bound to a divert port will receive all packets diverted
-to that port by some (here unspecified) kernel mechanism(s).
-Packets may also be written to a divert port, in which case they
-re-enter kernel IP packet processing.
+to that port by
+.Xr ipfw 4 .
+Packets may also be written to a divert port, in which case they re-enter
+firewall processing at the next rule.
 .Pp
-Divert sockets are normally used in conjunction with
-.Fx Ns 's
-packet filtering implementation and the
-.Xr ipfw 8
-program.
 By reading from and writing to a divert socket, matching packets
 can be passed through an arbitrary ``filter'' as they travel through
 the host machine, special routing tricks can be done, etc.
 .Sh READING PACKETS
 Packets are diverted either as they are ``incoming'' or ``outgoing.''
 Incoming packets are diverted after reception on an IP interface,
 whereas outgoing packets are diverted before next hop forwarding.
 .Pp
 Diverted packets may be read unaltered via
 .Xr read 2 ,
 .Xr recv 2 ,
 or
 .Xr recvfrom 2 .
 In the latter case, the address returned will have its port set to
 some tag supplied by the packet diverter, (usually the ipfw rule number)
 and the IP address set to the (first) address of
 the interface on which the packet was received (if the packet
 was incoming) or
 .Dv INADDR_ANY
 (if the packet was outgoing).
 The interface name (if defined
 for the packet) will be placed in the 8 bytes following the address,
 if it fits.
 .Sh WRITING PACKETS
 Writing to a divert socket is similar to writing to a raw IP socket;
 the packet is injected ``as is'' into the normal kernel IP packet
 processing using
 .Xr sendto 2
 and minimal error checking is done.
 Packets are distinguished as either incoming or outgoing.
 If
 .Xr sendto 2
 is used with a destination IP address of
 .Dv INADDR_ANY ,
 then the packet is treated as if it were outgoing, i.e., destined
 for a non-local address.
 Otherwise, the packet is assumed to be
 incoming and full packet routing is done.
 .Pp
 In the latter case, the
 IP address specified must match the address of some local interface,
 or an interface name
 must be found after the IP address.
 If an interface name is found,
 that interface will be used and the value of the IP address will be
 ignored (other than the fact that it is not
 .Dv INADDR_ANY ) .
 This is to indicate on which interface the packet
 .Dq arrived .
 .Pp
 Normally, packets read as incoming should be written as incoming;
 similarly for outgoing packets.
 When reading and then writing back
 packets, passing the same socket address supplied by
 .Xr recvfrom 2
 unmodified to
 .Xr sendto 2
 simplifies things (see below).
 .Pp
 The port part of the socket address passed to the
 .Xr sendto 2
 contains a tag that should be meaningful to the diversion module.
 In the
 case of
 .Xr ipfw 8
 the tag is interpreted as the rule number
 .Em after which
 rule processing should restart.
 .Sh LOOP AVOIDANCE
 Packets written into a divert socket
 (using
 .Xr sendto 2 )
 re-enter the packet filter at the rule number
 following the tag given in the port part of the socket address, which
 is usually already set at the rule number that caused the diversion
 (not the next rule if there are several at the same number).
 If the 'tag'
 is altered to indicate an alternative re-entry point, care should be taken
 to avoid loops, where the same packet is diverted more than once at the
 same rule.
 .Sh DETAILS
 If a packet is diverted but no socket is bound to the
 port, or if
 .Dv IPDIVERT
 is not enabled or loaded in the kernel, the packet is dropped.
 .Pp
 Incoming packet fragments which get diverted are fully reassembled
 before delivery; the diversion of any one fragment causes the entire
 packet to get diverted.
 If different fragments divert to different ports,
 then which port ultimately gets chosen is unpredictable.
 .Pp
 Note that packets arriving on the divert socket by the
 .Xr ipfw 8
 .Cm tee
 action are delivered as-is and packet fragments do not get reassembled
 in this case.
 .Pp
 Packets are received and sent unchanged, except that
 packets read as outgoing have invalid IP header checksums, and
 packets written as outgoing have their IP header checksums overwritten
 with the correct value.
 Packets written as incoming and having incorrect checksums will be dropped.
 Otherwise, all header fields are unchanged (and therefore in network order).
 .Pp
 Binding to port numbers less than 1024 requires super-user access, as does
-creating a socket of type SOCK_RAW.
+creating a
+.Nm
+socket.
 .Sh ERRORS
 Writing to a divert socket can return these errors, along with
 the usual errors possible when writing raw packets:
 .Bl -tag -width Er
 .It Bq Er EINVAL
 The packet had an invalid header, or the IP options in the packet
 and the socket options set were incompatible.
 .It Bq Er EADDRNOTAVAIL
 The destination address contained an IP address not equal to
 .Dv INADDR_ANY
 that was not associated with any interface.
 .El
 .Sh SEE ALSO
 .Xr bind 2 ,
 .Xr recvfrom 2 ,
 .Xr sendto 2 ,
 .Xr socket 2 ,
 .Xr ipfw 4 ,
 .Xr ipfw 8
 .Sh AUTHORS
 .An Archie Cobbs Aq Mt archie@FreeBSD.org ,
 Whistle Communications Corp.
 .Sh BUGS
 This is an attempt to provide a clean way for user mode processes
 to implement various IP tricks like address translation, but it
 could be cleaner, and it is too dependent on
 .Xr ipfw 8 .
 .Pp
 It is questionable whether incoming fragments should be reassembled
 before being diverted.
 For example, if only some fragments of a
 packet destined for another machine do not get routed through the
 local machine, the packet is lost.
 This should probably be
 a settable socket option in any case.
diff --git a/sys/kern/uipc_socket.c b/sys/kern/uipc_socket.c
index bf22c0245f24..1bc172eacd89 100644
--- a/sys/kern/uipc_socket.c
+++ b/sys/kern/uipc_socket.c
@@ -1,4319 +1,4330 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1982, 1986, 1988, 1990, 1993
  *	The Regents of the University of California.
  * Copyright (c) 2004 The FreeBSD Foundation
  * Copyright (c) 2004-2008 Robert N. M. Watson
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
  */
 
 /*
  * Comments on the socket life cycle:
  *
  * soalloc() sets of socket layer state for a socket, called only by
  * socreate() and sonewconn().  Socket layer private.
  *
  * sodealloc() tears down socket layer state for a socket, called only by
  * sofree() and sonewconn().  Socket layer private.
  *
  * pru_attach() associates protocol layer state with an allocated socket;
  * called only once, may fail, aborting socket allocation.  This is called
  * from socreate() and sonewconn().  Socket layer private.
  *
  * pru_detach() disassociates protocol layer state from an attached socket,
  * and will be called exactly once for sockets in which pru_attach() has
  * been successfully called.  If pru_attach() returned an error,
  * pru_detach() will not be called.  Socket layer private.
  *
  * pru_abort() and pru_close() notify the protocol layer that the last
  * consumer of a socket is starting to tear down the socket, and that the
  * protocol should terminate the connection.  Historically, pru_abort() also
  * detached protocol state from the socket state, but this is no longer the
  * case.
  *
  * socreate() creates a socket and attaches protocol state.  This is a public
  * interface that may be used by socket layer consumers to create new
  * sockets.
  *
  * sonewconn() creates a socket and attaches protocol state.  This is a
  * public interface  that may be used by protocols to create new sockets when
  * a new connection is received and will be available for accept() on a
  * listen socket.
  *
  * soclose() destroys a socket after possibly waiting for it to disconnect.
  * This is a public interface that socket consumers should use to close and
  * release a socket when done with it.
  *
  * soabort() destroys a socket without waiting for it to disconnect (used
  * only for incoming connections that are already partially or fully
  * connected).  This is used internally by the socket layer when clearing
  * listen socket queues (due to overflow or close on the listen socket), but
  * is also a public interface protocols may use to abort connections in
  * their incomplete listen queues should they no longer be required.  Sockets
  * placed in completed connection listen queues should not be aborted for
  * reasons described in the comment above the soclose() implementation.  This
  * is not a general purpose close routine, and except in the specific
  * circumstances described here, should not be used.
  *
  * sofree() will free a socket and its protocol state if all references on
  * the socket have been released, and is the public interface to attempt to
  * free a socket when a reference is removed.  This is a socket layer private
  * interface.
  *
  * NOTE: In addition to socreate() and soclose(), which provide a single
  * socket reference to the consumer to be managed as required, there are two
  * calls to explicitly manage socket references, soref(), and sorele().
  * Currently, these are generally required only when transitioning a socket
  * from a listen queue to a file descriptor, in order to prevent garbage
  * collection of the socket at an untimely moment.  For a number of reasons,
  * these interfaces are not preferred, and should be avoided.
  *
  * NOTE: With regard to VNETs the general rule is that callers do not set
  * curvnet. Exceptions to this rule include soabort(), sodisconnect(),
  * sofree() (and with that sorele(), sotryfree()), as well as sonewconn()
  * and sorflush(), which are usually called from a pre-set VNET context.
  * sopoll() currently does not need a VNET context to be set.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_kern_tls.h"
 #include "opt_sctp.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/capsicum.h>
 #include <sys/fcntl.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/mac.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/mutex.h>
 #include <sys/domain.h>
 #include <sys/file.h>			/* for struct knote */
 #include <sys/hhook.h>
 #include <sys/kernel.h>
 #include <sys/khelp.h>
 #include <sys/ktls.h>
 #include <sys/event.h>
 #include <sys/eventhandler.h>
 #include <sys/poll.h>
 #include <sys/proc.h>
 #include <sys/protosw.h>
 #include <sys/sbuf.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/resourcevar.h>
 #include <net/route.h>
 #include <sys/signalvar.h>
 #include <sys/stat.h>
 #include <sys/sx.h>
 #include <sys/sysctl.h>
 #include <sys/taskqueue.h>
 #include <sys/uio.h>
 #include <sys/un.h>
 #include <sys/unpcb.h>
 #include <sys/jail.h>
 #include <sys/syslog.h>
 #include <netinet/in.h>
 #include <netinet/in_pcb.h>
 #include <netinet/tcp.h>
 
 #include <net/vnet.h>
 
 #include <security/mac/mac_framework.h>
 
 #include <vm/uma.h>
 
 #ifdef COMPAT_FREEBSD32
 #include <sys/mount.h>
 #include <sys/sysent.h>
 #include <compat/freebsd32/freebsd32.h>
 #endif
 
 static int	soreceive_rcvoob(struct socket *so, struct uio *uio,
 		    int flags);
 static void	so_rdknl_lock(void *);
 static void	so_rdknl_unlock(void *);
 static void	so_rdknl_assert_lock(void *, int);
 static void	so_wrknl_lock(void *);
 static void	so_wrknl_unlock(void *);
 static void	so_wrknl_assert_lock(void *, int);
 
 static void	filt_sordetach(struct knote *kn);
 static int	filt_soread(struct knote *kn, long hint);
 static void	filt_sowdetach(struct knote *kn);
 static int	filt_sowrite(struct knote *kn, long hint);
 static int	filt_soempty(struct knote *kn, long hint);
 static int inline hhook_run_socket(struct socket *so, void *hctx, int32_t h_id);
 fo_kqfilter_t	soo_kqfilter;
 
 static struct filterops soread_filtops = {
 	.f_isfd = 1,
 	.f_detach = filt_sordetach,
 	.f_event = filt_soread,
 };
 static struct filterops sowrite_filtops = {
 	.f_isfd = 1,
 	.f_detach = filt_sowdetach,
 	.f_event = filt_sowrite,
 };
 static struct filterops soempty_filtops = {
 	.f_isfd = 1,
 	.f_detach = filt_sowdetach,
 	.f_event = filt_soempty,
 };
 
 so_gen_t	so_gencnt;	/* generation count for sockets */
 
 MALLOC_DEFINE(M_SONAME, "soname", "socket name");
 MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");
 
 #define	VNET_SO_ASSERT(so)						\
 	VNET_ASSERT(curvnet != NULL,					\
 	    ("%s:%d curvnet is NULL, so=%p", __func__, __LINE__, (so)));
 
 VNET_DEFINE(struct hhook_head *, socket_hhh[HHOOK_SOCKET_LAST + 1]);
 #define	V_socket_hhh		VNET(socket_hhh)
 
 /*
  * Limit on the number of connections in the listen queue waiting
  * for accept(2).
  * NB: The original sysctl somaxconn is still available but hidden
  * to prevent confusion about the actual purpose of this number.
  */
 static u_int somaxconn = SOMAXCONN;
 
 static int
 sysctl_somaxconn(SYSCTL_HANDLER_ARGS)
 {
 	int error;
 	int val;
 
 	val = somaxconn;
 	error = sysctl_handle_int(oidp, &val, 0, req);
 	if (error || !req->newptr )
 		return (error);
 
 	/*
 	 * The purpose of the UINT_MAX / 3 limit, is so that the formula
 	 *   3 * so_qlimit / 2
 	 * below, will not overflow.
          */
 
 	if (val < 1 || val > UINT_MAX / 3)
 		return (EINVAL);
 
 	somaxconn = val;
 	return (0);
 }
 SYSCTL_PROC(_kern_ipc, OID_AUTO, soacceptqueue,
     CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, sizeof(int),
     sysctl_somaxconn, "I",
     "Maximum listen socket pending connection accept queue size");
 SYSCTL_PROC(_kern_ipc, KIPC_SOMAXCONN, somaxconn,
     CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_SKIP | CTLFLAG_MPSAFE, 0,
     sizeof(int), sysctl_somaxconn, "I",
     "Maximum listen socket pending connection accept queue size (compat)");
 
 static int numopensockets;
 SYSCTL_INT(_kern_ipc, OID_AUTO, numopensockets, CTLFLAG_RD,
     &numopensockets, 0, "Number of open sockets");
 
 /*
  * accept_mtx locks down per-socket fields relating to accept queues.  See
  * socketvar.h for an annotation of the protected fields of struct socket.
  */
 struct mtx accept_mtx;
 MTX_SYSINIT(accept_mtx, &accept_mtx, "accept", MTX_DEF);
 
 /*
  * so_global_mtx protects so_gencnt, numopensockets, and the per-socket
  * so_gencnt field.
  */
 static struct mtx so_global_mtx;
 MTX_SYSINIT(so_global_mtx, &so_global_mtx, "so_glabel", MTX_DEF);
 
 /*
  * General IPC sysctl name space, used by sockets and a variety of other IPC
  * types.
  */
 SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "IPC");
 
 /*
  * Initialize the socket subsystem and set up the socket
  * memory allocator.
  */
 static uma_zone_t socket_zone;
 int	maxsockets;
 
 static void
 socket_zone_change(void *tag)
 {
 
 	maxsockets = uma_zone_set_max(socket_zone, maxsockets);
 }
 
 static void
 socket_hhook_register(int subtype)
 {
 
 	if (hhook_head_register(HHOOK_TYPE_SOCKET, subtype,
 	    &V_socket_hhh[subtype],
 	    HHOOK_NOWAIT|HHOOK_HEADISINVNET) != 0)
 		printf("%s: WARNING: unable to register hook\n", __func__);
 }
 
 static void
 socket_hhook_deregister(int subtype)
 {
 
 	if (hhook_head_deregister(V_socket_hhh[subtype]) != 0)
 		printf("%s: WARNING: unable to deregister hook\n", __func__);
 }
 
 static void
 socket_init(void *tag)
 {
 
 	socket_zone = uma_zcreate("socket", sizeof(struct socket), NULL, NULL,
 	    NULL, NULL, UMA_ALIGN_PTR, 0);
 	maxsockets = uma_zone_set_max(socket_zone, maxsockets);
 	uma_zone_set_warning(socket_zone, "kern.ipc.maxsockets limit reached");
 	EVENTHANDLER_REGISTER(maxsockets_change, socket_zone_change, NULL,
 	    EVENTHANDLER_PRI_FIRST);
 }
 SYSINIT(socket, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, socket_init, NULL);
 
 static void
 socket_vnet_init(const void *unused __unused)
 {
 	int i;
 
 	/* We expect a contiguous range */
 	for (i = 0; i <= HHOOK_SOCKET_LAST; i++)
 		socket_hhook_register(i);
 }
 VNET_SYSINIT(socket_vnet_init, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY,
     socket_vnet_init, NULL);
 
 static void
 socket_vnet_uninit(const void *unused __unused)
 {
 	int i;
 
 	for (i = 0; i <= HHOOK_SOCKET_LAST; i++)
 		socket_hhook_deregister(i);
 }
 VNET_SYSUNINIT(socket_vnet_uninit, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY,
     socket_vnet_uninit, NULL);
 
 /*
  * Initialise maxsockets.  This SYSINIT must be run after
  * tunable_mbinit().
  */
 static void
 init_maxsockets(void *ignored)
 {
 
 	TUNABLE_INT_FETCH("kern.ipc.maxsockets", &maxsockets);
 	maxsockets = imax(maxsockets, maxfiles);
 }
 SYSINIT(param, SI_SUB_TUNABLES, SI_ORDER_ANY, init_maxsockets, NULL);
 
 /*
  * Sysctl to get and set the maximum global sockets limit.  Notify protocols
  * of the change so that they can update their dependent limits as required.
  */
 static int
 sysctl_maxsockets(SYSCTL_HANDLER_ARGS)
 {
 	int error, newmaxsockets;
 
 	newmaxsockets = maxsockets;
 	error = sysctl_handle_int(oidp, &newmaxsockets, 0, req);
 	if (error == 0 && req->newptr && newmaxsockets != maxsockets) {
 		if (newmaxsockets > maxsockets &&
 		    newmaxsockets <= maxfiles) {
 			maxsockets = newmaxsockets;
 			EVENTHANDLER_INVOKE(maxsockets_change);
 		} else
 			error = EINVAL;
 	}
 	return (error);
 }
 SYSCTL_PROC(_kern_ipc, OID_AUTO, maxsockets,
     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, &maxsockets, 0,
     sysctl_maxsockets, "IU",
     "Maximum number of sockets available");
 
 /*
  * Socket operation routines.  These routines are called by the routines in
  * sys_socket.c or from a system process, and implement the semantics of
  * socket operations by switching out to the protocol specific routines.
  */
 
 /*
  * Get a socket structure from our zone, and initialize it.  Note that it
  * would probably be better to allocate socket and PCB at the same time, but
  * I'm not convinced that all the protocols can be easily modified to do
  * this.
  *
  * soalloc() returns a socket with a ref count of 0.
  */
 static struct socket *
 soalloc(struct vnet *vnet)
 {
 	struct socket *so;
 
 	so = uma_zalloc(socket_zone, M_NOWAIT | M_ZERO);
 	if (so == NULL)
 		return (NULL);
 #ifdef MAC
 	if (mac_socket_init(so, M_NOWAIT) != 0) {
 		uma_zfree(socket_zone, so);
 		return (NULL);
 	}
 #endif
 	if (khelp_init_osd(HELPER_CLASS_SOCKET, &so->osd)) {
 		uma_zfree(socket_zone, so);
 		return (NULL);
 	}
 
 	/*
 	 * The socket locking protocol allows to lock 2 sockets at a time,
 	 * however, the first one must be a listening socket.  WITNESS lacks
 	 * a feature to change class of an existing lock, so we use DUPOK.
 	 */
 	mtx_init(&so->so_lock, "socket", NULL, MTX_DEF | MTX_DUPOK);
 	mtx_init(&so->so_snd_mtx, "so_snd", NULL, MTX_DEF);
 	mtx_init(&so->so_rcv_mtx, "so_rcv", NULL, MTX_DEF);
 	so->so_rcv.sb_sel = &so->so_rdsel;
 	so->so_snd.sb_sel = &so->so_wrsel;
 	sx_init(&so->so_snd_sx, "so_snd_sx");
 	sx_init(&so->so_rcv_sx, "so_rcv_sx");
 	TAILQ_INIT(&so->so_snd.sb_aiojobq);
 	TAILQ_INIT(&so->so_rcv.sb_aiojobq);
 	TASK_INIT(&so->so_snd.sb_aiotask, 0, soaio_snd, so);
 	TASK_INIT(&so->so_rcv.sb_aiotask, 0, soaio_rcv, so);
 #ifdef VIMAGE
 	VNET_ASSERT(vnet != NULL, ("%s:%d vnet is NULL, so=%p",
 	    __func__, __LINE__, so));
 	so->so_vnet = vnet;
 #endif
 	/* We shouldn't need the so_global_mtx */
 	if (hhook_run_socket(so, NULL, HHOOK_SOCKET_CREATE)) {
 		/* Do we need more comprehensive error returns? */
 		uma_zfree(socket_zone, so);
 		return (NULL);
 	}
 	mtx_lock(&so_global_mtx);
 	so->so_gencnt = ++so_gencnt;
 	++numopensockets;
 #ifdef VIMAGE
 	vnet->vnet_sockcnt++;
 #endif
 	mtx_unlock(&so_global_mtx);
 
 	return (so);
 }
 
 /*
  * Free the storage associated with a socket at the socket layer, tear down
  * locks, labels, etc.  All protocol state is assumed already to have been
  * torn down (and possibly never set up) by the caller.
  */
 void
 sodealloc(struct socket *so)
 {
 
 	KASSERT(so->so_count == 0, ("sodealloc(): so_count %d", so->so_count));
 	KASSERT(so->so_pcb == NULL, ("sodealloc(): so_pcb != NULL"));
 
 	mtx_lock(&so_global_mtx);
 	so->so_gencnt = ++so_gencnt;
 	--numopensockets;	/* Could be below, but faster here. */
 #ifdef VIMAGE
 	VNET_ASSERT(so->so_vnet != NULL, ("%s:%d so_vnet is NULL, so=%p",
 	    __func__, __LINE__, so));
 	so->so_vnet->vnet_sockcnt--;
 #endif
 	mtx_unlock(&so_global_mtx);
 #ifdef MAC
 	mac_socket_destroy(so);
 #endif
 	hhook_run_socket(so, NULL, HHOOK_SOCKET_CLOSE);
 
 	khelp_destroy_osd(&so->osd);
 	if (SOLISTENING(so)) {
 		if (so->sol_accept_filter != NULL)
 			accept_filt_setopt(so, NULL);
 	} else {
 		if (so->so_rcv.sb_hiwat)
 			(void)chgsbsize(so->so_cred->cr_uidinfo,
 			    &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY);
 		if (so->so_snd.sb_hiwat)
 			(void)chgsbsize(so->so_cred->cr_uidinfo,
 			    &so->so_snd.sb_hiwat, 0, RLIM_INFINITY);
 		sx_destroy(&so->so_snd_sx);
 		sx_destroy(&so->so_rcv_sx);
 		mtx_destroy(&so->so_snd_mtx);
 		mtx_destroy(&so->so_rcv_mtx);
 	}
 	crfree(so->so_cred);
 	mtx_destroy(&so->so_lock);
 	uma_zfree(socket_zone, so);
 }
 
 /*
  * socreate returns a socket with a ref count of 1 and a file descriptor
  * reference.  The socket should be closed with soclose().
  */
 int
 socreate(int dom, struct socket **aso, int type, int proto,
     struct ucred *cred, struct thread *td)
 {
 	struct protosw *prp;
 	struct socket *so;
 	int error;
 
+	/*
+	 * XXX: divert(4) historically abused PF_INET.  Keep this compatibility
+	 * shim until all applications have been updated.
+	 */
+	if (__predict_false(dom == PF_INET && type == SOCK_RAW &&
+	    proto == IPPROTO_DIVERT)) {
+		dom = PF_DIVERT;
+		printf("%s uses obsolete way to create divert(4) socket\n",
+		    td->td_proc->p_comm);
+	}
+
 	if (proto)
 		prp = pffindproto(dom, proto, type);
 	else
 		prp = pffindtype(dom, type);
 
 	if (prp == NULL) {
 		/* No support for domain. */
 		if (pffinddomain(dom) == NULL)
 			return (EAFNOSUPPORT);
 		/* No support for socket type. */
 		if (proto == 0 && type != 0)
 			return (EPROTOTYPE);
 		return (EPROTONOSUPPORT);
 	}
 
 	MPASS(prp->pr_attach);
 
 	if (IN_CAPABILITY_MODE(td) && (prp->pr_flags & PR_CAPATTACH) == 0)
 		return (ECAPMODE);
 
 	if (prison_check_af(cred, prp->pr_domain->dom_family) != 0)
 		return (EPROTONOSUPPORT);
 
 	if (prp->pr_type != type)
 		return (EPROTOTYPE);
 	so = soalloc(CRED_TO_VNET(cred));
 	if (so == NULL)
 		return (ENOBUFS);
 
 	so->so_type = type;
 	so->so_cred = crhold(cred);
 	if ((prp->pr_domain->dom_family == PF_INET) ||
 	    (prp->pr_domain->dom_family == PF_INET6) ||
 	    (prp->pr_domain->dom_family == PF_ROUTE))
 		so->so_fibnum = td->td_proc->p_fibnum;
 	else
 		so->so_fibnum = 0;
 	so->so_proto = prp;
 #ifdef MAC
 	mac_socket_create(cred, so);
 #endif
 	knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock,
 	    so_rdknl_assert_lock);
 	knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock,
 	    so_wrknl_assert_lock);
 	if ((prp->pr_flags & PR_SOCKBUF) == 0) {
 		so->so_snd.sb_mtx = &so->so_snd_mtx;
 		so->so_rcv.sb_mtx = &so->so_rcv_mtx;
 	}
 	/*
 	 * Auto-sizing of socket buffers is managed by the protocols and
 	 * the appropriate flags must be set in the pru_attach function.
 	 */
 	CURVNET_SET(so->so_vnet);
 	error = prp->pr_attach(so, proto, td);
 	CURVNET_RESTORE();
 	if (error) {
 		sodealloc(so);
 		return (error);
 	}
 	soref(so);
 	*aso = so;
 	return (0);
 }
 
 #ifdef REGRESSION
 static int regression_sonewconn_earlytest = 1;
 SYSCTL_INT(_regression, OID_AUTO, sonewconn_earlytest, CTLFLAG_RW,
     &regression_sonewconn_earlytest, 0, "Perform early sonewconn limit test");
 #endif
 
 static struct timeval overinterval = { 60, 0 };
 SYSCTL_TIMEVAL_SEC(_kern_ipc, OID_AUTO, sooverinterval, CTLFLAG_RW,
     &overinterval,
     "Delay in seconds between warnings for listen socket overflows");
 
 /*
  * When an attempt at a new connection is noted on a socket which supports
  * accept(2), the protocol has two options:
  * 1) Call legacy sonewconn() function, which would call protocol attach
  *    method, same as used for socket(2).
  * 2) Call solisten_clone(), do attach that is specific to a cloned connection,
  *    and then call solisten_enqueue().
  *
  * Note: the ref count on the socket is 0 on return.
  */
 struct socket *
 solisten_clone(struct socket *head)
 {
 	struct sbuf descrsb;
 	struct socket *so;
 	int len, overcount;
 	u_int qlen;
 	const char localprefix[] = "local:";
 	char descrbuf[SUNPATHLEN + sizeof(localprefix)];
 #if defined(INET6)
 	char addrbuf[INET6_ADDRSTRLEN];
 #elif defined(INET)
 	char addrbuf[INET_ADDRSTRLEN];
 #endif
 	bool dolog, over;
 
 	SOLISTEN_LOCK(head);
 	over = (head->sol_qlen > 3 * head->sol_qlimit / 2);
 #ifdef REGRESSION
 	if (regression_sonewconn_earlytest && over) {
 #else
 	if (over) {
 #endif
 		head->sol_overcount++;
 		dolog = !!ratecheck(&head->sol_lastover, &overinterval);
 
 		/*
 		 * If we're going to log, copy the overflow count and queue
 		 * length from the listen socket before dropping the lock.
 		 * Also, reset the overflow count.
 		 */
 		if (dolog) {
 			overcount = head->sol_overcount;
 			head->sol_overcount = 0;
 			qlen = head->sol_qlen;
 		}
 		SOLISTEN_UNLOCK(head);
 
 		if (dolog) {
 			/*
 			 * Try to print something descriptive about the
 			 * socket for the error message.
 			 */
 			sbuf_new(&descrsb, descrbuf, sizeof(descrbuf),
 			    SBUF_FIXEDLEN);
 			switch (head->so_proto->pr_domain->dom_family) {
 #if defined(INET) || defined(INET6)
 #ifdef INET
 			case AF_INET:
 #endif
 #ifdef INET6
 			case AF_INET6:
 				if (head->so_proto->pr_domain->dom_family ==
 				    AF_INET6 ||
 				    (sotoinpcb(head)->inp_inc.inc_flags &
 				    INC_ISIPV6)) {
 					ip6_sprintf(addrbuf,
 					    &sotoinpcb(head)->inp_inc.inc6_laddr);
 					sbuf_printf(&descrsb, "[%s]", addrbuf);
 				} else
 #endif
 				{
 #ifdef INET
 					inet_ntoa_r(
 					    sotoinpcb(head)->inp_inc.inc_laddr,
 					    addrbuf);
 					sbuf_cat(&descrsb, addrbuf);
 #endif
 				}
 				sbuf_printf(&descrsb, ":%hu (proto %u)",
 				    ntohs(sotoinpcb(head)->inp_inc.inc_lport),
 				    head->so_proto->pr_protocol);
 				break;
 #endif /* INET || INET6 */
 			case AF_UNIX:
 				sbuf_cat(&descrsb, localprefix);
 				if (sotounpcb(head)->unp_addr != NULL)
 					len =
 					    sotounpcb(head)->unp_addr->sun_len -
 					    offsetof(struct sockaddr_un,
 					    sun_path);
 				else
 					len = 0;
 				if (len > 0)
 					sbuf_bcat(&descrsb,
 					    sotounpcb(head)->unp_addr->sun_path,
 					    len);
 				else
 					sbuf_cat(&descrsb, "(unknown)");
 				break;
 			}
 
 			/*
 			 * If we can't print something more specific, at least
 			 * print the domain name.
 			 */
 			if (sbuf_finish(&descrsb) != 0 ||
 			    sbuf_len(&descrsb) <= 0) {
 				sbuf_clear(&descrsb);
 				sbuf_cat(&descrsb,
 				    head->so_proto->pr_domain->dom_name ?:
 				    "unknown");
 				sbuf_finish(&descrsb);
 			}
 			KASSERT(sbuf_len(&descrsb) > 0,
 			    ("%s: sbuf creation failed", __func__));
 			/*
 			 * Preserve the historic listen queue overflow log
 			 * message, that starts with "sonewconn:".  It has
 			 * been known to sysadmins for years and also test
 			 * sys/kern/sonewconn_overflow checks for it.
 			 */
 			if (head->so_cred == 0) {
 				log(LOG_DEBUG, "sonewconn: pcb %p (%s): "
 				    "Listen queue overflow: %i already in "
 				    "queue awaiting acceptance (%d "
 				    "occurrences)\n", head->so_pcb,
 				    sbuf_data(&descrsb),
 			    	qlen, overcount);
 			} else {
 				log(LOG_DEBUG, "sonewconn: pcb %p (%s): "
 				    "Listen queue overflow: "
 				    "%i already in queue awaiting acceptance "
 				    "(%d occurrences), euid %d, rgid %d, jail %s\n",
 				    head->so_pcb, sbuf_data(&descrsb), qlen,
 				    overcount, head->so_cred->cr_uid,
 				    head->so_cred->cr_rgid,
 				    head->so_cred->cr_prison ?
 					head->so_cred->cr_prison->pr_name :
 					"not_jailed");
 			}
 			sbuf_delete(&descrsb);
 
 			overcount = 0;
 		}
 
 		return (NULL);
 	}
 	SOLISTEN_UNLOCK(head);
 	VNET_ASSERT(head->so_vnet != NULL, ("%s: so %p vnet is NULL",
 	    __func__, head));
 	so = soalloc(head->so_vnet);
 	if (so == NULL) {
 		log(LOG_DEBUG, "%s: pcb %p: New socket allocation failure: "
 		    "limit reached or out of memory\n",
 		    __func__, head->so_pcb);
 		return (NULL);
 	}
 	so->so_listen = head;
 	so->so_type = head->so_type;
 	so->so_options = head->so_options & ~SO_ACCEPTCONN;
 	so->so_linger = head->so_linger;
 	so->so_state = head->so_state;
 	so->so_fibnum = head->so_fibnum;
 	so->so_proto = head->so_proto;
 	so->so_cred = crhold(head->so_cred);
 #ifdef MAC
 	mac_socket_newconn(head, so);
 #endif
 	knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock,
 	    so_rdknl_assert_lock);
 	knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock,
 	    so_wrknl_assert_lock);
 	VNET_SO_ASSERT(head);
 	if (soreserve(so, head->sol_sbsnd_hiwat, head->sol_sbrcv_hiwat)) {
 		sodealloc(so);
 		log(LOG_DEBUG, "%s: pcb %p: soreserve() failed\n",
 		    __func__, head->so_pcb);
 		return (NULL);
 	}
 	so->so_rcv.sb_lowat = head->sol_sbrcv_lowat;
 	so->so_snd.sb_lowat = head->sol_sbsnd_lowat;
 	so->so_rcv.sb_timeo = head->sol_sbrcv_timeo;
 	so->so_snd.sb_timeo = head->sol_sbsnd_timeo;
 	so->so_rcv.sb_flags = head->sol_sbrcv_flags & SB_AUTOSIZE;
 	so->so_snd.sb_flags = head->sol_sbsnd_flags & SB_AUTOSIZE;
 	if ((so->so_proto->pr_flags & PR_SOCKBUF) == 0) {
 		so->so_snd.sb_mtx = &so->so_snd_mtx;
 		so->so_rcv.sb_mtx = &so->so_rcv_mtx;
 	}
 
 	return (so);
 }
 
 /* Connstatus may be 0, or SS_ISCONFIRMING, or SS_ISCONNECTED. */
 struct socket *
 sonewconn(struct socket *head, int connstatus)
 {
 	struct socket *so;
 
 	if ((so = solisten_clone(head)) == NULL)
 		return (NULL);
 
 	if (so->so_proto->pr_attach(so, 0, NULL) != 0) {
 		sodealloc(so);
 		log(LOG_DEBUG, "%s: pcb %p: pr_attach() failed\n",
 		    __func__, head->so_pcb);
 		return (NULL);
 	}
 
 	solisten_enqueue(so, connstatus);
 
 	return (so);
 }
 
 /*
  * Enqueue socket cloned by solisten_clone() to the listen queue of the
  * listener it has been cloned from.
  */
 void
 solisten_enqueue(struct socket *so, int connstatus)
 {
 	struct socket *head = so->so_listen;
 
 	MPASS(refcount_load(&so->so_count) == 0);
 	refcount_init(&so->so_count, 1);
 
 	SOLISTEN_LOCK(head);
 	if (head->sol_accept_filter != NULL)
 		connstatus = 0;
 	so->so_state |= connstatus;
 	soref(head); /* A socket on (in)complete queue refs head. */
 	if (connstatus) {
 		TAILQ_INSERT_TAIL(&head->sol_comp, so, so_list);
 		so->so_qstate = SQ_COMP;
 		head->sol_qlen++;
 		solisten_wakeup(head);	/* unlocks */
 	} else {
 		/*
 		 * Keep removing sockets from the head until there's room for
 		 * us to insert on the tail.  In pre-locking revisions, this
 		 * was a simple if(), but as we could be racing with other
 		 * threads and soabort() requires dropping locks, we must
 		 * loop waiting for the condition to be true.
 		 */
 		while (head->sol_incqlen > head->sol_qlimit) {
 			struct socket *sp;
 
 			sp = TAILQ_FIRST(&head->sol_incomp);
 			TAILQ_REMOVE(&head->sol_incomp, sp, so_list);
 			head->sol_incqlen--;
 			SOCK_LOCK(sp);
 			sp->so_qstate = SQ_NONE;
 			sp->so_listen = NULL;
 			SOCK_UNLOCK(sp);
 			sorele_locked(head);	/* does SOLISTEN_UNLOCK, head stays */
 			soabort(sp);
 			SOLISTEN_LOCK(head);
 		}
 		TAILQ_INSERT_TAIL(&head->sol_incomp, so, so_list);
 		so->so_qstate = SQ_INCOMP;
 		head->sol_incqlen++;
 		SOLISTEN_UNLOCK(head);
 	}
 }
 
 #if defined(SCTP) || defined(SCTP_SUPPORT)
 /*
  * Socket part of sctp_peeloff().  Detach a new socket from an
  * association.  The new socket is returned with a reference.
  *
  * XXXGL: reduce copy-paste with solisten_clone().
  */
 struct socket *
 sopeeloff(struct socket *head)
 {
 	struct socket *so;
 
 	VNET_ASSERT(head->so_vnet != NULL, ("%s:%d so_vnet is NULL, head=%p",
 	    __func__, __LINE__, head));
 	so = soalloc(head->so_vnet);
 	if (so == NULL) {
 		log(LOG_DEBUG, "%s: pcb %p: New socket allocation failure: "
 		    "limit reached or out of memory\n",
 		    __func__, head->so_pcb);
 		return (NULL);
 	}
 	so->so_type = head->so_type;
 	so->so_options = head->so_options;
 	so->so_linger = head->so_linger;
 	so->so_state = (head->so_state & SS_NBIO) | SS_ISCONNECTED;
 	so->so_fibnum = head->so_fibnum;
 	so->so_proto = head->so_proto;
 	so->so_cred = crhold(head->so_cred);
 #ifdef MAC
 	mac_socket_newconn(head, so);
 #endif
 	knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock,
 	    so_rdknl_assert_lock);
 	knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock,
 	    so_wrknl_assert_lock);
 	VNET_SO_ASSERT(head);
 	if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat)) {
 		sodealloc(so);
 		log(LOG_DEBUG, "%s: pcb %p: soreserve() failed\n",
 		    __func__, head->so_pcb);
 		return (NULL);
 	}
 	if ((*so->so_proto->pr_attach)(so, 0, NULL)) {
 		sodealloc(so);
 		log(LOG_DEBUG, "%s: pcb %p: pru_attach() failed\n",
 		    __func__, head->so_pcb);
 		return (NULL);
 	}
 	so->so_rcv.sb_lowat = head->so_rcv.sb_lowat;
 	so->so_snd.sb_lowat = head->so_snd.sb_lowat;
 	so->so_rcv.sb_timeo = head->so_rcv.sb_timeo;
 	so->so_snd.sb_timeo = head->so_snd.sb_timeo;
 	so->so_rcv.sb_flags |= head->so_rcv.sb_flags & SB_AUTOSIZE;
 	so->so_snd.sb_flags |= head->so_snd.sb_flags & SB_AUTOSIZE;
 
 	soref(so);
 
 	return (so);
 }
 #endif	/* SCTP */
 
 int
 sobind(struct socket *so, struct sockaddr *nam, struct thread *td)
 {
 	int error;
 
 	CURVNET_SET(so->so_vnet);
 	error = so->so_proto->pr_bind(so, nam, td);
 	CURVNET_RESTORE();
 	return (error);
 }
 
 int
 sobindat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td)
 {
 	int error;
 
 	CURVNET_SET(so->so_vnet);
 	error = so->so_proto->pr_bindat(fd, so, nam, td);
 	CURVNET_RESTORE();
 	return (error);
 }
 
 /*
  * solisten() transitions a socket from a non-listening state to a listening
  * state, but can also be used to update the listen queue depth on an
  * existing listen socket.  The protocol will call back into the sockets
  * layer using solisten_proto_check() and solisten_proto() to check and set
  * socket-layer listen state.  Call backs are used so that the protocol can
  * acquire both protocol and socket layer locks in whatever order is required
  * by the protocol.
  *
  * Protocol implementors are advised to hold the socket lock across the
  * socket-layer test and set to avoid races at the socket layer.
  */
 int
 solisten(struct socket *so, int backlog, struct thread *td)
 {
 	int error;
 
 	CURVNET_SET(so->so_vnet);
 	error = so->so_proto->pr_listen(so, backlog, td);
 	CURVNET_RESTORE();
 	return (error);
 }
 
 /*
  * Prepare for a call to solisten_proto().  Acquire all socket buffer locks in
  * order to interlock with socket I/O.
  */
 int
 solisten_proto_check(struct socket *so)
 {
 	SOCK_LOCK_ASSERT(so);
 
 	if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING |
 	    SS_ISDISCONNECTING)) != 0)
 		return (EINVAL);
 
 	/*
 	 * Sleeping is not permitted here, so simply fail if userspace is
 	 * attempting to transmit or receive on the socket.  This kind of
 	 * transient failure is not ideal, but it should occur only if userspace
 	 * is misusing the socket interfaces.
 	 */
 	if (!sx_try_xlock(&so->so_snd_sx))
 		return (EAGAIN);
 	if (!sx_try_xlock(&so->so_rcv_sx)) {
 		sx_xunlock(&so->so_snd_sx);
 		return (EAGAIN);
 	}
 	mtx_lock(&so->so_snd_mtx);
 	mtx_lock(&so->so_rcv_mtx);
 
 	/* Interlock with soo_aio_queue(). */
 	if (!SOLISTENING(so) &&
 	   ((so->so_snd.sb_flags & (SB_AIO | SB_AIO_RUNNING)) != 0 ||
 	   (so->so_rcv.sb_flags & (SB_AIO | SB_AIO_RUNNING)) != 0)) {
 		solisten_proto_abort(so);
 		return (EINVAL);
 	}
 	return (0);
 }
 
 /*
  * Undo the setup done by solisten_proto_check().
  */
 void
 solisten_proto_abort(struct socket *so)
 {
 	mtx_unlock(&so->so_snd_mtx);
 	mtx_unlock(&so->so_rcv_mtx);
 	sx_xunlock(&so->so_snd_sx);
 	sx_xunlock(&so->so_rcv_sx);
 }
 
 void
 solisten_proto(struct socket *so, int backlog)
 {
 	int sbrcv_lowat, sbsnd_lowat;
 	u_int sbrcv_hiwat, sbsnd_hiwat;
 	short sbrcv_flags, sbsnd_flags;
 	sbintime_t sbrcv_timeo, sbsnd_timeo;
 
 	SOCK_LOCK_ASSERT(so);
 	KASSERT((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING |
 	    SS_ISDISCONNECTING)) == 0,
 	    ("%s: bad socket state %p", __func__, so));
 
 	if (SOLISTENING(so))
 		goto listening;
 
 	/*
 	 * Change this socket to listening state.
 	 */
 	sbrcv_lowat = so->so_rcv.sb_lowat;
 	sbsnd_lowat = so->so_snd.sb_lowat;
 	sbrcv_hiwat = so->so_rcv.sb_hiwat;
 	sbsnd_hiwat = so->so_snd.sb_hiwat;
 	sbrcv_flags = so->so_rcv.sb_flags;
 	sbsnd_flags = so->so_snd.sb_flags;
 	sbrcv_timeo = so->so_rcv.sb_timeo;
 	sbsnd_timeo = so->so_snd.sb_timeo;
 
 	sbdestroy(so, SO_SND);
 	sbdestroy(so, SO_RCV);
 
 #ifdef INVARIANTS
 	bzero(&so->so_rcv,
 	    sizeof(struct socket) - offsetof(struct socket, so_rcv));
 #endif
 
 	so->sol_sbrcv_lowat = sbrcv_lowat;
 	so->sol_sbsnd_lowat = sbsnd_lowat;
 	so->sol_sbrcv_hiwat = sbrcv_hiwat;
 	so->sol_sbsnd_hiwat = sbsnd_hiwat;
 	so->sol_sbrcv_flags = sbrcv_flags;
 	so->sol_sbsnd_flags = sbsnd_flags;
 	so->sol_sbrcv_timeo = sbrcv_timeo;
 	so->sol_sbsnd_timeo = sbsnd_timeo;
 
 	so->sol_qlen = so->sol_incqlen = 0;
 	TAILQ_INIT(&so->sol_incomp);
 	TAILQ_INIT(&so->sol_comp);
 
 	so->sol_accept_filter = NULL;
 	so->sol_accept_filter_arg = NULL;
 	so->sol_accept_filter_str = NULL;
 
 	so->sol_upcall = NULL;
 	so->sol_upcallarg = NULL;
 
 	so->so_options |= SO_ACCEPTCONN;
 
 listening:
 	if (backlog < 0 || backlog > somaxconn)
 		backlog = somaxconn;
 	so->sol_qlimit = backlog;
 
 	mtx_unlock(&so->so_snd_mtx);
 	mtx_unlock(&so->so_rcv_mtx);
 	sx_xunlock(&so->so_snd_sx);
 	sx_xunlock(&so->so_rcv_sx);
 }
 
 /*
  * Wakeup listeners/subsystems once we have a complete connection.
  * Enters with lock, returns unlocked.
  */
 void
 solisten_wakeup(struct socket *sol)
 {
 
 	if (sol->sol_upcall != NULL)
 		(void )sol->sol_upcall(sol, sol->sol_upcallarg, M_NOWAIT);
 	else {
 		selwakeuppri(&sol->so_rdsel, PSOCK);
 		KNOTE_LOCKED(&sol->so_rdsel.si_note, 0);
 	}
 	SOLISTEN_UNLOCK(sol);
 	wakeup_one(&sol->sol_comp);
 	if ((sol->so_state & SS_ASYNC) && sol->so_sigio != NULL)
 		pgsigio(&sol->so_sigio, SIGIO, 0);
 }
 
 /*
  * Return single connection off a listening socket queue.  Main consumer of
  * the function is kern_accept4().  Some modules, that do their own accept
  * management also use the function.  The socket reference held by the
  * listen queue is handed to the caller.
  *
  * Listening socket must be locked on entry and is returned unlocked on
  * return.
  * The flags argument is set of accept4(2) flags and ACCEPT4_INHERIT.
  */
 int
 solisten_dequeue(struct socket *head, struct socket **ret, int flags)
 {
 	struct socket *so;
 	int error;
 
 	SOLISTEN_LOCK_ASSERT(head);
 
 	while (!(head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->sol_comp) &&
 	    head->so_error == 0) {
 		error = msleep(&head->sol_comp, SOCK_MTX(head), PSOCK | PCATCH,
 		    "accept", 0);
 		if (error != 0) {
 			SOLISTEN_UNLOCK(head);
 			return (error);
 		}
 	}
 	if (head->so_error) {
 		error = head->so_error;
 		head->so_error = 0;
 	} else if ((head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->sol_comp))
 		error = EWOULDBLOCK;
 	else
 		error = 0;
 	if (error) {
 		SOLISTEN_UNLOCK(head);
 		return (error);
 	}
 	so = TAILQ_FIRST(&head->sol_comp);
 	SOCK_LOCK(so);
 	KASSERT(so->so_qstate == SQ_COMP,
 	    ("%s: so %p not SQ_COMP", __func__, so));
 	head->sol_qlen--;
 	so->so_qstate = SQ_NONE;
 	so->so_listen = NULL;
 	TAILQ_REMOVE(&head->sol_comp, so, so_list);
 	if (flags & ACCEPT4_INHERIT)
 		so->so_state |= (head->so_state & SS_NBIO);
 	else
 		so->so_state |= (flags & SOCK_NONBLOCK) ? SS_NBIO : 0;
 	SOCK_UNLOCK(so);
 	sorele_locked(head);
 
 	*ret = so;
 	return (0);
 }
 
 /*
  * Free socket upon release of the very last reference.
  */
 static void
 sofree(struct socket *so)
 {
 	struct protosw *pr = so->so_proto;
 
 	SOCK_LOCK_ASSERT(so);
 	KASSERT(refcount_load(&so->so_count) == 0,
 	    ("%s: so %p has references", __func__, so));
 	KASSERT(SOLISTENING(so) || so->so_qstate == SQ_NONE,
 	    ("%s: so %p is on listen queue", __func__, so));
 
 	SOCK_UNLOCK(so);
 
 	if (so->so_dtor != NULL)
 		so->so_dtor(so);
 
 	VNET_SO_ASSERT(so);
 	if ((pr->pr_flags & PR_RIGHTS) && !SOLISTENING(so)) {
 		MPASS(pr->pr_domain->dom_dispose != NULL);
 		(*pr->pr_domain->dom_dispose)(so);
 	}
 	if (pr->pr_detach != NULL)
 		pr->pr_detach(so);
 
 	/*
 	 * From this point on, we assume that no other references to this
 	 * socket exist anywhere else in the stack.  Therefore, no locks need
 	 * to be acquired or held.
 	 */
 	if (!(pr->pr_flags & PR_SOCKBUF) && !SOLISTENING(so)) {
 		sbdestroy(so, SO_SND);
 		sbdestroy(so, SO_RCV);
 	}
 	seldrain(&so->so_rdsel);
 	seldrain(&so->so_wrsel);
 	knlist_destroy(&so->so_rdsel.si_note);
 	knlist_destroy(&so->so_wrsel.si_note);
 	sodealloc(so);
 }
 
 /*
  * Release a reference on a socket while holding the socket lock.
  * Unlocks the socket lock before returning.
  */
 void
 sorele_locked(struct socket *so)
 {
 	SOCK_LOCK_ASSERT(so);
 	if (refcount_release(&so->so_count))
 		sofree(so);
 	else
 		SOCK_UNLOCK(so);
 }
 
 /*
  * Close a socket on last file table reference removal.  Initiate disconnect
  * if connected.  Free socket when disconnect complete.
  *
  * This function will sorele() the socket.  Note that soclose() may be called
  * prior to the ref count reaching zero.  The actual socket structure will
  * not be freed until the ref count reaches zero.
  */
 int
 soclose(struct socket *so)
 {
 	struct accept_queue lqueue;
 	int error = 0;
 	bool listening, last __diagused;
 
 	CURVNET_SET(so->so_vnet);
 	funsetown(&so->so_sigio);
 	if (so->so_state & SS_ISCONNECTED) {
 		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
 			error = sodisconnect(so);
 			if (error) {
 				if (error == ENOTCONN)
 					error = 0;
 				goto drop;
 			}
 		}
 
 		if ((so->so_options & SO_LINGER) != 0 && so->so_linger != 0) {
 			if ((so->so_state & SS_ISDISCONNECTING) &&
 			    (so->so_state & SS_NBIO))
 				goto drop;
 			while (so->so_state & SS_ISCONNECTED) {
 				error = tsleep(&so->so_timeo,
 				    PSOCK | PCATCH, "soclos",
 				    so->so_linger * hz);
 				if (error)
 					break;
 			}
 		}
 	}
 
 drop:
 	if (so->so_proto->pr_close != NULL)
 		so->so_proto->pr_close(so);
 
 	SOCK_LOCK(so);
 	if ((listening = SOLISTENING(so))) {
 		struct socket *sp;
 
 		TAILQ_INIT(&lqueue);
 		TAILQ_SWAP(&lqueue, &so->sol_incomp, socket, so_list);
 		TAILQ_CONCAT(&lqueue, &so->sol_comp, so_list);
 
 		so->sol_qlen = so->sol_incqlen = 0;
 
 		TAILQ_FOREACH(sp, &lqueue, so_list) {
 			SOCK_LOCK(sp);
 			sp->so_qstate = SQ_NONE;
 			sp->so_listen = NULL;
 			SOCK_UNLOCK(sp);
 			last = refcount_release(&so->so_count);
 			KASSERT(!last, ("%s: released last reference for %p",
 			    __func__, so));
 		}
 	}
 	sorele_locked(so);
 	if (listening) {
 		struct socket *sp, *tsp;
 
 		TAILQ_FOREACH_SAFE(sp, &lqueue, so_list, tsp)
 			soabort(sp);
 	}
 	CURVNET_RESTORE();
 	return (error);
 }
 
 /*
  * soabort() is used to abruptly tear down a connection, such as when a
  * resource limit is reached (listen queue depth exceeded), or if a listen
  * socket is closed while there are sockets waiting to be accepted.
  *
  * This interface is tricky, because it is called on an unreferenced socket,
  * and must be called only by a thread that has actually removed the socket
  * from the listen queue it was on.  Likely this thread holds the last
  * reference on the socket and soabort() will proceed with sofree().  But
  * it might be not the last, as the sockets on the listen queues are seen
  * from the protocol side.
  *
  * This interface will call into the protocol code, so must not be called
  * with any socket locks held.  Protocols do call it while holding their own
  * recursible protocol mutexes, but this is something that should be subject
  * to review in the future.
  *
  * Usually socket should have a single reference left, but this is not a
  * requirement.  In the past, when we have had named references for file
  * descriptor and protocol, we asserted that none of them are being held.
  */
 void
 soabort(struct socket *so)
 {
 
 	VNET_SO_ASSERT(so);
 
 	if (so->so_proto->pr_abort != NULL)
 		so->so_proto->pr_abort(so);
 	SOCK_LOCK(so);
 	sorele_locked(so);
 }
 
 int
 soaccept(struct socket *so, struct sockaddr **nam)
 {
 	int error;
 
 	CURVNET_SET(so->so_vnet);
 	error = so->so_proto->pr_accept(so, nam);
 	CURVNET_RESTORE();
 	return (error);
 }
 
 int
 soconnect(struct socket *so, struct sockaddr *nam, struct thread *td)
 {
 
 	return (soconnectat(AT_FDCWD, so, nam, td));
 }
 
 int
 soconnectat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td)
 {
 	int error;
 
 	CURVNET_SET(so->so_vnet);
 	/*
 	 * If protocol is connection-based, can only connect once.
 	 * Otherwise, if connected, try to disconnect first.  This allows
 	 * user to disconnect by connecting to, e.g., a null address.
 	 */
 	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
 	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
 	    (error = sodisconnect(so)))) {
 		error = EISCONN;
 	} else {
 		/*
 		 * Prevent accumulated error from previous connection from
 		 * biting us.
 		 */
 		so->so_error = 0;
 		if (fd == AT_FDCWD) {
 			error = so->so_proto->pr_connect(so, nam, td);
 		} else {
 			error = so->so_proto->pr_connectat(fd, so, nam, td);
 		}
 	}
 	CURVNET_RESTORE();
 
 	return (error);
 }
 
 int
 soconnect2(struct socket *so1, struct socket *so2)
 {
 	int error;
 
 	CURVNET_SET(so1->so_vnet);
 	error = so1->so_proto->pr_connect2(so1, so2);
 	CURVNET_RESTORE();
 	return (error);
 }
 
 int
 sodisconnect(struct socket *so)
 {
 	int error;
 
 	if ((so->so_state & SS_ISCONNECTED) == 0)
 		return (ENOTCONN);
 	if (so->so_state & SS_ISDISCONNECTING)
 		return (EALREADY);
 	VNET_SO_ASSERT(so);
 	error = so->so_proto->pr_disconnect(so);
 	return (error);
 }
 
 int
 sosend_dgram(struct socket *so, struct sockaddr *addr, struct uio *uio,
     struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
 {
 	long space;
 	ssize_t resid;
 	int clen = 0, error, dontroute;
 
 	KASSERT(so->so_type == SOCK_DGRAM, ("sosend_dgram: !SOCK_DGRAM"));
 	KASSERT(so->so_proto->pr_flags & PR_ATOMIC,
 	    ("sosend_dgram: !PR_ATOMIC"));
 
 	if (uio != NULL)
 		resid = uio->uio_resid;
 	else
 		resid = top->m_pkthdr.len;
 	/*
 	 * In theory resid should be unsigned.  However, space must be
 	 * signed, as it might be less than 0 if we over-committed, and we
 	 * must use a signed comparison of space and resid.  On the other
 	 * hand, a negative resid causes us to loop sending 0-length
 	 * segments to the protocol.
 	 */
 	if (resid < 0) {
 		error = EINVAL;
 		goto out;
 	}
 
 	dontroute =
 	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0;
 	if (td != NULL)
 		td->td_ru.ru_msgsnd++;
 	if (control != NULL)
 		clen = control->m_len;
 
 	SOCKBUF_LOCK(&so->so_snd);
 	if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
 		SOCKBUF_UNLOCK(&so->so_snd);
 		error = EPIPE;
 		goto out;
 	}
 	if (so->so_error) {
 		error = so->so_error;
 		so->so_error = 0;
 		SOCKBUF_UNLOCK(&so->so_snd);
 		goto out;
 	}
 	if ((so->so_state & SS_ISCONNECTED) == 0) {
 		/*
 		 * `sendto' and `sendmsg' is allowed on a connection-based
 		 * socket if it supports implied connect.  Return ENOTCONN if
 		 * not connected and no address is supplied.
 		 */
 		if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
 		    (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
 			if ((so->so_state & SS_ISCONFIRMING) == 0 &&
 			    !(resid == 0 && clen != 0)) {
 				SOCKBUF_UNLOCK(&so->so_snd);
 				error = ENOTCONN;
 				goto out;
 			}
 		} else if (addr == NULL) {
 			if (so->so_proto->pr_flags & PR_CONNREQUIRED)
 				error = ENOTCONN;
 			else
 				error = EDESTADDRREQ;
 			SOCKBUF_UNLOCK(&so->so_snd);
 			goto out;
 		}
 	}
 
 	/*
 	 * Do we need MSG_OOB support in SOCK_DGRAM?  Signs here may be a
 	 * problem and need fixing.
 	 */
 	space = sbspace(&so->so_snd);
 	if (flags & MSG_OOB)
 		space += 1024;
 	space -= clen;
 	SOCKBUF_UNLOCK(&so->so_snd);
 	if (resid > space) {
 		error = EMSGSIZE;
 		goto out;
 	}
 	if (uio == NULL) {
 		resid = 0;
 		if (flags & MSG_EOR)
 			top->m_flags |= M_EOR;
 	} else {
 		/*
 		 * Copy the data from userland into a mbuf chain.
 		 * If no data is to be copied in, a single empty mbuf
 		 * is returned.
 		 */
 		top = m_uiotombuf(uio, M_WAITOK, space, max_hdr,
 		    (M_PKTHDR | ((flags & MSG_EOR) ? M_EOR : 0)));
 		if (top == NULL) {
 			error = EFAULT;	/* only possible error */
 			goto out;
 		}
 		space -= resid - uio->uio_resid;
 		resid = uio->uio_resid;
 	}
 	KASSERT(resid == 0, ("sosend_dgram: resid != 0"));
 	/*
 	 * XXXRW: Frobbing SO_DONTROUTE here is even worse without sblock
 	 * than with.
 	 */
 	if (dontroute) {
 		SOCK_LOCK(so);
 		so->so_options |= SO_DONTROUTE;
 		SOCK_UNLOCK(so);
 	}
 	/*
 	 * XXX all the SBS_CANTSENDMORE checks previously done could be out
 	 * of date.  We could have received a reset packet in an interrupt or
 	 * maybe we slept while doing page faults in uiomove() etc.  We could
 	 * probably recheck again inside the locking protection here, but
 	 * there are probably other places that this also happens.  We must
 	 * rethink this.
 	 */
 	VNET_SO_ASSERT(so);
 	error = so->so_proto->pr_send(so, (flags & MSG_OOB) ? PRUS_OOB :
 	/*
 	 * If the user set MSG_EOF, the protocol understands this flag and
 	 * nothing left to send then use PRU_SEND_EOF instead of PRU_SEND.
 	 */
 	    ((flags & MSG_EOF) &&
 	     (so->so_proto->pr_flags & PR_IMPLOPCL) &&
 	     (resid <= 0)) ?
 		PRUS_EOF :
 		/* If there is more to send set PRUS_MORETOCOME */
 		(flags & MSG_MORETOCOME) ||
 		(resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
 		top, addr, control, td);
 	if (dontroute) {
 		SOCK_LOCK(so);
 		so->so_options &= ~SO_DONTROUTE;
 		SOCK_UNLOCK(so);
 	}
 	clen = 0;
 	control = NULL;
 	top = NULL;
 out:
 	if (top != NULL)
 		m_freem(top);
 	if (control != NULL)
 		m_freem(control);
 	return (error);
 }
 
 /*
  * Send on a socket.  If send must go all at once and message is larger than
  * send buffering, then hard error.  Lock against other senders.  If must go
  * all at once and not enough room now, then inform user that this would
  * block and do nothing.  Otherwise, if nonblocking, send as much as
  * possible.  The data to be sent is described by "uio" if nonzero, otherwise
  * by the mbuf chain "top" (which must be null if uio is not).  Data provided
  * in mbuf chain must be small enough to send all at once.
  *
  * Returns nonzero on error, timeout or signal; callers must check for short
  * counts if EINTR/ERESTART are returned.  Data and control buffers are freed
  * on return.
  */
 int
 sosend_generic(struct socket *so, struct sockaddr *addr, struct uio *uio,
     struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
 {
 	long space;
 	ssize_t resid;
 	int clen = 0, error, dontroute;
 	int atomic = sosendallatonce(so) || top;
 	int pr_send_flag;
 #ifdef KERN_TLS
 	struct ktls_session *tls;
 	int tls_enq_cnt, tls_send_flag;
 	uint8_t tls_rtype;
 
 	tls = NULL;
 	tls_rtype = TLS_RLTYPE_APP;
 #endif
 	if (uio != NULL)
 		resid = uio->uio_resid;
 	else if ((top->m_flags & M_PKTHDR) != 0)
 		resid = top->m_pkthdr.len;
 	else
 		resid = m_length(top, NULL);
 	/*
 	 * In theory resid should be unsigned.  However, space must be
 	 * signed, as it might be less than 0 if we over-committed, and we
 	 * must use a signed comparison of space and resid.  On the other
 	 * hand, a negative resid causes us to loop sending 0-length
 	 * segments to the protocol.
 	 *
 	 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
 	 * type sockets since that's an error.
 	 */
 	if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
 		error = EINVAL;
 		goto out;
 	}
 
 	dontroute =
 	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
 	    (so->so_proto->pr_flags & PR_ATOMIC);
 	if (td != NULL)
 		td->td_ru.ru_msgsnd++;
 	if (control != NULL)
 		clen = control->m_len;
 
 	error = SOCK_IO_SEND_LOCK(so, SBLOCKWAIT(flags));
 	if (error)
 		goto out;
 
 #ifdef KERN_TLS
 	tls_send_flag = 0;
 	tls = ktls_hold(so->so_snd.sb_tls_info);
 	if (tls != NULL) {
 		if (tls->mode == TCP_TLS_MODE_SW)
 			tls_send_flag = PRUS_NOTREADY;
 
 		if (control != NULL) {
 			struct cmsghdr *cm = mtod(control, struct cmsghdr *);
 
 			if (clen >= sizeof(*cm) &&
 			    cm->cmsg_type == TLS_SET_RECORD_TYPE) {
 				tls_rtype = *((uint8_t *)CMSG_DATA(cm));
 				clen = 0;
 				m_freem(control);
 				control = NULL;
 				atomic = 1;
 			}
 		}
 
 		if (resid == 0 && !ktls_permit_empty_frames(tls)) {
 			error = EINVAL;
 			goto release;
 		}
 	}
 #endif
 
 restart:
 	do {
 		SOCKBUF_LOCK(&so->so_snd);
 		if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
 			SOCKBUF_UNLOCK(&so->so_snd);
 			error = EPIPE;
 			goto release;
 		}
 		if (so->so_error) {
 			error = so->so_error;
 			so->so_error = 0;
 			SOCKBUF_UNLOCK(&so->so_snd);
 			goto release;
 		}
 		if ((so->so_state & SS_ISCONNECTED) == 0) {
 			/*
 			 * `sendto' and `sendmsg' is allowed on a connection-
 			 * based socket if it supports implied connect.
 			 * Return ENOTCONN if not connected and no address is
 			 * supplied.
 			 */
 			if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
 			    (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
 				if ((so->so_state & SS_ISCONFIRMING) == 0 &&
 				    !(resid == 0 && clen != 0)) {
 					SOCKBUF_UNLOCK(&so->so_snd);
 					error = ENOTCONN;
 					goto release;
 				}
 			} else if (addr == NULL) {
 				SOCKBUF_UNLOCK(&so->so_snd);
 				if (so->so_proto->pr_flags & PR_CONNREQUIRED)
 					error = ENOTCONN;
 				else
 					error = EDESTADDRREQ;
 				goto release;
 			}
 		}
 		space = sbspace(&so->so_snd);
 		if (flags & MSG_OOB)
 			space += 1024;
 		if ((atomic && resid > so->so_snd.sb_hiwat) ||
 		    clen > so->so_snd.sb_hiwat) {
 			SOCKBUF_UNLOCK(&so->so_snd);
 			error = EMSGSIZE;
 			goto release;
 		}
 		if (space < resid + clen &&
 		    (atomic || space < so->so_snd.sb_lowat || space < clen)) {
 			if ((so->so_state & SS_NBIO) ||
 			    (flags & (MSG_NBIO | MSG_DONTWAIT)) != 0) {
 				SOCKBUF_UNLOCK(&so->so_snd);
 				error = EWOULDBLOCK;
 				goto release;
 			}
 			error = sbwait(so, SO_SND);
 			SOCKBUF_UNLOCK(&so->so_snd);
 			if (error)
 				goto release;
 			goto restart;
 		}
 		SOCKBUF_UNLOCK(&so->so_snd);
 		space -= clen;
 		do {
 			if (uio == NULL) {
 				resid = 0;
 				if (flags & MSG_EOR)
 					top->m_flags |= M_EOR;
 #ifdef KERN_TLS
 				if (tls != NULL) {
 					ktls_frame(top, tls, &tls_enq_cnt,
 					    tls_rtype);
 					tls_rtype = TLS_RLTYPE_APP;
 				}
 #endif
 			} else {
 				/*
 				 * Copy the data from userland into a mbuf
 				 * chain.  If resid is 0, which can happen
 				 * only if we have control to send, then
 				 * a single empty mbuf is returned.  This
 				 * is a workaround to prevent protocol send
 				 * methods to panic.
 				 */
 #ifdef KERN_TLS
 				if (tls != NULL) {
 					top = m_uiotombuf(uio, M_WAITOK, space,
 					    tls->params.max_frame_len,
 					    M_EXTPG |
 					    ((flags & MSG_EOR) ? M_EOR : 0));
 					if (top != NULL) {
 						ktls_frame(top, tls,
 						    &tls_enq_cnt, tls_rtype);
 					}
 					tls_rtype = TLS_RLTYPE_APP;
 				} else
 #endif
 					top = m_uiotombuf(uio, M_WAITOK, space,
 					    (atomic ? max_hdr : 0),
 					    (atomic ? M_PKTHDR : 0) |
 					    ((flags & MSG_EOR) ? M_EOR : 0));
 				if (top == NULL) {
 					error = EFAULT; /* only possible error */
 					goto release;
 				}
 				space -= resid - uio->uio_resid;
 				resid = uio->uio_resid;
 			}
 			if (dontroute) {
 				SOCK_LOCK(so);
 				so->so_options |= SO_DONTROUTE;
 				SOCK_UNLOCK(so);
 			}
 			/*
 			 * XXX all the SBS_CANTSENDMORE checks previously
 			 * done could be out of date.  We could have received
 			 * a reset packet in an interrupt or maybe we slept
 			 * while doing page faults in uiomove() etc.  We
 			 * could probably recheck again inside the locking
 			 * protection here, but there are probably other
 			 * places that this also happens.  We must rethink
 			 * this.
 			 */
 			VNET_SO_ASSERT(so);
 
 			pr_send_flag = (flags & MSG_OOB) ? PRUS_OOB :
 			/*
 			 * If the user set MSG_EOF, the protocol understands
 			 * this flag and nothing left to send then use
 			 * PRU_SEND_EOF instead of PRU_SEND.
 			 */
 			    ((flags & MSG_EOF) &&
 			     (so->so_proto->pr_flags & PR_IMPLOPCL) &&
 			     (resid <= 0)) ?
 				PRUS_EOF :
 			/* If there is more to send set PRUS_MORETOCOME. */
 			    (flags & MSG_MORETOCOME) ||
 			    (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0;
 
 #ifdef KERN_TLS
 			pr_send_flag |= tls_send_flag;
 #endif
 
 			error = so->so_proto->pr_send(so, pr_send_flag, top,
 			    addr, control, td);
 
 			if (dontroute) {
 				SOCK_LOCK(so);
 				so->so_options &= ~SO_DONTROUTE;
 				SOCK_UNLOCK(so);
 			}
 
 #ifdef KERN_TLS
 			if (tls != NULL && tls->mode == TCP_TLS_MODE_SW) {
 				if (error != 0) {
 					m_freem(top);
 					top = NULL;
 				} else {
 					soref(so);
 					ktls_enqueue(top, so, tls_enq_cnt);
 				}
 			}
 #endif
 			clen = 0;
 			control = NULL;
 			top = NULL;
 			if (error)
 				goto release;
 		} while (resid && space > 0);
 	} while (resid);
 
 release:
 	SOCK_IO_SEND_UNLOCK(so);
 out:
 #ifdef KERN_TLS
 	if (tls != NULL)
 		ktls_free(tls);
 #endif
 	if (top != NULL)
 		m_freem(top);
 	if (control != NULL)
 		m_freem(control);
 	return (error);
 }
 
 int
 sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
     struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
 {
 	int error;
 
 	CURVNET_SET(so->so_vnet);
 	error = so->so_proto->pr_sosend(so, addr, uio,
 	    top, control, flags, td);
 	CURVNET_RESTORE();
 	return (error);
 }
 
 /*
  * The part of soreceive() that implements reading non-inline out-of-band
  * data from a socket.  For more complete comments, see soreceive(), from
  * which this code originated.
  *
  * Note that soreceive_rcvoob(), unlike the remainder of soreceive(), is
  * unable to return an mbuf chain to the caller.
  */
 static int
 soreceive_rcvoob(struct socket *so, struct uio *uio, int flags)
 {
 	struct protosw *pr = so->so_proto;
 	struct mbuf *m;
 	int error;
 
 	KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0"));
 	VNET_SO_ASSERT(so);
 
 	m = m_get(M_WAITOK, MT_DATA);
 	error = pr->pr_rcvoob(so, m, flags & MSG_PEEK);
 	if (error)
 		goto bad;
 	do {
 		error = uiomove(mtod(m, void *),
 		    (int) min(uio->uio_resid, m->m_len), uio);
 		m = m_free(m);
 	} while (uio->uio_resid && error == 0 && m);
 bad:
 	if (m != NULL)
 		m_freem(m);
 	return (error);
 }
 
 /*
  * Following replacement or removal of the first mbuf on the first mbuf chain
  * of a socket buffer, push necessary state changes back into the socket
  * buffer so that other consumers see the values consistently.  'nextrecord'
  * is the callers locally stored value of the original value of
  * sb->sb_mb->m_nextpkt which must be restored when the lead mbuf changes.
  * NOTE: 'nextrecord' may be NULL.
  */
 static __inline void
 sockbuf_pushsync(struct sockbuf *sb, struct mbuf *nextrecord)
 {
 
 	SOCKBUF_LOCK_ASSERT(sb);
 	/*
 	 * First, update for the new value of nextrecord.  If necessary, make
 	 * it the first record.
 	 */
 	if (sb->sb_mb != NULL)
 		sb->sb_mb->m_nextpkt = nextrecord;
 	else
 		sb->sb_mb = nextrecord;
 
 	/*
 	 * Now update any dependent socket buffer fields to reflect the new
 	 * state.  This is an expanded inline of SB_EMPTY_FIXUP(), with the
 	 * addition of a second clause that takes care of the case where
 	 * sb_mb has been updated, but remains the last record.
 	 */
 	if (sb->sb_mb == NULL) {
 		sb->sb_mbtail = NULL;
 		sb->sb_lastrecord = NULL;
 	} else if (sb->sb_mb->m_nextpkt == NULL)
 		sb->sb_lastrecord = sb->sb_mb;
 }
 
 /*
  * Implement receive operations on a socket.  We depend on the way that
  * records are added to the sockbuf by sbappend.  In particular, each record
  * (mbufs linked through m_next) must begin with an address if the protocol
  * so specifies, followed by an optional mbuf or mbufs containing ancillary
  * data, and then zero or more mbufs of data.  In order to allow parallelism
  * between network receive and copying to user space, as well as avoid
  * sleeping with a mutex held, we release the socket buffer mutex during the
  * user space copy.  Although the sockbuf is locked, new data may still be
  * appended, and thus we must maintain consistency of the sockbuf during that
  * time.
  *
  * The caller may receive the data as a single mbuf chain by supplying an
  * mbuf **mp0 for use in returning the chain.  The uio is then used only for
  * the count in uio_resid.
  */
 int
 soreceive_generic(struct socket *so, struct sockaddr **psa, struct uio *uio,
     struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
 {
 	struct mbuf *m, **mp;
 	int flags, error, offset;
 	ssize_t len;
 	struct protosw *pr = so->so_proto;
 	struct mbuf *nextrecord;
 	int moff, type = 0;
 	ssize_t orig_resid = uio->uio_resid;
 	bool report_real_len = false;
 
 	mp = mp0;
 	if (psa != NULL)
 		*psa = NULL;
 	if (controlp != NULL)
 		*controlp = NULL;
 	if (flagsp != NULL) {
 		report_real_len = *flagsp & MSG_TRUNC;
 		*flagsp &= ~MSG_TRUNC;
 		flags = *flagsp &~ MSG_EOR;
 	} else
 		flags = 0;
 	if (flags & MSG_OOB)
 		return (soreceive_rcvoob(so, uio, flags));
 	if (mp != NULL)
 		*mp = NULL;
 	if ((pr->pr_flags & PR_WANTRCVD) && (so->so_state & SS_ISCONFIRMING)
 	    && uio->uio_resid) {
 		VNET_SO_ASSERT(so);
 		pr->pr_rcvd(so, 0);
 	}
 
 	error = SOCK_IO_RECV_LOCK(so, SBLOCKWAIT(flags));
 	if (error)
 		return (error);
 
 restart:
 	SOCKBUF_LOCK(&so->so_rcv);
 	m = so->so_rcv.sb_mb;
 	/*
 	 * If we have less data than requested, block awaiting more (subject
 	 * to any timeout) if:
 	 *   1. the current count is less than the low water mark, or
 	 *   2. MSG_DONTWAIT is not set
 	 */
 	if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
 	    sbavail(&so->so_rcv) < uio->uio_resid) &&
 	    sbavail(&so->so_rcv) < so->so_rcv.sb_lowat &&
 	    m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
 		KASSERT(m != NULL || !sbavail(&so->so_rcv),
 		    ("receive: m == %p sbavail == %u",
 		    m, sbavail(&so->so_rcv)));
 		if (so->so_error || so->so_rerror) {
 			if (m != NULL)
 				goto dontblock;
 			if (so->so_error)
 				error = so->so_error;
 			else
 				error = so->so_rerror;
 			if ((flags & MSG_PEEK) == 0) {
 				if (so->so_error)
 					so->so_error = 0;
 				else
 					so->so_rerror = 0;
 			}
 			SOCKBUF_UNLOCK(&so->so_rcv);
 			goto release;
 		}
 		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
 		if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
 			if (m != NULL)
 				goto dontblock;
 #ifdef KERN_TLS
 			else if (so->so_rcv.sb_tlsdcc == 0 &&
 			    so->so_rcv.sb_tlscc == 0) {
 #else
 			else {
 #endif
 				SOCKBUF_UNLOCK(&so->so_rcv);
 				goto release;
 			}
 		}
 		for (; m != NULL; m = m->m_next)
 			if (m->m_type == MT_OOBDATA  || (m->m_flags & M_EOR)) {
 				m = so->so_rcv.sb_mb;
 				goto dontblock;
 			}
 		if ((so->so_state & (SS_ISCONNECTING | SS_ISCONNECTED |
 		    SS_ISDISCONNECTING | SS_ISDISCONNECTED)) == 0 &&
 		    (so->so_proto->pr_flags & PR_CONNREQUIRED) != 0) {
 			SOCKBUF_UNLOCK(&so->so_rcv);
 			error = ENOTCONN;
 			goto release;
 		}
 		if (uio->uio_resid == 0 && !report_real_len) {
 			SOCKBUF_UNLOCK(&so->so_rcv);
 			goto release;
 		}
 		if ((so->so_state & SS_NBIO) ||
 		    (flags & (MSG_DONTWAIT|MSG_NBIO))) {
 			SOCKBUF_UNLOCK(&so->so_rcv);
 			error = EWOULDBLOCK;
 			goto release;
 		}
 		SBLASTRECORDCHK(&so->so_rcv);
 		SBLASTMBUFCHK(&so->so_rcv);
 		error = sbwait(so, SO_RCV);
 		SOCKBUF_UNLOCK(&so->so_rcv);
 		if (error)
 			goto release;
 		goto restart;
 	}
 dontblock:
 	/*
 	 * From this point onward, we maintain 'nextrecord' as a cache of the
 	 * pointer to the next record in the socket buffer.  We must keep the
 	 * various socket buffer pointers and local stack versions of the
 	 * pointers in sync, pushing out modifications before dropping the
 	 * socket buffer mutex, and re-reading them when picking it up.
 	 *
 	 * Otherwise, we will race with the network stack appending new data
 	 * or records onto the socket buffer by using inconsistent/stale
 	 * versions of the field, possibly resulting in socket buffer
 	 * corruption.
 	 *
 	 * By holding the high-level sblock(), we prevent simultaneous
 	 * readers from pulling off the front of the socket buffer.
 	 */
 	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
 	if (uio->uio_td)
 		uio->uio_td->td_ru.ru_msgrcv++;
 	KASSERT(m == so->so_rcv.sb_mb, ("soreceive: m != so->so_rcv.sb_mb"));
 	SBLASTRECORDCHK(&so->so_rcv);
 	SBLASTMBUFCHK(&so->so_rcv);
 	nextrecord = m->m_nextpkt;
 	if (pr->pr_flags & PR_ADDR) {
 		KASSERT(m->m_type == MT_SONAME,
 		    ("m->m_type == %d", m->m_type));
 		orig_resid = 0;
 		if (psa != NULL)
 			*psa = sodupsockaddr(mtod(m, struct sockaddr *),
 			    M_NOWAIT);
 		if (flags & MSG_PEEK) {
 			m = m->m_next;
 		} else {
 			sbfree(&so->so_rcv, m);
 			so->so_rcv.sb_mb = m_free(m);
 			m = so->so_rcv.sb_mb;
 			sockbuf_pushsync(&so->so_rcv, nextrecord);
 		}
 	}
 
 	/*
 	 * Process one or more MT_CONTROL mbufs present before any data mbufs
 	 * in the first mbuf chain on the socket buffer.  If MSG_PEEK, we
 	 * just copy the data; if !MSG_PEEK, we call into the protocol to
 	 * perform externalization (or freeing if controlp == NULL).
 	 */
 	if (m != NULL && m->m_type == MT_CONTROL) {
 		struct mbuf *cm = NULL, *cmn;
 		struct mbuf **cme = &cm;
 #ifdef KERN_TLS
 		struct cmsghdr *cmsg;
 		struct tls_get_record tgr;
 
 		/*
 		 * For MSG_TLSAPPDATA, check for an alert record.
 		 * If found, return ENXIO without removing
 		 * it from the receive queue.  This allows a subsequent
 		 * call without MSG_TLSAPPDATA to receive it.
 		 * Note that, for TLS, there should only be a single
 		 * control mbuf with the TLS_GET_RECORD message in it.
 		 */
 		if (flags & MSG_TLSAPPDATA) {
 			cmsg = mtod(m, struct cmsghdr *);
 			if (cmsg->cmsg_type == TLS_GET_RECORD &&
 			    cmsg->cmsg_len == CMSG_LEN(sizeof(tgr))) {
 				memcpy(&tgr, CMSG_DATA(cmsg), sizeof(tgr));
 				if (__predict_false(tgr.tls_type ==
 				    TLS_RLTYPE_ALERT)) {
 					SOCKBUF_UNLOCK(&so->so_rcv);
 					error = ENXIO;
 					goto release;
 				}
 			}
 		}
 #endif
 
 		do {
 			if (flags & MSG_PEEK) {
 				if (controlp != NULL) {
 					*controlp = m_copym(m, 0, m->m_len,
 					    M_NOWAIT);
 					controlp = &(*controlp)->m_next;
 				}
 				m = m->m_next;
 			} else {
 				sbfree(&so->so_rcv, m);
 				so->so_rcv.sb_mb = m->m_next;
 				m->m_next = NULL;
 				*cme = m;
 				cme = &(*cme)->m_next;
 				m = so->so_rcv.sb_mb;
 			}
 		} while (m != NULL && m->m_type == MT_CONTROL);
 		if ((flags & MSG_PEEK) == 0)
 			sockbuf_pushsync(&so->so_rcv, nextrecord);
 		while (cm != NULL) {
 			cmn = cm->m_next;
 			cm->m_next = NULL;
 			if (pr->pr_domain->dom_externalize != NULL) {
 				SOCKBUF_UNLOCK(&so->so_rcv);
 				VNET_SO_ASSERT(so);
 				error = (*pr->pr_domain->dom_externalize)
 				    (cm, controlp, flags);
 				SOCKBUF_LOCK(&so->so_rcv);
 			} else if (controlp != NULL)
 				*controlp = cm;
 			else
 				m_freem(cm);
 			if (controlp != NULL) {
 				while (*controlp != NULL)
 					controlp = &(*controlp)->m_next;
 			}
 			cm = cmn;
 		}
 		if (m != NULL)
 			nextrecord = so->so_rcv.sb_mb->m_nextpkt;
 		else
 			nextrecord = so->so_rcv.sb_mb;
 		orig_resid = 0;
 	}
 	if (m != NULL) {
 		if ((flags & MSG_PEEK) == 0) {
 			KASSERT(m->m_nextpkt == nextrecord,
 			    ("soreceive: post-control, nextrecord !sync"));
 			if (nextrecord == NULL) {
 				KASSERT(so->so_rcv.sb_mb == m,
 				    ("soreceive: post-control, sb_mb!=m"));
 				KASSERT(so->so_rcv.sb_lastrecord == m,
 				    ("soreceive: post-control, lastrecord!=m"));
 			}
 		}
 		type = m->m_type;
 		if (type == MT_OOBDATA)
 			flags |= MSG_OOB;
 	} else {
 		if ((flags & MSG_PEEK) == 0) {
 			KASSERT(so->so_rcv.sb_mb == nextrecord,
 			    ("soreceive: sb_mb != nextrecord"));
 			if (so->so_rcv.sb_mb == NULL) {
 				KASSERT(so->so_rcv.sb_lastrecord == NULL,
 				    ("soreceive: sb_lastercord != NULL"));
 			}
 		}
 	}
 	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
 	SBLASTRECORDCHK(&so->so_rcv);
 	SBLASTMBUFCHK(&so->so_rcv);
 
 	/*
 	 * Now continue to read any data mbufs off of the head of the socket
 	 * buffer until the read request is satisfied.  Note that 'type' is
 	 * used to store the type of any mbuf reads that have happened so far
 	 * such that soreceive() can stop reading if the type changes, which
 	 * causes soreceive() to return only one of regular data and inline
 	 * out-of-band data in a single socket receive operation.
 	 */
 	moff = 0;
 	offset = 0;
 	while (m != NULL && !(m->m_flags & M_NOTAVAIL) && uio->uio_resid > 0
 	    && error == 0) {
 		/*
 		 * If the type of mbuf has changed since the last mbuf
 		 * examined ('type'), end the receive operation.
 		 */
 		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
 		if (m->m_type == MT_OOBDATA || m->m_type == MT_CONTROL) {
 			if (type != m->m_type)
 				break;
 		} else if (type == MT_OOBDATA)
 			break;
 		else
 		    KASSERT(m->m_type == MT_DATA,
 			("m->m_type == %d", m->m_type));
 		so->so_rcv.sb_state &= ~SBS_RCVATMARK;
 		len = uio->uio_resid;
 		if (so->so_oobmark && len > so->so_oobmark - offset)
 			len = so->so_oobmark - offset;
 		if (len > m->m_len - moff)
 			len = m->m_len - moff;
 		/*
 		 * If mp is set, just pass back the mbufs.  Otherwise copy
 		 * them out via the uio, then free.  Sockbuf must be
 		 * consistent here (points to current mbuf, it points to next
 		 * record) when we drop priority; we must note any additions
 		 * to the sockbuf when we block interrupts again.
 		 */
 		if (mp == NULL) {
 			SOCKBUF_LOCK_ASSERT(&so->so_rcv);
 			SBLASTRECORDCHK(&so->so_rcv);
 			SBLASTMBUFCHK(&so->so_rcv);
 			SOCKBUF_UNLOCK(&so->so_rcv);
 			if ((m->m_flags & M_EXTPG) != 0)
 				error = m_unmapped_uiomove(m, moff, uio,
 				    (int)len);
 			else
 				error = uiomove(mtod(m, char *) + moff,
 				    (int)len, uio);
 			SOCKBUF_LOCK(&so->so_rcv);
 			if (error) {
 				/*
 				 * The MT_SONAME mbuf has already been removed
 				 * from the record, so it is necessary to
 				 * remove the data mbufs, if any, to preserve
 				 * the invariant in the case of PR_ADDR that
 				 * requires MT_SONAME mbufs at the head of
 				 * each record.
 				 */
 				if (pr->pr_flags & PR_ATOMIC &&
 				    ((flags & MSG_PEEK) == 0))
 					(void)sbdroprecord_locked(&so->so_rcv);
 				SOCKBUF_UNLOCK(&so->so_rcv);
 				goto release;
 			}
 		} else
 			uio->uio_resid -= len;
 		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
 		if (len == m->m_len - moff) {
 			if (m->m_flags & M_EOR)
 				flags |= MSG_EOR;
 			if (flags & MSG_PEEK) {
 				m = m->m_next;
 				moff = 0;
 			} else {
 				nextrecord = m->m_nextpkt;
 				sbfree(&so->so_rcv, m);
 				if (mp != NULL) {
 					m->m_nextpkt = NULL;
 					*mp = m;
 					mp = &m->m_next;
 					so->so_rcv.sb_mb = m = m->m_next;
 					*mp = NULL;
 				} else {
 					so->so_rcv.sb_mb = m_free(m);
 					m = so->so_rcv.sb_mb;
 				}
 				sockbuf_pushsync(&so->so_rcv, nextrecord);
 				SBLASTRECORDCHK(&so->so_rcv);
 				SBLASTMBUFCHK(&so->so_rcv);
 			}
 		} else {
 			if (flags & MSG_PEEK)
 				moff += len;
 			else {
 				if (mp != NULL) {
 					if (flags & MSG_DONTWAIT) {
 						*mp = m_copym(m, 0, len,
 						    M_NOWAIT);
 						if (*mp == NULL) {
 							/*
 							 * m_copym() couldn't
 							 * allocate an mbuf.
 							 * Adjust uio_resid back
 							 * (it was adjusted
 							 * down by len bytes,
 							 * which we didn't end
 							 * up "copying" over).
 							 */
 							uio->uio_resid += len;
 							break;
 						}
 					} else {
 						SOCKBUF_UNLOCK(&so->so_rcv);
 						*mp = m_copym(m, 0, len,
 						    M_WAITOK);
 						SOCKBUF_LOCK(&so->so_rcv);
 					}
 				}
 				sbcut_locked(&so->so_rcv, len);
 			}
 		}
 		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
 		if (so->so_oobmark) {
 			if ((flags & MSG_PEEK) == 0) {
 				so->so_oobmark -= len;
 				if (so->so_oobmark == 0) {
 					so->so_rcv.sb_state |= SBS_RCVATMARK;
 					break;
 				}
 			} else {
 				offset += len;
 				if (offset == so->so_oobmark)
 					break;
 			}
 		}
 		if (flags & MSG_EOR)
 			break;
 		/*
 		 * If the MSG_WAITALL flag is set (for non-atomic socket), we
 		 * must not quit until "uio->uio_resid == 0" or an error
 		 * termination.  If a signal/timeout occurs, return with a
 		 * short count but without error.  Keep sockbuf locked
 		 * against other readers.
 		 */
 		while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 &&
 		    !sosendallatonce(so) && nextrecord == NULL) {
 			SOCKBUF_LOCK_ASSERT(&so->so_rcv);
 			if (so->so_error || so->so_rerror ||
 			    so->so_rcv.sb_state & SBS_CANTRCVMORE)
 				break;
 			/*
 			 * Notify the protocol that some data has been
 			 * drained before blocking.
 			 */
 			if (pr->pr_flags & PR_WANTRCVD) {
 				SOCKBUF_UNLOCK(&so->so_rcv);
 				VNET_SO_ASSERT(so);
 				pr->pr_rcvd(so, flags);
 				SOCKBUF_LOCK(&so->so_rcv);
 			}
 			SBLASTRECORDCHK(&so->so_rcv);
 			SBLASTMBUFCHK(&so->so_rcv);
 			/*
 			 * We could receive some data while was notifying
 			 * the protocol. Skip blocking in this case.
 			 */
 			if (so->so_rcv.sb_mb == NULL) {
 				error = sbwait(so, SO_RCV);
 				if (error) {
 					SOCKBUF_UNLOCK(&so->so_rcv);
 					goto release;
 				}
 			}
 			m = so->so_rcv.sb_mb;
 			if (m != NULL)
 				nextrecord = m->m_nextpkt;
 		}
 	}
 
 	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
 	if (m != NULL && pr->pr_flags & PR_ATOMIC) {
 		if (report_real_len)
 			uio->uio_resid -= m_length(m, NULL) - moff;
 		flags |= MSG_TRUNC;
 		if ((flags & MSG_PEEK) == 0)
 			(void) sbdroprecord_locked(&so->so_rcv);
 	}
 	if ((flags & MSG_PEEK) == 0) {
 		if (m == NULL) {
 			/*
 			 * First part is an inline SB_EMPTY_FIXUP().  Second
 			 * part makes sure sb_lastrecord is up-to-date if
 			 * there is still data in the socket buffer.
 			 */
 			so->so_rcv.sb_mb = nextrecord;
 			if (so->so_rcv.sb_mb == NULL) {
 				so->so_rcv.sb_mbtail = NULL;
 				so->so_rcv.sb_lastrecord = NULL;
 			} else if (nextrecord->m_nextpkt == NULL)
 				so->so_rcv.sb_lastrecord = nextrecord;
 		}
 		SBLASTRECORDCHK(&so->so_rcv);
 		SBLASTMBUFCHK(&so->so_rcv);
 		/*
 		 * If soreceive() is being done from the socket callback,
 		 * then don't need to generate ACK to peer to update window,
 		 * since ACK will be generated on return to TCP.
 		 */
 		if (!(flags & MSG_SOCALLBCK) &&
 		    (pr->pr_flags & PR_WANTRCVD)) {
 			SOCKBUF_UNLOCK(&so->so_rcv);
 			VNET_SO_ASSERT(so);
 			pr->pr_rcvd(so, flags);
 			SOCKBUF_LOCK(&so->so_rcv);
 		}
 	}
 	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
 	if (orig_resid == uio->uio_resid && orig_resid &&
 	    (flags & MSG_EOR) == 0 && (so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) {
 		SOCKBUF_UNLOCK(&so->so_rcv);
 		goto restart;
 	}
 	SOCKBUF_UNLOCK(&so->so_rcv);
 
 	if (flagsp != NULL)
 		*flagsp |= flags;
 release:
 	SOCK_IO_RECV_UNLOCK(so);
 	return (error);
 }
 
 /*
  * Optimized version of soreceive() for stream (TCP) sockets.
  */
 int
 soreceive_stream(struct socket *so, struct sockaddr **psa, struct uio *uio,
     struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
 {
 	int len = 0, error = 0, flags, oresid;
 	struct sockbuf *sb;
 	struct mbuf *m, *n = NULL;
 
 	/* We only do stream sockets. */
 	if (so->so_type != SOCK_STREAM)
 		return (EINVAL);
 	if (psa != NULL)
 		*psa = NULL;
 	if (flagsp != NULL)
 		flags = *flagsp &~ MSG_EOR;
 	else
 		flags = 0;
 	if (controlp != NULL)
 		*controlp = NULL;
 	if (flags & MSG_OOB)
 		return (soreceive_rcvoob(so, uio, flags));
 	if (mp0 != NULL)
 		*mp0 = NULL;
 
 	sb = &so->so_rcv;
 
 #ifdef KERN_TLS
 	/*
 	 * KTLS store TLS records as records with a control message to
 	 * describe the framing.
 	 *
 	 * We check once here before acquiring locks to optimize the
 	 * common case.
 	 */
 	if (sb->sb_tls_info != NULL)
 		return (soreceive_generic(so, psa, uio, mp0, controlp,
 		    flagsp));
 #endif
 
 	/* Prevent other readers from entering the socket. */
 	error = SOCK_IO_RECV_LOCK(so, SBLOCKWAIT(flags));
 	if (error)
 		return (error);
 	SOCKBUF_LOCK(sb);
 
 #ifdef KERN_TLS
 	if (sb->sb_tls_info != NULL) {
 		SOCKBUF_UNLOCK(sb);
 		SOCK_IO_RECV_UNLOCK(so);
 		return (soreceive_generic(so, psa, uio, mp0, controlp,
 		    flagsp));
 	}
 #endif
 
 	/* Easy one, no space to copyout anything. */
 	if (uio->uio_resid == 0) {
 		error = EINVAL;
 		goto out;
 	}
 	oresid = uio->uio_resid;
 
 	/* We will never ever get anything unless we are or were connected. */
 	if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) {
 		error = ENOTCONN;
 		goto out;
 	}
 
 restart:
 	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
 
 	/* Abort if socket has reported problems. */
 	if (so->so_error) {
 		if (sbavail(sb) > 0)
 			goto deliver;
 		if (oresid > uio->uio_resid)
 			goto out;
 		error = so->so_error;
 		if (!(flags & MSG_PEEK))
 			so->so_error = 0;
 		goto out;
 	}
 
 	/* Door is closed.  Deliver what is left, if any. */
 	if (sb->sb_state & SBS_CANTRCVMORE) {
 		if (sbavail(sb) > 0)
 			goto deliver;
 		else
 			goto out;
 	}
 
 	/* Socket buffer is empty and we shall not block. */
 	if (sbavail(sb) == 0 &&
 	    ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)))) {
 		error = EAGAIN;
 		goto out;
 	}
 
 	/* Socket buffer got some data that we shall deliver now. */
 	if (sbavail(sb) > 0 && !(flags & MSG_WAITALL) &&
 	    ((so->so_state & SS_NBIO) ||
 	     (flags & (MSG_DONTWAIT|MSG_NBIO)) ||
 	     sbavail(sb) >= sb->sb_lowat ||
 	     sbavail(sb) >= uio->uio_resid ||
 	     sbavail(sb) >= sb->sb_hiwat) ) {
 		goto deliver;
 	}
 
 	/* On MSG_WAITALL we must wait until all data or error arrives. */
 	if ((flags & MSG_WAITALL) &&
 	    (sbavail(sb) >= uio->uio_resid || sbavail(sb) >= sb->sb_hiwat))
 		goto deliver;
 
 	/*
 	 * Wait and block until (more) data comes in.
 	 * NB: Drops the sockbuf lock during wait.
 	 */
 	error = sbwait(so, SO_RCV);
 	if (error)
 		goto out;
 	goto restart;
 
 deliver:
 	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
 	KASSERT(sbavail(sb) > 0, ("%s: sockbuf empty", __func__));
 	KASSERT(sb->sb_mb != NULL, ("%s: sb_mb == NULL", __func__));
 
 	/* Statistics. */
 	if (uio->uio_td)
 		uio->uio_td->td_ru.ru_msgrcv++;
 
 	/* Fill uio until full or current end of socket buffer is reached. */
 	len = min(uio->uio_resid, sbavail(sb));
 	if (mp0 != NULL) {
 		/* Dequeue as many mbufs as possible. */
 		if (!(flags & MSG_PEEK) && len >= sb->sb_mb->m_len) {
 			if (*mp0 == NULL)
 				*mp0 = sb->sb_mb;
 			else
 				m_cat(*mp0, sb->sb_mb);
 			for (m = sb->sb_mb;
 			     m != NULL && m->m_len <= len;
 			     m = m->m_next) {
 				KASSERT(!(m->m_flags & M_NOTAVAIL),
 				    ("%s: m %p not available", __func__, m));
 				len -= m->m_len;
 				uio->uio_resid -= m->m_len;
 				sbfree(sb, m);
 				n = m;
 			}
 			n->m_next = NULL;
 			sb->sb_mb = m;
 			sb->sb_lastrecord = sb->sb_mb;
 			if (sb->sb_mb == NULL)
 				SB_EMPTY_FIXUP(sb);
 		}
 		/* Copy the remainder. */
 		if (len > 0) {
 			KASSERT(sb->sb_mb != NULL,
 			    ("%s: len > 0 && sb->sb_mb empty", __func__));
 
 			m = m_copym(sb->sb_mb, 0, len, M_NOWAIT);
 			if (m == NULL)
 				len = 0;	/* Don't flush data from sockbuf. */
 			else
 				uio->uio_resid -= len;
 			if (*mp0 != NULL)
 				m_cat(*mp0, m);
 			else
 				*mp0 = m;
 			if (*mp0 == NULL) {
 				error = ENOBUFS;
 				goto out;
 			}
 		}
 	} else {
 		/* NB: Must unlock socket buffer as uiomove may sleep. */
 		SOCKBUF_UNLOCK(sb);
 		error = m_mbuftouio(uio, sb->sb_mb, len);
 		SOCKBUF_LOCK(sb);
 		if (error)
 			goto out;
 	}
 	SBLASTRECORDCHK(sb);
 	SBLASTMBUFCHK(sb);
 
 	/*
 	 * Remove the delivered data from the socket buffer unless we
 	 * were only peeking.
 	 */
 	if (!(flags & MSG_PEEK)) {
 		if (len > 0)
 			sbdrop_locked(sb, len);
 
 		/* Notify protocol that we drained some data. */
 		if ((so->so_proto->pr_flags & PR_WANTRCVD) &&
 		    (((flags & MSG_WAITALL) && uio->uio_resid > 0) ||
 		     !(flags & MSG_SOCALLBCK))) {
 			SOCKBUF_UNLOCK(sb);
 			VNET_SO_ASSERT(so);
 			so->so_proto->pr_rcvd(so, flags);
 			SOCKBUF_LOCK(sb);
 		}
 	}
 
 	/*
 	 * For MSG_WAITALL we may have to loop again and wait for
 	 * more data to come in.
 	 */
 	if ((flags & MSG_WAITALL) && uio->uio_resid > 0)
 		goto restart;
 out:
 	SBLASTRECORDCHK(sb);
 	SBLASTMBUFCHK(sb);
 	SOCKBUF_UNLOCK(sb);
 	SOCK_IO_RECV_UNLOCK(so);
 	return (error);
 }
 
 /*
  * Optimized version of soreceive() for simple datagram cases from userspace.
  * Unlike in the stream case, we're able to drop a datagram if copyout()
  * fails, and because we handle datagrams atomically, we don't need to use a
  * sleep lock to prevent I/O interlacing.
  */
 int
 soreceive_dgram(struct socket *so, struct sockaddr **psa, struct uio *uio,
     struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
 {
 	struct mbuf *m, *m2;
 	int flags, error;
 	ssize_t len;
 	struct protosw *pr = so->so_proto;
 	struct mbuf *nextrecord;
 
 	if (psa != NULL)
 		*psa = NULL;
 	if (controlp != NULL)
 		*controlp = NULL;
 	if (flagsp != NULL)
 		flags = *flagsp &~ MSG_EOR;
 	else
 		flags = 0;
 
 	/*
 	 * For any complicated cases, fall back to the full
 	 * soreceive_generic().
 	 */
 	if (mp0 != NULL || (flags & (MSG_PEEK | MSG_OOB | MSG_TRUNC)))
 		return (soreceive_generic(so, psa, uio, mp0, controlp,
 		    flagsp));
 
 	/*
 	 * Enforce restrictions on use.
 	 */
 	KASSERT((pr->pr_flags & PR_WANTRCVD) == 0,
 	    ("soreceive_dgram: wantrcvd"));
 	KASSERT(pr->pr_flags & PR_ATOMIC, ("soreceive_dgram: !atomic"));
 	KASSERT((so->so_rcv.sb_state & SBS_RCVATMARK) == 0,
 	    ("soreceive_dgram: SBS_RCVATMARK"));
 	KASSERT((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0,
 	    ("soreceive_dgram: P_CONNREQUIRED"));
 
 	/*
 	 * Loop blocking while waiting for a datagram.
 	 */
 	SOCKBUF_LOCK(&so->so_rcv);
 	while ((m = so->so_rcv.sb_mb) == NULL) {
 		KASSERT(sbavail(&so->so_rcv) == 0,
 		    ("soreceive_dgram: sb_mb NULL but sbavail %u",
 		    sbavail(&so->so_rcv)));
 		if (so->so_error) {
 			error = so->so_error;
 			so->so_error = 0;
 			SOCKBUF_UNLOCK(&so->so_rcv);
 			return (error);
 		}
 		if (so->so_rcv.sb_state & SBS_CANTRCVMORE ||
 		    uio->uio_resid == 0) {
 			SOCKBUF_UNLOCK(&so->so_rcv);
 			return (0);
 		}
 		if ((so->so_state & SS_NBIO) ||
 		    (flags & (MSG_DONTWAIT|MSG_NBIO))) {
 			SOCKBUF_UNLOCK(&so->so_rcv);
 			return (EWOULDBLOCK);
 		}
 		SBLASTRECORDCHK(&so->so_rcv);
 		SBLASTMBUFCHK(&so->so_rcv);
 		error = sbwait(so, SO_RCV);
 		if (error) {
 			SOCKBUF_UNLOCK(&so->so_rcv);
 			return (error);
 		}
 	}
 	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
 
 	if (uio->uio_td)
 		uio->uio_td->td_ru.ru_msgrcv++;
 	SBLASTRECORDCHK(&so->so_rcv);
 	SBLASTMBUFCHK(&so->so_rcv);
 	nextrecord = m->m_nextpkt;
 	if (nextrecord == NULL) {
 		KASSERT(so->so_rcv.sb_lastrecord == m,
 		    ("soreceive_dgram: lastrecord != m"));
 	}
 
 	KASSERT(so->so_rcv.sb_mb->m_nextpkt == nextrecord,
 	    ("soreceive_dgram: m_nextpkt != nextrecord"));
 
 	/*
 	 * Pull 'm' and its chain off the front of the packet queue.
 	 */
 	so->so_rcv.sb_mb = NULL;
 	sockbuf_pushsync(&so->so_rcv, nextrecord);
 
 	/*
 	 * Walk 'm's chain and free that many bytes from the socket buffer.
 	 */
 	for (m2 = m; m2 != NULL; m2 = m2->m_next)
 		sbfree(&so->so_rcv, m2);
 
 	/*
 	 * Do a few last checks before we let go of the lock.
 	 */
 	SBLASTRECORDCHK(&so->so_rcv);
 	SBLASTMBUFCHK(&so->so_rcv);
 	SOCKBUF_UNLOCK(&so->so_rcv);
 
 	if (pr->pr_flags & PR_ADDR) {
 		KASSERT(m->m_type == MT_SONAME,
 		    ("m->m_type == %d", m->m_type));
 		if (psa != NULL)
 			*psa = sodupsockaddr(mtod(m, struct sockaddr *),
 			    M_NOWAIT);
 		m = m_free(m);
 	}
 	if (m == NULL) {
 		/* XXXRW: Can this happen? */
 		return (0);
 	}
 
 	/*
 	 * Packet to copyout() is now in 'm' and it is disconnected from the
 	 * queue.
 	 *
 	 * Process one or more MT_CONTROL mbufs present before any data mbufs
 	 * in the first mbuf chain on the socket buffer.  We call into the
 	 * protocol to perform externalization (or freeing if controlp ==
 	 * NULL). In some cases there can be only MT_CONTROL mbufs without
 	 * MT_DATA mbufs.
 	 */
 	if (m->m_type == MT_CONTROL) {
 		struct mbuf *cm = NULL, *cmn;
 		struct mbuf **cme = &cm;
 
 		do {
 			m2 = m->m_next;
 			m->m_next = NULL;
 			*cme = m;
 			cme = &(*cme)->m_next;
 			m = m2;
 		} while (m != NULL && m->m_type == MT_CONTROL);
 		while (cm != NULL) {
 			cmn = cm->m_next;
 			cm->m_next = NULL;
 			if (pr->pr_domain->dom_externalize != NULL) {
 				error = (*pr->pr_domain->dom_externalize)
 				    (cm, controlp, flags);
 			} else if (controlp != NULL)
 				*controlp = cm;
 			else
 				m_freem(cm);
 			if (controlp != NULL) {
 				while (*controlp != NULL)
 					controlp = &(*controlp)->m_next;
 			}
 			cm = cmn;
 		}
 	}
 	KASSERT(m == NULL || m->m_type == MT_DATA,
 	    ("soreceive_dgram: !data"));
 	while (m != NULL && uio->uio_resid > 0) {
 		len = uio->uio_resid;
 		if (len > m->m_len)
 			len = m->m_len;
 		error = uiomove(mtod(m, char *), (int)len, uio);
 		if (error) {
 			m_freem(m);
 			return (error);
 		}
 		if (len == m->m_len)
 			m = m_free(m);
 		else {
 			m->m_data += len;
 			m->m_len -= len;
 		}
 	}
 	if (m != NULL) {
 		flags |= MSG_TRUNC;
 		m_freem(m);
 	}
 	if (flagsp != NULL)
 		*flagsp |= flags;
 	return (0);
 }
 
 int
 soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
     struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
 {
 	int error;
 
 	CURVNET_SET(so->so_vnet);
 	error = so->so_proto->pr_soreceive(so, psa, uio, mp0, controlp, flagsp);
 	CURVNET_RESTORE();
 	return (error);
 }
 
 int
 soshutdown(struct socket *so, int how)
 {
 	struct protosw *pr;
 	int error, soerror_enotconn;
 
 	if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR))
 		return (EINVAL);
 
 	soerror_enotconn = 0;
 	SOCK_LOCK(so);
 	if ((so->so_state &
 	    (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) == 0) {
 		/*
 		 * POSIX mandates us to return ENOTCONN when shutdown(2) is
 		 * invoked on a datagram sockets, however historically we would
 		 * actually tear socket down. This is known to be leveraged by
 		 * some applications to unblock process waiting in recvXXX(2)
 		 * by other process that it shares that socket with. Try to meet
 		 * both backward-compatibility and POSIX requirements by forcing
 		 * ENOTCONN but still asking protocol to perform pru_shutdown().
 		 */
 		if (so->so_type != SOCK_DGRAM && !SOLISTENING(so)) {
 			SOCK_UNLOCK(so);
 			return (ENOTCONN);
 		}
 		soerror_enotconn = 1;
 	}
 
 	if (SOLISTENING(so)) {
 		if (how != SHUT_WR) {
 			so->so_error = ECONNABORTED;
 			solisten_wakeup(so);	/* unlocks so */
 		} else {
 			SOCK_UNLOCK(so);
 		}
 		goto done;
 	}
 	SOCK_UNLOCK(so);
 
 	CURVNET_SET(so->so_vnet);
 	pr = so->so_proto;
 	if (pr->pr_flush != NULL)
 		pr->pr_flush(so, how);
 	if (how != SHUT_WR)
 		sorflush(so);
 	if (how != SHUT_RD) {
 		error = pr->pr_shutdown(so);
 		wakeup(&so->so_timeo);
 		CURVNET_RESTORE();
 		return ((error == 0 && soerror_enotconn) ? ENOTCONN : error);
 	}
 	wakeup(&so->so_timeo);
 	CURVNET_RESTORE();
 
 done:
 	return (soerror_enotconn ? ENOTCONN : 0);
 }
 
 void
 sorflush(struct socket *so)
 {
 	struct protosw *pr;
 	int error;
 
 	VNET_SO_ASSERT(so);
 
 	/*
 	 * Dislodge threads currently blocked in receive and wait to acquire
 	 * a lock against other simultaneous readers before clearing the
 	 * socket buffer.  Don't let our acquire be interrupted by a signal
 	 * despite any existing socket disposition on interruptable waiting.
 	 */
 	socantrcvmore(so);
 
 	error = SOCK_IO_RECV_LOCK(so, SBL_WAIT | SBL_NOINTR);
 	if (error != 0) {
 		KASSERT(SOLISTENING(so),
 		    ("%s: soiolock(%p) failed", __func__, so));
 		return;
 	}
 
 	pr = so->so_proto;
 	if (pr->pr_flags & PR_RIGHTS) {
 		MPASS(pr->pr_domain->dom_dispose != NULL);
 		(*pr->pr_domain->dom_dispose)(so);
 	} else {
 		sbrelease(so, SO_RCV);
 		SOCK_IO_RECV_UNLOCK(so);
 	}
 
 }
 
 /*
  * Wrapper for Socket established helper hook.
  * Parameters: socket, context of the hook point, hook id.
  */
 static int inline
 hhook_run_socket(struct socket *so, void *hctx, int32_t h_id)
 {
 	struct socket_hhook_data hhook_data = {
 		.so = so,
 		.hctx = hctx,
 		.m = NULL,
 		.status = 0
 	};
 
 	CURVNET_SET(so->so_vnet);
 	HHOOKS_RUN_IF(V_socket_hhh[h_id], &hhook_data, &so->osd);
 	CURVNET_RESTORE();
 
 	/* Ugly but needed, since hhooks return void for now */
 	return (hhook_data.status);
 }
 
 /*
  * Perhaps this routine, and sooptcopyout(), below, ought to come in an
  * additional variant to handle the case where the option value needs to be
  * some kind of integer, but not a specific size.  In addition to their use
  * here, these functions are also called by the protocol-level pr_ctloutput()
  * routines.
  */
 int
 sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
 {
 	size_t	valsize;
 
 	/*
 	 * If the user gives us more than we wanted, we ignore it, but if we
 	 * don't get the minimum length the caller wants, we return EINVAL.
 	 * On success, sopt->sopt_valsize is set to however much we actually
 	 * retrieved.
 	 */
 	if ((valsize = sopt->sopt_valsize) < minlen)
 		return EINVAL;
 	if (valsize > len)
 		sopt->sopt_valsize = valsize = len;
 
 	if (sopt->sopt_td != NULL)
 		return (copyin(sopt->sopt_val, buf, valsize));
 
 	bcopy(sopt->sopt_val, buf, valsize);
 	return (0);
 }
 
 /*
  * Kernel version of setsockopt(2).
  *
  * XXX: optlen is size_t, not socklen_t
  */
 int
 so_setsockopt(struct socket *so, int level, int optname, void *optval,
     size_t optlen)
 {
 	struct sockopt sopt;
 
 	sopt.sopt_level = level;
 	sopt.sopt_name = optname;
 	sopt.sopt_dir = SOPT_SET;
 	sopt.sopt_val = optval;
 	sopt.sopt_valsize = optlen;
 	sopt.sopt_td = NULL;
 	return (sosetopt(so, &sopt));
 }
 
 int
 sosetopt(struct socket *so, struct sockopt *sopt)
 {
 	int	error, optval;
 	struct	linger l;
 	struct	timeval tv;
 	sbintime_t val, *valp;
 	uint32_t val32;
 #ifdef MAC
 	struct mac extmac;
 #endif
 
 	CURVNET_SET(so->so_vnet);
 	error = 0;
 	if (sopt->sopt_level != SOL_SOCKET) {
 		if (so->so_proto->pr_ctloutput != NULL)
 			error = (*so->so_proto->pr_ctloutput)(so, sopt);
 		else
 			error = ENOPROTOOPT;
 	} else {
 		switch (sopt->sopt_name) {
 		case SO_ACCEPTFILTER:
 			error = accept_filt_setopt(so, sopt);
 			if (error)
 				goto bad;
 			break;
 
 		case SO_LINGER:
 			error = sooptcopyin(sopt, &l, sizeof l, sizeof l);
 			if (error)
 				goto bad;
 			if (l.l_linger < 0 ||
 			    l.l_linger > USHRT_MAX ||
 			    l.l_linger > (INT_MAX / hz)) {
 				error = EDOM;
 				goto bad;
 			}
 			SOCK_LOCK(so);
 			so->so_linger = l.l_linger;
 			if (l.l_onoff)
 				so->so_options |= SO_LINGER;
 			else
 				so->so_options &= ~SO_LINGER;
 			SOCK_UNLOCK(so);
 			break;
 
 		case SO_DEBUG:
 		case SO_KEEPALIVE:
 		case SO_DONTROUTE:
 		case SO_USELOOPBACK:
 		case SO_BROADCAST:
 		case SO_REUSEADDR:
 		case SO_REUSEPORT:
 		case SO_REUSEPORT_LB:
 		case SO_OOBINLINE:
 		case SO_TIMESTAMP:
 		case SO_BINTIME:
 		case SO_NOSIGPIPE:
 		case SO_NO_DDP:
 		case SO_NO_OFFLOAD:
 		case SO_RERROR:
 			error = sooptcopyin(sopt, &optval, sizeof optval,
 			    sizeof optval);
 			if (error)
 				goto bad;
 			SOCK_LOCK(so);
 			if (optval)
 				so->so_options |= sopt->sopt_name;
 			else
 				so->so_options &= ~sopt->sopt_name;
 			SOCK_UNLOCK(so);
 			break;
 
 		case SO_SETFIB:
 			error = sooptcopyin(sopt, &optval, sizeof optval,
 			    sizeof optval);
 			if (error)
 				goto bad;
 
 			if (optval < 0 || optval >= rt_numfibs) {
 				error = EINVAL;
 				goto bad;
 			}
 			if (((so->so_proto->pr_domain->dom_family == PF_INET) ||
 			   (so->so_proto->pr_domain->dom_family == PF_INET6) ||
 			   (so->so_proto->pr_domain->dom_family == PF_ROUTE)))
 				so->so_fibnum = optval;
 			else
 				so->so_fibnum = 0;
 			break;
 
 		case SO_USER_COOKIE:
 			error = sooptcopyin(sopt, &val32, sizeof val32,
 			    sizeof val32);
 			if (error)
 				goto bad;
 			so->so_user_cookie = val32;
 			break;
 
 		case SO_SNDBUF:
 		case SO_RCVBUF:
 		case SO_SNDLOWAT:
 		case SO_RCVLOWAT:
 			error = sooptcopyin(sopt, &optval, sizeof optval,
 			    sizeof optval);
 			if (error)
 				goto bad;
 
 			/*
 			 * Values < 1 make no sense for any of these options,
 			 * so disallow them.
 			 */
 			if (optval < 1) {
 				error = EINVAL;
 				goto bad;
 			}
 
 			error = sbsetopt(so, sopt->sopt_name, optval);
 			break;
 
 		case SO_SNDTIMEO:
 		case SO_RCVTIMEO:
 #ifdef COMPAT_FREEBSD32
 			if (SV_CURPROC_FLAG(SV_ILP32)) {
 				struct timeval32 tv32;
 
 				error = sooptcopyin(sopt, &tv32, sizeof tv32,
 				    sizeof tv32);
 				CP(tv32, tv, tv_sec);
 				CP(tv32, tv, tv_usec);
 			} else
 #endif
 				error = sooptcopyin(sopt, &tv, sizeof tv,
 				    sizeof tv);
 			if (error)
 				goto bad;
 			if (tv.tv_sec < 0 || tv.tv_usec < 0 ||
 			    tv.tv_usec >= 1000000) {
 				error = EDOM;
 				goto bad;
 			}
 			if (tv.tv_sec > INT32_MAX)
 				val = SBT_MAX;
 			else
 				val = tvtosbt(tv);
 			SOCK_LOCK(so);
 			valp = sopt->sopt_name == SO_SNDTIMEO ?
 			    (SOLISTENING(so) ? &so->sol_sbsnd_timeo :
 			    &so->so_snd.sb_timeo) :
 			    (SOLISTENING(so) ? &so->sol_sbrcv_timeo :
 			    &so->so_rcv.sb_timeo);
 			*valp = val;
 			SOCK_UNLOCK(so);
 			break;
 
 		case SO_LABEL:
 #ifdef MAC
 			error = sooptcopyin(sopt, &extmac, sizeof extmac,
 			    sizeof extmac);
 			if (error)
 				goto bad;
 			error = mac_setsockopt_label(sopt->sopt_td->td_ucred,
 			    so, &extmac);
 #else
 			error = EOPNOTSUPP;
 #endif
 			break;
 
 		case SO_TS_CLOCK:
 			error = sooptcopyin(sopt, &optval, sizeof optval,
 			    sizeof optval);
 			if (error)
 				goto bad;
 			if (optval < 0 || optval > SO_TS_CLOCK_MAX) {
 				error = EINVAL;
 				goto bad;
 			}
 			so->so_ts_clock = optval;
 			break;
 
 		case SO_MAX_PACING_RATE:
 			error = sooptcopyin(sopt, &val32, sizeof(val32),
 			    sizeof(val32));
 			if (error)
 				goto bad;
 			so->so_max_pacing_rate = val32;
 			break;
 
 		default:
 			if (V_socket_hhh[HHOOK_SOCKET_OPT]->hhh_nhooks > 0)
 				error = hhook_run_socket(so, sopt,
 				    HHOOK_SOCKET_OPT);
 			else
 				error = ENOPROTOOPT;
 			break;
 		}
 		if (error == 0 && so->so_proto->pr_ctloutput != NULL)
 			(void)(*so->so_proto->pr_ctloutput)(so, sopt);
 	}
 bad:
 	CURVNET_RESTORE();
 	return (error);
 }
 
 /*
  * Helper routine for getsockopt.
  */
 int
 sooptcopyout(struct sockopt *sopt, const void *buf, size_t len)
 {
 	int	error;
 	size_t	valsize;
 
 	error = 0;
 
 	/*
 	 * Documented get behavior is that we always return a value, possibly
 	 * truncated to fit in the user's buffer.  Traditional behavior is
 	 * that we always tell the user precisely how much we copied, rather
 	 * than something useful like the total amount we had available for
 	 * her.  Note that this interface is not idempotent; the entire
 	 * answer must be generated ahead of time.
 	 */
 	valsize = min(len, sopt->sopt_valsize);
 	sopt->sopt_valsize = valsize;
 	if (sopt->sopt_val != NULL) {
 		if (sopt->sopt_td != NULL)
 			error = copyout(buf, sopt->sopt_val, valsize);
 		else
 			bcopy(buf, sopt->sopt_val, valsize);
 	}
 	return (error);
 }
 
 int
 sogetopt(struct socket *so, struct sockopt *sopt)
 {
 	int	error, optval;
 	struct	linger l;
 	struct	timeval tv;
 #ifdef MAC
 	struct mac extmac;
 #endif
 
 	CURVNET_SET(so->so_vnet);
 	error = 0;
 	if (sopt->sopt_level != SOL_SOCKET) {
 		if (so->so_proto->pr_ctloutput != NULL)
 			error = (*so->so_proto->pr_ctloutput)(so, sopt);
 		else
 			error = ENOPROTOOPT;
 		CURVNET_RESTORE();
 		return (error);
 	} else {
 		switch (sopt->sopt_name) {
 		case SO_ACCEPTFILTER:
 			error = accept_filt_getopt(so, sopt);
 			break;
 
 		case SO_LINGER:
 			SOCK_LOCK(so);
 			l.l_onoff = so->so_options & SO_LINGER;
 			l.l_linger = so->so_linger;
 			SOCK_UNLOCK(so);
 			error = sooptcopyout(sopt, &l, sizeof l);
 			break;
 
 		case SO_USELOOPBACK:
 		case SO_DONTROUTE:
 		case SO_DEBUG:
 		case SO_KEEPALIVE:
 		case SO_REUSEADDR:
 		case SO_REUSEPORT:
 		case SO_REUSEPORT_LB:
 		case SO_BROADCAST:
 		case SO_OOBINLINE:
 		case SO_ACCEPTCONN:
 		case SO_TIMESTAMP:
 		case SO_BINTIME:
 		case SO_NOSIGPIPE:
 		case SO_NO_DDP:
 		case SO_NO_OFFLOAD:
 		case SO_RERROR:
 			optval = so->so_options & sopt->sopt_name;
 integer:
 			error = sooptcopyout(sopt, &optval, sizeof optval);
 			break;
 
 		case SO_DOMAIN:
 			optval = so->so_proto->pr_domain->dom_family;
 			goto integer;
 
 		case SO_TYPE:
 			optval = so->so_type;
 			goto integer;
 
 		case SO_PROTOCOL:
 			optval = so->so_proto->pr_protocol;
 			goto integer;
 
 		case SO_ERROR:
 			SOCK_LOCK(so);
 			if (so->so_error) {
 				optval = so->so_error;
 				so->so_error = 0;
 			} else {
 				optval = so->so_rerror;
 				so->so_rerror = 0;
 			}
 			SOCK_UNLOCK(so);
 			goto integer;
 
 		case SO_SNDBUF:
 			optval = SOLISTENING(so) ? so->sol_sbsnd_hiwat :
 			    so->so_snd.sb_hiwat;
 			goto integer;
 
 		case SO_RCVBUF:
 			optval = SOLISTENING(so) ? so->sol_sbrcv_hiwat :
 			    so->so_rcv.sb_hiwat;
 			goto integer;
 
 		case SO_SNDLOWAT:
 			optval = SOLISTENING(so) ? so->sol_sbsnd_lowat :
 			    so->so_snd.sb_lowat;
 			goto integer;
 
 		case SO_RCVLOWAT:
 			optval = SOLISTENING(so) ? so->sol_sbrcv_lowat :
 			    so->so_rcv.sb_lowat;
 			goto integer;
 
 		case SO_SNDTIMEO:
 		case SO_RCVTIMEO:
 			SOCK_LOCK(so);
 			tv = sbttotv(sopt->sopt_name == SO_SNDTIMEO ?
 			    (SOLISTENING(so) ? so->sol_sbsnd_timeo :
 			    so->so_snd.sb_timeo) :
 			    (SOLISTENING(so) ? so->sol_sbrcv_timeo :
 			    so->so_rcv.sb_timeo));
 			SOCK_UNLOCK(so);
 #ifdef COMPAT_FREEBSD32
 			if (SV_CURPROC_FLAG(SV_ILP32)) {
 				struct timeval32 tv32;
 
 				CP(tv, tv32, tv_sec);
 				CP(tv, tv32, tv_usec);
 				error = sooptcopyout(sopt, &tv32, sizeof tv32);
 			} else
 #endif
 				error = sooptcopyout(sopt, &tv, sizeof tv);
 			break;
 
 		case SO_LABEL:
 #ifdef MAC
 			error = sooptcopyin(sopt, &extmac, sizeof(extmac),
 			    sizeof(extmac));
 			if (error)
 				goto bad;
 			error = mac_getsockopt_label(sopt->sopt_td->td_ucred,
 			    so, &extmac);
 			if (error)
 				goto bad;
 			error = sooptcopyout(sopt, &extmac, sizeof extmac);
 #else
 			error = EOPNOTSUPP;
 #endif
 			break;
 
 		case SO_PEERLABEL:
 #ifdef MAC
 			error = sooptcopyin(sopt, &extmac, sizeof(extmac),
 			    sizeof(extmac));
 			if (error)
 				goto bad;
 			error = mac_getsockopt_peerlabel(
 			    sopt->sopt_td->td_ucred, so, &extmac);
 			if (error)
 				goto bad;
 			error = sooptcopyout(sopt, &extmac, sizeof extmac);
 #else
 			error = EOPNOTSUPP;
 #endif
 			break;
 
 		case SO_LISTENQLIMIT:
 			optval = SOLISTENING(so) ? so->sol_qlimit : 0;
 			goto integer;
 
 		case SO_LISTENQLEN:
 			optval = SOLISTENING(so) ? so->sol_qlen : 0;
 			goto integer;
 
 		case SO_LISTENINCQLEN:
 			optval = SOLISTENING(so) ? so->sol_incqlen : 0;
 			goto integer;
 
 		case SO_TS_CLOCK:
 			optval = so->so_ts_clock;
 			goto integer;
 
 		case SO_MAX_PACING_RATE:
 			optval = so->so_max_pacing_rate;
 			goto integer;
 
 		default:
 			if (V_socket_hhh[HHOOK_SOCKET_OPT]->hhh_nhooks > 0)
 				error = hhook_run_socket(so, sopt,
 				    HHOOK_SOCKET_OPT);
 			else
 				error = ENOPROTOOPT;
 			break;
 		}
 	}
 #ifdef MAC
 bad:
 #endif
 	CURVNET_RESTORE();
 	return (error);
 }
 
 int
 soopt_getm(struct sockopt *sopt, struct mbuf **mp)
 {
 	struct mbuf *m, *m_prev;
 	int sopt_size = sopt->sopt_valsize;
 
 	MGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT, MT_DATA);
 	if (m == NULL)
 		return ENOBUFS;
 	if (sopt_size > MLEN) {
 		MCLGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT);
 		if ((m->m_flags & M_EXT) == 0) {
 			m_free(m);
 			return ENOBUFS;
 		}
 		m->m_len = min(MCLBYTES, sopt_size);
 	} else {
 		m->m_len = min(MLEN, sopt_size);
 	}
 	sopt_size -= m->m_len;
 	*mp = m;
 	m_prev = m;
 
 	while (sopt_size) {
 		MGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT, MT_DATA);
 		if (m == NULL) {
 			m_freem(*mp);
 			return ENOBUFS;
 		}
 		if (sopt_size > MLEN) {
 			MCLGET(m, sopt->sopt_td != NULL ? M_WAITOK :
 			    M_NOWAIT);
 			if ((m->m_flags & M_EXT) == 0) {
 				m_freem(m);
 				m_freem(*mp);
 				return ENOBUFS;
 			}
 			m->m_len = min(MCLBYTES, sopt_size);
 		} else {
 			m->m_len = min(MLEN, sopt_size);
 		}
 		sopt_size -= m->m_len;
 		m_prev->m_next = m;
 		m_prev = m;
 	}
 	return (0);
 }
 
 int
 soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
 {
 	struct mbuf *m0 = m;
 
 	if (sopt->sopt_val == NULL)
 		return (0);
 	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
 		if (sopt->sopt_td != NULL) {
 			int error;
 
 			error = copyin(sopt->sopt_val, mtod(m, char *),
 			    m->m_len);
 			if (error != 0) {
 				m_freem(m0);
 				return(error);
 			}
 		} else
 			bcopy(sopt->sopt_val, mtod(m, char *), m->m_len);
 		sopt->sopt_valsize -= m->m_len;
 		sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
 		m = m->m_next;
 	}
 	if (m != NULL) /* should be allocated enoughly at ip6_sooptmcopyin() */
 		panic("ip6_sooptmcopyin");
 	return (0);
 }
 
 int
 soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
 {
 	struct mbuf *m0 = m;
 	size_t valsize = 0;
 
 	if (sopt->sopt_val == NULL)
 		return (0);
 	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
 		if (sopt->sopt_td != NULL) {
 			int error;
 
 			error = copyout(mtod(m, char *), sopt->sopt_val,
 			    m->m_len);
 			if (error != 0) {
 				m_freem(m0);
 				return(error);
 			}
 		} else
 			bcopy(mtod(m, char *), sopt->sopt_val, m->m_len);
 		sopt->sopt_valsize -= m->m_len;
 		sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
 		valsize += m->m_len;
 		m = m->m_next;
 	}
 	if (m != NULL) {
 		/* enough soopt buffer should be given from user-land */
 		m_freem(m0);
 		return(EINVAL);
 	}
 	sopt->sopt_valsize = valsize;
 	return (0);
 }
 
 /*
  * sohasoutofband(): protocol notifies socket layer of the arrival of new
  * out-of-band data, which will then notify socket consumers.
  */
 void
 sohasoutofband(struct socket *so)
 {
 
 	if (so->so_sigio != NULL)
 		pgsigio(&so->so_sigio, SIGURG, 0);
 	selwakeuppri(&so->so_rdsel, PSOCK);
 }
 
 int
 sopoll(struct socket *so, int events, struct ucred *active_cred,
     struct thread *td)
 {
 
 	/*
 	 * We do not need to set or assert curvnet as long as everyone uses
 	 * sopoll_generic().
 	 */
 	return (so->so_proto->pr_sopoll(so, events, active_cred, td));
 }
 
 int
 sopoll_generic(struct socket *so, int events, struct ucred *active_cred,
     struct thread *td)
 {
 	int revents;
 
 	SOCK_LOCK(so);
 	if (SOLISTENING(so)) {
 		if (!(events & (POLLIN | POLLRDNORM)))
 			revents = 0;
 		else if (!TAILQ_EMPTY(&so->sol_comp))
 			revents = events & (POLLIN | POLLRDNORM);
 		else if ((events & POLLINIGNEOF) == 0 && so->so_error)
 			revents = (events & (POLLIN | POLLRDNORM)) | POLLHUP;
 		else {
 			selrecord(td, &so->so_rdsel);
 			revents = 0;
 		}
 	} else {
 		revents = 0;
 		SOCK_SENDBUF_LOCK(so);
 		SOCK_RECVBUF_LOCK(so);
 		if (events & (POLLIN | POLLRDNORM))
 			if (soreadabledata(so))
 				revents |= events & (POLLIN | POLLRDNORM);
 		if (events & (POLLOUT | POLLWRNORM))
 			if (sowriteable(so))
 				revents |= events & (POLLOUT | POLLWRNORM);
 		if (events & (POLLPRI | POLLRDBAND))
 			if (so->so_oobmark ||
 			    (so->so_rcv.sb_state & SBS_RCVATMARK))
 				revents |= events & (POLLPRI | POLLRDBAND);
 		if ((events & POLLINIGNEOF) == 0) {
 			if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
 				revents |= events & (POLLIN | POLLRDNORM);
 				if (so->so_snd.sb_state & SBS_CANTSENDMORE)
 					revents |= POLLHUP;
 			}
 		}
 		if (so->so_rcv.sb_state & SBS_CANTRCVMORE)
 			revents |= events & POLLRDHUP;
 		if (revents == 0) {
 			if (events &
 			    (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND | POLLRDHUP)) {
 				selrecord(td, &so->so_rdsel);
 				so->so_rcv.sb_flags |= SB_SEL;
 			}
 			if (events & (POLLOUT | POLLWRNORM)) {
 				selrecord(td, &so->so_wrsel);
 				so->so_snd.sb_flags |= SB_SEL;
 			}
 		}
 		SOCK_RECVBUF_UNLOCK(so);
 		SOCK_SENDBUF_UNLOCK(so);
 	}
 	SOCK_UNLOCK(so);
 	return (revents);
 }
 
 int
 soo_kqfilter(struct file *fp, struct knote *kn)
 {
 	struct socket *so = kn->kn_fp->f_data;
 	struct sockbuf *sb;
 	sb_which which;
 	struct knlist *knl;
 
 	switch (kn->kn_filter) {
 	case EVFILT_READ:
 		kn->kn_fop = &soread_filtops;
 		knl = &so->so_rdsel.si_note;
 		sb = &so->so_rcv;
 		which = SO_RCV;
 		break;
 	case EVFILT_WRITE:
 		kn->kn_fop = &sowrite_filtops;
 		knl = &so->so_wrsel.si_note;
 		sb = &so->so_snd;
 		which = SO_SND;
 		break;
 	case EVFILT_EMPTY:
 		kn->kn_fop = &soempty_filtops;
 		knl = &so->so_wrsel.si_note;
 		sb = &so->so_snd;
 		which = SO_SND;
 		break;
 	default:
 		return (EINVAL);
 	}
 
 	SOCK_LOCK(so);
 	if (SOLISTENING(so)) {
 		knlist_add(knl, kn, 1);
 	} else {
 		SOCK_BUF_LOCK(so, which);
 		knlist_add(knl, kn, 1);
 		sb->sb_flags |= SB_KNOTE;
 		SOCK_BUF_UNLOCK(so, which);
 	}
 	SOCK_UNLOCK(so);
 	return (0);
 }
 
 static void
 filt_sordetach(struct knote *kn)
 {
 	struct socket *so = kn->kn_fp->f_data;
 
 	so_rdknl_lock(so);
 	knlist_remove(&so->so_rdsel.si_note, kn, 1);
 	if (!SOLISTENING(so) && knlist_empty(&so->so_rdsel.si_note))
 		so->so_rcv.sb_flags &= ~SB_KNOTE;
 	so_rdknl_unlock(so);
 }
 
 /*ARGSUSED*/
 static int
 filt_soread(struct knote *kn, long hint)
 {
 	struct socket *so;
 
 	so = kn->kn_fp->f_data;
 
 	if (SOLISTENING(so)) {
 		SOCK_LOCK_ASSERT(so);
 		kn->kn_data = so->sol_qlen;
 		if (so->so_error) {
 			kn->kn_flags |= EV_EOF;
 			kn->kn_fflags = so->so_error;
 			return (1);
 		}
 		return (!TAILQ_EMPTY(&so->sol_comp));
 	}
 
 	SOCK_RECVBUF_LOCK_ASSERT(so);
 
 	kn->kn_data = sbavail(&so->so_rcv) - so->so_rcv.sb_ctl;
 	if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
 		kn->kn_flags |= EV_EOF;
 		kn->kn_fflags = so->so_error;
 		return (1);
 	} else if (so->so_error || so->so_rerror)
 		return (1);
 
 	if (kn->kn_sfflags & NOTE_LOWAT) {
 		if (kn->kn_data >= kn->kn_sdata)
 			return (1);
 	} else if (sbavail(&so->so_rcv) >= so->so_rcv.sb_lowat)
 		return (1);
 
 	/* This hook returning non-zero indicates an event, not error */
 	return (hhook_run_socket(so, NULL, HHOOK_FILT_SOREAD));
 }
 
 static void
 filt_sowdetach(struct knote *kn)
 {
 	struct socket *so = kn->kn_fp->f_data;
 
 	so_wrknl_lock(so);
 	knlist_remove(&so->so_wrsel.si_note, kn, 1);
 	if (!SOLISTENING(so) && knlist_empty(&so->so_wrsel.si_note))
 		so->so_snd.sb_flags &= ~SB_KNOTE;
 	so_wrknl_unlock(so);
 }
 
 /*ARGSUSED*/
 static int
 filt_sowrite(struct knote *kn, long hint)
 {
 	struct socket *so;
 
 	so = kn->kn_fp->f_data;
 
 	if (SOLISTENING(so))
 		return (0);
 
 	SOCK_SENDBUF_LOCK_ASSERT(so);
 	kn->kn_data = sbspace(&so->so_snd);
 
 	hhook_run_socket(so, kn, HHOOK_FILT_SOWRITE);
 
 	if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
 		kn->kn_flags |= EV_EOF;
 		kn->kn_fflags = so->so_error;
 		return (1);
 	} else if (so->so_error)	/* temporary udp error */
 		return (1);
 	else if (((so->so_state & SS_ISCONNECTED) == 0) &&
 	    (so->so_proto->pr_flags & PR_CONNREQUIRED))
 		return (0);
 	else if (kn->kn_sfflags & NOTE_LOWAT)
 		return (kn->kn_data >= kn->kn_sdata);
 	else
 		return (kn->kn_data >= so->so_snd.sb_lowat);
 }
 
 static int
 filt_soempty(struct knote *kn, long hint)
 {
 	struct socket *so;
 
 	so = kn->kn_fp->f_data;
 
 	if (SOLISTENING(so))
 		return (1);
 
 	SOCK_SENDBUF_LOCK_ASSERT(so);
 	kn->kn_data = sbused(&so->so_snd);
 
 	if (kn->kn_data == 0)
 		return (1);
 	else
 		return (0);
 }
 
 int
 socheckuid(struct socket *so, uid_t uid)
 {
 
 	if (so == NULL)
 		return (EPERM);
 	if (so->so_cred->cr_uid != uid)
 		return (EPERM);
 	return (0);
 }
 
 /*
  * These functions are used by protocols to notify the socket layer (and its
  * consumers) of state changes in the sockets driven by protocol-side events.
  */
 
 /*
  * Procedures to manipulate state flags of socket and do appropriate wakeups.
  *
  * Normal sequence from the active (originating) side is that
  * soisconnecting() is called during processing of connect() call, resulting
  * in an eventual call to soisconnected() if/when the connection is
  * established.  When the connection is torn down soisdisconnecting() is
  * called during processing of disconnect() call, and soisdisconnected() is
  * called when the connection to the peer is totally severed.  The semantics
  * of these routines are such that connectionless protocols can call
  * soisconnected() and soisdisconnected() only, bypassing the in-progress
  * calls when setting up a ``connection'' takes no time.
  *
  * From the passive side, a socket is created with two queues of sockets:
  * so_incomp for connections in progress and so_comp for connections already
  * made and awaiting user acceptance.  As a protocol is preparing incoming
  * connections, it creates a socket structure queued on so_incomp by calling
  * sonewconn().  When the connection is established, soisconnected() is
  * called, and transfers the socket structure to so_comp, making it available
  * to accept().
  *
  * If a socket is closed with sockets on either so_incomp or so_comp, these
  * sockets are dropped.
  *
  * If higher-level protocols are implemented in the kernel, the wakeups done
  * here will sometimes cause software-interrupt process scheduling.
  */
 void
 soisconnecting(struct socket *so)
 {
 
 	SOCK_LOCK(so);
 	so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING);
 	so->so_state |= SS_ISCONNECTING;
 	SOCK_UNLOCK(so);
 }
 
 void
 soisconnected(struct socket *so)
 {
 	bool last __diagused;
 
 	SOCK_LOCK(so);
 	so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING);
 	so->so_state |= SS_ISCONNECTED;
 
 	if (so->so_qstate == SQ_INCOMP) {
 		struct socket *head = so->so_listen;
 		int ret;
 
 		KASSERT(head, ("%s: so %p on incomp of NULL", __func__, so));
 		/*
 		 * Promoting a socket from incomplete queue to complete, we
 		 * need to go through reverse order of locking.  We first do
 		 * trylock, and if that doesn't succeed, we go the hard way
 		 * leaving a reference and rechecking consistency after proper
 		 * locking.
 		 */
 		if (__predict_false(SOLISTEN_TRYLOCK(head) == 0)) {
 			soref(head);
 			SOCK_UNLOCK(so);
 			SOLISTEN_LOCK(head);
 			SOCK_LOCK(so);
 			if (__predict_false(head != so->so_listen)) {
 				/*
 				 * The socket went off the listen queue,
 				 * should be lost race to close(2) of sol.
 				 * The socket is about to soabort().
 				 */
 				SOCK_UNLOCK(so);
 				sorele_locked(head);
 				return;
 			}
 			last = refcount_release(&head->so_count);
 			KASSERT(!last, ("%s: released last reference for %p",
 			    __func__, head));
 		}
 again:
 		if ((so->so_options & SO_ACCEPTFILTER) == 0) {
 			TAILQ_REMOVE(&head->sol_incomp, so, so_list);
 			head->sol_incqlen--;
 			TAILQ_INSERT_TAIL(&head->sol_comp, so, so_list);
 			head->sol_qlen++;
 			so->so_qstate = SQ_COMP;
 			SOCK_UNLOCK(so);
 			solisten_wakeup(head);	/* unlocks */
 		} else {
 			SOCK_RECVBUF_LOCK(so);
 			soupcall_set(so, SO_RCV,
 			    head->sol_accept_filter->accf_callback,
 			    head->sol_accept_filter_arg);
 			so->so_options &= ~SO_ACCEPTFILTER;
 			ret = head->sol_accept_filter->accf_callback(so,
 			    head->sol_accept_filter_arg, M_NOWAIT);
 			if (ret == SU_ISCONNECTED) {
 				soupcall_clear(so, SO_RCV);
 				SOCK_RECVBUF_UNLOCK(so);
 				goto again;
 			}
 			SOCK_RECVBUF_UNLOCK(so);
 			SOCK_UNLOCK(so);
 			SOLISTEN_UNLOCK(head);
 		}
 		return;
 	}
 	SOCK_UNLOCK(so);
 	wakeup(&so->so_timeo);
 	sorwakeup(so);
 	sowwakeup(so);
 }
 
 void
 soisdisconnecting(struct socket *so)
 {
 
 	SOCK_LOCK(so);
 	so->so_state &= ~SS_ISCONNECTING;
 	so->so_state |= SS_ISDISCONNECTING;
 
 	if (!SOLISTENING(so)) {
 		SOCK_RECVBUF_LOCK(so);
 		socantrcvmore_locked(so);
 		SOCK_SENDBUF_LOCK(so);
 		socantsendmore_locked(so);
 	}
 	SOCK_UNLOCK(so);
 	wakeup(&so->so_timeo);
 }
 
 void
 soisdisconnected(struct socket *so)
 {
 
 	SOCK_LOCK(so);
 
 	/*
 	 * There is at least one reader of so_state that does not
 	 * acquire socket lock, namely soreceive_generic().  Ensure
 	 * that it never sees all flags that track connection status
 	 * cleared, by ordering the update with a barrier semantic of
 	 * our release thread fence.
 	 */
 	so->so_state |= SS_ISDISCONNECTED;
 	atomic_thread_fence_rel();
 	so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING);
 
 	if (!SOLISTENING(so)) {
 		SOCK_UNLOCK(so);
 		SOCK_RECVBUF_LOCK(so);
 		socantrcvmore_locked(so);
 		SOCK_SENDBUF_LOCK(so);
 		sbdrop_locked(&so->so_snd, sbused(&so->so_snd));
 		socantsendmore_locked(so);
 	} else
 		SOCK_UNLOCK(so);
 	wakeup(&so->so_timeo);
 }
 
 int
 soiolock(struct socket *so, struct sx *sx, int flags)
 {
 	int error;
 
 	KASSERT((flags & SBL_VALID) == flags,
 	    ("soiolock: invalid flags %#x", flags));
 
 	if ((flags & SBL_WAIT) != 0) {
 		if ((flags & SBL_NOINTR) != 0) {
 			sx_xlock(sx);
 		} else {
 			error = sx_xlock_sig(sx);
 			if (error != 0)
 				return (error);
 		}
 	} else if (!sx_try_xlock(sx)) {
 		return (EWOULDBLOCK);
 	}
 
 	if (__predict_false(SOLISTENING(so))) {
 		sx_xunlock(sx);
 		return (ENOTCONN);
 	}
 	return (0);
 }
 
 void
 soiounlock(struct sx *sx)
 {
 	sx_xunlock(sx);
 }
 
 /*
  * Make a copy of a sockaddr in a malloced buffer of type M_SONAME.
  */
 struct sockaddr *
 sodupsockaddr(const struct sockaddr *sa, int mflags)
 {
 	struct sockaddr *sa2;
 
 	sa2 = malloc(sa->sa_len, M_SONAME, mflags);
 	if (sa2)
 		bcopy(sa, sa2, sa->sa_len);
 	return sa2;
 }
 
 /*
  * Register per-socket destructor.
  */
 void
 sodtor_set(struct socket *so, so_dtor_t *func)
 {
 
 	SOCK_LOCK_ASSERT(so);
 	so->so_dtor = func;
 }
 
 /*
  * Register per-socket buffer upcalls.
  */
 void
 soupcall_set(struct socket *so, sb_which which, so_upcall_t func, void *arg)
 {
 	struct sockbuf *sb;
 
 	KASSERT(!SOLISTENING(so), ("%s: so %p listening", __func__, so));
 
 	switch (which) {
 	case SO_RCV:
 		sb = &so->so_rcv;
 		break;
 	case SO_SND:
 		sb = &so->so_snd;
 		break;
 	}
 	SOCK_BUF_LOCK_ASSERT(so, which);
 	sb->sb_upcall = func;
 	sb->sb_upcallarg = arg;
 	sb->sb_flags |= SB_UPCALL;
 }
 
 void
 soupcall_clear(struct socket *so, sb_which which)
 {
 	struct sockbuf *sb;
 
 	KASSERT(!SOLISTENING(so), ("%s: so %p listening", __func__, so));
 
 	switch (which) {
 	case SO_RCV:
 		sb = &so->so_rcv;
 		break;
 	case SO_SND:
 		sb = &so->so_snd;
 		break;
 	}
 	SOCK_BUF_LOCK_ASSERT(so, which);
 	KASSERT(sb->sb_upcall != NULL,
 	    ("%s: so %p no upcall to clear", __func__, so));
 	sb->sb_upcall = NULL;
 	sb->sb_upcallarg = NULL;
 	sb->sb_flags &= ~SB_UPCALL;
 }
 
 void
 solisten_upcall_set(struct socket *so, so_upcall_t func, void *arg)
 {
 
 	SOLISTEN_LOCK_ASSERT(so);
 	so->sol_upcall = func;
 	so->sol_upcallarg = arg;
 }
 
 static void
 so_rdknl_lock(void *arg)
 {
 	struct socket *so = arg;
 
 retry:
 	if (SOLISTENING(so)) {
 		SOLISTEN_LOCK(so);
 	} else {
 		SOCK_RECVBUF_LOCK(so);
 		if (__predict_false(SOLISTENING(so))) {
 			SOCK_RECVBUF_UNLOCK(so);
 			goto retry;
 		}
 	}
 }
 
 static void
 so_rdknl_unlock(void *arg)
 {
 	struct socket *so = arg;
 
 	if (SOLISTENING(so))
 		SOLISTEN_UNLOCK(so);
 	else
 		SOCK_RECVBUF_UNLOCK(so);
 }
 
 static void
 so_rdknl_assert_lock(void *arg, int what)
 {
 	struct socket *so = arg;
 
 	if (what == LA_LOCKED) {
 		if (SOLISTENING(so))
 			SOLISTEN_LOCK_ASSERT(so);
 		else
 			SOCK_RECVBUF_LOCK_ASSERT(so);
 	} else {
 		if (SOLISTENING(so))
 			SOLISTEN_UNLOCK_ASSERT(so);
 		else
 			SOCK_RECVBUF_UNLOCK_ASSERT(so);
 	}
 }
 
 static void
 so_wrknl_lock(void *arg)
 {
 	struct socket *so = arg;
 
 retry:
 	if (SOLISTENING(so)) {
 		SOLISTEN_LOCK(so);
 	} else {
 		SOCK_SENDBUF_LOCK(so);
 		if (__predict_false(SOLISTENING(so))) {
 			SOCK_SENDBUF_UNLOCK(so);
 			goto retry;
 		}
 	}
 }
 
 static void
 so_wrknl_unlock(void *arg)
 {
 	struct socket *so = arg;
 
 	if (SOLISTENING(so))
 		SOLISTEN_UNLOCK(so);
 	else
 		SOCK_SENDBUF_UNLOCK(so);
 }
 
 static void
 so_wrknl_assert_lock(void *arg, int what)
 {
 	struct socket *so = arg;
 
 	if (what == LA_LOCKED) {
 		if (SOLISTENING(so))
 			SOLISTEN_LOCK_ASSERT(so);
 		else
 			SOCK_SENDBUF_LOCK_ASSERT(so);
 	} else {
 		if (SOLISTENING(so))
 			SOLISTEN_UNLOCK_ASSERT(so);
 		else
 			SOCK_SENDBUF_UNLOCK_ASSERT(so);
 	}
 }
 
 /*
  * Create an external-format (``xsocket'') structure using the information in
  * the kernel-format socket structure pointed to by so.  This is done to
  * reduce the spew of irrelevant information over this interface, to isolate
  * user code from changes in the kernel structure, and potentially to provide
  * information-hiding if we decide that some of this information should be
  * hidden from users.
  */
 void
 sotoxsocket(struct socket *so, struct xsocket *xso)
 {
 
 	bzero(xso, sizeof(*xso));
 	xso->xso_len = sizeof *xso;
 	xso->xso_so = (uintptr_t)so;
 	xso->so_type = so->so_type;
 	xso->so_options = so->so_options;
 	xso->so_linger = so->so_linger;
 	xso->so_state = so->so_state;
 	xso->so_pcb = (uintptr_t)so->so_pcb;
 	xso->xso_protocol = so->so_proto->pr_protocol;
 	xso->xso_family = so->so_proto->pr_domain->dom_family;
 	xso->so_timeo = so->so_timeo;
 	xso->so_error = so->so_error;
 	xso->so_uid = so->so_cred->cr_uid;
 	xso->so_pgid = so->so_sigio ? so->so_sigio->sio_pgid : 0;
 	if (SOLISTENING(so)) {
 		xso->so_qlen = so->sol_qlen;
 		xso->so_incqlen = so->sol_incqlen;
 		xso->so_qlimit = so->sol_qlimit;
 		xso->so_oobmark = 0;
 	} else {
 		xso->so_state |= so->so_qstate;
 		xso->so_qlen = xso->so_incqlen = xso->so_qlimit = 0;
 		xso->so_oobmark = so->so_oobmark;
 		sbtoxsockbuf(&so->so_snd, &xso->so_snd);
 		sbtoxsockbuf(&so->so_rcv, &xso->so_rcv);
 	}
 }
 
 struct sockbuf *
 so_sockbuf_rcv(struct socket *so)
 {
 
 	return (&so->so_rcv);
 }
 
 struct sockbuf *
 so_sockbuf_snd(struct socket *so)
 {
 
 	return (&so->so_snd);
 }
 
 int
 so_state_get(const struct socket *so)
 {
 
 	return (so->so_state);
 }
 
 void
 so_state_set(struct socket *so, int val)
 {
 
 	so->so_state = val;
 }
 
 int
 so_options_get(const struct socket *so)
 {
 
 	return (so->so_options);
 }
 
 void
 so_options_set(struct socket *so, int val)
 {
 
 	so->so_options = val;
 }
 
 int
 so_error_get(const struct socket *so)
 {
 
 	return (so->so_error);
 }
 
 void
 so_error_set(struct socket *so, int val)
 {
 
 	so->so_error = val;
 }
 
 int
 so_linger_get(const struct socket *so)
 {
 
 	return (so->so_linger);
 }
 
 void
 so_linger_set(struct socket *so, int val)
 {
 
 	KASSERT(val >= 0 && val <= USHRT_MAX && val <= (INT_MAX / hz),
 	    ("%s: val %d out of range", __func__, val));
 
 	so->so_linger = val;
 }
 
 struct protosw *
 so_protosw_get(const struct socket *so)
 {
 
 	return (so->so_proto);
 }
 
 void
 so_protosw_set(struct socket *so, struct protosw *val)
 {
 
 	so->so_proto = val;
 }
 
 void
 so_sorwakeup(struct socket *so)
 {
 
 	sorwakeup(so);
 }
 
 void
 so_sowwakeup(struct socket *so)
 {
 
 	sowwakeup(so);
 }
 
 void
 so_sorwakeup_locked(struct socket *so)
 {
 
 	sorwakeup_locked(so);
 }
 
 void
 so_sowwakeup_locked(struct socket *so)
 {
 
 	sowwakeup_locked(so);
 }
 
 void
 so_lock(struct socket *so)
 {
 
 	SOCK_LOCK(so);
 }
 
 void
 so_unlock(struct socket *so)
 {
 
 	SOCK_UNLOCK(so);
 }
diff --git a/sys/netgraph/ng_ksocket.c b/sys/netgraph/ng_ksocket.c
index d4f41fe02205..ff5e7b4812bf 100644
--- a/sys/netgraph/ng_ksocket.c
+++ b/sys/netgraph/ng_ksocket.c
@@ -1,1270 +1,1270 @@
 /*
  * ng_ksocket.c
  */
 
 /*-
  * Copyright (c) 1996-1999 Whistle Communications, Inc.
  * All rights reserved.
  * 
  * Subject to the following obligations and disclaimer of warranty, use and
  * redistribution of this software, in source or object code forms, with or
  * without modifications are expressly permitted by Whistle Communications;
  * provided, however, that:
  * 1. Any and all reproductions of the source or object code must include the
  *    copyright notice above and the following disclaimer of warranties; and
  * 2. No rights are granted, in any manner or form, to use Whistle
  *    Communications, Inc. trademarks, including the mark "WHISTLE
  *    COMMUNICATIONS" on advertising, endorsements, or otherwise except as
  *    such appears in the above copyright notice or in the software.
  * 
  * THIS SOFTWARE IS BEING PROVIDED BY WHISTLE COMMUNICATIONS "AS IS", AND
  * TO THE MAXIMUM EXTENT PERMITTED BY LAW, WHISTLE COMMUNICATIONS MAKES NO
  * REPRESENTATIONS OR WARRANTIES, EXPRESS OR IMPLIED, REGARDING THIS SOFTWARE,
  * INCLUDING WITHOUT LIMITATION, ANY AND ALL IMPLIED WARRANTIES OF
  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT.
  * WHISTLE COMMUNICATIONS DOES NOT WARRANT, GUARANTEE, OR MAKE ANY
  * REPRESENTATIONS REGARDING THE USE OF, OR THE RESULTS OF THE USE OF THIS
  * SOFTWARE IN TERMS OF ITS CORRECTNESS, ACCURACY, RELIABILITY OR OTHERWISE.
  * IN NO EVENT SHALL WHISTLE COMMUNICATIONS BE LIABLE FOR ANY DAMAGES
  * RESULTING FROM OR ARISING OUT OF ANY USE OF THIS SOFTWARE, INCLUDING
  * WITHOUT LIMITATION, ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
  * PUNITIVE, OR CONSEQUENTIAL DAMAGES, PROCUREMENT OF SUBSTITUTE GOODS OR
  * SERVICES, LOSS OF USE, DATA OR PROFITS, HOWEVER CAUSED AND UNDER ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF WHISTLE COMMUNICATIONS IS ADVISED OF THE POSSIBILITY
  * OF SUCH DAMAGE.
  *
  * Author: Archie Cobbs <archie@freebsd.org>
  *
  * $FreeBSD$
  * $Whistle: ng_ksocket.c,v 1.1 1999/11/16 20:04:40 archie Exp $
  */
 
 /*
  * Kernel socket node type.  This node type is basically a kernel-mode
  * version of a socket... kindof like the reverse of the socket node type.
  */
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/mbuf.h>
 #include <sys/proc.h>
 #include <sys/malloc.h>
 #include <sys/ctype.h>
 #include <sys/protosw.h>
 #include <sys/errno.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/uio.h>
 #include <sys/un.h>
 
 #include <netgraph/ng_message.h>
 #include <netgraph/netgraph.h>
 #include <netgraph/ng_parse.h>
 #include <netgraph/ng_ksocket.h>
 
 #include <netinet/in.h>
 #include <netinet/ip.h>
 
 #ifdef NG_SEPARATE_MALLOC
 static MALLOC_DEFINE(M_NETGRAPH_KSOCKET, "netgraph_ksock",
     "netgraph ksock node");
 #else
 #define M_NETGRAPH_KSOCKET M_NETGRAPH
 #endif
 
 #define OFFSETOF(s, e) ((char *)&((s *)0)->e - (char *)((s *)0))
 #define SADATA_OFFSET	(OFFSETOF(struct sockaddr, sa_data))
 
 /* Node private data */
 struct ng_ksocket_private {
 	node_p		node;
 	hook_p		hook;
 	struct socket	*so;
 	int		fn_sent;	/* FN call on incoming event was sent */
 	LIST_HEAD(, ng_ksocket_private)	embryos;
 	LIST_ENTRY(ng_ksocket_private)	siblings;
 	u_int32_t	flags;
 	u_int32_t	response_token;
 	ng_ID_t		response_addr;
 };
 typedef struct ng_ksocket_private *priv_p;
 
 /* Flags for priv_p */
 #define	KSF_CONNECTING	0x00000001	/* Waiting for connection complete */
 #define	KSF_ACCEPTING	0x00000002	/* Waiting for accept complete */
 #define	KSF_EOFSEEN	0x00000004	/* Have sent 0-length EOF mbuf */
 #define	KSF_CLONED	0x00000008	/* Cloned from an accepting socket */
 #define	KSF_EMBRYONIC	0x00000010	/* Cloned node with no hooks yet */
 
 /* Netgraph node methods */
 static ng_constructor_t	ng_ksocket_constructor;
 static ng_rcvmsg_t	ng_ksocket_rcvmsg;
 static ng_shutdown_t	ng_ksocket_shutdown;
 static ng_newhook_t	ng_ksocket_newhook;
 static ng_rcvdata_t	ng_ksocket_rcvdata;
 static ng_connect_t	ng_ksocket_connect;
 static ng_disconnect_t	ng_ksocket_disconnect;
 
 /* Alias structure */
 struct ng_ksocket_alias {
 	const char	*name;
 	const int	value;
 	const int	family;
 };
 
 /* Protocol family aliases */
 static const struct ng_ksocket_alias ng_ksocket_families[] = {
 	{ "local",	PF_LOCAL	},
 	{ "inet",	PF_INET		},
 	{ "inet6",	PF_INET6	},
 	{ "atm",	PF_ATM		},
+	{ "divert",	PF_DIVERT	},
 	{ NULL,		-1		},
 };
 
 /* Socket type aliases */
 static const struct ng_ksocket_alias ng_ksocket_types[] = {
 	{ "stream",	SOCK_STREAM	},
 	{ "dgram",	SOCK_DGRAM	},
 	{ "raw",	SOCK_RAW	},
 	{ "rdm",	SOCK_RDM	},
 	{ "seqpacket",	SOCK_SEQPACKET	},
 	{ NULL,		-1		},
 };
 
 /* Protocol aliases */
 static const struct ng_ksocket_alias ng_ksocket_protos[] = {
 	{ "ip",		IPPROTO_IP,		PF_INET		},
 	{ "raw",	IPPROTO_RAW,		PF_INET		},
 	{ "icmp",	IPPROTO_ICMP,		PF_INET		},
 	{ "igmp",	IPPROTO_IGMP,		PF_INET		},
 	{ "tcp",	IPPROTO_TCP,		PF_INET		},
 	{ "udp",	IPPROTO_UDP,		PF_INET		},
 	{ "gre",	IPPROTO_GRE,		PF_INET		},
 	{ "esp",	IPPROTO_ESP,		PF_INET		},
 	{ "ah",		IPPROTO_AH,		PF_INET		},
 	{ "swipe",	IPPROTO_SWIPE,		PF_INET		},
 	{ "encap",	IPPROTO_ENCAP,		PF_INET		},
-	{ "divert",	IPPROTO_DIVERT,		PF_INET		},
 	{ "pim",	IPPROTO_PIM,		PF_INET		},
 	{ NULL,		-1					},
 };
 
 /* Helper functions */
 static int	ng_ksocket_accept(priv_p);
 static int	ng_ksocket_incoming(struct socket *so, void *arg, int waitflag);
 static int	ng_ksocket_parse(const struct ng_ksocket_alias *aliases,
 			const char *s, int family);
 static void	ng_ksocket_incoming2(node_p node, hook_p hook,
 			void *arg1, int arg2);
 
 /************************************************************************
 			STRUCT SOCKADDR PARSE TYPE
  ************************************************************************/
 
 /* Get the length of the data portion of a generic struct sockaddr */
 static int
 ng_parse_generic_sockdata_getLength(const struct ng_parse_type *type,
 	const u_char *start, const u_char *buf)
 {
 	const struct sockaddr *sa;
 
 	sa = (const struct sockaddr *)(buf - SADATA_OFFSET);
 	return (sa->sa_len < SADATA_OFFSET) ? 0 : sa->sa_len - SADATA_OFFSET;
 }
 
 /* Type for the variable length data portion of a generic struct sockaddr */
 static const struct ng_parse_type ng_ksocket_generic_sockdata_type = {
 	&ng_parse_bytearray_type,
 	&ng_parse_generic_sockdata_getLength
 };
 
 /* Type for a generic struct sockaddr */
 static const struct ng_parse_struct_field
     ng_parse_generic_sockaddr_type_fields[] = {
 	  { "len",	&ng_parse_uint8_type			},
 	  { "family",	&ng_parse_uint8_type			},
 	  { "data",	&ng_ksocket_generic_sockdata_type	},
 	  { NULL }
 };
 static const struct ng_parse_type ng_ksocket_generic_sockaddr_type = {
 	&ng_parse_struct_type,
 	&ng_parse_generic_sockaddr_type_fields
 };
 
 /* Convert a struct sockaddr from ASCII to binary.  If its a protocol
    family that we specially handle, do that, otherwise defer to the
    generic parse type ng_ksocket_generic_sockaddr_type. */
 static int
 ng_ksocket_sockaddr_parse(const struct ng_parse_type *type,
 	const char *s, int *off, const u_char *const start,
 	u_char *const buf, int *buflen)
 {
 	struct sockaddr *const sa = (struct sockaddr *)buf;
 	enum ng_parse_token tok;
 	char fambuf[32];
 	int family, len;
 	char *t;
 
 	/* If next token is a left curly brace, use generic parse type */
 	if ((tok = ng_parse_get_token(s, off, &len)) == T_LBRACE) {
 		return (*ng_ksocket_generic_sockaddr_type.supertype->parse)
 		    (&ng_ksocket_generic_sockaddr_type,
 		    s, off, start, buf, buflen);
 	}
 
 	/* Get socket address family followed by a slash */
 	while (isspace(s[*off]))
 		(*off)++;
 	if ((t = strchr(s + *off, '/')) == NULL)
 		return (EINVAL);
 	if ((len = t - (s + *off)) > sizeof(fambuf) - 1)
 		return (EINVAL);
 	strncpy(fambuf, s + *off, len);
 	fambuf[len] = '\0';
 	*off += len + 1;
 	if ((family = ng_ksocket_parse(ng_ksocket_families, fambuf, 0)) == -1)
 		return (EINVAL);
 
 	/* Set family */
 	if (*buflen < SADATA_OFFSET)
 		return (ERANGE);
 	sa->sa_family = family;
 
 	/* Set family-specific data and length */
 	switch (sa->sa_family) {
 	case PF_LOCAL:		/* Get pathname */
 	    {
 		const int pathoff = OFFSETOF(struct sockaddr_un, sun_path);
 		struct sockaddr_un *const sun = (struct sockaddr_un *)sa;
 		int toklen, pathlen;
 		char *path;
 
 		if ((path = ng_get_string_token(s, off, &toklen, NULL)) == NULL)
 			return (EINVAL);
 		pathlen = strlen(path);
 		if (pathlen > SOCK_MAXADDRLEN) {
 			free(path, M_NETGRAPH_KSOCKET);
 			return (E2BIG);
 		}
 		if (*buflen < pathoff + pathlen) {
 			free(path, M_NETGRAPH_KSOCKET);
 			return (ERANGE);
 		}
 		*off += toklen;
 		bcopy(path, sun->sun_path, pathlen);
 		sun->sun_len = pathoff + pathlen;
 		free(path, M_NETGRAPH_KSOCKET);
 		break;
 	    }
 
 	case PF_INET:		/* Get an IP address with optional port */
 	    {
 		struct sockaddr_in *const sin = (struct sockaddr_in *)sa;
 		int i;
 
 		/* Parse this: <ipaddress>[:port] */
 		for (i = 0; i < 4; i++) {
 			u_long val;
 			char *eptr;
 
 			val = strtoul(s + *off, &eptr, 10);
 			if (val > 0xff || eptr == s + *off)
 				return (EINVAL);
 			*off += (eptr - (s + *off));
 			((u_char *)&sin->sin_addr)[i] = (u_char)val;
 			if (i < 3) {
 				if (s[*off] != '.')
 					return (EINVAL);
 				(*off)++;
 			} else if (s[*off] == ':') {
 				(*off)++;
 				val = strtoul(s + *off, &eptr, 10);
 				if (val > 0xffff || eptr == s + *off)
 					return (EINVAL);
 				*off += (eptr - (s + *off));
 				sin->sin_port = htons(val);
 			} else
 				sin->sin_port = 0;
 		}
 		bzero(&sin->sin_zero, sizeof(sin->sin_zero));
 		sin->sin_len = sizeof(*sin);
 		break;
 	    }
 
 #if 0
 	case PF_INET6:	/* XXX implement this someday */
 #endif
 
 	default:
 		return (EINVAL);
 	}
 
 	/* Done */
 	*buflen = sa->sa_len;
 	return (0);
 }
 
 /* Convert a struct sockaddr from binary to ASCII */
 static int
 ng_ksocket_sockaddr_unparse(const struct ng_parse_type *type,
 	const u_char *data, int *off, char *cbuf, int cbuflen)
 {
 	const struct sockaddr *sa = (const struct sockaddr *)(data + *off);
 	int slen = 0;
 
 	/* Output socket address, either in special or generic format */
 	switch (sa->sa_family) {
 	case PF_LOCAL:
 	    {
 		const int pathoff = OFFSETOF(struct sockaddr_un, sun_path);
 		const struct sockaddr_un *sun = (const struct sockaddr_un *)sa;
 		const int pathlen = sun->sun_len - pathoff;
 		char pathbuf[SOCK_MAXADDRLEN + 1];
 		char *pathtoken;
 
 		bcopy(sun->sun_path, pathbuf, pathlen);
 		if ((pathtoken = ng_encode_string(pathbuf, pathlen)) == NULL)
 			return (ENOMEM);
 		slen += snprintf(cbuf, cbuflen, "local/%s", pathtoken);
 		free(pathtoken, M_NETGRAPH_KSOCKET);
 		if (slen >= cbuflen)
 			return (ERANGE);
 		*off += sun->sun_len;
 		return (0);
 	    }
 
 	case PF_INET:
 	    {
 		const struct sockaddr_in *sin = (const struct sockaddr_in *)sa;
 
 		slen += snprintf(cbuf, cbuflen, "inet/%d.%d.%d.%d",
 		  ((const u_char *)&sin->sin_addr)[0],
 		  ((const u_char *)&sin->sin_addr)[1],
 		  ((const u_char *)&sin->sin_addr)[2],
 		  ((const u_char *)&sin->sin_addr)[3]);
 		if (sin->sin_port != 0) {
 			slen += snprintf(cbuf + strlen(cbuf),
 			    cbuflen - strlen(cbuf), ":%d",
 			    (u_int)ntohs(sin->sin_port));
 		}
 		if (slen >= cbuflen)
 			return (ERANGE);
 		*off += sizeof(*sin);
 		return(0);
 	    }
 
 #if 0
 	case PF_INET6:	/* XXX implement this someday */
 #endif
 
 	default:
 		return (*ng_ksocket_generic_sockaddr_type.supertype->unparse)
 		    (&ng_ksocket_generic_sockaddr_type,
 		    data, off, cbuf, cbuflen);
 	}
 }
 
 /* Parse type for struct sockaddr */
 static const struct ng_parse_type ng_ksocket_sockaddr_type = {
 	NULL,
 	NULL,
 	NULL,
 	&ng_ksocket_sockaddr_parse,
 	&ng_ksocket_sockaddr_unparse,
 	NULL		/* no such thing as a default struct sockaddr */
 };
 
 /************************************************************************
 		STRUCT NG_KSOCKET_SOCKOPT PARSE TYPE
  ************************************************************************/
 
 /* Get length of the struct ng_ksocket_sockopt value field, which is the
    just the excess of the message argument portion over the length of
    the struct ng_ksocket_sockopt. */
 static int
 ng_parse_sockoptval_getLength(const struct ng_parse_type *type,
 	const u_char *start, const u_char *buf)
 {
 	static const int offset = OFFSETOF(struct ng_ksocket_sockopt, value);
 	const struct ng_ksocket_sockopt *sopt;
 	const struct ng_mesg *msg;
 
 	sopt = (const struct ng_ksocket_sockopt *)(buf - offset);
 	msg = (const struct ng_mesg *)((const u_char *)sopt - sizeof(*msg));
 	return msg->header.arglen - sizeof(*sopt);
 }
 
 /* Parse type for the option value part of a struct ng_ksocket_sockopt
    XXX Eventually, we should handle the different socket options specially.
    XXX This would avoid byte order problems, eg an integer value of 1 is
    XXX going to be "[1]" for little endian or "[3=1]" for big endian. */
 static const struct ng_parse_type ng_ksocket_sockoptval_type = {
 	&ng_parse_bytearray_type,
 	&ng_parse_sockoptval_getLength
 };
 
 /* Parse type for struct ng_ksocket_sockopt */
 static const struct ng_parse_struct_field ng_ksocket_sockopt_type_fields[]
 	= NG_KSOCKET_SOCKOPT_INFO(&ng_ksocket_sockoptval_type);
 static const struct ng_parse_type ng_ksocket_sockopt_type = {
 	&ng_parse_struct_type,
 	&ng_ksocket_sockopt_type_fields
 };
 
 /* Parse type for struct ng_ksocket_accept */
 static const struct ng_parse_struct_field ng_ksocket_accept_type_fields[]
 	= NGM_KSOCKET_ACCEPT_INFO;
 static const struct ng_parse_type ng_ksocket_accept_type = {
 	&ng_parse_struct_type,
 	&ng_ksocket_accept_type_fields
 };
 
 /* List of commands and how to convert arguments to/from ASCII */
 static const struct ng_cmdlist ng_ksocket_cmds[] = {
 	{
 	  NGM_KSOCKET_COOKIE,
 	  NGM_KSOCKET_BIND,
 	  "bind",
 	  &ng_ksocket_sockaddr_type,
 	  NULL
 	},
 	{
 	  NGM_KSOCKET_COOKIE,
 	  NGM_KSOCKET_LISTEN,
 	  "listen",
 	  &ng_parse_int32_type,
 	  NULL
 	},
 	{
 	  NGM_KSOCKET_COOKIE,
 	  NGM_KSOCKET_ACCEPT,
 	  "accept",
 	  NULL,
 	  &ng_ksocket_accept_type
 	},
 	{
 	  NGM_KSOCKET_COOKIE,
 	  NGM_KSOCKET_CONNECT,
 	  "connect",
 	  &ng_ksocket_sockaddr_type,
 	  &ng_parse_int32_type
 	},
 	{
 	  NGM_KSOCKET_COOKIE,
 	  NGM_KSOCKET_GETNAME,
 	  "getname",
 	  NULL,
 	  &ng_ksocket_sockaddr_type
 	},
 	{
 	  NGM_KSOCKET_COOKIE,
 	  NGM_KSOCKET_GETPEERNAME,
 	  "getpeername",
 	  NULL,
 	  &ng_ksocket_sockaddr_type
 	},
 	{
 	  NGM_KSOCKET_COOKIE,
 	  NGM_KSOCKET_SETOPT,
 	  "setopt",
 	  &ng_ksocket_sockopt_type,
 	  NULL
 	},
 	{
 	  NGM_KSOCKET_COOKIE,
 	  NGM_KSOCKET_GETOPT,
 	  "getopt",
 	  &ng_ksocket_sockopt_type,
 	  &ng_ksocket_sockopt_type
 	},
 	{ 0 }
 };
 
 /* Node type descriptor */
 static struct ng_type ng_ksocket_typestruct = {
 	.version =	NG_ABI_VERSION,
 	.name =		NG_KSOCKET_NODE_TYPE,
 	.constructor =	ng_ksocket_constructor,
 	.rcvmsg =	ng_ksocket_rcvmsg,
 	.shutdown =	ng_ksocket_shutdown,
 	.newhook =	ng_ksocket_newhook,
 	.connect =	ng_ksocket_connect,
 	.rcvdata =	ng_ksocket_rcvdata,
 	.disconnect =	ng_ksocket_disconnect,
 	.cmdlist =	ng_ksocket_cmds,
 };
 NETGRAPH_INIT(ksocket, &ng_ksocket_typestruct);
 
 #define ERROUT(x)	do { error = (x); goto done; } while (0)
 
 /************************************************************************
 			NETGRAPH NODE STUFF
  ************************************************************************/
 
 /*
  * Node type constructor
  * The NODE part is assumed to be all set up.
  * There is already a reference to the node for us.
  */
 static int
 ng_ksocket_constructor(node_p node)
 {
 	priv_p priv;
 
 	/* Allocate private structure */
 	priv = malloc(sizeof(*priv), M_NETGRAPH_KSOCKET, M_NOWAIT | M_ZERO);
 	if (priv == NULL)
 		return (ENOMEM);
 
 	LIST_INIT(&priv->embryos);
 	/* cross link them */
 	priv->node = node;
 	NG_NODE_SET_PRIVATE(node, priv);
 
 	/* Done */
 	return (0);
 }
 
 /*
  * Give our OK for a hook to be added. The hook name is of the
  * form "<family>/<type>/<proto>" where the three components may
  * be decimal numbers or else aliases from the above lists.
  *
  * Connecting a hook amounts to opening the socket.  Disconnecting
  * the hook closes the socket and destroys the node as well.
  */
 static int
 ng_ksocket_newhook(node_p node, hook_p hook, const char *name0)
 {
 	struct thread *td = curthread;	/* XXX broken */
 	const priv_p priv = NG_NODE_PRIVATE(node);
 	char *s1, *s2, name[NG_HOOKSIZ];
 	int family, type, protocol, error;
 
 	/* Check if we're already connected */
 	if (priv->hook != NULL)
 		return (EISCONN);
 
 	if (priv->flags & KSF_CLONED) {
 		if (priv->flags & KSF_EMBRYONIC) {
 			/* Remove ourselves from our parent's embryo list */
 			LIST_REMOVE(priv, siblings);
 			priv->flags &= ~KSF_EMBRYONIC;
 		}
 	} else {
 		/* Extract family, type, and protocol from hook name */
 		snprintf(name, sizeof(name), "%s", name0);
 		s1 = name;
 		if ((s2 = strchr(s1, '/')) == NULL)
 			return (EINVAL);
 		*s2++ = '\0';
 		family = ng_ksocket_parse(ng_ksocket_families, s1, 0);
 		if (family == -1)
 			return (EINVAL);
 		s1 = s2;
 		if ((s2 = strchr(s1, '/')) == NULL)
 			return (EINVAL);
 		*s2++ = '\0';
 		type = ng_ksocket_parse(ng_ksocket_types, s1, 0);
 		if (type == -1)
 			return (EINVAL);
 		s1 = s2;
 		protocol = ng_ksocket_parse(ng_ksocket_protos, s1, family);
 		if (protocol == -1)
 			return (EINVAL);
 
 		/* Create the socket */
 		error = socreate(family, &priv->so, type, protocol,
 		   td->td_ucred, td);
 		if (error != 0)
 			return (error);
 
 		/* XXX call soreserve() ? */
 	}
 
 	/* OK */
 	priv->hook = hook;
 
 	/*
 	 * In case of misconfigured routing a packet may reenter
 	 * ksocket node recursively. Decouple stack to avoid possible
 	 * panics about sleeping with locks held.
 	 */
 	NG_HOOK_FORCE_QUEUE(hook);
 
 	return(0);
 }
 
 static int
 ng_ksocket_connect(hook_p hook)
 {
 	node_p node = NG_HOOK_NODE(hook);
 	const priv_p priv = NG_NODE_PRIVATE(node);
 	struct socket *const so = priv->so;
 
 	/* Add our hook for incoming data and other events */
 	SOCKBUF_LOCK(&priv->so->so_rcv);
 	soupcall_set(priv->so, SO_RCV, ng_ksocket_incoming, node);
 	SOCKBUF_UNLOCK(&priv->so->so_rcv);
 	SOCKBUF_LOCK(&priv->so->so_snd);
 	soupcall_set(priv->so, SO_SND, ng_ksocket_incoming, node);
 	SOCKBUF_UNLOCK(&priv->so->so_snd);
 	SOCK_LOCK(priv->so);
 	priv->so->so_state |= SS_NBIO;
 	SOCK_UNLOCK(priv->so);
 	/*
 	 * --Original comment--
 	 * On a cloned socket we may have already received one or more
 	 * upcalls which we couldn't handle without a hook.  Handle
 	 * those now.
 	 * We cannot call the upcall function directly
 	 * from here, because until this function has returned our
 	 * hook isn't connected.
 	 *
 	 * ---meta comment for -current ---
 	 * XXX This is dubius.
 	 * Upcalls between the time that the hook was
 	 * first created and now (on another processesor) will
 	 * be earlier on the queue than the request to finalise the hook.
 	 * By the time the hook is finalised,
 	 * The queued upcalls will have happened and the code
 	 * will have discarded them because of a lack of a hook.
 	 * (socket not open).
 	 *
 	 * This is a bad byproduct of the complicated way in which hooks
 	 * are now created (3 daisy chained async events).
 	 *
 	 * Since we are a netgraph operation
 	 * We know that we hold a lock on this node. This forces the
 	 * request we make below to be queued rather than implemented
 	 * immediately which will cause the upcall function to be called a bit
 	 * later.
 	 * However, as we will run any waiting queued operations immediately
 	 * after doing this one, if we have not finalised the other end
 	 * of the hook, those queued operations will fail.
 	 */
 	if (priv->flags & KSF_CLONED) {
 		ng_send_fn(node, NULL, &ng_ksocket_incoming2, so, M_NOWAIT);
 	}
 
 	return (0);
 }
 
 /*
  * Receive a control message
  */
 static int
 ng_ksocket_rcvmsg(node_p node, item_p item, hook_p lasthook)
 {
 	struct thread *td = curthread;	/* XXX broken */
 	const priv_p priv = NG_NODE_PRIVATE(node);
 	struct socket *const so = priv->so;
 	struct ng_mesg *resp = NULL;
 	int error = 0;
 	struct ng_mesg *msg;
 
 	NGI_GET_MSG(item, msg);
 	switch (msg->header.typecookie) {
 	case NGM_KSOCKET_COOKIE:
 		switch (msg->header.cmd) {
 		case NGM_KSOCKET_BIND:
 		    {
 			struct sockaddr *const sa
 			    = (struct sockaddr *)msg->data;
 
 			/* Sanity check */
 			if (msg->header.arglen < SADATA_OFFSET
 			    || msg->header.arglen < sa->sa_len)
 				ERROUT(EINVAL);
 			if (so == NULL)
 				ERROUT(ENXIO);
 
 			/* Bind */
 			error = sobind(so, sa, td);
 			break;
 		    }
 		case NGM_KSOCKET_LISTEN:
 		    {
 			/* Sanity check */
 			if (msg->header.arglen != sizeof(int32_t))
 				ERROUT(EINVAL);
 			if (so == NULL)
 				ERROUT(ENXIO);
 
 			/* Listen */
 			so->so_state |= SS_NBIO;
 			error = solisten(so, *((int32_t *)msg->data), td);
 			break;
 		    }
 
 		case NGM_KSOCKET_ACCEPT:
 		    {
 			/* Sanity check */
 			if (msg->header.arglen != 0)
 				ERROUT(EINVAL);
 			if (so == NULL)
 				ERROUT(ENXIO);
 
 			/* Make sure the socket is capable of accepting */
 			if (!(so->so_options & SO_ACCEPTCONN))
 				ERROUT(EINVAL);
 			if (priv->flags & KSF_ACCEPTING)
 				ERROUT(EALREADY);
 
 			/*
 			 * If a connection is already complete, take it.
 			 * Otherwise let the upcall function deal with
 			 * the connection when it comes in.
 			 */
 			error = ng_ksocket_accept(priv);
 			if (error != 0 && error != EWOULDBLOCK)
 				ERROUT(error);
 			priv->response_token = msg->header.token;
 			priv->response_addr = NGI_RETADDR(item);
 			break;
 		    }
 
 		case NGM_KSOCKET_CONNECT:
 		    {
 			struct sockaddr *const sa
 			    = (struct sockaddr *)msg->data;
 
 			/* Sanity check */
 			if (msg->header.arglen < SADATA_OFFSET
 			    || msg->header.arglen < sa->sa_len)
 				ERROUT(EINVAL);
 			if (so == NULL)
 				ERROUT(ENXIO);
 
 			/* Do connect */
 			if ((so->so_state & SS_ISCONNECTING) != 0)
 				ERROUT(EALREADY);
 			if ((error = soconnect(so, sa, td)) != 0) {
 				so->so_state &= ~SS_ISCONNECTING;
 				ERROUT(error);
 			}
 			if ((so->so_state & SS_ISCONNECTING) != 0) {
 				/* We will notify the sender when we connect */
 				priv->response_token = msg->header.token;
 				priv->response_addr = NGI_RETADDR(item);
 				priv->flags |= KSF_CONNECTING;
 				ERROUT(EINPROGRESS);
 			}
 			break;
 		    }
 
 		case NGM_KSOCKET_GETNAME:
 		case NGM_KSOCKET_GETPEERNAME:
 		    {
 			int (*func)(struct socket *so, struct sockaddr **nam);
 			struct sockaddr *sa = NULL;
 			int len;
 
 			/* Sanity check */
 			if (msg->header.arglen != 0)
 				ERROUT(EINVAL);
 			if (so == NULL)
 				ERROUT(ENXIO);
 
 			/* Get function */
 			if (msg->header.cmd == NGM_KSOCKET_GETPEERNAME) {
 				if ((so->so_state
 				    & (SS_ISCONNECTED|SS_ISCONFIRMING)) == 0)
 					ERROUT(ENOTCONN);
 				func = so->so_proto->pr_peeraddr;
 			} else
 				func = so->so_proto->pr_sockaddr;
 
 			/* Get local or peer address */
 			if ((error = (*func)(so, &sa)) != 0)
 				goto bail;
 			len = (sa == NULL) ? 0 : sa->sa_len;
 
 			/* Send it back in a response */
 			NG_MKRESPONSE(resp, msg, len, M_NOWAIT);
 			if (resp == NULL) {
 				error = ENOMEM;
 				goto bail;
 			}
 			bcopy(sa, resp->data, len);
 
 		bail:
 			/* Cleanup */
 			if (sa != NULL)
 				free(sa, M_SONAME);
 			break;
 		    }
 
 		case NGM_KSOCKET_GETOPT:
 		    {
 			struct ng_ksocket_sockopt *ksopt =
 			    (struct ng_ksocket_sockopt *)msg->data;
 			struct sockopt sopt;
 
 			/* Sanity check */
 			if (msg->header.arglen != sizeof(*ksopt))
 				ERROUT(EINVAL);
 			if (so == NULL)
 				ERROUT(ENXIO);
 
 			/* Get response with room for option value */
 			NG_MKRESPONSE(resp, msg, sizeof(*ksopt)
 			    + NG_KSOCKET_MAX_OPTLEN, M_NOWAIT);
 			if (resp == NULL)
 				ERROUT(ENOMEM);
 
 			/* Get socket option, and put value in the response */
 			sopt.sopt_dir = SOPT_GET;
 			sopt.sopt_level = ksopt->level;
 			sopt.sopt_name = ksopt->name;
 			sopt.sopt_td = NULL;
 			sopt.sopt_valsize = NG_KSOCKET_MAX_OPTLEN;
 			ksopt = (struct ng_ksocket_sockopt *)resp->data;
 			sopt.sopt_val = ksopt->value;
 			if ((error = sogetopt(so, &sopt)) != 0) {
 				NG_FREE_MSG(resp);
 				break;
 			}
 
 			/* Set actual value length */
 			resp->header.arglen = sizeof(*ksopt)
 			    + sopt.sopt_valsize;
 			break;
 		    }
 
 		case NGM_KSOCKET_SETOPT:
 		    {
 			struct ng_ksocket_sockopt *const ksopt =
 			    (struct ng_ksocket_sockopt *)msg->data;
 			const int valsize = msg->header.arglen - sizeof(*ksopt);
 			struct sockopt sopt;
 
 			/* Sanity check */
 			if (valsize < 0)
 				ERROUT(EINVAL);
 			if (so == NULL)
 				ERROUT(ENXIO);
 
 			/* Set socket option */
 			sopt.sopt_dir = SOPT_SET;
 			sopt.sopt_level = ksopt->level;
 			sopt.sopt_name = ksopt->name;
 			sopt.sopt_val = ksopt->value;
 			sopt.sopt_valsize = valsize;
 			sopt.sopt_td = NULL;
 			error = sosetopt(so, &sopt);
 			break;
 		    }
 
 		default:
 			error = EINVAL;
 			break;
 		}
 		break;
 	default:
 		error = EINVAL;
 		break;
 	}
 done:
 	NG_RESPOND_MSG(error, node, item, resp);
 	NG_FREE_MSG(msg);
 	return (error);
 }
 
 /*
  * Receive incoming data on our hook.  Send it out the socket.
  */
 static int
 ng_ksocket_rcvdata(hook_p hook, item_p item)
 {
 	struct thread *td = curthread;	/* XXX broken */
 	const node_p node = NG_HOOK_NODE(hook);
 	const priv_p priv = NG_NODE_PRIVATE(node);
 	struct socket *const so = priv->so;
 	struct sockaddr *sa = NULL;
 	int error;
 	struct mbuf *m;
 #ifdef ALIGNED_POINTER
 	struct mbuf *n;
 #endif /* ALIGNED_POINTER */
 	struct sa_tag *stag;
 
 	/* Extract data */
 	NGI_GET_M(item, m);
 	NG_FREE_ITEM(item);
 #ifdef ALIGNED_POINTER
 	if (!ALIGNED_POINTER(mtod(m, caddr_t), uint32_t)) {
 		n = m_defrag(m, M_NOWAIT);
 		if (n == NULL) {
 			m_freem(m);
 			return (ENOBUFS);
 		}
 		m = n;
 	}
 #endif /* ALIGNED_POINTER */
 	/*
 	 * Look if socket address is stored in packet tags.
 	 * If sockaddr is ours, or provided by a third party (zero id),
 	 * then we accept it.
 	 */
 	if (((stag = (struct sa_tag *)m_tag_locate(m, NGM_KSOCKET_COOKIE,
 	    NG_KSOCKET_TAG_SOCKADDR, NULL)) != NULL) &&
 	    (stag->id == NG_NODE_ID(node) || stag->id == 0))
 		sa = &stag->sa;
 
 	/* Reset specific mbuf flags to prevent addressing problems. */
 	m->m_flags &= ~(M_BCAST|M_MCAST);
 
 	/* Send packet */
 	error = sosend(so, sa, 0, m, 0, 0, td);
 
 	return (error);
 }
 
 /*
  * Destroy node
  */
 static int
 ng_ksocket_shutdown(node_p node)
 {
 	const priv_p priv = NG_NODE_PRIVATE(node);
 	priv_p embryo;
 
 	/* Close our socket (if any) */
 	if (priv->so != NULL) {
 		SOCKBUF_LOCK(&priv->so->so_rcv);
 		soupcall_clear(priv->so, SO_RCV);
 		SOCKBUF_UNLOCK(&priv->so->so_rcv);
 		SOCKBUF_LOCK(&priv->so->so_snd);
 		soupcall_clear(priv->so, SO_SND);
 		SOCKBUF_UNLOCK(&priv->so->so_snd);
 		soclose(priv->so);
 		priv->so = NULL;
 	}
 
 	/* If we are an embryo, take ourselves out of the parent's list */
 	if (priv->flags & KSF_EMBRYONIC) {
 		LIST_REMOVE(priv, siblings);
 		priv->flags &= ~KSF_EMBRYONIC;
 	}
 
 	/* Remove any embryonic children we have */
 	while (!LIST_EMPTY(&priv->embryos)) {
 		embryo = LIST_FIRST(&priv->embryos);
 		ng_rmnode_self(embryo->node);
 	}
 
 	/* Take down netgraph node */
 	bzero(priv, sizeof(*priv));
 	free(priv, M_NETGRAPH_KSOCKET);
 	NG_NODE_SET_PRIVATE(node, NULL);
 	NG_NODE_UNREF(node);		/* let the node escape */
 	return (0);
 }
 
 /*
  * Hook disconnection
  */
 static int
 ng_ksocket_disconnect(hook_p hook)
 {
 	KASSERT(NG_NODE_NUMHOOKS(NG_HOOK_NODE(hook)) == 0,
 	    ("%s: numhooks=%d?", __func__,
 	    NG_NODE_NUMHOOKS(NG_HOOK_NODE(hook))));
 	if (NG_NODE_IS_VALID(NG_HOOK_NODE(hook)))
 		ng_rmnode_self(NG_HOOK_NODE(hook));
 	return (0);
 }
 
 /************************************************************************
 			HELPER STUFF
  ************************************************************************/
 /*
  * You should not "just call" a netgraph node function from an external
  * asynchronous event. This is because in doing so you are ignoring the
  * locking on the netgraph nodes. Instead call your function via ng_send_fn().
  * This will call the function you chose, but will first do all the
  * locking rigmarole. Your function MAY only be called at some distant future
  * time (several millisecs away) so don't give it any arguments
  * that may be revoked soon (e.g. on your stack).
  *
  * To decouple stack, we use queue version of ng_send_fn().
  */
 
 static int
 ng_ksocket_incoming(struct socket *so, void *arg, int waitflag)
 {
 	const node_p node = arg;
 	const priv_p priv = NG_NODE_PRIVATE(node);
 	int wait = ((waitflag & M_WAITOK) ? NG_WAITOK : 0) | NG_QUEUE;
 
 	/*
 	 * Even if node is not locked, as soon as we are called, we assume
 	 * it exist and it's private area is valid. With some care we can
 	 * access it. Mark node that incoming event for it was sent to
 	 * avoid unneded queue trashing.
 	 */
 	if (atomic_cmpset_int(&priv->fn_sent, 0, 1) &&
 	    ng_send_fn1(node, NULL, &ng_ksocket_incoming2, so, 0, wait)) {
 		atomic_store_rel_int(&priv->fn_sent, 0);
 	}
 	return (SU_OK);
 }
 
 /*
  * When incoming data is appended to the socket, we get notified here.
  * This is also called whenever a significant event occurs for the socket.
  * Our original caller may have queued this even some time ago and
  * we cannot trust that he even still exists. The node however is being
  * held with a reference by the queueing code and guarantied to be valid.
  */
 static void
 ng_ksocket_incoming2(node_p node, hook_p hook, void *arg1, int arg2)
 {
 	struct socket *so = arg1;
 	const priv_p priv = NG_NODE_PRIVATE(node);
 	struct ng_mesg *response;
 	int error;
 
 	KASSERT(so == priv->so, ("%s: wrong socket", __func__));
 
 	/* Allow next incoming event to be queued. */
 	atomic_store_rel_int(&priv->fn_sent, 0);
 
 	/* Check whether a pending connect operation has completed */
 	if (priv->flags & KSF_CONNECTING) {
 		if ((error = so->so_error) != 0) {
 			so->so_error = 0;
 			so->so_state &= ~SS_ISCONNECTING;
 		}
 		if (!(so->so_state & SS_ISCONNECTING)) {
 			NG_MKMESSAGE(response, NGM_KSOCKET_COOKIE,
 			    NGM_KSOCKET_CONNECT, sizeof(int32_t), M_NOWAIT);
 			if (response != NULL) {
 				response->header.flags |= NGF_RESP;
 				response->header.token = priv->response_token;
 				*(int32_t *)response->data = error;
 				/*
 				 * send an async "response" message
 				 * to the node that set us up
 				 * (if it still exists)
 				 */
 				NG_SEND_MSG_ID(error, node,
 				    response, priv->response_addr, 0);
 			}
 			priv->flags &= ~KSF_CONNECTING;
 		}
 	}
 
 	/* Check whether a pending accept operation has completed */
 	if (priv->flags & KSF_ACCEPTING)
 		(void )ng_ksocket_accept(priv);
 
 	/*
 	 * If we don't have a hook, we must handle data events later.  When
 	 * the hook gets created and is connected, this upcall function
 	 * will be called again.
 	 */
 	if (priv->hook == NULL)
 		return;
 
 	/* Read and forward available mbufs. */
 	while (1) {
 		struct uio uio;
 		struct sockaddr *sa;
 		struct mbuf *m;
 		int flags;
 
 		/* Try to get next packet from socket. */
 		uio.uio_td = NULL;
 		uio.uio_resid = IP_MAXPACKET;
 		flags = MSG_DONTWAIT;
 		sa = NULL;
 		if ((error = soreceive(so, (so->so_state & SS_ISCONNECTED) ?
 		    NULL : &sa, &uio, &m, NULL, &flags)) != 0)
 			break;
 
 		/* See if we got anything. */
 		if (flags & MSG_TRUNC) {
 			m_freem(m);
 			m = NULL;
 		}
 		if (m == NULL) {
 			if (sa != NULL)
 				free(sa, M_SONAME);
 			break;
 		}
 
 		KASSERT(m->m_nextpkt == NULL, ("%s: nextpkt", __func__));
 
 		/*
 		 * Stream sockets do not have packet boundaries, so
 		 * we have to allocate a header mbuf and attach the
 		 * stream of data to it.
 		 */
 		if (so->so_type == SOCK_STREAM) {
 			struct mbuf *mh;
 
 			mh = m_gethdr(M_NOWAIT, MT_DATA);
 			if (mh == NULL) {
 				m_freem(m);
 				if (sa != NULL)
 					free(sa, M_SONAME);
 				break;
 			}
 
 			mh->m_next = m;
 			for (; m; m = m->m_next)
 				mh->m_pkthdr.len += m->m_len;
 			m = mh;
 		}
 
 		/* Put peer's socket address (if any) into a tag */
 		if (sa != NULL) {
 			struct sa_tag	*stag;
 
 			stag = (struct sa_tag *)m_tag_alloc(NGM_KSOCKET_COOKIE,
 			    NG_KSOCKET_TAG_SOCKADDR, sizeof(ng_ID_t) +
 			    sa->sa_len, M_NOWAIT);
 			if (stag == NULL) {
 				free(sa, M_SONAME);
 				goto sendit;
 			}
 			bcopy(sa, &stag->sa, sa->sa_len);
 			free(sa, M_SONAME);
 			stag->id = NG_NODE_ID(node);
 			m_tag_prepend(m, &stag->tag);
 		}
 
 sendit:		/* Forward data with optional peer sockaddr as packet tag */
 		NG_SEND_DATA_ONLY(error, priv->hook, m);
 	}
 
 	/*
 	 * If the peer has closed the connection, forward a 0-length mbuf
 	 * to indicate end-of-file.
 	 */
 	if (so->so_rcv.sb_state & SBS_CANTRCVMORE &&
 	    !(priv->flags & KSF_EOFSEEN)) {
 		struct mbuf *m;
 
 		m = m_gethdr(M_NOWAIT, MT_DATA);
 		if (m != NULL)
 			NG_SEND_DATA_ONLY(error, priv->hook, m);
 		priv->flags |= KSF_EOFSEEN;
 	}
 }
 
 static int
 ng_ksocket_accept(priv_p priv)
 {
 	struct socket *const head = priv->so;
 	struct socket *so;
 	struct sockaddr *sa = NULL;
 	struct ng_mesg *resp;
 	struct ng_ksocket_accept *resp_data;
 	node_p node;
 	priv_p priv2;
 	int len;
 	int error;
 
 	SOLISTEN_LOCK(head);
 	error = solisten_dequeue(head, &so, SOCK_NONBLOCK);
 	if (error == EWOULDBLOCK) {
 		priv->flags |= KSF_ACCEPTING;
 		return (error);
 	}
 	priv->flags &= ~KSF_ACCEPTING;
 	if (error)
 		return (error);
 
 	if ((error = soaccept(so, &sa)) != 0)
 		return (error);
 
 	len = OFFSETOF(struct ng_ksocket_accept, addr);
 	if (sa != NULL)
 		len += sa->sa_len;
 
 	NG_MKMESSAGE(resp, NGM_KSOCKET_COOKIE, NGM_KSOCKET_ACCEPT, len,
 	    M_NOWAIT);
 	if (resp == NULL) {
 		soclose(so);
 		goto out;
 	}
 	resp->header.flags |= NGF_RESP;
 	resp->header.token = priv->response_token;
 
 	/* Clone a ksocket node to wrap the new socket */
 	error = ng_make_node_common(&ng_ksocket_typestruct, &node);
 	if (error) {
 		free(resp, M_NETGRAPH);
 		soclose(so);
 		goto out;
 	}
 
 	if (ng_ksocket_constructor(node) != 0) {
 		NG_NODE_UNREF(node);
 		free(resp, M_NETGRAPH);
 		soclose(so);
 		goto out;
 	}
 
 	priv2 = NG_NODE_PRIVATE(node);
 	priv2->so = so;
 	priv2->flags |= KSF_CLONED | KSF_EMBRYONIC;
 
 	/*
 	 * Insert the cloned node into a list of embryonic children
 	 * on the parent node.  When a hook is created on the cloned
 	 * node it will be removed from this list.  When the parent
 	 * is destroyed it will destroy any embryonic children it has.
 	 */
 	LIST_INSERT_HEAD(&priv->embryos, priv2, siblings);
 
 	SOCKBUF_LOCK(&so->so_rcv);
 	soupcall_set(so, SO_RCV, ng_ksocket_incoming, node);
 	SOCKBUF_UNLOCK(&so->so_rcv);
 	SOCKBUF_LOCK(&so->so_snd);
 	soupcall_set(so, SO_SND, ng_ksocket_incoming, node);
 	SOCKBUF_UNLOCK(&so->so_snd);
 
 	/* Fill in the response data and send it or return it to the caller */
 	resp_data = (struct ng_ksocket_accept *)resp->data;
 	resp_data->nodeid = NG_NODE_ID(node);
 	if (sa != NULL)
 		bcopy(sa, &resp_data->addr, sa->sa_len);
 	NG_SEND_MSG_ID(error, node, resp, priv->response_addr, 0);
 
 out:
 	if (sa != NULL)
 		free(sa, M_SONAME);
 
 	return (0);
 }
 
 /*
  * Parse out either an integer value or an alias.
  */
 static int
 ng_ksocket_parse(const struct ng_ksocket_alias *aliases,
 	const char *s, int family)
 {
 	int k, val;
 	char *eptr;
 
 	/* Try aliases */
 	for (k = 0; aliases[k].name != NULL; k++) {
 		if (strcmp(s, aliases[k].name) == 0
 		    && aliases[k].family == family)
 			return aliases[k].value;
 	}
 
 	/* Try parsing as a number */
 	val = (int)strtoul(s, &eptr, 10);
 	if (val < 0 || *eptr != '\0')
 		return (-1);
 	return (val);
 }
diff --git a/sys/netinet/in_mcast.c b/sys/netinet/in_mcast.c
index 3f25471f0858..87de83da7a6a 100644
--- a/sys/netinet/in_mcast.c
+++ b/sys/netinet/in_mcast.c
@@ -1,3040 +1,3032 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 2007-2009 Bruce Simpson.
  * Copyright (c) 2005 Robert N. M. Watson.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. The name of the author may not be used to endorse or promote
  *    products derived from this software without specific prior written
  *    permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 /*
  * IPv4 multicast socket, group, and socket option processing module.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/protosw.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/protosw.h>
 #include <sys/sysctl.h>
 #include <sys/ktr.h>
 #include <sys/taskqueue.h>
 #include <sys/tree.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_dl.h>
 #include <net/route.h>
 #include <net/route/nhop.h>
 #include <net/vnet.h>
 
 #include <net/ethernet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_systm.h>
 #include <netinet/in_fib.h>
 #include <netinet/in_pcb.h>
 #include <netinet/in_var.h>
 #include <netinet/ip_var.h>
 #include <netinet/igmp_var.h>
 
 #ifndef KTR_IGMPV3
 #define KTR_IGMPV3 KTR_INET
 #endif
 
 #ifndef __SOCKUNION_DECLARED
 union sockunion {
 	struct sockaddr_storage	ss;
 	struct sockaddr		sa;
 	struct sockaddr_dl	sdl;
 	struct sockaddr_in	sin;
 };
 typedef union sockunion sockunion_t;
 #define __SOCKUNION_DECLARED
 #endif /* __SOCKUNION_DECLARED */
 
 static MALLOC_DEFINE(M_INMFILTER, "in_mfilter",
     "IPv4 multicast PCB-layer source filter");
 static MALLOC_DEFINE(M_IPMADDR, "in_multi", "IPv4 multicast group");
 static MALLOC_DEFINE(M_IPMOPTS, "ip_moptions", "IPv4 multicast options");
 static MALLOC_DEFINE(M_IPMSOURCE, "ip_msource",
     "IPv4 multicast IGMP-layer source filter");
 
 /*
  * Locking:
  *
  * - Lock order is: Giant, IN_MULTI_LOCK, INP_WLOCK,
  *   IN_MULTI_LIST_LOCK, IGMP_LOCK, IF_ADDR_LOCK.
  * - The IF_ADDR_LOCK is implicitly taken by inm_lookup() earlier, however
  *   it can be taken by code in net/if.c also.
  * - ip_moptions and in_mfilter are covered by the INP_WLOCK.
  *
  * struct in_multi is covered by IN_MULTI_LIST_LOCK. There isn't strictly
  * any need for in_multi itself to be virtualized -- it is bound to an ifp
  * anyway no matter what happens.
  */
 struct mtx in_multi_list_mtx;
 MTX_SYSINIT(in_multi_mtx, &in_multi_list_mtx, "in_multi_list_mtx", MTX_DEF);
 
 struct mtx in_multi_free_mtx;
 MTX_SYSINIT(in_multi_free_mtx, &in_multi_free_mtx, "in_multi_free_mtx", MTX_DEF);
 
 struct sx in_multi_sx;
 SX_SYSINIT(in_multi_sx, &in_multi_sx, "in_multi_sx");
 
 int ifma_restart;
 
 /*
  * Functions with non-static linkage defined in this file should be
  * declared in in_var.h:
  *  imo_multi_filter()
  *  in_joingroup()
  *  in_joingroup_locked()
  *  in_leavegroup()
  *  in_leavegroup_locked()
  * and ip_var.h:
  *  inp_freemoptions()
  *  inp_getmoptions()
  *  inp_setmoptions()
  */
 static void	imf_commit(struct in_mfilter *);
 static int	imf_get_source(struct in_mfilter *imf,
 		    const struct sockaddr_in *psin,
 		    struct in_msource **);
 static struct in_msource *
 		imf_graft(struct in_mfilter *, const uint8_t,
 		    const struct sockaddr_in *);
 static void	imf_leave(struct in_mfilter *);
 static int	imf_prune(struct in_mfilter *, const struct sockaddr_in *);
 static void	imf_purge(struct in_mfilter *);
 static void	imf_rollback(struct in_mfilter *);
 static void	imf_reap(struct in_mfilter *);
 static struct in_mfilter *
 		imo_match_group(const struct ip_moptions *,
 		    const struct ifnet *, const struct sockaddr *);
 static struct in_msource *
 		imo_match_source(struct in_mfilter *, const struct sockaddr *);
 static void	ims_merge(struct ip_msource *ims,
 		    const struct in_msource *lims, const int rollback);
 static int	in_getmulti(struct ifnet *, const struct in_addr *,
 		    struct in_multi **);
 static int	inm_get_source(struct in_multi *inm, const in_addr_t haddr,
 		    const int noalloc, struct ip_msource **pims);
 #ifdef KTR
 static int	inm_is_ifp_detached(const struct in_multi *);
 #endif
 static int	inm_merge(struct in_multi *, /*const*/ struct in_mfilter *);
 static void	inm_purge(struct in_multi *);
 static void	inm_reap(struct in_multi *);
 static void inm_release(struct in_multi *);
 static struct ip_moptions *
 		inp_findmoptions(struct inpcb *);
 static int	inp_get_source_filters(struct inpcb *, struct sockopt *);
 static int	inp_join_group(struct inpcb *, struct sockopt *);
 static int	inp_leave_group(struct inpcb *, struct sockopt *);
 static struct ifnet *
 		inp_lookup_mcast_ifp(const struct inpcb *,
 		    const struct sockaddr_in *, const struct in_addr);
 static int	inp_block_unblock_source(struct inpcb *, struct sockopt *);
 static int	inp_set_multicast_if(struct inpcb *, struct sockopt *);
 static int	inp_set_source_filters(struct inpcb *, struct sockopt *);
 static int	sysctl_ip_mcast_filters(SYSCTL_HANDLER_ARGS);
 
 static SYSCTL_NODE(_net_inet_ip, OID_AUTO, mcast,
     CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "IPv4 multicast");
 
 static u_long in_mcast_maxgrpsrc = IP_MAX_GROUP_SRC_FILTER;
 SYSCTL_ULONG(_net_inet_ip_mcast, OID_AUTO, maxgrpsrc,
     CTLFLAG_RWTUN, &in_mcast_maxgrpsrc, 0,
     "Max source filters per group");
 
 static u_long in_mcast_maxsocksrc = IP_MAX_SOCK_SRC_FILTER;
 SYSCTL_ULONG(_net_inet_ip_mcast, OID_AUTO, maxsocksrc,
     CTLFLAG_RWTUN, &in_mcast_maxsocksrc, 0,
     "Max source filters per socket");
 
 int in_mcast_loop = IP_DEFAULT_MULTICAST_LOOP;
 SYSCTL_INT(_net_inet_ip_mcast, OID_AUTO, loop, CTLFLAG_RWTUN,
     &in_mcast_loop, 0, "Loopback multicast datagrams by default");
 
 static SYSCTL_NODE(_net_inet_ip_mcast, OID_AUTO, filters,
     CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_ip_mcast_filters,
     "Per-interface stack-wide source filters");
 
 #ifdef KTR
 /*
  * Inline function which wraps assertions for a valid ifp.
  * The ifnet layer will set the ifma's ifp pointer to NULL if the ifp
  * is detached.
  */
 static int __inline
 inm_is_ifp_detached(const struct in_multi *inm)
 {
 	struct ifnet *ifp;
 
 	KASSERT(inm->inm_ifma != NULL, ("%s: no ifma", __func__));
 	ifp = inm->inm_ifma->ifma_ifp;
 	if (ifp != NULL) {
 		/*
 		 * Sanity check that netinet's notion of ifp is the
 		 * same as net's.
 		 */
 		KASSERT(inm->inm_ifp == ifp, ("%s: bad ifp", __func__));
 	}
 
 	return (ifp == NULL);
 }
 #endif
 
 /*
  * Interface detach can happen in a taskqueue thread context, so we must use a
  * dedicated thread to avoid deadlocks when draining inm_release tasks.
  */
 TASKQUEUE_DEFINE_THREAD(inm_free);
 static struct in_multi_head inm_free_list = SLIST_HEAD_INITIALIZER();
 static void inm_release_task(void *arg __unused, int pending __unused);
 static struct task inm_free_task = TASK_INITIALIZER(0, inm_release_task, NULL);
 
 void
 inm_release_wait(void *arg __unused)
 {
 
 	/*
 	 * Make sure all pending multicast addresses are freed before
 	 * the VNET or network device is destroyed:
 	 */
 	taskqueue_drain(taskqueue_inm_free, &inm_free_task);
 }
 #ifdef VIMAGE
 /* XXX-BZ FIXME, see D24914. */
 VNET_SYSUNINIT(inm_release_wait, SI_SUB_PROTO_DOMAIN, SI_ORDER_FIRST, inm_release_wait, NULL);
 #endif
 
 void
 inm_release_list_deferred(struct in_multi_head *inmh)
 {
 
 	if (SLIST_EMPTY(inmh))
 		return;
 	mtx_lock(&in_multi_free_mtx);
 	SLIST_CONCAT(&inm_free_list, inmh, in_multi, inm_nrele);
 	mtx_unlock(&in_multi_free_mtx);
 	taskqueue_enqueue(taskqueue_inm_free, &inm_free_task);
 }
 
 void
 inm_disconnect(struct in_multi *inm)
 {
 	struct ifnet *ifp;
 	struct ifmultiaddr *ifma, *ll_ifma;
 
 	ifp = inm->inm_ifp;
 	IF_ADDR_WLOCK_ASSERT(ifp);
 	ifma = inm->inm_ifma;
 
 	if_ref(ifp);
 	if (ifma->ifma_flags & IFMA_F_ENQUEUED) {
 		CK_STAILQ_REMOVE(&ifp->if_multiaddrs, ifma, ifmultiaddr, ifma_link);
 		ifma->ifma_flags &= ~IFMA_F_ENQUEUED;
 	}
 	MCDPRINTF("removed ifma: %p from %s\n", ifma, ifp->if_xname);
 	if ((ll_ifma = ifma->ifma_llifma) != NULL) {
 		MPASS(ifma != ll_ifma);
 		ifma->ifma_llifma = NULL;
 		MPASS(ll_ifma->ifma_llifma == NULL);
 		MPASS(ll_ifma->ifma_ifp == ifp);
 		if (--ll_ifma->ifma_refcount == 0) {
 			if (ll_ifma->ifma_flags & IFMA_F_ENQUEUED) {
 				CK_STAILQ_REMOVE(&ifp->if_multiaddrs, ll_ifma, ifmultiaddr, ifma_link);
 				ll_ifma->ifma_flags &= ~IFMA_F_ENQUEUED;
 			}
 			MCDPRINTF("removed ll_ifma: %p from %s\n", ll_ifma, ifp->if_xname);
 			if_freemulti(ll_ifma);
 			ifma_restart = true;
 		}
 	}
 }
 
 void
 inm_release_deferred(struct in_multi *inm)
 {
 	struct in_multi_head tmp;
 
 	IN_MULTI_LIST_LOCK_ASSERT();
 	MPASS(inm->inm_refcount > 0);
 	if (--inm->inm_refcount == 0) {
 		SLIST_INIT(&tmp);
 		inm_disconnect(inm);
 		inm->inm_ifma->ifma_protospec = NULL;
 		SLIST_INSERT_HEAD(&tmp, inm, inm_nrele);
 		inm_release_list_deferred(&tmp);
 	}
 }
 
 static void
 inm_release_task(void *arg __unused, int pending __unused)
 {
 	struct in_multi_head inm_free_tmp;
 	struct in_multi *inm, *tinm;
 
 	SLIST_INIT(&inm_free_tmp);
 	mtx_lock(&in_multi_free_mtx);
 	SLIST_CONCAT(&inm_free_tmp, &inm_free_list, in_multi, inm_nrele);
 	mtx_unlock(&in_multi_free_mtx);
 	IN_MULTI_LOCK();
 	SLIST_FOREACH_SAFE(inm, &inm_free_tmp, inm_nrele, tinm) {
 		SLIST_REMOVE_HEAD(&inm_free_tmp, inm_nrele);
 		MPASS(inm);
 		inm_release(inm);
 	}
 	IN_MULTI_UNLOCK();
 }
 
 /*
  * Initialize an in_mfilter structure to a known state at t0, t1
  * with an empty source filter list.
  */
 static __inline void
 imf_init(struct in_mfilter *imf, const int st0, const int st1)
 {
 	memset(imf, 0, sizeof(struct in_mfilter));
 	RB_INIT(&imf->imf_sources);
 	imf->imf_st[0] = st0;
 	imf->imf_st[1] = st1;
 }
 
 struct in_mfilter *
 ip_mfilter_alloc(const int mflags, const int st0, const int st1)
 {
 	struct in_mfilter *imf;
 
 	imf = malloc(sizeof(*imf), M_INMFILTER, mflags);
 	if (imf != NULL)
 		imf_init(imf, st0, st1);
 
 	return (imf);
 }
 
 void
 ip_mfilter_free(struct in_mfilter *imf)
 {
 
 	imf_purge(imf);
 	free(imf, M_INMFILTER);
 }
 
 /*
  * Function for looking up an in_multi record for an IPv4 multicast address
  * on a given interface. ifp must be valid. If no record found, return NULL.
  * The IN_MULTI_LIST_LOCK and IF_ADDR_LOCK on ifp must be held.
  */
 struct in_multi *
 inm_lookup_locked(struct ifnet *ifp, const struct in_addr ina)
 {
 	struct ifmultiaddr *ifma;
 	struct in_multi *inm;
 
 	IN_MULTI_LIST_LOCK_ASSERT();
 	IF_ADDR_LOCK_ASSERT(ifp);
 
 	inm = NULL;
 	CK_STAILQ_FOREACH(ifma, &((ifp)->if_multiaddrs), ifma_link) {
 		if (ifma->ifma_addr->sa_family != AF_INET ||
 			ifma->ifma_protospec == NULL)
 			continue;
 		inm = (struct in_multi *)ifma->ifma_protospec;
 		if (inm->inm_addr.s_addr == ina.s_addr)
 			break;
 		inm = NULL;
 	}
 	return (inm);
 }
 
 /*
  * Wrapper for inm_lookup_locked().
  * The IF_ADDR_LOCK will be taken on ifp and released on return.
  */
 struct in_multi *
 inm_lookup(struct ifnet *ifp, const struct in_addr ina)
 {
 	struct epoch_tracker et;
 	struct in_multi *inm;
 
 	IN_MULTI_LIST_LOCK_ASSERT();
 	NET_EPOCH_ENTER(et);
 
 	inm = inm_lookup_locked(ifp, ina);
 	NET_EPOCH_EXIT(et);
 
 	return (inm);
 }
 
 /*
  * Find an IPv4 multicast group entry for this ip_moptions instance
  * which matches the specified group, and optionally an interface.
  * Return its index into the array, or -1 if not found.
  */
 static struct in_mfilter *
 imo_match_group(const struct ip_moptions *imo, const struct ifnet *ifp,
     const struct sockaddr *group)
 {
 	const struct sockaddr_in *gsin;
 	struct in_mfilter *imf;
 	struct in_multi	*inm;
 
 	gsin = (const struct sockaddr_in *)group;
 
 	IP_MFILTER_FOREACH(imf, &imo->imo_head) {
 		inm = imf->imf_inm;
 		if (inm == NULL)
 			continue;
 		if ((ifp == NULL || (inm->inm_ifp == ifp)) &&
 		    in_hosteq(inm->inm_addr, gsin->sin_addr)) {
 			break;
 		}
 	}
 	return (imf);
 }
 
 /*
  * Find an IPv4 multicast source entry for this imo which matches
  * the given group index for this socket, and source address.
  *
  * NOTE: This does not check if the entry is in-mode, merely if
  * it exists, which may not be the desired behaviour.
  */
 static struct in_msource *
 imo_match_source(struct in_mfilter *imf, const struct sockaddr *src)
 {
 	struct ip_msource	 find;
 	struct ip_msource	*ims;
 	const sockunion_t	*psa;
 
 	KASSERT(src->sa_family == AF_INET, ("%s: !AF_INET", __func__));
 
 	/* Source trees are keyed in host byte order. */
 	psa = (const sockunion_t *)src;
 	find.ims_haddr = ntohl(psa->sin.sin_addr.s_addr);
 	ims = RB_FIND(ip_msource_tree, &imf->imf_sources, &find);
 
 	return ((struct in_msource *)ims);
 }
 
 /*
  * Perform filtering for multicast datagrams on a socket by group and source.
  *
  * Returns 0 if a datagram should be allowed through, or various error codes
  * if the socket was not a member of the group, or the source was muted, etc.
  */
 int
 imo_multi_filter(const struct ip_moptions *imo, const struct ifnet *ifp,
     const struct sockaddr *group, const struct sockaddr *src)
 {
 	struct in_mfilter *imf;
 	struct in_msource *ims;
 	int mode;
 
 	KASSERT(ifp != NULL, ("%s: null ifp", __func__));
 
 	imf = imo_match_group(imo, ifp, group);
 	if (imf == NULL)
 		return (MCAST_NOTGMEMBER);
 
 	/*
 	 * Check if the source was included in an (S,G) join.
 	 * Allow reception on exclusive memberships by default,
 	 * reject reception on inclusive memberships by default.
 	 * Exclude source only if an in-mode exclude filter exists.
 	 * Include source only if an in-mode include filter exists.
 	 * NOTE: We are comparing group state here at IGMP t1 (now)
 	 * with socket-layer t0 (since last downcall).
 	 */
 	mode = imf->imf_st[1];
 	ims = imo_match_source(imf, src);
 
 	if ((ims == NULL && mode == MCAST_INCLUDE) ||
 	    (ims != NULL && ims->imsl_st[0] != mode))
 		return (MCAST_NOTSMEMBER);
 
 	return (MCAST_PASS);
 }
 
 /*
  * Find and return a reference to an in_multi record for (ifp, group),
  * and bump its reference count.
  * If one does not exist, try to allocate it, and update link-layer multicast
  * filters on ifp to listen for group.
  * Assumes the IN_MULTI lock is held across the call.
  * Return 0 if successful, otherwise return an appropriate error code.
  */
 static int
 in_getmulti(struct ifnet *ifp, const struct in_addr *group,
     struct in_multi **pinm)
 {
 	struct sockaddr_in	 gsin;
 	struct ifmultiaddr	*ifma;
 	struct in_ifinfo	*ii;
 	struct in_multi		*inm;
 	int error;
 
 	IN_MULTI_LOCK_ASSERT();
 
 	ii = (struct in_ifinfo *)ifp->if_afdata[AF_INET];
 	IN_MULTI_LIST_LOCK();
 	inm = inm_lookup(ifp, *group);
 	if (inm != NULL) {
 		/*
 		 * If we already joined this group, just bump the
 		 * refcount and return it.
 		 */
 		KASSERT(inm->inm_refcount >= 1,
 		    ("%s: bad refcount %d", __func__, inm->inm_refcount));
 		inm_acquire_locked(inm);
 		*pinm = inm;
 	}
 	IN_MULTI_LIST_UNLOCK();
 	if (inm != NULL)
 		return (0);
 
 	memset(&gsin, 0, sizeof(gsin));
 	gsin.sin_family = AF_INET;
 	gsin.sin_len = sizeof(struct sockaddr_in);
 	gsin.sin_addr = *group;
 
 	/*
 	 * Check if a link-layer group is already associated
 	 * with this network-layer group on the given ifnet.
 	 */
 	error = if_addmulti(ifp, (struct sockaddr *)&gsin, &ifma);
 	if (error != 0)
 		return (error);
 
 	/* XXX ifma_protospec must be covered by IF_ADDR_LOCK */
 	IN_MULTI_LIST_LOCK();
 	IF_ADDR_WLOCK(ifp);
 
 	/*
 	 * If something other than netinet is occupying the link-layer
 	 * group, print a meaningful error message and back out of
 	 * the allocation.
 	 * Otherwise, bump the refcount on the existing network-layer
 	 * group association and return it.
 	 */
 	if (ifma->ifma_protospec != NULL) {
 		inm = (struct in_multi *)ifma->ifma_protospec;
 #ifdef INVARIANTS
 		KASSERT(ifma->ifma_addr != NULL, ("%s: no ifma_addr",
 		    __func__));
 		KASSERT(ifma->ifma_addr->sa_family == AF_INET,
 		    ("%s: ifma not AF_INET", __func__));
 		KASSERT(inm != NULL, ("%s: no ifma_protospec", __func__));
 		if (inm->inm_ifma != ifma || inm->inm_ifp != ifp ||
 		    !in_hosteq(inm->inm_addr, *group)) {
 			char addrbuf[INET_ADDRSTRLEN];
 
 			panic("%s: ifma %p is inconsistent with %p (%s)",
 			    __func__, ifma, inm, inet_ntoa_r(*group, addrbuf));
 		}
 #endif
 		inm_acquire_locked(inm);
 		*pinm = inm;
 		goto out_locked;
 	}
 
 	IF_ADDR_WLOCK_ASSERT(ifp);
 
 	/*
 	 * A new in_multi record is needed; allocate and initialize it.
 	 * We DO NOT perform an IGMP join as the in_ layer may need to
 	 * push an initial source list down to IGMP to support SSM.
 	 *
 	 * The initial source filter state is INCLUDE, {} as per the RFC.
 	 */
 	inm = malloc(sizeof(*inm), M_IPMADDR, M_NOWAIT | M_ZERO);
 	if (inm == NULL) {
 		IF_ADDR_WUNLOCK(ifp);
 		IN_MULTI_LIST_UNLOCK();
 		if_delmulti_ifma(ifma);
 		return (ENOMEM);
 	}
 	inm->inm_addr = *group;
 	inm->inm_ifp = ifp;
 	inm->inm_igi = ii->ii_igmp;
 	inm->inm_ifma = ifma;
 	inm->inm_refcount = 1;
 	inm->inm_state = IGMP_NOT_MEMBER;
 	mbufq_init(&inm->inm_scq, IGMP_MAX_STATE_CHANGES);
 	inm->inm_st[0].iss_fmode = MCAST_UNDEFINED;
 	inm->inm_st[1].iss_fmode = MCAST_UNDEFINED;
 	RB_INIT(&inm->inm_srcs);
 
 	ifma->ifma_protospec = inm;
 
 	*pinm = inm;
  out_locked:
 	IF_ADDR_WUNLOCK(ifp);
 	IN_MULTI_LIST_UNLOCK();
 	return (0);
 }
 
 /*
  * Drop a reference to an in_multi record.
  *
  * If the refcount drops to 0, free the in_multi record and
  * delete the underlying link-layer membership.
  */
 static void
 inm_release(struct in_multi *inm)
 {
 	struct ifmultiaddr *ifma;
 	struct ifnet *ifp;
 
 	CTR2(KTR_IGMPV3, "%s: refcount is %d", __func__, inm->inm_refcount);
 	MPASS(inm->inm_refcount == 0);
 	CTR2(KTR_IGMPV3, "%s: freeing inm %p", __func__, inm);
 
 	ifma = inm->inm_ifma;
 	ifp = inm->inm_ifp;
 
 	/* XXX this access is not covered by IF_ADDR_LOCK */
 	CTR2(KTR_IGMPV3, "%s: purging ifma %p", __func__, ifma);
 	if (ifp != NULL) {
 		CURVNET_SET(ifp->if_vnet);
 		inm_purge(inm);
 		free(inm, M_IPMADDR);
 		if_delmulti_ifma_flags(ifma, 1);
 		CURVNET_RESTORE();
 		if_rele(ifp);
 	} else {
 		inm_purge(inm);
 		free(inm, M_IPMADDR);
 		if_delmulti_ifma_flags(ifma, 1);
 	}
 }
 
 /*
  * Clear recorded source entries for a group.
  * Used by the IGMP code. Caller must hold the IN_MULTI lock.
  * FIXME: Should reap.
  */
 void
 inm_clear_recorded(struct in_multi *inm)
 {
 	struct ip_msource	*ims;
 
 	IN_MULTI_LIST_LOCK_ASSERT();
 
 	RB_FOREACH(ims, ip_msource_tree, &inm->inm_srcs) {
 		if (ims->ims_stp) {
 			ims->ims_stp = 0;
 			--inm->inm_st[1].iss_rec;
 		}
 	}
 	KASSERT(inm->inm_st[1].iss_rec == 0,
 	    ("%s: iss_rec %d not 0", __func__, inm->inm_st[1].iss_rec));
 }
 
 /*
  * Record a source as pending for a Source-Group IGMPv3 query.
  * This lives here as it modifies the shared tree.
  *
  * inm is the group descriptor.
  * naddr is the address of the source to record in network-byte order.
  *
  * If the net.inet.igmp.sgalloc sysctl is non-zero, we will
  * lazy-allocate a source node in response to an SG query.
  * Otherwise, no allocation is performed. This saves some memory
  * with the trade-off that the source will not be reported to the
  * router if joined in the window between the query response and
  * the group actually being joined on the local host.
  *
  * VIMAGE: XXX: Currently the igmp_sgalloc feature has been removed.
  * This turns off the allocation of a recorded source entry if
  * the group has not been joined.
  *
  * Return 0 if the source didn't exist or was already marked as recorded.
  * Return 1 if the source was marked as recorded by this function.
  * Return <0 if any error occurred (negated errno code).
  */
 int
 inm_record_source(struct in_multi *inm, const in_addr_t naddr)
 {
 	struct ip_msource	 find;
 	struct ip_msource	*ims, *nims;
 
 	IN_MULTI_LIST_LOCK_ASSERT();
 
 	find.ims_haddr = ntohl(naddr);
 	ims = RB_FIND(ip_msource_tree, &inm->inm_srcs, &find);
 	if (ims && ims->ims_stp)
 		return (0);
 	if (ims == NULL) {
 		if (inm->inm_nsrc == in_mcast_maxgrpsrc)
 			return (-ENOSPC);
 		nims = malloc(sizeof(struct ip_msource), M_IPMSOURCE,
 		    M_NOWAIT | M_ZERO);
 		if (nims == NULL)
 			return (-ENOMEM);
 		nims->ims_haddr = find.ims_haddr;
 		RB_INSERT(ip_msource_tree, &inm->inm_srcs, nims);
 		++inm->inm_nsrc;
 		ims = nims;
 	}
 
 	/*
 	 * Mark the source as recorded and update the recorded
 	 * source count.
 	 */
 	++ims->ims_stp;
 	++inm->inm_st[1].iss_rec;
 
 	return (1);
 }
 
 /*
  * Return a pointer to an in_msource owned by an in_mfilter,
  * given its source address.
  * Lazy-allocate if needed. If this is a new entry its filter state is
  * undefined at t0.
  *
  * imf is the filter set being modified.
  * haddr is the source address in *host* byte-order.
  *
  * SMPng: May be called with locks held; malloc must not block.
  */
 static int
 imf_get_source(struct in_mfilter *imf, const struct sockaddr_in *psin,
     struct in_msource **plims)
 {
 	struct ip_msource	 find;
 	struct ip_msource	*ims, *nims;
 	struct in_msource	*lims;
 	int			 error;
 
 	error = 0;
 	ims = NULL;
 	lims = NULL;
 
 	/* key is host byte order */
 	find.ims_haddr = ntohl(psin->sin_addr.s_addr);
 	ims = RB_FIND(ip_msource_tree, &imf->imf_sources, &find);
 	lims = (struct in_msource *)ims;
 	if (lims == NULL) {
 		if (imf->imf_nsrc == in_mcast_maxsocksrc)
 			return (ENOSPC);
 		nims = malloc(sizeof(struct in_msource), M_INMFILTER,
 		    M_NOWAIT | M_ZERO);
 		if (nims == NULL)
 			return (ENOMEM);
 		lims = (struct in_msource *)nims;
 		lims->ims_haddr = find.ims_haddr;
 		lims->imsl_st[0] = MCAST_UNDEFINED;
 		RB_INSERT(ip_msource_tree, &imf->imf_sources, nims);
 		++imf->imf_nsrc;
 	}
 
 	*plims = lims;
 
 	return (error);
 }
 
 /*
  * Graft a source entry into an existing socket-layer filter set,
  * maintaining any required invariants and checking allocations.
  *
  * The source is marked as being in the new filter mode at t1.
  *
  * Return the pointer to the new node, otherwise return NULL.
  */
 static struct in_msource *
 imf_graft(struct in_mfilter *imf, const uint8_t st1,
     const struct sockaddr_in *psin)
 {
 	struct ip_msource	*nims;
 	struct in_msource	*lims;
 
 	nims = malloc(sizeof(struct in_msource), M_INMFILTER,
 	    M_NOWAIT | M_ZERO);
 	if (nims == NULL)
 		return (NULL);
 	lims = (struct in_msource *)nims;
 	lims->ims_haddr = ntohl(psin->sin_addr.s_addr);
 	lims->imsl_st[0] = MCAST_UNDEFINED;
 	lims->imsl_st[1] = st1;
 	RB_INSERT(ip_msource_tree, &imf->imf_sources, nims);
 	++imf->imf_nsrc;
 
 	return (lims);
 }
 
 /*
  * Prune a source entry from an existing socket-layer filter set,
  * maintaining any required invariants and checking allocations.
  *
  * The source is marked as being left at t1, it is not freed.
  *
  * Return 0 if no error occurred, otherwise return an errno value.
  */
 static int
 imf_prune(struct in_mfilter *imf, const struct sockaddr_in *psin)
 {
 	struct ip_msource	 find;
 	struct ip_msource	*ims;
 	struct in_msource	*lims;
 
 	/* key is host byte order */
 	find.ims_haddr = ntohl(psin->sin_addr.s_addr);
 	ims = RB_FIND(ip_msource_tree, &imf->imf_sources, &find);
 	if (ims == NULL)
 		return (ENOENT);
 	lims = (struct in_msource *)ims;
 	lims->imsl_st[1] = MCAST_UNDEFINED;
 	return (0);
 }
 
 /*
  * Revert socket-layer filter set deltas at t1 to t0 state.
  */
 static void
 imf_rollback(struct in_mfilter *imf)
 {
 	struct ip_msource	*ims, *tims;
 	struct in_msource	*lims;
 
 	RB_FOREACH_SAFE(ims, ip_msource_tree, &imf->imf_sources, tims) {
 		lims = (struct in_msource *)ims;
 		if (lims->imsl_st[0] == lims->imsl_st[1]) {
 			/* no change at t1 */
 			continue;
 		} else if (lims->imsl_st[0] != MCAST_UNDEFINED) {
 			/* revert change to existing source at t1 */
 			lims->imsl_st[1] = lims->imsl_st[0];
 		} else {
 			/* revert source added t1 */
 			CTR2(KTR_IGMPV3, "%s: free ims %p", __func__, ims);
 			RB_REMOVE(ip_msource_tree, &imf->imf_sources, ims);
 			free(ims, M_INMFILTER);
 			imf->imf_nsrc--;
 		}
 	}
 	imf->imf_st[1] = imf->imf_st[0];
 }
 
 /*
  * Mark socket-layer filter set as INCLUDE {} at t1.
  */
 static void
 imf_leave(struct in_mfilter *imf)
 {
 	struct ip_msource	*ims;
 	struct in_msource	*lims;
 
 	RB_FOREACH(ims, ip_msource_tree, &imf->imf_sources) {
 		lims = (struct in_msource *)ims;
 		lims->imsl_st[1] = MCAST_UNDEFINED;
 	}
 	imf->imf_st[1] = MCAST_INCLUDE;
 }
 
 /*
  * Mark socket-layer filter set deltas as committed.
  */
 static void
 imf_commit(struct in_mfilter *imf)
 {
 	struct ip_msource	*ims;
 	struct in_msource	*lims;
 
 	RB_FOREACH(ims, ip_msource_tree, &imf->imf_sources) {
 		lims = (struct in_msource *)ims;
 		lims->imsl_st[0] = lims->imsl_st[1];
 	}
 	imf->imf_st[0] = imf->imf_st[1];
 }
 
 /*
  * Reap unreferenced sources from socket-layer filter set.
  */
 static void
 imf_reap(struct in_mfilter *imf)
 {
 	struct ip_msource	*ims, *tims;
 	struct in_msource	*lims;
 
 	RB_FOREACH_SAFE(ims, ip_msource_tree, &imf->imf_sources, tims) {
 		lims = (struct in_msource *)ims;
 		if ((lims->imsl_st[0] == MCAST_UNDEFINED) &&
 		    (lims->imsl_st[1] == MCAST_UNDEFINED)) {
 			CTR2(KTR_IGMPV3, "%s: free lims %p", __func__, ims);
 			RB_REMOVE(ip_msource_tree, &imf->imf_sources, ims);
 			free(ims, M_INMFILTER);
 			imf->imf_nsrc--;
 		}
 	}
 }
 
 /*
  * Purge socket-layer filter set.
  */
 static void
 imf_purge(struct in_mfilter *imf)
 {
 	struct ip_msource	*ims, *tims;
 
 	RB_FOREACH_SAFE(ims, ip_msource_tree, &imf->imf_sources, tims) {
 		CTR2(KTR_IGMPV3, "%s: free ims %p", __func__, ims);
 		RB_REMOVE(ip_msource_tree, &imf->imf_sources, ims);
 		free(ims, M_INMFILTER);
 		imf->imf_nsrc--;
 	}
 	imf->imf_st[0] = imf->imf_st[1] = MCAST_UNDEFINED;
 	KASSERT(RB_EMPTY(&imf->imf_sources),
 	    ("%s: imf_sources not empty", __func__));
 }
 
 /*
  * Look up a source filter entry for a multicast group.
  *
  * inm is the group descriptor to work with.
  * haddr is the host-byte-order IPv4 address to look up.
  * noalloc may be non-zero to suppress allocation of sources.
  * *pims will be set to the address of the retrieved or allocated source.
  *
  * SMPng: NOTE: may be called with locks held.
  * Return 0 if successful, otherwise return a non-zero error code.
  */
 static int
 inm_get_source(struct in_multi *inm, const in_addr_t haddr,
     const int noalloc, struct ip_msource **pims)
 {
 	struct ip_msource	 find;
 	struct ip_msource	*ims, *nims;
 
 	find.ims_haddr = haddr;
 	ims = RB_FIND(ip_msource_tree, &inm->inm_srcs, &find);
 	if (ims == NULL && !noalloc) {
 		if (inm->inm_nsrc == in_mcast_maxgrpsrc)
 			return (ENOSPC);
 		nims = malloc(sizeof(struct ip_msource), M_IPMSOURCE,
 		    M_NOWAIT | M_ZERO);
 		if (nims == NULL)
 			return (ENOMEM);
 		nims->ims_haddr = haddr;
 		RB_INSERT(ip_msource_tree, &inm->inm_srcs, nims);
 		++inm->inm_nsrc;
 		ims = nims;
 #ifdef KTR
 		CTR3(KTR_IGMPV3, "%s: allocated 0x%08x as %p", __func__,
 		    haddr, ims);
 #endif
 	}
 
 	*pims = ims;
 	return (0);
 }
 
 /*
  * Merge socket-layer source into IGMP-layer source.
  * If rollback is non-zero, perform the inverse of the merge.
  */
 static void
 ims_merge(struct ip_msource *ims, const struct in_msource *lims,
     const int rollback)
 {
 	int n = rollback ? -1 : 1;
 
 	if (lims->imsl_st[0] == MCAST_EXCLUDE) {
 		CTR3(KTR_IGMPV3, "%s: t1 ex -= %d on 0x%08x",
 		    __func__, n, ims->ims_haddr);
 		ims->ims_st[1].ex -= n;
 	} else if (lims->imsl_st[0] == MCAST_INCLUDE) {
 		CTR3(KTR_IGMPV3, "%s: t1 in -= %d on 0x%08x",
 		    __func__, n, ims->ims_haddr);
 		ims->ims_st[1].in -= n;
 	}
 
 	if (lims->imsl_st[1] == MCAST_EXCLUDE) {
 		CTR3(KTR_IGMPV3, "%s: t1 ex += %d on 0x%08x",
 		    __func__, n, ims->ims_haddr);
 		ims->ims_st[1].ex += n;
 	} else if (lims->imsl_st[1] == MCAST_INCLUDE) {
 		CTR3(KTR_IGMPV3, "%s: t1 in += %d on 0x%08x",
 		    __func__, n, ims->ims_haddr);
 		ims->ims_st[1].in += n;
 	}
 }
 
 /*
  * Atomically update the global in_multi state, when a membership's
  * filter list is being updated in any way.
  *
  * imf is the per-inpcb-membership group filter pointer.
  * A fake imf may be passed for in-kernel consumers.
  *
  * XXX This is a candidate for a set-symmetric-difference style loop
  * which would eliminate the repeated lookup from root of ims nodes,
  * as they share the same key space.
  *
  * If any error occurred this function will back out of refcounts
  * and return a non-zero value.
  */
 static int
 inm_merge(struct in_multi *inm, /*const*/ struct in_mfilter *imf)
 {
 	struct ip_msource	*ims, *nims;
 	struct in_msource	*lims;
 	int			 schanged, error;
 	int			 nsrc0, nsrc1;
 
 	schanged = 0;
 	error = 0;
 	nsrc1 = nsrc0 = 0;
 	IN_MULTI_LIST_LOCK_ASSERT();
 
 	/*
 	 * Update the source filters first, as this may fail.
 	 * Maintain count of in-mode filters at t0, t1. These are
 	 * used to work out if we transition into ASM mode or not.
 	 * Maintain a count of source filters whose state was
 	 * actually modified by this operation.
 	 */
 	RB_FOREACH(ims, ip_msource_tree, &imf->imf_sources) {
 		lims = (struct in_msource *)ims;
 		if (lims->imsl_st[0] == imf->imf_st[0]) nsrc0++;
 		if (lims->imsl_st[1] == imf->imf_st[1]) nsrc1++;
 		if (lims->imsl_st[0] == lims->imsl_st[1]) continue;
 		error = inm_get_source(inm, lims->ims_haddr, 0, &nims);
 		++schanged;
 		if (error)
 			break;
 		ims_merge(nims, lims, 0);
 	}
 	if (error) {
 		struct ip_msource *bims;
 
 		RB_FOREACH_REVERSE_FROM(ims, ip_msource_tree, nims) {
 			lims = (struct in_msource *)ims;
 			if (lims->imsl_st[0] == lims->imsl_st[1])
 				continue;
 			(void)inm_get_source(inm, lims->ims_haddr, 1, &bims);
 			if (bims == NULL)
 				continue;
 			ims_merge(bims, lims, 1);
 		}
 		goto out_reap;
 	}
 
 	CTR3(KTR_IGMPV3, "%s: imf filters in-mode: %d at t0, %d at t1",
 	    __func__, nsrc0, nsrc1);
 
 	/* Handle transition between INCLUDE {n} and INCLUDE {} on socket. */
 	if (imf->imf_st[0] == imf->imf_st[1] &&
 	    imf->imf_st[1] == MCAST_INCLUDE) {
 		if (nsrc1 == 0) {
 			CTR1(KTR_IGMPV3, "%s: --in on inm at t1", __func__);
 			--inm->inm_st[1].iss_in;
 		}
 	}
 
 	/* Handle filter mode transition on socket. */
 	if (imf->imf_st[0] != imf->imf_st[1]) {
 		CTR3(KTR_IGMPV3, "%s: imf transition %d to %d",
 		    __func__, imf->imf_st[0], imf->imf_st[1]);
 
 		if (imf->imf_st[0] == MCAST_EXCLUDE) {
 			CTR1(KTR_IGMPV3, "%s: --ex on inm at t1", __func__);
 			--inm->inm_st[1].iss_ex;
 		} else if (imf->imf_st[0] == MCAST_INCLUDE) {
 			CTR1(KTR_IGMPV3, "%s: --in on inm at t1", __func__);
 			--inm->inm_st[1].iss_in;
 		}
 
 		if (imf->imf_st[1] == MCAST_EXCLUDE) {
 			CTR1(KTR_IGMPV3, "%s: ex++ on inm at t1", __func__);
 			inm->inm_st[1].iss_ex++;
 		} else if (imf->imf_st[1] == MCAST_INCLUDE && nsrc1 > 0) {
 			CTR1(KTR_IGMPV3, "%s: in++ on inm at t1", __func__);
 			inm->inm_st[1].iss_in++;
 		}
 	}
 
 	/*
 	 * Track inm filter state in terms of listener counts.
 	 * If there are any exclusive listeners, stack-wide
 	 * membership is exclusive.
 	 * Otherwise, if only inclusive listeners, stack-wide is inclusive.
 	 * If no listeners remain, state is undefined at t1,
 	 * and the IGMP lifecycle for this group should finish.
 	 */
 	if (inm->inm_st[1].iss_ex > 0) {
 		CTR1(KTR_IGMPV3, "%s: transition to EX", __func__);
 		inm->inm_st[1].iss_fmode = MCAST_EXCLUDE;
 	} else if (inm->inm_st[1].iss_in > 0) {
 		CTR1(KTR_IGMPV3, "%s: transition to IN", __func__);
 		inm->inm_st[1].iss_fmode = MCAST_INCLUDE;
 	} else {
 		CTR1(KTR_IGMPV3, "%s: transition to UNDEF", __func__);
 		inm->inm_st[1].iss_fmode = MCAST_UNDEFINED;
 	}
 
 	/* Decrement ASM listener count on transition out of ASM mode. */
 	if (imf->imf_st[0] == MCAST_EXCLUDE && nsrc0 == 0) {
 		if ((imf->imf_st[1] != MCAST_EXCLUDE) ||
 		    (imf->imf_st[1] == MCAST_EXCLUDE && nsrc1 > 0)) {
 			CTR1(KTR_IGMPV3, "%s: --asm on inm at t1", __func__);
 			--inm->inm_st[1].iss_asm;
 		}
 	}
 
 	/* Increment ASM listener count on transition to ASM mode. */
 	if (imf->imf_st[1] == MCAST_EXCLUDE && nsrc1 == 0) {
 		CTR1(KTR_IGMPV3, "%s: asm++ on inm at t1", __func__);
 		inm->inm_st[1].iss_asm++;
 	}
 
 	CTR3(KTR_IGMPV3, "%s: merged imf %p to inm %p", __func__, imf, inm);
 	inm_print(inm);
 
 out_reap:
 	if (schanged > 0) {
 		CTR1(KTR_IGMPV3, "%s: sources changed; reaping", __func__);
 		inm_reap(inm);
 	}
 	return (error);
 }
 
 /*
  * Mark an in_multi's filter set deltas as committed.
  * Called by IGMP after a state change has been enqueued.
  */
 void
 inm_commit(struct in_multi *inm)
 {
 	struct ip_msource	*ims;
 
 	CTR2(KTR_IGMPV3, "%s: commit inm %p", __func__, inm);
 	CTR1(KTR_IGMPV3, "%s: pre commit:", __func__);
 	inm_print(inm);
 
 	RB_FOREACH(ims, ip_msource_tree, &inm->inm_srcs) {
 		ims->ims_st[0] = ims->ims_st[1];
 	}
 	inm->inm_st[0] = inm->inm_st[1];
 }
 
 /*
  * Reap unreferenced nodes from an in_multi's filter set.
  */
 static void
 inm_reap(struct in_multi *inm)
 {
 	struct ip_msource	*ims, *tims;
 
 	RB_FOREACH_SAFE(ims, ip_msource_tree, &inm->inm_srcs, tims) {
 		if (ims->ims_st[0].ex > 0 || ims->ims_st[0].in > 0 ||
 		    ims->ims_st[1].ex > 0 || ims->ims_st[1].in > 0 ||
 		    ims->ims_stp != 0)
 			continue;
 		CTR2(KTR_IGMPV3, "%s: free ims %p", __func__, ims);
 		RB_REMOVE(ip_msource_tree, &inm->inm_srcs, ims);
 		free(ims, M_IPMSOURCE);
 		inm->inm_nsrc--;
 	}
 }
 
 /*
  * Purge all source nodes from an in_multi's filter set.
  */
 static void
 inm_purge(struct in_multi *inm)
 {
 	struct ip_msource	*ims, *tims;
 
 	RB_FOREACH_SAFE(ims, ip_msource_tree, &inm->inm_srcs, tims) {
 		CTR2(KTR_IGMPV3, "%s: free ims %p", __func__, ims);
 		RB_REMOVE(ip_msource_tree, &inm->inm_srcs, ims);
 		free(ims, M_IPMSOURCE);
 		inm->inm_nsrc--;
 	}
 }
 
 /*
  * Join a multicast group; unlocked entry point.
  *
  * SMPng: XXX: in_joingroup() is called from in_control() when Giant
  * is not held. Fortunately, ifp is unlikely to have been detached
  * at this point, so we assume it's OK to recurse.
  */
 int
 in_joingroup(struct ifnet *ifp, const struct in_addr *gina,
     /*const*/ struct in_mfilter *imf, struct in_multi **pinm)
 {
 	int error;
 
 	IN_MULTI_LOCK();
 	error = in_joingroup_locked(ifp, gina, imf, pinm);
 	IN_MULTI_UNLOCK();
 
 	return (error);
 }
 
 /*
  * Join a multicast group; real entry point.
  *
  * Only preserves atomicity at inm level.
  * NOTE: imf argument cannot be const due to sys/tree.h limitations.
  *
  * If the IGMP downcall fails, the group is not joined, and an error
  * code is returned.
  */
 int
 in_joingroup_locked(struct ifnet *ifp, const struct in_addr *gina,
     /*const*/ struct in_mfilter *imf, struct in_multi **pinm)
 {
 	struct in_mfilter	 timf;
 	struct in_multi		*inm;
 	int			 error;
 
 	IN_MULTI_LOCK_ASSERT();
 	IN_MULTI_LIST_UNLOCK_ASSERT();
 
 	CTR4(KTR_IGMPV3, "%s: join 0x%08x on %p(%s))", __func__,
 	    ntohl(gina->s_addr), ifp, ifp->if_xname);
 
 	error = 0;
 	inm = NULL;
 
 	/*
 	 * If no imf was specified (i.e. kernel consumer),
 	 * fake one up and assume it is an ASM join.
 	 */
 	if (imf == NULL) {
 		imf_init(&timf, MCAST_UNDEFINED, MCAST_EXCLUDE);
 		imf = &timf;
 	}
 
 	error = in_getmulti(ifp, gina, &inm);
 	if (error) {
 		CTR1(KTR_IGMPV3, "%s: in_getmulti() failure", __func__);
 		return (error);
 	}
 	IN_MULTI_LIST_LOCK();
 	CTR1(KTR_IGMPV3, "%s: merge inm state", __func__);
 	error = inm_merge(inm, imf);
 	if (error) {
 		CTR1(KTR_IGMPV3, "%s: failed to merge inm state", __func__);
 		goto out_inm_release;
 	}
 
 	CTR1(KTR_IGMPV3, "%s: doing igmp downcall", __func__);
 	error = igmp_change_state(inm);
 	if (error) {
 		CTR1(KTR_IGMPV3, "%s: failed to update source", __func__);
 		goto out_inm_release;
 	}
 
  out_inm_release:
 	if (error) {
 		CTR2(KTR_IGMPV3, "%s: dropping ref on %p", __func__, inm);
 		IF_ADDR_WLOCK(ifp);
 		inm_release_deferred(inm);
 		IF_ADDR_WUNLOCK(ifp);
 	} else {
 		*pinm = inm;
 	}
 	IN_MULTI_LIST_UNLOCK();
 
 	return (error);
 }
 
 /*
  * Leave a multicast group; unlocked entry point.
  */
 int
 in_leavegroup(struct in_multi *inm, /*const*/ struct in_mfilter *imf)
 {
 	int error;
 
 	IN_MULTI_LOCK();
 	error = in_leavegroup_locked(inm, imf);
 	IN_MULTI_UNLOCK();
 
 	return (error);
 }
 
 /*
  * Leave a multicast group; real entry point.
  * All source filters will be expunged.
  *
  * Only preserves atomicity at inm level.
  *
  * Holding the write lock for the INP which contains imf
  * is highly advisable. We can't assert for it as imf does not
  * contain a back-pointer to the owning inp.
  *
  * Note: This is not the same as inm_release(*) as this function also
  * makes a state change downcall into IGMP.
  */
 int
 in_leavegroup_locked(struct in_multi *inm, /*const*/ struct in_mfilter *imf)
 {
 	struct in_mfilter	 timf;
 	int			 error;
 
 	IN_MULTI_LOCK_ASSERT();
 	IN_MULTI_LIST_UNLOCK_ASSERT();
 
 	error = 0;
 
 	CTR5(KTR_IGMPV3, "%s: leave inm %p, 0x%08x/%s, imf %p", __func__,
 	    inm, ntohl(inm->inm_addr.s_addr),
 	    (inm_is_ifp_detached(inm) ? "null" : inm->inm_ifp->if_xname),
 	    imf);
 
 	/*
 	 * If no imf was specified (i.e. kernel consumer),
 	 * fake one up and assume it is an ASM join.
 	 */
 	if (imf == NULL) {
 		imf_init(&timf, MCAST_EXCLUDE, MCAST_UNDEFINED);
 		imf = &timf;
 	}
 
 	/*
 	 * Begin state merge transaction at IGMP layer.
 	 *
 	 * As this particular invocation should not cause any memory
 	 * to be allocated, and there is no opportunity to roll back
 	 * the transaction, it MUST NOT fail.
 	 */
 	CTR1(KTR_IGMPV3, "%s: merge inm state", __func__);
 	IN_MULTI_LIST_LOCK();
 	error = inm_merge(inm, imf);
 	KASSERT(error == 0, ("%s: failed to merge inm state", __func__));
 
 	CTR1(KTR_IGMPV3, "%s: doing igmp downcall", __func__);
 	CURVNET_SET(inm->inm_ifp->if_vnet);
 	error = igmp_change_state(inm);
 	IF_ADDR_WLOCK(inm->inm_ifp);
 	inm_release_deferred(inm);
 	IF_ADDR_WUNLOCK(inm->inm_ifp);
 	IN_MULTI_LIST_UNLOCK();
 	CURVNET_RESTORE();
 	if (error)
 		CTR1(KTR_IGMPV3, "%s: failed igmp downcall", __func__);
 
 	CTR2(KTR_IGMPV3, "%s: dropping ref on %p", __func__, inm);
 
 	return (error);
 }
 
 /*#ifndef BURN_BRIDGES*/
 
 /*
  * Block or unblock an ASM multicast source on an inpcb.
  * This implements the delta-based API described in RFC 3678.
  *
  * The delta-based API applies only to exclusive-mode memberships.
  * An IGMP downcall will be performed.
  *
  * SMPng: NOTE: Must take Giant as a join may create a new ifma.
  *
  * Return 0 if successful, otherwise return an appropriate error code.
  */
 static int
 inp_block_unblock_source(struct inpcb *inp, struct sockopt *sopt)
 {
 	struct epoch_tracker		 et;
 	struct group_source_req		 gsr;
 	sockunion_t			*gsa, *ssa;
 	struct ifnet			*ifp;
 	struct in_mfilter		*imf;
 	struct ip_moptions		*imo;
 	struct in_msource		*ims;
 	struct in_multi			*inm;
 	uint16_t			 fmode;
 	int				 error, doblock;
 
 	ifp = NULL;
 	error = 0;
 	doblock = 0;
 
 	memset(&gsr, 0, sizeof(struct group_source_req));
 	gsa = (sockunion_t *)&gsr.gsr_group;
 	ssa = (sockunion_t *)&gsr.gsr_source;
 
 	switch (sopt->sopt_name) {
 	case IP_BLOCK_SOURCE:
 	case IP_UNBLOCK_SOURCE: {
 		struct ip_mreq_source	 mreqs;
 
 		error = sooptcopyin(sopt, &mreqs,
 		    sizeof(struct ip_mreq_source),
 		    sizeof(struct ip_mreq_source));
 		if (error)
 			return (error);
 
 		gsa->sin.sin_family = AF_INET;
 		gsa->sin.sin_len = sizeof(struct sockaddr_in);
 		gsa->sin.sin_addr = mreqs.imr_multiaddr;
 
 		ssa->sin.sin_family = AF_INET;
 		ssa->sin.sin_len = sizeof(struct sockaddr_in);
 		ssa->sin.sin_addr = mreqs.imr_sourceaddr;
 
 		if (!in_nullhost(mreqs.imr_interface)) {
 			NET_EPOCH_ENTER(et);
 			INADDR_TO_IFP(mreqs.imr_interface, ifp);
 			/* XXXGL: ifref? */
 			NET_EPOCH_EXIT(et);
 		}
 		if (sopt->sopt_name == IP_BLOCK_SOURCE)
 			doblock = 1;
 
 		CTR3(KTR_IGMPV3, "%s: imr_interface = 0x%08x, ifp = %p",
 		    __func__, ntohl(mreqs.imr_interface.s_addr), ifp);
 		break;
 	    }
 
 	case MCAST_BLOCK_SOURCE:
 	case MCAST_UNBLOCK_SOURCE:
 		error = sooptcopyin(sopt, &gsr,
 		    sizeof(struct group_source_req),
 		    sizeof(struct group_source_req));
 		if (error)
 			return (error);
 
 		if (gsa->sin.sin_family != AF_INET ||
 		    gsa->sin.sin_len != sizeof(struct sockaddr_in))
 			return (EINVAL);
 
 		if (ssa->sin.sin_family != AF_INET ||
 		    ssa->sin.sin_len != sizeof(struct sockaddr_in))
 			return (EINVAL);
 
 		NET_EPOCH_ENTER(et);
 		ifp = ifnet_byindex(gsr.gsr_interface);
 		NET_EPOCH_EXIT(et);
 		if (ifp == NULL)
 			return (EADDRNOTAVAIL);
 
 		if (sopt->sopt_name == MCAST_BLOCK_SOURCE)
 			doblock = 1;
 		break;
 
 	default:
 		CTR2(KTR_IGMPV3, "%s: unknown sopt_name %d",
 		    __func__, sopt->sopt_name);
 		return (EOPNOTSUPP);
 		break;
 	}
 
 	if (!IN_MULTICAST(ntohl(gsa->sin.sin_addr.s_addr)))
 		return (EINVAL);
 
 	IN_MULTI_LOCK();
 
 	/*
 	 * Check if we are actually a member of this group.
 	 */
 	imo = inp_findmoptions(inp);
 	imf = imo_match_group(imo, ifp, &gsa->sa);
 	if (imf == NULL) {
 		error = EADDRNOTAVAIL;
 		goto out_inp_locked;
 	}
 	inm = imf->imf_inm;
 
 	/*
 	 * Attempting to use the delta-based API on an
 	 * non exclusive-mode membership is an error.
 	 */
 	fmode = imf->imf_st[0];
 	if (fmode != MCAST_EXCLUDE) {
 		error = EINVAL;
 		goto out_inp_locked;
 	}
 
 	/*
 	 * Deal with error cases up-front:
 	 *  Asked to block, but already blocked; or
 	 *  Asked to unblock, but nothing to unblock.
 	 * If adding a new block entry, allocate it.
 	 */
 	ims = imo_match_source(imf, &ssa->sa);
 	if ((ims != NULL && doblock) || (ims == NULL && !doblock)) {
 		CTR3(KTR_IGMPV3, "%s: source 0x%08x %spresent", __func__,
 		    ntohl(ssa->sin.sin_addr.s_addr), doblock ? "" : "not ");
 		error = EADDRNOTAVAIL;
 		goto out_inp_locked;
 	}
 
 	INP_WLOCK_ASSERT(inp);
 
 	/*
 	 * Begin state merge transaction at socket layer.
 	 */
 	if (doblock) {
 		CTR2(KTR_IGMPV3, "%s: %s source", __func__, "block");
 		ims = imf_graft(imf, fmode, &ssa->sin);
 		if (ims == NULL)
 			error = ENOMEM;
 	} else {
 		CTR2(KTR_IGMPV3, "%s: %s source", __func__, "allow");
 		error = imf_prune(imf, &ssa->sin);
 	}
 
 	if (error) {
 		CTR1(KTR_IGMPV3, "%s: merge imf state failed", __func__);
 		goto out_imf_rollback;
 	}
 
 	/*
 	 * Begin state merge transaction at IGMP layer.
 	 */
 	CTR1(KTR_IGMPV3, "%s: merge inm state", __func__);
 	IN_MULTI_LIST_LOCK();
 	error = inm_merge(inm, imf);
 	if (error) {
 		CTR1(KTR_IGMPV3, "%s: failed to merge inm state", __func__);
 		IN_MULTI_LIST_UNLOCK();
 		goto out_imf_rollback;
 	}
 
 	CTR1(KTR_IGMPV3, "%s: doing igmp downcall", __func__);
 	error = igmp_change_state(inm);
 	IN_MULTI_LIST_UNLOCK();
 	if (error)
 		CTR1(KTR_IGMPV3, "%s: failed igmp downcall", __func__);
 
 out_imf_rollback:
 	if (error)
 		imf_rollback(imf);
 	else
 		imf_commit(imf);
 
 	imf_reap(imf);
 
 out_inp_locked:
 	INP_WUNLOCK(inp);
 	IN_MULTI_UNLOCK();
 	return (error);
 }
 
 /*
  * Given an inpcb, return its multicast options structure pointer.  Accepts
  * an unlocked inpcb pointer, but will return it locked.  May sleep.
  *
  * SMPng: NOTE: Potentially calls malloc(M_WAITOK) with Giant held.
  * SMPng: NOTE: Returns with the INP write lock held.
  */
 static struct ip_moptions *
 inp_findmoptions(struct inpcb *inp)
 {
 	struct ip_moptions	 *imo;
 
 	INP_WLOCK(inp);
 	if (inp->inp_moptions != NULL)
 		return (inp->inp_moptions);
 
 	INP_WUNLOCK(inp);
 
 	imo = malloc(sizeof(*imo), M_IPMOPTS, M_WAITOK);
 
 	imo->imo_multicast_ifp = NULL;
 	imo->imo_multicast_addr.s_addr = INADDR_ANY;
 	imo->imo_multicast_vif = -1;
 	imo->imo_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;
 	imo->imo_multicast_loop = in_mcast_loop;
 	STAILQ_INIT(&imo->imo_head);
 
 	INP_WLOCK(inp);
 	if (inp->inp_moptions != NULL) {
 		free(imo, M_IPMOPTS);
 		return (inp->inp_moptions);
 	}
 	inp->inp_moptions = imo;
 	return (imo);
 }
 
 void
 inp_freemoptions(struct ip_moptions *imo)
 {
 	struct in_mfilter *imf;
 	struct in_multi *inm;
 	struct ifnet *ifp;
 
 	if (imo == NULL)
 		return;
 
 	while ((imf = ip_mfilter_first(&imo->imo_head)) != NULL) {
 		ip_mfilter_remove(&imo->imo_head, imf);
 
 		imf_leave(imf);
 		if ((inm = imf->imf_inm) != NULL) {
 			if ((ifp = inm->inm_ifp) != NULL) {
 				CURVNET_SET(ifp->if_vnet);
 				(void)in_leavegroup(inm, imf);
 				CURVNET_RESTORE();
 			} else {
 				(void)in_leavegroup(inm, imf);
 			}
 		}
 		ip_mfilter_free(imf);
 	}
 	free(imo, M_IPMOPTS);
 }
 
 /*
  * Atomically get source filters on a socket for an IPv4 multicast group.
  * Called with INP lock held; returns with lock released.
  */
 static int
 inp_get_source_filters(struct inpcb *inp, struct sockopt *sopt)
 {
 	struct epoch_tracker	 et;
 	struct __msfilterreq	 msfr;
 	sockunion_t		*gsa;
 	struct ifnet		*ifp;
 	struct ip_moptions	*imo;
 	struct in_mfilter	*imf;
 	struct ip_msource	*ims;
 	struct in_msource	*lims;
 	struct sockaddr_in	*psin;
 	struct sockaddr_storage	*ptss;
 	struct sockaddr_storage	*tss;
 	int			 error;
 	size_t			 nsrcs, ncsrcs;
 
 	INP_WLOCK_ASSERT(inp);
 
 	imo = inp->inp_moptions;
 	KASSERT(imo != NULL, ("%s: null ip_moptions", __func__));
 
 	INP_WUNLOCK(inp);
 
 	error = sooptcopyin(sopt, &msfr, sizeof(struct __msfilterreq),
 	    sizeof(struct __msfilterreq));
 	if (error)
 		return (error);
 
 	NET_EPOCH_ENTER(et);
 	ifp = ifnet_byindex(msfr.msfr_ifindex);
 	NET_EPOCH_EXIT(et);	/* XXXGL: unsafe ifnet pointer left */
 	if (ifp == NULL)
 		return (EINVAL);
 
 	INP_WLOCK(inp);
 
 	/*
 	 * Lookup group on the socket.
 	 */
 	gsa = (sockunion_t *)&msfr.msfr_group;
 	imf = imo_match_group(imo, ifp, &gsa->sa);
 	if (imf == NULL) {
 		INP_WUNLOCK(inp);
 		return (EADDRNOTAVAIL);
 	}
 
 	/*
 	 * Ignore memberships which are in limbo.
 	 */
 	if (imf->imf_st[1] == MCAST_UNDEFINED) {
 		INP_WUNLOCK(inp);
 		return (EAGAIN);
 	}
 	msfr.msfr_fmode = imf->imf_st[1];
 
 	/*
 	 * If the user specified a buffer, copy out the source filter
 	 * entries to userland gracefully.
 	 * We only copy out the number of entries which userland
 	 * has asked for, but we always tell userland how big the
 	 * buffer really needs to be.
 	 */
 	if (msfr.msfr_nsrcs > in_mcast_maxsocksrc)
 		msfr.msfr_nsrcs = in_mcast_maxsocksrc;
 	tss = NULL;
 	if (msfr.msfr_srcs != NULL && msfr.msfr_nsrcs > 0) {
 		tss = malloc(sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs,
 		    M_TEMP, M_NOWAIT | M_ZERO);
 		if (tss == NULL) {
 			INP_WUNLOCK(inp);
 			return (ENOBUFS);
 		}
 	}
 
 	/*
 	 * Count number of sources in-mode at t0.
 	 * If buffer space exists and remains, copy out source entries.
 	 */
 	nsrcs = msfr.msfr_nsrcs;
 	ncsrcs = 0;
 	ptss = tss;
 	RB_FOREACH(ims, ip_msource_tree, &imf->imf_sources) {
 		lims = (struct in_msource *)ims;
 		if (lims->imsl_st[0] == MCAST_UNDEFINED ||
 		    lims->imsl_st[0] != imf->imf_st[0])
 			continue;
 		++ncsrcs;
 		if (tss != NULL && nsrcs > 0) {
 			psin = (struct sockaddr_in *)ptss;
 			psin->sin_family = AF_INET;
 			psin->sin_len = sizeof(struct sockaddr_in);
 			psin->sin_addr.s_addr = htonl(lims->ims_haddr);
 			psin->sin_port = 0;
 			++ptss;
 			--nsrcs;
 		}
 	}
 
 	INP_WUNLOCK(inp);
 
 	if (tss != NULL) {
 		error = copyout(tss, msfr.msfr_srcs,
 		    sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs);
 		free(tss, M_TEMP);
 		if (error)
 			return (error);
 	}
 
 	msfr.msfr_nsrcs = ncsrcs;
 	error = sooptcopyout(sopt, &msfr, sizeof(struct __msfilterreq));
 
 	return (error);
 }
 
 /*
  * Return the IP multicast options in response to user getsockopt().
  */
 int
 inp_getmoptions(struct inpcb *inp, struct sockopt *sopt)
 {
 	struct ip_mreqn		 mreqn;
 	struct ip_moptions	*imo;
 	struct ifnet		*ifp;
 	struct in_ifaddr	*ia;
 	int			 error, optval;
 	u_char			 coptval;
 
 	INP_WLOCK(inp);
 	imo = inp->inp_moptions;
-	/*
-	 * If socket is neither of type SOCK_RAW or SOCK_DGRAM,
-	 * or is a divert socket, reject it.
-	 */
-	if (inp->inp_socket->so_proto->pr_protocol == IPPROTO_DIVERT ||
-	    (inp->inp_socket->so_proto->pr_type != SOCK_RAW &&
-	    inp->inp_socket->so_proto->pr_type != SOCK_DGRAM)) {
+	/* If socket is neither of type SOCK_RAW or SOCK_DGRAM reject it. */
+	if (inp->inp_socket->so_proto->pr_type != SOCK_RAW &&
+	    inp->inp_socket->so_proto->pr_type != SOCK_DGRAM) {
 		INP_WUNLOCK(inp);
 		return (EOPNOTSUPP);
 	}
 
 	error = 0;
 	switch (sopt->sopt_name) {
 	case IP_MULTICAST_VIF:
 		if (imo != NULL)
 			optval = imo->imo_multicast_vif;
 		else
 			optval = -1;
 		INP_WUNLOCK(inp);
 		error = sooptcopyout(sopt, &optval, sizeof(int));
 		break;
 
 	case IP_MULTICAST_IF:
 		memset(&mreqn, 0, sizeof(struct ip_mreqn));
 		if (imo != NULL) {
 			ifp = imo->imo_multicast_ifp;
 			if (!in_nullhost(imo->imo_multicast_addr)) {
 				mreqn.imr_address = imo->imo_multicast_addr;
 			} else if (ifp != NULL) {
 				struct epoch_tracker et;
 
 				mreqn.imr_ifindex = ifp->if_index;
 				NET_EPOCH_ENTER(et);
 				IFP_TO_IA(ifp, ia);
 				if (ia != NULL)
 					mreqn.imr_address =
 					    IA_SIN(ia)->sin_addr;
 				NET_EPOCH_EXIT(et);
 			}
 		}
 		INP_WUNLOCK(inp);
 		if (sopt->sopt_valsize == sizeof(struct ip_mreqn)) {
 			error = sooptcopyout(sopt, &mreqn,
 			    sizeof(struct ip_mreqn));
 		} else {
 			error = sooptcopyout(sopt, &mreqn.imr_address,
 			    sizeof(struct in_addr));
 		}
 		break;
 
 	case IP_MULTICAST_TTL:
 		if (imo == NULL)
 			optval = coptval = IP_DEFAULT_MULTICAST_TTL;
 		else
 			optval = coptval = imo->imo_multicast_ttl;
 		INP_WUNLOCK(inp);
 		if (sopt->sopt_valsize == sizeof(u_char))
 			error = sooptcopyout(sopt, &coptval, sizeof(u_char));
 		else
 			error = sooptcopyout(sopt, &optval, sizeof(int));
 		break;
 
 	case IP_MULTICAST_LOOP:
 		if (imo == NULL)
 			optval = coptval = IP_DEFAULT_MULTICAST_LOOP;
 		else
 			optval = coptval = imo->imo_multicast_loop;
 		INP_WUNLOCK(inp);
 		if (sopt->sopt_valsize == sizeof(u_char))
 			error = sooptcopyout(sopt, &coptval, sizeof(u_char));
 		else
 			error = sooptcopyout(sopt, &optval, sizeof(int));
 		break;
 
 	case IP_MSFILTER:
 		if (imo == NULL) {
 			error = EADDRNOTAVAIL;
 			INP_WUNLOCK(inp);
 		} else {
 			error = inp_get_source_filters(inp, sopt);
 		}
 		break;
 
 	default:
 		INP_WUNLOCK(inp);
 		error = ENOPROTOOPT;
 		break;
 	}
 
 	INP_UNLOCK_ASSERT(inp);
 
 	return (error);
 }
 
 /*
  * Look up the ifnet to use for a multicast group membership,
  * given the IPv4 address of an interface, and the IPv4 group address.
  *
  * This routine exists to support legacy multicast applications
  * which do not understand that multicast memberships are scoped to
  * specific physical links in the networking stack, or which need
  * to join link-scope groups before IPv4 addresses are configured.
  *
  * Use this socket's current FIB number for any required FIB lookup.
  * If ina is INADDR_ANY, look up the group address in the unicast FIB,
  * and use its ifp; usually, this points to the default next-hop.
  *
  * If the FIB lookup fails, attempt to use the first non-loopback
  * interface with multicast capability in the system as a
  * last resort. The legacy IPv4 ASM API requires that we do
  * this in order to allow groups to be joined when the routing
  * table has not yet been populated during boot.
  *
  * Returns NULL if no ifp could be found, otherwise return referenced ifp.
  *
  * FUTURE: Implement IPv4 source-address selection.
  */
 static struct ifnet *
 inp_lookup_mcast_ifp(const struct inpcb *inp,
     const struct sockaddr_in *gsin, const struct in_addr ina)
 {
 	struct ifnet *ifp;
 	struct nhop_object *nh;
 
 	NET_EPOCH_ASSERT();
 	KASSERT(inp != NULL, ("%s: inp must not be NULL", __func__));
 	KASSERT(gsin->sin_family == AF_INET, ("%s: not AF_INET", __func__));
 	KASSERT(IN_MULTICAST(ntohl(gsin->sin_addr.s_addr)),
 	    ("%s: not multicast", __func__));
 
 	ifp = NULL;
 	if (!in_nullhost(ina)) {
 		INADDR_TO_IFP(ina, ifp);
 		if (ifp != NULL)
 			if_ref(ifp);
 	} else {
 		nh = fib4_lookup(inp->inp_inc.inc_fibnum, gsin->sin_addr, 0, NHR_NONE, 0);
 		if (nh != NULL) {
 			ifp = nh->nh_ifp;
 			if_ref(ifp);
 		} else {
 			struct in_ifaddr *ia;
 			struct ifnet *mifp;
 
 			mifp = NULL;
 			CK_STAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) {
 				mifp = ia->ia_ifp;
 				if (!(mifp->if_flags & IFF_LOOPBACK) &&
 				     (mifp->if_flags & IFF_MULTICAST)) {
 					ifp = mifp;
 					if_ref(ifp);
 					break;
 				}
 			}
 		}
 	}
 
 	return (ifp);
 }
 
 /*
  * Join an IPv4 multicast group, possibly with a source.
  */
 static int
 inp_join_group(struct inpcb *inp, struct sockopt *sopt)
 {
 	struct group_source_req		 gsr;
 	sockunion_t			*gsa, *ssa;
 	struct ifnet			*ifp;
 	struct in_mfilter		*imf;
 	struct ip_moptions		*imo;
 	struct in_multi			*inm;
 	struct in_msource		*lims;
 	struct epoch_tracker		 et;
 	int				 error, is_new;
 
 	ifp = NULL;
 	lims = NULL;
 	error = 0;
 
 	memset(&gsr, 0, sizeof(struct group_source_req));
 	gsa = (sockunion_t *)&gsr.gsr_group;
 	gsa->ss.ss_family = AF_UNSPEC;
 	ssa = (sockunion_t *)&gsr.gsr_source;
 	ssa->ss.ss_family = AF_UNSPEC;
 
 	switch (sopt->sopt_name) {
 	case IP_ADD_MEMBERSHIP: {
 		struct ip_mreqn mreqn;
 
 		if (sopt->sopt_valsize == sizeof(struct ip_mreqn))
 			error = sooptcopyin(sopt, &mreqn,
 			    sizeof(struct ip_mreqn), sizeof(struct ip_mreqn));
 		else
 			error = sooptcopyin(sopt, &mreqn,
 			    sizeof(struct ip_mreq), sizeof(struct ip_mreq));
 		if (error)
 			return (error);
 
 		gsa->sin.sin_family = AF_INET;
 		gsa->sin.sin_len = sizeof(struct sockaddr_in);
 		gsa->sin.sin_addr = mreqn.imr_multiaddr;
 		if (!IN_MULTICAST(ntohl(gsa->sin.sin_addr.s_addr)))
 			return (EINVAL);
 
 		NET_EPOCH_ENTER(et);
 		if (sopt->sopt_valsize == sizeof(struct ip_mreqn) &&
 		    mreqn.imr_ifindex != 0)
 			ifp = ifnet_byindex_ref(mreqn.imr_ifindex);
 		else
 			ifp = inp_lookup_mcast_ifp(inp, &gsa->sin,
 			    mreqn.imr_address);
 		NET_EPOCH_EXIT(et);
 		break;
 	}
 	case IP_ADD_SOURCE_MEMBERSHIP: {
 		struct ip_mreq_source	 mreqs;
 
 		error = sooptcopyin(sopt, &mreqs, sizeof(struct ip_mreq_source),
 			    sizeof(struct ip_mreq_source));
 		if (error)
 			return (error);
 
 		gsa->sin.sin_family = ssa->sin.sin_family = AF_INET;
 		gsa->sin.sin_len = ssa->sin.sin_len =
 		    sizeof(struct sockaddr_in);
 
 		gsa->sin.sin_addr = mreqs.imr_multiaddr;
 		if (!IN_MULTICAST(ntohl(gsa->sin.sin_addr.s_addr)))
 			return (EINVAL);
 
 		ssa->sin.sin_addr = mreqs.imr_sourceaddr;
 
 		NET_EPOCH_ENTER(et);
 		ifp = inp_lookup_mcast_ifp(inp, &gsa->sin,
 		    mreqs.imr_interface);
 		NET_EPOCH_EXIT(et);
 		CTR3(KTR_IGMPV3, "%s: imr_interface = 0x%08x, ifp = %p",
 		    __func__, ntohl(mreqs.imr_interface.s_addr), ifp);
 		break;
 	}
 
 	case MCAST_JOIN_GROUP:
 	case MCAST_JOIN_SOURCE_GROUP:
 		if (sopt->sopt_name == MCAST_JOIN_GROUP) {
 			error = sooptcopyin(sopt, &gsr,
 			    sizeof(struct group_req),
 			    sizeof(struct group_req));
 		} else if (sopt->sopt_name == MCAST_JOIN_SOURCE_GROUP) {
 			error = sooptcopyin(sopt, &gsr,
 			    sizeof(struct group_source_req),
 			    sizeof(struct group_source_req));
 		}
 		if (error)
 			return (error);
 
 		if (gsa->sin.sin_family != AF_INET ||
 		    gsa->sin.sin_len != sizeof(struct sockaddr_in))
 			return (EINVAL);
 
 		/*
 		 * Overwrite the port field if present, as the sockaddr
 		 * being copied in may be matched with a binary comparison.
 		 */
 		gsa->sin.sin_port = 0;
 		if (sopt->sopt_name == MCAST_JOIN_SOURCE_GROUP) {
 			if (ssa->sin.sin_family != AF_INET ||
 			    ssa->sin.sin_len != sizeof(struct sockaddr_in))
 				return (EINVAL);
 			ssa->sin.sin_port = 0;
 		}
 
 		if (!IN_MULTICAST(ntohl(gsa->sin.sin_addr.s_addr)))
 			return (EINVAL);
 
 		NET_EPOCH_ENTER(et);
 		ifp = ifnet_byindex_ref(gsr.gsr_interface);
 		NET_EPOCH_EXIT(et);
 		if (ifp == NULL)
 			return (EADDRNOTAVAIL);
 		break;
 
 	default:
 		CTR2(KTR_IGMPV3, "%s: unknown sopt_name %d",
 		    __func__, sopt->sopt_name);
 		return (EOPNOTSUPP);
 		break;
 	}
 
 	if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) {
 		if (ifp != NULL)
 			if_rele(ifp);
 		return (EADDRNOTAVAIL);
 	}
 
 	IN_MULTI_LOCK();
 
 	/*
 	 * Find the membership in the membership list.
 	 */
 	imo = inp_findmoptions(inp);
 	imf = imo_match_group(imo, ifp, &gsa->sa);
 	if (imf == NULL) {
 		is_new = 1;
 		inm = NULL;
 
 		if (ip_mfilter_count(&imo->imo_head) >= IP_MAX_MEMBERSHIPS) {
 			error = ENOMEM;
 			goto out_inp_locked;
 		}
 	} else {
 		is_new = 0;
 		inm = imf->imf_inm;
 
 		if (ssa->ss.ss_family != AF_UNSPEC) {
 			/*
 			 * MCAST_JOIN_SOURCE_GROUP on an exclusive membership
 			 * is an error. On an existing inclusive membership,
 			 * it just adds the source to the filter list.
 			 */
 			if (imf->imf_st[1] != MCAST_INCLUDE) {
 				error = EINVAL;
 				goto out_inp_locked;
 			}
 			/*
 			 * Throw out duplicates.
 			 *
 			 * XXX FIXME: This makes a naive assumption that
 			 * even if entries exist for *ssa in this imf,
 			 * they will be rejected as dupes, even if they
 			 * are not valid in the current mode (in-mode).
 			 *
 			 * in_msource is transactioned just as for anything
 			 * else in SSM -- but note naive use of inm_graft()
 			 * below for allocating new filter entries.
 			 *
 			 * This is only an issue if someone mixes the
 			 * full-state SSM API with the delta-based API,
 			 * which is discouraged in the relevant RFCs.
 			 */
 			lims = imo_match_source(imf, &ssa->sa);
 			if (lims != NULL /*&&
 			    lims->imsl_st[1] == MCAST_INCLUDE*/) {
 				error = EADDRNOTAVAIL;
 				goto out_inp_locked;
 			}
 		} else {
 			/*
 			 * MCAST_JOIN_GROUP on an existing exclusive
 			 * membership is an error; return EADDRINUSE
 			 * to preserve 4.4BSD API idempotence, and
 			 * avoid tedious detour to code below.
 			 * NOTE: This is bending RFC 3678 a bit.
 			 *
 			 * On an existing inclusive membership, this is also
 			 * an error; if you want to change filter mode,
 			 * you must use the userland API setsourcefilter().
 			 * XXX We don't reject this for imf in UNDEFINED
 			 * state at t1, because allocation of a filter
 			 * is atomic with allocation of a membership.
 			 */
 			error = EINVAL;
 			if (imf->imf_st[1] == MCAST_EXCLUDE)
 				error = EADDRINUSE;
 			goto out_inp_locked;
 		}
 	}
 
 	/*
 	 * Begin state merge transaction at socket layer.
 	 */
 	INP_WLOCK_ASSERT(inp);
 
 	/*
 	 * Graft new source into filter list for this inpcb's
 	 * membership of the group. The in_multi may not have
 	 * been allocated yet if this is a new membership, however,
 	 * the in_mfilter slot will be allocated and must be initialized.
 	 *
 	 * Note: Grafting of exclusive mode filters doesn't happen
 	 * in this path.
 	 * XXX: Should check for non-NULL lims (node exists but may
 	 * not be in-mode) for interop with full-state API.
 	 */
 	if (ssa->ss.ss_family != AF_UNSPEC) {
 		/* Membership starts in IN mode */
 		if (is_new) {
 			CTR1(KTR_IGMPV3, "%s: new join w/source", __func__);
 			imf = ip_mfilter_alloc(M_NOWAIT, MCAST_UNDEFINED, MCAST_INCLUDE);
 			if (imf == NULL) {
 				error = ENOMEM;
 				goto out_inp_locked;
 			}
 		} else {
 			CTR2(KTR_IGMPV3, "%s: %s source", __func__, "allow");
 		}
 		lims = imf_graft(imf, MCAST_INCLUDE, &ssa->sin);
 		if (lims == NULL) {
 			CTR1(KTR_IGMPV3, "%s: merge imf state failed",
 			    __func__);
 			error = ENOMEM;
 			goto out_inp_locked;
 		}
 	} else {
 		/* No address specified; Membership starts in EX mode */
 		if (is_new) {
 			CTR1(KTR_IGMPV3, "%s: new join w/o source", __func__);
 			imf = ip_mfilter_alloc(M_NOWAIT, MCAST_UNDEFINED, MCAST_EXCLUDE);
 			if (imf == NULL) {
 				error = ENOMEM;
 				goto out_inp_locked;
 			}
 		}
 	}
 
 	/*
 	 * Begin state merge transaction at IGMP layer.
 	 */
 	if (is_new) {
 		in_pcbref(inp);
 		INP_WUNLOCK(inp);
 
 		error = in_joingroup_locked(ifp, &gsa->sin.sin_addr, imf,
 		    &imf->imf_inm);
 
 		INP_WLOCK(inp);
 		if (in_pcbrele_wlocked(inp)) {
 			error = ENXIO;
 			goto out_inp_unlocked;
 		}
 		if (error) {
                         CTR1(KTR_IGMPV3, "%s: in_joingroup_locked failed",
                             __func__);
 			goto out_inp_locked;
 		}
 		/*
 		 * NOTE: Refcount from in_joingroup_locked()
 		 * is protecting membership.
 		 */
 		ip_mfilter_insert(&imo->imo_head, imf);
 	} else {
 		CTR1(KTR_IGMPV3, "%s: merge inm state", __func__);
 		IN_MULTI_LIST_LOCK();
 		error = inm_merge(inm, imf);
 		if (error) {
 			CTR1(KTR_IGMPV3, "%s: failed to merge inm state",
 				 __func__);
 			IN_MULTI_LIST_UNLOCK();
 			imf_rollback(imf);
 			imf_reap(imf);
 			goto out_inp_locked;
 		}
 		CTR1(KTR_IGMPV3, "%s: doing igmp downcall", __func__);
 		error = igmp_change_state(inm);
 		IN_MULTI_LIST_UNLOCK();
 		if (error) {
 			CTR1(KTR_IGMPV3, "%s: failed igmp downcall",
 			    __func__);
 			imf_rollback(imf);
 			imf_reap(imf);
 			goto out_inp_locked;
 		}
 	}
 
 	imf_commit(imf);
 	imf = NULL;
 
 out_inp_locked:
 	INP_WUNLOCK(inp);
 out_inp_unlocked:
 	IN_MULTI_UNLOCK();
 
 	if (is_new && imf) {
 		if (imf->imf_inm != NULL) {
 			IN_MULTI_LIST_LOCK();
 			IF_ADDR_WLOCK(ifp);
 			inm_release_deferred(imf->imf_inm);
 			IF_ADDR_WUNLOCK(ifp);
 			IN_MULTI_LIST_UNLOCK();
 		}
 		ip_mfilter_free(imf);
 	}
 	if_rele(ifp);
 	return (error);
 }
 
 /*
  * Leave an IPv4 multicast group on an inpcb, possibly with a source.
  */
 static int
 inp_leave_group(struct inpcb *inp, struct sockopt *sopt)
 {
 	struct epoch_tracker		 et;
 	struct group_source_req		 gsr;
 	struct ip_mreq_source		 mreqs;
 	sockunion_t			*gsa, *ssa;
 	struct ifnet			*ifp;
 	struct in_mfilter		*imf;
 	struct ip_moptions		*imo;
 	struct in_msource		*ims;
 	struct in_multi			*inm;
 	int				 error;
 	bool				 is_final;
 
 	ifp = NULL;
 	error = 0;
 	is_final = true;
 
 	memset(&gsr, 0, sizeof(struct group_source_req));
 	gsa = (sockunion_t *)&gsr.gsr_group;
 	gsa->ss.ss_family = AF_UNSPEC;
 	ssa = (sockunion_t *)&gsr.gsr_source;
 	ssa->ss.ss_family = AF_UNSPEC;
 
 	switch (sopt->sopt_name) {
 	case IP_DROP_MEMBERSHIP:
 	case IP_DROP_SOURCE_MEMBERSHIP:
 		if (sopt->sopt_name == IP_DROP_MEMBERSHIP) {
 			error = sooptcopyin(sopt, &mreqs,
 			    sizeof(struct ip_mreq),
 			    sizeof(struct ip_mreq));
 			/*
 			 * Swap interface and sourceaddr arguments,
 			 * as ip_mreq and ip_mreq_source are laid
 			 * out differently.
 			 */
 			mreqs.imr_interface = mreqs.imr_sourceaddr;
 			mreqs.imr_sourceaddr.s_addr = INADDR_ANY;
 		} else if (sopt->sopt_name == IP_DROP_SOURCE_MEMBERSHIP) {
 			error = sooptcopyin(sopt, &mreqs,
 			    sizeof(struct ip_mreq_source),
 			    sizeof(struct ip_mreq_source));
 		}
 		if (error)
 			return (error);
 
 		gsa->sin.sin_family = AF_INET;
 		gsa->sin.sin_len = sizeof(struct sockaddr_in);
 		gsa->sin.sin_addr = mreqs.imr_multiaddr;
 
 		if (sopt->sopt_name == IP_DROP_SOURCE_MEMBERSHIP) {
 			ssa->sin.sin_family = AF_INET;
 			ssa->sin.sin_len = sizeof(struct sockaddr_in);
 			ssa->sin.sin_addr = mreqs.imr_sourceaddr;
 		}
 
 		/*
 		 * Attempt to look up hinted ifp from interface address.
 		 * Fallthrough with null ifp iff lookup fails, to
 		 * preserve 4.4BSD mcast API idempotence.
 		 * XXX NOTE WELL: The RFC 3678 API is preferred because
 		 * using an IPv4 address as a key is racy.
 		 */
 		if (!in_nullhost(mreqs.imr_interface)) {
 			NET_EPOCH_ENTER(et);
 			INADDR_TO_IFP(mreqs.imr_interface, ifp);
 			/* XXXGL ifref? */
 			NET_EPOCH_EXIT(et);
 		}
 		CTR3(KTR_IGMPV3, "%s: imr_interface = 0x%08x, ifp = %p",
 		    __func__, ntohl(mreqs.imr_interface.s_addr), ifp);
 
 		break;
 
 	case MCAST_LEAVE_GROUP:
 	case MCAST_LEAVE_SOURCE_GROUP:
 		if (sopt->sopt_name == MCAST_LEAVE_GROUP) {
 			error = sooptcopyin(sopt, &gsr,
 			    sizeof(struct group_req),
 			    sizeof(struct group_req));
 		} else if (sopt->sopt_name == MCAST_LEAVE_SOURCE_GROUP) {
 			error = sooptcopyin(sopt, &gsr,
 			    sizeof(struct group_source_req),
 			    sizeof(struct group_source_req));
 		}
 		if (error)
 			return (error);
 
 		if (gsa->sin.sin_family != AF_INET ||
 		    gsa->sin.sin_len != sizeof(struct sockaddr_in))
 			return (EINVAL);
 
 		if (sopt->sopt_name == MCAST_LEAVE_SOURCE_GROUP) {
 			if (ssa->sin.sin_family != AF_INET ||
 			    ssa->sin.sin_len != sizeof(struct sockaddr_in))
 				return (EINVAL);
 		}
 
 		NET_EPOCH_ENTER(et);
 		ifp = ifnet_byindex(gsr.gsr_interface);
 		NET_EPOCH_EXIT(et);	/* XXXGL: unsafe ifp */
 		if (ifp == NULL)
 			return (EADDRNOTAVAIL);
 		break;
 
 	default:
 		CTR2(KTR_IGMPV3, "%s: unknown sopt_name %d",
 		    __func__, sopt->sopt_name);
 		return (EOPNOTSUPP);
 		break;
 	}
 
 	if (!IN_MULTICAST(ntohl(gsa->sin.sin_addr.s_addr)))
 		return (EINVAL);
 
 	IN_MULTI_LOCK();
 
 	/*
 	 * Find the membership in the membership list.
 	 */
 	imo = inp_findmoptions(inp);
 	imf = imo_match_group(imo, ifp, &gsa->sa);
 	if (imf == NULL) {
 		error = EADDRNOTAVAIL;
 		goto out_inp_locked;
 	}
 	inm = imf->imf_inm;
 
 	if (ssa->ss.ss_family != AF_UNSPEC)
 		is_final = false;
 
 	/*
 	 * Begin state merge transaction at socket layer.
 	 */
 	INP_WLOCK_ASSERT(inp);
 
 	/*
 	 * If we were instructed only to leave a given source, do so.
 	 * MCAST_LEAVE_SOURCE_GROUP is only valid for inclusive memberships.
 	 */
 	if (is_final) {
 		ip_mfilter_remove(&imo->imo_head, imf);
 		imf_leave(imf);
 
 		/*
 		 * Give up the multicast address record to which
 		 * the membership points.
 		 */
 		(void) in_leavegroup_locked(imf->imf_inm, imf);
 	} else {
 		if (imf->imf_st[0] == MCAST_EXCLUDE) {
 			error = EADDRNOTAVAIL;
 			goto out_inp_locked;
 		}
 		ims = imo_match_source(imf, &ssa->sa);
 		if (ims == NULL) {
 			CTR3(KTR_IGMPV3, "%s: source 0x%08x %spresent",
 			    __func__, ntohl(ssa->sin.sin_addr.s_addr), "not ");
 			error = EADDRNOTAVAIL;
 			goto out_inp_locked;
 		}
 		CTR2(KTR_IGMPV3, "%s: %s source", __func__, "block");
 		error = imf_prune(imf, &ssa->sin);
 		if (error) {
 			CTR1(KTR_IGMPV3, "%s: merge imf state failed",
 			    __func__);
 			goto out_inp_locked;
 		}
 	}
 
 	/*
 	 * Begin state merge transaction at IGMP layer.
 	 */
 	if (!is_final) {
 		CTR1(KTR_IGMPV3, "%s: merge inm state", __func__);
 		IN_MULTI_LIST_LOCK();
 		error = inm_merge(inm, imf);
 		if (error) {
 			CTR1(KTR_IGMPV3, "%s: failed to merge inm state",
 			    __func__);
 			IN_MULTI_LIST_UNLOCK();
 			imf_rollback(imf);
 			imf_reap(imf);
 			goto out_inp_locked;
 		}
 
 		CTR1(KTR_IGMPV3, "%s: doing igmp downcall", __func__);
 		error = igmp_change_state(inm);
 		IN_MULTI_LIST_UNLOCK();
 		if (error) {
 			CTR1(KTR_IGMPV3, "%s: failed igmp downcall",
 			    __func__);
 			imf_rollback(imf);
 			imf_reap(imf);
 			goto out_inp_locked;
 		}
 	}
 	imf_commit(imf);
 	imf_reap(imf);
 
 out_inp_locked:
 	INP_WUNLOCK(inp);
 
 	if (is_final && imf)
 		ip_mfilter_free(imf);
 
 	IN_MULTI_UNLOCK();
 	return (error);
 }
 
 /*
  * Select the interface for transmitting IPv4 multicast datagrams.
  *
  * Either an instance of struct in_addr or an instance of struct ip_mreqn
  * may be passed to this socket option. An address of INADDR_ANY or an
  * interface index of 0 is used to remove a previous selection.
  * When no interface is selected, one is chosen for every send.
  */
 static int
 inp_set_multicast_if(struct inpcb *inp, struct sockopt *sopt)
 {
 	struct in_addr		 addr;
 	struct ip_mreqn		 mreqn;
 	struct ifnet		*ifp;
 	struct ip_moptions	*imo;
 	int			 error;
 
 	if (sopt->sopt_valsize == sizeof(struct ip_mreqn)) {
 		/*
 		 * An interface index was specified using the
 		 * Linux-derived ip_mreqn structure.
 		 */
 		error = sooptcopyin(sopt, &mreqn, sizeof(struct ip_mreqn),
 		    sizeof(struct ip_mreqn));
 		if (error)
 			return (error);
 
 		if (mreqn.imr_ifindex < 0)
 			return (EINVAL);
 
 		if (mreqn.imr_ifindex == 0) {
 			ifp = NULL;
 		} else {
 			struct epoch_tracker et;
 
 			NET_EPOCH_ENTER(et);
 			ifp = ifnet_byindex(mreqn.imr_ifindex);
 			NET_EPOCH_EXIT(et);	/* XXXGL: unsafe ifp */
 			if (ifp == NULL)
 				return (EADDRNOTAVAIL);
 		}
 	} else {
 		/*
 		 * An interface was specified by IPv4 address.
 		 * This is the traditional BSD usage.
 		 */
 		error = sooptcopyin(sopt, &addr, sizeof(struct in_addr),
 		    sizeof(struct in_addr));
 		if (error)
 			return (error);
 		if (in_nullhost(addr)) {
 			ifp = NULL;
 		} else {
 			struct epoch_tracker et;
 
 			NET_EPOCH_ENTER(et);
 			INADDR_TO_IFP(addr, ifp);
 			/* XXXGL ifref? */
 			NET_EPOCH_EXIT(et);
 			if (ifp == NULL)
 				return (EADDRNOTAVAIL);
 		}
 		CTR3(KTR_IGMPV3, "%s: ifp = %p, addr = 0x%08x", __func__, ifp,
 		    ntohl(addr.s_addr));
 	}
 
 	/* Reject interfaces which do not support multicast. */
 	if (ifp != NULL && (ifp->if_flags & IFF_MULTICAST) == 0)
 		return (EOPNOTSUPP);
 
 	imo = inp_findmoptions(inp);
 	imo->imo_multicast_ifp = ifp;
 	imo->imo_multicast_addr.s_addr = INADDR_ANY;
 	INP_WUNLOCK(inp);
 
 	return (0);
 }
 
 /*
  * Atomically set source filters on a socket for an IPv4 multicast group.
  *
  * SMPng: NOTE: Potentially calls malloc(M_WAITOK) with Giant held.
  */
 static int
 inp_set_source_filters(struct inpcb *inp, struct sockopt *sopt)
 {
 	struct epoch_tracker	 et;
 	struct __msfilterreq	 msfr;
 	sockunion_t		*gsa;
 	struct ifnet		*ifp;
 	struct in_mfilter	*imf;
 	struct ip_moptions	*imo;
 	struct in_multi		*inm;
 	int			 error;
 
 	error = sooptcopyin(sopt, &msfr, sizeof(struct __msfilterreq),
 	    sizeof(struct __msfilterreq));
 	if (error)
 		return (error);
 
 	if (msfr.msfr_nsrcs > in_mcast_maxsocksrc)
 		return (ENOBUFS);
 
 	if ((msfr.msfr_fmode != MCAST_EXCLUDE &&
 	     msfr.msfr_fmode != MCAST_INCLUDE))
 		return (EINVAL);
 
 	if (msfr.msfr_group.ss_family != AF_INET ||
 	    msfr.msfr_group.ss_len != sizeof(struct sockaddr_in))
 		return (EINVAL);
 
 	gsa = (sockunion_t *)&msfr.msfr_group;
 	if (!IN_MULTICAST(ntohl(gsa->sin.sin_addr.s_addr)))
 		return (EINVAL);
 
 	gsa->sin.sin_port = 0;	/* ignore port */
 
 	NET_EPOCH_ENTER(et);
 	ifp = ifnet_byindex(msfr.msfr_ifindex);
 	NET_EPOCH_EXIT(et);	/* XXXGL: unsafe ifp */
 	if (ifp == NULL)
 		return (EADDRNOTAVAIL);
 
 	IN_MULTI_LOCK();
 
 	/*
 	 * Take the INP write lock.
 	 * Check if this socket is a member of this group.
 	 */
 	imo = inp_findmoptions(inp);
 	imf = imo_match_group(imo, ifp, &gsa->sa);
 	if (imf == NULL) {
 		error = EADDRNOTAVAIL;
 		goto out_inp_locked;
 	}
 	inm = imf->imf_inm;
 
 	/*
 	 * Begin state merge transaction at socket layer.
 	 */
 	INP_WLOCK_ASSERT(inp);
 
 	imf->imf_st[1] = msfr.msfr_fmode;
 
 	/*
 	 * Apply any new source filters, if present.
 	 * Make a copy of the user-space source vector so
 	 * that we may copy them with a single copyin. This
 	 * allows us to deal with page faults up-front.
 	 */
 	if (msfr.msfr_nsrcs > 0) {
 		struct in_msource	*lims;
 		struct sockaddr_in	*psin;
 		struct sockaddr_storage	*kss, *pkss;
 		int			 i;
 
 		INP_WUNLOCK(inp);
 
 		CTR2(KTR_IGMPV3, "%s: loading %lu source list entries",
 		    __func__, (unsigned long)msfr.msfr_nsrcs);
 		kss = malloc(sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs,
 		    M_TEMP, M_WAITOK);
 		error = copyin(msfr.msfr_srcs, kss,
 		    sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs);
 		if (error) {
 			free(kss, M_TEMP);
 			return (error);
 		}
 
 		INP_WLOCK(inp);
 
 		/*
 		 * Mark all source filters as UNDEFINED at t1.
 		 * Restore new group filter mode, as imf_leave()
 		 * will set it to INCLUDE.
 		 */
 		imf_leave(imf);
 		imf->imf_st[1] = msfr.msfr_fmode;
 
 		/*
 		 * Update socket layer filters at t1, lazy-allocating
 		 * new entries. This saves a bunch of memory at the
 		 * cost of one RB_FIND() per source entry; duplicate
 		 * entries in the msfr_nsrcs vector are ignored.
 		 * If we encounter an error, rollback transaction.
 		 *
 		 * XXX This too could be replaced with a set-symmetric
 		 * difference like loop to avoid walking from root
 		 * every time, as the key space is common.
 		 */
 		for (i = 0, pkss = kss; i < msfr.msfr_nsrcs; i++, pkss++) {
 			psin = (struct sockaddr_in *)pkss;
 			if (psin->sin_family != AF_INET) {
 				error = EAFNOSUPPORT;
 				break;
 			}
 			if (psin->sin_len != sizeof(struct sockaddr_in)) {
 				error = EINVAL;
 				break;
 			}
 			error = imf_get_source(imf, psin, &lims);
 			if (error)
 				break;
 			lims->imsl_st[1] = imf->imf_st[1];
 		}
 		free(kss, M_TEMP);
 	}
 
 	if (error)
 		goto out_imf_rollback;
 
 	INP_WLOCK_ASSERT(inp);
 
 	/*
 	 * Begin state merge transaction at IGMP layer.
 	 */
 	CTR1(KTR_IGMPV3, "%s: merge inm state", __func__);
 	IN_MULTI_LIST_LOCK();
 	error = inm_merge(inm, imf);
 	if (error) {
 		CTR1(KTR_IGMPV3, "%s: failed to merge inm state", __func__);
 		IN_MULTI_LIST_UNLOCK();
 		goto out_imf_rollback;
 	}
 
 	CTR1(KTR_IGMPV3, "%s: doing igmp downcall", __func__);
 	error = igmp_change_state(inm);
 	IN_MULTI_LIST_UNLOCK();
 	if (error)
 		CTR1(KTR_IGMPV3, "%s: failed igmp downcall", __func__);
 
 out_imf_rollback:
 	if (error)
 		imf_rollback(imf);
 	else
 		imf_commit(imf);
 
 	imf_reap(imf);
 
 out_inp_locked:
 	INP_WUNLOCK(inp);
 	IN_MULTI_UNLOCK();
 	return (error);
 }
 
 /*
  * Set the IP multicast options in response to user setsockopt().
  *
  * Many of the socket options handled in this function duplicate the
  * functionality of socket options in the regular unicast API. However,
  * it is not possible to merge the duplicate code, because the idempotence
  * of the IPv4 multicast part of the BSD Sockets API must be preserved;
  * the effects of these options must be treated as separate and distinct.
  *
  * SMPng: XXX: Unlocked read of inp_socket believed OK.
  * FUTURE: The IP_MULTICAST_VIF option may be eliminated if MROUTING
  * is refactored to no longer use vifs.
  */
 int
 inp_setmoptions(struct inpcb *inp, struct sockopt *sopt)
 {
 	struct ip_moptions	*imo;
 	int			 error;
 
 	error = 0;
 
-	/*
-	 * If socket is neither of type SOCK_RAW or SOCK_DGRAM,
-	 * or is a divert socket, reject it.
-	 */
-	if (inp->inp_socket->so_proto->pr_protocol == IPPROTO_DIVERT ||
-	    (inp->inp_socket->so_proto->pr_type != SOCK_RAW &&
-	     inp->inp_socket->so_proto->pr_type != SOCK_DGRAM))
+	/* If socket is neither of type SOCK_RAW or SOCK_DGRAM, reject it. */
+	if (inp->inp_socket->so_proto->pr_type != SOCK_RAW &&
+	     inp->inp_socket->so_proto->pr_type != SOCK_DGRAM)
 		return (EOPNOTSUPP);
 
 	switch (sopt->sopt_name) {
 	case IP_MULTICAST_VIF: {
 		int vifi;
 		/*
 		 * Select a multicast VIF for transmission.
 		 * Only useful if multicast forwarding is active.
 		 */
 		if (legal_vif_num == NULL) {
 			error = EOPNOTSUPP;
 			break;
 		}
 		error = sooptcopyin(sopt, &vifi, sizeof(int), sizeof(int));
 		if (error)
 			break;
 		if (!legal_vif_num(vifi) && (vifi != -1)) {
 			error = EINVAL;
 			break;
 		}
 		imo = inp_findmoptions(inp);
 		imo->imo_multicast_vif = vifi;
 		INP_WUNLOCK(inp);
 		break;
 	}
 
 	case IP_MULTICAST_IF:
 		error = inp_set_multicast_if(inp, sopt);
 		break;
 
 	case IP_MULTICAST_TTL: {
 		u_char ttl;
 
 		/*
 		 * Set the IP time-to-live for outgoing multicast packets.
 		 * The original multicast API required a char argument,
 		 * which is inconsistent with the rest of the socket API.
 		 * We allow either a char or an int.
 		 */
 		if (sopt->sopt_valsize == sizeof(u_char)) {
 			error = sooptcopyin(sopt, &ttl, sizeof(u_char),
 			    sizeof(u_char));
 			if (error)
 				break;
 		} else {
 			u_int ittl;
 
 			error = sooptcopyin(sopt, &ittl, sizeof(u_int),
 			    sizeof(u_int));
 			if (error)
 				break;
 			if (ittl > 255) {
 				error = EINVAL;
 				break;
 			}
 			ttl = (u_char)ittl;
 		}
 		imo = inp_findmoptions(inp);
 		imo->imo_multicast_ttl = ttl;
 		INP_WUNLOCK(inp);
 		break;
 	}
 
 	case IP_MULTICAST_LOOP: {
 		u_char loop;
 
 		/*
 		 * Set the loopback flag for outgoing multicast packets.
 		 * Must be zero or one.  The original multicast API required a
 		 * char argument, which is inconsistent with the rest
 		 * of the socket API.  We allow either a char or an int.
 		 */
 		if (sopt->sopt_valsize == sizeof(u_char)) {
 			error = sooptcopyin(sopt, &loop, sizeof(u_char),
 			    sizeof(u_char));
 			if (error)
 				break;
 		} else {
 			u_int iloop;
 
 			error = sooptcopyin(sopt, &iloop, sizeof(u_int),
 					    sizeof(u_int));
 			if (error)
 				break;
 			loop = (u_char)iloop;
 		}
 		imo = inp_findmoptions(inp);
 		imo->imo_multicast_loop = !!loop;
 		INP_WUNLOCK(inp);
 		break;
 	}
 
 	case IP_ADD_MEMBERSHIP:
 	case IP_ADD_SOURCE_MEMBERSHIP:
 	case MCAST_JOIN_GROUP:
 	case MCAST_JOIN_SOURCE_GROUP:
 		error = inp_join_group(inp, sopt);
 		break;
 
 	case IP_DROP_MEMBERSHIP:
 	case IP_DROP_SOURCE_MEMBERSHIP:
 	case MCAST_LEAVE_GROUP:
 	case MCAST_LEAVE_SOURCE_GROUP:
 		error = inp_leave_group(inp, sopt);
 		break;
 
 	case IP_BLOCK_SOURCE:
 	case IP_UNBLOCK_SOURCE:
 	case MCAST_BLOCK_SOURCE:
 	case MCAST_UNBLOCK_SOURCE:
 		error = inp_block_unblock_source(inp, sopt);
 		break;
 
 	case IP_MSFILTER:
 		error = inp_set_source_filters(inp, sopt);
 		break;
 
 	default:
 		error = EOPNOTSUPP;
 		break;
 	}
 
 	INP_UNLOCK_ASSERT(inp);
 
 	return (error);
 }
 
 /*
  * Expose IGMP's multicast filter mode and source list(s) to userland,
  * keyed by (ifindex, group).
  * The filter mode is written out as a uint32_t, followed by
  * 0..n of struct in_addr.
  * For use by ifmcstat(8).
  * SMPng: NOTE: unlocked read of ifindex space.
  */
 static int
 sysctl_ip_mcast_filters(SYSCTL_HANDLER_ARGS)
 {
 	struct in_addr			 src, group;
 	struct epoch_tracker		 et;
 	struct ifnet			*ifp;
 	struct ifmultiaddr		*ifma;
 	struct in_multi			*inm;
 	struct ip_msource		*ims;
 	int				*name;
 	int				 retval;
 	u_int				 namelen;
 	uint32_t			 fmode, ifindex;
 
 	name = (int *)arg1;
 	namelen = arg2;
 
 	if (req->newptr != NULL)
 		return (EPERM);
 
 	if (namelen != 2)
 		return (EINVAL);
 
 	group.s_addr = name[1];
 	if (!IN_MULTICAST(ntohl(group.s_addr))) {
 		CTR2(KTR_IGMPV3, "%s: group 0x%08x is not multicast",
 		    __func__, ntohl(group.s_addr));
 		return (EINVAL);
 	}
 
 	ifindex = name[0];
 	NET_EPOCH_ENTER(et);
 	ifp = ifnet_byindex(ifindex);
 	if (ifp == NULL) {
 		NET_EPOCH_EXIT(et);
 		CTR2(KTR_IGMPV3, "%s: no ifp for ifindex %u",
 		    __func__, ifindex);
 		return (ENOENT);
 	}
 
 	retval = sysctl_wire_old_buffer(req,
 	    sizeof(uint32_t) + (in_mcast_maxgrpsrc * sizeof(struct in_addr)));
 	if (retval) {
 		NET_EPOCH_EXIT(et);
 		return (retval);
 	}
 
 	IN_MULTI_LIST_LOCK();
 
 	CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
 		if (ifma->ifma_addr->sa_family != AF_INET ||
 		    ifma->ifma_protospec == NULL)
 			continue;
 		inm = (struct in_multi *)ifma->ifma_protospec;
 		if (!in_hosteq(inm->inm_addr, group))
 			continue;
 		fmode = inm->inm_st[1].iss_fmode;
 		retval = SYSCTL_OUT(req, &fmode, sizeof(uint32_t));
 		if (retval != 0)
 			break;
 		RB_FOREACH(ims, ip_msource_tree, &inm->inm_srcs) {
 			CTR2(KTR_IGMPV3, "%s: visit node 0x%08x", __func__,
 			    ims->ims_haddr);
 			/*
 			 * Only copy-out sources which are in-mode.
 			 */
 			if (fmode != ims_get_mode(inm, ims, 1)) {
 				CTR1(KTR_IGMPV3, "%s: skip non-in-mode",
 				    __func__);
 				continue;
 			}
 			src.s_addr = htonl(ims->ims_haddr);
 			retval = SYSCTL_OUT(req, &src, sizeof(struct in_addr));
 			if (retval != 0)
 				break;
 		}
 	}
 
 	IN_MULTI_LIST_UNLOCK();
 	NET_EPOCH_EXIT(et);
 
 	return (retval);
 }
 
 #if defined(KTR) && (KTR_COMPILE & KTR_IGMPV3)
 
 static const char *inm_modestrs[] = {
 	[MCAST_UNDEFINED] = "un",
 	[MCAST_INCLUDE] = "in",
 	[MCAST_EXCLUDE] = "ex",
 };
 _Static_assert(MCAST_UNDEFINED == 0 &&
 	       MCAST_EXCLUDE + 1 == nitems(inm_modestrs),
 	       "inm_modestrs: no longer matches #defines");
 
 static const char *
 inm_mode_str(const int mode)
 {
 
 	if (mode >= MCAST_UNDEFINED && mode <= MCAST_EXCLUDE)
 		return (inm_modestrs[mode]);
 	return ("??");
 }
 
 static const char *inm_statestrs[] = {
 	[IGMP_NOT_MEMBER] = "not-member",
 	[IGMP_SILENT_MEMBER] = "silent",
 	[IGMP_REPORTING_MEMBER] = "reporting",
 	[IGMP_IDLE_MEMBER] = "idle",
 	[IGMP_LAZY_MEMBER] = "lazy",
 	[IGMP_SLEEPING_MEMBER] = "sleeping",
 	[IGMP_AWAKENING_MEMBER] = "awakening",
 	[IGMP_G_QUERY_PENDING_MEMBER] = "query-pending",
 	[IGMP_SG_QUERY_PENDING_MEMBER] = "sg-query-pending",
 	[IGMP_LEAVING_MEMBER] = "leaving",
 };
 _Static_assert(IGMP_NOT_MEMBER == 0 &&
 	       IGMP_LEAVING_MEMBER + 1 == nitems(inm_statestrs),
 	       "inm_statetrs: no longer matches #defines");
 
 static const char *
 inm_state_str(const int state)
 {
 
 	if (state >= IGMP_NOT_MEMBER && state <= IGMP_LEAVING_MEMBER)
 		return (inm_statestrs[state]);
 	return ("??");
 }
 
 /*
  * Dump an in_multi structure to the console.
  */
 void
 inm_print(const struct in_multi *inm)
 {
 	int t;
 	char addrbuf[INET_ADDRSTRLEN];
 
 	if ((ktr_mask & KTR_IGMPV3) == 0)
 		return;
 
 	printf("%s: --- begin inm %p ---\n", __func__, inm);
 	printf("addr %s ifp %p(%s) ifma %p\n",
 	    inet_ntoa_r(inm->inm_addr, addrbuf),
 	    inm->inm_ifp,
 	    inm->inm_ifp->if_xname,
 	    inm->inm_ifma);
 	printf("timer %u state %s refcount %u scq.len %u\n",
 	    inm->inm_timer,
 	    inm_state_str(inm->inm_state),
 	    inm->inm_refcount,
 	    inm->inm_scq.mq_len);
 	printf("igi %p nsrc %lu sctimer %u scrv %u\n",
 	    inm->inm_igi,
 	    inm->inm_nsrc,
 	    inm->inm_sctimer,
 	    inm->inm_scrv);
 	for (t = 0; t < 2; t++) {
 		printf("t%d: fmode %s asm %u ex %u in %u rec %u\n", t,
 		    inm_mode_str(inm->inm_st[t].iss_fmode),
 		    inm->inm_st[t].iss_asm,
 		    inm->inm_st[t].iss_ex,
 		    inm->inm_st[t].iss_in,
 		    inm->inm_st[t].iss_rec);
 	}
 	printf("%s: --- end inm %p ---\n", __func__, inm);
 }
 
 #else /* !KTR || !(KTR_COMPILE & KTR_IGMPV3) */
 
 void
 inm_print(const struct in_multi *inm)
 {
 
 }
 
 #endif /* KTR && (KTR_COMPILE & KTR_IGMPV3) */
 
 RB_GENERATE(ip_msource_tree, ip_msource, ims_link, ip_msource_cmp);
diff --git a/sys/netinet/ip_divert.c b/sys/netinet/ip_divert.c
index d14ec5190ad0..b09d7e1dda7a 100644
--- a/sys/netinet/ip_divert.c
+++ b/sys/netinet/ip_divert.c
@@ -1,795 +1,800 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1982, 1986, 1988, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_sctp.h"
 #ifndef INET
-#error "IPDIVERT requires INET"
+#error "IPDIVERT requires INET"		/* XXX! */
 #endif
 
 #include <sys/param.h>
 #include <sys/eventhandler.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/module.h>
 #include <sys/kernel.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
+#include <sys/domain.h>
 #include <sys/protosw.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sysctl.h>
 #include <net/vnet.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/netisr.h>
 
 #include <netinet/in.h>
 #include <netinet/in_pcb.h>
 #include <netinet/in_systm.h>
 #include <netinet/in_var.h>
 #include <netinet/ip.h>
 #include <netinet/ip_var.h>
 #ifdef INET6
 #include <netinet/ip6.h>
 #include <netinet6/ip6_var.h>
 #endif
 #if defined(SCTP) || defined(SCTP_SUPPORT)
 #include <netinet/sctp_crc32.h>
 #endif
 
 #include <security/mac/mac_framework.h>
 /*
  * Divert sockets
  */
 
 /*
  * Allocate enough space to hold a full IP packet
  */
 #define	DIVSNDQ		(65536 + 100)
 #define	DIVRCVQ		(65536 + 100)
 
 /*
  * Divert sockets work in conjunction with ipfw or other packet filters,
  * see the divert(4) manpage for features.
  * Packets are selected by the packet filter and tagged with an
  * MTAG_IPFW_RULE tag carrying the 'divert port' number (as set by
  * the packet filter) and information on the matching filter rule for
  * subsequent reinjection. The divert_port is used to put the packet
  * on the corresponding divert socket, while the rule number is passed
  * up (at least partially) as the sin_port in the struct sockaddr.
  *
  * Packets written to the divert socket carry in sin_addr a
  * destination address, and in sin_port the number of the filter rule
  * after which to continue processing.
  * If the destination address is INADDR_ANY, the packet is treated as
  * as outgoing and sent to ip_output(); otherwise it is treated as
  * incoming and sent to ip_input().
  * Further, sin_zero carries some information on the interface,
  * which can be used in the reinject -- see comments in the code.
  *
  * On reinjection, processing in ip_input() and ip_output()
  * will be exactly the same as for the original packet, except that
  * packet filter processing will start at the rule number after the one
  * written in the sin_port (ipfw does not allow a rule #0, so sin_port=0
  * will apply the entire ruleset to the packet).
  */
 
 /* Internal variables. */
 VNET_DEFINE_STATIC(struct inpcbinfo, divcbinfo);
 #define	V_divcbinfo			VNET(divcbinfo)
 
 static u_long	div_sendspace = DIVSNDQ;	/* XXX sysctl ? */
 static u_long	div_recvspace = DIVRCVQ;	/* XXX sysctl ? */
 
 static int div_output_inbound(int fmaily, struct socket *so, struct mbuf *m,
     struct sockaddr_in *sin);
 static int div_output_outbound(int family, struct socket *so, struct mbuf *m);
 
 /*
  * Initialize divert connection block queue.
  */
 INPCBSTORAGE_DEFINE(divcbstor, "divinp", "divcb", "div", "divhash");
 
 static void
 div_init(void *arg __unused)
 {
 
 	/*
 	 * XXX We don't use the hash list for divert IP, but it's easier to
 	 * allocate one-entry hash lists than it is to check all over the
 	 * place for hashbase == NULL.
 	 */
 	in_pcbinfo_init(&V_divcbinfo, &divcbstor, 1, 1);
 }
 VNET_SYSINIT(div_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, div_init, NULL);
 
 static void
 div_destroy(void *unused __unused)
 {
 
 	in_pcbinfo_destroy(&V_divcbinfo);
 }
 VNET_SYSUNINIT(divert, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, div_destroy, NULL);
 
 static bool
 div_port_match(const struct inpcb *inp, void *v)
 {
 	uint16_t nport = *(uint16_t *)v;
 
 	return (inp->inp_lport == nport);
 }
 
 /*
  * Divert a packet by passing it up to the divert socket at port 'port'.
  */
 static void
 divert_packet(struct mbuf *m, bool incoming)
 {
 #if defined(SCTP) || defined(SCTP_SUPPORT)
 	struct ip *ip;
 #endif
 	struct inpcb *inp;
 	struct socket *sa;
 	u_int16_t nport;
 	struct sockaddr_in divsrc;
 	struct inpcb_iterator inpi = INP_ITERATOR(&V_divcbinfo,
 	    INPLOOKUP_RLOCKPCB, div_port_match, &nport);
 	struct m_tag *mtag;
 
 	NET_EPOCH_ASSERT();
 
 	mtag = m_tag_locate(m, MTAG_IPFW_RULE, 0, NULL);
 	if (mtag == NULL) {
 		m_freem(m);
 		return;
 	}
 	/* Assure header */
 	if (m->m_len < sizeof(struct ip) &&
 	    (m = m_pullup(m, sizeof(struct ip))) == NULL)
 		return;
 
 	/* Delayed checksums are currently not compatible with divert. */
 	if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
 		in_delayed_cksum(m);
 		m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
 	}
 #if defined(SCTP) || defined(SCTP_SUPPORT)
 	if (m->m_pkthdr.csum_flags & CSUM_SCTP) {
 		ip = mtod(m, struct ip *);
 		sctp_delayed_cksum(m, (uint32_t)(ip->ip_hl << 2));
 		m->m_pkthdr.csum_flags &= ~CSUM_SCTP;
 	}
 #endif
 #ifdef INET6
 	if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA_IPV6) {
 		in6_delayed_cksum(m, m->m_pkthdr.len -
 		    sizeof(struct ip6_hdr), sizeof(struct ip6_hdr));
 		m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA_IPV6;
 	}
 #if defined(SCTP) || defined(SCTP_SUPPORT)
 	if (m->m_pkthdr.csum_flags & CSUM_SCTP_IPV6) {
 		sctp_delayed_cksum(m, sizeof(struct ip6_hdr));
 		m->m_pkthdr.csum_flags &= ~CSUM_SCTP_IPV6;
 	}
 #endif
 #endif /* INET6 */
 	bzero(&divsrc, sizeof(divsrc));
 	divsrc.sin_len = sizeof(divsrc);
 	divsrc.sin_family = AF_INET;
 	/* record matching rule, in host format */
 	divsrc.sin_port = ((struct ipfw_rule_ref *)(mtag+1))->rulenum;
 	/*
 	 * Record receive interface address, if any.
 	 * But only for incoming packets.
 	 */
 	if (incoming) {
 		struct ifaddr *ifa;
 		struct ifnet *ifp;
 
 		/* Sanity check */
 		M_ASSERTPKTHDR(m);
 
 		/* Find IP address for receive interface */
 		ifp = m->m_pkthdr.rcvif;
 		CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 			if (ifa->ifa_addr->sa_family != AF_INET)
 				continue;
 			divsrc.sin_addr =
 			    ((struct sockaddr_in *) ifa->ifa_addr)->sin_addr;
 			break;
 		}
 	}
 	/*
 	 * Record the incoming interface name whenever we have one.
 	 */
 	if (m->m_pkthdr.rcvif) {
 		/*
 		 * Hide the actual interface name in there in the
 		 * sin_zero array. XXX This needs to be moved to a
 		 * different sockaddr type for divert, e.g.
 		 * sockaddr_div with multiple fields like
 		 * sockaddr_dl. Presently we have only 7 bytes
 		 * but that will do for now as most interfaces
 		 * are 4 or less + 2 or less bytes for unit.
 		 * There is probably a faster way of doing this,
 		 * possibly taking it from the sockaddr_dl on the iface.
 		 * This solves the problem of a P2P link and a LAN interface
 		 * having the same address, which can result in the wrong
 		 * interface being assigned to the packet when fed back
 		 * into the divert socket. Theoretically if the daemon saves
 		 * and re-uses the sockaddr_in as suggested in the man pages,
 		 * this iface name will come along for the ride.
 		 * (see div_output for the other half of this.)
 		 */
 		strlcpy(divsrc.sin_zero, m->m_pkthdr.rcvif->if_xname,
 		    sizeof(divsrc.sin_zero));
 	}
 
 	/* Put packet on socket queue, if any */
 	sa = NULL;
 	/* nport is inp_next's context. */
 	nport = htons((u_int16_t)(((struct ipfw_rule_ref *)(mtag+1))->info));
 	while ((inp = inp_next(&inpi)) != NULL) {
 		sa = inp->inp_socket;
 		SOCKBUF_LOCK(&sa->so_rcv);
 		if (sbappendaddr_locked(&sa->so_rcv,
 		    (struct sockaddr *)&divsrc, m, NULL) == 0) {
 			soroverflow_locked(sa);
 			sa = NULL;	/* force mbuf reclaim below */
 		} else
 			sorwakeup_locked(sa);
 		/* XXX why does only one socket match? */
 		INP_RUNLOCK(inp);
 		break;
 	}
 	if (sa == NULL) {
 		m_freem(m);
 		KMOD_IPSTAT_INC(ips_noproto);
 		KMOD_IPSTAT_DEC(ips_delivered);
         }
 }
 
 /*
  * Deliver packet back into the IP processing machinery.
  *
  * If no address specified, or address is 0.0.0.0, send to ip_output();
  * otherwise, send to ip_input() and mark as having been received on
  * the interface with that address.
  */
 static int
 div_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam,
     struct mbuf *control, struct thread *td)
 {
 	struct epoch_tracker et;
 	struct sockaddr_in *sin = (struct sockaddr_in *)nam;
 	const struct ip *ip;
 	struct m_tag *mtag;
 	struct ipfw_rule_ref *dt;
 	int error, family;
 
 	if (control)
 		m_freem(control);
 
 	/* Packet must have a header (but that's about it) */
 	if (m->m_len < sizeof (struct ip) &&
 	    (m = m_pullup(m, sizeof (struct ip))) == NULL) {
 		KMOD_IPSTAT_INC(ips_toosmall);
 		m_freem(m);
 		return (EINVAL);
 	}
 
 	if (sin != NULL) {
 		if (sin->sin_family != AF_INET) {
 			m_freem(m);
 			return (EAFNOSUPPORT);
 		}
 		if (sin->sin_len != sizeof(*sin)) {
 			m_freem(m);
 			return (EINVAL);
 		}
 	}
 
 	/*
 	 * An mbuf may hasn't come from userland, but we pretend
 	 * that it has.
 	 */
 	m->m_pkthdr.rcvif = NULL;
 	m->m_nextpkt = NULL;
 	M_SETFIB(m, so->so_fibnum);
 
 	mtag = m_tag_locate(m, MTAG_IPFW_RULE, 0, NULL);
 	if (mtag == NULL) {
 		/* this should be normal */
 		mtag = m_tag_alloc(MTAG_IPFW_RULE, 0,
 		    sizeof(struct ipfw_rule_ref), M_NOWAIT | M_ZERO);
 		if (mtag == NULL) {
 			m_freem(m);
 			return (ENOBUFS);
 		}
 		m_tag_prepend(m, mtag);
 	}
 	dt = (struct ipfw_rule_ref *)(mtag+1);
 
 	/* Loopback avoidance and state recovery */
 	if (sin) {
 		int i;
 
 		/* set the starting point. We provide a non-zero slot,
 		 * but a non_matching chain_id to skip that info and use
 		 * the rulenum/rule_id.
 		 */
 		dt->slot = 1; /* dummy, chain_id is invalid */
 		dt->chain_id = 0;
 		dt->rulenum = sin->sin_port+1; /* host format ? */
 		dt->rule_id = 0;
 		/* XXX: broken for IPv6 */
 		/*
 		 * Find receive interface with the given name, stuffed
 		 * (if it exists) in the sin_zero[] field.
 		 * The name is user supplied data so don't trust its size
 		 * or that it is zero terminated.
 		 */
 		for (i = 0; i < sizeof(sin->sin_zero) && sin->sin_zero[i]; i++)
 			;
 		if ( i > 0 && i < sizeof(sin->sin_zero))
 			m->m_pkthdr.rcvif = ifunit(sin->sin_zero);
 	}
 
 	ip = mtod(m, struct ip *);
 	switch (ip->ip_v) {
 	case IPVERSION:
 		family = AF_INET;
 		break;
 #ifdef INET6
 	case IPV6_VERSION >> 4:
 		family = AF_INET6;
 		break;
 #endif
 	default:
 		m_freem(m);
 		return (EAFNOSUPPORT);
 	}
 
 	/* Reinject packet into the system as incoming or outgoing */
 	NET_EPOCH_ENTER(et);
 	if (!sin || sin->sin_addr.s_addr == 0) {
 		dt->info |= IPFW_IS_DIVERT | IPFW_INFO_OUT;
 		error = div_output_outbound(family, so, m);
 	} else {
 		dt->info |= IPFW_IS_DIVERT | IPFW_INFO_IN;
 		error = div_output_inbound(family, so, m, sin);
 	}
 	NET_EPOCH_EXIT(et);
 
 	return (error);
 }
 
 /*
  * Sends mbuf @m to the wire via ip[6]_output().
  *
  * Returns 0 on success or an errno value on failure.  @m is always consumed.
  */
 static int
 div_output_outbound(int family, struct socket *so, struct mbuf *m)
 {
 	struct ip *const ip = mtod(m, struct ip *);
 	struct mbuf *options;
 	struct inpcb *inp;
 	int error;
 
 	inp = sotoinpcb(so);
 	INP_RLOCK(inp);
 	switch (family) {
 	case AF_INET:
 		/*
 		 * Don't allow both user specified and setsockopt
 		 * options, and don't allow packet length sizes that
 		 * will crash.
 		 */
 		if ((((ip->ip_hl << 2) != sizeof(struct ip)) &&
 		    inp->inp_options != NULL) ||
 		    ((u_short)ntohs(ip->ip_len) > m->m_pkthdr.len)) {
 			INP_RUNLOCK(inp);
 			m_freem(m);
 			return (EINVAL);
 		}
 		break;
 #ifdef INET6
 	case AF_INET6:
 	    {
 		struct ip6_hdr *const ip6 = mtod(m, struct ip6_hdr *);
 
 		/* Don't allow packet length sizes that will crash */
 		if (((u_short)ntohs(ip6->ip6_plen) > m->m_pkthdr.len)) {
 			INP_RUNLOCK(inp);
 			m_freem(m);
 			return (EINVAL);
 		}
 		break;
 	    }
 #endif
 	}
 
 	/* Send packet to output processing */
 	KMOD_IPSTAT_INC(ips_rawout);		/* XXX */
 
 #ifdef MAC
 	mac_inpcb_create_mbuf(inp, m);
 #endif
 	/*
 	 * Get ready to inject the packet into ip_output().
 	 * Just in case socket options were specified on the
 	 * divert socket, we duplicate them.  This is done
 	 * to avoid having to hold the PCB locks over the call
 	 * to ip_output(), as doing this results in a number of
 	 * lock ordering complexities.
 	 *
 	 * Note that we set the multicast options argument for
 	 * ip_output() to NULL since it should be invariant that
 	 * they are not present.
 	 */
 	KASSERT(inp->inp_moptions == NULL,
 	    ("multicast options set on a divert socket"));
 	/*
 	 * XXXCSJP: It is unclear to me whether or not it makes
 	 * sense for divert sockets to have options.  However,
 	 * for now we will duplicate them with the INP locks
 	 * held so we can use them in ip_output() without
 	 * requring a reference to the pcb.
 	 */
 	options = NULL;
 	if (inp->inp_options != NULL) {
 		options = m_dup(inp->inp_options, M_NOWAIT);
 		if (options == NULL) {
 			INP_RUNLOCK(inp);
 			m_freem(m);
 			return (ENOBUFS);
 		}
 	}
 	INP_RUNLOCK(inp);
 
 	error = 0;
 	switch (family) {
 	case AF_INET:
 		error = ip_output(m, options, NULL,
 		    ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0)
 		    | IP_ALLOWBROADCAST | IP_RAWOUTPUT, NULL, NULL);
 		break;
 #ifdef INET6
 	case AF_INET6:
 		error = ip6_output(m, NULL, NULL, 0, NULL, NULL, NULL);
 		break;
 #endif
 	}
 	if (options != NULL)
 		m_freem(options);
 
 	return (error);
 }
 
 /*
  * Schedules mbuf @m for local processing via IPv4/IPv6 netisr queue.
  *
  * Returns 0 on success or an errno value on failure.  @m is always consumed.
  */
 static int
 div_output_inbound(int family, struct socket *so, struct mbuf *m,
     struct sockaddr_in *sin)
 {
 	const struct ip *ip;
 	struct ifaddr *ifa;
 
 	if (m->m_pkthdr.rcvif == NULL) {
 		/*
 		 * No luck with the name, check by IP address.
 		 * Clear the port and the ifname to make sure
 		 * there are no distractions for ifa_ifwithaddr.
 		 */
 
 		/* XXX: broken for IPv6 */
 		bzero(sin->sin_zero, sizeof(sin->sin_zero));
 		sin->sin_port = 0;
 		ifa = ifa_ifwithaddr((struct sockaddr *) sin);
 		if (ifa == NULL) {
 			m_freem(m);
 			return (EADDRNOTAVAIL);
 		}
 		m->m_pkthdr.rcvif = ifa->ifa_ifp;
 	}
 #ifdef MAC
 	mac_socket_create_mbuf(so, m);
 #endif
 	/* Send packet to input processing via netisr */
 	switch (family) {
 	case AF_INET:
 		ip = mtod(m, struct ip *);
 		/*
 		 * Restore M_BCAST flag when destination address is
 		 * broadcast. It is expected by ip_tryforward().
 		 */
 		if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)))
 			m->m_flags |= M_MCAST;
 		else if (in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif))
 			m->m_flags |= M_BCAST;
 		netisr_queue_src(NETISR_IP, (uintptr_t)so, m);
 		break;
 #ifdef INET6
 	case AF_INET6:
 		netisr_queue_src(NETISR_IPV6, (uintptr_t)so, m);
 		break;
 #endif
 	default:
 		m_freem(m);
 		return (EINVAL);
 	}
 
 	return (0);
 }
 
 static int
 div_attach(struct socket *so, int proto, struct thread *td)
 {
 	struct inpcb *inp;
 	int error;
 
 	inp  = sotoinpcb(so);
 	KASSERT(inp == NULL, ("div_attach: inp != NULL"));
 	if (td != NULL) {
 		error = priv_check(td, PRIV_NETINET_DIVERT);
 		if (error)
 			return (error);
 	}
 	error = soreserve(so, div_sendspace, div_recvspace);
 	if (error)
 		return error;
 	error = in_pcballoc(so, &V_divcbinfo);
 	if (error)
 		return error;
 	inp = (struct inpcb *)so->so_pcb;
 	inp->inp_ip_p = proto;
 	inp->inp_flags |= INP_HDRINCL;
 	INP_WUNLOCK(inp);
 	return 0;
 }
 
 static void
 div_detach(struct socket *so)
 {
 	struct inpcb *inp;
 
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("div_detach: inp == NULL"));
 	INP_WLOCK(inp);
 	in_pcbdetach(inp);
 	in_pcbfree(inp);
 }
 
 static int
 div_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
 {
 	struct inpcb *inp;
 	int error;
 
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("div_bind: inp == NULL"));
 	/* in_pcbbind assumes that nam is a sockaddr_in
 	 * and in_pcbbind requires a valid address. Since divert
 	 * sockets don't we need to make sure the address is
 	 * filled in properly.
 	 * XXX -- divert should not be abusing in_pcbind
 	 * and should probably have its own family.
 	 */
 	if (nam->sa_family != AF_INET)
 		return EAFNOSUPPORT;
 	if (nam->sa_len != sizeof(struct sockaddr_in))
 		return EINVAL;
 	((struct sockaddr_in *)nam)->sin_addr.s_addr = INADDR_ANY;
 	INP_WLOCK(inp);
 	INP_HASH_WLOCK(&V_divcbinfo);
 	error = in_pcbbind(inp, nam, td->td_ucred);
 	INP_HASH_WUNLOCK(&V_divcbinfo);
 	INP_WUNLOCK(inp);
 	return error;
 }
 
 static int
 div_shutdown(struct socket *so)
 {
 	struct inpcb *inp;
 
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("div_shutdown: inp == NULL"));
 	INP_WLOCK(inp);
 	socantsendmore(so);
 	INP_WUNLOCK(inp);
 	return 0;
 }
 
 static int
 div_pcblist(SYSCTL_HANDLER_ARGS)
 {
 	struct inpcb_iterator inpi = INP_ALL_ITERATOR(&V_divcbinfo,
 	    INPLOOKUP_RLOCKPCB);
 	struct xinpgen xig;
 	struct inpcb *inp;
 	int error;
 
 	if (req->newptr != 0)
 		return EPERM;
 
 	if (req->oldptr == 0) {
 		int n;
 
 		n = V_divcbinfo.ipi_count;
 		n += imax(n / 8, 10);
 		req->oldidx = 2 * (sizeof xig) + n * sizeof(struct xinpcb);
 		return 0;
 	}
 
 	if ((error = sysctl_wire_old_buffer(req, 0)) != 0)
 		return (error);
 
 	bzero(&xig, sizeof(xig));
 	xig.xig_len = sizeof xig;
 	xig.xig_count = V_divcbinfo.ipi_count;
 	xig.xig_gen = V_divcbinfo.ipi_gencnt;
 	xig.xig_sogen = so_gencnt;
 	error = SYSCTL_OUT(req, &xig, sizeof xig);
 	if (error)
 		return error;
 
 	while ((inp = inp_next(&inpi)) != NULL) {
 		if (inp->inp_gencnt <= xig.xig_gen) {
 			struct xinpcb xi;
 
 			in_pcbtoxinpcb(inp, &xi);
 			error = SYSCTL_OUT(req, &xi, sizeof xi);
 			if (error) {
 				INP_RUNLOCK(inp);
 				break;
 			}
 		}
 	}
 
 	if (!error) {
 		/*
 		 * Give the user an updated idea of our state.
 		 * If the generation differs from what we told
 		 * her before, she knows that something happened
 		 * while we were processing this request, and it
 		 * might be necessary to retry.
 		 */
 		xig.xig_gen = V_divcbinfo.ipi_gencnt;
 		xig.xig_sogen = so_gencnt;
 		xig.xig_count = V_divcbinfo.ipi_count;
 		error = SYSCTL_OUT(req, &xig, sizeof xig);
 	}
 
 	return (error);
 }
 
 #ifdef SYSCTL_NODE
 static SYSCTL_NODE(_net_inet, IPPROTO_DIVERT, divert,
     CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "IPDIVERT");
 SYSCTL_PROC(_net_inet_divert, OID_AUTO, pcblist,
    CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE,
     NULL, 0, div_pcblist, "S,xinpcb",
     "List of active divert sockets");
 #endif
 
 static struct protosw div_protosw = {
 	.pr_type =		SOCK_RAW,
-	.pr_protocol =		IPPROTO_DIVERT,
 	.pr_flags =		PR_ATOMIC|PR_ADDR,
 	.pr_attach =		div_attach,
 	.pr_bind =		div_bind,
 	.pr_control =		in_control,
 	.pr_detach =		div_detach,
 	.pr_peeraddr =		in_getpeeraddr,
 	.pr_send =		div_send,
 	.pr_shutdown =		div_shutdown,
 	.pr_sockaddr =		in_getsockaddr,
 	.pr_sosetlabel =	in_pcbsosetlabel
 };
 
+static struct domain divertdomain = {
+	.dom_family =	PF_DIVERT,
+	.dom_name =	"divert",
+	.dom_nprotosw =	1,
+	.dom_protosw =	{ &div_protosw },
+};
+
 static int
 div_modevent(module_t mod, int type, void *unused)
 {
 	int err = 0;
 
 	switch (type) {
 	case MOD_LOAD:
-		/*
-		 * Protocol will be initialized by pf_proto_register().
-		 */
-		err = protosw_register(&inetdomain, &div_protosw);
-		if (err != 0)
-			return (err);
+		domain_add(&divertdomain);
 		ip_divert_ptr = divert_packet;
 		break;
 	case MOD_QUIESCE:
 		/*
 		 * IPDIVERT may normally not be unloaded because of the
 		 * potential race conditions.  Tell kldunload we can't be
 		 * unloaded unless the unload is forced.
 		 */
 		err = EPERM;
 		break;
 	case MOD_UNLOAD:
 		/*
 		 * Forced unload.
 		 *
 		 * Module ipdivert can only be unloaded if no sockets are
 		 * connected.  Maybe this can be changed later to forcefully
 		 * disconnect any open sockets.
 		 *
 		 * XXXRW: Note that there is a slight race here, as a new
 		 * socket open request could be spinning on the lock and then
 		 * we destroy the lock.
+		 *
+		 * XXXGL: One more reason this code is incorrect is that it
+		 * checks only the current vnet.
 		 */
 		INP_INFO_WLOCK(&V_divcbinfo);
 		if (V_divcbinfo.ipi_count != 0) {
 			err = EBUSY;
 			INP_INFO_WUNLOCK(&V_divcbinfo);
 			break;
 		}
 		ip_divert_ptr = NULL;
-		err = protosw_unregister(&div_protosw);
+		domain_remove(&divertdomain);
 		INP_INFO_WUNLOCK(&V_divcbinfo);
 #ifndef VIMAGE
 		div_destroy(NULL);
 #endif
 		break;
 	default:
 		err = EOPNOTSUPP;
 		break;
 	}
 	return err;
 }
 
 static moduledata_t ipdivertmod = {
         "ipdivert",
         div_modevent,
         0
 };
 
 DECLARE_MODULE(ipdivert, ipdivertmod, SI_SUB_PROTO_FIREWALL, SI_ORDER_ANY);
 MODULE_DEPEND(ipdivert, ipfw, 3, 3, 3);
 MODULE_VERSION(ipdivert, 1);
diff --git a/sys/netinet6/in6_mcast.c b/sys/netinet6/in6_mcast.c
index d0f8186e75c7..a02e18656dc2 100644
--- a/sys/netinet6/in6_mcast.c
+++ b/sys/netinet6/in6_mcast.c
@@ -1,2930 +1,2922 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 2009 Bruce Simpson.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. The name of the author may not be used to endorse or promote
  *    products derived from this software without specific prior written
  *    permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 /*
  * IPv6 multicast socket, group, and socket option processing module.
  * Normative references: RFC 2292, RFC 3492, RFC 3542, RFC 3678, RFC 3810.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet6.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/protosw.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sysctl.h>
 #include <sys/priv.h>
 #include <sys/taskqueue.h>
 #include <sys/tree.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_dl.h>
 #include <net/route.h>
 #include <net/route/nhop.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/udp.h>
 #include <netinet/in_var.h>
 #include <netinet/ip_var.h>
 #include <netinet/udp_var.h>
 #include <netinet6/in6_fib.h>
 #include <netinet6/in6_var.h>
 #include <netinet/ip6.h>
 #include <netinet/icmp6.h>
 #include <netinet6/ip6_var.h>
 #include <netinet/in_pcb.h>
 #include <netinet/tcp_var.h>
 #include <netinet6/nd6.h>
 #include <netinet6/mld6_var.h>
 #include <netinet6/scope6_var.h>
 
 #ifndef KTR_MLD
 #define KTR_MLD KTR_INET6
 #endif
 
 #ifndef __SOCKUNION_DECLARED
 union sockunion {
 	struct sockaddr_storage	ss;
 	struct sockaddr		sa;
 	struct sockaddr_dl	sdl;
 	struct sockaddr_in6	sin6;
 };
 typedef union sockunion sockunion_t;
 #define __SOCKUNION_DECLARED
 #endif /* __SOCKUNION_DECLARED */
 
 static MALLOC_DEFINE(M_IN6MFILTER, "in6_mfilter",
     "IPv6 multicast PCB-layer source filter");
 MALLOC_DEFINE(M_IP6MADDR, "in6_multi", "IPv6 multicast group");
 static MALLOC_DEFINE(M_IP6MOPTS, "ip6_moptions", "IPv6 multicast options");
 static MALLOC_DEFINE(M_IP6MSOURCE, "ip6_msource",
     "IPv6 multicast MLD-layer source filter");
 
 RB_GENERATE(ip6_msource_tree, ip6_msource, im6s_link, ip6_msource_cmp);
 
 /*
  * Locking:
  * - Lock order is: Giant, IN6_MULTI_LOCK, INP_WLOCK,
  *   IN6_MULTI_LIST_LOCK, MLD_LOCK, IF_ADDR_LOCK.
  * - The IF_ADDR_LOCK is implicitly taken by in6m_lookup() earlier, however
  *   it can be taken by code in net/if.c also.
  * - ip6_moptions and in6_mfilter are covered by the INP_WLOCK.
  *
  * struct in6_multi is covered by IN6_MULTI_LOCK. There isn't strictly
  * any need for in6_multi itself to be virtualized -- it is bound to an ifp
  * anyway no matter what happens.
  */
 struct mtx in6_multi_list_mtx;
 MTX_SYSINIT(in6_multi_mtx, &in6_multi_list_mtx, "in6_multi_list_mtx", MTX_DEF);
 
 struct mtx in6_multi_free_mtx;
 MTX_SYSINIT(in6_multi_free_mtx, &in6_multi_free_mtx, "in6_multi_free_mtx", MTX_DEF);
 
 struct sx in6_multi_sx;
 SX_SYSINIT(in6_multi_sx, &in6_multi_sx, "in6_multi_sx");
 
 static void	im6f_commit(struct in6_mfilter *);
 static int	im6f_get_source(struct in6_mfilter *imf,
 		    const struct sockaddr_in6 *psin,
 		    struct in6_msource **);
 static struct in6_msource *
 		im6f_graft(struct in6_mfilter *, const uint8_t,
 		    const struct sockaddr_in6 *);
 static void	im6f_leave(struct in6_mfilter *);
 static int	im6f_prune(struct in6_mfilter *, const struct sockaddr_in6 *);
 static void	im6f_purge(struct in6_mfilter *);
 static void	im6f_rollback(struct in6_mfilter *);
 static void	im6f_reap(struct in6_mfilter *);
 static struct in6_mfilter *
 		im6o_match_group(const struct ip6_moptions *,
 		    const struct ifnet *, const struct sockaddr *);
 static struct in6_msource *
 		im6o_match_source(struct in6_mfilter *, const struct sockaddr *);
 static void	im6s_merge(struct ip6_msource *ims,
 		    const struct in6_msource *lims, const int rollback);
 static int	in6_getmulti(struct ifnet *, const struct in6_addr *,
 		    struct in6_multi **);
 static int	in6_joingroup_locked(struct ifnet *, const struct in6_addr *,
 		    struct in6_mfilter *, struct in6_multi **, int);
 static int	in6m_get_source(struct in6_multi *inm,
 		    const struct in6_addr *addr, const int noalloc,
 		    struct ip6_msource **pims);
 #ifdef KTR
 static int	in6m_is_ifp_detached(const struct in6_multi *);
 #endif
 static int	in6m_merge(struct in6_multi *, /*const*/ struct in6_mfilter *);
 static void	in6m_purge(struct in6_multi *);
 static void	in6m_reap(struct in6_multi *);
 static struct ip6_moptions *
 		in6p_findmoptions(struct inpcb *);
 static int	in6p_get_source_filters(struct inpcb *, struct sockopt *);
 static int	in6p_join_group(struct inpcb *, struct sockopt *);
 static int	in6p_leave_group(struct inpcb *, struct sockopt *);
 static struct ifnet *
 		in6p_lookup_mcast_ifp(const struct inpcb *,
 		    const struct sockaddr_in6 *);
 static int	in6p_block_unblock_source(struct inpcb *, struct sockopt *);
 static int	in6p_set_multicast_if(struct inpcb *, struct sockopt *);
 static int	in6p_set_source_filters(struct inpcb *, struct sockopt *);
 static int	sysctl_ip6_mcast_filters(SYSCTL_HANDLER_ARGS);
 
 SYSCTL_DECL(_net_inet6_ip6);	/* XXX Not in any common header. */
 
 static SYSCTL_NODE(_net_inet6_ip6, OID_AUTO, mcast,
     CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "IPv6 multicast");
 
 static u_long in6_mcast_maxgrpsrc = IPV6_MAX_GROUP_SRC_FILTER;
 SYSCTL_ULONG(_net_inet6_ip6_mcast, OID_AUTO, maxgrpsrc,
     CTLFLAG_RWTUN, &in6_mcast_maxgrpsrc, 0,
     "Max source filters per group");
 
 static u_long in6_mcast_maxsocksrc = IPV6_MAX_SOCK_SRC_FILTER;
 SYSCTL_ULONG(_net_inet6_ip6_mcast, OID_AUTO, maxsocksrc,
     CTLFLAG_RWTUN, &in6_mcast_maxsocksrc, 0,
     "Max source filters per socket");
 
 /* TODO Virtualize this switch. */
 int in6_mcast_loop = IPV6_DEFAULT_MULTICAST_LOOP;
 SYSCTL_INT(_net_inet6_ip6_mcast, OID_AUTO, loop, CTLFLAG_RWTUN,
     &in6_mcast_loop, 0, "Loopback multicast datagrams by default");
 
 static SYSCTL_NODE(_net_inet6_ip6_mcast, OID_AUTO, filters,
     CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_ip6_mcast_filters,
     "Per-interface stack-wide source filters");
 
 #ifdef KTR
 /*
  * Inline function which wraps assertions for a valid ifp.
  * The ifnet layer will set the ifma's ifp pointer to NULL if the ifp
  * is detached.
  */
 static int __inline
 in6m_is_ifp_detached(const struct in6_multi *inm)
 {
 	struct ifnet *ifp;
 
 	KASSERT(inm->in6m_ifma != NULL, ("%s: no ifma", __func__));
 	ifp = inm->in6m_ifma->ifma_ifp;
 	if (ifp != NULL) {
 		/*
 		 * Sanity check that network-layer notion of ifp is the
 		 * same as that of link-layer.
 		 */
 		KASSERT(inm->in6m_ifp == ifp, ("%s: bad ifp", __func__));
 	}
 
 	return (ifp == NULL);
 }
 #endif
 
 /*
  * Initialize an in6_mfilter structure to a known state at t0, t1
  * with an empty source filter list.
  */
 static __inline void
 im6f_init(struct in6_mfilter *imf, const int st0, const int st1)
 {
 	memset(imf, 0, sizeof(struct in6_mfilter));
 	RB_INIT(&imf->im6f_sources);
 	imf->im6f_st[0] = st0;
 	imf->im6f_st[1] = st1;
 }
 
 struct in6_mfilter *
 ip6_mfilter_alloc(const int mflags, const int st0, const int st1)
 {
 	struct in6_mfilter *imf;
 
 	imf = malloc(sizeof(*imf), M_IN6MFILTER, mflags);
 
 	if (imf != NULL)
 		im6f_init(imf, st0, st1);
 
 	return (imf);
 }
 
 void
 ip6_mfilter_free(struct in6_mfilter *imf)
 {
 
 	im6f_purge(imf);
 	free(imf, M_IN6MFILTER);
 }
 
 /*
  * Find an IPv6 multicast group entry for this ip6_moptions instance
  * which matches the specified group, and optionally an interface.
  * Return its index into the array, or -1 if not found.
  */
 static struct in6_mfilter *
 im6o_match_group(const struct ip6_moptions *imo, const struct ifnet *ifp,
     const struct sockaddr *group)
 {
 	const struct sockaddr_in6 *gsin6;
         struct in6_mfilter *imf;
         struct in6_multi *inm;
 
         gsin6 = (const struct sockaddr_in6 *)group;
 
 	IP6_MFILTER_FOREACH(imf, &imo->im6o_head) {
 		inm = imf->im6f_in6m;
 		if (inm == NULL)
 			continue;
 		if ((ifp == NULL || (inm->in6m_ifp == ifp)) &&
 		    IN6_ARE_ADDR_EQUAL(&inm->in6m_addr,
 		    &gsin6->sin6_addr)) {
 			break;
 		}
 	}
 	return (imf);
 }
 
 /*
  * Find an IPv6 multicast source entry for this imo which matches
  * the given group index for this socket, and source address.
  *
  * XXX TODO: The scope ID, if present in src, is stripped before
  * any comparison. We SHOULD enforce scope/zone checks where the source
  * filter entry has a link scope.
  *
  * NOTE: This does not check if the entry is in-mode, merely if
  * it exists, which may not be the desired behaviour.
  */
 static struct in6_msource *
 im6o_match_source(struct in6_mfilter *imf, const struct sockaddr *src)
 {
 	struct ip6_msource	 find;
 	struct ip6_msource	*ims;
 	const sockunion_t	*psa;
 
 	KASSERT(src->sa_family == AF_INET6, ("%s: !AF_INET6", __func__));
 
 	psa = (const sockunion_t *)src;
 	find.im6s_addr = psa->sin6.sin6_addr;
 	in6_clearscope(&find.im6s_addr);		/* XXX */
 	ims = RB_FIND(ip6_msource_tree, &imf->im6f_sources, &find);
 
 	return ((struct in6_msource *)ims);
 }
 
 /*
  * Perform filtering for multicast datagrams on a socket by group and source.
  *
  * Returns 0 if a datagram should be allowed through, or various error codes
  * if the socket was not a member of the group, or the source was muted, etc.
  */
 int
 im6o_mc_filter(const struct ip6_moptions *imo, const struct ifnet *ifp,
     const struct sockaddr *group, const struct sockaddr *src)
 {
 	struct in6_mfilter *imf;
 	struct in6_msource *ims;
 	int mode;
 
 	KASSERT(ifp != NULL, ("%s: null ifp", __func__));
 
 	imf = im6o_match_group(imo, ifp, group);
 	if (imf == NULL)
 		return (MCAST_NOTGMEMBER);
 
 	/*
 	 * Check if the source was included in an (S,G) join.
 	 * Allow reception on exclusive memberships by default,
 	 * reject reception on inclusive memberships by default.
 	 * Exclude source only if an in-mode exclude filter exists.
 	 * Include source only if an in-mode include filter exists.
 	 * NOTE: We are comparing group state here at MLD t1 (now)
 	 * with socket-layer t0 (since last downcall).
 	 */
 	mode = imf->im6f_st[1];
 	ims = im6o_match_source(imf, src);
 
 	if ((ims == NULL && mode == MCAST_INCLUDE) ||
 	    (ims != NULL && ims->im6sl_st[0] != mode))
 		return (MCAST_NOTSMEMBER);
 
 	return (MCAST_PASS);
 }
 
 /*
  * Find and return a reference to an in6_multi record for (ifp, group),
  * and bump its reference count.
  * If one does not exist, try to allocate it, and update link-layer multicast
  * filters on ifp to listen for group.
  * Assumes the IN6_MULTI lock is held across the call.
  * Return 0 if successful, otherwise return an appropriate error code.
  */
 static int
 in6_getmulti(struct ifnet *ifp, const struct in6_addr *group,
     struct in6_multi **pinm)
 {
 	struct epoch_tracker	 et;
 	struct sockaddr_in6	 gsin6;
 	struct ifmultiaddr	*ifma;
 	struct in6_multi	*inm;
 	int			 error;
 
 	error = 0;
 
 	/*
 	 * XXX: Accesses to ifma_protospec must be covered by IF_ADDR_LOCK;
 	 * if_addmulti() takes this mutex itself, so we must drop and
 	 * re-acquire around the call.
 	 */
 	IN6_MULTI_LOCK_ASSERT();
 	IN6_MULTI_LIST_LOCK();
 	IF_ADDR_WLOCK(ifp);
 	NET_EPOCH_ENTER(et);
 	/*
 	 * Does ifp support IPv6 multicasts?
 	 */
 	if (ifp->if_afdata[AF_INET6] == NULL)
 		error = ENODEV;
 	else
 		inm = in6m_lookup_locked(ifp, group);
 	NET_EPOCH_EXIT(et);
 
 	if (error != 0)
 		goto out_locked;
 
 	if (inm != NULL) {
 		/*
 		 * If we already joined this group, just bump the
 		 * refcount and return it.
 		 */
 		KASSERT(inm->in6m_refcount >= 1,
 		    ("%s: bad refcount %d", __func__, inm->in6m_refcount));
 		in6m_acquire_locked(inm);
 		*pinm = inm;
 		goto out_locked;
 	}
 
 	memset(&gsin6, 0, sizeof(gsin6));
 	gsin6.sin6_family = AF_INET6;
 	gsin6.sin6_len = sizeof(struct sockaddr_in6);
 	gsin6.sin6_addr = *group;
 
 	/*
 	 * Check if a link-layer group is already associated
 	 * with this network-layer group on the given ifnet.
 	 */
 	IN6_MULTI_LIST_UNLOCK();
 	IF_ADDR_WUNLOCK(ifp);
 	error = if_addmulti(ifp, (struct sockaddr *)&gsin6, &ifma);
 	if (error != 0)
 		return (error);
 	IN6_MULTI_LIST_LOCK();
 	IF_ADDR_WLOCK(ifp);
 
 	/*
 	 * If something other than netinet6 is occupying the link-layer
 	 * group, print a meaningful error message and back out of
 	 * the allocation.
 	 * Otherwise, bump the refcount on the existing network-layer
 	 * group association and return it.
 	 */
 	if (ifma->ifma_protospec != NULL) {
 		inm = (struct in6_multi *)ifma->ifma_protospec;
 #ifdef INVARIANTS
 		KASSERT(ifma->ifma_addr != NULL, ("%s: no ifma_addr",
 		    __func__));
 		KASSERT(ifma->ifma_addr->sa_family == AF_INET6,
 		    ("%s: ifma not AF_INET6", __func__));
 		KASSERT(inm != NULL, ("%s: no ifma_protospec", __func__));
 		if (inm->in6m_ifma != ifma || inm->in6m_ifp != ifp ||
 		    !IN6_ARE_ADDR_EQUAL(&inm->in6m_addr, group))
 			panic("%s: ifma %p is inconsistent with %p (%p)",
 			    __func__, ifma, inm, group);
 #endif
 		in6m_acquire_locked(inm);
 		*pinm = inm;
 		goto out_locked;
 	}
 
 	IF_ADDR_WLOCK_ASSERT(ifp);
 
 	/*
 	 * A new in6_multi record is needed; allocate and initialize it.
 	 * We DO NOT perform an MLD join as the in6_ layer may need to
 	 * push an initial source list down to MLD to support SSM.
 	 *
 	 * The initial source filter state is INCLUDE, {} as per the RFC.
 	 * Pending state-changes per group are subject to a bounds check.
 	 */
 	inm = malloc(sizeof(*inm), M_IP6MADDR, M_NOWAIT | M_ZERO);
 	if (inm == NULL) {
 		IN6_MULTI_LIST_UNLOCK();
 		IF_ADDR_WUNLOCK(ifp);
 		if_delmulti_ifma(ifma);
 		return (ENOMEM);
 	}
 	inm->in6m_addr = *group;
 	inm->in6m_ifp = ifp;
 	inm->in6m_mli = MLD_IFINFO(ifp);
 	inm->in6m_ifma = ifma;
 	inm->in6m_refcount = 1;
 	inm->in6m_state = MLD_NOT_MEMBER;
 	mbufq_init(&inm->in6m_scq, MLD_MAX_STATE_CHANGES);
 
 	inm->in6m_st[0].iss_fmode = MCAST_UNDEFINED;
 	inm->in6m_st[1].iss_fmode = MCAST_UNDEFINED;
 	RB_INIT(&inm->in6m_srcs);
 
 	ifma->ifma_protospec = inm;
 	*pinm = inm;
 
  out_locked:
 	IN6_MULTI_LIST_UNLOCK();
 	IF_ADDR_WUNLOCK(ifp);
 	return (error);
 }
 
 /*
  * Drop a reference to an in6_multi record.
  *
  * If the refcount drops to 0, free the in6_multi record and
  * delete the underlying link-layer membership.
  */
 static void
 in6m_release(struct in6_multi *inm)
 {
 	struct ifmultiaddr *ifma;
 	struct ifnet *ifp;
 
 	CTR2(KTR_MLD, "%s: refcount is %d", __func__, inm->in6m_refcount);
 
 	MPASS(inm->in6m_refcount == 0);
 	CTR2(KTR_MLD, "%s: freeing inm %p", __func__, inm);
 
 	ifma = inm->in6m_ifma;
 	ifp = inm->in6m_ifp;
 	MPASS(ifma->ifma_llifma == NULL);
 
 	/* XXX this access is not covered by IF_ADDR_LOCK */
 	CTR2(KTR_MLD, "%s: purging ifma %p", __func__, ifma);
 	KASSERT(ifma->ifma_protospec == NULL,
 	    ("%s: ifma_protospec != NULL", __func__));
 	if (ifp == NULL)
 		ifp = ifma->ifma_ifp;
 
 	if (ifp != NULL) {
 		CURVNET_SET(ifp->if_vnet);
 		in6m_purge(inm);
 		free(inm, M_IP6MADDR);
 		if_delmulti_ifma_flags(ifma, 1);
 		CURVNET_RESTORE();
 		if_rele(ifp);
 	} else {
 		in6m_purge(inm);
 		free(inm, M_IP6MADDR);
 		if_delmulti_ifma_flags(ifma, 1);
 	}
 }
 
 /*
  * Interface detach can happen in a taskqueue thread context, so we must use a
  * dedicated thread to avoid deadlocks when draining in6m_release tasks.
  */
 TASKQUEUE_DEFINE_THREAD(in6m_free);
 static struct in6_multi_head in6m_free_list = SLIST_HEAD_INITIALIZER();
 static void in6m_release_task(void *arg __unused, int pending __unused);
 static struct task in6m_free_task = TASK_INITIALIZER(0, in6m_release_task, NULL);
 
 void
 in6m_release_list_deferred(struct in6_multi_head *inmh)
 {
 	if (SLIST_EMPTY(inmh))
 		return;
 	mtx_lock(&in6_multi_free_mtx);
 	SLIST_CONCAT(&in6m_free_list, inmh, in6_multi, in6m_nrele);
 	mtx_unlock(&in6_multi_free_mtx);
 	taskqueue_enqueue(taskqueue_in6m_free, &in6m_free_task);
 }
 
 void
 in6m_release_wait(void *arg __unused)
 {
 
 	/*
 	 * Make sure all pending multicast addresses are freed before
 	 * the VNET or network device is destroyed:
 	 */
 	taskqueue_drain_all(taskqueue_in6m_free);
 }
 #ifdef VIMAGE
 /* XXX-BZ FIXME, see D24914. */
 VNET_SYSUNINIT(in6m_release_wait, SI_SUB_PROTO_DOMAIN, SI_ORDER_FIRST, in6m_release_wait, NULL);
 #endif
 
 void
 in6m_disconnect_locked(struct in6_multi_head *inmh, struct in6_multi *inm)
 {
 	struct ifnet *ifp;
 	struct ifaddr *ifa;
 	struct in6_ifaddr *ifa6;
 	struct in6_multi_mship *imm, *imm_tmp;
 	struct ifmultiaddr *ifma, *ll_ifma;
 
 	IN6_MULTI_LIST_LOCK_ASSERT();
 
 	ifp = inm->in6m_ifp;
 	if (ifp == NULL)
 		return;		/* already called */
 
 	inm->in6m_ifp = NULL;
 	IF_ADDR_WLOCK_ASSERT(ifp);
 	ifma = inm->in6m_ifma;
 	if (ifma == NULL)
 		return;
 
 	if_ref(ifp);
 	if (ifma->ifma_flags & IFMA_F_ENQUEUED) {
 		CK_STAILQ_REMOVE(&ifp->if_multiaddrs, ifma, ifmultiaddr, ifma_link);
 		ifma->ifma_flags &= ~IFMA_F_ENQUEUED;
 	}
 	MCDPRINTF("removed ifma: %p from %s\n", ifma, ifp->if_xname);
 	if ((ll_ifma = ifma->ifma_llifma) != NULL) {
 		MPASS(ifma != ll_ifma);
 		ifma->ifma_llifma = NULL;
 		MPASS(ll_ifma->ifma_llifma == NULL);
 		MPASS(ll_ifma->ifma_ifp == ifp);
 		if (--ll_ifma->ifma_refcount == 0) {
 			if (ll_ifma->ifma_flags & IFMA_F_ENQUEUED) {
 				CK_STAILQ_REMOVE(&ifp->if_multiaddrs, ll_ifma, ifmultiaddr, ifma_link);
 				ll_ifma->ifma_flags &= ~IFMA_F_ENQUEUED;
 			}
 			MCDPRINTF("removed ll_ifma: %p from %s\n", ll_ifma, ifp->if_xname);
 			if_freemulti(ll_ifma);
 		}
 	}
 	CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 		if (ifa->ifa_addr->sa_family != AF_INET6)
 			continue;
 		ifa6 = (void *)ifa;
 		LIST_FOREACH_SAFE(imm, &ifa6->ia6_memberships,
 		    i6mm_chain, imm_tmp) {
 			if (inm == imm->i6mm_maddr) {
 				LIST_REMOVE(imm, i6mm_chain);
 				free(imm, M_IP6MADDR);
 				in6m_rele_locked(inmh, inm);
 			}
 		}
 	}
 }
 
 static void
 in6m_release_task(void *arg __unused, int pending __unused)
 {
 	struct in6_multi_head in6m_free_tmp;
 	struct in6_multi *inm, *tinm;
 
 	SLIST_INIT(&in6m_free_tmp);
 	mtx_lock(&in6_multi_free_mtx);
 	SLIST_CONCAT(&in6m_free_tmp, &in6m_free_list, in6_multi, in6m_nrele);
 	mtx_unlock(&in6_multi_free_mtx);
 	IN6_MULTI_LOCK();
 	SLIST_FOREACH_SAFE(inm, &in6m_free_tmp, in6m_nrele, tinm) {
 		SLIST_REMOVE_HEAD(&in6m_free_tmp, in6m_nrele);
 		in6m_release(inm);
 	}
 	IN6_MULTI_UNLOCK();
 }
 
 /*
  * Clear recorded source entries for a group.
  * Used by the MLD code. Caller must hold the IN6_MULTI lock.
  * FIXME: Should reap.
  */
 void
 in6m_clear_recorded(struct in6_multi *inm)
 {
 	struct ip6_msource	*ims;
 
 	IN6_MULTI_LIST_LOCK_ASSERT();
 
 	RB_FOREACH(ims, ip6_msource_tree, &inm->in6m_srcs) {
 		if (ims->im6s_stp) {
 			ims->im6s_stp = 0;
 			--inm->in6m_st[1].iss_rec;
 		}
 	}
 	KASSERT(inm->in6m_st[1].iss_rec == 0,
 	    ("%s: iss_rec %d not 0", __func__, inm->in6m_st[1].iss_rec));
 }
 
 /*
  * Record a source as pending for a Source-Group MLDv2 query.
  * This lives here as it modifies the shared tree.
  *
  * inm is the group descriptor.
  * naddr is the address of the source to record in network-byte order.
  *
  * If the net.inet6.mld.sgalloc sysctl is non-zero, we will
  * lazy-allocate a source node in response to an SG query.
  * Otherwise, no allocation is performed. This saves some memory
  * with the trade-off that the source will not be reported to the
  * router if joined in the window between the query response and
  * the group actually being joined on the local host.
  *
  * VIMAGE: XXX: Currently the mld_sgalloc feature has been removed.
  * This turns off the allocation of a recorded source entry if
  * the group has not been joined.
  *
  * Return 0 if the source didn't exist or was already marked as recorded.
  * Return 1 if the source was marked as recorded by this function.
  * Return <0 if any error occurred (negated errno code).
  */
 int
 in6m_record_source(struct in6_multi *inm, const struct in6_addr *addr)
 {
 	struct ip6_msource	 find;
 	struct ip6_msource	*ims, *nims;
 
 	IN6_MULTI_LIST_LOCK_ASSERT();
 
 	find.im6s_addr = *addr;
 	ims = RB_FIND(ip6_msource_tree, &inm->in6m_srcs, &find);
 	if (ims && ims->im6s_stp)
 		return (0);
 	if (ims == NULL) {
 		if (inm->in6m_nsrc == in6_mcast_maxgrpsrc)
 			return (-ENOSPC);
 		nims = malloc(sizeof(struct ip6_msource), M_IP6MSOURCE,
 		    M_NOWAIT | M_ZERO);
 		if (nims == NULL)
 			return (-ENOMEM);
 		nims->im6s_addr = find.im6s_addr;
 		RB_INSERT(ip6_msource_tree, &inm->in6m_srcs, nims);
 		++inm->in6m_nsrc;
 		ims = nims;
 	}
 
 	/*
 	 * Mark the source as recorded and update the recorded
 	 * source count.
 	 */
 	++ims->im6s_stp;
 	++inm->in6m_st[1].iss_rec;
 
 	return (1);
 }
 
 /*
  * Return a pointer to an in6_msource owned by an in6_mfilter,
  * given its source address.
  * Lazy-allocate if needed. If this is a new entry its filter state is
  * undefined at t0.
  *
  * imf is the filter set being modified.
  * addr is the source address.
  *
  * SMPng: May be called with locks held; malloc must not block.
  */
 static int
 im6f_get_source(struct in6_mfilter *imf, const struct sockaddr_in6 *psin,
     struct in6_msource **plims)
 {
 	struct ip6_msource	 find;
 	struct ip6_msource	*ims, *nims;
 	struct in6_msource	*lims;
 	int			 error;
 
 	error = 0;
 	ims = NULL;
 	lims = NULL;
 
 	find.im6s_addr = psin->sin6_addr;
 	ims = RB_FIND(ip6_msource_tree, &imf->im6f_sources, &find);
 	lims = (struct in6_msource *)ims;
 	if (lims == NULL) {
 		if (imf->im6f_nsrc == in6_mcast_maxsocksrc)
 			return (ENOSPC);
 		nims = malloc(sizeof(struct in6_msource), M_IN6MFILTER,
 		    M_NOWAIT | M_ZERO);
 		if (nims == NULL)
 			return (ENOMEM);
 		lims = (struct in6_msource *)nims;
 		lims->im6s_addr = find.im6s_addr;
 		lims->im6sl_st[0] = MCAST_UNDEFINED;
 		RB_INSERT(ip6_msource_tree, &imf->im6f_sources, nims);
 		++imf->im6f_nsrc;
 	}
 
 	*plims = lims;
 
 	return (error);
 }
 
 /*
  * Graft a source entry into an existing socket-layer filter set,
  * maintaining any required invariants and checking allocations.
  *
  * The source is marked as being in the new filter mode at t1.
  *
  * Return the pointer to the new node, otherwise return NULL.
  */
 static struct in6_msource *
 im6f_graft(struct in6_mfilter *imf, const uint8_t st1,
     const struct sockaddr_in6 *psin)
 {
 	struct ip6_msource	*nims;
 	struct in6_msource	*lims;
 
 	nims = malloc(sizeof(struct in6_msource), M_IN6MFILTER,
 	    M_NOWAIT | M_ZERO);
 	if (nims == NULL)
 		return (NULL);
 	lims = (struct in6_msource *)nims;
 	lims->im6s_addr = psin->sin6_addr;
 	lims->im6sl_st[0] = MCAST_UNDEFINED;
 	lims->im6sl_st[1] = st1;
 	RB_INSERT(ip6_msource_tree, &imf->im6f_sources, nims);
 	++imf->im6f_nsrc;
 
 	return (lims);
 }
 
 /*
  * Prune a source entry from an existing socket-layer filter set,
  * maintaining any required invariants and checking allocations.
  *
  * The source is marked as being left at t1, it is not freed.
  *
  * Return 0 if no error occurred, otherwise return an errno value.
  */
 static int
 im6f_prune(struct in6_mfilter *imf, const struct sockaddr_in6 *psin)
 {
 	struct ip6_msource	 find;
 	struct ip6_msource	*ims;
 	struct in6_msource	*lims;
 
 	find.im6s_addr = psin->sin6_addr;
 	ims = RB_FIND(ip6_msource_tree, &imf->im6f_sources, &find);
 	if (ims == NULL)
 		return (ENOENT);
 	lims = (struct in6_msource *)ims;
 	lims->im6sl_st[1] = MCAST_UNDEFINED;
 	return (0);
 }
 
 /*
  * Revert socket-layer filter set deltas at t1 to t0 state.
  */
 static void
 im6f_rollback(struct in6_mfilter *imf)
 {
 	struct ip6_msource	*ims, *tims;
 	struct in6_msource	*lims;
 
 	RB_FOREACH_SAFE(ims, ip6_msource_tree, &imf->im6f_sources, tims) {
 		lims = (struct in6_msource *)ims;
 		if (lims->im6sl_st[0] == lims->im6sl_st[1]) {
 			/* no change at t1 */
 			continue;
 		} else if (lims->im6sl_st[0] != MCAST_UNDEFINED) {
 			/* revert change to existing source at t1 */
 			lims->im6sl_st[1] = lims->im6sl_st[0];
 		} else {
 			/* revert source added t1 */
 			CTR2(KTR_MLD, "%s: free ims %p", __func__, ims);
 			RB_REMOVE(ip6_msource_tree, &imf->im6f_sources, ims);
 			free(ims, M_IN6MFILTER);
 			imf->im6f_nsrc--;
 		}
 	}
 	imf->im6f_st[1] = imf->im6f_st[0];
 }
 
 /*
  * Mark socket-layer filter set as INCLUDE {} at t1.
  */
 static void
 im6f_leave(struct in6_mfilter *imf)
 {
 	struct ip6_msource	*ims;
 	struct in6_msource	*lims;
 
 	RB_FOREACH(ims, ip6_msource_tree, &imf->im6f_sources) {
 		lims = (struct in6_msource *)ims;
 		lims->im6sl_st[1] = MCAST_UNDEFINED;
 	}
 	imf->im6f_st[1] = MCAST_INCLUDE;
 }
 
 /*
  * Mark socket-layer filter set deltas as committed.
  */
 static void
 im6f_commit(struct in6_mfilter *imf)
 {
 	struct ip6_msource	*ims;
 	struct in6_msource	*lims;
 
 	RB_FOREACH(ims, ip6_msource_tree, &imf->im6f_sources) {
 		lims = (struct in6_msource *)ims;
 		lims->im6sl_st[0] = lims->im6sl_st[1];
 	}
 	imf->im6f_st[0] = imf->im6f_st[1];
 }
 
 /*
  * Reap unreferenced sources from socket-layer filter set.
  */
 static void
 im6f_reap(struct in6_mfilter *imf)
 {
 	struct ip6_msource	*ims, *tims;
 	struct in6_msource	*lims;
 
 	RB_FOREACH_SAFE(ims, ip6_msource_tree, &imf->im6f_sources, tims) {
 		lims = (struct in6_msource *)ims;
 		if ((lims->im6sl_st[0] == MCAST_UNDEFINED) &&
 		    (lims->im6sl_st[1] == MCAST_UNDEFINED)) {
 			CTR2(KTR_MLD, "%s: free lims %p", __func__, ims);
 			RB_REMOVE(ip6_msource_tree, &imf->im6f_sources, ims);
 			free(ims, M_IN6MFILTER);
 			imf->im6f_nsrc--;
 		}
 	}
 }
 
 /*
  * Purge socket-layer filter set.
  */
 static void
 im6f_purge(struct in6_mfilter *imf)
 {
 	struct ip6_msource	*ims, *tims;
 
 	RB_FOREACH_SAFE(ims, ip6_msource_tree, &imf->im6f_sources, tims) {
 		CTR2(KTR_MLD, "%s: free ims %p", __func__, ims);
 		RB_REMOVE(ip6_msource_tree, &imf->im6f_sources, ims);
 		free(ims, M_IN6MFILTER);
 		imf->im6f_nsrc--;
 	}
 	imf->im6f_st[0] = imf->im6f_st[1] = MCAST_UNDEFINED;
 	KASSERT(RB_EMPTY(&imf->im6f_sources),
 	    ("%s: im6f_sources not empty", __func__));
 }
 
 /*
  * Look up a source filter entry for a multicast group.
  *
  * inm is the group descriptor to work with.
  * addr is the IPv6 address to look up.
  * noalloc may be non-zero to suppress allocation of sources.
  * *pims will be set to the address of the retrieved or allocated source.
  *
  * SMPng: NOTE: may be called with locks held.
  * Return 0 if successful, otherwise return a non-zero error code.
  */
 static int
 in6m_get_source(struct in6_multi *inm, const struct in6_addr *addr,
     const int noalloc, struct ip6_msource **pims)
 {
 	struct ip6_msource	 find;
 	struct ip6_msource	*ims, *nims;
 #ifdef KTR
 	char			 ip6tbuf[INET6_ADDRSTRLEN];
 #endif
 
 	find.im6s_addr = *addr;
 	ims = RB_FIND(ip6_msource_tree, &inm->in6m_srcs, &find);
 	if (ims == NULL && !noalloc) {
 		if (inm->in6m_nsrc == in6_mcast_maxgrpsrc)
 			return (ENOSPC);
 		nims = malloc(sizeof(struct ip6_msource), M_IP6MSOURCE,
 		    M_NOWAIT | M_ZERO);
 		if (nims == NULL)
 			return (ENOMEM);
 		nims->im6s_addr = *addr;
 		RB_INSERT(ip6_msource_tree, &inm->in6m_srcs, nims);
 		++inm->in6m_nsrc;
 		ims = nims;
 		CTR3(KTR_MLD, "%s: allocated %s as %p", __func__,
 		    ip6_sprintf(ip6tbuf, addr), ims);
 	}
 
 	*pims = ims;
 	return (0);
 }
 
 /*
  * Merge socket-layer source into MLD-layer source.
  * If rollback is non-zero, perform the inverse of the merge.
  */
 static void
 im6s_merge(struct ip6_msource *ims, const struct in6_msource *lims,
     const int rollback)
 {
 	int n = rollback ? -1 : 1;
 #ifdef KTR
 	char ip6tbuf[INET6_ADDRSTRLEN];
 
 	ip6_sprintf(ip6tbuf, &lims->im6s_addr);
 #endif
 
 	if (lims->im6sl_st[0] == MCAST_EXCLUDE) {
 		CTR3(KTR_MLD, "%s: t1 ex -= %d on %s", __func__, n, ip6tbuf);
 		ims->im6s_st[1].ex -= n;
 	} else if (lims->im6sl_st[0] == MCAST_INCLUDE) {
 		CTR3(KTR_MLD, "%s: t1 in -= %d on %s", __func__, n, ip6tbuf);
 		ims->im6s_st[1].in -= n;
 	}
 
 	if (lims->im6sl_st[1] == MCAST_EXCLUDE) {
 		CTR3(KTR_MLD, "%s: t1 ex += %d on %s", __func__, n, ip6tbuf);
 		ims->im6s_st[1].ex += n;
 	} else if (lims->im6sl_st[1] == MCAST_INCLUDE) {
 		CTR3(KTR_MLD, "%s: t1 in += %d on %s", __func__, n, ip6tbuf);
 		ims->im6s_st[1].in += n;
 	}
 }
 
 /*
  * Atomically update the global in6_multi state, when a membership's
  * filter list is being updated in any way.
  *
  * imf is the per-inpcb-membership group filter pointer.
  * A fake imf may be passed for in-kernel consumers.
  *
  * XXX This is a candidate for a set-symmetric-difference style loop
  * which would eliminate the repeated lookup from root of ims nodes,
  * as they share the same key space.
  *
  * If any error occurred this function will back out of refcounts
  * and return a non-zero value.
  */
 static int
 in6m_merge(struct in6_multi *inm, /*const*/ struct in6_mfilter *imf)
 {
 	struct ip6_msource	*ims, *nims;
 	struct in6_msource	*lims;
 	int			 schanged, error;
 	int			 nsrc0, nsrc1;
 
 	schanged = 0;
 	error = 0;
 	nsrc1 = nsrc0 = 0;
 	IN6_MULTI_LIST_LOCK_ASSERT();
 
 	/*
 	 * Update the source filters first, as this may fail.
 	 * Maintain count of in-mode filters at t0, t1. These are
 	 * used to work out if we transition into ASM mode or not.
 	 * Maintain a count of source filters whose state was
 	 * actually modified by this operation.
 	 */
 	RB_FOREACH(ims, ip6_msource_tree, &imf->im6f_sources) {
 		lims = (struct in6_msource *)ims;
 		if (lims->im6sl_st[0] == imf->im6f_st[0]) nsrc0++;
 		if (lims->im6sl_st[1] == imf->im6f_st[1]) nsrc1++;
 		if (lims->im6sl_st[0] == lims->im6sl_st[1]) continue;
 		error = in6m_get_source(inm, &lims->im6s_addr, 0, &nims);
 		++schanged;
 		if (error)
 			break;
 		im6s_merge(nims, lims, 0);
 	}
 	if (error) {
 		struct ip6_msource *bims;
 
 		RB_FOREACH_REVERSE_FROM(ims, ip6_msource_tree, nims) {
 			lims = (struct in6_msource *)ims;
 			if (lims->im6sl_st[0] == lims->im6sl_st[1])
 				continue;
 			(void)in6m_get_source(inm, &lims->im6s_addr, 1, &bims);
 			if (bims == NULL)
 				continue;
 			im6s_merge(bims, lims, 1);
 		}
 		goto out_reap;
 	}
 
 	CTR3(KTR_MLD, "%s: imf filters in-mode: %d at t0, %d at t1",
 	    __func__, nsrc0, nsrc1);
 
 	/* Handle transition between INCLUDE {n} and INCLUDE {} on socket. */
 	if (imf->im6f_st[0] == imf->im6f_st[1] &&
 	    imf->im6f_st[1] == MCAST_INCLUDE) {
 		if (nsrc1 == 0) {
 			CTR1(KTR_MLD, "%s: --in on inm at t1", __func__);
 			--inm->in6m_st[1].iss_in;
 		}
 	}
 
 	/* Handle filter mode transition on socket. */
 	if (imf->im6f_st[0] != imf->im6f_st[1]) {
 		CTR3(KTR_MLD, "%s: imf transition %d to %d",
 		    __func__, imf->im6f_st[0], imf->im6f_st[1]);
 
 		if (imf->im6f_st[0] == MCAST_EXCLUDE) {
 			CTR1(KTR_MLD, "%s: --ex on inm at t1", __func__);
 			--inm->in6m_st[1].iss_ex;
 		} else if (imf->im6f_st[0] == MCAST_INCLUDE) {
 			CTR1(KTR_MLD, "%s: --in on inm at t1", __func__);
 			--inm->in6m_st[1].iss_in;
 		}
 
 		if (imf->im6f_st[1] == MCAST_EXCLUDE) {
 			CTR1(KTR_MLD, "%s: ex++ on inm at t1", __func__);
 			inm->in6m_st[1].iss_ex++;
 		} else if (imf->im6f_st[1] == MCAST_INCLUDE && nsrc1 > 0) {
 			CTR1(KTR_MLD, "%s: in++ on inm at t1", __func__);
 			inm->in6m_st[1].iss_in++;
 		}
 	}
 
 	/*
 	 * Track inm filter state in terms of listener counts.
 	 * If there are any exclusive listeners, stack-wide
 	 * membership is exclusive.
 	 * Otherwise, if only inclusive listeners, stack-wide is inclusive.
 	 * If no listeners remain, state is undefined at t1,
 	 * and the MLD lifecycle for this group should finish.
 	 */
 	if (inm->in6m_st[1].iss_ex > 0) {
 		CTR1(KTR_MLD, "%s: transition to EX", __func__);
 		inm->in6m_st[1].iss_fmode = MCAST_EXCLUDE;
 	} else if (inm->in6m_st[1].iss_in > 0) {
 		CTR1(KTR_MLD, "%s: transition to IN", __func__);
 		inm->in6m_st[1].iss_fmode = MCAST_INCLUDE;
 	} else {
 		CTR1(KTR_MLD, "%s: transition to UNDEF", __func__);
 		inm->in6m_st[1].iss_fmode = MCAST_UNDEFINED;
 	}
 
 	/* Decrement ASM listener count on transition out of ASM mode. */
 	if (imf->im6f_st[0] == MCAST_EXCLUDE && nsrc0 == 0) {
 		if ((imf->im6f_st[1] != MCAST_EXCLUDE) ||
 		    (imf->im6f_st[1] == MCAST_EXCLUDE && nsrc1 > 0)) {
 			CTR1(KTR_MLD, "%s: --asm on inm at t1", __func__);
 			--inm->in6m_st[1].iss_asm;
 		}
 	}
 
 	/* Increment ASM listener count on transition to ASM mode. */
 	if (imf->im6f_st[1] == MCAST_EXCLUDE && nsrc1 == 0) {
 		CTR1(KTR_MLD, "%s: asm++ on inm at t1", __func__);
 		inm->in6m_st[1].iss_asm++;
 	}
 
 	CTR3(KTR_MLD, "%s: merged imf %p to inm %p", __func__, imf, inm);
 	in6m_print(inm);
 
 out_reap:
 	if (schanged > 0) {
 		CTR1(KTR_MLD, "%s: sources changed; reaping", __func__);
 		in6m_reap(inm);
 	}
 	return (error);
 }
 
 /*
  * Mark an in6_multi's filter set deltas as committed.
  * Called by MLD after a state change has been enqueued.
  */
 void
 in6m_commit(struct in6_multi *inm)
 {
 	struct ip6_msource	*ims;
 
 	CTR2(KTR_MLD, "%s: commit inm %p", __func__, inm);
 	CTR1(KTR_MLD, "%s: pre commit:", __func__);
 	in6m_print(inm);
 
 	RB_FOREACH(ims, ip6_msource_tree, &inm->in6m_srcs) {
 		ims->im6s_st[0] = ims->im6s_st[1];
 	}
 	inm->in6m_st[0] = inm->in6m_st[1];
 }
 
 /*
  * Reap unreferenced nodes from an in6_multi's filter set.
  */
 static void
 in6m_reap(struct in6_multi *inm)
 {
 	struct ip6_msource	*ims, *tims;
 
 	RB_FOREACH_SAFE(ims, ip6_msource_tree, &inm->in6m_srcs, tims) {
 		if (ims->im6s_st[0].ex > 0 || ims->im6s_st[0].in > 0 ||
 		    ims->im6s_st[1].ex > 0 || ims->im6s_st[1].in > 0 ||
 		    ims->im6s_stp != 0)
 			continue;
 		CTR2(KTR_MLD, "%s: free ims %p", __func__, ims);
 		RB_REMOVE(ip6_msource_tree, &inm->in6m_srcs, ims);
 		free(ims, M_IP6MSOURCE);
 		inm->in6m_nsrc--;
 	}
 }
 
 /*
  * Purge all source nodes from an in6_multi's filter set.
  */
 static void
 in6m_purge(struct in6_multi *inm)
 {
 	struct ip6_msource	*ims, *tims;
 
 	RB_FOREACH_SAFE(ims, ip6_msource_tree, &inm->in6m_srcs, tims) {
 		CTR2(KTR_MLD, "%s: free ims %p", __func__, ims);
 		RB_REMOVE(ip6_msource_tree, &inm->in6m_srcs, ims);
 		free(ims, M_IP6MSOURCE);
 		inm->in6m_nsrc--;
 	}
 	/* Free state-change requests that might be queued. */
 	mbufq_drain(&inm->in6m_scq);
 }
 
 /*
  * Join a multicast address w/o sources.
  * KAME compatibility entry point.
  *
  * SMPng: Assume no mc locks held by caller.
  */
 int
 in6_joingroup(struct ifnet *ifp, const struct in6_addr *mcaddr,
     /*const*/ struct in6_mfilter *imf, struct in6_multi **pinm,
     const int delay)
 {
 	int error;
 
 	IN6_MULTI_LOCK();
 	error = in6_joingroup_locked(ifp, mcaddr, NULL, pinm, delay);
 	IN6_MULTI_UNLOCK();
 	return (error);
 }
 
 /*
  * Join a multicast group; real entry point.
  *
  * Only preserves atomicity at inm level.
  * NOTE: imf argument cannot be const due to sys/tree.h limitations.
  *
  * If the MLD downcall fails, the group is not joined, and an error
  * code is returned.
  */
 static int
 in6_joingroup_locked(struct ifnet *ifp, const struct in6_addr *mcaddr,
     /*const*/ struct in6_mfilter *imf, struct in6_multi **pinm,
     const int delay)
 {
 	struct in6_multi_head    inmh;
 	struct in6_mfilter	 timf;
 	struct in6_multi	*inm;
 	struct ifmultiaddr *ifma;
 	int			 error;
 #ifdef KTR
 	char			 ip6tbuf[INET6_ADDRSTRLEN];
 #endif
 
 	/*
 	 * Sanity: Check scope zone ID was set for ifp, if and
 	 * only if group is scoped to an interface.
 	 */
 	KASSERT(IN6_IS_ADDR_MULTICAST(mcaddr),
 	    ("%s: not a multicast address", __func__));
 	if (IN6_IS_ADDR_MC_LINKLOCAL(mcaddr) ||
 	    IN6_IS_ADDR_MC_INTFACELOCAL(mcaddr)) {
 		KASSERT(mcaddr->s6_addr16[1] != 0,
 		    ("%s: scope zone ID not set", __func__));
 	}
 
 	IN6_MULTI_LOCK_ASSERT();
 	IN6_MULTI_LIST_UNLOCK_ASSERT();
 
 	CTR4(KTR_MLD, "%s: join %s on %p(%s))", __func__,
 	    ip6_sprintf(ip6tbuf, mcaddr), ifp, if_name(ifp));
 
 	error = 0;
 	inm = NULL;
 
 	/*
 	 * If no imf was specified (i.e. kernel consumer),
 	 * fake one up and assume it is an ASM join.
 	 */
 	if (imf == NULL) {
 		im6f_init(&timf, MCAST_UNDEFINED, MCAST_EXCLUDE);
 		imf = &timf;
 	}
 	error = in6_getmulti(ifp, mcaddr, &inm);
 	if (error) {
 		CTR1(KTR_MLD, "%s: in6_getmulti() failure", __func__);
 		return (error);
 	}
 
 	IN6_MULTI_LIST_LOCK();
 	CTR1(KTR_MLD, "%s: merge inm state", __func__);
 	error = in6m_merge(inm, imf);
 	if (error) {
 		CTR1(KTR_MLD, "%s: failed to merge inm state", __func__);
 		goto out_in6m_release;
 	}
 
 	CTR1(KTR_MLD, "%s: doing mld downcall", __func__);
 	error = mld_change_state(inm, delay);
 	if (error) {
 		CTR1(KTR_MLD, "%s: failed to update source", __func__);
 		goto out_in6m_release;
 	}
 
 out_in6m_release:
 	SLIST_INIT(&inmh);
 	if (error) {
 		struct epoch_tracker et;
 
 		CTR2(KTR_MLD, "%s: dropping ref on %p", __func__, inm);
 		IF_ADDR_WLOCK(ifp);
 		NET_EPOCH_ENTER(et);
 		CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
 			if (ifma->ifma_protospec == inm) {
 				ifma->ifma_protospec = NULL;
 				break;
 			}
 		}
 		in6m_disconnect_locked(&inmh, inm);
 		in6m_rele_locked(&inmh, inm);
 		NET_EPOCH_EXIT(et);
 		IF_ADDR_WUNLOCK(ifp);
 	} else {
 		*pinm = inm;
 	}
 	IN6_MULTI_LIST_UNLOCK();
 	in6m_release_list_deferred(&inmh);
 	return (error);
 }
 
 /*
  * Leave a multicast group; unlocked entry point.
  */
 int
 in6_leavegroup(struct in6_multi *inm, /*const*/ struct in6_mfilter *imf)
 {
 	int error;
 
 	IN6_MULTI_LOCK();
 	error = in6_leavegroup_locked(inm, imf);
 	IN6_MULTI_UNLOCK();
 	return (error);
 }
 
 /*
  * Leave a multicast group; real entry point.
  * All source filters will be expunged.
  *
  * Only preserves atomicity at inm level.
  *
  * Holding the write lock for the INP which contains imf
  * is highly advisable. We can't assert for it as imf does not
  * contain a back-pointer to the owning inp.
  *
  * Note: This is not the same as in6m_release(*) as this function also
  * makes a state change downcall into MLD.
  */
 int
 in6_leavegroup_locked(struct in6_multi *inm, /*const*/ struct in6_mfilter *imf)
 {
 	struct in6_multi_head	 inmh;
 	struct in6_mfilter	 timf;
 	struct ifnet *ifp;
 	int			 error;
 #ifdef KTR
 	char			 ip6tbuf[INET6_ADDRSTRLEN];
 #endif
 
 	error = 0;
 
 	IN6_MULTI_LOCK_ASSERT();
 
 	CTR5(KTR_MLD, "%s: leave inm %p, %s/%s, imf %p", __func__,
 	    inm, ip6_sprintf(ip6tbuf, &inm->in6m_addr),
 	    (in6m_is_ifp_detached(inm) ? "null" : if_name(inm->in6m_ifp)),
 	    imf);
 
 	/*
 	 * If no imf was specified (i.e. kernel consumer),
 	 * fake one up and assume it is an ASM join.
 	 */
 	if (imf == NULL) {
 		im6f_init(&timf, MCAST_EXCLUDE, MCAST_UNDEFINED);
 		imf = &timf;
 	}
 
 	/*
 	 * Begin state merge transaction at MLD layer.
 	 *
 	 * As this particular invocation should not cause any memory
 	 * to be allocated, and there is no opportunity to roll back
 	 * the transaction, it MUST NOT fail.
 	 */
 
 	ifp = inm->in6m_ifp;
 	IN6_MULTI_LIST_LOCK();
 	CTR1(KTR_MLD, "%s: merge inm state", __func__);
 	error = in6m_merge(inm, imf);
 	KASSERT(error == 0, ("%s: failed to merge inm state", __func__));
 
 	CTR1(KTR_MLD, "%s: doing mld downcall", __func__);
 	error = 0;
 	if (ifp)
 		error = mld_change_state(inm, 0);
 	if (error)
 		CTR1(KTR_MLD, "%s: failed mld downcall", __func__);
 
 	CTR2(KTR_MLD, "%s: dropping ref on %p", __func__, inm);
 	if (ifp)
 		IF_ADDR_WLOCK(ifp);
 
 	SLIST_INIT(&inmh);
 	if (inm->in6m_refcount == 1)
 		in6m_disconnect_locked(&inmh, inm);
 	in6m_rele_locked(&inmh, inm);
 	if (ifp)
 		IF_ADDR_WUNLOCK(ifp);
 	IN6_MULTI_LIST_UNLOCK();
 	in6m_release_list_deferred(&inmh);
 	return (error);
 }
 
 /*
  * Block or unblock an ASM multicast source on an inpcb.
  * This implements the delta-based API described in RFC 3678.
  *
  * The delta-based API applies only to exclusive-mode memberships.
  * An MLD downcall will be performed.
  *
  * SMPng: NOTE: Must take Giant as a join may create a new ifma.
  *
  * Return 0 if successful, otherwise return an appropriate error code.
  */
 static int
 in6p_block_unblock_source(struct inpcb *inp, struct sockopt *sopt)
 {
 	struct group_source_req		 gsr;
 	struct epoch_tracker		 et;
 	sockunion_t			*gsa, *ssa;
 	struct ifnet			*ifp;
 	struct in6_mfilter		*imf;
 	struct ip6_moptions		*imo;
 	struct in6_msource		*ims;
 	struct in6_multi			*inm;
 	uint16_t			 fmode;
 	int				 error, doblock;
 #ifdef KTR
 	char				 ip6tbuf[INET6_ADDRSTRLEN];
 #endif
 
 	ifp = NULL;
 	error = 0;
 	doblock = 0;
 
 	memset(&gsr, 0, sizeof(struct group_source_req));
 	gsa = (sockunion_t *)&gsr.gsr_group;
 	ssa = (sockunion_t *)&gsr.gsr_source;
 
 	switch (sopt->sopt_name) {
 	case MCAST_BLOCK_SOURCE:
 	case MCAST_UNBLOCK_SOURCE:
 		error = sooptcopyin(sopt, &gsr,
 		    sizeof(struct group_source_req),
 		    sizeof(struct group_source_req));
 		if (error)
 			return (error);
 
 		if (gsa->sin6.sin6_family != AF_INET6 ||
 		    gsa->sin6.sin6_len != sizeof(struct sockaddr_in6))
 			return (EINVAL);
 
 		if (ssa->sin6.sin6_family != AF_INET6 ||
 		    ssa->sin6.sin6_len != sizeof(struct sockaddr_in6))
 			return (EINVAL);
 
 		/*
 		 * XXXGL: this function should use ifnet_byindex_ref, or
 		 * expand the epoch section all the way to where we put
 		 * the reference.
 		 */
 		NET_EPOCH_ENTER(et);
 		ifp = ifnet_byindex(gsr.gsr_interface);
 		NET_EPOCH_EXIT(et);
 		if (ifp == NULL)
 			return (EADDRNOTAVAIL);
 
 		if (sopt->sopt_name == MCAST_BLOCK_SOURCE)
 			doblock = 1;
 		break;
 
 	default:
 		CTR2(KTR_MLD, "%s: unknown sopt_name %d",
 		    __func__, sopt->sopt_name);
 		return (EOPNOTSUPP);
 		break;
 	}
 
 	if (!IN6_IS_ADDR_MULTICAST(&gsa->sin6.sin6_addr))
 		return (EINVAL);
 
 	(void)in6_setscope(&gsa->sin6.sin6_addr, ifp, NULL);
 
 	/*
 	 * Check if we are actually a member of this group.
 	 */
 	imo = in6p_findmoptions(inp);
 	imf = im6o_match_group(imo, ifp, &gsa->sa);
 	if (imf == NULL) {
 		error = EADDRNOTAVAIL;
 		goto out_in6p_locked;
 	}
 	inm = imf->im6f_in6m;
 
 	/*
 	 * Attempting to use the delta-based API on an
 	 * non exclusive-mode membership is an error.
 	 */
 	fmode = imf->im6f_st[0];
 	if (fmode != MCAST_EXCLUDE) {
 		error = EINVAL;
 		goto out_in6p_locked;
 	}
 
 	/*
 	 * Deal with error cases up-front:
 	 *  Asked to block, but already blocked; or
 	 *  Asked to unblock, but nothing to unblock.
 	 * If adding a new block entry, allocate it.
 	 */
 	ims = im6o_match_source(imf, &ssa->sa);
 	if ((ims != NULL && doblock) || (ims == NULL && !doblock)) {
 		CTR3(KTR_MLD, "%s: source %s %spresent", __func__,
 		    ip6_sprintf(ip6tbuf, &ssa->sin6.sin6_addr),
 		    doblock ? "" : "not ");
 		error = EADDRNOTAVAIL;
 		goto out_in6p_locked;
 	}
 
 	INP_WLOCK_ASSERT(inp);
 
 	/*
 	 * Begin state merge transaction at socket layer.
 	 */
 	if (doblock) {
 		CTR2(KTR_MLD, "%s: %s source", __func__, "block");
 		ims = im6f_graft(imf, fmode, &ssa->sin6);
 		if (ims == NULL)
 			error = ENOMEM;
 	} else {
 		CTR2(KTR_MLD, "%s: %s source", __func__, "allow");
 		error = im6f_prune(imf, &ssa->sin6);
 	}
 
 	if (error) {
 		CTR1(KTR_MLD, "%s: merge imf state failed", __func__);
 		goto out_im6f_rollback;
 	}
 
 	/*
 	 * Begin state merge transaction at MLD layer.
 	 */
 	IN6_MULTI_LIST_LOCK();
 	CTR1(KTR_MLD, "%s: merge inm state", __func__);
 	error = in6m_merge(inm, imf);
 	if (error)
 		CTR1(KTR_MLD, "%s: failed to merge inm state", __func__);
 	else {
 		CTR1(KTR_MLD, "%s: doing mld downcall", __func__);
 		error = mld_change_state(inm, 0);
 		if (error)
 			CTR1(KTR_MLD, "%s: failed mld downcall", __func__);
 	}
 
 	IN6_MULTI_LIST_UNLOCK();
 
 out_im6f_rollback:
 	if (error)
 		im6f_rollback(imf);
 	else
 		im6f_commit(imf);
 
 	im6f_reap(imf);
 
 out_in6p_locked:
 	INP_WUNLOCK(inp);
 	return (error);
 }
 
 /*
  * Given an inpcb, return its multicast options structure pointer.  Accepts
  * an unlocked inpcb pointer, but will return it locked.  May sleep.
  *
  * SMPng: NOTE: Potentially calls malloc(M_WAITOK) with Giant held.
  * SMPng: NOTE: Returns with the INP write lock held.
  */
 static struct ip6_moptions *
 in6p_findmoptions(struct inpcb *inp)
 {
 	struct ip6_moptions	 *imo;
 
 	INP_WLOCK(inp);
 	if (inp->in6p_moptions != NULL)
 		return (inp->in6p_moptions);
 
 	INP_WUNLOCK(inp);
 
 	imo = malloc(sizeof(*imo), M_IP6MOPTS, M_WAITOK);
 
 	imo->im6o_multicast_ifp = NULL;
 	imo->im6o_multicast_hlim = V_ip6_defmcasthlim;
 	imo->im6o_multicast_loop = in6_mcast_loop;
 	STAILQ_INIT(&imo->im6o_head);
 
 	INP_WLOCK(inp);
 	if (inp->in6p_moptions != NULL) {
 		free(imo, M_IP6MOPTS);
 		return (inp->in6p_moptions);
 	}
 	inp->in6p_moptions = imo;
 	return (imo);
 }
 
 /*
  * Discard the IPv6 multicast options (and source filters).
  *
  * SMPng: NOTE: assumes INP write lock is held.
  *
  * XXX can all be safely deferred to epoch_call
  *
  */
 
 static void
 inp_gcmoptions(struct ip6_moptions *imo)
 {
 	struct in6_mfilter *imf;
 	struct in6_multi *inm;
 	struct ifnet *ifp;
 
 	while ((imf = ip6_mfilter_first(&imo->im6o_head)) != NULL) {
                 ip6_mfilter_remove(&imo->im6o_head, imf);
 
                 im6f_leave(imf);
                 if ((inm = imf->im6f_in6m) != NULL) {
                         if ((ifp = inm->in6m_ifp) != NULL) {
                                 CURVNET_SET(ifp->if_vnet);
                                 (void)in6_leavegroup(inm, imf);
                                 CURVNET_RESTORE();
                         } else {
                                 (void)in6_leavegroup(inm, imf);
                         }
                 }
                 ip6_mfilter_free(imf);
         }
         free(imo, M_IP6MOPTS);
 }
 
 void
 ip6_freemoptions(struct ip6_moptions *imo)
 {
 	if (imo == NULL)
 		return;
 	inp_gcmoptions(imo);
 }
 
 /*
  * Atomically get source filters on a socket for an IPv6 multicast group.
  * Called with INP lock held; returns with lock released.
  */
 static int
 in6p_get_source_filters(struct inpcb *inp, struct sockopt *sopt)
 {
 	struct epoch_tracker	 et;
 	struct __msfilterreq	 msfr;
 	sockunion_t		*gsa;
 	struct ifnet		*ifp;
 	struct ip6_moptions	*imo;
 	struct in6_mfilter	*imf;
 	struct ip6_msource	*ims;
 	struct in6_msource	*lims;
 	struct sockaddr_in6	*psin;
 	struct sockaddr_storage	*ptss;
 	struct sockaddr_storage	*tss;
 	int			 error;
 	size_t			 nsrcs, ncsrcs;
 
 	INP_WLOCK_ASSERT(inp);
 
 	imo = inp->in6p_moptions;
 	KASSERT(imo != NULL, ("%s: null ip6_moptions", __func__));
 
 	INP_WUNLOCK(inp);
 
 	error = sooptcopyin(sopt, &msfr, sizeof(struct __msfilterreq),
 	    sizeof(struct __msfilterreq));
 	if (error)
 		return (error);
 
 	if (msfr.msfr_group.ss_family != AF_INET6 ||
 	    msfr.msfr_group.ss_len != sizeof(struct sockaddr_in6))
 		return (EINVAL);
 
 	gsa = (sockunion_t *)&msfr.msfr_group;
 	if (!IN6_IS_ADDR_MULTICAST(&gsa->sin6.sin6_addr))
 		return (EINVAL);
 
 	/*
 	 * XXXGL: this function should use ifnet_byindex_ref, or expand the
 	 * epoch section all the way to where the interface is referenced.
 	 */
 	NET_EPOCH_ENTER(et);
 	ifp = ifnet_byindex(msfr.msfr_ifindex);
 	NET_EPOCH_EXIT(et);
 	if (ifp == NULL)
 		return (EADDRNOTAVAIL);
 	(void)in6_setscope(&gsa->sin6.sin6_addr, ifp, NULL);
 
 	INP_WLOCK(inp);
 
 	/*
 	 * Lookup group on the socket.
 	 */
 	imf = im6o_match_group(imo, ifp, &gsa->sa);
 	if (imf == NULL) {
 		INP_WUNLOCK(inp);
 		return (EADDRNOTAVAIL);
 	}
 
 	/*
 	 * Ignore memberships which are in limbo.
 	 */
 	if (imf->im6f_st[1] == MCAST_UNDEFINED) {
 		INP_WUNLOCK(inp);
 		return (EAGAIN);
 	}
 	msfr.msfr_fmode = imf->im6f_st[1];
 
 	/*
 	 * If the user specified a buffer, copy out the source filter
 	 * entries to userland gracefully.
 	 * We only copy out the number of entries which userland
 	 * has asked for, but we always tell userland how big the
 	 * buffer really needs to be.
 	 */
 	if (msfr.msfr_nsrcs > in6_mcast_maxsocksrc)
 		msfr.msfr_nsrcs = in6_mcast_maxsocksrc;
 	tss = NULL;
 	if (msfr.msfr_srcs != NULL && msfr.msfr_nsrcs > 0) {
 		tss = malloc(sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs,
 		    M_TEMP, M_NOWAIT | M_ZERO);
 		if (tss == NULL) {
 			INP_WUNLOCK(inp);
 			return (ENOBUFS);
 		}
 	}
 
 	/*
 	 * Count number of sources in-mode at t0.
 	 * If buffer space exists and remains, copy out source entries.
 	 */
 	nsrcs = msfr.msfr_nsrcs;
 	ncsrcs = 0;
 	ptss = tss;
 	RB_FOREACH(ims, ip6_msource_tree, &imf->im6f_sources) {
 		lims = (struct in6_msource *)ims;
 		if (lims->im6sl_st[0] == MCAST_UNDEFINED ||
 		    lims->im6sl_st[0] != imf->im6f_st[0])
 			continue;
 		++ncsrcs;
 		if (tss != NULL && nsrcs > 0) {
 			psin = (struct sockaddr_in6 *)ptss;
 			psin->sin6_family = AF_INET6;
 			psin->sin6_len = sizeof(struct sockaddr_in6);
 			psin->sin6_addr = lims->im6s_addr;
 			psin->sin6_port = 0;
 			--nsrcs;
 			++ptss;
 		}
 	}
 
 	INP_WUNLOCK(inp);
 
 	if (tss != NULL) {
 		error = copyout(tss, msfr.msfr_srcs,
 		    sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs);
 		free(tss, M_TEMP);
 		if (error)
 			return (error);
 	}
 
 	msfr.msfr_nsrcs = ncsrcs;
 	error = sooptcopyout(sopt, &msfr, sizeof(struct __msfilterreq));
 
 	return (error);
 }
 
 /*
  * Return the IP multicast options in response to user getsockopt().
  */
 int
 ip6_getmoptions(struct inpcb *inp, struct sockopt *sopt)
 {
 	struct ip6_moptions	*im6o;
 	int			 error;
 	u_int			 optval;
 
 	INP_WLOCK(inp);
 	im6o = inp->in6p_moptions;
-	/*
-	 * If socket is neither of type SOCK_RAW or SOCK_DGRAM,
-	 * or is a divert socket, reject it.
-	 */
-	if (inp->inp_socket->so_proto->pr_protocol == IPPROTO_DIVERT ||
-	    (inp->inp_socket->so_proto->pr_type != SOCK_RAW &&
-	    inp->inp_socket->so_proto->pr_type != SOCK_DGRAM)) {
+	/* If socket is neither of type SOCK_RAW or SOCK_DGRAM, reject it. */
+	if (inp->inp_socket->so_proto->pr_type != SOCK_RAW &&
+	    inp->inp_socket->so_proto->pr_type != SOCK_DGRAM) {
 		INP_WUNLOCK(inp);
 		return (EOPNOTSUPP);
 	}
 
 	error = 0;
 	switch (sopt->sopt_name) {
 	case IPV6_MULTICAST_IF:
 		if (im6o == NULL || im6o->im6o_multicast_ifp == NULL) {
 			optval = 0;
 		} else {
 			optval = im6o->im6o_multicast_ifp->if_index;
 		}
 		INP_WUNLOCK(inp);
 		error = sooptcopyout(sopt, &optval, sizeof(u_int));
 		break;
 
 	case IPV6_MULTICAST_HOPS:
 		if (im6o == NULL)
 			optval = V_ip6_defmcasthlim;
 		else
 			optval = im6o->im6o_multicast_hlim;
 		INP_WUNLOCK(inp);
 		error = sooptcopyout(sopt, &optval, sizeof(u_int));
 		break;
 
 	case IPV6_MULTICAST_LOOP:
 		if (im6o == NULL)
 			optval = in6_mcast_loop; /* XXX VIMAGE */
 		else
 			optval = im6o->im6o_multicast_loop;
 		INP_WUNLOCK(inp);
 		error = sooptcopyout(sopt, &optval, sizeof(u_int));
 		break;
 
 	case IPV6_MSFILTER:
 		if (im6o == NULL) {
 			error = EADDRNOTAVAIL;
 			INP_WUNLOCK(inp);
 		} else {
 			error = in6p_get_source_filters(inp, sopt);
 		}
 		break;
 
 	default:
 		INP_WUNLOCK(inp);
 		error = ENOPROTOOPT;
 		break;
 	}
 
 	INP_UNLOCK_ASSERT(inp);
 
 	return (error);
 }
 
 /*
  * Look up the ifnet to use for a multicast group membership,
  * given the address of an IPv6 group.
  *
  * This routine exists to support legacy IPv6 multicast applications.
  *
  * Use the socket's current FIB number for any required FIB lookup. Look up the
  * group address in the unicast FIB, and use its ifp; usually, this points to
  * the default next-hop.  If the FIB lookup fails, return NULL.
  *
  * FUTURE: Support multiple forwarding tables for IPv6.
  *
  * Returns NULL if no ifp could be found.
  */
 static struct ifnet *
 in6p_lookup_mcast_ifp(const struct inpcb *inp, const struct sockaddr_in6 *gsin6)
 {
 	struct nhop_object	*nh;
 	struct in6_addr		dst;
 	uint32_t		scopeid;
 	uint32_t		fibnum;
 
 	KASSERT(gsin6->sin6_family == AF_INET6,
 	    ("%s: not AF_INET6 group", __func__));
 
 	in6_splitscope(&gsin6->sin6_addr, &dst, &scopeid);
 	fibnum = inp->inp_inc.inc_fibnum;
 	nh = fib6_lookup(fibnum, &dst, scopeid, 0, 0);
 
 	return (nh ? nh->nh_ifp : NULL);
 }
 
 /*
  * Join an IPv6 multicast group, possibly with a source.
  *
  * FIXME: The KAME use of the unspecified address (::)
  * to join *all* multicast groups is currently unsupported.
  *
  * XXXGL: this function multiple times uses ifnet_byindex() without
  * proper protection - staying in epoch, or putting reference on ifnet.
  */
 static int
 in6p_join_group(struct inpcb *inp, struct sockopt *sopt)
 {
 	struct in6_multi_head		 inmh;
 	struct group_source_req		 gsr;
 	struct epoch_tracker		 et;
 	sockunion_t			*gsa, *ssa;
 	struct ifnet			*ifp;
 	struct in6_mfilter		*imf;
 	struct ip6_moptions		*imo;
 	struct in6_multi		*inm;
 	struct in6_msource		*lims;
 	int				 error, is_new;
 
 	SLIST_INIT(&inmh);
 	ifp = NULL;
 	lims = NULL;
 	error = 0;
 
 	memset(&gsr, 0, sizeof(struct group_source_req));
 	gsa = (sockunion_t *)&gsr.gsr_group;
 	gsa->ss.ss_family = AF_UNSPEC;
 	ssa = (sockunion_t *)&gsr.gsr_source;
 	ssa->ss.ss_family = AF_UNSPEC;
 
 	/*
 	 * Chew everything into struct group_source_req.
 	 * Overwrite the port field if present, as the sockaddr
 	 * being copied in may be matched with a binary comparison.
 	 * Ignore passed-in scope ID.
 	 */
 	switch (sopt->sopt_name) {
 	case IPV6_JOIN_GROUP: {
 		struct ipv6_mreq mreq;
 
 		error = sooptcopyin(sopt, &mreq, sizeof(struct ipv6_mreq),
 		    sizeof(struct ipv6_mreq));
 		if (error)
 			return (error);
 
 		gsa->sin6.sin6_family = AF_INET6;
 		gsa->sin6.sin6_len = sizeof(struct sockaddr_in6);
 		gsa->sin6.sin6_addr = mreq.ipv6mr_multiaddr;
 
 		if (mreq.ipv6mr_interface == 0) {
 			ifp = in6p_lookup_mcast_ifp(inp, &gsa->sin6);
 		} else {
 			NET_EPOCH_ENTER(et);
 			ifp = ifnet_byindex(mreq.ipv6mr_interface);
 			NET_EPOCH_EXIT(et);
 			if (ifp == NULL)
 				return (EADDRNOTAVAIL);
 		}
 		CTR3(KTR_MLD, "%s: ipv6mr_interface = %d, ifp = %p",
 		    __func__, mreq.ipv6mr_interface, ifp);
 	} break;
 
 	case MCAST_JOIN_GROUP:
 	case MCAST_JOIN_SOURCE_GROUP:
 		if (sopt->sopt_name == MCAST_JOIN_GROUP) {
 			error = sooptcopyin(sopt, &gsr,
 			    sizeof(struct group_req),
 			    sizeof(struct group_req));
 		} else if (sopt->sopt_name == MCAST_JOIN_SOURCE_GROUP) {
 			error = sooptcopyin(sopt, &gsr,
 			    sizeof(struct group_source_req),
 			    sizeof(struct group_source_req));
 		}
 		if (error)
 			return (error);
 
 		if (gsa->sin6.sin6_family != AF_INET6 ||
 		    gsa->sin6.sin6_len != sizeof(struct sockaddr_in6))
 			return (EINVAL);
 
 		if (sopt->sopt_name == MCAST_JOIN_SOURCE_GROUP) {
 			if (ssa->sin6.sin6_family != AF_INET6 ||
 			    ssa->sin6.sin6_len != sizeof(struct sockaddr_in6))
 				return (EINVAL);
 			if (IN6_IS_ADDR_MULTICAST(&ssa->sin6.sin6_addr))
 				return (EINVAL);
 			/*
 			 * TODO: Validate embedded scope ID in source
 			 * list entry against passed-in ifp, if and only
 			 * if source list filter entry is iface or node local.
 			 */
 			in6_clearscope(&ssa->sin6.sin6_addr);
 			ssa->sin6.sin6_port = 0;
 			ssa->sin6.sin6_scope_id = 0;
 		}
 		NET_EPOCH_ENTER(et);
 		ifp = ifnet_byindex(gsr.gsr_interface);
 		NET_EPOCH_EXIT(et);
 		if (ifp == NULL)
 			return (EADDRNOTAVAIL);
 		break;
 
 	default:
 		CTR2(KTR_MLD, "%s: unknown sopt_name %d",
 		    __func__, sopt->sopt_name);
 		return (EOPNOTSUPP);
 		break;
 	}
 
 	if (!IN6_IS_ADDR_MULTICAST(&gsa->sin6.sin6_addr))
 		return (EINVAL);
 
 	if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0)
 		return (EADDRNOTAVAIL);
 
 	gsa->sin6.sin6_port = 0;
 	gsa->sin6.sin6_scope_id = 0;
 
 	/*
 	 * Always set the scope zone ID on memberships created from userland.
 	 * Use the passed-in ifp to do this.
 	 * XXX The in6_setscope() return value is meaningless.
 	 * XXX SCOPE6_LOCK() is taken by in6_setscope().
 	 */
 	(void)in6_setscope(&gsa->sin6.sin6_addr, ifp, NULL);
 
 	IN6_MULTI_LOCK();
 
 	/*
 	 * Find the membership in the membership list.
 	 */
 	imo = in6p_findmoptions(inp);
 	imf = im6o_match_group(imo, ifp, &gsa->sa);
 	if (imf == NULL) {
 		is_new = 1;
 		inm = NULL;
 
 		if (ip6_mfilter_count(&imo->im6o_head) >= IPV6_MAX_MEMBERSHIPS) {
 			error = ENOMEM;
 			goto out_in6p_locked;
 		}
 	} else {
 		is_new = 0;
 		inm = imf->im6f_in6m;
 
 		if (ssa->ss.ss_family != AF_UNSPEC) {
 			/*
 			 * MCAST_JOIN_SOURCE_GROUP on an exclusive membership
 			 * is an error. On an existing inclusive membership,
 			 * it just adds the source to the filter list.
 			 */
 			if (imf->im6f_st[1] != MCAST_INCLUDE) {
 				error = EINVAL;
 				goto out_in6p_locked;
 			}
 			/*
 			 * Throw out duplicates.
 			 *
 			 * XXX FIXME: This makes a naive assumption that
 			 * even if entries exist for *ssa in this imf,
 			 * they will be rejected as dupes, even if they
 			 * are not valid in the current mode (in-mode).
 			 *
 			 * in6_msource is transactioned just as for anything
 			 * else in SSM -- but note naive use of in6m_graft()
 			 * below for allocating new filter entries.
 			 *
 			 * This is only an issue if someone mixes the
 			 * full-state SSM API with the delta-based API,
 			 * which is discouraged in the relevant RFCs.
 			 */
 			lims = im6o_match_source(imf, &ssa->sa);
 			if (lims != NULL /*&&
 			    lims->im6sl_st[1] == MCAST_INCLUDE*/) {
 				error = EADDRNOTAVAIL;
 				goto out_in6p_locked;
 			}
 		} else {
 			/*
 			 * MCAST_JOIN_GROUP alone, on any existing membership,
 			 * is rejected, to stop the same inpcb tying up
 			 * multiple refs to the in_multi.
 			 * On an existing inclusive membership, this is also
 			 * an error; if you want to change filter mode,
 			 * you must use the userland API setsourcefilter().
 			 * XXX We don't reject this for imf in UNDEFINED
 			 * state at t1, because allocation of a filter
 			 * is atomic with allocation of a membership.
 			 */
 			error = EADDRINUSE;
 			goto out_in6p_locked;
 		}
 	}
 
 	/*
 	 * Begin state merge transaction at socket layer.
 	 */
 	INP_WLOCK_ASSERT(inp);
 
 	/*
 	 * Graft new source into filter list for this inpcb's
 	 * membership of the group. The in6_multi may not have
 	 * been allocated yet if this is a new membership, however,
 	 * the in_mfilter slot will be allocated and must be initialized.
 	 *
 	 * Note: Grafting of exclusive mode filters doesn't happen
 	 * in this path.
 	 * XXX: Should check for non-NULL lims (node exists but may
 	 * not be in-mode) for interop with full-state API.
 	 */
 	if (ssa->ss.ss_family != AF_UNSPEC) {
 		/* Membership starts in IN mode */
 		if (is_new) {
 			CTR1(KTR_MLD, "%s: new join w/source", __func__);
 			imf = ip6_mfilter_alloc(M_NOWAIT, MCAST_UNDEFINED, MCAST_INCLUDE);
 			if (imf == NULL) {
 				error = ENOMEM;
 				goto out_in6p_locked;
 			}
 		} else {
 			CTR2(KTR_MLD, "%s: %s source", __func__, "allow");
 		}
 		lims = im6f_graft(imf, MCAST_INCLUDE, &ssa->sin6);
 		if (lims == NULL) {
 			CTR1(KTR_MLD, "%s: merge imf state failed",
 			    __func__);
 			error = ENOMEM;
 			goto out_in6p_locked;
 		}
 	} else {
 		/* No address specified; Membership starts in EX mode */
 		if (is_new) {
 			CTR1(KTR_MLD, "%s: new join w/o source", __func__);
 			imf = ip6_mfilter_alloc(M_NOWAIT, MCAST_UNDEFINED, MCAST_EXCLUDE);
 			if (imf == NULL) {
 				error = ENOMEM;
 				goto out_in6p_locked;
 			}
 		}
 	}
 
 	/*
 	 * Begin state merge transaction at MLD layer.
 	 */
 	if (is_new) {
 		in_pcbref(inp);
 		INP_WUNLOCK(inp);
 
 		error = in6_joingroup_locked(ifp, &gsa->sin6.sin6_addr, imf,
 		    &imf->im6f_in6m, 0);
 
 		INP_WLOCK(inp);
 		if (in_pcbrele_wlocked(inp)) {
 			error = ENXIO;
 			goto out_in6p_unlocked;
 		}
 		if (error) {
 			goto out_in6p_locked;
 		}
 		/*
 		 * NOTE: Refcount from in6_joingroup_locked()
 		 * is protecting membership.
 		 */
 		ip6_mfilter_insert(&imo->im6o_head, imf);
 	} else {
 		CTR1(KTR_MLD, "%s: merge inm state", __func__);
 		IN6_MULTI_LIST_LOCK();
 		error = in6m_merge(inm, imf);
 		if (error) {
 			CTR1(KTR_MLD, "%s: failed to merge inm state",
 			    __func__);
 			IN6_MULTI_LIST_UNLOCK();
 			im6f_rollback(imf);
 			im6f_reap(imf);
 			goto out_in6p_locked;
 		}
 		CTR1(KTR_MLD, "%s: doing mld downcall", __func__);
 		error = mld_change_state(inm, 0);
 		IN6_MULTI_LIST_UNLOCK();
 
 		if (error) {
 			CTR1(KTR_MLD, "%s: failed mld downcall",
 			     __func__);
 			im6f_rollback(imf);
 			im6f_reap(imf);
 			goto out_in6p_locked;
 		}
 	}
 
 	im6f_commit(imf);
 	imf = NULL;
 
 out_in6p_locked:
 	INP_WUNLOCK(inp);
 out_in6p_unlocked:
 	IN6_MULTI_UNLOCK();
 
 	if (is_new && imf) {
 		if (imf->im6f_in6m != NULL) {
 			struct in6_multi_head inmh;
 
 			SLIST_INIT(&inmh);
 			SLIST_INSERT_HEAD(&inmh, imf->im6f_in6m, in6m_defer);
 			in6m_release_list_deferred(&inmh);
 		}
 		ip6_mfilter_free(imf);
 	}
 	return (error);
 }
 
 /*
  * Leave an IPv6 multicast group on an inpcb, possibly with a source.
  */
 static int
 in6p_leave_group(struct inpcb *inp, struct sockopt *sopt)
 {
 	struct ipv6_mreq		 mreq;
 	struct group_source_req		 gsr;
 	struct epoch_tracker		 et;
 	sockunion_t			*gsa, *ssa;
 	struct ifnet			*ifp;
 	struct in6_mfilter		*imf;
 	struct ip6_moptions		*imo;
 	struct in6_msource		*ims;
 	struct in6_multi		*inm;
 	uint32_t			 ifindex;
 	int				 error;
 	bool				 is_final;
 #ifdef KTR
 	char				 ip6tbuf[INET6_ADDRSTRLEN];
 #endif
 
 	ifp = NULL;
 	ifindex = 0;
 	error = 0;
 	is_final = true;
 
 	memset(&gsr, 0, sizeof(struct group_source_req));
 	gsa = (sockunion_t *)&gsr.gsr_group;
 	gsa->ss.ss_family = AF_UNSPEC;
 	ssa = (sockunion_t *)&gsr.gsr_source;
 	ssa->ss.ss_family = AF_UNSPEC;
 
 	/*
 	 * Chew everything passed in up into a struct group_source_req
 	 * as that is easier to process.
 	 * Note: Any embedded scope ID in the multicast group passed
 	 * in by userland is ignored, the interface index is the recommended
 	 * mechanism to specify an interface; see below.
 	 */
 	switch (sopt->sopt_name) {
 	case IPV6_LEAVE_GROUP:
 		error = sooptcopyin(sopt, &mreq, sizeof(struct ipv6_mreq),
 		    sizeof(struct ipv6_mreq));
 		if (error)
 			return (error);
 		gsa->sin6.sin6_family = AF_INET6;
 		gsa->sin6.sin6_len = sizeof(struct sockaddr_in6);
 		gsa->sin6.sin6_addr = mreq.ipv6mr_multiaddr;
 		gsa->sin6.sin6_port = 0;
 		gsa->sin6.sin6_scope_id = 0;
 		ifindex = mreq.ipv6mr_interface;
 		break;
 
 	case MCAST_LEAVE_GROUP:
 	case MCAST_LEAVE_SOURCE_GROUP:
 		if (sopt->sopt_name == MCAST_LEAVE_GROUP) {
 			error = sooptcopyin(sopt, &gsr,
 			    sizeof(struct group_req),
 			    sizeof(struct group_req));
 		} else if (sopt->sopt_name == MCAST_LEAVE_SOURCE_GROUP) {
 			error = sooptcopyin(sopt, &gsr,
 			    sizeof(struct group_source_req),
 			    sizeof(struct group_source_req));
 		}
 		if (error)
 			return (error);
 
 		if (gsa->sin6.sin6_family != AF_INET6 ||
 		    gsa->sin6.sin6_len != sizeof(struct sockaddr_in6))
 			return (EINVAL);
 		if (sopt->sopt_name == MCAST_LEAVE_SOURCE_GROUP) {
 			if (ssa->sin6.sin6_family != AF_INET6 ||
 			    ssa->sin6.sin6_len != sizeof(struct sockaddr_in6))
 				return (EINVAL);
 			if (IN6_IS_ADDR_MULTICAST(&ssa->sin6.sin6_addr))
 				return (EINVAL);
 			/*
 			 * TODO: Validate embedded scope ID in source
 			 * list entry against passed-in ifp, if and only
 			 * if source list filter entry is iface or node local.
 			 */
 			in6_clearscope(&ssa->sin6.sin6_addr);
 		}
 		gsa->sin6.sin6_port = 0;
 		gsa->sin6.sin6_scope_id = 0;
 		ifindex = gsr.gsr_interface;
 		break;
 
 	default:
 		CTR2(KTR_MLD, "%s: unknown sopt_name %d",
 		    __func__, sopt->sopt_name);
 		return (EOPNOTSUPP);
 		break;
 	}
 
 	if (!IN6_IS_ADDR_MULTICAST(&gsa->sin6.sin6_addr))
 		return (EINVAL);
 
 	/*
 	 * Validate interface index if provided. If no interface index
 	 * was provided separately, attempt to look the membership up
 	 * from the default scope as a last resort to disambiguate
 	 * the membership we are being asked to leave.
 	 * XXX SCOPE6 lock potentially taken here.
 	 */
 	if (ifindex != 0) {
 		NET_EPOCH_ENTER(et);
 		ifp = ifnet_byindex(ifindex);
 		NET_EPOCH_EXIT(et);	/* XXXGL: unsafe ifp */
 		if (ifp == NULL)
 			return (EADDRNOTAVAIL);
 		(void)in6_setscope(&gsa->sin6.sin6_addr, ifp, NULL);
 	} else {
 		error = sa6_embedscope(&gsa->sin6, V_ip6_use_defzone);
 		if (error)
 			return (EADDRNOTAVAIL);
 		/*
 		 * Some badly behaved applications don't pass an ifindex
 		 * or a scope ID, which is an API violation. In this case,
 		 * perform a lookup as per a v6 join.
 		 *
 		 * XXX For now, stomp on zone ID for the corner case.
 		 * This is not the 'KAME way', but we need to see the ifp
 		 * directly until such time as this implementation is
 		 * refactored, assuming the scope IDs are the way to go.
 		 */
 		ifindex = ntohs(gsa->sin6.sin6_addr.s6_addr16[1]);
 		if (ifindex == 0) {
 			CTR2(KTR_MLD, "%s: warning: no ifindex, looking up "
 			    "ifp for group %s.", __func__,
 			    ip6_sprintf(ip6tbuf, &gsa->sin6.sin6_addr));
 			ifp = in6p_lookup_mcast_ifp(inp, &gsa->sin6);
 		} else {
 			NET_EPOCH_ENTER(et);
 			ifp = ifnet_byindex(ifindex);
 			NET_EPOCH_EXIT(et);	/* XXXGL: unsafe ifp */
 		}
 		if (ifp == NULL)
 			return (EADDRNOTAVAIL);
 	}
 
 	CTR2(KTR_MLD, "%s: ifp = %p", __func__, ifp);
 	KASSERT(ifp != NULL, ("%s: ifp did not resolve", __func__));
 
 	IN6_MULTI_LOCK();
 
 	/*
 	 * Find the membership in the membership list.
 	 */
 	imo = in6p_findmoptions(inp);
 	imf = im6o_match_group(imo, ifp, &gsa->sa);
 	if (imf == NULL) {
 		error = EADDRNOTAVAIL;
 		goto out_in6p_locked;
 	}
 	inm = imf->im6f_in6m;
 
 	if (ssa->ss.ss_family != AF_UNSPEC)
 		is_final = false;
 
 	/*
 	 * Begin state merge transaction at socket layer.
 	 */
 	INP_WLOCK_ASSERT(inp);
 
 	/*
 	 * If we were instructed only to leave a given source, do so.
 	 * MCAST_LEAVE_SOURCE_GROUP is only valid for inclusive memberships.
 	 */
 	if (is_final) {
 		ip6_mfilter_remove(&imo->im6o_head, imf);
 		im6f_leave(imf);
 
 		/*
 		 * Give up the multicast address record to which
 		 * the membership points.
 		 */
 		(void)in6_leavegroup_locked(inm, imf);
 	} else {
 		if (imf->im6f_st[0] == MCAST_EXCLUDE) {
 			error = EADDRNOTAVAIL;
 			goto out_in6p_locked;
 		}
 		ims = im6o_match_source(imf, &ssa->sa);
 		if (ims == NULL) {
 			CTR3(KTR_MLD, "%s: source %p %spresent", __func__,
 			    ip6_sprintf(ip6tbuf, &ssa->sin6.sin6_addr),
 			    "not ");
 			error = EADDRNOTAVAIL;
 			goto out_in6p_locked;
 		}
 		CTR2(KTR_MLD, "%s: %s source", __func__, "block");
 		error = im6f_prune(imf, &ssa->sin6);
 		if (error) {
 			CTR1(KTR_MLD, "%s: merge imf state failed",
 			    __func__);
 			goto out_in6p_locked;
 		}
 	}
 
 	/*
 	 * Begin state merge transaction at MLD layer.
 	 */
 	if (!is_final) {
 		CTR1(KTR_MLD, "%s: merge inm state", __func__);
 		IN6_MULTI_LIST_LOCK();
 		error = in6m_merge(inm, imf);
 		if (error) {
 			CTR1(KTR_MLD, "%s: failed to merge inm state",
 			    __func__);
 			IN6_MULTI_LIST_UNLOCK();
 			im6f_rollback(imf);
 			im6f_reap(imf);
                         goto out_in6p_locked;
 		}
 
 		CTR1(KTR_MLD, "%s: doing mld downcall", __func__);
 		error = mld_change_state(inm, 0);
 		IN6_MULTI_LIST_UNLOCK();
 		if (error) {
 			CTR1(KTR_MLD, "%s: failed mld downcall",
 			     __func__);
 			im6f_rollback(imf);
 			im6f_reap(imf);
                         goto out_in6p_locked;
 		}
 	}
 
 	im6f_commit(imf);
 	im6f_reap(imf);
 
 out_in6p_locked:
 	INP_WUNLOCK(inp);
 
 	if (is_final && imf)
 		ip6_mfilter_free(imf);
 
 	IN6_MULTI_UNLOCK();
 	return (error);
 }
 
 /*
  * Select the interface for transmitting IPv6 multicast datagrams.
  *
  * Either an instance of struct in6_addr or an instance of struct ipv6_mreqn
  * may be passed to this socket option. An address of in6addr_any or an
  * interface index of 0 is used to remove a previous selection.
  * When no interface is selected, one is chosen for every send.
  */
 static int
 in6p_set_multicast_if(struct inpcb *inp, struct sockopt *sopt)
 {
 	struct epoch_tracker	 et;
 	struct ifnet		*ifp;
 	struct ip6_moptions	*imo;
 	u_int			 ifindex;
 	int			 error;
 
 	if (sopt->sopt_valsize != sizeof(u_int))
 		return (EINVAL);
 
 	error = sooptcopyin(sopt, &ifindex, sizeof(u_int), sizeof(u_int));
 	if (error)
 		return (error);
 	NET_EPOCH_ENTER(et);
 	if (ifindex == 0)
 		ifp = NULL;
 	else {
 		ifp = ifnet_byindex(ifindex);
 		if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) {
 			NET_EPOCH_EXIT(et);
 			return (EADDRNOTAVAIL);
 		}
 	}
 	NET_EPOCH_EXIT(et);	/* XXXGL: unsafe ifp */
 	imo = in6p_findmoptions(inp);
 	imo->im6o_multicast_ifp = ifp;
 	INP_WUNLOCK(inp);
 
 	return (0);
 }
 
 /*
  * Atomically set source filters on a socket for an IPv6 multicast group.
  *
  * XXXGL: unsafely exits epoch with ifnet pointer
  */
 static int
 in6p_set_source_filters(struct inpcb *inp, struct sockopt *sopt)
 {
 	struct __msfilterreq	 msfr;
 	struct epoch_tracker	 et;
 	sockunion_t		*gsa;
 	struct ifnet		*ifp;
 	struct in6_mfilter	*imf;
 	struct ip6_moptions	*imo;
 	struct in6_multi		*inm;
 	int			 error;
 
 	error = sooptcopyin(sopt, &msfr, sizeof(struct __msfilterreq),
 	    sizeof(struct __msfilterreq));
 	if (error)
 		return (error);
 
 	if (msfr.msfr_nsrcs > in6_mcast_maxsocksrc)
 		return (ENOBUFS);
 
 	if (msfr.msfr_fmode != MCAST_EXCLUDE &&
 	    msfr.msfr_fmode != MCAST_INCLUDE)
 		return (EINVAL);
 
 	if (msfr.msfr_group.ss_family != AF_INET6 ||
 	    msfr.msfr_group.ss_len != sizeof(struct sockaddr_in6))
 		return (EINVAL);
 
 	gsa = (sockunion_t *)&msfr.msfr_group;
 	if (!IN6_IS_ADDR_MULTICAST(&gsa->sin6.sin6_addr))
 		return (EINVAL);
 
 	gsa->sin6.sin6_port = 0;	/* ignore port */
 
 	NET_EPOCH_ENTER(et);
 	ifp = ifnet_byindex(msfr.msfr_ifindex);
 	NET_EPOCH_EXIT(et);
 	if (ifp == NULL)
 		return (EADDRNOTAVAIL);
 	(void)in6_setscope(&gsa->sin6.sin6_addr, ifp, NULL);
 
 	/*
 	 * Take the INP write lock.
 	 * Check if this socket is a member of this group.
 	 */
 	imo = in6p_findmoptions(inp);
 	imf = im6o_match_group(imo, ifp, &gsa->sa);
 	if (imf == NULL) {
 		error = EADDRNOTAVAIL;
 		goto out_in6p_locked;
 	}
 	inm = imf->im6f_in6m;
 
 	/*
 	 * Begin state merge transaction at socket layer.
 	 */
 	INP_WLOCK_ASSERT(inp);
 
 	imf->im6f_st[1] = msfr.msfr_fmode;
 
 	/*
 	 * Apply any new source filters, if present.
 	 * Make a copy of the user-space source vector so
 	 * that we may copy them with a single copyin. This
 	 * allows us to deal with page faults up-front.
 	 */
 	if (msfr.msfr_nsrcs > 0) {
 		struct in6_msource	*lims;
 		struct sockaddr_in6	*psin;
 		struct sockaddr_storage	*kss, *pkss;
 		int			 i;
 
 		INP_WUNLOCK(inp);
 
 		CTR2(KTR_MLD, "%s: loading %lu source list entries",
 		    __func__, (unsigned long)msfr.msfr_nsrcs);
 		kss = malloc(sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs,
 		    M_TEMP, M_WAITOK);
 		error = copyin(msfr.msfr_srcs, kss,
 		    sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs);
 		if (error) {
 			free(kss, M_TEMP);
 			return (error);
 		}
 
 		INP_WLOCK(inp);
 
 		/*
 		 * Mark all source filters as UNDEFINED at t1.
 		 * Restore new group filter mode, as im6f_leave()
 		 * will set it to INCLUDE.
 		 */
 		im6f_leave(imf);
 		imf->im6f_st[1] = msfr.msfr_fmode;
 
 		/*
 		 * Update socket layer filters at t1, lazy-allocating
 		 * new entries. This saves a bunch of memory at the
 		 * cost of one RB_FIND() per source entry; duplicate
 		 * entries in the msfr_nsrcs vector are ignored.
 		 * If we encounter an error, rollback transaction.
 		 *
 		 * XXX This too could be replaced with a set-symmetric
 		 * difference like loop to avoid walking from root
 		 * every time, as the key space is common.
 		 */
 		for (i = 0, pkss = kss; i < msfr.msfr_nsrcs; i++, pkss++) {
 			psin = (struct sockaddr_in6 *)pkss;
 			if (psin->sin6_family != AF_INET6) {
 				error = EAFNOSUPPORT;
 				break;
 			}
 			if (psin->sin6_len != sizeof(struct sockaddr_in6)) {
 				error = EINVAL;
 				break;
 			}
 			if (IN6_IS_ADDR_MULTICAST(&psin->sin6_addr)) {
 				error = EINVAL;
 				break;
 			}
 			/*
 			 * TODO: Validate embedded scope ID in source
 			 * list entry against passed-in ifp, if and only
 			 * if source list filter entry is iface or node local.
 			 */
 			in6_clearscope(&psin->sin6_addr);
 			error = im6f_get_source(imf, psin, &lims);
 			if (error)
 				break;
 			lims->im6sl_st[1] = imf->im6f_st[1];
 		}
 		free(kss, M_TEMP);
 	}
 
 	if (error)
 		goto out_im6f_rollback;
 
 	INP_WLOCK_ASSERT(inp);
 	IN6_MULTI_LIST_LOCK();
 
 	/*
 	 * Begin state merge transaction at MLD layer.
 	 */
 	CTR1(KTR_MLD, "%s: merge inm state", __func__);
 	error = in6m_merge(inm, imf);
 	if (error)
 		CTR1(KTR_MLD, "%s: failed to merge inm state", __func__);
 	else {
 		CTR1(KTR_MLD, "%s: doing mld downcall", __func__);
 		error = mld_change_state(inm, 0);
 		if (error)
 			CTR1(KTR_MLD, "%s: failed mld downcall", __func__);
 	}
 
 	IN6_MULTI_LIST_UNLOCK();
 
 out_im6f_rollback:
 	if (error)
 		im6f_rollback(imf);
 	else
 		im6f_commit(imf);
 
 	im6f_reap(imf);
 
 out_in6p_locked:
 	INP_WUNLOCK(inp);
 	return (error);
 }
 
 /*
  * Set the IP multicast options in response to user setsockopt().
  *
  * Many of the socket options handled in this function duplicate the
  * functionality of socket options in the regular unicast API. However,
  * it is not possible to merge the duplicate code, because the idempotence
  * of the IPv6 multicast part of the BSD Sockets API must be preserved;
  * the effects of these options must be treated as separate and distinct.
  *
  * SMPng: XXX: Unlocked read of inp_socket believed OK.
  */
 int
 ip6_setmoptions(struct inpcb *inp, struct sockopt *sopt)
 {
 	struct ip6_moptions	*im6o;
 	int			 error;
 
 	error = 0;
 
-	/*
-	 * If socket is neither of type SOCK_RAW or SOCK_DGRAM,
-	 * or is a divert socket, reject it.
-	 */
-	if (inp->inp_socket->so_proto->pr_protocol == IPPROTO_DIVERT ||
-	    (inp->inp_socket->so_proto->pr_type != SOCK_RAW &&
-	     inp->inp_socket->so_proto->pr_type != SOCK_DGRAM))
+	/* If socket is neither of type SOCK_RAW or SOCK_DGRAM, reject it. */
+	if (inp->inp_socket->so_proto->pr_type != SOCK_RAW &&
+	     inp->inp_socket->so_proto->pr_type != SOCK_DGRAM)
 		return (EOPNOTSUPP);
 
 	switch (sopt->sopt_name) {
 	case IPV6_MULTICAST_IF:
 		error = in6p_set_multicast_if(inp, sopt);
 		break;
 
 	case IPV6_MULTICAST_HOPS: {
 		int hlim;
 
 		if (sopt->sopt_valsize != sizeof(int)) {
 			error = EINVAL;
 			break;
 		}
 		error = sooptcopyin(sopt, &hlim, sizeof(hlim), sizeof(int));
 		if (error)
 			break;
 		if (hlim < -1 || hlim > 255) {
 			error = EINVAL;
 			break;
 		} else if (hlim == -1) {
 			hlim = V_ip6_defmcasthlim;
 		}
 		im6o = in6p_findmoptions(inp);
 		im6o->im6o_multicast_hlim = hlim;
 		INP_WUNLOCK(inp);
 		break;
 	}
 
 	case IPV6_MULTICAST_LOOP: {
 		u_int loop;
 
 		/*
 		 * Set the loopback flag for outgoing multicast packets.
 		 * Must be zero or one.
 		 */
 		if (sopt->sopt_valsize != sizeof(u_int)) {
 			error = EINVAL;
 			break;
 		}
 		error = sooptcopyin(sopt, &loop, sizeof(u_int), sizeof(u_int));
 		if (error)
 			break;
 		if (loop > 1) {
 			error = EINVAL;
 			break;
 		}
 		im6o = in6p_findmoptions(inp);
 		im6o->im6o_multicast_loop = loop;
 		INP_WUNLOCK(inp);
 		break;
 	}
 
 	case IPV6_JOIN_GROUP:
 	case MCAST_JOIN_GROUP:
 	case MCAST_JOIN_SOURCE_GROUP:
 		error = in6p_join_group(inp, sopt);
 		break;
 
 	case IPV6_LEAVE_GROUP:
 	case MCAST_LEAVE_GROUP:
 	case MCAST_LEAVE_SOURCE_GROUP:
 		error = in6p_leave_group(inp, sopt);
 		break;
 
 	case MCAST_BLOCK_SOURCE:
 	case MCAST_UNBLOCK_SOURCE:
 		error = in6p_block_unblock_source(inp, sopt);
 		break;
 
 	case IPV6_MSFILTER:
 		error = in6p_set_source_filters(inp, sopt);
 		break;
 
 	default:
 		error = EOPNOTSUPP;
 		break;
 	}
 
 	INP_UNLOCK_ASSERT(inp);
 
 	return (error);
 }
 
 /*
  * Expose MLD's multicast filter mode and source list(s) to userland,
  * keyed by (ifindex, group).
  * The filter mode is written out as a uint32_t, followed by
  * 0..n of struct in6_addr.
  * For use by ifmcstat(8).
  * SMPng: NOTE: unlocked read of ifindex space.
  */
 static int
 sysctl_ip6_mcast_filters(SYSCTL_HANDLER_ARGS)
 {
 	struct in6_addr			 mcaddr;
 	struct in6_addr			 src;
 	struct epoch_tracker		 et;
 	struct ifnet			*ifp;
 	struct ifmultiaddr		*ifma;
 	struct in6_multi		*inm;
 	struct ip6_msource		*ims;
 	int				*name;
 	int				 retval;
 	u_int				 namelen;
 	uint32_t			 fmode, ifindex;
 #ifdef KTR
 	char				 ip6tbuf[INET6_ADDRSTRLEN];
 #endif
 
 	name = (int *)arg1;
 	namelen = arg2;
 
 	if (req->newptr != NULL)
 		return (EPERM);
 
 	/* int: ifindex + 4 * 32 bits of IPv6 address */
 	if (namelen != 5)
 		return (EINVAL);
 
 	memcpy(&mcaddr, &name[1], sizeof(struct in6_addr));
 	if (!IN6_IS_ADDR_MULTICAST(&mcaddr)) {
 		CTR2(KTR_MLD, "%s: group %s is not multicast",
 		    __func__, ip6_sprintf(ip6tbuf, &mcaddr));
 		return (EINVAL);
 	}
 
 	ifindex = name[0];
 	NET_EPOCH_ENTER(et);
 	ifp = ifnet_byindex(ifindex);
 	if (ifp == NULL) {
 		NET_EPOCH_EXIT(et);
 		CTR2(KTR_MLD, "%s: no ifp for ifindex %u",
 		    __func__, ifindex);
 		return (ENOENT);
 	}
 	/*
 	 * Internal MLD lookups require that scope/zone ID is set.
 	 */
 	(void)in6_setscope(&mcaddr, ifp, NULL);
 
 	retval = sysctl_wire_old_buffer(req,
 	    sizeof(uint32_t) + (in6_mcast_maxgrpsrc * sizeof(struct in6_addr)));
 	if (retval) {
 		NET_EPOCH_EXIT(et);
 		return (retval);
 	}
 
 	IN6_MULTI_LOCK();
 	IN6_MULTI_LIST_LOCK();
 	CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
 		inm = in6m_ifmultiaddr_get_inm(ifma);
 		if (inm == NULL)
 			continue;
 		if (!IN6_ARE_ADDR_EQUAL(&inm->in6m_addr, &mcaddr))
 			continue;
 		fmode = inm->in6m_st[1].iss_fmode;
 		retval = SYSCTL_OUT(req, &fmode, sizeof(uint32_t));
 		if (retval != 0)
 			break;
 		RB_FOREACH(ims, ip6_msource_tree, &inm->in6m_srcs) {
 			CTR2(KTR_MLD, "%s: visit node %p", __func__, ims);
 			/*
 			 * Only copy-out sources which are in-mode.
 			 */
 			if (fmode != im6s_get_mode(inm, ims, 1)) {
 				CTR1(KTR_MLD, "%s: skip non-in-mode",
 				    __func__);
 				continue;
 			}
 			src = ims->im6s_addr;
 			retval = SYSCTL_OUT(req, &src,
 			    sizeof(struct in6_addr));
 			if (retval != 0)
 				break;
 		}
 	}
 	IN6_MULTI_LIST_UNLOCK();
 	IN6_MULTI_UNLOCK();
 	NET_EPOCH_EXIT(et);
 
 	return (retval);
 }
 
 #ifdef KTR
 
 static const char *in6m_modestrs[] = { "un", "in", "ex" };
 
 static const char *
 in6m_mode_str(const int mode)
 {
 
 	if (mode >= MCAST_UNDEFINED && mode <= MCAST_EXCLUDE)
 		return (in6m_modestrs[mode]);
 	return ("??");
 }
 
 static const char *in6m_statestrs[] = {
 	"not-member",
 	"silent",
 	"idle",
 	"lazy",
 	"sleeping",
 	"awakening",
 	"query-pending",
 	"sg-query-pending",
 	"leaving"
 };
 
 static const char *
 in6m_state_str(const int state)
 {
 
 	if (state >= MLD_NOT_MEMBER && state <= MLD_LEAVING_MEMBER)
 		return (in6m_statestrs[state]);
 	return ("??");
 }
 
 /*
  * Dump an in6_multi structure to the console.
  */
 void
 in6m_print(const struct in6_multi *inm)
 {
 	int t;
 	char ip6tbuf[INET6_ADDRSTRLEN];
 
 	if ((ktr_mask & KTR_MLD) == 0)
 		return;
 
 	printf("%s: --- begin in6m %p ---\n", __func__, inm);
 	printf("addr %s ifp %p(%s) ifma %p\n",
 	    ip6_sprintf(ip6tbuf, &inm->in6m_addr),
 	    inm->in6m_ifp,
 	    if_name(inm->in6m_ifp),
 	    inm->in6m_ifma);
 	printf("timer %u state %s refcount %u scq.len %u\n",
 	    inm->in6m_timer,
 	    in6m_state_str(inm->in6m_state),
 	    inm->in6m_refcount,
 	    mbufq_len(&inm->in6m_scq));
 	printf("mli %p nsrc %lu sctimer %u scrv %u\n",
 	    inm->in6m_mli,
 	    inm->in6m_nsrc,
 	    inm->in6m_sctimer,
 	    inm->in6m_scrv);
 	for (t = 0; t < 2; t++) {
 		printf("t%d: fmode %s asm %u ex %u in %u rec %u\n", t,
 		    in6m_mode_str(inm->in6m_st[t].iss_fmode),
 		    inm->in6m_st[t].iss_asm,
 		    inm->in6m_st[t].iss_ex,
 		    inm->in6m_st[t].iss_in,
 		    inm->in6m_st[t].iss_rec);
 	}
 	printf("%s: --- end in6m %p ---\n", __func__, inm);
 }
 
 #else /* !KTR */
 
 void
 in6m_print(const struct in6_multi *inm)
 {
 
 }
 
 #endif /* KTR */
diff --git a/sys/sys/socket.h b/sys/sys/socket.h
index 3ec0d3b1d06d..f81aba8f972d 100644
--- a/sys/sys/socket.h
+++ b/sys/sys/socket.h
@@ -1,753 +1,755 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1982, 1985, 1986, 1988, 1993, 1994
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)socket.h	8.4 (Berkeley) 2/21/94
  * $FreeBSD$
  */
 
 #ifndef _SYS_SOCKET_H_
 #define	_SYS_SOCKET_H_
 
 #include <sys/cdefs.h>
 #include <sys/_types.h>
 #include <sys/_iovec.h>
 #include <machine/_align.h>
 
 /*
  * Definitions related to sockets: types, address families, options.
  */
 
 /*
  * Data types.
  */
 #if __BSD_VISIBLE
 #ifndef _GID_T_DECLARED
 typedef	__gid_t		gid_t;
 #define	_GID_T_DECLARED
 #endif
 
 #ifndef _OFF_T_DECLARED
 typedef	__off_t		off_t;
 #define	_OFF_T_DECLARED
 #endif
 
 #ifndef _PID_T_DECLARED
 typedef	__pid_t		pid_t;
 #define	_PID_T_DECLARED
 #endif
 #endif
 
 #ifndef _SA_FAMILY_T_DECLARED
 typedef	__sa_family_t	sa_family_t;
 #define	_SA_FAMILY_T_DECLARED
 #endif
 
 #ifndef _SOCKLEN_T_DECLARED
 typedef	__socklen_t	socklen_t;
 #define	_SOCKLEN_T_DECLARED
 #endif
 
 #ifndef _SSIZE_T_DECLARED
 typedef	__ssize_t	ssize_t;
 #define	_SSIZE_T_DECLARED
 #endif
 
 #if __BSD_VISIBLE 
 #ifndef _UID_T_DECLARED
 typedef	__uid_t		uid_t;
 #define	_UID_T_DECLARED
 #endif
 #endif
 
 #ifndef _UINT32_T_DECLARED
 typedef	__uint32_t	uint32_t;
 #define	_UINT32_T_DECLARED
 #endif
 
 #ifndef _UINTPTR_T_DECLARED
 typedef	__uintptr_t	uintptr_t;
 #define	_UINTPTR_T_DECLARED
 #endif
 
 /*
  * Types
  */
 #define	SOCK_STREAM	1		/* stream socket */
 #define	SOCK_DGRAM	2		/* datagram socket */
 #define	SOCK_RAW	3		/* raw-protocol interface */
 #if __BSD_VISIBLE
 #define	SOCK_RDM	4		/* reliably-delivered message */
 #endif
 #define	SOCK_SEQPACKET	5		/* sequenced packet stream */
 
 #if __BSD_VISIBLE
 /*
  * Creation flags, OR'ed into socket() and socketpair() type argument.
  */
 #define	SOCK_CLOEXEC	0x10000000
 #define	SOCK_NONBLOCK	0x20000000
 #ifdef _KERNEL
 /*
  * Flags for accept1(), kern_accept4() and solisten_dequeue, in addition
  * to SOCK_CLOEXEC and SOCK_NONBLOCK.
  */
 #define ACCEPT4_INHERIT 0x1
 #define ACCEPT4_COMPAT  0x2
 #endif	/* _KERNEL */
 #endif	/* __BSD_VISIBLE */
 
 /*
  * Option flags per-socket.
  */
 #define	SO_DEBUG	0x00000001	/* turn on debugging info recording */
 #define	SO_ACCEPTCONN	0x00000002	/* socket has had listen() */
 #define	SO_REUSEADDR	0x00000004	/* allow local address reuse */
 #define	SO_KEEPALIVE	0x00000008	/* keep connections alive */
 #define	SO_DONTROUTE	0x00000010	/* just use interface addresses */
 #define	SO_BROADCAST	0x00000020	/* permit sending of broadcast msgs */
 #if __BSD_VISIBLE
 #define	SO_USELOOPBACK	0x00000040	/* bypass hardware when possible */
 #endif
 #define	SO_LINGER	0x00000080	/* linger on close if data present */
 #define	SO_OOBINLINE	0x00000100	/* leave received OOB data in line */
 #if __BSD_VISIBLE
 #define	SO_REUSEPORT	0x00000200	/* allow local address & port reuse */
 #define	SO_TIMESTAMP	0x00000400	/* timestamp received dgram traffic */
 #define	SO_NOSIGPIPE	0x00000800	/* no SIGPIPE from EPIPE */
 #define	SO_ACCEPTFILTER	0x00001000	/* there is an accept filter */
 #define	SO_BINTIME	0x00002000	/* timestamp received dgram traffic */
 #endif
 #define	SO_NO_OFFLOAD	0x00004000	/* socket cannot be offloaded */
 #define	SO_NO_DDP	0x00008000	/* disable direct data placement */
 #define	SO_REUSEPORT_LB	0x00010000	/* reuse with load balancing */
 #define	SO_RERROR	0x00020000	/* keep track of receive errors */
 
 /*
  * Additional options, not kept in so_options.
  */
 #define	SO_SNDBUF	0x1001		/* send buffer size */
 #define	SO_RCVBUF	0x1002		/* receive buffer size */
 #define	SO_SNDLOWAT	0x1003		/* send low-water mark */
 #define	SO_RCVLOWAT	0x1004		/* receive low-water mark */
 #define	SO_SNDTIMEO	0x1005		/* send timeout */
 #define	SO_RCVTIMEO	0x1006		/* receive timeout */
 #define	SO_ERROR	0x1007		/* get error status and clear */
 #define	SO_TYPE		0x1008		/* get socket type */
 #if __BSD_VISIBLE
 #define	SO_LABEL	0x1009		/* socket's MAC label */
 #define	SO_PEERLABEL	0x1010		/* socket's peer's MAC label */
 #define	SO_LISTENQLIMIT	0x1011		/* socket's backlog limit */
 #define	SO_LISTENQLEN	0x1012		/* socket's complete queue length */
 #define	SO_LISTENINCQLEN	0x1013	/* socket's incomplete queue length */
 #define	SO_SETFIB	0x1014		/* use this FIB to route */
 #define	SO_USER_COOKIE	0x1015		/* user cookie (dummynet etc.) */
 #define	SO_PROTOCOL	0x1016		/* get socket protocol (Linux name) */
 #define	SO_PROTOTYPE	SO_PROTOCOL	/* alias for SO_PROTOCOL (SunOS name) */
 #define	SO_TS_CLOCK	0x1017		/* clock type used for SO_TIMESTAMP */
 #define	SO_MAX_PACING_RATE	0x1018	/* socket's max TX pacing rate (Linux name) */
 #define	SO_DOMAIN	0x1019		/* get socket domain */
 #endif
 
 #if __BSD_VISIBLE
 #define	SO_TS_REALTIME_MICRO	0	/* microsecond resolution, realtime */
 #define	SO_TS_BINTIME		1	/* sub-nanosecond resolution, realtime */
 #define	SO_TS_REALTIME		2	/* nanosecond resolution, realtime */
 #define	SO_TS_MONOTONIC		3	/* nanosecond resolution, monotonic */
 #define	SO_TS_DEFAULT		SO_TS_REALTIME_MICRO
 #define	SO_TS_CLOCK_MAX		SO_TS_MONOTONIC
 #endif
 
 /*
  * Space reserved for new socket options added by third-party vendors.
  * This range applies to all socket option levels.  New socket options
  * in FreeBSD should always use an option value less than SO_VENDOR.
  */
 #if __BSD_VISIBLE
 #define	SO_VENDOR	0x80000000
 #endif
 
 /*
  * Structure used for manipulating linger option.
  */
 struct linger {
 	int	l_onoff;		/* option on/off */
 	int	l_linger;		/* linger time */
 };
 
 #if __BSD_VISIBLE
 struct accept_filter_arg {
 	char	af_name[16];
 	char	af_arg[256-16];
 };
 #endif
 
 /*
  * Level number for (get/set)sockopt() to apply to socket itself.
  */
 #define	SOL_SOCKET	0xffff		/* options for socket level */
 
 /*
  * Address families.
  */
 #define	AF_UNSPEC	0		/* unspecified */
 #if __BSD_VISIBLE
 #define	AF_LOCAL	AF_UNIX		/* local to host (pipes, portals) */
 #endif
 #define	AF_UNIX		1		/* standardized name for AF_LOCAL */
 #define	AF_INET		2		/* internetwork: UDP, TCP, etc. */
 #if __BSD_VISIBLE
 #define	AF_IMPLINK	3		/* arpanet imp addresses */
 #define	AF_PUP		4		/* pup protocols: e.g. BSP */
 #define	AF_CHAOS	5		/* mit CHAOS protocols */
 #define	AF_NETBIOS	6		/* SMB protocols */
 #define	AF_ISO		7		/* ISO protocols */
 #define	AF_OSI		AF_ISO
 #define	AF_ECMA		8		/* European computer manufacturers */
 #define	AF_DATAKIT	9		/* datakit protocols */
 #define	AF_CCITT	10		/* CCITT protocols, X.25 etc */
 #define	AF_SNA		11		/* IBM SNA */
 #define AF_DECnet	12		/* DECnet */
 #define AF_DLI		13		/* DEC Direct data link interface */
 #define AF_LAT		14		/* LAT */
 #define	AF_HYLINK	15		/* NSC Hyperchannel */
 #define	AF_APPLETALK	16		/* Apple Talk */
 #define	AF_ROUTE	17		/* Internal Routing Protocol */
 #define	AF_LINK		18		/* Link layer interface */
 #define	pseudo_AF_XTP	19		/* eXpress Transfer Protocol (no AF) */
 #define	AF_COIP		20		/* connection-oriented IP, aka ST II */
 #define	AF_CNT		21		/* Computer Network Technology */
 #define pseudo_AF_RTIP	22		/* Help Identify RTIP packets */
 #define	AF_IPX		23		/* Novell Internet Protocol */
 #define	AF_SIP		24		/* Simple Internet Protocol */
 #define	pseudo_AF_PIP	25		/* Help Identify PIP packets */
 #define	AF_ISDN		26		/* Integrated Services Digital Network*/
 #define	AF_E164		AF_ISDN		/* CCITT E.164 recommendation */
 #define	pseudo_AF_KEY	27		/* Internal key-management function */
 #endif
 #define	AF_INET6	28		/* IPv6 */
 #if __BSD_VISIBLE
 #define	AF_NATM		29		/* native ATM access */
 #define	AF_ATM		30		/* ATM */
 #define pseudo_AF_HDRCMPLT 31		/* Used by BPF to not rewrite headers
 					 * in interface output routine
 					 */
 #define	AF_NETGRAPH	32		/* Netgraph sockets */
 #define	AF_SLOW		33		/* 802.3ad slow protocol */
 #define	AF_SCLUSTER	34		/* Sitara cluster protocol */
 #define	AF_ARP		35
 #define	AF_BLUETOOTH	36		/* Bluetooth sockets */
 #define	AF_IEEE80211	37		/* IEEE 802.11 protocol */
 #define	AF_NETLINK	38		/* Netlink protocol */
 #define	AF_INET_SDP	40		/* OFED Socket Direct Protocol ipv4 */
 #define	AF_INET6_SDP	42		/* OFED Socket Direct Protocol ipv6 */
 #define	AF_HYPERV	43		/* HyperV sockets */
-#define	AF_MAX		43
+#define	AF_DIVERT	44		/* divert(4) */
+#define	AF_MAX		44
 /*
  * When allocating a new AF_ constant, please only allocate
  * even numbered constants for FreeBSD until 134 as odd numbered AF_
  * constants 39-133 are now reserved for vendors.
  */
 #define AF_VENDOR00 39
 #define AF_VENDOR01 41
 #define AF_VENDOR03 45
 #define AF_VENDOR04 47
 #define AF_VENDOR05 49
 #define AF_VENDOR06 51
 #define AF_VENDOR07 53
 #define AF_VENDOR08 55
 #define AF_VENDOR09 57
 #define AF_VENDOR10 59
 #define AF_VENDOR11 61
 #define AF_VENDOR12 63
 #define AF_VENDOR13 65
 #define AF_VENDOR14 67
 #define AF_VENDOR15 69
 #define AF_VENDOR16 71
 #define AF_VENDOR17 73
 #define AF_VENDOR18 75
 #define AF_VENDOR19 77
 #define AF_VENDOR20 79
 #define AF_VENDOR21 81
 #define AF_VENDOR22 83
 #define AF_VENDOR23 85
 #define AF_VENDOR24 87
 #define AF_VENDOR25 89
 #define AF_VENDOR26 91
 #define AF_VENDOR27 93
 #define AF_VENDOR28 95
 #define AF_VENDOR29 97
 #define AF_VENDOR30 99
 #define AF_VENDOR31 101
 #define AF_VENDOR32 103
 #define AF_VENDOR33 105
 #define AF_VENDOR34 107
 #define AF_VENDOR35 109
 #define AF_VENDOR36 111
 #define AF_VENDOR37 113
 #define AF_VENDOR38 115
 #define AF_VENDOR39 117
 #define AF_VENDOR40 119
 #define AF_VENDOR41 121
 #define AF_VENDOR42 123
 #define AF_VENDOR43 125
 #define AF_VENDOR44 127
 #define AF_VENDOR45 129
 #define AF_VENDOR46 131
 #define AF_VENDOR47 133
 #endif
 
 /*
  * Structure used by kernel to store most
  * addresses.
  */
 struct sockaddr {
 	unsigned char	sa_len;		/* total length */
 	sa_family_t	sa_family;	/* address family */
 	char		sa_data[14];	/* actually longer; address value */
 };
 #if __BSD_VISIBLE
 #define	SOCK_MAXADDRLEN	255		/* longest possible addresses */
 
 /*
  * Structure used by kernel to pass protocol
  * information in raw sockets.
  */
 struct sockproto {
 	unsigned short	sp_family;		/* address family */
 	unsigned short	sp_protocol;		/* protocol */
 };
 #endif
 
 #include <sys/_sockaddr_storage.h>
 
 #if __BSD_VISIBLE
 /*
  * Protocol families, same as address families for now.
  */
 #define	PF_UNSPEC	AF_UNSPEC
 #define	PF_LOCAL	AF_LOCAL
 #define	PF_UNIX		PF_LOCAL	/* backward compatibility */
 #define	PF_INET		AF_INET
 #define	PF_IMPLINK	AF_IMPLINK
 #define	PF_PUP		AF_PUP
 #define	PF_CHAOS	AF_CHAOS
 #define	PF_NETBIOS	AF_NETBIOS
 #define	PF_ISO		AF_ISO
 #define	PF_OSI		AF_ISO
 #define	PF_ECMA		AF_ECMA
 #define	PF_DATAKIT	AF_DATAKIT
 #define	PF_CCITT	AF_CCITT
 #define	PF_SNA		AF_SNA
 #define PF_DECnet	AF_DECnet
 #define PF_DLI		AF_DLI
 #define PF_LAT		AF_LAT
 #define	PF_HYLINK	AF_HYLINK
 #define	PF_APPLETALK	AF_APPLETALK
 #define	PF_ROUTE	AF_ROUTE
 #define	PF_LINK		AF_LINK
 #define	PF_XTP		pseudo_AF_XTP	/* really just proto family, no AF */
 #define	PF_COIP		AF_COIP
 #define	PF_CNT		AF_CNT
 #define	PF_SIP		AF_SIP
 #define	PF_IPX		AF_IPX
 #define PF_RTIP		pseudo_AF_RTIP	/* same format as AF_INET */
 #define PF_PIP		pseudo_AF_PIP
 #define	PF_ISDN		AF_ISDN
 #define	PF_KEY		pseudo_AF_KEY
 #define	PF_INET6	AF_INET6
 #define	PF_NATM		AF_NATM
 #define	PF_ATM		AF_ATM
 #define	PF_NETGRAPH	AF_NETGRAPH
 #define	PF_SLOW		AF_SLOW
 #define PF_SCLUSTER	AF_SCLUSTER
 #define	PF_ARP		AF_ARP
 #define	PF_BLUETOOTH	AF_BLUETOOTH
 #define	PF_IEEE80211	AF_IEEE80211
 #define	PF_NETLINK	AF_NETLINK
 #define	PF_INET_SDP	AF_INET_SDP
 #define	PF_INET6_SDP	AF_INET6_SDP
+#define	PF_DIVERT	AF_DIVERT
 
 #define	PF_MAX		AF_MAX
 
 /*
  * Definitions for network related sysctl, CTL_NET.
  *
  * Second level is protocol family.
  * Third level is protocol number.
  *
  * Further levels are defined by the individual families.
  */
 
 /*
  * PF_ROUTE - Routing table
  *
  * Three additional levels are defined:
  *	Fourth: address family, 0 is wildcard
  *	Fifth: type of info, defined below
  *	Sixth: flag(s) to mask with for NET_RT_FLAGS
  */
 #define NET_RT_DUMP	1		/* dump; may limit to a.f. */
 #define NET_RT_FLAGS	2		/* by flags, e.g. RESOLVING */
 #define NET_RT_IFLIST	3		/* survey interface list */
 #define	NET_RT_IFMALIST	4		/* return multicast address list */
 #define	NET_RT_IFLISTL	5		/* Survey interface list, using 'l'en
 					 * versions of msghdr structs. */
 #define NET_RT_NHOP	6		/* dump routing nexthops */
 #define NET_RT_NHGRP	7		/* dump routing nexthop groups */
 #endif /* __BSD_VISIBLE */
 
 /*
  * Maximum queue length specifiable by listen.
  */
 #define	SOMAXCONN	128
 
 /*
  * Message header for recvmsg and sendmsg calls.
  * Used value-result for recvmsg, value only for sendmsg.
  */
 struct msghdr {
 	void		*msg_name;		/* optional address */
 	socklen_t	 msg_namelen;		/* size of address */
 	struct iovec	*msg_iov;		/* scatter/gather array */
 	int		 msg_iovlen;		/* # elements in msg_iov */
 	void		*msg_control;		/* ancillary data, see below */
 	socklen_t	 msg_controllen;	/* ancillary data buffer len */
 	int		 msg_flags;		/* flags on received message */
 };
 
 #define	MSG_OOB		 0x00000001	/* process out-of-band data */
 #define	MSG_PEEK	 0x00000002	/* peek at incoming message */
 #define	MSG_DONTROUTE	 0x00000004	/* send without using routing tables */
 #define	MSG_EOR		 0x00000008	/* data completes record */
 #define	MSG_TRUNC	 0x00000010	/* data discarded before delivery */
 #define	MSG_CTRUNC	 0x00000020	/* control data lost before delivery */
 #define	MSG_WAITALL	 0x00000040	/* wait for full request or error */
 #if __BSD_VISIBLE
 #define	MSG_DONTWAIT	 0x00000080	/* this message should be nonblocking */
 #define	MSG_EOF		 0x00000100	/* data completes connection */
 /*			 0x00000200	   unused */
 /*			 0x00000400	   unused */
 /*			 0x00000800	   unused */
 /*			 0x00001000	   unused */
 #define	MSG_NOTIFICATION 0x00002000	/* SCTP notification */
 #define	MSG_NBIO	 0x00004000	/* FIONBIO mode, used by fifofs */
 #define	MSG_COMPAT       0x00008000		/* used in sendit() */
 #endif
 #ifdef _KERNEL
 #define	MSG_SOCALLBCK    0x00010000	/* for use by socket callbacks - soreceive (TCP) */
 #endif
 #if __POSIX_VISIBLE >= 200809
 #define	MSG_NOSIGNAL	 0x00020000	/* do not generate SIGPIPE on EOF */
 #endif
 #if __BSD_VISIBLE
 #define	MSG_CMSG_CLOEXEC 0x00040000	/* make received fds close-on-exec */
 #define	MSG_WAITFORONE	 0x00080000	/* for recvmmsg() */
 #endif
 #ifdef _KERNEL
 #define	MSG_MORETOCOME	 0x00100000	/* additional data pending */
 #define	MSG_TLSAPPDATA	 0x00200000	/* do not soreceive() alert rec. (TLS) */
 #endif
 
 /*
  * Header for ancillary data objects in msg_control buffer.
  * Used for additional information with/about a datagram
  * not expressible by flags.  The format is a sequence
  * of message elements headed by cmsghdr structures.
  */
 struct cmsghdr {
 	socklen_t	cmsg_len;		/* data byte count, including hdr */
 	int		cmsg_level;		/* originating protocol */
 	int		cmsg_type;		/* protocol-specific type */
 /* followed by	u_char  cmsg_data[]; */
 };
 
 #if __BSD_VISIBLE
 /*
  * While we may have more groups than this, the cmsgcred struct must
  * be able to fit in an mbuf and we have historically supported a
  * maximum of 16 groups.
 */
 #define CMGROUP_MAX 16
 
 /*
  * Credentials structure, used to verify the identity of a peer
  * process that has sent us a message. This is allocated by the
  * peer process but filled in by the kernel. This prevents the
  * peer from lying about its identity. (Note that cmcred_groups[0]
  * is the effective GID.)
  */
 struct cmsgcred {
 	pid_t	cmcred_pid;		/* PID of sending process */
 	uid_t	cmcred_uid;		/* real UID of sending process */
 	uid_t	cmcred_euid;		/* effective UID of sending process */
 	gid_t	cmcred_gid;		/* real GID of sending process */
 	short	cmcred_ngroups;		/* number or groups */
 	gid_t	cmcred_groups[CMGROUP_MAX];	/* groups */
 };
 
 /*
  * Socket credentials (LOCAL_CREDS).
  */
 struct sockcred {
 	uid_t	sc_uid;			/* real user id */
 	uid_t	sc_euid;		/* effective user id */
 	gid_t	sc_gid;			/* real group id */
 	gid_t	sc_egid;		/* effective group id */
 	int	sc_ngroups;		/* number of supplemental groups */
 	gid_t	sc_groups[1];		/* variable length */
 };
 
 /*
  * Compute size of a sockcred structure with groups.
  */
 #define	SOCKCREDSIZE(ngrps) \
 	(sizeof(struct sockcred) + (sizeof(gid_t) * ((ngrps) - 1)))
 
 /*
  * Socket credentials (LOCAL_CREDS_PERSISTENT).
  */
 struct sockcred2 {
 	int	sc_version;		/* version of this structure */
 	pid_t	sc_pid;			/* PID of sending process */
 	uid_t	sc_uid;			/* real user id */
 	uid_t	sc_euid;		/* effective user id */
 	gid_t	sc_gid;			/* real group id */
 	gid_t	sc_egid;		/* effective group id */
 	int	sc_ngroups;		/* number of supplemental groups */
 	gid_t	sc_groups[1];		/* variable length */
 };
 #define	SOCKCRED2SIZE(ngrps) \
 	(sizeof(struct sockcred2) + (sizeof(gid_t) * ((ngrps) - 1)))
 
 #endif /* __BSD_VISIBLE */
 
 /* given pointer to struct cmsghdr, return pointer to data */
 #define	CMSG_DATA(cmsg)		((unsigned char *)(cmsg) + \
 				 _ALIGN(sizeof(struct cmsghdr)))
 
 /* given pointer to struct cmsghdr, return pointer to next cmsghdr */
 #define	CMSG_NXTHDR(mhdr, cmsg)	\
 	((char *)(cmsg) == (char *)0 ? CMSG_FIRSTHDR(mhdr) : \
 	    ((char *)(cmsg) + _ALIGN(((struct cmsghdr *)(cmsg))->cmsg_len) + \
 	  _ALIGN(sizeof(struct cmsghdr)) > \
 	    (char *)(mhdr)->msg_control + (mhdr)->msg_controllen) ? \
 	    (struct cmsghdr *)0 : \
 	    (struct cmsghdr *)(void *)((char *)(cmsg) + \
 	    _ALIGN(((struct cmsghdr *)(cmsg))->cmsg_len)))
 
 /*
  * RFC 2292 requires to check msg_controllen, in case that the kernel returns
  * an empty list for some reasons.
  */
 #define	CMSG_FIRSTHDR(mhdr) \
 	((mhdr)->msg_controllen >= sizeof(struct cmsghdr) ? \
 	 (struct cmsghdr *)(mhdr)->msg_control : \
 	 (struct cmsghdr *)0)
 
 #if __BSD_VISIBLE
 /* RFC 2292 additions */
 #define	CMSG_SPACE(l)		(_ALIGN(sizeof(struct cmsghdr)) + _ALIGN(l))
 #define	CMSG_LEN(l)		(_ALIGN(sizeof(struct cmsghdr)) + (l))
 #endif
 
 #ifdef _KERNEL
 #define	CMSG_ALIGN(n)	_ALIGN(n)
 #endif
 
 /* "Socket"-level control message types: */
 #define	SCM_RIGHTS	0x01		/* access rights (array of int) */
 #if __BSD_VISIBLE
 #define	SCM_TIMESTAMP	0x02		/* timestamp (struct timeval) */
 #define	SCM_CREDS	0x03		/* process creds (struct cmsgcred) */
 #define	SCM_BINTIME	0x04		/* timestamp (struct bintime) */
 #define	SCM_REALTIME	0x05		/* timestamp (struct timespec) */
 #define	SCM_MONOTONIC	0x06		/* timestamp (struct timespec) */
 #define	SCM_TIME_INFO	0x07		/* timestamp info */
 #define	SCM_CREDS2	0x08		/* process creds (struct sockcred2) */
 
 struct sock_timestamp_info {
 	__uint32_t	st_info_flags;
 	__uint32_t	st_info_pad0;
 	__uint64_t	st_info_rsv[7];
 };
 
 #define	ST_INFO_HW		0x0001		/* SCM_TIMESTAMP was hw */
 #define	ST_INFO_HW_HPREC	0x0002		/* SCM_TIMESTAMP was hw-assisted
 						   on entrance */
 #endif
 
 #if __BSD_VISIBLE
 /*
  * 4.3 compat sockaddr, move to compat file later
  */
 struct osockaddr {
 	unsigned short sa_family;	/* address family */
 	char	sa_data[14];		/* up to 14 bytes of direct address */
 };
 
 /*
  * 4.3-compat message header (move to compat file later).
  */
 struct omsghdr {
 	char	*msg_name;		/* optional address */
 	int	msg_namelen;		/* size of address */
 	struct	iovec *msg_iov;		/* scatter/gather array */
 	int	msg_iovlen;		/* # elements in msg_iov */
 	char	*msg_accrights;		/* access rights sent/received */
 	int	msg_accrightslen;
 };
 #endif
 
 /*
  * howto arguments for shutdown(2), specified by Posix.1g.
  */
 #define	SHUT_RD		0		/* shut down the reading side */
 #define	SHUT_WR		1		/* shut down the writing side */
 #define	SHUT_RDWR	2		/* shut down both sides */
 
 #if __BSD_VISIBLE
 /* for SCTP */
 /* we cheat and use the SHUT_XX defines for these */
 #define PRU_FLUSH_RD     SHUT_RD
 #define PRU_FLUSH_WR     SHUT_WR
 #define PRU_FLUSH_RDWR   SHUT_RDWR
 #endif
 
 #if __BSD_VISIBLE
 /*
  * sendfile(2) header/trailer struct
  */
 struct sf_hdtr {
 	struct iovec *headers;	/* pointer to an array of header struct iovec's */
 	int hdr_cnt;		/* number of header iovec's */
 	struct iovec *trailers;	/* pointer to an array of trailer struct iovec's */
 	int trl_cnt;		/* number of trailer iovec's */
 };
 
 /*
  * Sendfile-specific flag(s)
  */
 #define	SF_NODISKIO     0x00000001
 #define	SF_MNOWAIT	0x00000002	/* obsolete */
 #define	SF_SYNC		0x00000004
 #define	SF_USER_READAHEAD	0x00000008
 #define	SF_NOCACHE	0x00000010
 #define	SF_FLAGS(rh, flags)	(((rh) << 16) | (flags))
 
 #ifdef _KERNEL
 #define	SF_READAHEAD(flags)	((flags) >> 16)
 #endif /* _KERNEL */
 
 /*
  * Sendmmsg/recvmmsg specific structure(s)
  */
 struct mmsghdr {
 	struct msghdr	msg_hdr;		/* message header */
 	ssize_t		msg_len;		/* message length */
 };
 #endif /* __BSD_VISIBLE */
 
 #ifndef	_KERNEL
 
 #include <sys/cdefs.h>
 
 __BEGIN_DECLS
 int	accept(int, struct sockaddr * __restrict, socklen_t * __restrict);
 int	bind(int, const struct sockaddr *, socklen_t);
 int	connect(int, const struct sockaddr *, socklen_t);
 #if __BSD_VISIBLE
 int	accept4(int, struct sockaddr * __restrict, socklen_t * __restrict, int);
 int	bindat(int, int, const struct sockaddr *, socklen_t);
 int	connectat(int, int, const struct sockaddr *, socklen_t);
 #endif
 int	getpeername(int, struct sockaddr * __restrict, socklen_t * __restrict);
 int	getsockname(int, struct sockaddr * __restrict, socklen_t * __restrict);
 int	getsockopt(int, int, int, void * __restrict, socklen_t * __restrict);
 int	listen(int, int);
 ssize_t	recv(int, void *, size_t, int);
 ssize_t	recvfrom(int, void *, size_t, int, struct sockaddr * __restrict, socklen_t * __restrict);
 ssize_t	recvmsg(int, struct msghdr *, int);
 #if __BSD_VISIBLE
 struct timespec;
 ssize_t	recvmmsg(int, struct mmsghdr * __restrict, size_t, int,
     const struct timespec * __restrict);
 #endif
 ssize_t	send(int, const void *, size_t, int);
 ssize_t	sendto(int, const void *,
 	    size_t, int, const struct sockaddr *, socklen_t);
 ssize_t	sendmsg(int, const struct msghdr *, int);
 #if __BSD_VISIBLE
 int	sendfile(int, int, off_t, size_t, struct sf_hdtr *, off_t *, int);
 ssize_t	sendmmsg(int, struct mmsghdr * __restrict, size_t, int);
 int	setfib(int);
 #endif
 int	setsockopt(int, int, int, const void *, socklen_t);
 int	shutdown(int, int);
 int	sockatmark(int);
 int	socket(int, int, int);
 int	socketpair(int, int, int, int *);
 __END_DECLS
 
 #endif /* !_KERNEL */
 
 #ifdef _KERNEL
 struct socket;
 
 struct tcpcb *so_sototcpcb(struct socket *so);
 struct inpcb *so_sotoinpcb(struct socket *so);
 struct sockbuf *so_sockbuf_snd(struct socket *);
 struct sockbuf *so_sockbuf_rcv(struct socket *);
 
 int so_state_get(const struct socket *);
 void so_state_set(struct socket *, int);
 
 int so_options_get(const struct socket *);
 void so_options_set(struct socket *, int);
 
 int so_error_get(const struct socket *);
 void so_error_set(struct socket *, int);
 
 int so_linger_get(const struct socket *);
 void so_linger_set(struct socket *, int);
 
 struct protosw *so_protosw_get(const struct socket *);
 void so_protosw_set(struct socket *, struct protosw *);
 
 void so_sorwakeup_locked(struct socket *so);
 void so_sowwakeup_locked(struct socket *so);
 
 void so_sorwakeup(struct socket *so);
 void so_sowwakeup(struct socket *so);
 
 void so_lock(struct socket *so);
 void so_unlock(struct socket *so);
 
 #endif /* _KERNEL */
 #endif /* !_SYS_SOCKET_H_ */
diff --git a/usr.bin/netstat/inet.c b/usr.bin/netstat/inet.c
index b7dbcb3531b0..e848874d1695 100644
--- a/usr.bin/netstat/inet.c
+++ b/usr.bin/netstat/inet.c
@@ -1,1516 +1,1515 @@
 /*-
  * Copyright (c) 1983, 1988, 1993, 1995
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #if 0
 #ifndef lint
 static char sccsid[] = "@(#)inet.c	8.5 (Berkeley) 5/24/95";
 #endif /* not lint */
 #endif
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/queue.h>
 #include <sys/domain.h>
 #include <sys/protosw.h>
 #include <sys/socket.h>
 #define	_WANT_SOCKET
 #include <sys/socketvar.h>
 #include <sys/sysctl.h>
 
 #include <net/route.h>
 #include <net/if_arp.h>
 #include <netinet/in.h>
 #include <netinet/in_systm.h>
 #include <netinet/ip.h>
 #include <netinet/ip_carp.h>
 #ifdef INET6
 #include <netinet/ip6.h>
 #endif /* INET6 */
 #include <netinet/in_pcb.h>
 #include <netinet/ip_icmp.h>
 #include <netinet/icmp_var.h>
 #include <netinet/igmp_var.h>
 #include <netinet/ip_var.h>
 #include <netinet/pim_var.h>
 #include <netinet/tcp.h>
 #include <netinet/tcpip.h>
 #include <netinet/tcp_seq.h>
 #define	TCPSTATES
 #include <netinet/tcp_fsm.h>
 #include <netinet/tcp_timer.h>
 #include <netinet/tcp_var.h>
 #include <netinet/udp.h>
 #include <netinet/udp_var.h>
 
 #include <arpa/inet.h>
 #include <err.h>
 #include <errno.h>
 #include <libutil.h>
 #include <netdb.h>
 #include <stdint.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <stdbool.h>
 #include <string.h>
 #include <unistd.h>
 #include <libxo/xo.h>
 #include "netstat.h"
 #include "nl_defs.h"
 
 #define max(a, b) (((a) > (b)) ? (a) : (b))
 
 #ifdef INET
 static void inetprint(const char *, struct in_addr *, int, const char *, int,
     const int);
 #endif
 #ifdef INET6
 static int udp_done, tcp_done, sdp_done;
 #endif /* INET6 */
 
 static int
 pcblist_sysctl(int proto, const char *name, char **bufp)
 {
 	const char *mibvar;
 	char *buf;
 	size_t len;
 
 	switch (proto) {
 	case IPPROTO_TCP:
 		mibvar = "net.inet.tcp.pcblist";
 		break;
 	case IPPROTO_UDP:
 		mibvar = "net.inet.udp.pcblist";
 		break;
-	case IPPROTO_DIVERT:
-		mibvar = "net.inet.divert.pcblist";
-		break;
 	default:
 		mibvar = "net.inet.raw.pcblist";
 		break;
 	}
 	if (strncmp(name, "sdp", 3) == 0)
 		mibvar = "net.inet.sdp.pcblist";
+	else if (strncmp(name, "divert", 6) == 0)
+		mibvar = "net.inet.divert.pcblist";
 	len = 0;
 	if (sysctlbyname(mibvar, 0, &len, 0, 0) < 0) {
 		if (errno != ENOENT)
 			xo_warn("sysctl: %s", mibvar);
 		return (0);
 	}
 	if ((buf = malloc(len)) == NULL) {
 		xo_warnx("malloc %lu bytes", (u_long)len);
 		return (0);
 	}
 	if (sysctlbyname(mibvar, buf, &len, 0, 0) < 0) {
 		xo_warn("sysctl: %s", mibvar);
 		free(buf);
 		return (0);
 	}
 	*bufp = buf;
 	return (1);
 }
 
 /*
  * Copied directly from uipc_socket2.c.  We leave out some fields that are in
  * nested structures that aren't used to avoid extra work.
  */
 static void
 sbtoxsockbuf(struct sockbuf *sb, struct xsockbuf *xsb)
 {
 	xsb->sb_cc = sb->sb_ccc;
 	xsb->sb_hiwat = sb->sb_hiwat;
 	xsb->sb_mbcnt = sb->sb_mbcnt;
 	xsb->sb_mbmax = sb->sb_mbmax;
 	xsb->sb_lowat = sb->sb_lowat;
 	xsb->sb_flags = sb->sb_flags;
 	xsb->sb_timeo = sb->sb_timeo;
 }
 
 int
 sotoxsocket(struct socket *so, struct xsocket *xso)
 {
 	struct protosw proto;
 	struct domain domain;
 
 	bzero(xso, sizeof *xso);
 	xso->xso_len = sizeof *xso;
 	xso->xso_so = (uintptr_t)so;
 	xso->so_type = so->so_type;
 	xso->so_options = so->so_options;
 	xso->so_linger = so->so_linger;
 	xso->so_state = so->so_state;
 	xso->so_pcb = (uintptr_t)so->so_pcb;
 	if (kread((uintptr_t)so->so_proto, &proto, sizeof(proto)) != 0)
 		return (-1);
 	xso->xso_protocol = proto.pr_protocol;
 	if (kread((uintptr_t)proto.pr_domain, &domain, sizeof(domain)) != 0)
 		return (-1);
 	xso->xso_family = domain.dom_family;
 	xso->so_timeo = so->so_timeo;
 	xso->so_error = so->so_error;
 	if ((so->so_options & SO_ACCEPTCONN) != 0) {
 		xso->so_qlen = so->sol_qlen;
 		xso->so_incqlen = so->sol_incqlen;
 		xso->so_qlimit = so->sol_qlimit;
 	} else {
 		sbtoxsockbuf(&so->so_snd, &xso->so_snd);
 		sbtoxsockbuf(&so->so_rcv, &xso->so_rcv);
 		xso->so_oobmark = so->so_oobmark;
 	}
 	return (0);
 }
 
 /*
  * Print a summary of connections related to an Internet
  * protocol.  For TCP, also give state of connection.
  * Listening processes (aflag) are suppressed unless the
  * -a (all) flag is specified.
  */
 void
 protopr(u_long off, const char *name, int af1, int proto)
 {
 	static int first = 1;
 	int istcp;
 	char *buf;
 	const char *vchar;
 	struct xtcpcb *tp;
 	struct xinpcb *inp;
 	struct xinpgen *xig, *oxig;
 	struct xsocket *so;
 	int fnamelen, cnamelen;
 
 	istcp = 0;
 	switch (proto) {
 	case IPPROTO_TCP:
 #ifdef INET6
 		if (strncmp(name, "sdp", 3) != 0) {
 			if (tcp_done != 0)
 				return;
 			else
 				tcp_done = 1;
 		} else {
 			if (sdp_done != 0)
 				return;
 			else
 				sdp_done = 1;
 		}
 #endif
 		istcp = 1;
 		break;
 	case IPPROTO_UDP:
 #ifdef INET6
 		if (udp_done != 0)
 			return;
 		else
 			udp_done = 1;
 #endif
 		break;
 	}
 
 	if (!pcblist_sysctl(proto, name, &buf))
 		return;
 
 	if (cflag || Cflag) {
 		fnamelen = strlen("Stack");
 		cnamelen = strlen("CC");
 		oxig = xig = (struct xinpgen *)buf;
 		for (xig = (struct xinpgen*)((char *)xig + xig->xig_len);
 		    xig->xig_len > sizeof(struct xinpgen);
 		    xig = (struct xinpgen *)((char *)xig + xig->xig_len)) {
 			if (istcp) {
 				tp = (struct xtcpcb *)xig;
 				inp = &tp->xt_inp;
 			} else {
 				continue;
 			}
 			if (so->xso_protocol != proto)
 				continue;
 			if (inp->inp_gencnt > oxig->xig_gen)
 				continue;
 			fnamelen = max(fnamelen, (int)strlen(tp->xt_stack));
 			cnamelen = max(cnamelen, (int)strlen(tp->xt_cc));
 		}
 	}
 
 	oxig = xig = (struct xinpgen *)buf;
 	for (xig = (struct xinpgen *)((char *)xig + xig->xig_len);
 	    xig->xig_len > sizeof(struct xinpgen);
 	    xig = (struct xinpgen *)((char *)xig + xig->xig_len)) {
 		if (istcp) {
 			tp = (struct xtcpcb *)xig;
 			inp = &tp->xt_inp;
 		} else {
 			inp = (struct xinpcb *)xig;
 		}
 		so = &inp->xi_socket;
 
 		/* Ignore sockets for protocols other than the desired one. */
-		if (so->xso_protocol != proto)
+		if (proto != 0 && so->xso_protocol != proto)
 			continue;
 
 		/* Ignore PCBs which were freed during copyout. */
 		if (inp->inp_gencnt > oxig->xig_gen)
 			continue;
 
 		if ((af1 == AF_INET && (inp->inp_vflag & INP_IPV4) == 0)
 #ifdef INET6
 		    || (af1 == AF_INET6 && (inp->inp_vflag & INP_IPV6) == 0)
 #endif /* INET6 */
 		    || (af1 == AF_UNSPEC && ((inp->inp_vflag & INP_IPV4) == 0
 #ifdef INET6
 					  && (inp->inp_vflag & INP_IPV6) == 0
 #endif /* INET6 */
 			))
 		    )
 			continue;
 		if (!aflag &&
 		    (
 		     (istcp && tp->t_state == TCPS_LISTEN)
 		     || (af1 == AF_INET &&
 		      inp->inp_laddr.s_addr == INADDR_ANY)
 #ifdef INET6
 		     || (af1 == AF_INET6 &&
 			 IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr))
 #endif /* INET6 */
 		     || (af1 == AF_UNSPEC &&
 			 (((inp->inp_vflag & INP_IPV4) != 0 &&
 			   inp->inp_laddr.s_addr == INADDR_ANY)
 #ifdef INET6
 			  || ((inp->inp_vflag & INP_IPV6) != 0 &&
 			      IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr))
 #endif
 			  ))
 		     ))
 			continue;
 
 		if (first) {
 			if (!Lflag) {
 				xo_emit("Active Internet connections");
 				if (aflag)
 					xo_emit(" (including servers)");
 			} else
 				xo_emit(
 	"Current listen queue sizes (qlen/incqlen/maxqlen)");
 			xo_emit("\n");
 			if (Aflag)
 				xo_emit("{T:/%-*s} ", 2 * (int)sizeof(void *),
 				    "Tcpcb");
 			if (Lflag)
 				xo_emit((Aflag && !Wflag) ?
 				    "{T:/%-5.5s} {T:/%-32.32s} {T:/%-18.18s}" :
 				    ((!Wflag || af1 == AF_INET) ?
 				    "{T:/%-5.5s} {T:/%-32.32s} {T:/%-22.22s}" :
 				    "{T:/%-5.5s} {T:/%-32.32s} {T:/%-45.45s}"),
 				    "Proto", "Listen", "Local Address");
 			else if (Tflag)
 				xo_emit((Aflag && !Wflag) ?
     "{T:/%-5.5s} {T:/%-6.6s} {T:/%-6.6s} {T:/%-6.6s} {T:/%-18.18s} {T:/%s}" :
 				    ((!Wflag || af1 == AF_INET) ?
     "{T:/%-5.5s} {T:/%-6.6s} {T:/%-6.6s} {T:/%-6.6s} {T:/%-22.22s} {T:/%s}" :
     "{T:/%-5.5s} {T:/%-6.6s} {T:/%-6.6s} {T:/%-6.6s} {T:/%-45.45s} {T:/%s}"),
 				    "Proto", "Rexmit", "OOORcv", "0-win",
 				    "Local Address", "Foreign Address");
 			else {
 				xo_emit((Aflag && !Wflag) ?
     "{T:/%-5.5s} {T:/%-6.6s} {T:/%-6.6s} {T:/%-18.18s} {T:/%-18.18s}" :
 				    ((!Wflag || af1 == AF_INET) ?
     "{T:/%-5.5s} {T:/%-6.6s} {T:/%-6.6s} {T:/%-22.22s} {T:/%-22.22s}" :
     "{T:/%-5.5s} {T:/%-6.6s} {T:/%-6.6s} {T:/%-45.45s} {T:/%-45.45s}"),
 				    "Proto", "Recv-Q", "Send-Q",
 				    "Local Address", "Foreign Address");
 				if (!xflag && !Rflag)
 					xo_emit(" {T:/%-11.11s}", "(state)");
 			}
 			if (xflag) {
 				xo_emit("{T:/%-6.6s} {T:/%-6.6s} "
 				    "{T:/%-6.6s} {T:/%-6.6s} {T:/%-6.6s} "
 				    "{T:/%-6.6s} {T:/%-6.6s} {T:/%-6.6s}",
 				    "R-HIWA", "S-HIWA", "R-LOWA", "S-LOWA",
 				    "R-BCNT", "S-BCNT", "R-BMAX", "S-BMAX");
 				xo_emit(" {T:/%7.7s} {T:/%7.7s} {T:/%7.7s} "
 				    "{T:/%7.7s} {T:/%7.7s} {T:/%7.7s}",
 				    "rexmt", "persist", "keep", "2msl",
 				    "delack", "rcvtime");
 			} else if (Rflag) {
 				xo_emit("  {T:/%8.8s} {T:/%5.5s}",
 				    "flowid", "ftype");
 			}
 			if (cflag) {
 				xo_emit(" {T:/%-*.*s}",
 					fnamelen, fnamelen, "Stack");
 			}
 			if (Cflag)
 				xo_emit(" {T:/%-*.*s} {T:/%10.10s}"
 					" {T:/%10.10s} {T:/%5.5s}"
 					" {T:/%3.3s}", cnamelen,
 					cnamelen, "CC",
 					"cwin",
 					"ssthresh",
 					"MSS",
 					"ECN");
 			if (Pflag)
 				xo_emit(" {T:/%s}", "Log ID");
 			xo_emit("\n");
 			first = 0;
 		}
 		if (Lflag && so->so_qlimit == 0)
 			continue;
 		xo_open_instance("socket");
 		if (Aflag) {
 			if (istcp)
 				xo_emit("{q:address/%*lx} ",
 				    2 * (int)sizeof(void *),
 				    (u_long)inp->inp_ppcb);
 			else
 				xo_emit("{q:address/%*lx} ",
 				    2 * (int)sizeof(void *),
 				    (u_long)so->so_pcb);
 		}
 #ifdef INET6
 		if ((inp->inp_vflag & INP_IPV6) != 0)
 			vchar = ((inp->inp_vflag & INP_IPV4) != 0) ?
 			    "46" : "6";
 		else
 #endif
 		vchar = ((inp->inp_vflag & INP_IPV4) != 0) ?
 		    "4" : "";
 		if (istcp && (tp->t_flags & TF_TOE) != 0)
 			xo_emit("{:protocol/%-3.3s%-2.2s/%s%s} ", "toe", vchar);
 		else
 			xo_emit("{:protocol/%-3.3s%-2.2s/%s%s} ", name, vchar);
 		if (Lflag) {
 			char buf1[33];
 
 			snprintf(buf1, sizeof buf1, "%u/%u/%u", so->so_qlen,
 			    so->so_incqlen, so->so_qlimit);
 			xo_emit("{:listen-queue-sizes/%-32.32s} ", buf1);
 		} else if (Tflag) {
 			if (istcp)
 				xo_emit("{:sent-retransmit-packets/%6u} "
 				    "{:received-out-of-order-packets/%6u} "
 				    "{:sent-zero-window/%6u} ",
 				    tp->t_sndrexmitpack, tp->t_rcvoopack,
 				    tp->t_sndzerowin);
 			else
 				xo_emit("{P:/%21s}", "");
 		} else {
 			xo_emit("{:receive-bytes-waiting/%6u} "
 			    "{:send-bytes-waiting/%6u} ",
 			    so->so_rcv.sb_cc, so->so_snd.sb_cc);
 		}
 		if (numeric_port) {
 #ifdef INET
 			if (inp->inp_vflag & INP_IPV4) {
 				inetprint("local", &inp->inp_laddr,
 				    (int)inp->inp_lport, name, 1, af1);
 				if (!Lflag)
 					inetprint("remote", &inp->inp_faddr,
 					    (int)inp->inp_fport, name, 1, af1);
 			}
 #endif
 #if defined(INET) && defined(INET6)
 			else
 #endif
 #ifdef INET6
 			if (inp->inp_vflag & INP_IPV6) {
 				inet6print("local", &inp->in6p_laddr,
 				    (int)inp->inp_lport, name, 1);
 				if (!Lflag)
 					inet6print("remote", &inp->in6p_faddr,
 					    (int)inp->inp_fport, name, 1);
 			} /* else nothing printed now */
 #endif /* INET6 */
 		} else if (inp->inp_flags & INP_ANONPORT) {
 #ifdef INET
 			if (inp->inp_vflag & INP_IPV4) {
 				inetprint("local", &inp->inp_laddr,
 				    (int)inp->inp_lport, name, 1, af1);
 				if (!Lflag)
 					inetprint("remote", &inp->inp_faddr,
 					    (int)inp->inp_fport, name, 0, af1);
 			}
 #endif
 #if defined(INET) && defined(INET6)
 			else
 #endif
 #ifdef INET6
 			if (inp->inp_vflag & INP_IPV6) {
 				inet6print("local", &inp->in6p_laddr,
 				    (int)inp->inp_lport, name, 1);
 				if (!Lflag)
 					inet6print("remote", &inp->in6p_faddr,
 					    (int)inp->inp_fport, name, 0);
 			} /* else nothing printed now */
 #endif /* INET6 */
 		} else {
 #ifdef INET
 			if (inp->inp_vflag & INP_IPV4) {
 				inetprint("local", &inp->inp_laddr,
 				    (int)inp->inp_lport, name, 0, af1);
 				if (!Lflag)
 					inetprint("remote", &inp->inp_faddr,
 					    (int)inp->inp_fport, name,
 					    inp->inp_lport != inp->inp_fport,
 					    af1);
 			}
 #endif
 #if defined(INET) && defined(INET6)
 			else
 #endif
 #ifdef INET6
 			if (inp->inp_vflag & INP_IPV6) {
 				inet6print("local", &inp->in6p_laddr,
 				    (int)inp->inp_lport, name, 0);
 				if (!Lflag)
 					inet6print("remote", &inp->in6p_faddr,
 					    (int)inp->inp_fport, name,
 					    inp->inp_lport != inp->inp_fport);
 			} /* else nothing printed now */
 #endif /* INET6 */
 		}
 		if (xflag) {
 			xo_emit("{:receive-high-water/%6u} "
 			    "{:send-high-water/%6u} "
 			    "{:receive-low-water/%6u} {:send-low-water/%6u} "
 			    "{:receive-mbuf-bytes/%6u} {:send-mbuf-bytes/%6u} "
 			    "{:receive-mbuf-bytes-max/%6u} "
 			    "{:send-mbuf-bytes-max/%6u}",
 			    so->so_rcv.sb_hiwat, so->so_snd.sb_hiwat,
 			    so->so_rcv.sb_lowat, so->so_snd.sb_lowat,
 			    so->so_rcv.sb_mbcnt, so->so_snd.sb_mbcnt,
 			    so->so_rcv.sb_mbmax, so->so_snd.sb_mbmax);
 			if (istcp)
 				xo_emit(" {:retransmit-timer/%4d.%02d} "
 				    "{:persist-timer/%4d.%02d} "
 				    "{:keepalive-timer/%4d.%02d} "
 				    "{:msl2-timer/%4d.%02d} "
 				    "{:delay-ack-timer/%4d.%02d} "
 				    "{:inactivity-timer/%4d.%02d}",
 				    tp->tt_rexmt / 1000,
 				    (tp->tt_rexmt % 1000) / 10,
 				    tp->tt_persist / 1000,
 				    (tp->tt_persist % 1000) / 10,
 				    tp->tt_keep / 1000,
 				    (tp->tt_keep % 1000) / 10,
 				    tp->tt_2msl / 1000,
 				    (tp->tt_2msl % 1000) / 10,
 				    tp->tt_delack / 1000,
 				    (tp->tt_delack % 1000) / 10,
 				    tp->t_rcvtime / 1000,
 				    (tp->t_rcvtime % 1000) / 10);
 		}
 		if (istcp && !Lflag && !xflag && !Tflag && !Rflag) {
 			if (tp->t_state < 0 || tp->t_state >= TCP_NSTATES)
 				xo_emit("{:tcp-state/%-11d}", tp->t_state);
 			else {
 				xo_emit("{:tcp-state/%-11s}",
 				    tcpstates[tp->t_state]);
 #if defined(TF_NEEDSYN) && defined(TF_NEEDFIN)
 				/* Show T/TCP `hidden state' */
 				if (tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN))
 					xo_emit("{:need-syn-or-fin/*}");
 #endif /* defined(TF_NEEDSYN) && defined(TF_NEEDFIN) */
 			}
 		}
 		if (Rflag) {
 			/* XXX: is this right Alfred */
 			xo_emit(" {:flow-id/%08x} {:flow-type/%5d}",
 			    inp->inp_flowid,
 			    inp->inp_flowtype);
 		}
 		if (istcp) {
 			if (cflag)
 				xo_emit(" {:stack/%-*.*s}",
 					
 					fnamelen, fnamelen, tp->xt_stack);
 			if (Cflag)
 				xo_emit(" {:cc/%-*.*s}"
 					" {:snd-cwnd/%10lu}"
 					" {:snd-ssthresh/%10lu}"
 					" {:t-maxseg/%5u} {:ecn/%3s}",
 					cnamelen, cnamelen, tp->xt_cc,
 					tp->t_snd_cwnd, tp->t_snd_ssthresh,
 					tp->t_maxseg,
 					(tp->t_state >= TCPS_ESTABLISHED ?
 					    (tp->xt_ecn > 0 ?
 						(tp->xt_ecn == 1 ?
 						    "ecn" : "ace")
 						: "off")
 					    : "n/a"));
 			if (Pflag)
 				xo_emit(" {:log-id/%s}",
 				    tp->xt_logid[0] == '\0' ?
 				    "-" : tp->xt_logid);
 		}
 		xo_emit("\n");
 		xo_close_instance("socket");
 	}
 	if (xig != oxig && xig->xig_gen != oxig->xig_gen) {
 		if (oxig->xig_count > xig->xig_count) {
 			xo_emit("Some {d:lost/%s} sockets may have been "
 			    "deleted.\n", name);
 		} else if (oxig->xig_count < xig->xig_count) {
 			xo_emit("Some {d:created/%s} sockets may have been "
 			    "created.\n", name);
 		} else {
 			xo_emit("Some {d:changed/%s} sockets may have been "
 			    "created or deleted.\n", name);
 		}
 	}
 	free(buf);
 }
 
 /*
  * Dump TCP statistics structure.
  */
 void
 tcp_stats(u_long off, const char *name, int af1 __unused, int proto __unused)
 {
 	struct tcpstat tcpstat;
 	uint64_t tcps_states[TCP_NSTATES];
 
 #ifdef INET6
 	if (tcp_done != 0)
 		return;
 	else
 		tcp_done = 1;
 #endif
 
 	if (fetch_stats("net.inet.tcp.stats", off, &tcpstat,
 	    sizeof(tcpstat), kread_counters) != 0)
 		return;
 
 	if (fetch_stats_ro("net.inet.tcp.states", nl[N_TCPS_STATES].n_value,
 	    &tcps_states, sizeof(tcps_states), kread_counters) != 0)
 		return;
 
 	xo_open_container("tcp");
 	xo_emit("{T:/%s}:\n", name);
 
 #define	p(f, m) if (tcpstat.f || sflag <= 1)				\
 	xo_emit(m, (uintmax_t )tcpstat.f, plural(tcpstat.f))
 #define	p1a(f, m) if (tcpstat.f || sflag <= 1)				\
 	xo_emit(m, (uintmax_t )tcpstat.f)
 #define	p2(f1, f2, m) if (tcpstat.f1 || tcpstat.f2 || sflag <= 1)	\
 	xo_emit(m, (uintmax_t )tcpstat.f1, plural(tcpstat.f1),		\
 	    (uintmax_t )tcpstat.f2, plural(tcpstat.f2))
 #define	p2a(f1, f2, m) if (tcpstat.f1 || tcpstat.f2 || sflag <= 1)	\
 	xo_emit(m, (uintmax_t )tcpstat.f1, plural(tcpstat.f1),		\
 	    (uintmax_t )tcpstat.f2)
 #define	p3(f, m) if (tcpstat.f || sflag <= 1)				\
 	xo_emit(m, (uintmax_t )tcpstat.f, pluralies(tcpstat.f))
 
 	p(tcps_sndtotal, "\t{:sent-packets/%ju} {N:/packet%s sent}\n");
 	p2(tcps_sndpack,tcps_sndbyte, "\t\t{:sent-data-packets/%ju} "
 	    "{N:/data packet%s} ({:sent-data-bytes/%ju} {N:/byte%s})\n");
 	p2(tcps_sndrexmitpack, tcps_sndrexmitbyte, "\t\t"
 	    "{:sent-retransmitted-packets/%ju} {N:/data packet%s} "
 	    "({:sent-retransmitted-bytes/%ju} {N:/byte%s}) "
 	    "{N:retransmitted}\n");
 	p(tcps_sndrexmitbad, "\t\t"
 	    "{:sent-unnecessary-retransmitted-packets/%ju} "
 	    "{N:/data packet%s unnecessarily retransmitted}\n");
 	p(tcps_mturesent, "\t\t{:sent-resends-by-mtu-discovery/%ju} "
 	    "{N:/resend%s initiated by MTU discovery}\n");
 	p2a(tcps_sndacks, tcps_delack, "\t\t{:sent-ack-only-packets/%ju} "
 	    "{N:/ack-only packet%s/} ({:sent-packets-delayed/%ju} "
 	    "{N:delayed})\n");
 	p(tcps_sndurg, "\t\t{:sent-urg-only-packets/%ju} "
 	    "{N:/URG only packet%s}\n");
 	p(tcps_sndprobe, "\t\t{:sent-window-probe-packets/%ju} "
 	    "{N:/window probe packet%s}\n");
 	p(tcps_sndwinup, "\t\t{:sent-window-update-packets/%ju} "
 	    "{N:/window update packet%s}\n");
 	p(tcps_sndctrl, "\t\t{:sent-control-packets/%ju} "
 	    "{N:/control packet%s}\n");
 	p(tcps_rcvtotal, "\t{:received-packets/%ju} "
 	    "{N:/packet%s received}\n");
 	p2(tcps_rcvackpack, tcps_rcvackbyte, "\t\t"
 	    "{:received-ack-packets/%ju} {N:/ack%s} "
 	    "{N:(for} {:received-ack-bytes/%ju} {N:/byte%s})\n");
 	p(tcps_rcvdupack, "\t\t{:received-duplicate-acks/%ju} "
 	    "{N:/duplicate ack%s}\n");
 	p(tcps_tunneled_pkts, "\t\t{:received-udp-tunneled-pkts/%ju} "
 	    "{N:/UDP tunneled pkt%s}\n");
 	p(tcps_tunneled_errs, "\t\t{:received-bad-udp-tunneled-pkts/%ju} "
 	    "{N:/UDP tunneled pkt cnt with error%s}\n");
 	p(tcps_rcvacktoomuch, "\t\t{:received-acks-for-unsent-data/%ju} "
 	    "{N:/ack%s for unsent data}\n");
 	p2(tcps_rcvpack, tcps_rcvbyte, "\t\t"
 	    "{:received-in-sequence-packets/%ju} {N:/packet%s} "
 	    "({:received-in-sequence-bytes/%ju} {N:/byte%s}) "
 	    "{N:received in-sequence}\n");
 	p2(tcps_rcvduppack, tcps_rcvdupbyte, "\t\t"
 	    "{:received-completely-duplicate-packets/%ju} "
 	    "{N:/completely duplicate packet%s} "
 	    "({:received-completely-duplicate-bytes/%ju} {N:/byte%s})\n");
 	p(tcps_pawsdrop, "\t\t{:received-old-duplicate-packets/%ju} "
 	    "{N:/old duplicate packet%s}\n");
 	p2(tcps_rcvpartduppack, tcps_rcvpartdupbyte, "\t\t"
 	    "{:received-some-duplicate-packets/%ju} "
 	    "{N:/packet%s with some dup. data} "
 	    "({:received-some-duplicate-bytes/%ju} {N:/byte%s duped/})\n");
 	p2(tcps_rcvoopack, tcps_rcvoobyte, "\t\t{:received-out-of-order/%ju} "
 	    "{N:/out-of-order packet%s} "
 	    "({:received-out-of-order-bytes/%ju} {N:/byte%s})\n");
 	p2(tcps_rcvpackafterwin, tcps_rcvbyteafterwin, "\t\t"
 	    "{:received-after-window-packets/%ju} {N:/packet%s} "
 	    "({:received-after-window-bytes/%ju} {N:/byte%s}) "
 	    "{N:of data after window}\n");
 	p(tcps_rcvwinprobe, "\t\t{:received-window-probes/%ju} "
 	    "{N:/window probe%s}\n");
 	p(tcps_rcvwinupd, "\t\t{:receive-window-update-packets/%ju} "
 	    "{N:/window update packet%s}\n");
 	p(tcps_dsack_count, "\t\t{:received-with-dsack-packets/%ju} "
 	    "{N:/packet%s received with dsack}\n");
 	p(tcps_dsack_bytes, "\t\t{:received-with-dsack-bytes/%ju} "
 	    "{N:/dsack byte%s received (no TLP involved)}\n");
 	p(tcps_dsack_tlp_bytes, "\t\t{:received-with-dsack-bytes-tlp/%ju} "
 	    "{N:/dsack byte%s received (TLP responsible)}\n");
 	p(tcps_rcvafterclose, "\t\t{:received-after-close-packets/%ju} "
 	    "{N:/packet%s received after close}\n");
 	p(tcps_rcvbadsum, "\t\t{:discard-bad-checksum/%ju} "
 	    "{N:/discarded for bad checksum%s}\n");
 	p(tcps_rcvbadoff, "\t\t{:discard-bad-header-offset/%ju} "
 	    "{N:/discarded for bad header offset field%s}\n");
 	p1a(tcps_rcvshort, "\t\t{:discard-too-short/%ju} "
 	    "{N:discarded because packet too short}\n");
 	p1a(tcps_rcvreassfull, "\t\t{:discard-reassembly-queue-full/%ju} "
 	    "{N:discarded due to full reassembly queue}\n");
 	p(tcps_connattempt, "\t{:connection-requests/%ju} "
 	    "{N:/connection request%s}\n");
 	p(tcps_accepts, "\t{:connections-accepts/%ju} "
 	    "{N:/connection accept%s}\n");
 	p(tcps_badsyn, "\t{:bad-connection-attempts/%ju} "
 	    "{N:/bad connection attempt%s}\n");
 	p(tcps_listendrop, "\t{:listen-queue-overflows/%ju} "
 	    "{N:/listen queue overflow%s}\n");
 	p(tcps_badrst, "\t{:ignored-in-window-resets/%ju} "
 	    "{N:/ignored RSTs in the window%s}\n");
 	p(tcps_connects, "\t{:connections-established/%ju} "
 	    "{N:/connection%s established (including accepts)}\n");
 	p(tcps_usedrtt, "\t\t{:connections-hostcache-rtt/%ju} "
 	    "{N:/time%s used RTT from hostcache}\n");
 	p(tcps_usedrttvar, "\t\t{:connections-hostcache-rttvar/%ju} "
 	    "{N:/time%s used RTT variance from hostcache}\n");
 	p(tcps_usedssthresh, "\t\t{:connections-hostcache-ssthresh/%ju} "
 	    "{N:/time%s used slow-start threshold from hostcache}\n");
 	p2(tcps_closed, tcps_drops, "\t{:connections-closed/%ju} "
 	    "{N:/connection%s closed (including} "
 	    "{:connection-drops/%ju} {N:/drop%s})\n");
 	p(tcps_cachedrtt, "\t\t{:connections-updated-rtt-on-close/%ju} "
 	    "{N:/connection%s updated cached RTT on close}\n");
 	p(tcps_cachedrttvar, "\t\t"
 	    "{:connections-updated-variance-on-close/%ju} "
 	    "{N:/connection%s updated cached RTT variance on close}\n");
 	p(tcps_cachedssthresh, "\t\t"
 	    "{:connections-updated-ssthresh-on-close/%ju} "
 	    "{N:/connection%s updated cached ssthresh on close}\n");
 	p(tcps_conndrops, "\t{:embryonic-connections-dropped/%ju} "
 	    "{N:/embryonic connection%s dropped}\n");
 	p2(tcps_rttupdated, tcps_segstimed, "\t{:segments-updated-rtt/%ju} "
 	    "{N:/segment%s updated rtt (of} "
 	    "{:segment-update-attempts/%ju} {N:/attempt%s})\n");
 	p(tcps_rexmttimeo, "\t{:retransmit-timeouts/%ju} "
 	    "{N:/retransmit timeout%s}\n");
 	p(tcps_timeoutdrop, "\t\t"
 	    "{:connections-dropped-by-retransmit-timeout/%ju} "
 	    "{N:/connection%s dropped by rexmit timeout}\n");
 	p(tcps_persisttimeo, "\t{:persist-timeout/%ju} "
 	    "{N:/persist timeout%s}\n");
 	p(tcps_persistdrop, "\t\t"
 	    "{:connections-dropped-by-persist-timeout/%ju} "
 	    "{N:/connection%s dropped by persist timeout}\n");
 	p(tcps_finwait2_drops, "\t"
 	    "{:connections-dropped-by-finwait2-timeout/%ju} "
 	    "{N:/Connection%s (fin_wait_2) dropped because of timeout}\n");
 	p(tcps_keeptimeo, "\t{:keepalive-timeout/%ju} "
 	    "{N:/keepalive timeout%s}\n");
 	p(tcps_keepprobe, "\t\t{:keepalive-probes/%ju} "
 	    "{N:/keepalive probe%s sent}\n");
 	p(tcps_keepdrops, "\t\t{:connections-dropped-by-keepalives/%ju} "
 	    "{N:/connection%s dropped by keepalive}\n");
 	p(tcps_predack, "\t{:ack-header-predictions/%ju} "
 	    "{N:/correct ACK header prediction%s}\n");
 	p(tcps_preddat, "\t{:data-packet-header-predictions/%ju} "
 	    "{N:/correct data packet header prediction%s}\n");
 
 	xo_open_container("syncache");
 
 	p3(tcps_sc_added, "\t{:entries-added/%ju} "
 	    "{N:/syncache entr%s added}\n");
 	p1a(tcps_sc_retransmitted, "\t\t{:retransmitted/%ju} "
 	    "{N:/retransmitted}\n");
 	p1a(tcps_sc_dupsyn, "\t\t{:duplicates/%ju} {N:/dupsyn}\n");
 	p1a(tcps_sc_dropped, "\t\t{:dropped/%ju} {N:/dropped}\n");
 	p1a(tcps_sc_completed, "\t\t{:completed/%ju} {N:/completed}\n");
 	p1a(tcps_sc_bucketoverflow, "\t\t{:bucket-overflow/%ju} "
 	    "{N:/bucket overflow}\n");
 	p1a(tcps_sc_cacheoverflow, "\t\t{:cache-overflow/%ju} "
 	    "{N:/cache overflow}\n");
 	p1a(tcps_sc_reset, "\t\t{:reset/%ju} {N:/reset}\n");
 	p1a(tcps_sc_stale, "\t\t{:stale/%ju} {N:/stale}\n");
 	p1a(tcps_sc_aborted, "\t\t{:aborted/%ju} {N:/aborted}\n");
 	p1a(tcps_sc_badack, "\t\t{:bad-ack/%ju} {N:/badack}\n");
 	p1a(tcps_sc_unreach, "\t\t{:unreachable/%ju} {N:/unreach}\n");
 	p(tcps_sc_zonefail, "\t\t{:zone-failures/%ju} {N:/zone failure%s}\n");
 	p(tcps_sc_sendcookie, "\t{:sent-cookies/%ju} {N:/cookie%s sent}\n");
 	p(tcps_sc_recvcookie, "\t{:receivd-cookies/%ju} "
 	    "{N:/cookie%s received}\n");
 
 	xo_close_container("syncache");
 
 	xo_open_container("hostcache");
 
 	p3(tcps_hc_added, "\t{:entries-added/%ju} "
 	    "{N:/hostcache entr%s added}\n");
 	p1a(tcps_hc_bucketoverflow, "\t\t{:buffer-overflows/%ju} "
 	    "{N:/bucket overflow}\n");
 
 	xo_close_container("hostcache");
 
 	xo_open_container("sack");
 
 	p(tcps_sack_recovery_episode, "\t{:recovery-episodes/%ju} "
 	    "{N:/SACK recovery episode%s}\n");
  	p(tcps_sack_rexmits, "\t{:segment-retransmits/%ju} "
 	    "{N:/segment rexmit%s in SACK recovery episodes}\n");
  	p(tcps_sack_rexmit_bytes, "\t{:byte-retransmits/%ju} "
 	    "{N:/byte rexmit%s in SACK recovery episodes}\n");
  	p(tcps_sack_rcv_blocks, "\t{:received-blocks/%ju} "
 	    "{N:/SACK option%s (SACK blocks) received}\n");
 	p(tcps_sack_send_blocks, "\t{:sent-option-blocks/%ju} "
 	    "{N:/SACK option%s (SACK blocks) sent}\n");
 	p(tcps_sack_lostrexmt, "\t{:lost-retransmissions/%ju} "
 	    "{N:/SACK retransmission%s lost}\n");
 	p1a(tcps_sack_sboverflow, "\t{:scoreboard-overflows/%ju} "
 	    "{N:/SACK scoreboard overflow}\n");
 
 	xo_close_container("sack");
 	xo_open_container("ecn");
 
 	p(tcps_ecn_ce, "\t{:ce-packets/%ju} "
 	    "{N:/packet%s with ECN CE bit set}\n");
 	p(tcps_ecn_ect0, "\t{:ect0-packets/%ju} "
 	    "{N:/packet%s with ECN ECT(0) bit set}\n");
 	p(tcps_ecn_ect1, "\t{:ect1-packets/%ju} "
 	    "{N:/packet%s with ECN ECT(1) bit set}\n");
 	p(tcps_ecn_shs, "\t{:handshakes/%ju} "
 	    "{N:/successful ECN handshake%s}\n");
 	p(tcps_ecn_rcwnd, "\t{:congestion-reductions/%ju} "
 	    "{N:/time%s ECN reduced the congestion window}\n");
 
 	p(tcps_ace_nect, "\t{:ace-nonect-syn/%ju} "
 	    "{N:/ACE SYN packet%s with Non-ECT}\n");
 	p(tcps_ace_ect0, "\t{:ace-ect0-syn/%ju} "
 	    "{N:/ACE SYN packet%s with ECT0}\n");
 	p(tcps_ace_ect1, "\t{:ace-ect1-syn/%ju} "
 	    "{N:/ACE SYN packet%s with ECT1}\n");
 	p(tcps_ace_ce, "\t{:ace-ce-syn/%ju} "
 	    "{N:/ACE SYN packet%s with CE}\n");
 
 	xo_close_container("ecn");
 	xo_open_container("tcp-signature");
 	p(tcps_sig_rcvgoodsig, "\t{:received-good-signature/%ju} "
 	    "{N:/packet%s with matching signature received}\n");
 	p(tcps_sig_rcvbadsig, "\t{:received-bad-signature/%ju} "
 	    "{N:/packet%s with bad signature received}\n");
 	p(tcps_sig_err_buildsig, "\t{:failed-make-signature/%ju} "
 	    "{N:/time%s failed to make signature due to no SA}\n");
 	p(tcps_sig_err_sigopt, "\t{:no-signature-expected/%ju} "
 	    "{N:/time%s unexpected signature received}\n");
 	p(tcps_sig_err_nosigopt, "\t{:no-signature-provided/%ju} "
 	    "{N:/time%s no signature provided by segment}\n");
 
 	xo_close_container("tcp-signature");
 	xo_open_container("pmtud");
 
 	p(tcps_pmtud_blackhole_activated, "\t{:pmtud-activated/%ju} "
 	    "{N:/Path MTU discovery black hole detection activation%s}\n");
 	p(tcps_pmtud_blackhole_activated_min_mss,
 	    "\t{:pmtud-activated-min-mss/%ju} "
 	    "{N:/Path MTU discovery black hole detection min MSS activation%s}\n");
 	p(tcps_pmtud_blackhole_failed, "\t{:pmtud-failed/%ju} "
 	    "{N:/Path MTU discovery black hole detection failure%s}\n");
 
 	xo_close_container("pmtud");
 	xo_open_container("tw");
 
 	p(tcps_tw_responds, "\t{:tw_responds/%ju} "
 	    "{N:/time%s connection in TIME-WAIT responded with ACK}\n");
 	p(tcps_tw_recycles, "\t{:tw_recycles/%ju} "
 	    "{N:/time%s connection in TIME-WAIT was actively recycled}\n");
 	p(tcps_tw_resets, "\t{:tw_resets/%ju} "
 	    "{N:/time%s connection in TIME-WAIT responded with RST}\n");
 
 	xo_close_container("tw");
  #undef p
  #undef p1a
  #undef p2
  #undef p2a
  #undef p3
 
 	xo_open_container("TCP connection count by state");
 	xo_emit("{T:/TCP connection count by state}:\n");
 	for (int i = 0; i < TCP_NSTATES; i++) {
 		/*
 		 * XXXGL: is there a way in libxo to use %s
 		 * in the "content string" of a format
 		 * string? I failed to do that, that's why
 		 * a temporary buffer is used to construct
 		 * format string for xo_emit().
 		 */
 		char fmtbuf[80];
 
 		if (sflag > 1 && tcps_states[i] == 0)
 			continue;
 		snprintf(fmtbuf, sizeof(fmtbuf), "\t{:%s/%%ju} "
                     "{Np:/connection ,connections} in %s state\n",
 		    tcpstates[i], tcpstates[i]);
 		xo_emit(fmtbuf, (uintmax_t )tcps_states[i]);
 	}
 	xo_close_container("TCP connection count by state");
 
 	xo_close_container("tcp");
 }
 
 /*
  * Dump UDP statistics structure.
  */
 void
 udp_stats(u_long off, const char *name, int af1 __unused, int proto __unused)
 {
 	struct udpstat udpstat;
 	uint64_t delivered;
 
 #ifdef INET6
 	if (udp_done != 0)
 		return;
 	else
 		udp_done = 1;
 #endif
 
 	if (fetch_stats("net.inet.udp.stats", off, &udpstat,
 	    sizeof(udpstat), kread_counters) != 0)
 		return;
 
 	xo_open_container("udp");
 	xo_emit("{T:/%s}:\n", name);
 
 #define	p(f, m) if (udpstat.f || sflag <= 1) \
 	xo_emit("\t" m, (uintmax_t)udpstat.f, plural(udpstat.f))
 #define	p1a(f, m) if (udpstat.f || sflag <= 1) \
 	xo_emit("\t" m, (uintmax_t)udpstat.f)
 
 	p(udps_ipackets, "{:received-datagrams/%ju} "
 	    "{N:/datagram%s received}\n");
 	p1a(udps_hdrops, "{:dropped-incomplete-headers/%ju} "
 	    "{N:/with incomplete header}\n");
 	p1a(udps_badlen, "{:dropped-bad-data-length/%ju} "
 	    "{N:/with bad data length field}\n");
 	p1a(udps_badsum, "{:dropped-bad-checksum/%ju} "
 	    "{N:/with bad checksum}\n");
 	p1a(udps_nosum, "{:dropped-no-checksum/%ju} "
 	    "{N:/with no checksum}\n");
 	p1a(udps_noport, "{:dropped-no-socket/%ju} "
 	    "{N:/dropped due to no socket}\n");
 	p(udps_noportbcast, "{:dropped-broadcast-multicast/%ju} "
 	    "{N:/broadcast\\/multicast datagram%s undelivered}\n");
 	p1a(udps_fullsock, "{:dropped-full-socket-buffer/%ju} "
 	    "{N:/dropped due to full socket buffers}\n");
 	p1a(udpps_pcbhashmiss, "{:not-for-hashed-pcb/%ju} "
 	    "{N:/not for hashed pcb}\n");
 	delivered = udpstat.udps_ipackets -
 		    udpstat.udps_hdrops -
 		    udpstat.udps_badlen -
 		    udpstat.udps_badsum -
 		    udpstat.udps_noport -
 		    udpstat.udps_noportbcast -
 		    udpstat.udps_fullsock;
 	if (delivered || sflag <= 1)
 		xo_emit("\t{:delivered-packets/%ju} {N:/delivered}\n",
 		    (uint64_t)delivered);
 	p(udps_opackets, "{:output-packets/%ju} {N:/datagram%s output}\n");
 	/* the next statistic is cumulative in udps_noportbcast */
 	p(udps_filtermcast, "{:multicast-source-filter-matches/%ju} "
 	    "{N:/time%s multicast source filter matched}\n");
 #undef p
 #undef p1a
 	xo_close_container("udp");
 }
 
 /*
  * Dump CARP statistics structure.
  */
 void
 carp_stats(u_long off, const char *name, int af1 __unused, int proto __unused)
 {
 	struct carpstats carpstat;
 
 	if (fetch_stats("net.inet.carp.stats", off, &carpstat,
 	    sizeof(carpstat), kread_counters) != 0)
 		return;
 
 	xo_open_container(name);
 	xo_emit("{T:/%s}:\n", name);
 
 #define	p(f, m) if (carpstat.f || sflag <= 1) \
 	xo_emit(m, (uintmax_t)carpstat.f, plural(carpstat.f))
 #define	p2(f, m) if (carpstat.f || sflag <= 1) \
 	xo_emit(m, (uintmax_t)carpstat.f)
 
 	p(carps_ipackets, "\t{:received-inet-packets/%ju} "
 	    "{N:/packet%s received (IPv4)}\n");
 	p(carps_ipackets6, "\t{:received-inet6-packets/%ju} "
 	    "{N:/packet%s received (IPv6)}\n");
 	p(carps_badttl, "\t\t{:dropped-wrong-ttl/%ju} "
 	    "{N:/packet%s discarded for wrong TTL}\n");
 	p(carps_hdrops, "\t\t{:dropped-short-header/%ju} "
 	    "{N:/packet%s shorter than header}\n");
 	p(carps_badsum, "\t\t{:dropped-bad-checksum/%ju} "
 	    "{N:/discarded for bad checksum%s}\n");
 	p(carps_badver,	"\t\t{:dropped-bad-version/%ju} "
 	    "{N:/discarded packet%s with a bad version}\n");
 	p2(carps_badlen, "\t\t{:dropped-short-packet/%ju} "
 	    "{N:/discarded because packet too short}\n");
 	p2(carps_badauth, "\t\t{:dropped-bad-authentication/%ju} "
 	    "{N:/discarded for bad authentication}\n");
 	p2(carps_badvhid, "\t\t{:dropped-bad-vhid/%ju} "
 	    "{N:/discarded for bad vhid}\n");
 	p2(carps_badaddrs, "\t\t{:dropped-bad-address-list/%ju} "
 	    "{N:/discarded because of a bad address list}\n");
 	p(carps_opackets, "\t{:sent-inet-packets/%ju} "
 	    "{N:/packet%s sent (IPv4)}\n");
 	p(carps_opackets6, "\t{:sent-inet6-packets/%ju} "
 	    "{N:/packet%s sent (IPv6)}\n");
 	p2(carps_onomem, "\t\t{:send-failed-memory-error/%ju} "
 	    "{N:/send failed due to mbuf memory error}\n");
 #if notyet
 	p(carps_ostates, "\t\t{:send-state-updates/%s} "
 	    "{N:/state update%s sent}\n");
 #endif
 #undef p
 #undef p2
 	xo_close_container(name);
 }
 
 /*
  * Dump IP statistics structure.
  */
 void
 ip_stats(u_long off, const char *name, int af1 __unused, int proto __unused)
 {
 	struct ipstat ipstat;
 
 	if (fetch_stats("net.inet.ip.stats", off, &ipstat,
 	    sizeof(ipstat), kread_counters) != 0)
 		return;
 
 	xo_open_container(name);
 	xo_emit("{T:/%s}:\n", name);
 
 #define	p(f, m) if (ipstat.f || sflag <= 1) \
 	xo_emit(m, (uintmax_t )ipstat.f, plural(ipstat.f))
 #define	p1a(f, m) if (ipstat.f || sflag <= 1) \
 	xo_emit(m, (uintmax_t )ipstat.f)
 
 	p(ips_total, "\t{:received-packets/%ju} "
 	    "{N:/total packet%s received}\n");
 	p(ips_badsum, "\t{:dropped-bad-checksum/%ju} "
 	    "{N:/bad header checksum%s}\n");
 	p1a(ips_toosmall, "\t{:dropped-below-minimum-size/%ju} "
 	    "{N:/with size smaller than minimum}\n");
 	p1a(ips_tooshort, "\t{:dropped-short-packets/%ju} "
 	    "{N:/with data size < data length}\n");
 	p1a(ips_toolong, "\t{:dropped-too-long/%ju} "
 	    "{N:/with ip length > max ip packet size}\n");
 	p1a(ips_badhlen, "\t{:dropped-short-header-length/%ju} "
 	    "{N:/with header length < data size}\n");
 	p1a(ips_badlen, "\t{:dropped-short-data/%ju} "
 	    "{N:/with data length < header length}\n");
 	p1a(ips_badoptions, "\t{:dropped-bad-options/%ju} "
 	    "{N:/with bad options}\n");
 	p1a(ips_badvers, "\t{:dropped-bad-version/%ju} "
 	    "{N:/with incorrect version number}\n");
 	p(ips_fragments, "\t{:received-fragments/%ju} "
 	    "{N:/fragment%s received}\n");
 	p(ips_fragdropped, "\t{:dropped-fragments/%ju} "
 	    "{N:/fragment%s dropped (dup or out of space)}\n");
 	p(ips_fragtimeout, "\t{:dropped-fragments-after-timeout/%ju} "
 	    "{N:/fragment%s dropped after timeout}\n");
 	p(ips_reassembled, "\t{:reassembled-packets/%ju} "
 	    "{N:/packet%s reassembled ok}\n");
 	p(ips_delivered, "\t{:received-local-packets/%ju} "
 	    "{N:/packet%s for this host}\n");
 	p(ips_noproto, "\t{:dropped-unknown-protocol/%ju} "
 	    "{N:/packet%s for unknown\\/unsupported protocol}\n");
 	p(ips_forward, "\t{:forwarded-packets/%ju} "
 	    "{N:/packet%s forwarded}");
 	p(ips_fastforward, " ({:fast-forwarded-packets/%ju} "
 	    "{N:/packet%s fast forwarded})");
 	if (ipstat.ips_forward || sflag <= 1)
 		xo_emit("\n");
 	p(ips_cantforward, "\t{:packets-cannot-forward/%ju} "
 	    "{N:/packet%s not forwardable}\n");
 	p(ips_notmember, "\t{:received-unknown-multicast-group/%ju} "
 	    "{N:/packet%s received for unknown multicast group}\n");
 	p(ips_redirectsent, "\t{:redirects-sent/%ju} "
 	    "{N:/redirect%s sent}\n");
 	p(ips_localout, "\t{:sent-packets/%ju} "
 	    "{N:/packet%s sent from this host}\n");
 	p(ips_rawout, "\t{:send-packets-fabricated-header/%ju} "
 	    "{N:/packet%s sent with fabricated ip header}\n");
 	p(ips_odropped, "\t{:discard-no-mbufs/%ju} "
 	    "{N:/output packet%s dropped due to no bufs, etc.}\n");
 	p(ips_noroute, "\t{:discard-no-route/%ju} "
 	    "{N:/output packet%s discarded due to no route}\n");
 	p(ips_fragmented, "\t{:sent-fragments/%ju} "
 	    "{N:/output datagram%s fragmented}\n");
 	p(ips_ofragments, "\t{:fragments-created/%ju} "
 	    "{N:/fragment%s created}\n");
 	p(ips_cantfrag, "\t{:discard-cannot-fragment/%ju} "
 	    "{N:/datagram%s that can't be fragmented}\n");
 	p(ips_nogif, "\t{:discard-tunnel-no-gif/%ju} "
 	    "{N:/tunneling packet%s that can't find gif}\n");
 	p(ips_badaddr, "\t{:discard-bad-address/%ju} "
 	    "{N:/datagram%s with bad address in header}\n");
 #undef p
 #undef p1a
 	xo_close_container(name);
 }
 
 /*
  * Dump ARP statistics structure.
  */
 void
 arp_stats(u_long off, const char *name, int af1 __unused, int proto __unused)
 {
 	struct arpstat arpstat;
 
 	if (fetch_stats("net.link.ether.arp.stats", off, &arpstat,
 	    sizeof(arpstat), kread_counters) != 0)
 		return;
 
 	xo_open_container(name);
 	xo_emit("{T:/%s}:\n", name);
 
 #define	p(f, m) if (arpstat.f || sflag <= 1) \
 	xo_emit("\t" m, (uintmax_t)arpstat.f, plural(arpstat.f))
 #define	p2(f, m) if (arpstat.f || sflag <= 1) \
 	xo_emit("\t" m, (uintmax_t)arpstat.f, pluralies(arpstat.f))
 
 	p(txrequests, "{:sent-requests/%ju} {N:/ARP request%s sent}\n");
 	p(txerrors, "{:sent-failures/%ju} {N:/ARP request%s failed to sent}\n");
 	p2(txreplies, "{:sent-replies/%ju} {N:/ARP repl%s sent}\n");
 	p(rxrequests, "{:received-requests/%ju} "
 	    "{N:/ARP request%s received}\n");
 	p2(rxreplies, "{:received-replies/%ju} "
 	    "{N:/ARP repl%s received}\n");
 	p(received, "{:received-packets/%ju} "
 	    "{N:/ARP packet%s received}\n");
 	p(dropped, "{:dropped-no-entry/%ju} "
 	    "{N:/total packet%s dropped due to no ARP entry}\n");
 	p(timeouts, "{:entries-timeout/%ju} "
 	    "{N:/ARP entry%s timed out}\n");
 	p(dupips, "{:dropped-duplicate-address/%ju} "
 	    "{N:/Duplicate IP%s seen}\n");
 #undef p
 #undef p2
 	xo_close_container(name);
 }
 
 
 
 static	const char *icmpnames[ICMP_MAXTYPE + 1] = {
 	"echo reply",			/* RFC 792 */
 	"#1",
 	"#2",
 	"destination unreachable",	/* RFC 792 */
 	"source quench",		/* RFC 792 */
 	"routing redirect",		/* RFC 792 */
 	"#6",
 	"#7",
 	"echo",				/* RFC 792 */
 	"router advertisement",		/* RFC 1256 */
 	"router solicitation",		/* RFC 1256 */
 	"time exceeded",		/* RFC 792 */
 	"parameter problem",		/* RFC 792 */
 	"time stamp",			/* RFC 792 */
 	"time stamp reply",		/* RFC 792 */
 	"information request",		/* RFC 792 */
 	"information request reply",	/* RFC 792 */
 	"address mask request",		/* RFC 950 */
 	"address mask reply",		/* RFC 950 */
 	"#19",
 	"#20",
 	"#21",
 	"#22",
 	"#23",
 	"#24",
 	"#25",
 	"#26",
 	"#27",
 	"#28",
 	"#29",
 	"icmp traceroute",		/* RFC 1393 */
 	"datagram conversion error",	/* RFC 1475 */
 	"mobile host redirect",
 	"IPv6 where-are-you",
 	"IPv6 i-am-here",
 	"mobile registration req",
 	"mobile registration reply",
 	"domain name request",		/* RFC 1788 */
 	"domain name reply",		/* RFC 1788 */
 	"icmp SKIP",
 	"icmp photuris",		/* RFC 2521 */
 };
 
 /*
  * Dump ICMP statistics.
  */
 void
 icmp_stats(u_long off, const char *name, int af1 __unused, int proto __unused)
 {
 	struct icmpstat icmpstat;
 	size_t len;
 	int i, first;
 
 	if (fetch_stats("net.inet.icmp.stats", off, &icmpstat,
 	    sizeof(icmpstat), kread_counters) != 0)
 		return;
 
 	xo_open_container(name);
 	xo_emit("{T:/%s}:\n", name);
 
 #define	p(f, m) if (icmpstat.f || sflag <= 1) \
 	xo_emit(m, icmpstat.f, plural(icmpstat.f))
 #define	p1a(f, m) if (icmpstat.f || sflag <= 1) \
 	xo_emit(m, icmpstat.f)
 #define	p2(f, m) if (icmpstat.f || sflag <= 1) \
 	xo_emit(m, icmpstat.f, plurales(icmpstat.f))
 
 	p(icps_error, "\t{:icmp-calls/%lu} "
 	    "{N:/call%s to icmp_error}\n");
 	p(icps_oldicmp, "\t{:errors-not-from-message/%lu} "
 	    "{N:/error%s not generated in response to an icmp message}\n");
 
 	for (first = 1, i = 0; i < ICMP_MAXTYPE + 1; i++) {
 		if (icmpstat.icps_outhist[i] != 0) {
 			if (first) {
 				xo_open_list("output-histogram");
 				xo_emit("\tOutput histogram:\n");
 				first = 0;
 			}
 			xo_open_instance("output-histogram");
 			if (icmpnames[i] != NULL)
 				xo_emit("\t\t{k:name/%s}: {:count/%lu}\n",
 				    icmpnames[i], icmpstat.icps_outhist[i]);
 			else
 				xo_emit("\t\tunknown ICMP #{k:name/%d}: "
 				    "{:count/%lu}\n",
 				    i, icmpstat.icps_outhist[i]);
 			xo_close_instance("output-histogram");
 		}
 	}
 	if (!first)
 		xo_close_list("output-histogram");
 
 	p(icps_badcode, "\t{:dropped-bad-code/%lu} "
 	    "{N:/message%s with bad code fields}\n");
 	p(icps_tooshort, "\t{:dropped-too-short/%lu} "
 	    "{N:/message%s less than the minimum length}\n");
 	p(icps_checksum, "\t{:dropped-bad-checksum/%lu} "
 	    "{N:/message%s with bad checksum}\n");
 	p(icps_badlen, "\t{:dropped-bad-length/%lu} "
 	    "{N:/message%s with bad length}\n");
 	p1a(icps_bmcastecho, "\t{:dropped-multicast-echo/%lu} "
 	    "{N:/multicast echo requests ignored}\n");
 	p1a(icps_bmcasttstamp, "\t{:dropped-multicast-timestamp/%lu} "
 	    "{N:/multicast timestamp requests ignored}\n");
 
 	for (first = 1, i = 0; i < ICMP_MAXTYPE + 1; i++) {
 		if (icmpstat.icps_inhist[i] != 0) {
 			if (first) {
 				xo_open_list("input-histogram");
 				xo_emit("\tInput histogram:\n");
 				first = 0;
 			}
 			xo_open_instance("input-histogram");
 			if (icmpnames[i] != NULL)
 				xo_emit("\t\t{k:name/%s}: {:count/%lu}\n",
 					icmpnames[i],
 					icmpstat.icps_inhist[i]);
 			else
 				xo_emit(
 			"\t\tunknown ICMP #{k:name/%d}: {:count/%lu}\n",
 					i, icmpstat.icps_inhist[i]);
 			xo_close_instance("input-histogram");
 		}
 	}
 	if (!first)
 		xo_close_list("input-histogram");
 
 	p(icps_reflect, "\t{:sent-packets/%lu} "
 	    "{N:/message response%s generated}\n");
 	p2(icps_badaddr, "\t{:discard-invalid-return-address/%lu} "
 	    "{N:/invalid return address%s}\n");
 	p(icps_noroute, "\t{:discard-no-route/%lu} "
 	    "{N:/no return route%s}\n");
 #undef p
 #undef p1a
 #undef p2
 	if (live) {
 		len = sizeof i;
 		if (sysctlbyname("net.inet.icmp.maskrepl", &i, &len, NULL, 0) <
 		    0)
 			return;
 		xo_emit("\tICMP address mask responses are "
 		    "{q:icmp-address-responses/%sabled}\n", i ? "en" : "dis");
 	}
 
 	xo_close_container(name);
 }
 
 /*
  * Dump IGMP statistics structure.
  */
 void
 igmp_stats(u_long off, const char *name, int af1 __unused, int proto __unused)
 {
 	struct igmpstat igmpstat;
 	int error, zflag0;
 
 	if (fetch_stats("net.inet.igmp.stats", 0, &igmpstat,
 	    sizeof(igmpstat), kread) != 0)
 		return;
 	/*
 	 * Reread net.inet.igmp.stats when zflag == 1.
 	 * This is because this MIB contains version number and
 	 * length of the structure which are not set when clearing
 	 * the counters.
 	 */
 	zflag0 = zflag;
 	if (zflag) {
 		zflag = 0;
 		error = fetch_stats("net.inet.igmp.stats", 0, &igmpstat,
 		    sizeof(igmpstat), kread);
 		zflag = zflag0;
 		if (error)
 			return;
 	}
 
 	if (igmpstat.igps_version != IGPS_VERSION_3) {
 		xo_warnx("%s: version mismatch (%d != %d)", __func__,
 		    igmpstat.igps_version, IGPS_VERSION_3);
 		return;
 	}
 	if (igmpstat.igps_len != IGPS_VERSION3_LEN) {
 		xo_warnx("%s: size mismatch (%d != %d)", __func__,
 		    igmpstat.igps_len, IGPS_VERSION3_LEN);
 		return;
 	}
 
 	xo_open_container(name);
 	xo_emit("{T:/%s}:\n", name);
 
 #define	p64(f, m) if (igmpstat.f || sflag <= 1) \
 	xo_emit(m, (uintmax_t) igmpstat.f, plural(igmpstat.f))
 #define	py64(f, m) if (igmpstat.f || sflag <= 1) \
 	xo_emit(m, (uintmax_t) igmpstat.f, pluralies(igmpstat.f))
 
 	p64(igps_rcv_total, "\t{:received-messages/%ju} "
 	    "{N:/message%s received}\n");
 	p64(igps_rcv_tooshort, "\t{:dropped-too-short/%ju} "
 	    "{N:/message%s received with too few bytes}\n");
 	p64(igps_rcv_badttl, "\t{:dropped-wrong-ttl/%ju} "
 	    "{N:/message%s received with wrong TTL}\n");
 	p64(igps_rcv_badsum, "\t{:dropped-bad-checksum/%ju} "
 	    "{N:/message%s received with bad checksum}\n");
 	py64(igps_rcv_v1v2_queries, "\t{:received-membership-queries/%ju} "
 	    "{N:/V1\\/V2 membership quer%s received}\n");
 	py64(igps_rcv_v3_queries, "\t{:received-v3-membership-queries/%ju} "
 	    "{N:/V3 membership quer%s received}\n");
 	py64(igps_rcv_badqueries, "\t{:dropped-membership-queries/%ju} "
 	    "{N:/membership quer%s received with invalid field(s)}\n");
 	py64(igps_rcv_gen_queries, "\t{:received-general-queries/%ju} "
 	    "{N:/general quer%s received}\n");
 	py64(igps_rcv_group_queries, "\t{:received-group-queries/%ju} "
 	    "{N:/group quer%s received}\n");
 	py64(igps_rcv_gsr_queries, "\t{:received-group-source-queries/%ju} "
 	    "{N:/group-source quer%s received}\n");
 	py64(igps_drop_gsr_queries, "\t{:dropped-group-source-queries/%ju} "
 	    "{N:/group-source quer%s dropped}\n");
 	p64(igps_rcv_reports, "\t{:received-membership-requests/%ju} "
 	    "{N:/membership report%s received}\n");
 	p64(igps_rcv_badreports, "\t{:dropped-membership-reports/%ju} "
 	    "{N:/membership report%s received with invalid field(s)}\n");
 	p64(igps_rcv_ourreports, "\t"
 	    "{:received-membership-reports-matching/%ju} "
 	    "{N:/membership report%s received for groups to which we belong}"
 	    "\n");
 	p64(igps_rcv_nora, "\t{:received-v3-reports-no-router-alert/%ju} "
 	    "{N:/V3 report%s received without Router Alert}\n");
 	p64(igps_snd_reports, "\t{:sent-membership-reports/%ju} "
 	    "{N:/membership report%s sent}\n");
 #undef p64
 #undef py64
 	xo_close_container(name);
 }
 
 /*
  * Dump PIM statistics structure.
  */
 void
 pim_stats(u_long off __unused, const char *name, int af1 __unused,
     int proto __unused)
 {
 	struct pimstat pimstat;
 
 	if (fetch_stats("net.inet.pim.stats", off, &pimstat,
 	    sizeof(pimstat), kread_counters) != 0)
 		return;
 
 	xo_open_container(name);
 	xo_emit("{T:/%s}:\n", name);
 
 #define	p(f, m) if (pimstat.f || sflag <= 1) \
 	xo_emit(m, (uintmax_t)pimstat.f, plural(pimstat.f))
 #define	py(f, m) if (pimstat.f || sflag <= 1) \
 	xo_emit(m, (uintmax_t)pimstat.f, pimstat.f != 1 ? "ies" : "y")
 
 	p(pims_rcv_total_msgs, "\t{:received-messages/%ju} "
 	    "{N:/message%s received}\n");
 	p(pims_rcv_total_bytes, "\t{:received-bytes/%ju} "
 	    "{N:/byte%s received}\n");
 	p(pims_rcv_tooshort, "\t{:dropped-too-short/%ju} "
 	    "{N:/message%s received with too few bytes}\n");
 	p(pims_rcv_badsum, "\t{:dropped-bad-checksum/%ju} "
 	    "{N:/message%s received with bad checksum}\n");
 	p(pims_rcv_badversion, "\t{:dropped-bad-version/%ju} "
 	    "{N:/message%s received with bad version}\n");
 	p(pims_rcv_registers_msgs, "\t{:received-data-register-messages/%ju} "
 	    "{N:/data register message%s received}\n");
 	p(pims_rcv_registers_bytes, "\t{:received-data-register-bytes/%ju} "
 	    "{N:/data register byte%s received}\n");
 	p(pims_rcv_registers_wrongiif, "\t"
 	    "{:received-data-register-wrong-interface/%ju} "
 	    "{N:/data register message%s received on wrong iif}\n");
 	p(pims_rcv_badregisters, "\t{:received-bad-registers/%ju} "
 	    "{N:/bad register%s received}\n");
 	p(pims_snd_registers_msgs, "\t{:sent-data-register-messages/%ju} "
 	    "{N:/data register message%s sent}\n");
 	p(pims_snd_registers_bytes, "\t{:sent-data-register-bytes/%ju} "
 	    "{N:/data register byte%s sent}\n");
 #undef p
 #undef py
 	xo_close_container(name);
 }
 
 #ifdef INET
 /*
  * Pretty print an Internet address (net address + port).
  */
 static void
 inetprint(const char *container, struct in_addr *in, int port,
     const char *proto, int num_port, const int af1)
 {
 	struct servent *sp = 0;
 	char line[80], *cp;
 	int width;
 	size_t alen, plen;
 
 	if (container)
 		xo_open_container(container);
 
 	if (Wflag)
 	    snprintf(line, sizeof(line), "%s.", inetname(in));
 	else
 	    snprintf(line, sizeof(line), "%.*s.",
 		(Aflag && !num_port) ? 12 : 16, inetname(in));
 	alen = strlen(line);
 	cp = line + alen;
 	if (!num_port && port)
 		sp = getservbyport((int)port, proto);
 	if (sp || port == 0)
 		snprintf(cp, sizeof(line) - alen,
 		    "%.15s ", sp ? sp->s_name : "*");
 	else
 		snprintf(cp, sizeof(line) - alen,
 		    "%d ", ntohs((u_short)port));
 	width = (Aflag && !Wflag) ? 18 :
 		((!Wflag || af1 == AF_INET) ? 22 : 45);
 	if (Wflag)
 		xo_emit("{d:target/%-*s} ", width, line);
 	else
 		xo_emit("{d:target/%-*.*s} ", width, width, line);
 
 	plen = strlen(cp) - 1;
 	alen--;
 	xo_emit("{e:address/%*.*s}{e:port/%*.*s}", alen, alen, line, plen,
 	    plen, cp);
 
 	if (container)
 		xo_close_container(container);
 }
 
 /*
  * Construct an Internet address representation.
  * If numeric_addr has been supplied, give
  * numeric value, otherwise try for symbolic name.
  */
 char *
 inetname(struct in_addr *inp)
 {
 	char *cp;
 	static char line[MAXHOSTNAMELEN];
 	struct hostent *hp;
 
 	cp = 0;
 	if (!numeric_addr && inp->s_addr != INADDR_ANY) {
 		hp = gethostbyaddr((char *)inp, sizeof (*inp), AF_INET);
 		if (hp) {
 			cp = hp->h_name;
 			trimdomain(cp, strlen(cp));
 		}
 	}
 	if (inp->s_addr == INADDR_ANY)
 		strcpy(line, "*");
 	else if (cp) {
 		strlcpy(line, cp, sizeof(line));
 	} else {
 		inp->s_addr = ntohl(inp->s_addr);
 #define	C(x)	((u_int)((x) & 0xff))
 		snprintf(line, sizeof(line), "%u.%u.%u.%u",
 		    C(inp->s_addr >> 24), C(inp->s_addr >> 16),
 		    C(inp->s_addr >> 8), C(inp->s_addr));
 	}
 	return (line);
 }
 #endif
diff --git a/usr.bin/netstat/main.c b/usr.bin/netstat/main.c
index 1a011b9d5488..d1b069f38f0c 100644
--- a/usr.bin/netstat/main.c
+++ b/usr.bin/netstat/main.c
@@ -1,913 +1,913 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1983, 1988, 1993
  *	Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #ifndef lint
 static char const copyright[] =
 "@(#) Copyright (c) 1983, 1988, 1993\n\
 	Regents of the University of California.  All rights reserved.\n";
 #endif /* not lint */
 
 #if 0
 #ifndef lint
 static char sccsid[] = "@(#)main.c	8.4 (Berkeley) 3/1/94";
 #endif /* not lint */
 #endif
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/file.h>
 #include <sys/protosw.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sysctl.h>
 
 #include <netinet/in.h>
 
 #ifdef NETGRAPH
 #include <netgraph/ng_socket.h>
 #endif
 
 #include <ctype.h>
 #include <err.h>
 #include <errno.h>
 #include <kvm.h>
 #include <limits.h>
 #include <netdb.h>
 #include <nlist.h>
 #include <paths.h>
 #include <stdint.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <stdbool.h>
 #include <string.h>
 #include <unistd.h>
 #include "netstat.h"
 #include "nl_defs.h"
 #include <libxo/xo.h>
 
 static struct protox {
 	int	pr_index;		/* index into nlist of cb head */
 	int	pr_sindex;		/* index into nlist of stat block */
 	u_char	pr_wanted;		/* 1 if wanted, 0 otherwise */
 	void	(*pr_cblocks)(u_long, const char *, int, int);
 					/* control blocks printing routine */
 	void	(*pr_stats)(u_long, const char *, int, int);
 					/* statistics printing routine */
 	void	(*pr_istats)(char *);	/* per/if statistics printing routine */
 	const char	*pr_name;		/* well-known name */
 	int	pr_usesysctl;		/* non-zero if we use sysctl, not kvm */
 	int	pr_protocol;
 } protox[] = {
 	{ N_TCBINFO,	N_TCPSTAT,	1,	protopr,
 	  tcp_stats,	NULL,		"tcp",	1,	IPPROTO_TCP },
 	{ N_UDBINFO,	N_UDPSTAT,	1,	protopr,
 	  udp_stats,	NULL,		"udp",	1,	IPPROTO_UDP },
 #ifdef SCTP
 	{ -1,		N_SCTPSTAT,	1,	sctp_protopr,
 	  sctp_stats,	NULL,		"sctp",	1,	IPPROTO_SCTP },
 #endif
 #ifdef SDP
 	{ -1,		-1,		1,	protopr,
 	 NULL,		NULL,		"sdp",	1,	IPPROTO_TCP },
 #endif
 	{ N_DIVCBINFO,	-1,		1,	protopr,
-	  NULL,		NULL,		"divert", 1,	IPPROTO_DIVERT },
+	  NULL,		NULL,		"divert", 1,	0 },
 	{ N_RIPCBINFO,	N_IPSTAT,	1,	protopr,
 	  ip_stats,	NULL,		"ip",	1,	IPPROTO_RAW },
 	{ N_RIPCBINFO,	N_ICMPSTAT,	1,	protopr,
 	  icmp_stats,	NULL,		"icmp",	1,	IPPROTO_ICMP },
 	{ N_RIPCBINFO,	N_IGMPSTAT,	1,	protopr,
 	  igmp_stats,	NULL,		"igmp",	1,	IPPROTO_IGMP },
 #ifdef IPSEC
 	{ -1,		N_IPSEC4STAT,	1,	NULL,	/* keep as compat */
 	  ipsec_stats,	NULL,		"ipsec", 1,	0},
 	{ -1,		N_AHSTAT,	1,	NULL,
 	  ah_stats,	NULL,		"ah",	1,	0},
 	{ -1,		N_ESPSTAT,	1,	NULL,
 	  esp_stats,	NULL,		"esp",	1,	0},
 	{ -1,		N_IPCOMPSTAT,	1,	NULL,
 	  ipcomp_stats,	NULL,		"ipcomp", 1,	0},
 #endif
 	{ N_RIPCBINFO,	N_PIMSTAT,	1,	protopr,
 	  pim_stats,	NULL,		"pim",	1,	IPPROTO_PIM },
 	{ -1,		N_CARPSTATS,	1,	NULL,
 	  carp_stats,	NULL,		"carp",	1,	0 },
 #ifdef PF
 	{ -1,		N_PFSYNCSTATS,	1,	NULL,
 	  pfsync_stats,	NULL,		"pfsync", 1,	0 },
 #endif
 	{ -1,		N_ARPSTAT,	1,	NULL,
 	  arp_stats,	NULL,		"arp", 1,	0 },
 	{ -1,		-1,		0,	NULL,
 	  NULL,		NULL,		NULL,	0,	0 }
 };
 
 #ifdef INET6
 static struct protox ip6protox[] = {
 	{ N_TCBINFO,	N_TCPSTAT,	1,	protopr,
 	  tcp_stats,	NULL,		"tcp",	1,	IPPROTO_TCP },
 	{ N_UDBINFO,	N_UDPSTAT,	1,	protopr,
 	  udp_stats,	NULL,		"udp",	1,	IPPROTO_UDP },
 	{ N_RIPCBINFO,	N_IP6STAT,	1,	protopr,
 	  ip6_stats,	ip6_ifstats,	"ip6",	1,	IPPROTO_RAW },
 	{ N_RIPCBINFO,	N_ICMP6STAT,	1,	protopr,
 	  icmp6_stats,	icmp6_ifstats,	"icmp6", 1,	IPPROTO_ICMPV6 },
 #ifdef SDP
 	{ -1,		-1,		1,	protopr,
 	 NULL,		NULL,		"sdp",	1,	IPPROTO_TCP },
 #endif
 #ifdef IPSEC
 	{ -1,		N_IPSEC6STAT,	1,	NULL,
 	  ipsec_stats,	NULL,		"ipsec6", 1,	0 },
 #endif
 #ifdef notyet
 	{ -1,		N_PIM6STAT,	1,	NULL,
 	  pim6_stats,	NULL,		"pim6",	1,	0 },
 #endif
 	{ -1,		N_RIP6STAT,	1,	NULL,
 	  rip6_stats,	NULL,		"rip6",	1,	0 },
 	{ -1,		-1,		0,	NULL,
 	  NULL,		NULL,		NULL,	0,	0 }
 };
 #endif /*INET6*/
 
 #ifdef IPSEC
 static struct protox pfkeyprotox[] = {
 	{ -1,		N_PFKEYSTAT,	1,	NULL,
 	  pfkey_stats,	NULL,		"pfkey", 0,	0 },
 	{ -1,		-1,		0,	NULL,
 	  NULL,		NULL,		NULL,	0,	0 }
 };
 #endif
 
 #ifdef NETGRAPH
 static struct protox netgraphprotox[] = {
 	{ N_NGSOCKLIST,	-1,		1,	netgraphprotopr,
 	  NULL,		NULL,		"ctrl",	0,	0 },
 	{ N_NGSOCKLIST,	-1,		1,	netgraphprotopr,
 	  NULL,		NULL,		"data",	0,	0 },
 	{ -1,		-1,		0,	NULL,
 	  NULL,		NULL,		NULL,	0,	0 }
 };
 #endif
 
 static struct protox *protoprotox[] = {
 					 protox,
 #ifdef INET6
 					 ip6protox,
 #endif
 #ifdef IPSEC
 					 pfkeyprotox,
 #endif
 					 NULL };
 
 static void printproto(struct protox *, const char *, bool *);
 static void usage(void);
 static struct protox *name2protox(const char *);
 static struct protox *knownname(const char *);
 
 static int kresolve_list(struct nlist *_nl);
 
 static kvm_t *kvmd;
 static char *nlistf = NULL, *memf = NULL;
 
 int	Aflag;		/* show addresses of protocol control block */
 int	aflag;		/* show all sockets (including servers) */
 static int	Bflag;		/* show information about bpf consumers */
 int	bflag;		/* show i/f total bytes in/out */
 int	cflag;		/* show TCP congestion control stack */
 int	Cflag;		/* show congestion control algo and vars */
 int	dflag;		/* show i/f dropped packets */
 int	gflag;		/* show group (multicast) routing or stats */
 int	hflag;		/* show counters in human readable format */
 int	iflag;		/* show interfaces */
 int	Lflag;		/* show size of listen queues */
 int	mflag;		/* show memory stats */
 int	noutputs = 0;	/* how much outputs before we exit */
 int	numeric_addr;	/* show addresses numerically */
 int	numeric_port;	/* show ports numerically */
 int	Oflag;		/* show nhgrp objects*/
 int	oflag;		/* show nexthop objects*/
 int	Pflag;		/* show TCP log ID */
 static int pflag;	/* show given protocol */
 static int	Qflag;		/* show netisr information */
 int	rflag;		/* show routing tables (or routing stats) */
 int	Rflag;		/* show flow / RSS statistics */
 int	sflag;		/* show protocol statistics */
 int	Wflag;		/* wide display */
 int	Tflag;		/* TCP Information */
 int	xflag;		/* extra information, includes all socket buffer info */
 int	zflag;		/* zero stats */
 
 int	interval;	/* repeat interval for i/f stats */
 
 char	*interface;	/* desired i/f for stats, or NULL for all i/fs */
 int	unit;		/* unit number for above */
 
 static int	af;		/* address family */
 int	live;		/* true if we are examining a live system */
 
 int
 main(int argc, char *argv[])
 {
 	struct protox *tp = NULL;  /* for printing cblocks & stats */
 	int ch;
 	int fib = -1;
 	char *endptr;
 	bool first = true;
 
 	af = AF_UNSPEC;
 
 	argc = xo_parse_args(argc, argv);
 	if (argc < 0)
 		exit(EXIT_FAILURE);
 
 	while ((ch = getopt(argc, argv, "46AaBbCcdF:f:ghI:iLlM:mN:nOoPp:Qq:RrSTsuWw:xz"))
 	    != -1)
 		switch(ch) {
 		case '4':
 #ifdef INET
 			af = AF_INET;
 #else
 			errx(1, "IPv4 support is not compiled in");
 #endif
 			break;
 		case '6':
 #ifdef INET6
 			af = AF_INET6;
 #else
 			errx(1, "IPv6 support is not compiled in");
 #endif
 			break;
 		case 'A':
 			Aflag = 1;
 			break;
 		case 'a':
 			aflag = 1;
 			break;
 		case 'B':
 			Bflag = 1;
 			break;
 		case 'b':
 			bflag = 1;
 			break;
 		case 'c':
 			cflag = 1;
 			break;
 		case 'C':
 			Cflag = 1;
 			break;
 		case 'd':
 			dflag = 1;
 			break;
 		case 'F':
 			fib = strtol(optarg, &endptr, 0);
 			if (*endptr != '\0' ||
 			    (fib == 0 && (errno == EINVAL || errno == ERANGE)))
 				xo_errx(1, "%s: invalid fib", optarg);
 			break;
 		case 'f':
 			if (strcmp(optarg, "inet") == 0)
 				af = AF_INET;
 #ifdef INET6
 			else if (strcmp(optarg, "inet6") == 0)
 				af = AF_INET6;
 #endif
 #ifdef IPSEC
 			else if (strcmp(optarg, "pfkey") == 0)
 				af = PF_KEY;
 #endif
 			else if (strcmp(optarg, "unix") == 0 ||
 				 strcmp(optarg, "local") == 0)
 				af = AF_UNIX;
 #ifdef NETGRAPH
 			else if (strcmp(optarg, "ng") == 0
 			    || strcmp(optarg, "netgraph") == 0)
 				af = AF_NETGRAPH;
 #endif
 			else if (strcmp(optarg, "link") == 0)
 				af = AF_LINK;
 			else {
 				xo_errx(1, "%s: unknown address family",
 				    optarg);
 			}
 			break;
 		case 'g':
 			gflag = 1;
 			break;
 		case 'h':
 			hflag = 1;
 			break;
 		case 'I': {
 			char *cp;
 
 			iflag = 1;
 			for (cp = interface = optarg; isalpha(*cp); cp++)
 				continue;
 			unit = atoi(cp);
 			break;
 		}
 		case 'i':
 			iflag = 1;
 			break;
 		case 'L':
 			Lflag = 1;
 			break;
 		case 'M':
 			memf = optarg;
 			break;
 		case 'm':
 			mflag = 1;
 			break;
 		case 'N':
 			nlistf = optarg;
 			break;
 		case 'n':
 			numeric_addr = numeric_port = 1;
 			break;
 		case 'o':
 			oflag = 1;
 			break;
 		case 'O':
 			Oflag = 1;
 			break;
 		case 'P':
 			Pflag = 1;
 			break;
 		case 'p':
 			if ((tp = name2protox(optarg)) == NULL) {
 				xo_errx(1, "%s: unknown or uninstrumented "
 				    "protocol", optarg);
 			}
 			pflag = 1;
 			break;
 		case 'Q':
 			Qflag = 1;
 			break;
 		case 'q':
 			noutputs = atoi(optarg);
 			if (noutputs != 0)
 				noutputs++;
 			break;
 		case 'r':
 			rflag = 1;
 			break;
 		case 'R':
 			Rflag = 1;
 			break;
 		case 's':
 			++sflag;
 			break;
 		case 'S':
 			numeric_addr = 1;
 			break;
 		case 'u':
 			af = AF_UNIX;
 			break;
 		case 'W':
 		case 'l':
 			Wflag = 1;
 			break;
 		case 'w':
 			interval = atoi(optarg);
 			iflag = 1;
 			break;
 		case 'T':
 			Tflag = 1;
 			break;
 		case 'x':
 			xflag = 1;
 			break;
 		case 'z':
 			zflag = 1;
 			break;
 		case '?':
 		default:
 			usage();
 		}
 	argv += optind;
 	argc -= optind;
 
 #define	BACKWARD_COMPATIBILITY
 #ifdef	BACKWARD_COMPATIBILITY
 	if (*argv) {
 		if (isdigit(**argv)) {
 			interval = atoi(*argv);
 			if (interval <= 0)
 				usage();
 			++argv;
 			iflag = 1;
 		}
 		if (*argv) {
 			nlistf = *argv;
 			if (*++argv)
 				memf = *argv;
 		}
 	}
 #endif
 
 	/*
 	 * Discard setgid privileges if not the running kernel so that bad
 	 * guys can't print interesting stuff from kernel memory.
 	 */
 	live = (nlistf == NULL && memf == NULL);
 	if (!live) {
 		if (setgid(getgid()) != 0)
 			xo_err(-1, "setgid");
 		/* Load all necessary kvm symbols */
 		kresolve_list(nl);
 	}
 
 	if (xflag && Tflag)
 		xo_errx(1, "-x and -T are incompatible, pick one.");
 
 	if (Bflag) {
 		if (!live)
 			usage();
 		bpf_stats(interface);
 		xo_finish();
 		exit(0);
 	}
 	if (mflag) {
 		if (!live) {
 			if (kread(0, NULL, 0) == 0)
 				mbpr(kvmd, nl[N_SFSTAT].n_value);
 		} else
 			mbpr(NULL, 0);
 		xo_finish();
 		exit(0);
 	}
 	if (Qflag) {
 		if (!live) {
 			if (kread(0, NULL, 0) == 0)
 				netisr_stats();
 		} else
 			netisr_stats();
 		xo_finish();
 		exit(0);
 	}
 #if 0
 	/*
 	 * Keep file descriptors open to avoid overhead
 	 * of open/close on each call to get* routines.
 	 */
 	sethostent(1);
 	setnetent(1);
 #else
 	/*
 	 * This does not make sense any more with DNS being default over
 	 * the files.  Doing a setXXXXent(1) causes a tcp connection to be
 	 * used for the queries, which is slower.
 	 */
 #endif
 	if (iflag && !sflag) {
 		xo_open_container("statistics");
 		intpr(NULL, af);
 		xo_close_container("statistics");
 		xo_finish();
 		exit(0);
 	}
 	if (rflag) {
 		xo_open_container("statistics");
 		if (sflag) {
 			if (live) {
 				kresolve_list(nl);
 			}
 			rt_stats();
 		} else
 			routepr(fib, af);
 		xo_close_container("statistics");
 		xo_finish();
 		exit(0);
 	}
 	if (oflag) {
 		xo_open_container("statistics");
 		nhops_print(fib, af);
 		xo_close_container("statistics");
 		xo_finish();
 		exit(0);
 	}
 	if (Oflag) {
 		xo_open_container("statistics");
 		nhgrp_print(fib, af);
 		xo_close_container("statistics");
 		xo_finish();
 		exit(0);
 	}
 
 
 
 	if (gflag) {
 		xo_open_container("statistics");
 		if (sflag) {
 			if (af == AF_INET || af == AF_UNSPEC)
 				mrt_stats();
 #ifdef INET6
 			if (af == AF_INET6 || af == AF_UNSPEC)
 				mrt6_stats();
 #endif
 		} else {
 			if (af == AF_INET || af == AF_UNSPEC)
 				mroutepr();
 #ifdef INET6
 			if (af == AF_INET6 || af == AF_UNSPEC)
 				mroute6pr();
 #endif
 		}
 		xo_close_container("statistics");
 		xo_finish();
 		exit(0);
 	}
 
 	if (tp) {
 		xo_open_container("statistics");
 		printproto(tp, tp->pr_name, &first);
 		if (!first)
 			xo_close_list("socket");
 		xo_close_container("statistics");
 		xo_finish();
 		exit(0);
 	}
 
 	xo_open_container("statistics");
 	if (af == AF_INET || af == AF_UNSPEC)
 		for (tp = protox; tp->pr_name; tp++)
 			printproto(tp, tp->pr_name, &first);
 #ifdef INET6
 	if (af == AF_INET6 || af == AF_UNSPEC)
 		for (tp = ip6protox; tp->pr_name; tp++)
 			printproto(tp, tp->pr_name, &first);
 #endif /*INET6*/
 #ifdef IPSEC
 	if (af == PF_KEY || af == AF_UNSPEC)
 		for (tp = pfkeyprotox; tp->pr_name; tp++)
 			printproto(tp, tp->pr_name, &first);
 #endif /*IPSEC*/
 #ifdef NETGRAPH
 	if (af == AF_NETGRAPH || af == AF_UNSPEC)
 		for (tp = netgraphprotox; tp->pr_name; tp++)
 			printproto(tp, tp->pr_name, &first);
 #endif /* NETGRAPH */
 	if ((af == AF_UNIX || af == AF_UNSPEC) && !sflag)
 		unixpr(nl[N_UNP_COUNT].n_value, nl[N_UNP_GENCNT].n_value,
 		    nl[N_UNP_DHEAD].n_value, nl[N_UNP_SHEAD].n_value,
 		    nl[N_UNP_SPHEAD].n_value, &first);
 
 	if (!first)
 		xo_close_list("socket");
 	xo_close_container("statistics");
 	xo_finish();
 	exit(0);
 }
 
 static int
 fetch_stats_internal(const char *sysctlname, u_long off, void *stats,
     size_t len, kreadfn_t kreadfn, int zero)
 {
 	int error;
 
 	if (live) {
 		memset(stats, 0, len);
 		if (zero)
 			error = sysctlbyname(sysctlname, NULL, NULL, stats,
 			    len);
 		else
 			error = sysctlbyname(sysctlname, stats, &len, NULL, 0);
 		if (error == -1 && errno != ENOENT)
 			xo_warn("sysctl %s", sysctlname);
 	} else {
 		if (off == 0)
 			return (1);
 		error = kreadfn(off, stats, len);
 	}
 	return (error);
 }
 
 int
 fetch_stats(const char *sysctlname, u_long off, void *stats,
     size_t len, kreadfn_t kreadfn)
 {
 
 	return (fetch_stats_internal(sysctlname, off, stats, len, kreadfn,
     zflag));
 }
 
 int
 fetch_stats_ro(const char *sysctlname, u_long off, void *stats,
     size_t len, kreadfn_t kreadfn)
 {
 
 	return (fetch_stats_internal(sysctlname, off, stats, len, kreadfn, 0));
 }
 
 /*
  * Print out protocol statistics or control blocks (per sflag).
  * If the interface was not specifically requested, and the symbol
  * is not in the namelist, ignore this one.
  */
 static void
 printproto(struct protox *tp, const char *name, bool *first)
 {
 	void (*pr)(u_long, const char *, int, int);
 	u_long off;
 	bool doingdblocks = false;
 
 	if (sflag) {
 		if (iflag) {
 			if (tp->pr_istats)
 				intpr(tp->pr_istats, af);
 			else if (pflag)
 				xo_message("%s: no per-interface stats routine",
 				    tp->pr_name);
 			return;
 		} else {
 			pr = tp->pr_stats;
 			if (!pr) {
 				if (pflag)
 					xo_message("%s: no stats routine",
 					    tp->pr_name);
 				return;
 			}
 			if (tp->pr_usesysctl && live)
 				off = 0;
 			else if (tp->pr_sindex < 0) {
 				if (pflag)
 					xo_message("%s: stats routine doesn't "
 					    "work on cores", tp->pr_name);
 				return;
 			} else
 				off = nl[tp->pr_sindex].n_value;
 		}
 	} else {
 		doingdblocks = true;
 		pr = tp->pr_cblocks;
 		if (!pr) {
 			if (pflag)
 				xo_message("%s: no PCB routine", tp->pr_name);
 			return;
 		}
 		if (tp->pr_usesysctl && live)
 			off = 0;
 		else if (tp->pr_index < 0) {
 			if (pflag)
 				xo_message("%s: PCB routine doesn't work on "
 				    "cores", tp->pr_name);
 			return;
 		} else
 			off = nl[tp->pr_index].n_value;
 	}
 	if (pr != NULL && (off || (live && tp->pr_usesysctl) ||
 	    af != AF_UNSPEC)) {
 		if (doingdblocks && *first) {
 			xo_open_list("socket");
 			*first = false;
 		}
 
 		(*pr)(off, name, af, tp->pr_protocol);
 	}
 }
 
 static int
 kvmd_init(void)
 {
 	char errbuf[_POSIX2_LINE_MAX];
 
 	if (kvmd != NULL)
 		return (0);
 
 	kvmd = kvm_openfiles(nlistf, memf, NULL, O_RDONLY, errbuf);
 	if (setgid(getgid()) != 0)
 		xo_err(-1, "setgid");
 
 	if (kvmd == NULL) {
 		xo_warnx("kvm not available: %s", errbuf);
 		return (-1);
 	}
 
 	return (0);
 }
 
 /*
  * Resolve symbol list, return 0 on success.
  */
 static int
 kresolve_list(struct nlist *_nl)
 {
 
 	if ((kvmd == NULL) && (kvmd_init() != 0))
 		return (-1);
 
 	if (_nl[0].n_type != 0)
 		return (0);
 
 	if (kvm_nlist(kvmd, _nl) < 0) {
 		if (nlistf)
 			xo_errx(1, "%s: kvm_nlist: %s", nlistf,
 			    kvm_geterr(kvmd));
 		else
 			xo_errx(1, "kvm_nlist: %s", kvm_geterr(kvmd));
 	}
 
 	return (0);
 }
 
 /*
  * Wrapper of kvm_dpcpu_setcpu().
  */
 void
 kset_dpcpu(u_int cpuid)
 {
 
 	if ((kvmd == NULL) && (kvmd_init() != 0))
 		xo_errx(-1, "%s: kvm is not available", __func__);
 
 	if (kvm_dpcpu_setcpu(kvmd, cpuid) < 0)
 		xo_errx(-1, "%s: kvm_dpcpu_setcpu(%u): %s", __func__,
 		    cpuid, kvm_geterr(kvmd)); 
 	return;
 }
 
 /*
  * Read kernel memory, return 0 on success.
  */
 int
 kread(u_long addr, void *buf, size_t size)
 {
 
 	if (kvmd_init() < 0)
 		return (-1);
 
 	if (!buf)
 		return (0);
 	if (kvm_read(kvmd, addr, buf, size) != (ssize_t)size) {
 		xo_warnx("%s", kvm_geterr(kvmd));
 		return (-1);
 	}
 	return (0);
 }
 
 /*
  * Read single counter(9).
  */
 uint64_t
 kread_counter(u_long addr)
 {
 
 	if (kvmd_init() < 0)
 		return (-1);
 
 	return (kvm_counter_u64_fetch(kvmd, addr));
 }
 
 /*
  * Read an array of N counters in kernel memory into array of N uint64_t's.
  */
 int
 kread_counters(u_long addr, void *buf, size_t size)
 {
 	uint64_t *c;
 	u_long *counters;
 	size_t i, n;
 
 	if (kvmd_init() < 0)
 		return (-1);
 
 	if (size % sizeof(uint64_t) != 0) {
 		xo_warnx("kread_counters: invalid counter set size");
 		return (-1);
 	}
 
 	n = size / sizeof(uint64_t);
 	if ((counters = malloc(n * sizeof(u_long))) == NULL)
 		xo_err(-1, "malloc");
 	if (kread(addr, counters, n * sizeof(u_long)) < 0) {
 		free(counters);
 		return (-1);
 	}
 
 	c = buf;
 	for (i = 0; i < n; i++)
 		c[i] = kvm_counter_u64_fetch(kvmd, counters[i]);
 
 	free(counters);
 	return (0);
 }
 
 const char *
 plural(uintmax_t n)
 {
 	return (n != 1 ? "s" : "");
 }
 
 const char *
 plurales(uintmax_t n)
 {
 	return (n != 1 ? "es" : "");
 }
 
 const char *
 pluralies(uintmax_t n)
 {
 	return (n != 1 ? "ies" : "y");
 }
 
 /*
  * Find the protox for the given "well-known" name.
  */
 static struct protox *
 knownname(const char *name)
 {
 	struct protox **tpp, *tp;
 
 	for (tpp = protoprotox; *tpp; tpp++)
 		for (tp = *tpp; tp->pr_name; tp++)
 			if (strcmp(tp->pr_name, name) == 0)
 				return (tp);
 	return (NULL);
 }
 
 /*
  * Find the protox corresponding to name.
  */
 static struct protox *
 name2protox(const char *name)
 {
 	struct protox *tp;
 	char **alias;			/* alias from p->aliases */
 	struct protoent *p;
 
 	/*
 	 * Try to find the name in the list of "well-known" names. If that
 	 * fails, check if name is an alias for an Internet protocol.
 	 */
 	if ((tp = knownname(name)) != NULL)
 		return (tp);
 
 	setprotoent(1);			/* make protocol lookup cheaper */
 	while ((p = getprotoent()) != NULL) {
 		/* assert: name not same as p->name */
 		for (alias = p->p_aliases; *alias; alias++)
 			if (strcmp(name, *alias) == 0) {
 				endprotoent();
 				return (knownname(p->p_name));
 			}
 	}
 	endprotoent();
 	return (NULL);
 }
 
 static void
 usage(void)
 {
 	(void)xo_error("%s\n%s\n%s\n%s\n%s\n%s\n%s\n%s\n%s\n%s\n%s\n%s\n",
 "usage: netstat [-46AaCcLnRSTWx] [-f protocol_family | -p protocol]\n"
 "               [-M core] [-N system]",
 "       netstat -i | -I interface [-46abdhnW] [-f address_family]\n"
 "               [-M core] [-N system]",
 "       netstat -w wait [-I interface] [-46d] [-M core] [-N system]\n"
 "               [-q howmany]",
 "       netstat -s [-46sz] [-f protocol_family | -p protocol]\n"
 "               [-M core] [-N system]",
 "       netstat -i | -I interface -s [-46s]\n"
 "               [-f protocol_family | -p protocol] [-M core] [-N system]",
 "       netstat -m [-M core] [-N system]",
 "       netstat -B [-z] [-I interface]",
 "       netstat -r [-46AnW] [-F fibnum] [-f address_family]\n"
 "               [-M core] [-N system]",
 "       netstat -rs [-s] [-M core] [-N system]",
 "       netstat -g [-46W] [-f address_family] [-M core] [-N system]",
 "       netstat -gs [-46s] [-f address_family] [-M core] [-N system]",
 "       netstat -Q");
 	xo_finish();
 	exit(1);
 }