diff --git a/share/man/man4/tcp.4 b/share/man/man4/tcp.4
index e6f4ced0c1cf..ba5d57bcf46c 100644
--- a/share/man/man4/tcp.4
+++ b/share/man/man4/tcp.4
@@ -1,1117 +1,1117 @@
 .\" Copyright (c) 1983, 1991, 1993
 .\"	The Regents of the University of California.
 .\" Copyright (c) 2010-2011 The FreeBSD Foundation
 .\" All rights reserved.
 .\"
 .\" Portions of this documentation were written at the Centre for Advanced
 .\" Internet Architectures, Swinburne University of Technology, Melbourne,
 .\" Australia by David Hayes under sponsorship from the FreeBSD Foundation.
 .\"
 .\" Redistribution and use in source and binary forms, with or without
 .\" modification, are permitted provided that the following conditions
 .\" are met:
 .\" 1. Redistributions of source code must retain the above copyright
 .\"    notice, this list of conditions and the following disclaimer.
 .\" 2. Redistributions in binary form must reproduce the above copyright
 .\"    notice, this list of conditions and the following disclaimer in the
 .\"    documentation and/or other materials provided with the distribution.
 .\" 3. Neither the name of the University nor the names of its contributors
 .\"    may be used to endorse or promote products derived from this software
 .\"    without specific prior written permission.
 .\"
 .\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 .\" ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 .\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 .\" SUCH DAMAGE.
 .\"
 .\"     From: @(#)tcp.4	8.1 (Berkeley) 6/5/93
 .\" $FreeBSD$
 .\"
-.Dd August 1, 2022
+.Dd October 7, 2022
 .Dt TCP 4
 .Os
 .Sh NAME
 .Nm tcp
 .Nd Internet Transmission Control Protocol
 .Sh SYNOPSIS
 .In sys/types.h
 .In sys/socket.h
 .In netinet/in.h
 .In netinet/tcp.h
 .Ft int
 .Fn socket AF_INET SOCK_STREAM 0
 .Sh DESCRIPTION
 The
 .Tn TCP
 protocol provides reliable, flow-controlled, two-way
 transmission of data.
 It is a byte-stream protocol used to
 support the
 .Dv SOCK_STREAM
 abstraction.
 .Tn TCP
 uses the standard
 Internet address format and, in addition, provides a per-host
 collection of
 .Dq "port addresses" .
 Thus, each address is composed
 of an Internet address specifying the host and network,
 with a specific
 .Tn TCP
 port on the host identifying the peer entity.
 .Pp
 Sockets utilizing the
 .Tn TCP
 protocol are either
 .Dq active
 or
 .Dq passive .
 Active sockets initiate connections to passive
 sockets.
 By default,
 .Tn TCP
 sockets are created active; to create a
 passive socket, the
 .Xr listen 2
 system call must be used
 after binding the socket with the
 .Xr bind 2
 system call.
 Only passive sockets may use the
 .Xr accept 2
 call to accept incoming connections.
 Only active sockets may use the
 .Xr connect 2
 call to initiate connections.
 .Pp
 Passive sockets may
 .Dq underspecify
 their location to match
 incoming connection requests from multiple networks.
 This technique, termed
 .Dq "wildcard addressing" ,
 allows a single
 server to provide service to clients on multiple networks.
 To create a socket which listens on all networks, the Internet
 address
 .Dv INADDR_ANY
 must be bound.
 The
 .Tn TCP
 port may still be specified
 at this time; if the port is not specified, the system will assign one.
 Once a connection has been established, the socket's address is
 fixed by the peer entity's location.
 The address assigned to the
 socket is the address associated with the network interface
 through which packets are being transmitted and received.
 Normally, this address corresponds to the peer entity's network.
 .Pp
 .Tn TCP
 supports a number of socket options which can be set with
 .Xr setsockopt 2
 and tested with
 .Xr getsockopt 2 :
 .Bl -tag -width ".Dv TCP_FUNCTION_BLK"
 .It Dv TCP_INFO
 Information about a socket's underlying TCP session may be retrieved
 by passing the read-only option
 .Dv TCP_INFO
 to
 .Xr getsockopt 2 .
 It accepts a single argument: a pointer to an instance of
 .Vt "struct tcp_info" .
 .Pp
 This API is subject to change; consult the source to determine
 which fields are currently filled out by this option.
 .Fx
 specific additions include
 send window size,
 receive window size,
 and
 bandwidth-controlled window space.
 .It Dv TCP_CCALGOOPT
 Set or query congestion control algorithm specific parameters.
 See
 .Xr mod_cc 4
 for details.
 .It Dv TCP_CONGESTION
 Select or query the congestion control algorithm that TCP will use for the
 connection.
 See
 .Xr mod_cc 4
 for details.
 .It Dv TCP_FASTOPEN
 Enable or disable TCP Fast Open (TFO).
 To use this option, the kernel must be built with the
 .Dv TCP_RFC7413
 option.
 .Pp
 This option can be set on the socket either before or after
 .Xr listen 2
 is invoked.
 Clearing this option on a listen socket after it has been set has no effect on
 existing TFO connections or TFO connections in progress; it only prevents new
 TFO connections from being established.
 .Pp
 For passively-created sockets, the
 .Dv TCP_FASTOPEN
 socket option can be queried to determine whether the connection was established
 using TFO.
 Note that connections that are established via a TFO
 .Tn SYN ,
 but that fall back to using a non-TFO
 .Tn SYN|ACK
 will have the
 .Dv TCP_FASTOPEN
 socket option set.
 .Pp
 In addition to the facilities defined in RFC7413, this implementation supports a
 pre-shared key (PSK) mode of operation in which the TFO server requires the
 client to be in possession of a shared secret in order for the client to be able
 to successfully open TFO connections with the server.
 This is useful, for example, in environments where TFO servers are exposed to
 both internal and external clients and only wish to allow TFO connections from
 internal clients.
 .Pp
 In the PSK mode of operation, the server generates and sends TFO cookies to
 requesting clients as usual.
 However, when validating cookies received in TFO SYNs from clients, the server
 requires the client-supplied cookie to equal
 .Bd -literal -offset left
 SipHash24(key=\fI16-byte-psk\fP, msg=\fIcookie-sent-to-client\fP)
 .Ed
 .Pp
 Multiple concurrent valid pre-shared keys are supported so that time-based
 rolling PSK invalidation policies can be implemented in the system.
 The default number of concurrent pre-shared keys is 2.
 .Pp
 This can be adjusted with the
 .Dv TCP_RFC7413_MAX_PSKS
 kernel option.
 .It Dv TCP_FUNCTION_BLK
 Select or query the set of functions that TCP will use for this connection.
 This allows a user to select an alternate TCP stack.
 The alternate TCP stack must already be loaded in the kernel.
 To list the available TCP stacks, see
 .Va functions_available
 in the
 .Sx MIB (sysctl) Variables
 section further down.
 To list the default TCP stack, see
 .Va functions_default
 in the
 .Sx MIB (sysctl) Variables
 section.
 .It Dv TCP_KEEPINIT
 This
 .Xr setsockopt 2
 option accepts a per-socket timeout argument of
 .Vt "u_int"
 in seconds, for new, non-established
 .Tn TCP
 connections.
 For the global default in milliseconds see
 .Va keepinit
 in the
 .Sx MIB (sysctl) Variables
 section further down.
 .It Dv TCP_KEEPIDLE
 This
 .Xr setsockopt 2
 option accepts an argument of
 .Vt "u_int"
 for the amount of time, in seconds, that the connection must be idle
 before keepalive probes (if enabled) are sent for the connection of this
 socket.
 If set on a listening socket, the value is inherited by the newly created
 socket upon
 .Xr accept 2 .
 For the global default in milliseconds see
 .Va keepidle
 in the
 .Sx MIB (sysctl) Variables
 section further down.
 .It Dv TCP_KEEPINTVL
 This
 .Xr setsockopt 2
 option accepts an argument of
 .Vt "u_int"
 to set the per-socket interval, in seconds, between keepalive probes sent
 to a peer.
 If set on a listening socket, the value is inherited by the newly created
 socket upon
 .Xr accept 2 .
 For the global default in milliseconds see
 .Va keepintvl
 in the
 .Sx MIB (sysctl) Variables
 section further down.
 .It Dv TCP_KEEPCNT
 This
 .Xr setsockopt 2
 option accepts an argument of
 .Vt "u_int"
 and allows a per-socket tuning of the number of probes sent, with no response,
 before the connection will be dropped.
 If set on a listening socket, the value is inherited by the newly created
 socket upon
 .Xr accept 2 .
 For the global default see the
 .Va keepcnt
 in the
 .Sx MIB (sysctl) Variables
 section further down.
 .It Dv TCP_NODELAY
 Under most circumstances,
 .Tn TCP
 sends data when it is presented;
 when outstanding data has not yet been acknowledged, it gathers
 small amounts of output to be sent in a single packet once
 an acknowledgement is received.
 For a small number of clients, such as window systems
 that send a stream of mouse events which receive no replies,
 this packetization may cause significant delays.
 The boolean option
 .Dv TCP_NODELAY
 defeats this algorithm.
 
 .It Dv TCP_MAXSEG
 By default, a sender- and
 .No receiver- Ns Tn TCP
 will negotiate among themselves to determine the maximum segment size
 to be used for each connection.
 The
 .Dv TCP_MAXSEG
 option allows the user to determine the result of this negotiation,
 and to reduce it if desired.
 .It Dv TCP_MAXUNACKTIME
 This
 .Xr setsockopt 2
 option accepts an argument of
 .Vt "u_int"
 to set the per-socket interval, in seconds, in which the connection must
 make progress. Progress is defined by at least 1 byte being acknowledged within
 the set time period. If a connection fails to make progress, then the
 .Tn TCP
 stack will terminate the connection with a reset. Note that the default
 value for this is zero which indicates no progress checks should be made.
 .It Dv TCP_NOOPT
 .Tn TCP
 usually sends a number of options in each packet, corresponding to
 various
 .Tn TCP
 extensions which are provided in this implementation.
 The boolean option
 .Dv TCP_NOOPT
 is provided to disable
 .Tn TCP
 option use on a per-connection basis.
 .It Dv TCP_NOPUSH
 By convention, the
 .No sender- Ns Tn TCP
 will set the
 .Dq push
 bit, and begin transmission immediately (if permitted) at the end of
 every user call to
 .Xr write 2
 or
 .Xr writev 2 .
 When this option is set to a non-zero value,
 .Tn TCP
 will delay sending any data at all until either the socket is closed,
 or the internal send buffer is filled.
 .It Dv TCP_MD5SIG
 This option enables the use of MD5 digests (also known as TCP-MD5)
 on writes to the specified socket.
 Outgoing traffic is digested;
 digests on incoming traffic are verified.
 When this option is enabled on a socket, all inbound and outgoing
 TCP segments must be signed with MD5 digests.
 .Pp
 One common use for this in a
 .Fx
 router deployment is to enable
 such routers to interwork with Cisco equipment at peering points.
 Support for this feature conforms to RFC 2385.
 .Pp
 In order for this option to function correctly, it is necessary for the
 administrator to add a tcp-md5 key entry to the system's security
 associations database (SADB) using the
 .Xr setkey 8
 utility.
 This entry can only be specified on a per-host basis at this time.
 .Pp
 If an SADB entry cannot be found for the destination,
 the system does not send any outgoing segments and drops any inbound segments.
 However, during connection negotiation, a non-signed segment will be accepted if
 an SADB entry does not exist between hosts.
 When a non-signed segment is accepted, the established connection is not
 protected with MD5 digests.
 .It Dv TCP_STATS
 Manage collection of connection level statistics using the
 .Xr stats 3
 framework.
 .Pp
 Each dropped segment is taken into account in the TCP protocol statistics.
 .It Dv TCP_TXTLS_ENABLE
 Enable in-kernel Transport Layer Security (TLS) for data written to this
 socket.
 See
 .Xr ktls 4
 for more details.
 .It Dv TCP_TXTLS_MODE
 The integer argument can be used to get or set the current TLS transmit mode
 of a socket.
 See
 .Xr ktls 4
 for more details.
 .It Dv TCP_RXTLS_ENABLE
 Enable in-kernel TLS for data read from this socket.
 See
 .Xr ktls 4
 for more details.
 .It Dv TCP_REUSPORT_LB_NUMA
 Changes NUMA affinity filtering for an established TCP listen
 socket.
 This option takes a single integer argument which specifies
 the NUMA domain to filter on for this listen socket.
 The argument can also have the following special values:
 .Bl -tag -width "Dv TCP_REUSPORT_LB_NUMA"
 .It Dv TCP_REUSPORT_LB_NUMA_NODOM
 Remove NUMA filtering for this listen socket.
 .It Dv TCP_REUSPORT_LB_NUMA_CURDOM
 Filter traffic associated with the domain where the calling thread is
 currently executing.
 This is typically used after a process or thread inherits a listen
 socket from its parent, and sets its CPU affinity to a particular core.
 .El
 .It Dv TCP_REMOTE_UDP_ENCAPS_PORT
 Set and get the remote UDP encapsulation port.
 It can only be set on a closed TCP socket.
 .El
 .Pp
 The option level for the
 .Xr setsockopt 2
 call is the protocol number for
 .Tn TCP ,
 available from
 .Xr getprotobyname 3 ,
 or
 .Dv IPPROTO_TCP .
 All options are declared in
 .In netinet/tcp.h .
 .Pp
 Options at the
 .Tn IP
 transport level may be used with
 .Tn TCP ;
 see
 .Xr ip 4 .
 Incoming connection requests that are source-routed are noted,
 and the reverse source route is used in responding.
 .Pp
 The default congestion control algorithm for
 .Tn TCP
 is
 .Xr cc_newreno 4 .
 Other congestion control algorithms can be made available using the
 .Xr mod_cc 4
 framework.
 .Ss MIB (sysctl) Variables
 The
 .Tn TCP
 protocol implements a number of variables in the
 .Va net.inet.tcp
 branch of the
 .Xr sysctl 3
 MIB, which can also be read or modified with
 .Xr sysctl 8 .
 .Bl -tag -width ".Va v6pmtud_blackhole_mss"
 .It Va always_keepalive
 Assume that
 .Dv SO_KEEPALIVE
 is set on all
 .Tn TCP
 connections; the kernel will
 periodically send a packet to the remote host to verify the connection
 is still up.
 .It Va blackhole
 If enabled, disable sending of RST when a connection is attempted
 to a port where there is no socket accepting connections.
 See
 .Xr blackhole 4 .
 .It Va blackhole_local
 See
 .Xr blackhole 4 .
 .It Va cc
 A number of variables for congestion control are under the
 .Va net.inet.tcp.cc
 node.
 See
 .Xr mod_cc 4 .
 .It Va cc.newreno
 Variables for NewReno congestion control are under the
 .Va net.inet.tcp.cc.newreno
 node.
 See
 .Xr cc_newreno 4 .
 .It Va delacktime
 Maximum amount of time, in milliseconds, before a delayed ACK is sent.
 .It Va delayed_ack
 Delay ACK to try and piggyback it onto a data packet or another ACK.
 .It Va do_lrd
 Enable Lost Retransmission Detection for SACK-enabled sessions, disabled by
 default.
 Under severe congestion, a retransmission can be lost which then leads to a
 mandatory Retransmission Timeout (RTO), followed by slow-start.
 LRD will try to resend the repeatedly lost packet, preventing the time-consuming
 RTO and performance reducing slow-start.
 .It Va do_prr
 Perform SACK loss recovery using the Proportional Rate Reduction (PRR) algorithm
 described in RFC6937.
 This improves the effectiveness of retransmissions particularly in environments
 with ACK thinning or burst loss events, as chances to run out of the ACK clock
 are reduced, preventing lengthy and performance reducing RTO based loss recovery
 (default is true).
 .It Va do_prr_conservative
 While doing Proportional Rate Reduction, remain strictly in a packet conserving
 mode, sending only one new packet for each ACK received.
 Helpful when a misconfigured token bucket traffic policer causes persistent
 high losses leading to RTO, but reduces PRR effectiveness in more common settings
 (default is false).
 .It Va do_tcpdrain
 Flush packets in the
 .Tn TCP
 reassembly queue if the system is low on mbufs.
 .It Va drop_synfin
 Drop TCP packets with both SYN and FIN set.
 .It Va ecn.enable
 Enable support for TCP Explicit Congestion Notification (ECN).
 ECN allows a TCP sender to reduce the transmission rate in order to
 avoid packet drops.
 .Bl -tag -compact
 .It 0
 Disable ECN.
 .It 1
 Allow incoming connections to request ECN.
 Outgoing connections will request ECN.
 .It 2
 Allow incoming connections to request ECN.
 Outgoing connections will not request ECN.
 (default)
 .It 3
 Negotiate on incoming connection for Accurate ECN, ECN, or no ECN.
 Outgoing connections will request Accurate ECN and fall back to
 ECN depending on the capabilities of the server.
 .It 4
 Negotiate on incoming connection for Accurate ECN, ECN, or no ECN.
 Outgoing connections will not request ECN.
 .El
 .It Va ecn.maxretries
 Number of retries (SYN or SYN/ACK retransmits) before disabling ECN on a
 specific connection.
 This is needed to help with connection establishment
 when a broken firewall is in the network path.
 .It Va fast_finwait2_recycle
 Recycle
 .Tn TCP
 .Dv FIN_WAIT_2
 connections faster when the socket is marked as
 .Dv SBS_CANTRCVMORE
 (no user process has the socket open, data received on
 the socket cannot be read).
 The timeout used here is
 .Va finwait2_timeout .
 .It Va fastopen.acceptany
 When non-zero, all client-supplied TFO cookies will be considered to be valid.
 The default is 0.
 .It Va fastopen.autokey
 When this and
 .Va net.inet.tcp.fastopen.server_enable
 are non-zero, a new key will be automatically generated after this many
 seconds.
 The default is 120.
 .It Va fastopen.ccache_bucket_limit
 The maximum number of entries in a client cookie cache bucket.
 The default value can be tuned with the
 .Dv TCP_FASTOPEN_CCACHE_BUCKET_LIMIT_DEFAULT
 kernel option or by setting
 .Va net.inet.tcp.fastopen_ccache_bucket_limit
 in the
 .Xr loader 8 .
 .It Va fastopen.ccache_buckets
 The number of client cookie cache buckets.
 Read-only.
 The value can be tuned with the
 .Dv TCP_FASTOPEN_CCACHE_BUCKETS_DEFAULT
 kernel option or by setting
 .Va fastopen.ccache_buckets
 in the
 .Xr loader 8 .
 .It Va fastopen.ccache_list
 Print the client cookie cache.
 Read-only.
 .It Va fastopen.client_enable
 When zero, no new active (i.e., client) TFO connections can be created.
 On the transition from enabled to disabled, the client cookie cache is cleared
 and disabled.
 The transition from enabled to disabled does not affect any active TFO
 connections in progress; it only prevents new ones from being established.
 The default is 0.
 .It Va fastopen.keylen
 The key length in bytes.
 Read-only.
 .It Va fastopen.maxkeys
 The maximum number of keys supported.
 Read-only.
 .It Va fastopen.maxpsks
 The maximum number of pre-shared keys supported.
 Read-only.
 .It Va fastopen.numkeys
 The current number of keys installed.
 Read-only.
 .It Va fastopen.numpsks
 The current number of pre-shared keys installed.
 Read-only.
 .It Va fastopen.path_disable_time
 When a failure occurs while trying to create a new active (i.e., client) TFO
 connection, new active connections on the same path, as determined by the tuple
 .Brq client_ip, server_ip, server_port ,
 will be forced to be non-TFO for this many seconds.
 Note that the path disable mechanism relies on state stored in client cookie
 cache entries, so it is possible for the disable time for a given path to be
 reduced if the corresponding client cookie cache entry is reused due to resource
 pressure before the disable period has elapsed.
 The default is
 .Dv TCP_FASTOPEN_PATH_DISABLE_TIME_DEFAULT .
 .It Va fastopen.psk_enable
 When non-zero, pre-shared key (PSK) mode is enabled for all TFO servers.
 On the transition from enabled to disabled, all installed pre-shared keys are
 removed.
 The default is 0.
 .It Va fastopen.server_enable
 When zero, no new passive (i.e., server) TFO connections can be created.
 On the transition from enabled to disabled, all installed keys and pre-shared
 keys are removed.
 On the transition from disabled to enabled, if
 .Va fastopen.autokey
 is non-zero and there are no keys installed, a new key will be generated
 immediately.
 The transition from enabled to disabled does not affect any passive TFO
 connections in progress; it only prevents new ones from being established.
 The default is 0.
 .It Va fastopen.setkey
 Install a new key by writing
 .Va net.inet.tcp.fastopen.keylen
 bytes to this sysctl.
 .It Va fastopen.setpsk
 Install a new pre-shared key by writing
 .Va net.inet.tcp.fastopen.keylen
 bytes to this sysctl.
 .It Va finwait2_timeout
 Timeout to use for fast recycling of
 .Tn TCP
 .Dv FIN_WAIT_2
 connections
 .Pq Va fast_finwait2_recycle .
 Defaults to 60 seconds.
 .It Va functions_available
 List of available TCP function blocks (TCP stacks).
 .It Va functions_default
 The default TCP function block (TCP stack).
 .It Va functions_inherit_listen_socket_stack
 Determines whether to inherit listen socket's TCP stack or use the current
 system default TCP stack, as defined by
 .Va functions_default .
 Default is true.
 .It Va hostcache
 The TCP host cache is used to cache connection details and metrics to
 improve future performance of connections between the same hosts.
 At the completion of a TCP connection, a host will cache information
 for the connection for some defined period of time.
 There are a number of
 .Va hostcache
 variables under this node.
 See
 .Va hostcache.enable .
 .It Va hostcache.bucketlimit
 The maximum number of entries for the same hash.
 Defaults to 30.
 .It Va hostcache.cachelimit
 Overall entry limit for hostcache.
 Defaults to
 .Va hashsize
 *
 .Va bucketlimit .
 .It Va hostcache.count
 The current number of entries in the host cache.
 .It Va hostcache.enable
 Enable/disable the host cache:
 .Bl -tag -compact
 .It 0
 Disable the host cache.
 .It 1
 Enable the host cache. (default)
 .El
 .It Va hostcache.expire
 Time in seconds, how long an entry should be kept in the
 host cache since last accessed.
 Defaults to 3600 (1 hour).
 .It Va hostcache.hashsize
 Size of TCP hostcache hashtable.
 This number has to be a power of two, or will be rejected.
 Defaults to 512.
 .It Va hostcache.histo
 Provide a histogram of the hostcache hash utilization.
 .It Va hostcache.list
 Provide a complete list of all current entries in the host
 cache.
 .It Va hostcache.prune
 Time in seconds between pruning expired host cache entries.
 Defaults to 300 (5 minutes).
 .It Va hostcache.purge
 Expire all entries on next pruning of host cache entries.
 Any non-zero setting will be reset to zero, once the purge
 is running.
 .Bl -tag -compact
 .It 0
 Do not purge all entries when pruning the host cache (default).
 .It 1
 Purge all entries when doing the next pruning.
 .It 2
 Purge all entries and also reseed the hash salt.
 .El
 .It Va hostcache.purgenow
 Immediately purge all entries once set to any value.
 Setting this to 2 will also reseed the hash salt.
 .It Va icmp_may_rst
 Certain
 .Tn ICMP
 unreachable messages may abort connections in
 .Tn SYN-SENT
 state.
 .It Va initcwnd_segments
 Enable the ability to specify initial congestion window in number of segments.
 The default value is 10 as suggested by RFC 6928.
 Changing the value on the fly would not affect connections
 using congestion window from the hostcache.
 Caution:
 This regulates the burst of packets allowed to be sent in the first RTT.
 The value should be relative to the link capacity.
 Start with small values for lower-capacity links.
 Large bursts can cause buffer overruns and packet drops if routers have small
 buffers or the link is experiencing congestion.
 .It Va insecure_rst
 Use criteria defined in RFC793 instead of RFC5961 for accepting RST segments.
 Default is false.
 .It Va insecure_syn
 Use criteria defined in RFC793 instead of RFC5961 for accepting SYN segments.
 Default is false.
 .It Va isn_reseed_interval
 The interval (in seconds) specifying how often the secret data used in
 RFC 1948 initial sequence number calculations should be reseeded.
 By default, this variable is set to zero, indicating that
 no reseeding will occur.
 Reseeding should not be necessary, and will break
 .Dv TIME_WAIT
 recycling for a few minutes.
 .It Va keepcnt
 Number of keepalive probes sent, with no response, before a connection
 is dropped.
 The default is 8 packets.
 .It Va keepidle
 Amount of time, in milliseconds, that the connection must be idle
 before sending keepalive probes (if enabled).
 The default is 7200000 msec (7.2M msec, 2 hours).
 .It Va keepinit
 Timeout, in milliseconds, for new, non-established
 .Tn TCP
 connections.
 The default is 75000 msec (75K msec, 75 sec).
 .It Va keepintvl
 The interval, in milliseconds, between keepalive probes sent to remote
 machines, when no response is received on a
 .Va keepidle
 probe.
 The default is 75000 msec (75K msec, 75 sec).
 .It Va log_in_vain
 Log any connection attempts to ports where there is no socket
 accepting connections.
 The value of 1 limits the logging to
 .Tn SYN
 (connection establishment) packets only.
 A value of 2 results in any
 .Tn TCP
 packets to closed ports being logged.
 Any value not listed above disables the logging
 (default is 0, i.e., the logging is disabled).
 .It Va maxtcptw
 When a TCP connection enters the
 .Dv TIME_WAIT
 state, its associated socket structure is freed, since it is of
 negligible size and use, and a new structure is allocated to contain a
 minimal amount of information necessary for sustaining a connection in
 this state, called the compressed TCP
 .Dv TIME_WAIT
 state.
 Since this structure is smaller than a socket structure, it can save
 a significant amount of system memory.
 The
 .Va net.inet.tcp.maxtcptw
 MIB variable controls the maximum number of these structures allocated.
 By default, it is initialized to
 .Va kern.ipc.maxsockets
 / 5.
 .It Va minmss
 Minimum TCP Maximum Segment Size; used to prevent a denial of service attack
 from an unreasonably low MSS.
 .It Va msl
 The Maximum Segment Lifetime, in milliseconds, for a packet.
 .It Va mssdflt
 The default value used for the TCP Maximum Segment Size
 .Pq Dq MSS
 for IPv4 when no advice to the contrary is received from MSS negotiation.
 .It Va newcwd
 Enable the New Congestion Window Validation mechanism as described in RFC 7661.
 This gently reduces the congestion window during periods where TCP is
 application limited and the network bandwidth is not utilized completely.
 That prevents self-inflicted packet losses once the application starts to
 transmit data at a higher speed.
 .It Va nolocaltimewait
 Suppress creation of compressed TCP
 .Dv TIME_WAIT
 states for connections in
 which both endpoints are local.
 .It Va path_mtu_discovery
 Enable Path MTU Discovery.
 .It Va pcbcount
 Number of active protocol control blocks
 (read-only).
 .It Va perconn_stats_enable
 Controls the default collection of statistics for all connections using the
 .Xr stats 3
 framework.
 0 disables, 1 enables, 2 enables random sampling across log id connection
 groups with all connections in a group receiving the same setting.
 .It Va perconn_stats_sample_rates
 A CSV list of template_spec=percent key-value pairs which controls the per
 template sampling rates when
 .Xr stats 3
 sampling is enabled.
 .It Va persmax
 Maximum persistence interval, msec.
 .It Va persmin
 Minimum persistence interval, msec.
 .It Va pmtud_blackhole_detection
 Enable automatic path MTU blackhole detection.
 In case of retransmits of MSS sized segments,
 the OS will lower the MSS to check if it's an MTU problem.
 If the current MSS is greater than the configured value to try
 .Po Va net.inet.tcp.pmtud_blackhole_mss
 and
 .Va net.inet.tcp.v6pmtud_blackhole_mss
 .Pc ,
 it will be set to this value, otherwise,
 the MSS will be set to the default values
 .Po Va net.inet.tcp.mssdflt
 and
 .Va net.inet.tcp.v6mssdflt
 .Pc .
 Settings:
 .Bl -tag -compact
 .It 0
 Disable path MTU blackhole detection.
 .It 1
 Enable path MTU blackhole detection for IPv4 and IPv6.
 .It 2
 Enable path MTU blackhole detection only for IPv4.
 .It 3
 Enable path MTU blackhole detection only for IPv6.
 .El
 .It Va pmtud_blackhole_mss
 MSS to try for IPv4 if PMTU blackhole detection is turned on.
 .It Va reass.cursegments
 The current total number of segments present in all reassembly queues.
 .It Va reass.maxqueuelen
 The maximum number of segments allowed in each reassembly queue.
 By default, the system chooses a limit based on each TCP connection's
 receive buffer size and maximum segment size (MSS).
 The actual limit applied to a session's reassembly queue will be the lower of
 the system-calculated automatic limit and the user-specified
 .Va reass.maxqueuelen
 limit.
 .It Va reass.maxsegments
 The maximum limit on the total number of segments across all reassembly
 queues.
 The limit can be adjusted as a tunable.
 .It Va recvbuf_auto
 Enable automatic receive buffer sizing as a connection progresses.
 .It Va recvbuf_max
 Maximum size of automatic receive buffer.
 .It Va recvspace
 Initial
 .Tn TCP
 receive window (buffer size).
 .It Va require_unique_port
 Require unique ephemeral port for outgoing connections;
 otherwise, the 4-tuple of local and remote ports and addresses must be unique.
 Requiring a unique port limits the number of outgoing connections.
 .It Va rexmit_drop_options
 Drop TCP options from third and later retransmitted SYN segments
 of a connection.
 .It Va rexmit_initial , rexmit_min , rexmit_slop
 Adjust the retransmit timer calculation for
 .Tn TCP .
 The slop is
 typically added to the raw calculation to take into account
 occasional variances that the
 .Tn SRTT
 (smoothed round-trip time)
 is unable to accommodate, while the minimum specifies an
 absolute minimum.
 While a number of
 .Tn TCP
 RFCs suggest a 1
 second minimum, these RFCs tend to focus on streaming behavior,
 and fail to deal with the fact that a 1 second minimum has severe
 detrimental effects over lossy interactive connections, such
 as an 802.11b wireless link, and over very fast but lossy
 connections for those cases not covered by the fast retransmit
 code.
 For this reason, we use 200ms of slop and a near-0
 minimum, which gives us an effective minimum of 200ms (similar to
 .Tn Linux ) .
 The initial value is used before an RTT measurement has been performed.
 .It Va rfc1323
 Implement the window scaling and timestamp options of RFC 1323/RFC 7323
 (default is 1).
 Settings:
 .Bl -tag -compact
 .It 0
 Disable window scaling and timestamp option.
 .It 1
 Enable window scaling and timestamp option.
 .It 2
 Enable only window scaling.
 .It 3
 Enable only timestamp option.
 .El
 .It Va rfc3042
 Enable the Limited Transmit algorithm as described in RFC 3042.
 It helps avoid timeouts on lossy links and also when the congestion window
 is small, as happens on short transfers.
 .It Va rfc3390
 Enable support for RFC 3390, which allows for a variable-sized
 starting congestion window on new connections, depending on the
 maximum segment size.
 This helps throughput in general, but
 particularly affects short transfers and high-bandwidth large
 propagation-delay connections.
 .It Va rfc6675_pipe
 Deprecated and superseded by
 .Va sack.revised .
 .It Va sack.enable
 Enable support for RFC 2018, TCP Selective Acknowledgment option,
 which allows the receiver to inform the sender about all successfully
 arrived segments, allowing the sender to retransmit the missing segments
 only.
 .It Va sack.globalholes
 Global number of TCP SACK holes currently allocated.
 .It Va sack.globalmaxholes
 Maximum number of SACK holes per system, across all connections.
 Defaults to 65536.
 .It Va sack.maxholes
 Maximum number of SACK holes per connection.
 Defaults to 128.
 .It Va sack.revised
 Enables three updated mechanisms from RFC 6675 (default is true).
 First, the bytes in flight are calculated using the algorithm described in
 RFC 6675, which is also an improvement when Proportional Rate Reduction is
 enabled.
 Next, Rescue Retransmission helps timely loss recovery when the trailing
 segments of a transmission are lost while no additional data is ready to be
 sent.
 In case a partial ACK without a SACK block is received during SACK loss
 recovery, the trailing segment is immediately resent, rather than waiting
 for a retransmission timeout.
 Finally, SACK loss recovery is also engaged once two segments plus one byte
 are SACKed, even if no traditional duplicate ACKs were observed.
 .It Va sendbuf_auto
 Enable automatic send buffer sizing.
 .It Va sendbuf_auto_lowat
 Modify threshold for auto send buffer growth to account for
 .Dv SO_SNDLOWAT .
 .It Va sendbuf_inc
 Incrementor step size of automatic send buffer.
 .It Va sendbuf_max
 Maximum size of automatic send buffer.
 .It Va sendspace
 Initial
 .Tn TCP
 send window (buffer size).
 .It Va syncache
 Variables under the
 .Va net.inet.tcp.syncache
 node are documented in
 .Xr syncache 4 .
 .It Va syncookies
 Determines whether or not
 .Tn SYN
 cookies should be generated for outbound
 .Tn SYN-ACK
 packets.
 .Tn SYN
 cookies are a great help during
 .Tn SYN
 flood attacks, and are enabled by default.
 (See
 .Xr syncookies 4 . )
 .It Va syncookies_only
 See
 .Xr syncookies 4 .
 .It Va tcbhashsize
 Size of the
 .Tn TCP
 control-block hash table
 (read-only).
 This is tuned using the kernel option
 .Dv TCBHASHSIZE
 or by setting
 .Va net.inet.tcp.tcbhashsize
 in the
 .Xr loader 8 .
 .It Va tolerate_missing_ts
 Tolerate missing timestamps (RFC 1323/RFC 7323) in
 .Tn TCP
 segments belonging to
 .Tn TCP
 connections for which support of
 .Tn TCP
 timestamps has been negotiated.
 As of June 2021, several TCP stacks are known to violate RFC 7323, including
 modern widely deployed ones.
 Therefore the default is 1, i.e., missing timestamps are tolerated.
 .It Va ts_offset_per_conn
 When initializing the TCP timestamps, use a per connection offset instead of a
 per host pair offset.
 Default is to use per connection offsets as recommended in RFC 7323.
 .It Va tso
 Enable TCP Segmentation Offload.
 .It Va udp_tunneling_overhead
 The overhead taken into account when using UDP encapsulation.
 Since MSS clamping by middleboxes will most likely not work, values larger than
 8 (the size of the UDP header) are also supported.
 Supported values are between 8 and 1024.
 The default is 8.
 .It Va udp_tunneling_port
 The local UDP encapsulation port.
 A value of 0 indicates that UDP encapsulation is disabled.
 The default is 0.
 .It Va v6mssdflt
 The default value used for the TCP Maximum Segment Size
 .Pq Dq MSS
 for IPv6 when no advice to the contrary is received from MSS negotiation.
 .It Va v6pmtud_blackhole_mss
 MSS to try for IPv6 if PMTU blackhole detection is turned on.
 See
 .Va pmtud_blackhole_detection .
 .El
 .Sh ERRORS
 A socket operation may fail with one of the following errors returned:
 .Bl -tag -width Er
 .It Bq Er EISCONN
 when trying to establish a connection on a socket which
 already has one;
 .It Bo Er ENOBUFS Bc or Bo Er ENOMEM Bc
 when the system runs out of memory for
 an internal data structure;
 .It Bq Er ETIMEDOUT
 when a connection was dropped
 due to excessive retransmissions;
 .It Bq Er ECONNRESET
 when the remote peer
 forces the connection to be closed;
 .It Bq Er ECONNREFUSED
 when the remote
 peer actively refuses connection establishment (usually because
 no process is listening to the port);
 .It Bq Er EADDRINUSE
 when an attempt
 is made to create a socket with a port which has already been
 allocated;
 .It Bq Er EADDRNOTAVAIL
 when an attempt is made to create a
 socket with a network address for which no network interface
 exists;
 .It Bq Er EAFNOSUPPORT
 when an attempt is made to bind or connect a socket to a multicast
 address;
 .It Bq Er EINVAL
 when trying to change TCP function blocks at an invalid point in the session;
 .It Bq Er ENOENT
 when trying to use a TCP function block that is not available.
 .El
 .Sh SEE ALSO
 .Xr getsockopt 2 ,
 .Xr socket 2 ,
 .Xr stats 3 ,
 .Xr sysctl 3 ,
 .Xr blackhole 4 ,
 .Xr inet 4 ,
 .Xr intro 4 ,
 .Xr ip 4 ,
 .Xr ktls 4 ,
 .Xr mod_cc 4 ,
 .Xr siftr 4 ,
 .Xr syncache 4 ,
 .Xr tcp_bbr 4 ,
 .Xr setkey 8 ,
 .Xr sysctl 8 ,
 .Xr tcp_functions 9
 .Rs
 .%A "V. Jacobson"
 .%A "B. Braden"
 .%A "D. Borman"
 .%T "TCP Extensions for High Performance"
 .%O "RFC 1323"
 .Re
 .Rs
 .%A "D. Borman"
 .%A "B. Braden"
 .%A "V. Jacobson"
 .%A "R. Scheffenegger"
 .%T "TCP Extensions for High Performance"
 .%O "RFC 7323"
 .Re
 .Rs
 .%A "A. Heffernan"
 .%T "Protection of BGP Sessions via the TCP MD5 Signature Option"
 .%O "RFC 2385"
 .Re
 .Rs
 .%A "K. Ramakrishnan"
 .%A "S. Floyd"
 .%A "D. Black"
 .%T "The Addition of Explicit Congestion Notification (ECN) to IP"
 .%O "RFC 3168"
 .Re
 .Sh HISTORY
 The
 .Tn TCP
 protocol appeared in
 .Bx 4.2 .
 The RFC 1323 extensions for window scaling and timestamps were added
 in
 .Bx 4.4 .
 The
 .Dv TCP_INFO
 option was introduced in
 .Tn Linux 2.6
 and is
 .Em subject to change .
diff --git a/sys/netinet/tcp_syncache.c b/sys/netinet/tcp_syncache.c
index d5380c32391b..2f0e6236f612 100644
--- a/sys/netinet/tcp_syncache.c
+++ b/sys/netinet/tcp_syncache.c
@@ -1,2581 +1,2581 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2001 McAfee, Inc.
  * Copyright (c) 2006,2013 Andre Oppermann, Internet Business Solutions AG
  * All rights reserved.
  *
  * This software was developed for the FreeBSD Project by Jonathan Lemon
  * and McAfee Research, the Security Research Division of McAfee, Inc. under
  * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the
  * DARPA CHATS research program. [2001 McAfee, Inc.]
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_ipsec.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/hash.h>
 #include <sys/refcount.h>
 #include <sys/kernel.h>
 #include <sys/sysctl.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/proc.h>		/* for proc0 declaration */
 #include <sys/random.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/syslog.h>
 #include <sys/ucred.h>
 
 #include <sys/md5.h>
 #include <crypto/siphash/siphash.h>
 
 #include <vm/uma.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/route.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_kdtrace.h>
 #include <netinet/in_systm.h>
 #include <netinet/ip.h>
 #include <netinet/in_var.h>
 #include <netinet/in_pcb.h>
 #include <netinet/ip_var.h>
 #include <netinet/ip_options.h>
 #ifdef INET6
 #include <netinet/ip6.h>
 #include <netinet/icmp6.h>
 #include <netinet6/nd6.h>
 #include <netinet6/ip6_var.h>
 #include <netinet6/in6_pcb.h>
 #endif
 #include <netinet/tcp.h>
 #include <netinet/tcp_fastopen.h>
 #include <netinet/tcp_fsm.h>
 #include <netinet/tcp_seq.h>
 #include <netinet/tcp_timer.h>
 #include <netinet/tcp_var.h>
 #include <netinet/tcp_syncache.h>
 #include <netinet/tcp_ecn.h>
 #ifdef TCP_OFFLOAD
 #include <netinet/toecore.h>
 #endif
 #include <netinet/udp.h>
 #include <netinet/udp_var.h>
 
 #include <netipsec/ipsec_support.h>
 
 #include <machine/in_cksum.h>
 
 #include <security/mac/mac_framework.h>
 
 /* Per-VNET switch for SYN cookies; on (1) by default. */
 VNET_DEFINE_STATIC(int, tcp_syncookies) = 1;
 #define	V_tcp_syncookies		VNET(tcp_syncookies)
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, syncookies, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(tcp_syncookies), 0,
     "Use TCP SYN cookies if the syncache overflows");
 
 /*
  * Per-VNET switch to rely on SYN cookies exclusively; off (0) by default.
  * syncache_cookiesonly() honors it only while tcp_syncookies is also set.
  */
 VNET_DEFINE_STATIC(int, tcp_syncookiesonly) = 0;
 #define	V_tcp_syncookiesonly		VNET(tcp_syncookiesonly)
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, syncookies_only, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(tcp_syncookiesonly), 0,
     "Use only TCP SYN cookies");
 
 /*
  * Per-VNET knob, on (1) by default.
  * NOTE(review): per its description this controls whether new connections
  * inherit the listen socket's TCP stack -- confirm at the point of use.
  */
 VNET_DEFINE_STATIC(int, functions_inherit_listen_socket_stack) = 1;
 #define V_functions_inherit_listen_socket_stack \
     VNET(functions_inherit_listen_socket_stack)
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, functions_inherit_listen_socket_stack,
     CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(functions_inherit_listen_socket_stack), 0,
     "Inherit listen socket's stack");
 
 #ifdef TCP_OFFLOAD
 /* True when the entry's sc_tod pointer is set. */
 #define ADDED_BY_TOE(sc) ((sc)->sc_tod != NULL)
 #endif
 
 /* Forward declarations of file-local syncache helpers. */
 static void	 syncache_drop(struct syncache *, struct syncache_head *);
 static void	 syncache_free(struct syncache *);
 static void	 syncache_insert(struct syncache *, struct syncache_head *);
 static int	 syncache_respond(struct syncache *, const struct mbuf *, int);
 static struct	 socket *syncache_socket(struct syncache *, struct socket *,
 		    struct mbuf *m);
 static void	 syncache_timeout(struct syncache *sc, struct syncache_head *sch,
 		    int docallout);
 static void	 syncache_timer(void *);
 
 /* Forward declarations of file-local SYN cookie and pause helpers. */
 static uint32_t	 syncookie_mac(struct in_conninfo *, tcp_seq, uint8_t,
 		    uint8_t *, uintptr_t);
 static tcp_seq	 syncookie_generate(struct syncache_head *, struct syncache *);
 static struct syncache
 		*syncookie_lookup(struct in_conninfo *, struct syncache_head *,
 		    struct syncache *, struct tcphdr *, struct tcpopt *,
 		    struct socket *, uint16_t);
 static void	syncache_pause(struct in_conninfo *);
 static void	syncache_unpause(void *);
 static void	 syncookie_reseed(void *);
 #ifdef INVARIANTS
 /* Only declared for INVARIANTS kernels. */
 static int	 syncookie_cmp(struct in_conninfo *inc, struct syncache_head *sch,
 		    struct syncache *sc, struct tcphdr *th, struct tcpopt *to,
 		    struct socket *lso, uint16_t port);
 #endif
 
 /*
  * Transmit the SYN,ACK fewer times than TCP_MAXRXTSHIFT specifies.
  * 3 retransmits corresponds to a timeout with default values of
  * tcp_rexmit_initial * (             1 +
  *                       tcp_backoff[1] +
  *                       tcp_backoff[2] +
  *                       tcp_backoff[3]) + 3 * tcp_rexmit_slop,
  * 1000 ms * (1 + 2 + 4 + 8) +  3 * 200 ms = 15600 ms,
  * the odds are that the user has given up attempting to connect by then.
  */
 #define SYNCACHE_MAXREXMTS		3
 
 /*
  * Arbitrary values: the default number of hash buckets and the default
  * per-bucket entry limit.  syncache_init() installs them and then lets
  * loader tunables override them.
  */
 #define TCP_SYNCACHE_HASHSIZE		512
 #define TCP_SYNCACHE_BUCKETLIMIT	30
 
 /* Per-VNET syncache state, accessed through V_tcp_syncache. */
 VNET_DEFINE_STATIC(struct tcp_syncache, tcp_syncache);
 #define	V_tcp_syncache			VNET(tcp_syncache)
 
 /* The net.inet.tcp.syncache sysctl node and its leaves. */
 static SYSCTL_NODE(_net_inet_tcp, OID_AUTO, syncache,
     CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "TCP SYN cache");
 
 /*
  * bucketlimit, cachelimit and hashsize are CTLFLAG_RDTUN; syncache_init()
  * fetches the tunables of the same names.
  */
 SYSCTL_UINT(_net_inet_tcp_syncache, OID_AUTO, bucketlimit, CTLFLAG_VNET | CTLFLAG_RDTUN,
     &VNET_NAME(tcp_syncache.bucket_limit), 0,
     "Per-bucket hash limit for syncache");
 
 SYSCTL_UINT(_net_inet_tcp_syncache, OID_AUTO, cachelimit, CTLFLAG_VNET | CTLFLAG_RDTUN,
     &VNET_NAME(tcp_syncache.cache_limit), 0,
     "Overall entry limit for syncache");
 
 /* Reported from the UMA zone that backs the syncache entries. */
 SYSCTL_UMA_CUR(_net_inet_tcp_syncache, OID_AUTO, count, CTLFLAG_VNET,
     &VNET_NAME(tcp_syncache.zone), "Current number of entries in syncache");
 
 SYSCTL_UINT(_net_inet_tcp_syncache, OID_AUTO, hashsize, CTLFLAG_VNET | CTLFLAG_RDTUN,
     &VNET_NAME(tcp_syncache.hashsize), 0,
     "Size of TCP syncache hashtable");
 
 SYSCTL_BOOL(_net_inet_tcp_syncache, OID_AUTO, see_other, CTLFLAG_VNET |
     CTLFLAG_RW, &VNET_NAME(tcp_syncache.see_other), 0,
     "All syncache(4) entries are visible, ignoring UID/GID, jail(2) "
     "and mac(4) checks");
 
 /*
  * Sysctl handler for the syncache retransmit limit.
  * A read reports the current limit.  A write is accepted only if the new
  * value does not exceed TCP_MAXRXTSHIFT; otherwise EINVAL is returned and
  * the limit is left untouched.
  */
 static int
 sysctl_net_inet_tcp_syncache_rexmtlimit_check(SYSCTL_HANDLER_ARGS)
 {
 	u_int limit = V_tcp_syncache.rexmt_limit;
 	int rv;
 
 	rv = sysctl_handle_int(oidp, &limit, 0, req);
 	/* Handler failure or a pure read: nothing to validate or store. */
 	if (rv != 0 || req->newptr == NULL)
 		return (rv);
 	if (limit > TCP_MAXRXTSHIFT)
 		return (EINVAL);
 	V_tcp_syncache.rexmt_limit = limit;
 	return (0);
 }
 
 /* Writes go through the range check in the handler above. */
 SYSCTL_PROC(_net_inet_tcp_syncache, OID_AUTO, rexmtlimit,
     CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
     &VNET_NAME(tcp_syncache.rexmt_limit), 0,
     sysctl_net_inet_tcp_syncache_rexmtlimit_check, "UI",
     "Limit on SYN/ACK retransmissions");
 
 /* Per-VNET knob, on (1) by default; not static, so visible outside this file. */
 VNET_DEFINE(int, tcp_sc_rst_sock_fail) = 1;
 SYSCTL_INT(_net_inet_tcp_syncache, OID_AUTO, rst_on_sock_fail,
     CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_sc_rst_sock_fail), 0,
     "Send reset on socket allocation failure");
 
 /* Malloc type used for the hash table allocated in syncache_init(). */
 static MALLOC_DEFINE(M_SYNCACHE, "syncache", "TCP syncache");
 
 /* Lock, unlock and assert ownership of a bucket row's mutex. */
 #define	SCH_LOCK(sch)		mtx_lock(&(sch)->sch_mtx)
 #define	SCH_UNLOCK(sch)		mtx_unlock(&(sch)->sch_mtx)
 #define	SCH_LOCK_ASSERT(sch)	mtx_assert(&(sch)->sch_mtx, MA_OWNED)
 
 /*
  * Requires the syncache entry to be already removed from the bucket list.
  */
 static void
 syncache_free(struct syncache *sc)
 {
 
 	if (sc->sc_ipopts)
 		(void) m_free(sc->sc_ipopts);
 	if (sc->sc_cred)
 		crfree(sc->sc_cred);
 #ifdef MAC
 	mac_syncache_destroy(&sc->sc_label);
 #endif
 
 	uma_zfree(V_tcp_syncache.zone, sc);
 }
 
 /*
  * Set up the syncache state reached through V_tcp_syncache: apply defaults
  * and loader tunables, allocate and initialize the hash buckets, create the
  * entry zone, start the SYN cookie reseed callout and initialize the pause
  * machinery.
  */
 void
 syncache_init(void)
 {
 	int i;
 
 	/* Built-in defaults; hashsize and bucket_limit may be tuned below. */
 	V_tcp_syncache.hashsize = TCP_SYNCACHE_HASHSIZE;
 	V_tcp_syncache.bucket_limit = TCP_SYNCACHE_BUCKETLIMIT;
 	V_tcp_syncache.rexmt_limit = SYNCACHE_MAXREXMTS;
 	V_tcp_syncache.hash_secret = arc4random();
 
 	TUNABLE_INT_FETCH("net.inet.tcp.syncache.hashsize",
 	    &V_tcp_syncache.hashsize);
 	TUNABLE_INT_FETCH("net.inet.tcp.syncache.bucketlimit",
 	    &V_tcp_syncache.bucket_limit);
 	/*
 	 * hashmask is derived as hashsize - 1, which only yields a usable
 	 * bit mask for a non-zero power of 2; otherwise fall back to the
 	 * default size.
 	 */
 	if (!powerof2(V_tcp_syncache.hashsize) ||
 	    V_tcp_syncache.hashsize == 0) {
 		printf("WARNING: syncache hash size is not a power of 2.\n");
 		V_tcp_syncache.hashsize = TCP_SYNCACHE_HASHSIZE;
 	}
 	V_tcp_syncache.hashmask = V_tcp_syncache.hashsize - 1;
 
 	/* Set limits. */
 	/* Default: every bucket filled to its limit; a tunable may override. */
 	V_tcp_syncache.cache_limit =
 	    V_tcp_syncache.hashsize * V_tcp_syncache.bucket_limit;
 	TUNABLE_INT_FETCH("net.inet.tcp.syncache.cachelimit",
 	    &V_tcp_syncache.cache_limit);
 
 	/* Allocate the hash table. */
 	V_tcp_syncache.hashbase = malloc(V_tcp_syncache.hashsize *
 	    sizeof(struct syncache_head), M_SYNCACHE, M_WAITOK | M_ZERO);
 
 #ifdef VIMAGE
 	/* Remembered so that syncache_timer() can set its vnet context. */
 	V_tcp_syncache.vnet = curvnet;
 #endif
 
 	/* Initialize the hash buckets. */
 	for (i = 0; i < V_tcp_syncache.hashsize; i++) {
 		TAILQ_INIT(&V_tcp_syncache.hashbase[i].sch_bucket);
 		mtx_init(&V_tcp_syncache.hashbase[i].sch_mtx, "tcp_sc_head",
 			 NULL, MTX_DEF);
 		/* The bucket timer is associated with the bucket mutex. */
 		callout_init_mtx(&V_tcp_syncache.hashbase[i].sch_timer,
 			 &V_tcp_syncache.hashbase[i].sch_mtx, 0);
 		V_tcp_syncache.hashbase[i].sch_length = 0;
 		V_tcp_syncache.hashbase[i].sch_sc = &V_tcp_syncache;
 		/*
 		 * NOTE(review): presumably places the "last overflow" far
 		 * enough in the past that it does not count as recent --
 		 * confirm against the readers of sch_last_overflow.
 		 */
 		V_tcp_syncache.hashbase[i].sch_last_overflow =
 		    -(SYNCOOKIE_LIFETIME + 1);
 	}
 
 	/* Create the syncache entry zone. */
 	V_tcp_syncache.zone = uma_zcreate("syncache", sizeof(struct syncache),
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
 	/* Keep whatever limit uma_zone_set_max() reports back. */
 	V_tcp_syncache.cache_limit = uma_zone_set_max(V_tcp_syncache.zone,
 	    V_tcp_syncache.cache_limit);
 
 	/* Start the SYN cookie reseeder callout. */
 	callout_init(&V_tcp_syncache.secret.reseed, 1);
 	/* Fill both secret keys with random data before the first reseed. */
 	arc4rand(V_tcp_syncache.secret.key[0], SYNCOOKIE_SECRET_SIZE, 0);
 	arc4rand(V_tcp_syncache.secret.key[1], SYNCOOKIE_SECRET_SIZE, 0);
 	callout_reset(&V_tcp_syncache.secret.reseed, SYNCOOKIE_LIFETIME * hz,
 	    syncookie_reseed, &V_tcp_syncache);
 
 	/* Initialize the pause machinery. */
 	mtx_init(&V_tcp_syncache.pause_mtx, "tcp_sc_pause", NULL, MTX_DEF);
 	callout_init_mtx(&V_tcp_syncache.pause_co, &V_tcp_syncache.pause_mtx,
 	    0);
 	/*
 	 * NOTE(review): presumably back-dates pause_until so that a previous
 	 * pause does not appear recent at startup -- confirm in
 	 * syncache_pause().
 	 */
 	V_tcp_syncache.pause_until = time_uptime - TCP_SYNCACHE_PAUSE_TIME;
 	V_tcp_syncache.pause_backoff = 0;
 	V_tcp_syncache.paused = false;
 }
 
 #ifdef VIMAGE
 /*
  * Tear down the syncache state reached through V_tcp_syncache (only built
  * for VIMAGE kernels): stop the callouts, drop every remaining entry and
  * release the zone, the hash table and the locks.
  */
 void
 syncache_destroy(void)
 {
 	struct syncache_head *sch;
 	struct syncache *sc, *nsc;
 	int i;
 
 	/*
 	 * Stop the re-seed timer before freeing resources.  No need to
 	 * possibly schedule it another time.
 	 */
 	callout_drain(&V_tcp_syncache.secret.reseed);
 
 	/* Stop the SYN cache pause callout. */
 	mtx_lock(&V_tcp_syncache.pause_mtx);
 	/*
 	 * NOTE(review): a return of 0 presumably means the callout could not
 	 * be stopped; the mutex is released before draining in that case.
 	 */
 	if (callout_stop(&V_tcp_syncache.pause_co) == 0) {
 		mtx_unlock(&V_tcp_syncache.pause_mtx);
 		callout_drain(&V_tcp_syncache.pause_co);
 	} else
 		mtx_unlock(&V_tcp_syncache.pause_mtx);
 
 	/* Cleanup hash buckets: stop timers, free entries, destroy locks. */
 	for (i = 0; i < V_tcp_syncache.hashsize; i++) {
 		sch = &V_tcp_syncache.hashbase[i];
 		callout_drain(&sch->sch_timer);
 
 		/* syncache_drop() requires the bucket lock to be held. */
 		SCH_LOCK(sch);
 		TAILQ_FOREACH_SAFE(sc, &sch->sch_bucket, sc_hash, nsc)
 			syncache_drop(sc, sch);
 		SCH_UNLOCK(sch);
 		KASSERT(TAILQ_EMPTY(&sch->sch_bucket),
 		    ("%s: sch->sch_bucket not empty", __func__));
 		KASSERT(sch->sch_length == 0, ("%s: sch->sch_length %d not 0",
 		    __func__, sch->sch_length));
 		mtx_destroy(&sch->sch_mtx);
 	}
 
 	/* Every entry must be back in the zone before it is destroyed. */
 	KASSERT(uma_zone_get_cur(V_tcp_syncache.zone) == 0,
 	    ("%s: cache_count not 0", __func__));
 
 	/* Free the allocated global resources. */
 	uma_zdestroy(V_tcp_syncache.zone);
 	free(V_tcp_syncache.hashbase, M_SYNCACHE);
 	mtx_destroy(&V_tcp_syncache.pause_mtx);
 }
 #endif
 
 /*
  * Inserts a syncache entry into the specified bucket row.
  * Locks and unlocks the syncache_head autonomously.
  * If the row is already at its limit, the entry at the tail of the row is
  * dropped first to make room.
  */
 static void
 syncache_insert(struct syncache *sc, struct syncache_head *sch)
 {
 	struct syncache *sc2;
 
 	SCH_LOCK(sch);
 
 	/*
 	 * Make sure that we don't overflow the per-bucket limit.
 	 * If the bucket is full, toss the oldest element.
 	 */
 	if (sch->sch_length >= V_tcp_syncache.bucket_limit) {
 		KASSERT(!TAILQ_EMPTY(&sch->sch_bucket),
 			("sch->sch_length incorrect"));
 		/* Every bucket overflow is reported to syncache_pause(). */
 		syncache_pause(&sc->sc_inc);
 		/* New entries go to the head, so the tail is the oldest. */
 		sc2 = TAILQ_LAST(&sch->sch_bucket, sch_head);
 		sch->sch_last_overflow = time_uptime;
 		syncache_drop(sc2, sch);
 	}
 
 	/* Put it into the bucket. */
 	TAILQ_INSERT_HEAD(&sch->sch_bucket, sc, sc_hash);
 	sch->sch_length++;
 
 #ifdef TCP_OFFLOAD
 	if (ADDED_BY_TOE(sc)) {
 		struct toedev *tod = sc->sc_tod;
 
 		tod->tod_syncache_added(tod, sc->sc_todctx);
 	}
 #endif
 
 	/* Reinitialize the bucket row's timer. */
 	/*
 	 * For the first entry in the row, push the next-callout time far
 	 * into the future so that syncache_timeout() below sees this
 	 * entry's deadline as earlier and schedules the callout for it.
 	 */
 	if (sch->sch_length == 1)
 		sch->sch_nextc = ticks + INT_MAX;
 	syncache_timeout(sc, sch, 1);
 
 	SCH_UNLOCK(sch);
 
 	TCPSTATES_INC(TCPS_SYN_RECEIVED);
 	TCPSTAT_INC(tcps_sc_added);
 }
 
 /*
  * Unlink an entry from its bucket row and release it.
  * The caller must hold the lock of the syncache head.
  */
 static void
 syncache_drop(struct syncache *sc, struct syncache_head *sch)
 {
 
 	SCH_LOCK_ASSERT(sch);
 
 	/* Account for the departing entry, then take it off the row. */
 	TCPSTATES_DEC(TCPS_SYN_RECEIVED);
 	TAILQ_REMOVE(&sch->sch_bucket, sc, sc_hash);
 	--sch->sch_length;
 
 #ifdef TCP_OFFLOAD
 	/* Notify the offload device while the entry is still valid. */
 	if (ADDED_BY_TOE(sc))
 		sc->sc_tod->tod_syncache_removed(sc->sc_tod, sc->sc_todctx);
 #endif
 
 	syncache_free(sc);
 }
 
 /*
  * Engage/reengage timer on bucket row.
  *
  * Computes the next retransmit time of the entry, counts the
  * (re)transmission and, if that time is earlier than the row's next
  * scheduled callout, records it; the callout itself is only reset when
  * docallout is non-zero.
  */
 static void
 syncache_timeout(struct syncache *sc, struct syncache_head *sch, int docallout)
 {
 	int rexmt;
 
 	if (sc->sc_rxmits != 0) {
 		/* Back off according to the number of transmissions so far. */
 		TCPT_RANGESET(rexmt,
 		    tcp_rexmit_initial * tcp_backoff[sc->sc_rxmits],
 		    tcp_rexmit_min, TCPTV_REXMTMAX);
 	} else {
 		rexmt = tcp_rexmit_initial;
 	}
 	sc->sc_rxttime = ticks + rexmt;
 	sc->sc_rxmits++;
 
 	/* Nothing more to do unless this entry is now the earliest one. */
 	if (!TSTMP_LT(sc->sc_rxttime, sch->sch_nextc))
 		return;
 
 	sch->sch_nextc = sc->sc_rxttime;
 	if (docallout)
 		callout_reset(&sch->sch_timer, sch->sch_nextc - ticks,
 		    syncache_timer, (void *)sch);
 }
 
 /*
  * Walk the timer queues, looking for SYN,ACKs that need to be retransmitted.
  * If we have retransmitted an entry the maximum number of times, expire it.
  * One separate timer for each bucket row.
  *
  * xsch is the syncache_head of the row; the callout has it locked on entry.
  */
 static void
 syncache_timer(void *xsch)
 {
 	struct syncache_head *sch = (struct syncache_head *)xsch;
 	struct syncache *sc, *nsc;
 	struct epoch_tracker et;
 	int tick = ticks;	/* one snapshot of "now" for the whole walk */
 	char *s;
 	bool paused;
 
 	CURVNET_SET(sch->sch_sc->vnet);
 
 	/* NB: syncache_head has already been locked by the callout. */
 	SCH_LOCK_ASSERT(sch);
 
 	/*
 	 * In the following cycle we may remove some entries and/or
 	 * advance some timeouts, so re-initialize the bucket timer.
 	 */
 	sch->sch_nextc = tick + INT_MAX;
 
 	/*
 	 * If we have paused processing, unconditionally remove
 	 * all syncache entries.
 	 */
 	/* The paused flag is sampled once, under the pause mutex. */
 	mtx_lock(&V_tcp_syncache.pause_mtx);
 	paused = V_tcp_syncache.paused;
 	mtx_unlock(&V_tcp_syncache.pause_mtx);
 
 	/* The _SAFE variant is needed because entries may be dropped. */
 	TAILQ_FOREACH_SAFE(sc, &sch->sch_bucket, sc_hash, nsc) {
 		if (paused) {
 			syncache_drop(sc, sch);
 			continue;
 		}
 		/*
 		 * We do not check if the listen socket still exists
 		 * and accept the case where the listen socket may be
 		 * gone by the time we resend the SYN/ACK.  We do
 		 * not expect this to happen often. If it does,
 		 * then the RST will be sent by the time the remote
 		 * host does the SYN/ACK->ACK.
 		 */
 		/* Not due yet: only track the earliest pending deadline. */
 		if (TSTMP_GT(sc->sc_rxttime, tick)) {
 			if (TSTMP_LT(sc->sc_rxttime, sch->sch_nextc))
 				sch->sch_nextc = sc->sc_rxttime;
 			continue;
 		}
 		/* Past the ECN retry limit, clear the entry's ECN flag. */
 		if (sc->sc_rxmits > V_tcp_ecn_maxretries) {
 			sc->sc_flags &= ~SCF_ECN;
 		}
 		/* Retransmit limit exceeded: give up on this entry. */
 		if (sc->sc_rxmits > V_tcp_syncache.rexmt_limit) {
 			if ((s = tcp_log_addrs(&sc->sc_inc, NULL, NULL, NULL))) {
 				log(LOG_DEBUG, "%s; %s: Retransmits exhausted, "
 				    "giving up and removing syncache entry\n",
 				    s, __func__);
 				free(s, M_TCPLOG);
 			}
 			syncache_drop(sc, sch);
 			TCPSTAT_INC(tcps_sc_stale);
 			continue;
 		}
 		if ((s = tcp_log_addrs(&sc->sc_inc, NULL, NULL, NULL))) {
 			log(LOG_DEBUG, "%s; %s: Response timeout, "
 			    "retransmitting (%u) SYN|ACK\n",
 			    s, __func__, sc->sc_rxmits);
 			free(s, M_TCPLOG);
 		}
 
 		NET_EPOCH_ENTER(et);
 		syncache_respond(sc, NULL, TH_SYN|TH_ACK);
 		NET_EPOCH_EXIT(et);
 		TCPSTAT_INC(tcps_sc_retransmitted);
 		/* No callout reset here; the row is re-armed once below. */
 		syncache_timeout(sc, sch, 0);
 	}
 	/* Re-arm the row timer only while entries remain. */
 	if (!TAILQ_EMPTY(&(sch)->sch_bucket))
 		callout_reset(&(sch)->sch_timer, (sch)->sch_nextc - tick,
 			syncache_timer, (void *)(sch));
 	CURVNET_RESTORE();
 }
 
 /*
  * Returns true if the system is only using cookies at the moment.
  * This could be due to a sysadmin decision to only use cookies, or it
  * could be due to the system detecting an attack.
  */
 static inline bool
 syncache_cookiesonly(void)
 {
 
 	return (V_tcp_syncookies && (V_tcp_syncache.paused ||
 	    V_tcp_syncookiesonly));
 }
 
 /*
  * Map a connection to its hash bucket.
  */
 static struct syncache_head *
 syncache_hashbucket(struct in_conninfo *inc)
 {
 	uint32_t idx;
 
 	/*
 	 * Hash input is foreign port + local port + foreign address.
 	 * This depends on the layout of struct in_conninfo: 16 bits of
 	 * foreign port, 16 bits of local port, then 128 bits of foreign
 	 * address, i.e. five 32-bit words.  For an IPv4 address the first
 	 * three 32-bit words of the address are always zero.
 	 */
 	idx = jenkins_hash32((uint32_t *)&inc->inc_ie, 5,
 	    V_tcp_syncache.hash_secret);
 	idx &= V_tcp_syncache.hashmask;
 
 	return (&V_tcp_syncache.hashbase[idx]);
 }
 
 /*
  * Find an entry in the syncache.
  * Returns always with locked syncache_head plus a matching entry or NULL.
  */
 static struct syncache *
 syncache_lookup(struct in_conninfo *inc, struct syncache_head **schp)
 {
 	struct syncache *sc;
 	struct syncache_head *sch;
 
 	*schp = sch = syncache_hashbucket(inc);
 	SCH_LOCK(sch);
 
 	/* Circle through bucket row to find matching entry. */
 	TAILQ_FOREACH(sc, &sch->sch_bucket, sc_hash)
 		if (bcmp(&inc->inc_ie, &sc->sc_inc.inc_ie,
 		    sizeof(struct in_endpoints)) == 0)
 			break;
 
 	return (sc);	/* Always returns with locked sch. */
 }
 
 /*
  * This function is called when we get a RST for a
  * non-existent connection, so that we can see if the
  * connection is in the syn cache.  If it is, zap it.
  * If required send a challenge ACK.
  *
  * inc identifies the connection, th is the TCP header of the RST, m is
  * handed to syncache_respond() when a challenge ACK is sent, and port is
  * compared against the entry's sc_port.  Nothing is done while only
  * SYN cookies are in use.
  */
 void
 syncache_chkrst(struct in_conninfo *inc, struct tcphdr *th, struct mbuf *m,
     uint16_t port)
 {
 	struct syncache *sc;
 	struct syncache_head *sch;
 	char *s = NULL;		/* log address string; freed at "done" */
 
 	if (syncache_cookiesonly())
 		return;
 	sc = syncache_lookup(inc, &sch);	/* returns locked sch */
 	SCH_LOCK_ASSERT(sch);
 
 	/*
 	 * Any RST to our SYN|ACK must not carry ACK, SYN or FIN flags.
 	 * See RFC 793 page 65, section SEGMENT ARRIVES.
 	 */
 	if (tcp_get_flags(th) & (TH_ACK|TH_SYN|TH_FIN)) {
 		if ((s = tcp_log_addrs(inc, th, NULL, NULL)))
 			log(LOG_DEBUG, "%s; %s: Spurious RST with ACK, SYN or "
 			    "FIN flag set, segment ignored\n", s, __func__);
 		TCPSTAT_INC(tcps_badrst);
 		goto done;
 	}
 
 	/*
 	 * No corresponding connection was found in syncache.
 	 * If syncookies are enabled and possibly exclusively
 	 * used, or we are under memory pressure, a valid RST
 	 * may not find a syncache entry.  In that case we're
 	 * done and no SYN|ACK retransmissions will happen.
 	 * Otherwise the RST was misdirected or spoofed.
 	 */
 	if (sc == NULL) {
 		if ((s = tcp_log_addrs(inc, th, NULL, NULL)))
 			log(LOG_DEBUG, "%s; %s: Spurious RST without matching "
 			    "syncache entry (possibly syncookie only), "
 			    "segment ignored\n", s, __func__);
 		TCPSTAT_INC(tcps_badrst);
 		goto done;
 	}
 
 	/* The remote UDP encaps port does not match. */
 	if (sc->sc_port != port) {
 		if ((s = tcp_log_addrs(inc, th, NULL, NULL)))
 			log(LOG_DEBUG, "%s; %s: Spurious RST with matching "
 			    "syncache entry but non-matching UDP encaps port, "
 			    "segment ignored\n", s, __func__);
 		TCPSTAT_INC(tcps_badrst);
 		goto done;
 	}
 
 	/*
 	 * If the RST bit is set, check the sequence number to see
 	 * if this is a valid reset segment.
 	 *
 	 * RFC 793 page 37:
 	 *   In all states except SYN-SENT, all reset (RST) segments
 	 *   are validated by checking their SEQ-fields.  A reset is
 	 *   valid if its sequence number is in the window.
 	 *
 	 * RFC 793 page 69:
 	 *   There are four cases for the acceptability test for an incoming
 	 *   segment:
 	 *
 	 * Segment Receive  Test
 	 * Length  Window
 	 * ------- -------  -------------------------------------------
 	 *    0       0     SEG.SEQ = RCV.NXT
 	 *    0      >0     RCV.NXT =< SEG.SEQ < RCV.NXT+RCV.WND
 	 *   >0       0     not acceptable
 	 *   >0      >0     RCV.NXT =< SEG.SEQ < RCV.NXT+RCV.WND
 	 *               or RCV.NXT =< SEG.SEQ+SEG.LEN-1 < RCV.NXT+RCV.WND
 	 *
 	 * Note that when receiving a SYN segment in the LISTEN state,
 	 * IRS is set to SEG.SEQ and RCV.NXT is set to SEG.SEQ+1, as
 	 * described in RFC 793, page 66.
 	 */
 	if ((SEQ_GEQ(th->th_seq, sc->sc_irs + 1) &&
 	    SEQ_LT(th->th_seq, sc->sc_irs + 1 + sc->sc_wnd)) ||
 	    (sc->sc_wnd == 0 && th->th_seq == sc->sc_irs + 1)) {
 		/*
 		 * In window.  The entry is dropped on an exact match with
 		 * RCV.NXT, or on any in-window value when
 		 * V_tcp_insecure_rst is set; otherwise a challenge ACK is
 		 * sent and the entry is kept.
 		 */
 		if (V_tcp_insecure_rst ||
 		    th->th_seq == sc->sc_irs + 1) {
 			syncache_drop(sc, sch);
 			if ((s = tcp_log_addrs(inc, th, NULL, NULL)))
 				log(LOG_DEBUG,
 				    "%s; %s: Our SYN|ACK was rejected, "
 				    "connection attempt aborted by remote "
 				    "endpoint\n",
 				    s, __func__);
 			TCPSTAT_INC(tcps_sc_reset);
 		} else {
 			TCPSTAT_INC(tcps_badrst);
 			/* Send challenge ACK. */
 			if ((s = tcp_log_addrs(inc, th, NULL, NULL)))
 				log(LOG_DEBUG, "%s; %s: RST with invalid "
 				    " SEQ %u != NXT %u (+WND %u), "
 				    "sending challenge ACK\n",
 				    s, __func__,
 				    th->th_seq, sc->sc_irs + 1, sc->sc_wnd);
 			syncache_respond(sc, m, TH_ACK);
 		}
 	} else {
 		/* Out of window: count the bad RST and ignore the segment. */
 		if ((s = tcp_log_addrs(inc, th, NULL, NULL)))
 			log(LOG_DEBUG, "%s; %s: RST with invalid SEQ %u != "
 			    "NXT %u (+WND %u), segment ignored\n",
 			    s, __func__,
 			    th->th_seq, sc->sc_irs + 1, sc->sc_wnd);
 		TCPSTAT_INC(tcps_badrst);
 	}
 
 done:
 	/* Common exit: release the log string, if any, and the bucket lock. */
 	if (s != NULL)
 		free(s, M_TCPLOG);
 	SCH_UNLOCK(sch);
 }
 
 /*
  * Drop the syncache entry of the given connection, provided one exists
  * and its sc_port equals port, and count it in tcps_sc_badack.
  * Nothing is done while only SYN cookies are in use.
  */
 void
 syncache_badack(struct in_conninfo *inc, uint16_t port)
 {
 	struct syncache_head *sch;
 	struct syncache *sc;
 
 	if (syncache_cookiesonly())
 		return;
 	sc = syncache_lookup(inc, &sch);	/* returns locked sch */
 	SCH_LOCK_ASSERT(sch);
 	if (sc == NULL || sc->sc_port != port) {
 		SCH_UNLOCK(sch);
 		return;
 	}
 	syncache_drop(sc, sch);
 	TCPSTAT_INC(tcps_sc_badack);
 	SCH_UNLOCK(sch);
 }
 
 /*
  * Handle an unreachable indication for a connection that may still be in
  * the syncache.  The indication is ignored unless an entry exists and both
  * port and th_seq (network byte order) match it.
  * Nothing is done while only SYN cookies are in use.
  */
 void
 syncache_unreach(struct in_conninfo *inc, tcp_seq th_seq, uint16_t port)
 {
 	struct syncache_head *sch;
 	struct syncache *sc;
 
 	if (syncache_cookiesonly())
 		return;
 	sc = syncache_lookup(inc, &sch);	/* returns locked sch */
 	SCH_LOCK_ASSERT(sch);
 
 	/*
 	 * A missing entry needs no work; a port or sequence number that
 	 * differs from sc_port / sc_iss means it's a bogus ICMP msg.
 	 */
 	if (sc != NULL && port == sc->sc_port &&
 	    ntohl(th_seq) == sc->sc_iss) {
 		/*
 		 * If we've retransmitted 3 times and this is our second
 		 * error, we remove the entry.  Otherwise, we allow it to
 		 * continue on.  This prevents us from incorrectly nuking
 		 * an entry during a spurious network outage.
 		 *
 		 * See tcp_notify().
 		 */
 		if ((sc->sc_flags & SCF_UNREACH) != 0 &&
 		    sc->sc_rxmits >= 3 + 1) {
 			syncache_drop(sc, sch);
 			TCPSTAT_INC(tcps_sc_unreach);
 		} else {
 			sc->sc_flags |= SCF_UNREACH;
 		}
 	}
 	SCH_UNLOCK(sch);
 }
 
 /*
  * Build a new TCP socket structure from a syncache entry.
  *
  * sc  - syncache entry supplying the connection identity, the initial
  *       sequence numbers and the option flags negotiated on the SYN.
  * lso - listening socket; it is cloned, and selected settings (IPv6
  *       options, IPsec policy, TF_NOPUSH/TF_NODELAY, keepalive timers and,
  *       optionally, the TCP function block) are inherited from it.
  * m   - mbuf of the segment being processed; the flowid and IPv4
  *       source-route code below tolerate m == NULL.
  *
  * On success return the newly created socket with its underlying inp locked.
  * On failure return NULL after incrementing tcps_listendrop.
  */
 static struct socket *
 syncache_socket(struct syncache *sc, struct socket *lso, struct mbuf *m)
 {
 	struct tcp_function_block *blk;
 	struct inpcb *inp = NULL;
 	struct socket *so;
 	struct tcpcb *tp;
 	int error;
 	char *s;
 
 	NET_EPOCH_ASSERT();
 
 	/*
 	 * Ok, create the full blown connection, and set things up
 	 * as they would have been set up if we had created the
 	 * connection when the SYN arrived.
 	 */
 	if ((so = solisten_clone(lso)) == NULL)
 		goto allocfail;
 #ifdef MAC
 	mac_socketpeer_set_from_mbuf(m, so);
 #endif
 	error = in_pcballoc(so, &V_tcbinfo);
 	if (error) {
 		/* in_pcballoc() failed: only the cloned socket is released. */
 		sodealloc(so);
 		goto allocfail;
 	}
 	inp = sotoinpcb(so);
 	if ((tp = tcp_newtcpcb(inp)) == NULL) {
 		in_pcbdetach(inp);
 		in_pcbfree(inp);
 		sodealloc(so);
 		goto allocfail;
 	}
 	/* Copy the address-family state of the syncache entry into the pcb. */
 	inp->inp_inc.inc_flags = sc->sc_inc.inc_flags;
 	/*
 	 * With INET6 compiled in, the TTL/TOS/laddr assignments below form the
 	 * IPv4 else-arm; without INET6 they are executed unconditionally.
 	 */
 #ifdef INET6
 	if (sc->sc_inc.inc_flags & INC_ISIPV6) {
 		inp->inp_vflag &= ~INP_IPV4;
 		inp->inp_vflag |= INP_IPV6;
 		inp->in6p_laddr = sc->sc_inc.inc6_laddr;
 	} else {
 		inp->inp_vflag &= ~INP_IPV6;
 		inp->inp_vflag |= INP_IPV4;
 #endif
 		inp->inp_ip_ttl = sc->sc_ip_ttl;
 		inp->inp_ip_tos = sc->sc_ip_tos;
 		inp->inp_laddr = sc->sc_inc.inc_laddr;
 #ifdef INET6
 	}
 #endif
 
 	/*
 	 * If there's an mbuf and it has a flowid, then let's initialise the
 	 * inp with that particular flowid.
 	 */
 	if (m != NULL && M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) {
 		inp->inp_flowid = m->m_pkthdr.flowid;
 		inp->inp_flowtype = M_HASHTYPE_GET(m);
 #ifdef NUMA
 		inp->inp_numa_domain = m->m_pkthdr.numa_domain;
 #endif
 	}
 
 	inp->inp_lport = sc->sc_inc.inc_lport;
 #ifdef INET6
 	if (inp->inp_vflag & INP_IPV6PROTO) {
 		struct inpcb *oinp = sotoinpcb(lso);
 
 		/*
 		 * Inherit socket options from the listening socket.
 		 * Note that in6p_inputopts are not (and should not be)
 		 * copied, since it stores previously received options and is
 		 * used to detect if each new option is different than the
 		 * previous one and hence should be passed to a user.
 		 * If we copied in6p_inputopts, a user would not be able to
 		 * receive options just after calling the accept system call.
 		 */
 		inp->inp_flags |= oinp->inp_flags & INP_CONTROLOPTS;
 		if (oinp->in6p_outputopts)
 			inp->in6p_outputopts =
 			    ip6_copypktopts(oinp->in6p_outputopts, M_NOWAIT);
 		inp->in6p_hops = oinp->in6p_hops;
 	}
 
 	if (sc->sc_inc.inc_flags & INC_ISIPV6) {
 		struct in6_addr laddr6;
 		struct sockaddr_in6 sin6;
 
 		/* Build the peer address from the syncache entry. */
 		sin6.sin6_family = AF_INET6;
 		sin6.sin6_len = sizeof(sin6);
 		sin6.sin6_addr = sc->sc_inc.inc6_faddr;
 		sin6.sin6_port = sc->sc_inc.inc_fport;
 		sin6.sin6_flowinfo = sin6.sin6_scope_id = 0;
 		/* Save the local address so it can be restored on failure. */
 		laddr6 = inp->in6p_laddr;
 		if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr))
 			inp->in6p_laddr = sc->sc_inc.inc6_laddr;
 		INP_HASH_WLOCK(&V_tcbinfo);
 		error = in6_pcbconnect_mbuf(inp, (struct sockaddr *)&sin6,
 		    thread0.td_ucred, m, false);
 		INP_HASH_WUNLOCK(&V_tcbinfo);
 		if (error != 0) {
 			inp->in6p_laddr = laddr6;
 			goto abort;
 		}
 		/* Override flowlabel from in6_pcbconnect. */
 		inp->inp_flow &= ~IPV6_FLOWLABEL_MASK;
 		inp->inp_flow |= sc->sc_flowlabel;
 	}
 #endif /* INET6 */
 #if defined(INET) && defined(INET6)
 	else
 #endif
 #ifdef INET
 	{
 		struct in_addr laddr;
 		struct sockaddr_in sin;
 
 		/*
 		 * Prefer a source route taken from this segment; otherwise
 		 * hand the options saved in the syncache entry over to the
 		 * inp and clear sc_ipopts.
 		 */
 		inp->inp_options = (m) ? ip_srcroute(m) : NULL;
 
 		if (inp->inp_options == NULL) {
 			inp->inp_options = sc->sc_ipopts;
 			sc->sc_ipopts = NULL;
 		}
 
 		sin.sin_family = AF_INET;
 		sin.sin_len = sizeof(sin);
 		sin.sin_addr = sc->sc_inc.inc_faddr;
 		sin.sin_port = sc->sc_inc.inc_fport;
 		bzero((caddr_t)sin.sin_zero, sizeof(sin.sin_zero));
 		/* Save the local address so it can be restored on failure. */
 		laddr = inp->inp_laddr;
 		if (inp->inp_laddr.s_addr == INADDR_ANY)
 			inp->inp_laddr = sc->sc_inc.inc_laddr;
 		INP_HASH_WLOCK(&V_tcbinfo);
 		error = in_pcbconnect(inp, (struct sockaddr *)&sin,
 		    thread0.td_ucred, false);
 		INP_HASH_WUNLOCK(&V_tcbinfo);
 		if (error != 0) {
 			inp->inp_laddr = laddr;
 			goto abort;
 		}
 	}
 #endif /* INET */
 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
 	/* Copy old policy into new socket's. */
 	if (ipsec_copy_pcbpolicy(sotoinpcb(lso), inp) != 0)
 		printf("syncache_socket: could not copy policy\n");
 #endif
 	/* Seed the tcpcb with the handshake state kept in the syncache. */
 	tp->t_state = TCPS_SYN_RECEIVED;
 	tp->iss = sc->sc_iss;
 	tp->irs = sc->sc_irs;
 	tp->t_port = sc->sc_port;
 	tcp_rcvseqinit(tp);
 	tcp_sendseqinit(tp);
 	blk = sototcpcb(lso)->t_fb;
 	if (V_functions_inherit_listen_socket_stack && blk != tp->t_fb) {
 		/*
 		 * Our parents t_fb was not the default,
 		 * we need to release our ref on tp->t_fb and
 		 * pickup one on the new entry.
 		 */
 		struct tcp_function_block *rblk;
 
 		rblk = find_and_ref_tcp_fb(blk);
 		KASSERT(rblk != NULL,
 		    ("cannot find blk %p out of syncache?", blk));
 		if (tp->t_fb->tfb_tcp_fb_fini)
 			(*tp->t_fb->tfb_tcp_fb_fini)(tp, 0);
 		refcount_release(&tp->t_fb->tfb_refcnt);
 		tp->t_fb = rblk;
 		/*
 		 * XXXrrs this is quite dangerous, it is possible
 		 * for the new function to fail to init. We also
 		 * are not asking if the handoff_is_ok though at
 		 * the very start that's probably ok.
 		 */
 		if (tp->t_fb->tfb_tcp_fb_init) {
 			(*tp->t_fb->tfb_tcp_fb_init)(tp);
 		}
 	}
 	tp->snd_wl1 = sc->sc_irs;
 	tp->snd_max = tp->iss + 1;
 	tp->snd_nxt = tp->iss + 1;
 	tp->rcv_up = sc->sc_irs + 1;
 	tp->rcv_wnd = sc->sc_wnd;
 	tp->rcv_adv += tp->rcv_wnd;
 	tp->last_ack_sent = tp->rcv_nxt;
 
 	/*
 	 * Only TF_NOPUSH and TF_NODELAY are inherited from the listener; this
 	 * is a plain assignment, so it replaces whatever t_flags held before.
 	 */
 	tp->t_flags = sototcpcb(lso)->t_flags & (TF_NOPUSH|TF_NODELAY);
 	/* Translate the syncache option flags (SCF_*) into tcpcb flags. */
 	if (sc->sc_flags & SCF_NOOPT)
 		tp->t_flags |= TF_NOOPT;
 	else {
 		if (sc->sc_flags & SCF_WINSCALE) {
 			tp->t_flags |= TF_REQ_SCALE|TF_RCVD_SCALE;
 			tp->snd_scale = sc->sc_requested_s_scale;
 			tp->request_r_scale = sc->sc_requested_r_scale;
 		}
 		if (sc->sc_flags & SCF_TIMESTAMP) {
 			tp->t_flags |= TF_REQ_TSTMP|TF_RCVD_TSTMP;
 			tp->ts_recent = sc->sc_tsreflect;
 			tp->ts_recent_age = tcp_ts_getticks();
 			tp->ts_offset = sc->sc_tsoff;
 		}
 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
 		if (sc->sc_flags & SCF_SIGNATURE)
 			tp->t_flags |= TF_SIGNATURE;
 #endif
 		if (sc->sc_flags & SCF_SACK)
 			tp->t_flags |= TF_SACK_PERMIT;
 	}
 
 	tcp_ecn_syncache_socket(tp, sc);
 
 	/*
 	 * Set up MSS and get cached values from tcp_hostcache.
 	 * This might overwrite some of the defaults we just set.
 	 */
 	tcp_mss(tp, sc->sc_peer_mss);
 
 	/*
 	 * If the SYN,ACK was retransmitted, indicate that CWND to be
 	 * limited to one segment in cc_conn_init().
 	 * NB: sc_rxmits counts all SYN,ACK transmits, not just retransmits.
 	 */
 	if (sc->sc_rxmits > 1)
 		tp->snd_cwnd = 1;
 
 #ifdef TCP_OFFLOAD
 	/*
 	 * Allow a TOE driver to install its hooks.  Note that we hold the
 	 * pcbinfo lock too and that prevents tcp_usr_accept from accepting a
 	 * new connection before the TOE driver has done its thing.
 	 */
 	if (ADDED_BY_TOE(sc)) {
 		struct toedev *tod = sc->sc_tod;
 
 		tod->tod_offload_socket(tod, sc->sc_todctx, so);
 	}
 #endif
 	/*
 	 * Copy and activate timers.
 	 */
 	tp->t_maxunacktime = sototcpcb(lso)->t_maxunacktime;
 	tp->t_keepinit = sototcpcb(lso)->t_keepinit;
 	tp->t_keepidle = sototcpcb(lso)->t_keepidle;
 	tp->t_keepintvl = sototcpcb(lso)->t_keepintvl;
 	tp->t_keepcnt = sototcpcb(lso)->t_keepcnt;
 	tcp_timer_activate(tp, TT_KEEP, TP_KEEPINIT(tp));
 
 	TCPSTAT_INC(tcps_accepts);
 	TCP_PROBE6(state__change, NULL, tp, NULL, tp, NULL, TCPS_LISTEN);
 
 	/*
 	 * A false return from solisten_enqueue() is recorded in the tcpcb.
 	 * NOTE(review): presumably this means the socket could not be marked
 	 * connected at enqueue time - confirm against solisten_enqueue().
 	 */
 	if (!solisten_enqueue(so, SS_ISCONNECTED))
 		tp->t_flags |= TF_SONOTCONN;
 
 	return (so);
 
 allocfail:
 	/*
 	 * Drop the connection; we will either send a RST or have the peer
 	 * retransmit its SYN again after its RTO and try again.
 	 */
 	if ((s = tcp_log_addrs(&sc->sc_inc, NULL, NULL, NULL))) {
 		log(LOG_DEBUG, "%s; %s: Socket create failed "
 		    "due to limits or memory shortage\n",
 		    s, __func__);
 		free(s, M_TCPLOG);
 	}
 	TCPSTAT_INC(tcps_listendrop);
 	return (NULL);
 
 abort:
 	/*
 	 * in_pcbconnect()/in6_pcbconnect_mbuf() failed: tear down the pcb and
 	 * the socket created above and report the error code.
 	 */
 	in_pcbdetach(inp);
 	in_pcbfree(inp);
 	sodealloc(so);
 	if ((s = tcp_log_addrs(&sc->sc_inc, NULL, NULL, NULL))) {
 		log(LOG_DEBUG, "%s; %s: in%s_pcbconnect failed with error %i\n",
 		    s, __func__, (sc->sc_inc.inc_flags & INC_ISIPV6) ? "6" : "",
 		    error);
 		free(s, M_TCPLOG);
 	}
 	TCPSTAT_INC(tcps_listendrop);
 	return (NULL);
 }
 
 /*
  * This function gets called when we receive an ACK for a
  * socket in the LISTEN state.  We look up the connection
  * in the syncache, and if its there, we pull it out of
  * the cache and turn it into a full-blown connection in
  * the SYN-RECEIVED state.
  *
  * On syncache_socket() success the newly created socket
  * has its underlying inp locked.
  *
  * Return values:
  *   1  - the ACK matched a syncache entry or a valid syncookie and passed
  *        sequence validation; *lsop is replaced by the result of
  *        syncache_socket(), which is NULL if socket creation failed.
  *   0  - the segment was rejected.  Via the failed label *lsop is set to
  *        NULL; the sc_port mismatch case returns 0 without touching *lsop.
  *   -1 - the segment is dropped and, per the inline comments, no RST is
  *        to be sent (signature or timestamp check failures).
  */
 int
 syncache_expand(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th,
     struct socket **lsop, struct mbuf *m, uint16_t port)
 {
 	struct syncache *sc;
 	struct syncache_head *sch;
 	struct syncache scs;
 	char *s;
 	bool locked;
 
 	NET_EPOCH_ASSERT();
 	KASSERT((tcp_get_flags(th) & (TH_RST|TH_ACK|TH_SYN)) == TH_ACK,
 	    ("%s: can handle only ACK", __func__));
 
 	/*
 	 * In cookies-only mode no lookup is done and the bucket stays
 	 * unlocked; sch is still passed to syncookie_lookup() below.
 	 */
 	if (syncache_cookiesonly()) {
 		sc = NULL;
 		sch = syncache_hashbucket(inc);
 		locked = false;
 	} else {
 		sc = syncache_lookup(inc, &sch);	/* returns locked sch */
 		locked = true;
 		SCH_LOCK_ASSERT(sch);
 	}
 
 #ifdef INVARIANTS
 	/*
 	 * Test code for syncookies comparing the syncache stored
 	 * values with the reconstructed values from the cookie.
 	 */
 	if (sc != NULL)
 		syncookie_cmp(inc, sch, sc, th, to, *lsop, port);
 #endif
 
 	if (sc == NULL) {
 		/*
 		 * There is no syncache entry, so see if this ACK is
 		 * a returning syncookie.  To do this, first:
 		 *  A. Check if syncookies are used in case of syncache
 		 *     overflows
 		 *  B. See if this socket has had a syncache entry dropped in
 		 *     the recent past. We don't want to accept a bogus
 		 *     syncookie if we've never received a SYN or accept it
 		 *     twice.
 		 *  C. check that the syncookie is valid.  If it is, then
 		 *     cobble up a fake syncache entry, and return.
 		 */
 		if (locked && !V_tcp_syncookies) {
 			SCH_UNLOCK(sch);
 			if ((s = tcp_log_addrs(inc, th, NULL, NULL)))
 				log(LOG_DEBUG, "%s; %s: Spurious ACK, "
 				    "segment rejected (syncookies disabled)\n",
 				    s, __func__);
 			goto failed;
 		}
 		if (locked && !V_tcp_syncookiesonly &&
 		    sch->sch_last_overflow < time_uptime - SYNCOOKIE_LIFETIME) {
 			SCH_UNLOCK(sch);
 			if ((s = tcp_log_addrs(inc, th, NULL, NULL)))
 				log(LOG_DEBUG, "%s; %s: Spurious ACK, "
 				    "segment rejected (no syncache entry)\n",
 				    s, __func__);
 			goto failed;
 		}
 		/* scs is the on-stack entry the cookie is decoded into. */
 		bzero(&scs, sizeof(scs));
 		sc = syncookie_lookup(inc, sch, &scs, th, to, *lsop, port);
 		if (locked)
 			SCH_UNLOCK(sch);
 		if (sc == NULL) {
 			if ((s = tcp_log_addrs(inc, th, NULL, NULL)))
 				log(LOG_DEBUG, "%s; %s: Segment failed "
 				    "SYNCOOKIE authentication, segment rejected "
 				    "(probably spoofed)\n", s, __func__);
 			goto failed;
 		}
 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
 		/* If received ACK has MD5 signature, check it. */
 		if ((to->to_flags & TOF_SIGNATURE) != 0 &&
 		    (!TCPMD5_ENABLED() ||
 		    TCPMD5_INPUT(m, th, to->to_signature) != 0)) {
 			/* Drop the ACK. */
 			if ((s = tcp_log_addrs(inc, th, NULL, NULL))) {
 				log(LOG_DEBUG, "%s; %s: Segment rejected, "
 				    "MD5 signature doesn't match.\n",
 				    s, __func__);
 				free(s, M_TCPLOG);
 			}
 			TCPSTAT_INC(tcps_sig_err_sigopt);
 			return (-1); /* Do not send RST */
 		}
 #endif /* TCP_SIGNATURE */
 		/*
 		 * Account for the cookie-derived connection; the failed label
 		 * undoes this if sequence validation rejects the segment.
 		 */
 		TCPSTATES_INC(TCPS_SYN_RECEIVED);
 	} else {
 		/* The entry stays in the cache on a tunneling port mismatch. */
 		if (sc->sc_port != port) {
 			SCH_UNLOCK(sch);
 			return (0);
 		}
 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
 		/*
 		 * If listening socket requested TCP digests, check that
 		 * received ACK has signature and it is correct.
 		 * If not, drop the ACK and leave sc entry in the cache,
 		 * because SYN was received with correct signature.
 		 */
 		if (sc->sc_flags & SCF_SIGNATURE) {
 			if ((to->to_flags & TOF_SIGNATURE) == 0) {
 				/* No signature */
 				TCPSTAT_INC(tcps_sig_err_nosigopt);
 				SCH_UNLOCK(sch);
 				if ((s = tcp_log_addrs(inc, th, NULL, NULL))) {
 					log(LOG_DEBUG, "%s; %s: Segment "
 					    "rejected, MD5 signature wasn't "
 					    "provided.\n", s, __func__);
 					free(s, M_TCPLOG);
 				}
 				return (-1); /* Do not send RST */
 			}
 			if (!TCPMD5_ENABLED() ||
 			    TCPMD5_INPUT(m, th, to->to_signature) != 0) {
 				/* Doesn't match or no SA */
 				SCH_UNLOCK(sch);
 				if ((s = tcp_log_addrs(inc, th, NULL, NULL))) {
 					log(LOG_DEBUG, "%s; %s: Segment "
 					    "rejected, MD5 signature doesn't "
 					    "match.\n", s, __func__);
 					free(s, M_TCPLOG);
 				}
 				return (-1); /* Do not send RST */
 			}
 		}
 #endif /* TCP_SIGNATURE */
 
 		/*
 		 * RFC 7323 PAWS: If we have a timestamp on this segment and
 		 * it's less than ts_recent, drop it.
 		 * XXXMT: RFC 7323 also requires to send an ACK.
 		 *        In tcp_input.c this is only done for TCP segments
 		 *        with user data, so be consistent here and just drop
 		 *        the segment.
 		 */
 		if (sc->sc_flags & SCF_TIMESTAMP && to->to_flags & TOF_TS &&
 		    TSTMP_LT(to->to_tsval, sc->sc_tsreflect)) {
 			SCH_UNLOCK(sch);
 			if ((s = tcp_log_addrs(inc, th, NULL, NULL))) {
 				log(LOG_DEBUG,
 				    "%s; %s: SEG.TSval %u < TS.Recent %u, "
 				    "segment dropped\n", s, __func__,
 				    to->to_tsval, sc->sc_tsreflect);
 				free(s, M_TCPLOG);
 			}
 			return (-1);  /* Do not send RST */
 		}
 
 		/*
 		 * If timestamps were not negotiated during SYN/ACK and a
 		 * segment with a timestamp is received, ignore the
 		 * timestamp and process the packet normally.
 		 * See section 3.2 of RFC 7323.
 		 */
 		if (!(sc->sc_flags & SCF_TIMESTAMP) &&
 		    (to->to_flags & TOF_TS)) {
 			if ((s = tcp_log_addrs(inc, th, NULL, NULL))) {
 				log(LOG_DEBUG, "%s; %s: Timestamp not "
 				    "expected, segment processed normally\n",
 				    s, __func__);
 				free(s, M_TCPLOG);
 				s = NULL;
 			}
 		}
 
 		/*
 		 * If timestamps were negotiated during SYN/ACK and a
 		 * segment without a timestamp is received, silently drop
 		 * the segment, unless the missing timestamps are tolerated.
 		 * See section 3.2 of RFC 7323.
 		 */
 		if ((sc->sc_flags & SCF_TIMESTAMP) &&
 		    !(to->to_flags & TOF_TS)) {
 			if (V_tcp_tolerate_missing_ts) {
 				if ((s = tcp_log_addrs(inc, th, NULL, NULL))) {
 					log(LOG_DEBUG,
 					    "%s; %s: Timestamp missing, "
 					    "segment processed normally\n",
 					    s, __func__);
 					free(s, M_TCPLOG);
 				}
 			} else {
 				SCH_UNLOCK(sch);
 				if ((s = tcp_log_addrs(inc, th, NULL, NULL))) {
 					log(LOG_DEBUG,
 					    "%s; %s: Timestamp missing, "
 					    "segment silently dropped\n",
 					    s, __func__);
 					free(s, M_TCPLOG);
 				}
 				return (-1);  /* Do not send RST */
 			}
 		}
 		/*
 		 * Option checks passed: unlink the entry from its bucket while
 		 * the bucket lock is still held.
 		 */
 		TAILQ_REMOVE(&sch->sch_bucket, sc, sc_hash);
 		sch->sch_length--;
 #ifdef TCP_OFFLOAD
 		if (ADDED_BY_TOE(sc)) {
 			struct toedev *tod = sc->sc_tod;
 
 			tod->tod_syncache_removed(tod, sc->sc_todctx);
 		}
 #endif
 		SCH_UNLOCK(sch);
 	}
 
 	/*
 	 * Segment validation:
 	 * ACK must match our initial sequence number + 1 (the SYN|ACK).
 	 *
 	 * NOTE(review): the message labels its second value "ISS+1" but the
 	 * argument passed is sc->sc_iss; likewise "IRS+1" below is given
 	 * sc->sc_irs.  Confirm whether the logged values should be + 1.
 	 */
 	if (th->th_ack != sc->sc_iss + 1) {
 		if ((s = tcp_log_addrs(inc, th, NULL, NULL)))
 			log(LOG_DEBUG, "%s; %s: ACK %u != ISS+1 %u, segment "
 			    "rejected\n", s, __func__, th->th_ack, sc->sc_iss);
 		goto failed;
 	}
 
 	/*
 	 * The SEQ must fall in the window starting at the received
 	 * initial receive sequence number + 1 (the SYN).
 	 */
 	if (SEQ_LEQ(th->th_seq, sc->sc_irs) ||
 	    SEQ_GT(th->th_seq, sc->sc_irs + sc->sc_wnd)) {
 		if ((s = tcp_log_addrs(inc, th, NULL, NULL)))
 			log(LOG_DEBUG, "%s; %s: SEQ %u != IRS+1 %u, segment "
 			    "rejected\n", s, __func__, th->th_seq, sc->sc_irs);
 		goto failed;
 	}
 
 	*lsop = syncache_socket(sc, *lsop, m);
 
 	if (*lsop == NULL)
 		TCPSTAT_INC(tcps_sc_aborted);
 	else
 		TCPSTAT_INC(tcps_sc_completed);
 
 /* how do we find the inp for the new socket? */
 	/* scs lives on this stack frame; only cache entries are freed. */
 	if (sc != &scs)
 		syncache_free(sc);
 	return (1);
 failed:
 	/*
 	 * Every goto failed above is immediately preceded by an assignment
 	 * to s from tcp_log_addrs(), so s is initialized when tested below.
 	 */
 	if (sc != NULL) {
 		TCPSTATES_DEC(TCPS_SYN_RECEIVED);
 		if (sc != &scs)
 			syncache_free(sc);
 	}
 	if (s != NULL)
 		free(s, M_TCPLOG);
 	*lsop = NULL;
 	return (0);
 }
 
 /*
  * Expand a syncache entry straight into a socket for a SYN that carried a
  * valid TCP Fast Open cookie.
  *
  * The listener's TFO pending-counter pointer is sampled before the socket
  * is built.  When syncache_socket() fails, the counter is decremented and
  * NULL is returned.  Otherwise the new socket is marked connected, its tcpcb
  * is tagged TF_FASTOPEN, given response_cookie and the counter pointer, and
  * snd_max/snd_nxt are reset to iss before the socket is returned.
  */
 static struct socket *
 syncache_tfo_expand(struct syncache *sc, struct socket *lso, struct mbuf *m,
     uint64_t response_cookie)
 {
 	unsigned int *tfo_counter;
 	struct socket *nso;
 	struct tcpcb *ntp;
 
 	NET_EPOCH_ASSERT();
 
 	/* Sample the listener's counter pointer before creating the socket. */
 	tfo_counter = intotcpcb(sotoinpcb(lso))->t_tfo_pending;
 
 	nso = syncache_socket(sc, lso, m);
 	if (nso == NULL) {
 		/* No socket was created: this attempt is no longer pending. */
 		TCPSTAT_INC(tcps_sc_aborted);
 		atomic_subtract_int(tfo_counter, 1);
 		return (NULL);
 	}
 
 	soisconnected(nso);
 	ntp = intotcpcb(sotoinpcb(nso));
 	ntp->t_flags |= TF_FASTOPEN;
 	ntp->t_tfo_cookie.server = response_cookie;
 	/* syncache_socket() left these at iss + 1; reset them to iss. */
 	ntp->snd_nxt = ntp->iss;
 	ntp->snd_max = ntp->iss;
 	ntp->t_tfo_pending = tfo_counter;
 	TCPSTAT_INC(tcps_sc_completed);
 	TCPSTATES_INC(TCPS_SYN_RECEIVED);
 
 	return (nso);
 }
 
 /*
  * Given a LISTEN socket and an inbound SYN request, add
  * this to the syn cache, and send back a segment:
  *	<SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK>
  * to the source.
  *
  * IMPORTANT NOTE: We do _NOT_ ACK data that might accompany the SYN.
  * Doing so would require that we hold onto the data and deliver it
  * to the application.  However, if we are the target of a SYN-flood
  * DoS attack, an attacker could send data which would eventually
  * consume all available buffer space if it were ACKed.  By not ACKing
  * the data, we avoid this DoS scenario.
  *
  * The exception to the above is when a SYN with a valid TCP Fast Open (TFO)
  * cookie is processed and a new socket is created.  In this case, any data
  * accompanying the SYN will be queued to the socket by tcp_input() and will
  * be ACKed either when the application sends response data or the delayed
  * ACK timer expires, whichever comes first.
  *
  * The listening inp must be read-locked on entry (asserted below).  It is
  * unlocked here unless the SYN carried a valid TFO cookie, in which case
  * the inline comment at the expansion site says the caller unlocks it.
  *
  * Returns the socket produced by syncache_tfo_expand() when a valid TFO
  * cookie was presented, otherwise NULL.  The mbuf m is freed on every path
  * that passes through done/donenoprobe; the TFO expansion path jumps past
  * that and leaves m alone.
  */
 struct socket *
 syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th,
     struct inpcb *inp, struct socket *so, struct mbuf *m, void *tod,
     void *todctx, uint8_t iptos, uint16_t port)
 {
 	struct tcpcb *tp;
 	struct socket *rv = NULL;
 	struct syncache *sc = NULL;
 	struct syncache_head *sch;
 	struct mbuf *ipopts = NULL;
 	u_int ltflags;
 	int win, ip_ttl, ip_tos;
 	char *s;
 #ifdef INET6
 	int autoflowlabel = 0;
 #endif
 #ifdef MAC
 	struct label *maclabel;
 #endif
 	struct syncache scs;
 	struct ucred *cred;
 	uint64_t tfo_response_cookie;
 	unsigned int *tfo_pending = NULL;
 	int tfo_cookie_valid = 0;
 	int tfo_response_cookie_valid = 0;
 	bool locked;
 
 	INP_RLOCK_ASSERT(inp);			/* listen socket */
 	KASSERT((tcp_get_flags(th) & (TH_RST|TH_ACK|TH_SYN)) == TH_SYN,
 	    ("%s: unexpected tcp flags", __func__));
 
 	/*
 	 * Combine all so/tp operations very early to drop the INP lock as
 	 * soon as possible.
 	 */
 	KASSERT(SOLISTENING(so), ("%s: %p not listening", __func__, so));
 	tp = sototcpcb(so);
 	cred = V_tcp_syncache.see_other ? NULL : crhold(so->so_cred);
 
 	/* Snapshot the TTL/hop limit and TOS/traffic class of the listener. */
 #ifdef INET6
 	if (inc->inc_flags & INC_ISIPV6) {
 		if (inp->inp_flags & IN6P_AUTOFLOWLABEL) {
 			autoflowlabel = 1;
 		}
 		ip_ttl = in6_selecthlim(inp, NULL);
 		if ((inp->in6p_outputopts == NULL) ||
 		    (inp->in6p_outputopts->ip6po_tclass == -1)) {
 			ip_tos = 0;
 		} else {
 			ip_tos = inp->in6p_outputopts->ip6po_tclass;
 		}
 	}
 #endif
 #if defined(INET6) && defined(INET)
 	else
 #endif
 #ifdef INET
 	{
 		ip_ttl = inp->inp_ip_ttl;
 		ip_tos = inp->inp_ip_tos;
 	}
 #endif
 	win = so->sol_sbrcv_hiwat;
 	ltflags = (tp->t_flags & (TF_NOOPT | TF_SIGNATURE));
 
 	if (V_tcp_fastopen_server_enable && IS_FASTOPEN(tp->t_flags) &&
 	    (tp->t_tfo_pending != NULL) &&
 	    (to->to_flags & TOF_FASTOPEN)) {
 		/*
 		 * Limit the number of pending TFO connections to
 		 * approximately half of the queue limit.  This prevents TFO
 		 * SYN floods from starving the service by filling the
 		 * listen queue with bogus TFO connections.
 		 */
 		if (atomic_fetchadd_int(tp->t_tfo_pending, 1) <=
 		    (so->sol_qlimit / 2)) {
 			int result;
 
 			/*
 			 * result > 0 marks the presented cookie as valid;
 			 * result >= 0 marks tfo_response_cookie as usable in
 			 * the SYN|ACK.
 			 */
 			result = tcp_fastopen_check_cookie(inc,
 			    to->to_tfo_cookie, to->to_tfo_len,
 			    &tfo_response_cookie);
 			tfo_cookie_valid = (result > 0);
 			tfo_response_cookie_valid = (result >= 0);
 		}
 
 		/*
 		 * Remember the TFO pending counter as it will have to be
 		 * decremented below if we don't make it to syncache_tfo_expand().
 		 */
 		tfo_pending = tp->t_tfo_pending;
 	}
 
 #ifdef MAC
 	if (mac_syncache_init(&maclabel) != 0) {
 		INP_RUNLOCK(inp);
 		goto done;
 	} else
 		mac_syncache_create(maclabel, inp);
 #endif
 	/* With a valid TFO cookie the inp stays locked past this point. */
 	if (!tfo_cookie_valid)
 		INP_RUNLOCK(inp);
 
 	/*
 	 * Remember the IP options, if any.
 	 */
 #ifdef INET6
 	if (!(inc->inc_flags & INC_ISIPV6))
 #endif
 #ifdef INET
 		ipopts = (m) ? ip_srcroute(m) : NULL;
 #else
 		ipopts = NULL;
 #endif
 
 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
 	/*
 	 * When the socket is TCP-MD5 enabled check that,
 	 *  - a signed packet is valid
 	 *  - a non-signed packet does not have a security association
 	 *
 	 *  If a signed packet fails validation or a non-signed packet has a
 	 *  security association, the packet will be dropped.
 	 *
 	 * NOTE(review): the goto done paths below run after ipopts may have
 	 * been set above and, when tfo_cookie_valid is set, while inp is
 	 * still locked.  Nothing on the done path frees ipopts or unlocks
 	 * inp - confirm whether that is a leak or is handled elsewhere.
 	 */
 	if (ltflags & TF_SIGNATURE) {
 		if (to->to_flags & TOF_SIGNATURE) {
 			if (!TCPMD5_ENABLED() ||
 			    TCPMD5_INPUT(m, th, to->to_signature) != 0)
 				goto done;
 		} else {
 			if (TCPMD5_ENABLED() &&
 			    TCPMD5_INPUT(m, NULL, NULL) != ENOENT)
 				goto done;
 		}
 	} else if (to->to_flags & TOF_SIGNATURE)
 		goto done;
 #endif	/* TCP_SIGNATURE */
 	/*
 	 * See if we already have an entry for this connection.
 	 * If we do, resend the SYN,ACK, and reset the retransmit timer.
 	 *
 	 * XXX: should the syncache be re-initialized with the contents
 	 * of the new SYN here (which may have different options?)
 	 *
 	 * XXX: We do not check the sequence number to see if this is a
 	 * real retransmit or a new connection attempt.  The question is
 	 * how to handle such a case; either ignore it as spoofed, or
 	 * drop the current entry and create a new one?
 	 */
 	if (syncache_cookiesonly()) {
 		sc = NULL;
 		sch = syncache_hashbucket(inc);
 		locked = false;
 	} else {
 		sc = syncache_lookup(inc, &sch);	/* returns locked sch */
 		locked = true;
 		SCH_LOCK_ASSERT(sch);
 	}
 	if (sc != NULL) {
 		/* Duplicate SYN for an existing entry: no TFO expansion. */
 		if (tfo_cookie_valid)
 			INP_RUNLOCK(inp);
 		TCPSTAT_INC(tcps_sc_dupsyn);
 		if (ipopts) {
 			/*
 			 * If we were remembering a previous source route,
 			 * forget it and use the new one we've been given.
 			 */
 			if (sc->sc_ipopts)
 				(void) m_free(sc->sc_ipopts);
 			sc->sc_ipopts = ipopts;
 		}
 		/*
 		 * Update timestamp if present.
 		 */
 		if ((sc->sc_flags & SCF_TIMESTAMP) && (to->to_flags & TOF_TS))
 			sc->sc_tsreflect = to->to_tsval;
 		else
 			sc->sc_flags &= ~SCF_TIMESTAMP;
 		/*
 		 * Disable ECN if needed.
 		 */
 		if ((sc->sc_flags & SCF_ECN) &&
 		    ((tcp_get_flags(th) & (TH_ECE|TH_CWR)) != (TH_ECE|TH_CWR))) {
 			sc->sc_flags &= ~SCF_ECN;
 		}
 #ifdef MAC
 		/*
 		 * Since we have already unconditionally allocated label
 		 * storage, free it up.  The syncache entry will already
 		 * have an initialized label we can use.
 		 */
 		mac_syncache_destroy(&maclabel);
 #endif
 		TCP_PROBE5(receive, NULL, NULL, m, NULL, th);
 		/* Retransmit SYN|ACK and reset retransmit count. */
 		if ((s = tcp_log_addrs(&sc->sc_inc, th, NULL, NULL))) {
 			log(LOG_DEBUG, "%s; %s: Received duplicate SYN, "
 			    "resetting timer and retransmitting SYN|ACK\n",
 			    s, __func__);
 			free(s, M_TCPLOG);
 		}
 		if (syncache_respond(sc, m, TH_SYN|TH_ACK) == 0) {
 			sc->sc_rxmits = 0;
 			syncache_timeout(sc, sch, 1);
 			TCPSTAT_INC(tcps_sndacks);
 			TCPSTAT_INC(tcps_sndtotal);
 		}
 		SCH_UNLOCK(sch);
 		goto donenoprobe;
 	}
 
 	/*
 	 * A valid TFO cookie is expanded into a socket right away, so the
 	 * on-stack entry is used and no cache entry is allocated.
 	 */
 	if (tfo_cookie_valid) {
 		bzero(&scs, sizeof(scs));
 		sc = &scs;
 		goto skip_alloc;
 	}
 
 	/*
 	 * Skip allocating a syncache entry if we are just going to discard
 	 * it later.
 	 */
 	if (!locked) {
 		bzero(&scs, sizeof(scs));
 		sc = &scs;
 	} else
 		sc = uma_zalloc(V_tcp_syncache.zone, M_NOWAIT | M_ZERO);
 	if (sc == NULL) {
 		/*
 		 * The zone allocator couldn't provide more entries.
 		 * Treat this as if the cache was full; drop the oldest
 		 * entry and insert the new one.
 		 */
 		TCPSTAT_INC(tcps_sc_zonefail);
 		if ((sc = TAILQ_LAST(&sch->sch_bucket, sch_head)) != NULL) {
 			sch->sch_last_overflow = time_uptime;
 			syncache_drop(sc, sch);
 			syncache_pause(inc);
 		}
 		sc = uma_zalloc(V_tcp_syncache.zone, M_NOWAIT | M_ZERO);
 		if (sc == NULL) {
 			if (V_tcp_syncookies) {
 				/* Fall back to a cookie-only reply. */
 				bzero(&scs, sizeof(scs));
 				sc = &scs;
 			} else {
 				KASSERT(locked,
 				    ("%s: bucket unexpectedly unlocked",
 				    __func__));
 				SCH_UNLOCK(sch);
 				if (ipopts)
 					(void) m_free(ipopts);
 				goto done;
 			}
 		}
 	}
 
 skip_alloc:
 	/*
 	 * sc_tfo_cookie points at a local of this function;
 	 * syncache_respond() resets it to NULL when it emits the option.
 	 */
 	if (!tfo_cookie_valid && tfo_response_cookie_valid)
 		sc->sc_tfo_cookie = &tfo_response_cookie;
 
 	/*
 	 * Fill in the syncache values.
 	 */
 #ifdef MAC
 	sc->sc_label = maclabel;
 #endif
 	/* The credential reference now belongs to the entry. */
 	sc->sc_cred = cred;
 	sc->sc_port = port;
 	cred = NULL;
 	sc->sc_ipopts = ipopts;
 	bcopy(inc, &sc->sc_inc, sizeof(struct in_conninfo));
 	sc->sc_ip_tos = ip_tos;
 	sc->sc_ip_ttl = ip_ttl;
 #ifdef TCP_OFFLOAD
 	sc->sc_tod = tod;
 	sc->sc_todctx = todctx;
 #endif
 	sc->sc_irs = th->th_seq;
 	sc->sc_flags = 0;
 	sc->sc_flowlabel = 0;
 
 	/*
 	 * Initial receive window: clip sbspace to [0 .. TCP_MAXWIN].
 	 * win was derived from socket earlier in the function.
 	 */
 	win = imax(win, 0);
 	win = imin(win, TCP_MAXWIN);
 	sc->sc_wnd = win;
 
 	/*
 	 * V_tcp_do_rfc1323 == 2 suppresses the timestamp option and == 3
 	 * suppresses window scaling; any other non-zero value allows both.
 	 */
 	if (V_tcp_do_rfc1323 &&
 	    !(ltflags & TF_NOOPT)) {
 		/*
 		 * A timestamp received in a SYN makes
 		 * it ok to send timestamp requests and replies.
 		 */
-		if (to->to_flags & TOF_TS) {
+		if ((to->to_flags & TOF_TS) && (V_tcp_do_rfc1323 != 2)) {
 			sc->sc_tsreflect = to->to_tsval;
 			sc->sc_flags |= SCF_TIMESTAMP;
 			sc->sc_tsoff = tcp_new_ts_offset(inc);
 		}
-		if (to->to_flags & TOF_SCALE) {
+		if ((to->to_flags & TOF_SCALE) && (V_tcp_do_rfc1323 != 3)) {
 			int wscale = 0;
 
 			/*
 			 * Pick the smallest possible scaling factor that
 			 * will still allow us to scale up to sb_max, aka
 			 * kern.ipc.maxsockbuf.
 			 *
 			 * We do this because there are broken firewalls that
 			 * will corrupt the window scale option, leading to
 			 * the other endpoint believing that our advertised
 			 * window is unscaled.  At scale factors larger than
 			 * 5 the unscaled window will drop below 1500 bytes,
 			 * leading to serious problems when traversing these
 			 * broken firewalls.
 			 *
 			 * With the default maxsockbuf of 256K, a scale factor
 			 * of 3 will be chosen by this algorithm.  Those who
 			 * choose a larger maxsockbuf should watch out
 			 * for the compatibility problems mentioned above.
 			 *
 			 * RFC1323: The Window field in a SYN (i.e., a <SYN>
 			 * or <SYN,ACK>) segment itself is never scaled.
 			 */
 			while (wscale < TCP_MAX_WINSHIFT &&
 			    (TCP_MAXWIN << wscale) < sb_max)
 				wscale++;
 			sc->sc_requested_r_scale = wscale;
 			sc->sc_requested_s_scale = to->to_wscale;
 			sc->sc_flags |= SCF_WINSCALE;
 		}
 	}
 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
 	/*
 	 * If incoming packet has an MD5 signature, flag this in the
 	 * syncache so that syncache_respond() will do the right thing
 	 * with the SYN+ACK.
 	 */
 	if (to->to_flags & TOF_SIGNATURE)
 		sc->sc_flags |= SCF_SIGNATURE;
 #endif	/* TCP_SIGNATURE */
 	if (to->to_flags & TOF_SACKPERM)
 		sc->sc_flags |= SCF_SACK;
 	if (to->to_flags & TOF_MSS)
 		sc->sc_peer_mss = to->to_mss;	/* peer mss may be zero */
 	if (ltflags & TF_NOOPT)
 		sc->sc_flags |= SCF_NOOPT;
 	/* ECN Handshake */
 	if (V_tcp_do_ecn)
 		sc->sc_flags |= tcp_ecn_syncache_add(tcp_get_flags(th), iptos);
 
 	/* With syncookies enabled the ISS itself is the cookie. */
 	if (V_tcp_syncookies)
 		sc->sc_iss = syncookie_generate(sch, sc);
 	else
 		sc->sc_iss = arc4random();
 #ifdef INET6
 	if (autoflowlabel) {
 		if (V_tcp_syncookies)
 			sc->sc_flowlabel = sc->sc_iss;
 		else
 			sc->sc_flowlabel = ip6_randomflowlabel();
 		sc->sc_flowlabel = htonl(sc->sc_flowlabel) & IPV6_FLOWLABEL_MASK;
 	}
 #endif
 	if (locked)
 		SCH_UNLOCK(sch);
 
 	if (tfo_cookie_valid) {
 		rv = syncache_tfo_expand(sc, so, m, tfo_response_cookie);
 		/* INP_RUNLOCK(inp) will be performed by the caller */
 		goto tfo_expanded;
 	}
 
 	TCP_PROBE5(receive, NULL, NULL, m, NULL, th);
 	/*
 	 * Do a standard 3-way handshake.
 	 */
 	if (syncache_respond(sc, m, TH_SYN|TH_ACK) == 0) {
 		/*
 		 * A heap entry is cached unless syncookies-only mode is in
 		 * effect; the on-stack entry (sc == &scs) is never inserted.
 		 */
 		if (V_tcp_syncookies && V_tcp_syncookiesonly && sc != &scs)
 			syncache_free(sc);
 		else if (sc != &scs)
 			syncache_insert(sc, sch);   /* locks and unlocks sch */
 		TCPSTAT_INC(tcps_sndacks);
 		TCPSTAT_INC(tcps_sndtotal);
 	} else {
 		if (sc != &scs)
 			syncache_free(sc);
 		TCPSTAT_INC(tcps_sc_dropped);
 	}
 	goto donenoprobe;
 
 done:
 	TCP_PROBE5(receive, NULL, NULL, m, NULL, th);
 donenoprobe:
 	if (m)
 		m_freem(m);
 	/*
 	 * If tfo_pending is not NULL here, then a TFO SYN that did not
 	 * result in a new socket was processed and the associated pending
 	 * counter has not yet been decremented.  All such TFO processing paths
 	 * transit this point.
 	 */
 	if (tfo_pending != NULL)
 		tcp_fastopen_decrement_counter(tfo_pending);
 
 tfo_expanded:
 	/* cred is non-NULL only if it was never handed to an entry. */
 	if (cred != NULL)
 		crfree(cred);
 #ifdef MAC
 	/*
 	 * NOTE(review): the label is destroyed only for the on-stack entry.
 	 * Paths that arrive with sc == NULL after mac_syncache_create() (the
 	 * goto done cases) do not destroy it - confirm whether this leaks.
 	 */
 	if (sc == &scs)
 		mac_syncache_destroy(&maclabel);
 #endif
 	return (rv);
 }
 
 /*
  * Send SYN|ACK or ACK to the peer.  Either in response to a peer's segment,
  * i.e. m0 != NULL, or upon 3WHS ACK timeout, i.e. m0 == NULL.
  */
 static int
 syncache_respond(struct syncache *sc, const struct mbuf *m0, int flags)
 {
 	struct ip *ip = NULL;
 	struct mbuf *m;
 	struct tcphdr *th = NULL;
 	struct udphdr *udp = NULL;
 	int optlen, error = 0;	/* Make compiler happy */
 	u_int16_t hlen, tlen, mssopt, ulen;
 	struct tcpopt to;
 #ifdef INET6
 	struct ip6_hdr *ip6 = NULL;
 #endif
 
 	NET_EPOCH_ASSERT();
 
 	hlen =
 #ifdef INET6
 	       (sc->sc_inc.inc_flags & INC_ISIPV6) ? sizeof(struct ip6_hdr) :
 #endif
 		sizeof(struct ip);
 	tlen = hlen + sizeof(struct tcphdr);
 	if (sc->sc_port) {
 		tlen += sizeof(struct udphdr);
 	}
 	/* Determine MSS we advertize to other end of connection. */
 	mssopt = tcp_mssopt(&sc->sc_inc);
 	if (sc->sc_port)
 		mssopt -= V_tcp_udp_tunneling_overhead;
 	mssopt = max(mssopt, V_tcp_minmss);
 
 	/* XXX: Assume that the entire packet will fit in a header mbuf. */
 	KASSERT(max_linkhdr + tlen + TCP_MAXOLEN <= MHLEN,
 	    ("syncache: mbuf too small: hlen %u, sc_port %u, max_linkhdr %d + "
 	    "tlen %d + TCP_MAXOLEN %ju <= MHLEN %d", hlen, sc->sc_port,
 	    max_linkhdr, tlen, (uintmax_t)TCP_MAXOLEN, MHLEN));
 
 	/* Create the IP+TCP header from scratch. */
 	m = m_gethdr(M_NOWAIT, MT_DATA);
 	if (m == NULL)
 		return (ENOBUFS);
 #ifdef MAC
 	mac_syncache_create_mbuf(sc->sc_label, m);
 #endif
 	m->m_data += max_linkhdr;
 	m->m_len = tlen;
 	m->m_pkthdr.len = tlen;
 	m->m_pkthdr.rcvif = NULL;
 
 #ifdef INET6
 	if (sc->sc_inc.inc_flags & INC_ISIPV6) {
 		ip6 = mtod(m, struct ip6_hdr *);
 		ip6->ip6_vfc = IPV6_VERSION;
 		ip6->ip6_src = sc->sc_inc.inc6_laddr;
 		ip6->ip6_dst = sc->sc_inc.inc6_faddr;
 		ip6->ip6_plen = htons(tlen - hlen);
 		/* ip6_hlim is set after checksum */
 		/* Zero out traffic class and flow label. */
 		ip6->ip6_flow &= ~IPV6_FLOWINFO_MASK;
 		ip6->ip6_flow |= sc->sc_flowlabel;
 		if (sc->sc_port != 0) {
 			ip6->ip6_nxt = IPPROTO_UDP;
 			udp = (struct udphdr *)(ip6 + 1);
 			udp->uh_sport = htons(V_tcp_udp_tunneling_port);
 			udp->uh_dport = sc->sc_port;
 			ulen = (tlen - sizeof(struct ip6_hdr));
 			th = (struct tcphdr *)(udp + 1);
 		} else {
 			ip6->ip6_nxt = IPPROTO_TCP;
 			th = (struct tcphdr *)(ip6 + 1);
 		}
 		ip6->ip6_flow |= htonl(sc->sc_ip_tos << 20);
 	}
 #endif
 #if defined(INET6) && defined(INET)
 	else
 #endif
 #ifdef INET
 	{
 		ip = mtod(m, struct ip *);
 		ip->ip_v = IPVERSION;
 		ip->ip_hl = sizeof(struct ip) >> 2;
 		ip->ip_len = htons(tlen);
 		ip->ip_id = 0;
 		ip->ip_off = 0;
 		ip->ip_sum = 0;
 		ip->ip_src = sc->sc_inc.inc_laddr;
 		ip->ip_dst = sc->sc_inc.inc_faddr;
 		ip->ip_ttl = sc->sc_ip_ttl;
 		ip->ip_tos = sc->sc_ip_tos;
 
 		/*
 		 * See if we should do MTU discovery.  Route lookups are
 		 * expensive, so we will only unset the DF bit if:
 		 *
 		 *	1) path_mtu_discovery is disabled
 		 *	2) the SCF_UNREACH flag has been set
 		 */
 		if (V_path_mtu_discovery && ((sc->sc_flags & SCF_UNREACH) == 0))
 		       ip->ip_off |= htons(IP_DF);
 		if (sc->sc_port == 0) {
 			ip->ip_p = IPPROTO_TCP;
 			th = (struct tcphdr *)(ip + 1);
 		} else {
 			ip->ip_p = IPPROTO_UDP;
 			udp = (struct udphdr *)(ip + 1);
 			udp->uh_sport = htons(V_tcp_udp_tunneling_port);
 			udp->uh_dport = sc->sc_port;
 			ulen = (tlen - sizeof(struct ip));
 			th = (struct tcphdr *)(udp + 1);
 		}
 	}
 #endif /* INET */
 	th->th_sport = sc->sc_inc.inc_lport;
 	th->th_dport = sc->sc_inc.inc_fport;
 
 	if (flags & TH_SYN)
 		th->th_seq = htonl(sc->sc_iss);
 	else
 		th->th_seq = htonl(sc->sc_iss + 1);
 	th->th_ack = htonl(sc->sc_irs + 1);
 	th->th_off = sizeof(struct tcphdr) >> 2;
 	th->th_win = htons(sc->sc_wnd);
 	th->th_urp = 0;
 
 	flags = tcp_ecn_syncache_respond(flags, sc);
 	tcp_set_flags(th, flags);
 
 	/* Tack on the TCP options. */
 	if ((sc->sc_flags & SCF_NOOPT) == 0) {
 		to.to_flags = 0;
 
 		if (flags & TH_SYN) {
 			to.to_mss = mssopt;
 			to.to_flags = TOF_MSS;
 			if (sc->sc_flags & SCF_WINSCALE) {
 				to.to_wscale = sc->sc_requested_r_scale;
 				to.to_flags |= TOF_SCALE;
 			}
 			if (sc->sc_flags & SCF_SACK)
 				to.to_flags |= TOF_SACKPERM;
 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
 			if (sc->sc_flags & SCF_SIGNATURE)
 				to.to_flags |= TOF_SIGNATURE;
 #endif
 			if (sc->sc_tfo_cookie) {
 				to.to_flags |= TOF_FASTOPEN;
 				to.to_tfo_len = TCP_FASTOPEN_COOKIE_LEN;
 				to.to_tfo_cookie = sc->sc_tfo_cookie;
 				/* don't send cookie again when retransmitting response */
 				sc->sc_tfo_cookie = NULL;
 			}
 		}
 		if (sc->sc_flags & SCF_TIMESTAMP) {
 			to.to_tsval = sc->sc_tsoff + tcp_ts_getticks();
 			to.to_tsecr = sc->sc_tsreflect;
 			to.to_flags |= TOF_TS;
 		}
 		optlen = tcp_addoptions(&to, (u_char *)(th + 1));
 
 		/* Adjust headers by option size. */
 		th->th_off = (sizeof(struct tcphdr) + optlen) >> 2;
 		m->m_len += optlen;
 		m->m_pkthdr.len += optlen;
 #ifdef INET6
 		if (sc->sc_inc.inc_flags & INC_ISIPV6)
 			ip6->ip6_plen = htons(ntohs(ip6->ip6_plen) + optlen);
 		else
 #endif
 			ip->ip_len = htons(ntohs(ip->ip_len) + optlen);
 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
 		if (sc->sc_flags & SCF_SIGNATURE) {
 			KASSERT(to.to_flags & TOF_SIGNATURE,
 			    ("tcp_addoptions() didn't set tcp_signature"));
 
 			/* NOTE: to.to_signature is inside of mbuf */
 			if (!TCPMD5_ENABLED() ||
 			    TCPMD5_OUTPUT(m, th, to.to_signature) != 0) {
 				m_freem(m);
 				return (EACCES);
 			}
 		}
 #endif
 	} else
 		optlen = 0;
 
 	if (udp) {
 		ulen += optlen;
 		udp->uh_ulen = htons(ulen);
 	}
 	M_SETFIB(m, sc->sc_inc.inc_fibnum);
 	/*
 	 * If we have peer's SYN and it has a flowid, then let's assign it to
 	 * our SYN|ACK.  ip6_output() and ip_output() will not assign flowid
 	 * to SYN|ACK due to lack of inp here.
 	 */
 	if (m0 != NULL && M_HASHTYPE_GET(m0) != M_HASHTYPE_NONE) {
 		m->m_pkthdr.flowid = m0->m_pkthdr.flowid;
 		M_HASHTYPE_SET(m, M_HASHTYPE_GET(m0));
 	}
 #ifdef INET6
 	if (sc->sc_inc.inc_flags & INC_ISIPV6) {
 		if (sc->sc_port) {
 			m->m_pkthdr.csum_flags = CSUM_UDP_IPV6;
 			m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
 			udp->uh_sum = in6_cksum_pseudo(ip6, ulen,
 			      IPPROTO_UDP, 0);
 			th->th_sum = htons(0);
 		} else {
 			m->m_pkthdr.csum_flags = CSUM_TCP_IPV6;
 			m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
 			th->th_sum = in6_cksum_pseudo(ip6, tlen + optlen - hlen,
 			    IPPROTO_TCP, 0);
 		}
 		ip6->ip6_hlim = sc->sc_ip_ttl;
 #ifdef TCP_OFFLOAD
 		if (ADDED_BY_TOE(sc)) {
 			struct toedev *tod = sc->sc_tod;
 
 			error = tod->tod_syncache_respond(tod, sc->sc_todctx, m);
 
 			return (error);
 		}
 #endif
 		TCP_PROBE5(send, NULL, NULL, ip6, NULL, th);
 		error = ip6_output(m, NULL, NULL, 0, NULL, NULL, NULL);
 	}
 #endif
 #if defined(INET6) && defined(INET)
 	else
 #endif
 #ifdef INET
 	{
 		if (sc->sc_port) {
 			m->m_pkthdr.csum_flags = CSUM_UDP;
 			m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
 			udp->uh_sum = in_pseudo(ip->ip_src.s_addr,
 			      ip->ip_dst.s_addr, htons(ulen + IPPROTO_UDP));
 			th->th_sum = htons(0);
 		} else {
 			m->m_pkthdr.csum_flags = CSUM_TCP;
 			m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
 			th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
 			    htons(tlen + optlen - hlen + IPPROTO_TCP));
 		}
 #ifdef TCP_OFFLOAD
 		if (ADDED_BY_TOE(sc)) {
 			struct toedev *tod = sc->sc_tod;
 
 			error = tod->tod_syncache_respond(tod, sc->sc_todctx, m);
 
 			return (error);
 		}
 #endif
 		TCP_PROBE5(send, NULL, NULL, ip, NULL, th);
 		error = ip_output(m, sc->sc_ipopts, NULL, 0, NULL, NULL);
 	}
 #endif
 	return (error);
 }
 
 /*
  * The purpose of syncookies is to handle spoofed SYN flooding DoS attacks
  * that exceed the capacity of the syncache by avoiding the storage of any
  * of the SYNs we receive.  Syncookies defend against blind SYN flooding
  * attacks where the attacker does not have access to our responses.
  *
  * Syncookies encode and include all necessary information about the
  * connection setup within the SYN|ACK that we send back.  That way we
  * can avoid keeping any local state until the ACK to our SYN|ACK returns
  * (if ever).  Normally the syncache and syncookies are running in parallel
  * with the latter taking over when the former is exhausted.  When a matching
  * syncache entry is found the syncookie is ignored.
  *
  * The only reliable information persisting across the 3WHS is our initial
  * sequence number ISS of 32 bits.  Syncookies embed a sufficiently strong
  * cryptographic hash (MAC) value and a few bits of TCP SYN options in the ISS
  * of our SYN|ACK.  The MAC can be recomputed when the ACK to our SYN|ACK
  * returns and signifies a legitimate connection if it matches the ACK.
  *
  * The available space of 32 bits to store the hash and to encode the SYN
  * option information is very tight and we should have at least 24 bits for
  * the MAC to keep the number of guesses by blind spoofing reasonably high.
  *
  * SYN option information we have to encode to fully restore a connection:
  * MSS: is important to choose an optimal segment size to avoid IP level
  *   fragmentation along the path.  The common MSS values can be encoded
  *   in a 3-bit table.  Uncommon values are captured by the next lower value
  *   in the table leading to a slight increase in packetization overhead.
  * WSCALE: is necessary to allow large windows to be used for high delay-
  *   bandwidth product links.  Not scaling the window when it was initially
  *   negotiated is bad for performance as lack of scaling further decreases
  *   the apparent available send window.  We only need to encode the WSCALE
  *   we received from the remote end.  Our end can be recalculated at any
  *   time.  The common WSCALE values can be encoded in a 3-bit table.
  *   Uncommon values are captured by the next lower value in the table
  *   making us under-estimate the available window size halving our
  *   theoretically possible maximum throughput for that connection.
  * SACK: Greatly assists in packet loss recovery and requires 1 bit.
  * TIMESTAMP and SIGNATURE are not encoded because they are permanent options
  *   that are included in all segments on a connection.  We enable them when
  *   the ACK has them.
  *
  * Security of syncookies and attack vectors:
  *
  * The MAC is computed over (faddr||laddr||fport||lport||irs||flags||secmod)
  * together with the global secret to make it unique per connection attempt.
  * Thus any change of any of those parameters results in a different MAC output
  * in an unpredictable way unless a collision is encountered.  24 bits of the
  * MAC are embedded into the ISS.
  *
  * To prevent replay attacks two rotating global secrets are updated with a
  * new random value every 15 seconds.  The life-time of a syncookie is thus
  * 15-30 seconds.
  *
  * Vector 1: Attacking the secret.  This requires finding a weakness in the
  * MAC itself or the way it is used here.  The attacker can do a chosen plain
  * text attack by varying and testing all the parameters under his control.
  * The strength depends on the size and randomness of the secret, and the
  * cryptographic security of the MAC function.  Due to the constant updating
  * of the secret the attacker has at most 29.999 seconds to find the secret
  * and launch spoofed connections.  After that he has to start all over again.
  *
  * Vector 2: Collision attack on the MAC of a single ACK.  With a 24 bit MAC
  * size an average of 4,823 attempts are required for a 50% chance of success
  * to spoof a single syncookie (birthday collision paradox).  However the
  * attacker is blind and doesn't know if one of his attempts succeeded unless
  * he has a side channel to infer success from.  A single connection setup
  * success average of 90% requires 8,790 packets, 99.99% requires 17,578 packets.
  * This many attempts are required for each one blind spoofed connection.  For
  * every additional spoofed connection he has to launch another N attempts.
  * Thus for a sustained rate of 100 spoofed connections per second
  * approximately 1,800,000 packets per second would have to be sent.
  *
  * NB: The MAC function should be fast so that it doesn't become a CPU
  * exhaustion attack vector itself.
  *
  * References:
  *  RFC4987 TCP SYN Flooding Attacks and Common Mitigations
  *  SYN cookies were first proposed by cryptographer Dan J. Bernstein in 1996
  *   http://cr.yp.to/syncookies.html    (overview)
  *   http://cr.yp.to/syncookies/archive (details)
  *
  *
  * Schematic construction of a syncookie enabled Initial Sequence Number:
  *  0        1         2         3
  *  12345678901234567890123456789012
  * |xxxxxxxxxxxxxxxxxxxxxxxxWWWMMMSP|
  *
  *  x 24 MAC (truncated)
  *  W  3 Send Window Scale index
  *  M  3 MSS index
  *  S  1 SACK permitted
  *  P  1 Odd/even secret
  */
 
 /*
  * Distribution and probability of certain MSS values.  Those in between are
  * rounded down to the next lower one.  The table is indexed by the 3-bit
  * MSS index stored in the syncookie and is never written, hence const.
  * [An Analysis of TCP Maximum Segment Sizes, S. Alcock and R. Nelson, 2011]
  *
  *	.2%  .3%   5%    7%    7%    20%   15%   45%
  */
 static const int tcp_sc_msstab[] = {
 	216, 536, 1200, 1360, 1400, 1440, 1452, 1460
 };
 
 /*
  * Distribution and probability of certain WSCALE values.  We have to map the
  * (send) window scale (shift) option with a range of 0-14 from 4 bits into 3
  * bits based on prevalence of certain values.  Values for which we don't have
  * an exact match are rounded down to the next lower one, letting us
  * under-estimate the true available window.  At the moment this would happen
  * only for the very uncommon values 3, 5 and those above 8 (more than 16MB
  * socket buffer and window size).  The absence of the WSCALE option (no
  * scaling in either direction) is encoded with index zero.  The table is
  * never written, hence const.
  * [WSCALE values histograms, Allman, 2012]
  *
  *	X 10 10 35  5  6 14 10%   by host
  *	X 11  4  5  5 18 49  3%   by connections
  */
 static const int tcp_sc_wstab[] = {
 	0, 0, 1, 2, 4, 6, 7, 8
 };
 
 /*
  * Compute the MAC for the SYN cookie.  SIPHASH-2-4 is chosen for its speed
  * and good cryptographic properties.
  *
  * The hash is keyed with 'secbits' and fed, in this order: the foreign and
  * local address (IPv4 or IPv6, selected by INC_ISIPV6 in inc->inc_flags),
  * the foreign and local port, 'irs', 'flags' and 'secmod'.  The two 32-bit
  * words of the resulting digest are XORed together and returned.
  */
 static uint32_t
 syncookie_mac(struct in_conninfo *inc, tcp_seq irs, uint8_t flags,
     uint8_t *secbits, uintptr_t secmod)
 {
 	SIPHASH_CTX sip;
 	uint32_t digest[2];
 
 	SipHash24_Init(&sip);
 	SipHash_SetKey(&sip, secbits);
 
 	/* Addresses first; which pair depends on the address family. */
 #ifdef INET
 	if ((inc->inc_flags & INC_ISIPV6) == 0) {
 		SipHash_Update(&sip, &inc->inc_faddr, sizeof(inc->inc_faddr));
 		SipHash_Update(&sip, &inc->inc_laddr, sizeof(inc->inc_laddr));
 	}
 #endif
 #ifdef INET6
 	if ((inc->inc_flags & INC_ISIPV6) == INC_ISIPV6) {
 		SipHash_Update(&sip, &inc->inc6_faddr,
 		    sizeof(inc->inc6_faddr));
 		SipHash_Update(&sip, &inc->inc6_laddr,
 		    sizeof(inc->inc6_laddr));
 	}
 #endif
 
 	/* Then the family independent parts of the tuple and the cookie. */
 	SipHash_Update(&sip, &inc->inc_fport, sizeof(inc->inc_fport));
 	SipHash_Update(&sip, &inc->inc_lport, sizeof(inc->inc_lport));
 	SipHash_Update(&sip, &irs, sizeof(irs));
 	SipHash_Update(&sip, &flags, sizeof(flags));
 	SipHash_Update(&sip, &secmod, sizeof(secmod));
 	SipHash_Final((u_int8_t *)&digest, &sip);
 
 	/* Fold the digest down to 32 bits. */
 	return (digest[0] ^ digest[1]);
 }
 
 /*
  * Produce the ISS for a SYN|ACK such that it carries a syncookie for 'sc'.
  *
  * The low 8 bits hold the cookie flag byte (MSS index, window scale index,
  * SACK bit, secret selector) XORed with the top 8 bits of the MAC; the upper
  * 24 bits are taken from the MAC.  'sch' is only used as the MAC modifier.
  */
 static tcp_seq
 syncookie_generate(struct syncache_head *sch, struct syncache *sc)
 {
 	union syncookie cookie;
 	uint8_t *key;
 	uint32_t mac;
 	u_int idx, parity, swscale;
 
 	cookie.cookie = 0;
 
 	/*
 	 * Map our computed MSS into the 3-bit index: walk down from the
 	 * largest table entry until one fits, bottoming out at index 0.
 	 */
 	idx = nitems(tcp_sc_msstab) - 1;
 	while (idx > 0 && tcp_sc_msstab[idx] > sc->sc_peer_mss)
 		idx--;
 	cookie.flags.mss_idx = idx;
 
 	/*
 	 * Map the send window scale into the 3-bit index but only if
 	 * the wscale option was received.  Index 0 stays reserved for
 	 * "no option"; a received scale of 0 lands on index 1.
 	 */
 	if (sc->sc_flags & SCF_WINSCALE) {
 		swscale = sc->sc_requested_s_scale;
 		idx = nitems(tcp_sc_wstab) - 1;
 		while (idx > 0 && tcp_sc_wstab[idx] > swscale)
 			idx--;
 		cookie.flags.wscale_idx = idx;
 	}
 
 	/* Can we do SACK? */
 	if (sc->sc_flags & SCF_SACK)
 		cookie.flags.sack_ok = 1;
 
 	/* Record which of the two secrets keys the MAC. */
 	parity = V_tcp_syncache.secret.oddeven & 0x1;
 	cookie.flags.odd_even = parity;
 	key = V_tcp_syncache.secret.key[parity];
 
 	mac = syncookie_mac(&sc->sc_inc, sc->sc_irs, cookie.cookie, key,
 	    (uintptr_t)sch);
 
 	TCPSTAT_INC(tcps_sc_sendcookie);
 
 	/*
 	 * Put the flags into the hash and XOR them to get better ISS number
 	 * variance.  This doesn't enhance the cryptographic strength and is
 	 * done to prevent the 8 cookie bits from showing up directly on the
 	 * wire.
 	 */
 	return ((mac & ~0xff) | (cookie.cookie ^ (mac >> 24)));
 }
 
 /*
  * Check whether the segment described by 'th' acknowledges a syncookie we
  * generated and, if so, reconstruct the syncache entry in the caller
  * supplied 'sc'.
  *
  * Returns 'sc' when the recomputed MAC matches the upper 24 bits of the
  * acknowledged ISS, otherwise NULL (in which case 'sc' has not been written).
  */
 static struct syncache *
 syncookie_lookup(struct in_conninfo *inc, struct syncache_head *sch,
     struct syncache *sc, struct tcphdr *th, struct tcpopt *to,
     struct socket *lso, uint16_t port)
 {
 	union syncookie cookie;
 	tcp_seq irs, iss;
 	uint32_t mac;
 	uint8_t *key;
 	int rwscale, wnd;
 
 	/*
 	 * Pull information out of SYN-ACK/ACK and revert sequence number
 	 * advances.
 	 */
 	iss = th->th_ack - 1;
 	irs = th->th_seq - 1;
 
 	/*
 	 * Unpack the flags containing enough information to restore the
 	 * connection.
 	 */
 	cookie.cookie = (iss & 0xff) ^ (iss >> 24);
 
 	/* The cookie itself names the secret it was generated with. */
 	key = V_tcp_syncache.secret.key[cookie.flags.odd_even];
 	mac = syncookie_mac(inc, irs, cookie.cookie, key, (uintptr_t)sch);
 
 	/* The recomputed hash matches the ACK if this was a genuine cookie. */
 	if ((iss & ~0xff) != (mac & ~0xff))
 		return (NULL);
 
 	/* Fill in the syncache values. */
 	sc->sc_flags = 0;
 	bcopy(inc, &sc->sc_inc, sizeof(struct in_conninfo));
 	sc->sc_ipopts = NULL;
 	sc->sc_irs = irs;
 	sc->sc_iss = iss;
 	sc->sc_rxmits = 0;
 	sc->sc_port = port;
 
 	/* Per address family settings inherited from the listen socket. */
 #ifdef INET
 	if ((inc->inc_flags & INC_ISIPV6) == 0) {
 		sc->sc_ip_ttl = sotoinpcb(lso)->inp_ip_ttl;
 		sc->sc_ip_tos = sotoinpcb(lso)->inp_ip_tos;
 	}
 #endif
 #ifdef INET6
 	if ((inc->inc_flags & INC_ISIPV6) == INC_ISIPV6) {
 		if (sotoinpcb(lso)->inp_flags & IN6P_AUTOFLOWLABEL)
 			sc->sc_flowlabel =
 			    htonl(sc->sc_iss) & IPV6_FLOWLABEL_MASK;
 	}
 #endif
 
 	sc->sc_peer_mss = tcp_sc_msstab[cookie.flags.mss_idx];
 
 	/* We can simply recompute receive window scale we sent earlier. */
 	for (rwscale = 0;
 	    rwscale < TCP_MAX_WINSHIFT && (TCP_MAXWIN << rwscale) < sb_max;
 	    rwscale++)
 		;
 
 	/* Only use wscale if it was enabled in the original SYN. */
 	if (cookie.flags.wscale_idx > 0) {
 		sc->sc_requested_r_scale = rwscale;
 		sc->sc_requested_s_scale = tcp_sc_wstab[cookie.flags.wscale_idx];
 		sc->sc_flags |= SCF_WINSCALE;
 	}
 
 	/* Advertised window: listen socket receive hiwat, clamped. */
 	wnd = lso->sol_sbrcv_hiwat;
 	wnd = imin(imax(wnd, 0), TCP_MAXWIN);
 	sc->sc_wnd = wnd;
 
 	if (cookie.flags.sack_ok)
 		sc->sc_flags |= SCF_SACK;
 
 	/* Timestamp and signature are taken from the options on this ACK. */
 	if (to->to_flags & TOF_TS) {
 		sc->sc_flags |= SCF_TIMESTAMP;
 		sc->sc_tsreflect = to->to_tsval;
 		sc->sc_tsoff = tcp_new_ts_offset(inc);
 	}
 
 	if (to->to_flags & TOF_SIGNATURE)
 		sc->sc_flags |= SCF_SIGNATURE;
 
 	TCPSTAT_INC(tcps_sc_recvcookie);
 	return (sc);
 }
 
 #ifdef INVARIANTS
 /*
  * Debugging cross-check, compiled only with INVARIANTS: decode the syncookie
  * carried by 'th' into a scratch entry and log (LOG_DEBUG) every field that
  * differs from the real syncache entry 'sc'.  Nothing is logged when the
  * cookie does not validate or the address string cannot be obtained.
  * Always returns 0.
  */
 static int
 syncookie_cmp(struct in_conninfo *inc, struct syncache_head *sch,
     struct syncache *sc, struct tcphdr *th, struct tcpopt *to,
     struct socket *lso, uint16_t port)
 {
 	struct syncache scratch, *decoded;
 	char *addrs;
 
 	bzero(&scratch, sizeof(scratch));
 	decoded = syncookie_lookup(inc, sch, &scratch, th, to, lso, port);
 
 	addrs = tcp_log_addrs(inc, th, NULL, NULL);
 	if (addrs == NULL)
 		return (0);
 
 	if (decoded != NULL) {
 		if (sc->sc_peer_mss != decoded->sc_peer_mss) {
 			log(LOG_DEBUG, "%s; %s: mss different %i vs %i\n",
 			    addrs, __func__, sc->sc_peer_mss,
 			    decoded->sc_peer_mss);
 		}
 
 		if (sc->sc_requested_r_scale != decoded->sc_requested_r_scale) {
 			log(LOG_DEBUG, "%s; %s: rwscale different %i vs %i\n",
 			    addrs, __func__, sc->sc_requested_r_scale,
 			    decoded->sc_requested_r_scale);
 		}
 
 		if (sc->sc_requested_s_scale != decoded->sc_requested_s_scale) {
 			log(LOG_DEBUG, "%s; %s: swscale different %i vs %i\n",
 			    addrs, __func__, sc->sc_requested_s_scale,
 			    decoded->sc_requested_s_scale);
 		}
 
 		if ((sc->sc_flags & SCF_SACK) != (decoded->sc_flags & SCF_SACK)) {
 			log(LOG_DEBUG, "%s; %s: SACK different\n", addrs,
 			    __func__);
 		}
 	}
 
 	/* addrs is known to be non-NULL here. */
 	free(addrs, M_TCPLOG);
 	return (0);
 }
 #endif /* INVARIANTS */
 
 /*
  * Rotate the syncookie secrets: overwrite the key slot that is not selected
  * by the low bit of secret.oddeven with fresh random bytes, bump oddeven so
  * that slot becomes the selected one, and re-arm the reseed callout.
  * 'arg' is the struct tcp_syncache to operate on.
  */
 static void
 syncookie_reseed(void *arg)
 {
 	struct tcp_syncache *tsc = arg;
 	int next;
 
 	/*
 	 * Reseeding the secret doesn't have to be protected by a lock.
 	 * It only must be ensured that the new random values are visible
 	 * to all CPUs in a SMP environment.  The atomic with release
 	 * semantics ensures that.
 	 */
 	next = ((tsc->secret.oddeven & 0x1) == 0);
 	arc4rand(tsc->secret.key[next], SYNCOOKIE_SECRET_SIZE, 0);
 	atomic_add_rel_int(&tsc->secret.oddeven, 1);
 
 	/* Reschedule ourself. */
 	callout_schedule(&tsc->secret.reseed, SYNCOOKIE_LIFETIME * hz);
 }
 
 /*
  * We have overflowed a bucket. Let's pause dealing with the syncache.
  * This function will increment the bucketoverflow statistics appropriately
  * (once per pause when pausing is enabled; otherwise, once per overflow).
  *
  * 'inc' may be NULL; when given, its addresses are included in the warning
  * that is logged when a pause starts.
  */
 static void
 syncache_pause(struct in_conninfo *inc)
 {
 	const char *addrs;
 	time_t pause_len;
 
 	/* XXX:
 	 * 2. Add sysctl read here so we don't get the benefit of this
 	 * change without the new sysctl.
 	 */
 
 	/*
 	 * Try an unlocked read. If we already know that another thread
 	 * has activated the feature, there is no need to proceed.
 	 */
 	if (V_tcp_syncache.paused)
 		return;
 
 	/* Are cookies enabled? If not, we can't pause. */
 	if (!V_tcp_syncookies) {
 		TCPSTAT_INC(tcps_sc_bucketoverflow);
 		return;
 	}
 
 	/*
 	 * We may be the first thread to find an overflow. Get the lock
 	 * and re-check, since someone may have beaten us to it.
 	 */
 	mtx_lock(&V_tcp_syncache.pause_mtx);
 	if (V_tcp_syncache.paused) {
 		mtx_unlock(&V_tcp_syncache.pause_mtx);
 		return;
 	}
 
 	/* Activate protection. */
 	V_tcp_syncache.paused = true;
 	TCPSTAT_INC(tcps_sc_bucketoverflow);
 
 	/*
 	 * Determine the last backoff time. If we are seeing a renewed
 	 * attack within that same time after last reactivating the syncache,
 	 * consider it an extension of the same attack and double the pause
 	 * (up to the maximum backoff); otherwise start over.
 	 */
 	pause_len = TCP_SYNCACHE_PAUSE_TIME << V_tcp_syncache.pause_backoff;
 	if (V_tcp_syncache.pause_until + pause_len - time_uptime <= 0) {
 		pause_len = TCP_SYNCACHE_PAUSE_TIME;
 		V_tcp_syncache.pause_backoff = 0;
 	} else if (V_tcp_syncache.pause_backoff < TCP_SYNCACHE_MAX_BACKOFF) {
 		pause_len <<= 1;
 		V_tcp_syncache.pause_backoff++;
 	}
 
 	/* Log a warning, including IP addresses, if able. */
 	addrs = (inc != NULL) ? tcp_log_addrs(inc, NULL, NULL, NULL) : NULL;
 	log(LOG_WARNING, "TCP syncache overflow detected; using syncookies for "
 	    "the next %lld seconds%s%s%s\n", (long long)pause_len,
 	    (addrs != NULL) ? " (last SYN: " : "",
 	    (addrs != NULL) ? addrs : "",
 	    (addrs != NULL) ? ")" : "");
 	free(__DECONST(void *, addrs), M_TCPLOG);
 
 	/* Use the calculated pause length to set a new pause time. */
 	V_tcp_syncache.pause_until = time_uptime + pause_len;
 	callout_reset(&V_tcp_syncache.pause_co, pause_len * hz,
 	    syncache_unpause, &V_tcp_syncache);
 	mtx_unlock(&V_tcp_syncache.pause_mtx);
 }
 
 /*
  * Evaluate whether we need to unpause.  'arg' is the struct tcp_syncache
  * whose pause_mtx must be held by the caller (asserted below).
  */
 static void
 syncache_unpause(void *arg)
 {
 	struct tcp_syncache *tsc = arg;
 	time_t remaining;
 
 	mtx_assert(&tsc->pause_mtx, MA_OWNED | MA_NOTRECURSED);
 	callout_deactivate(&tsc->pause_co);
 
 	/*
 	 * If the pause time has expired, deactivate the protection.
 	 * Otherwise we are running early: come back when it is due.
 	 */
 	remaining = tsc->pause_until - time_uptime;
 	if (remaining <= 0) {
 		tsc->paused = false;
 		return;
 	}
 	callout_schedule(&tsc->pause_co, remaining * hz);
 }
 
 /*
  * Exports the syncache entries to userland so that netstat can display
  * them alongside the other sockets.  This function is intended to be
  * called only from tcp_pcblist.
  *
  * One struct xtcpcb is written through SYSCTL_OUT() per entry.  Entries
  * that carry a credential the requesting thread is not allowed to see
  * (cr_cansee()) are skipped.  Each hash bucket is walked under its own
  * lock, so on an active system the exported set is not a consistent
  * snapshot of the whole cache.
  *
  * NOTE(review): a SYSCTL_OUT() failure stops the walk but 0 is still
  * returned, i.e. this function never reports an error -- confirm that
  * this is what the caller expects.
  */
 int
 syncache_pcblist(struct sysctl_req *req)
 {
 	struct syncache_head *bucket;
 	struct syncache *entry;
 	struct xtcpcb xt;
 	int slot;
 
 	/* Fields that are identical for every exported entry. */
 	bzero(&xt, sizeof(xt));
 	xt.xt_len = sizeof(xt);
 	xt.t_state = TCPS_SYN_RECEIVED;
 	xt.xt_inp.xi_socket.xso_protocol = IPPROTO_TCP;
 	xt.xt_inp.xi_socket.xso_len = sizeof(struct xsocket);
 	xt.xt_inp.xi_socket.so_type = SOCK_STREAM;
 	xt.xt_inp.xi_socket.so_state = SS_ISCONNECTING;
 
 	for (slot = 0; slot < V_tcp_syncache.hashsize; slot++) {
 		bucket = &V_tcp_syncache.hashbase[slot];
 		SCH_LOCK(bucket);
 		TAILQ_FOREACH(entry, &bucket->sch_bucket, sc_hash) {
 			/* Hide entries the requester may not see. */
 			if (entry->sc_cred != NULL &&
 			    cr_cansee(req->td->td_ucred, entry->sc_cred) != 0)
 				continue;
 
 			xt.xt_inp.inp_vflag =
 			    (entry->sc_inc.inc_flags & INC_ISIPV6) ?
 			    INP_IPV6 : INP_IPV4;
 			xt.xt_encaps_port = entry->sc_port;
 			bcopy(&entry->sc_inc, &xt.xt_inp.inp_inc,
 			    sizeof(struct in_conninfo));
 
 			if (SYSCTL_OUT(req, &xt, sizeof(xt)) != 0) {
 				SCH_UNLOCK(bucket);
 				return (0);
 			}
 		}
 		SCH_UNLOCK(bucket);
 	}
 
 	return (0);
 }