diff --git a/share/man/man4/tcp.4 b/share/man/man4/tcp.4
index 0ee57dcb0594..16c9e0ce84df 100644
--- a/share/man/man4/tcp.4
+++ b/share/man/man4/tcp.4
@@ -1,1099 +1,1093 @@
 .\" Copyright (c) 1983, 1991, 1993
 .\"	The Regents of the University of California.
 .\" Copyright (c) 2010-2011 The FreeBSD Foundation
 .\" All rights reserved.
 .\"
 .\" Portions of this documentation were written at the Centre for Advanced
 .\" Internet Architectures, Swinburne University of Technology, Melbourne,
 .\" Australia by David Hayes under sponsorship from the FreeBSD Foundation.
 .\"
 .\" Redistribution and use in source and binary forms, with or without
 .\" modification, are permitted provided that the following conditions
 .\" are met:
 .\" 1. Redistributions of source code must retain the above copyright
 .\"    notice, this list of conditions and the following disclaimer.
 .\" 2. Redistributions in binary form must reproduce the above copyright
 .\"    notice, this list of conditions and the following disclaimer in the
 .\"    documentation and/or other materials provided with the distribution.
 .\" 3. Neither the name of the University nor the names of its contributors
 .\"    may be used to endorse or promote products derived from this software
 .\"    without specific prior written permission.
 .\"
 .\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 .\" ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 .\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 .\" SUCH DAMAGE.
 .\"
 .\"     From: @(#)tcp.4	8.1 (Berkeley) 6/5/93
 .\"
-.Dd June 29, 2023
+.Dd November 17, 2023
 .Dt TCP 4
 .Os
 .Sh NAME
 .Nm tcp
 .Nd Internet Transmission Control Protocol
 .Sh SYNOPSIS
 .In sys/types.h
 .In sys/socket.h
 .In netinet/in.h
 .In netinet/tcp.h
 .Ft int
 .Fn socket AF_INET SOCK_STREAM 0
 .Sh DESCRIPTION
 The
 .Tn TCP
 protocol provides reliable, flow-controlled, two-way
 transmission of data.
 It is a byte-stream protocol used to
 support the
 .Dv SOCK_STREAM
 abstraction.
 .Tn TCP
 uses the standard
 Internet address format and, in addition, provides a per-host
 collection of
 .Dq "port addresses" .
 Thus, each address is composed
 of an Internet address specifying the host and network,
 with a specific
 .Tn TCP
 port on the host identifying the peer entity.
 .Pp
 Sockets utilizing the
 .Tn TCP
 protocol are either
 .Dq active
 or
 .Dq passive .
 Active sockets initiate connections to passive
 sockets.
 By default,
 .Tn TCP
 sockets are created active; to create a
 passive socket, the
 .Xr listen 2
 system call must be used
 after binding the socket with the
 .Xr bind 2
 system call.
 Only passive sockets may use the
 .Xr accept 2
 call to accept incoming connections.
 Only active sockets may use the
 .Xr connect 2
 call to initiate connections.
 .Pp
 Passive sockets may
 .Dq underspecify
 their location to match
 incoming connection requests from multiple networks.
 This technique, termed
 .Dq "wildcard addressing" ,
 allows a single
 server to provide service to clients on multiple networks.
 To create a socket which listens on all networks, the Internet
 address
 .Dv INADDR_ANY
 must be bound.
 The
 .Tn TCP
 port may still be specified
 at this time; if the port is not specified, the system will assign one.
 Once a connection has been established, the socket's address is
 fixed by the peer entity's location.
 The address assigned to the
 socket is the address associated with the network interface
 through which packets are being transmitted and received.
 Normally, this address corresponds to the peer entity's network.
 .Pp
 .Tn TCP
 supports a number of socket options which can be set with
 .Xr setsockopt 2
 and tested with
 .Xr getsockopt 2 :
 .Bl -tag -width ".Dv TCP_FUNCTION_BLK"
 .It Dv TCP_INFO
 Information about a socket's underlying TCP session may be retrieved
 by passing the read-only option
 .Dv TCP_INFO
 to
 .Xr getsockopt 2 .
 It accepts a single argument: a pointer to an instance of
 .Vt "struct tcp_info" .
 .Pp
 This API is subject to change; consult the source to determine
 which fields are currently filled out by this option.
 .Fx
 specific additions include
 send window size,
 receive window size,
 and
 bandwidth-controlled window space.
 .It Dv TCP_CCALGOOPT
 Set or query congestion control algorithm specific parameters.
 See
 .Xr mod_cc 4
 for details.
 .It Dv TCP_CONGESTION
 Select or query the congestion control algorithm that TCP will use for the
 connection.
 See
 .Xr mod_cc 4
 for details.
 .It Dv TCP_FASTOPEN
 Enable or disable TCP Fast Open (TFO).
 To use this option, the kernel must be built with the
 .Dv TCP_RFC7413
 option.
 .Pp
 This option can be set on the socket either before or after the
 .Xr listen 2
 is invoked.
 Clearing this option on a listen socket after it has been set has no effect on
 existing TFO connections or TFO connections in progress; it only prevents new
 TFO connections from being established.
 .Pp
 For passively-created sockets, the
 .Dv TCP_FASTOPEN
 socket option can be queried to determine whether the connection was established
 using TFO.
 Note that connections that are established via a TFO
 .Tn SYN ,
 but that fall back to using a non-TFO
 .Tn SYN|ACK
 will have the
 .Dv TCP_FASTOPEN
 socket option set.
 .Pp
 In addition to the facilities defined in RFC7413, this implementation supports a
 pre-shared key (PSK) mode of operation in which the TFO server requires the
 client to be in posession of a shared secret in order for the client to be able
 to successfully open TFO connections with the server.
 This is useful, for example, in environments where TFO servers are exposed to
 both internal and external clients and only wish to allow TFO connections from
 internal clients.
 .Pp
 In the PSK mode of operation, the server generates and sends TFO cookies to
 requesting clients as usual.
 However, when validating cookies received in TFO SYNs from clients, the server
 requires the client-supplied cookie to equal
 .Bd -literal -offset left
 SipHash24(key=\fI16-byte-psk\fP, msg=\fIcookie-sent-to-client\fP)
 .Ed
 .Pp
 Multiple concurrent valid pre-shared keys are supported so that time-based
 rolling PSK invalidation policies can be implemented in the system.
 The default number of concurrent pre-shared keys is 2.
 .Pp
 This can be adjusted with the
 .Dv TCP_RFC7413_MAX_PSKS
 kernel option.
 .It Dv TCP_FUNCTION_BLK
 Select or query the set of functions that TCP will use for this connection.
 This allows a user to select an alternate TCP stack.
 The alternate TCP stack must already be loaded in the kernel.
 To list the available TCP stacks, see
 .Va functions_available
 in the
 .Sx MIB (sysctl) Variables
 section further down.
 To list the default TCP stack, see
 .Va functions_default
 in the
 .Sx MIB (sysctl) Variables
 section.
 .It Dv TCP_KEEPINIT
 This
 .Xr setsockopt 2
 option accepts a per-socket timeout argument of
 .Vt "u_int"
 in seconds, for new, non-established
 .Tn TCP
 connections.
 For the global default in milliseconds see
 .Va keepinit
 in the
 .Sx MIB (sysctl) Variables
 section further down.
 .It Dv TCP_KEEPIDLE
 This
 .Xr setsockopt 2
 option accepts an argument of
 .Vt "u_int"
 for the amount of time, in seconds, that the connection must be idle
 before keepalive probes (if enabled) are sent for the connection of this
 socket.
 If set on a listening socket, the value is inherited by the newly created
 socket upon
 .Xr accept 2 .
 For the global default in milliseconds see
 .Va keepidle
 in the
 .Sx MIB (sysctl) Variables
 section further down.
 .It Dv TCP_KEEPINTVL
 This
 .Xr setsockopt 2
 option accepts an argument of
 .Vt "u_int"
 to set the per-socket interval, in seconds, between keepalive probes sent
 to a peer.
 If set on a listening socket, the value is inherited by the newly created
 socket upon
 .Xr accept 2 .
 For the global default in milliseconds see
 .Va keepintvl
 in the
 .Sx MIB (sysctl) Variables
 section further down.
 .It Dv TCP_KEEPCNT
 This
 .Xr setsockopt 2
 option accepts an argument of
 .Vt "u_int"
 and allows a per-socket tuning of the number of probes sent, with no response,
 before the connection will be dropped.
 If set on a listening socket, the value is inherited by the newly created
 socket upon
 .Xr accept 2 .
 For the global default see the
 .Va keepcnt
 in the
 .Sx MIB (sysctl) Variables
 section further down.
 .It Dv TCP_NODELAY
 Under most circumstances,
 .Tn TCP
 sends data when it is presented;
 when outstanding data has not yet been acknowledged, it gathers
 small amounts of output to be sent in a single packet once
 an acknowledgement is received.
 For a small number of clients, such as window systems
 that send a stream of mouse events which receive no replies,
 this packetization may cause significant delays.
 The boolean option
 .Dv TCP_NODELAY
 defeats this algorithm.
 
 .It Dv TCP_MAXSEG
 By default, a sender- and
 .No receiver- Ns Tn TCP
 will negotiate among themselves to determine the maximum segment size
 to be used for each connection.
 The
 .Dv TCP_MAXSEG
 option allows the user to determine the result of this negotiation,
 and to reduce it if desired.
 .It Dv TCP_MAXUNACKTIME
 This
 .Xr setsockopt 2
 option accepts an argument of
 .Vt "u_int"
 to set the per-socket interval, in seconds, in which the connection must
 make progress. Progress is defined by at least 1 byte being acknowledged within
 the set time period. If a connection fails to make progress, then the
 .Tn TCP
 stack will terminate the connection with a reset. Note that the default
 value for this is zero which indicates no progress checks should be made.
 .It Dv TCP_NOOPT
 .Tn TCP
 usually sends a number of options in each packet, corresponding to
 various
 .Tn TCP
 extensions which are provided in this implementation.
 The boolean option
 .Dv TCP_NOOPT
 is provided to disable
 .Tn TCP
 option use on a per-connection basis.
 .It Dv TCP_NOPUSH
 By convention, the
 .No sender- Ns Tn TCP
 will set the
 .Dq push
 bit, and begin transmission immediately (if permitted) at the end of
 every user call to
 .Xr write 2
 or
 .Xr writev 2 .
 When this option is set to a non-zero value,
 .Tn TCP
 will delay sending any data at all until either the socket is closed,
 or the internal send buffer is filled.
 .It Dv TCP_MD5SIG
 This option enables the use of MD5 digests (also known as TCP-MD5)
 on writes to the specified socket.
 Outgoing traffic is digested;
 digests on incoming traffic are verified.
 When this option is enabled on a socket, all inbound and outgoing
 TCP segments must be signed with MD5 digests.
 .Pp
 One common use for this in a
 .Fx
 router deployment is to enable
 based routers to interwork with Cisco equipment at peering points.
 Support for this feature conforms to RFC 2385.
 .Pp
 In order for this option to function correctly, it is necessary for the
 administrator to add a tcp-md5 key entry to the system's security
 associations database (SADB) using the
 .Xr setkey 8
 utility.
 This entry can only be specified on a per-host basis at this time.
 .Pp
 If an SADB entry cannot be found for the destination,
 the system does not send any outgoing segments and drops any inbound segments.
 However, during connection negotiation, a non-signed segment will be accepted if
 an SADB entry does not exist between hosts.
 When a non-signed segment is accepted, the established connection is not
 protected with MD5 digests.
 .It Dv TCP_STATS
 Manage collection of connection level statistics using the
 .Xr stats 3
 framework.
 .Pp
 Each dropped segment is taken into account in the TCP protocol statistics.
 .It Dv TCP_TXTLS_ENABLE
 Enable in-kernel Transport Layer Security (TLS) for data written to this
 socket.
 See
 .Xr ktls 4
 for more details.
 .It Dv TCP_TXTLS_MODE
 The integer argument can be used to get or set the current TLS transmit mode
 of a socket.
 See
 .Xr ktls 4
 for more details.
 .It Dv TCP_RXTLS_ENABLE
 Enable in-kernel TLS for data read from this socket.
 See
 .Xr ktls 4
 for more details.
 .It Dv TCP_REUSPORT_LB_NUMA
 Changes NUMA affinity filtering for an established TCP listen
 socket.
 This option takes a single integer argument which specifies
 the NUMA domain to filter on for this listen socket.
 The argument can also have the follwing special values:
 .Bl -tag -width "Dv TCP_REUSPORT_LB_NUMA"
 .It Dv TCP_REUSPORT_LB_NUMA_NODOM
 Remove NUMA filtering for this listen socket.
 .It Dv TCP_REUSPORT_LB_NUMA_CURDOM
 Filter traffic associated with the domain where the calling thread is
 currently executing.
 This is typically used after a process or thread inherits a listen
 socket from its parent, and sets its CPU affinity to a particular core.
 .El
 .It Dv TCP_REMOTE_UDP_ENCAPS_PORT
 Set and get the remote UDP encapsulation port.
 It can only be set on a closed TCP socket.
 .El
 .Pp
 The option level for the
 .Xr setsockopt 2
 call is the protocol number for
 .Tn TCP ,
 available from
 .Xr getprotobyname 3 ,
 or
 .Dv IPPROTO_TCP .
 All options are declared in
 .In netinet/tcp.h .
 .Pp
 Options at the
 .Tn IP
 transport level may be used with
 .Tn TCP ;
 see
 .Xr ip 4 .
 Incoming connection requests that are source-routed are noted,
 and the reverse source route is used in responding.
 .Pp
 The default congestion control algorithm for
 .Tn TCP
 is
 .Xr cc_newreno 4 .
 Other congestion control algorithms can be made available using the
 .Xr mod_cc 4
 framework.
 .Ss MIB (sysctl) Variables
 The
 .Tn TCP
 protocol implements a number of variables in the
 .Va net.inet.tcp
 branch of the
 .Xr sysctl 3
 MIB, which can also be read or modified with
 .Xr sysctl 8 .
 .Bl -tag -width ".Va v6pmtud_blackhole_mss"
 .It Va always_keepalive
 Assume that
 .Dv SO_KEEPALIVE
 is set on all
 .Tn TCP
 connections, the kernel will
 periodically send a packet to the remote host to verify the connection
 is still up.
 .It Va blackhole
 If enabled, disable sending of RST when a connection is attempted
 to a port where there is no socket accepting connections.
 See
 .Xr blackhole 4 .
 .It Va blackhole_local
 See
 .Xr blackhole 4 .
 .It Va cc
 A number of variables for congestion control are under the
 .Va net.inet.tcp.cc
 node.
 See
 .Xr mod_cc 4 .
 .It Va cc.newreno
 Variables for NewReno congestion control are under the
 .Va net.inet.tcp.cc.newreno
 node.
 See
 .Xr cc_newreno 4 .
 .It Va delacktime
 Maximum amount of time, in milliseconds, before a delayed ACK is sent.
 .It Va delayed_ack
 Delay ACK to try and piggyback it onto a data packet or another ACK.
 .It Va do_lrd
 Enable Lost Retransmission Detection for SACK-enabled sessions, disabled by
 default.
 Under severe congestion, a retransmission can be lost which then leads to a
 mandatory Retransmission Timeout (RTO), followed by slow-start.
 LRD will try to resend the repeatedly lost packet, preventing the time-consuming
 RTO and performance reducing slow-start.
 .It Va do_prr
 Perform SACK loss recovery using the Proportional Rate Reduction (PRR) algorithm
 described in RFC6937.
 This improves the effectiveness of retransmissions particular in environments
 with ACK thinning or burst loss events, as chances to run out of the ACK clock
 are reduced, preventing lengthy and performance reducing RTO based loss recovery
 (default is true).
-.It Va do_prr_conservative
-While doing Proportional Rate Reduction, remain strictly in a packet conserving
-mode, sending only one new packet for each ACK received.
-Helpful when a misconfigured token bucket traffic policer causes persistent
-high losses leading to RTO, but reduces PRR effectiveness in more common settings
-(default is false).
 .It Va do_tcpdrain
 Flush packets in the
 .Tn TCP
 reassembly queue if the system is low on mbufs.
 .It Va drop_synfin
 Drop TCP packets with both SYN and FIN set.
 .It Va ecn.enable
 Enable support for TCP Explicit Congestion Notification (ECN).
 ECN allows a TCP sender to reduce the transmission rate in order to
 avoid packet drops.
 .Bl -tag -compact
 .It 0
 Disable ECN.
 .It 1
 Allow incoming connections to request ECN.
 Outgoing connections will request ECN.
 .It 2
 Allow incoming connections to request ECN.
 Outgoing connections will not request ECN.
 (default)
 .It 3
 Negotiate on incoming connection for Accurate ECN, ECN, or no ECN.
 Outgoing connections will request Accurate ECN and fall back to
 ECN depending on the capabilities of the server.
 .It 4
 Negotiate on incoming connection for Accurate ECN, ECN, or no ECN.
 Outgoing connections will not request ECN.
 .El
 .It Va ecn.maxretries
 Number of retries (SYN or SYN/ACK retransmits) before disabling ECN on a
 specific connection.
 This is needed to help with connection establishment
 when a broken firewall is in the network path.
 .It Va fast_finwait2_recycle
 Recycle
 .Tn TCP
 .Dv FIN_WAIT_2
 connections faster when the socket is marked as
 .Dv SBS_CANTRCVMORE
 (no user process has the socket open, data received on
 the socket cannot be read).
 The timeout used here is
 .Va finwait2_timeout .
 .It Va fastopen.acceptany
 When non-zero, all client-supplied TFO cookies will be considered to be valid.
 The default is 0.
 .It Va fastopen.autokey
 When this and
 .Va net.inet.tcp.fastopen.server_enable
 are non-zero, a new key will be automatically generated after this specified
 seconds.
 The default is 120.
 .It Va fastopen.ccache_bucket_limit
 The maximum number of entries in a client cookie cache bucket.
 The default value can be tuned with the
 .Dv TCP_FASTOPEN_CCACHE_BUCKET_LIMIT_DEFAULT
 kernel option or by setting
 .Va net.inet.tcp.fastopen_ccache_bucket_limit
 in the
 .Xr loader 8 .
 .It Va fastopen.ccache_buckets
 The number of client cookie cache buckets.
 Read-only.
 The value can be tuned with the
 .Dv TCP_FASTOPEN_CCACHE_BUCKETS_DEFAULT
 kernel option or by setting
 .Va fastopen.ccache_buckets
 in the
 .Xr loader 8 .
 .It Va fastopen.ccache_list
 Print the client cookie cache.
 Read-only.
 .It Va fastopen.client_enable
 When zero, no new active (i.e., client) TFO connections can be created.
 On the transition from enabled to disabled, the client cookie cache is cleared
 and disabled.
 The transition from enabled to disabled does not affect any active TFO
 connections in progress; it only prevents new ones from being established.
 The default is 0.
 .It Va fastopen.keylen
 The key length in bytes.
 Read-only.
 .It Va fastopen.maxkeys
 The maximum number of keys supported.
 Read-only,
 .It Va fastopen.maxpsks
 The maximum number of pre-shared keys supported.
 Read-only.
 .It Va fastopen.numkeys
 The current number of keys installed.
 Read-only.
 .It Va fastopen.numpsks
 The current number of pre-shared keys installed.
 Read-only.
 .It Va fastopen.path_disable_time
 When a failure occurs while trying to create a new active (i.e., client) TFO
 connection, new active connections on the same path, as determined by the tuple
 .Brq client_ip, server_ip, server_port ,
 will be forced to be non-TFO for this many seconds.
 Note that the path disable mechanism relies on state stored in client cookie
 cache entries, so it is possible for the disable time for a given path to be
 reduced if the corresponding client cookie cache entry is reused due to resource
 pressure before the disable period has elapsed.
 The default is
 .Dv TCP_FASTOPEN_PATH_DISABLE_TIME_DEFAULT .
 .It Va fastopen.psk_enable
 When non-zero, pre-shared key (PSK) mode is enabled for all TFO servers.
 On the transition from enabled to disabled, all installed pre-shared keys are
 removed.
 The default is 0.
 .It Va fastopen.server_enable
 When zero, no new passive (i.e., server) TFO connections can be created.
 On the transition from enabled to disabled, all installed keys and pre-shared
 keys are removed.
 On the transition from disabled to enabled, if
 .Va fastopen.autokey
 is non-zero and there are no keys installed, a new key will be generated
 immediately.
 The transition from enabled to disabled does not affect any passive TFO
 connections in progress; it only prevents new ones from being established.
 The default is 0.
 .It Va fastopen.setkey
 Install a new key by writing
 .Va net.inet.tcp.fastopen.keylen
 bytes to this sysctl.
 .It Va fastopen.setpsk
 Install a new pre-shared key by writing
 .Va net.inet.tcp.fastopen.keylen
 bytes to this sysctl.
 .It Va finwait2_timeout
 Timeout to use for fast recycling of
 .Tn TCP
 .Dv FIN_WAIT_2
 connections
 .Pq Va fast_finwait2_recycle .
 Defaults to 60 seconds.
 .It Va functions_available
 List of available TCP function blocks (TCP stacks).
 .It Va functions_default
 The default TCP function block (TCP stack).
 .It Va functions_inherit_listen_socket_stack
 Determines whether to inherit listen socket's TCP stack or use the current
 system default TCP stack, as defined by
 .Va functions_default .
 Default is true.
 .It Va hostcache
 The TCP host cache is used to cache connection details and metrics to
 improve future performance of connections between the same hosts.
 At the completion of a TCP connection, a host will cache information
 for the connection for some defined period of time.
 There are a number of
 .Va hostcache
 variables under this node.
 See
 .Va hostcache.enable .
 .It Va hostcache.bucketlimit
 The maximum number of entries for the same hash.
 Defaults to 30.
 .It Va hostcache.cachelimit
 Overall entry limit for hostcache.
 Defaults to
 .Va hashsize
 *
 .Va bucketlimit .
 .It Va hostcache.count
 The current number of entries in the host cache.
 .It Va hostcache.enable
 Enable/disable the host cache:
 .Bl -tag -compact
 .It 0
 Disable the host cache.
 .It 1
 Enable the host cache. (default)
 .El
 .It Va hostcache.expire
 Time in seconds, how long a entry should be kept in the
 host cache since last accessed.
 Defaults to 3600 (1 hour).
 .It Va hostcache.hashsize
 Size of TCP hostcache hashtable.
 This number has to be a power of two, or will be rejected.
 Defaults to 512.
 .It Va hostcache.histo
 Provide a Histogram of the hostcache hash utilization.
 .It Va hostcache.list
 Provide a complete list of all current entries in the host
 cache.
 .It Va hostcache.prune
 Time in seconds between pruning expired host cache entries.
 Defaults to 300 (5 minutes).
 .It Va hostcache.purge
 Expire all entires on next pruning of host cache entries.
 Any non-zero setting will be reset to zero, once the purge
 is running.
 .Bl -tag -compact
 .It 0
 Do not purge all entries when pruning the host cache (default).
 .It 1
 Purge all entries when doing the next pruning.
 .It 2
 Purge all entries and also reseed the hash salt.
 .El
 .It Va hostcache.purgenow
 Immediately purge all entries once set to any value.
 Setting this to 2 will also reseed the hash salt.
 .It Va icmp_may_rst
 Certain
 .Tn ICMP
 unreachable messages may abort connections in
 .Tn SYN-SENT
 state.
 .It Va initcwnd_segments
 Enable the ability to specify initial congestion window in number of segments.
 The default value is 10 as suggested by RFC 6928.
 Changing the value on the fly would not affect connections
 using congestion window from the hostcache.
 Caution:
 This regulates the burst of packets allowed to be sent in the first RTT.
 The value should be relative to the link capacity.
 Start with small values for lower-capacity links.
 Large bursts can cause buffer overruns and packet drops if routers have small
 buffers or the link is experiencing congestion.
 .It Va insecure_rst
 Use criteria defined in RFC793 instead of RFC5961 for accepting RST segments.
 Default is false.
 .It Va insecure_syn
 Use criteria defined in RFC793 instead of RFC5961 for accepting SYN segments.
 Default is false.
 .It Va isn_reseed_interval
 The interval (in seconds) specifying how often the secret data used in
 RFC 1948 initial sequence number calculations should be reseeded.
 By default, this variable is set to zero, indicating that
 no reseeding will occur.
 Reseeding should not be necessary, and will break
 .Dv TIME_WAIT
 recycling for a few minutes.
 .It Va keepcnt
 Number of keepalive probes sent, with no response, before a connection
 is dropped.
 The default is 8 packets.
 .It Va keepidle
 Amount of time, in milliseconds, that the connection must be idle
 before sending keepalive probes (if enabled).
 The default is 7200000 msec (7.2M msec, 2 hours).
 .It Va keepinit
 Timeout, in milliseconds, for new, non-established
 .Tn TCP
 connections.
 The default is 75000 msec (75K msec, 75 sec).
 .It Va keepintvl
 The interval, in milliseconds, between keepalive probes sent to remote
 machines, when no response is received on a
 .Va keepidle
 probe.
 The default is 75000 msec (75K msec, 75 sec).
 .It Va log_in_vain
 Log any connection attempts to ports where there is no socket
 accepting connections.
 The value of 1 limits the logging to
 .Tn SYN
 (connection establishment) packets only.
 A value of 2 results in any
 .Tn TCP
 packets to closed ports being logged.
 Any value not listed above disables the logging
 (default is 0, i.e., the logging is disabled).
 .It Va minmss
 Minimum TCP Maximum Segment Size; used to prevent a denial of service attack
 from an unreasonably low MSS.
 .It Va msl
 The Maximum Segment Lifetime, in milliseconds, for a packet.
 .It Va mssdflt
 The default value used for the TCP Maximum Segment Size
 .Pq Dq MSS
 for IPv4 when no advice to the contrary is received from MSS negotiation.
 .It Va newcwd
 Enable the New Congestion Window Validation mechanism as described in RFC 7661.
 This gently reduces the congestion window during periods, where TCP is
 application limited and the network bandwidth is not utilized completely.
 That prevents self-inflicted packet losses once the application starts to
 transmit data at a higher speed.
 .It Va nolocaltimewait
 Suppress creation of TCP
 .Dv TIME_WAIT
 states for connections in
 which both endpoints are local.
 .It Va path_mtu_discovery
 Enable Path MTU Discovery.
 .It Va pcbcount
 Number of active protocol control blocks
 (read-only).
 .It Va perconn_stats_enable
 Controls the default collection of statistics for all connections using the
 .Xr stats 3
 framework.
 0 disables, 1 enables, 2 enables random sampling across log id connection
 groups with all connections in a group receiving the same setting.
 .It Va perconn_stats_sample_rates
 A CSV list of template_spec=percent key-value pairs which controls the per
 template sampling rates when
 .Xr stats 3
 sampling is enabled.
 .It Va persmax
 Maximum persistence interval, msec.
 .It Va persmin
 Minimum persistence interval, msec.
 .It Va pmtud_blackhole_detection
 Enable automatic path MTU blackhole detection.
 In case of retransmits of MSS sized segments,
 the OS will lower the MSS to check if it's an MTU problem.
 If the current MSS is greater than the configured value to try
 .Po Va net.inet.tcp.pmtud_blackhole_mss
 and
 .Va net.inet.tcp.v6pmtud_blackhole_mss
 .Pc ,
 it will be set to this value, otherwise,
 the MSS will be set to the default values
 .Po Va net.inet.tcp.mssdflt
 and
 .Va net.inet.tcp.v6mssdflt
 .Pc .
 Settings:
 .Bl -tag -compact
 .It 0
 Disable path MTU blackhole detection.
 .It 1
 Enable path MTU blackhole detection for IPv4 and IPv6.
 .It 2
 Enable path MTU blackhole detection only for IPv4.
 .It 3
 Enable path MTU blackhole detection only for IPv6.
 .El
 .It Va pmtud_blackhole_mss
 MSS to try for IPv4 if PMTU blackhole detection is turned on.
 .It Va reass.cursegments
 The current total number of segments present in all reassembly queues.
 .It Va reass.maxqueuelen
 The maximum number of segments allowed in each reassembly queue.
 By default, the system chooses a limit based on each TCP connection's
 receive buffer size and maximum segment size (MSS).
 The actual limit applied to a session's reassembly queue will be the lower of
 the system-calculated automatic limit and the user-specified
 .Va reass.maxqueuelen
 limit.
 .It Va reass.maxsegments
 The maximum limit on the total number of segments across all reassembly
 queues.
 The limit can be adjusted as a tunable.
 .It Va recvbuf_auto
 Enable automatic receive buffer sizing as a connection progresses.
 .It Va recvbuf_max
 Maximum size of automatic receive buffer.
 .It Va recvspace
 Initial
 .Tn TCP
 receive window (buffer size).
 .It Va retries
 Maximum number of consecutive timer based retransmits sent after a data
 segment is lost (default and maximum is 12).
 .It Va rexmit_drop_options
 Drop TCP options from third and later retransmitted SYN segments
 of a connection.
 .It Va rexmit_initial , rexmit_min , rexmit_slop
 Adjust the retransmit timer calculation for
 .Tn TCP .
 The slop is
 typically added to the raw calculation to take into account
 occasional variances that the
 .Tn SRTT
 (smoothed round-trip time)
 is unable to accommodate, while the minimum specifies an
 absolute minimum.
 While a number of
 .Tn TCP
 RFCs suggest a 1
 second minimum, these RFCs tend to focus on streaming behavior,
 and fail to deal with the fact that a 1 second minimum has severe
 detrimental effects over lossy interactive connections, such
 as a 802.11b wireless link, and over very fast but lossy
 connections for those cases not covered by the fast retransmit
 code.
 For this reason, we use 200ms of slop and a near-0
 minimum, which gives us an effective minimum of 200ms (similar to
 .Tn Linux ) .
 The initial value is used before an RTT measurement has been performed.
 .It Va rfc1323
 Implement the window scaling and timestamp options of RFC 1323/RFC 7323
 (default is 1).
 Settings:
 .Bl -tag -compact
 .It 0
 Disable window scaling and timestamp option.
 .It 1
 Enable window scaling and timestamp option.
 .It 2
 Enable only window scaling.
 .It 3
 Enable only timestamp option.
 .El
 .It Va rfc3042
 Enable the Limited Transmit algorithm as described in RFC 3042.
 It helps avoid timeouts on lossy links and also when the congestion window
 is small, as happens on short transfers.
 .It Va rfc3390
 Enable support for RFC 3390, which allows for a variable-sized
 starting congestion window on new connections, depending on the
 maximum segment size.
 This helps throughput in general, but
 particularly affects short transfers and high-bandwidth large
 propagation-delay connections.
 .It Va rfc6675_pipe
 Deprecated and superseded by
 .Va sack.revised
 .It Va sack.enable
 Enable support for RFC 2018, TCP Selective Acknowledgment option,
 which allows the receiver to inform the sender about all successfully
 arrived segments, allowing the sender to retransmit the missing segments
 only.
 .It Va sack.globalholes
 Global number of TCP SACK holes currently allocated.
 .It Va sack.globalmaxholes
 Maximum number of SACK holes per system, across all connections.
 Defaults to 65536.
 .It Va sack.maxholes
 Maximum number of SACK holes per connection.
 Defaults to 128.
 .It Va sack.revised
 Enables three updated mechanisms from RFC6675 (default is true).
 Calculate the bytes in flight using the algorithm described in RFC 6675, and
 is also an improvement when Proportional Rate Reduction is enabled.
 Next, Rescue Retransmission helps timely loss recovery, when the trailing segments
 of a transmission are lost, while no additional data is ready to be sent.
 In case a partial ACK without a SACK block is received during SACK loss
 recovery, the trailing segment is immediately resent, rather than waiting
 for a Retransmission timeout.
 Finally, SACK loss recovery is also engaged, once two segments plus one byte are
 SACKed - even if no traditional duplicate ACKs were observed.
 .It Va sendbuf_auto
 Enable automatic send buffer sizing.
 .It Va sendbuf_auto_lowat
 Modify threshold for auto send buffer growth to account for
 .Dv SO_SNDLOWAT .
 .It Va sendbuf_inc
 Incrementor step size of automatic send buffer.
 .It Va sendbuf_max
 Maximum size of automatic send buffer.
 .It Va sendspace
 Initial
 .Tn TCP
 send window (buffer size).
 .It Va syncache
 Variables under the
 .Va net.inet.tcp.syncache
 node are documented in
 .Xr syncache 4 .
 .It Va syncookies
 Determines whether or not
 .Tn SYN
 cookies should be generated for outbound
 .Tn SYN-ACK
 packets.
 .Tn SYN
 cookies are a great help during
 .Tn SYN
 flood attacks, and are enabled by default.
 (See
 .Xr syncookies 4 . )
 .It Va syncookies_only
 See
 .Xr syncookies 4 .
 .It Va tcbhashsize
 Size of the
 .Tn TCP
 control-block hash table
 (read-only).
 This is tuned using the kernel option
 .Dv TCBHASHSIZE
 or by setting
 .Va net.inet.tcp.tcbhashsize
 in the
 .Xr loader 8 .
 .It Va tolerate_missing_ts
 Tolerate the missing of timestamps (RFC 1323/RFC 7323) for
 .Tn TCP
 segments belonging to
 .Tn TCP
 connections for which support of
 .Tn TCP
 timestamps has been negotiated.
 As of June 2021, several TCP stacks are known to violate RFC 7323, including
 modern widely deployed ones.
 Therefore the default is 1, i.e., the missing of timestamps is tolerated.
 .It Va ts_offset_per_conn
 When initializing the TCP timestamps, use a per connection offset instead of a
 per host pair offset.
 Default is to use per connection offsets as recommended in RFC 7323.
 .It Va tso
 Enable TCP Segmentation Offload.
 .It Va udp_tunneling_overhead
 The overhead taken into account when using UDP encapsulation.
 Since MSS clamping by middleboxes will most likely not work, values larger than
 8 (the size of the UDP header) are also supported.
 Supported values are between 8 and 1024.
 The default is 8.
 .It Va udp_tunneling_port
 The local UDP encapsulation port.
 A value of 0 indicates that UDP encapsulation is disabled.
 The default is 0.
 .It Va v6mssdflt
 The default value used for the TCP Maximum Segment Size
 .Pq Dq MSS
 for IPv6 when no advice to the contrary is received from MSS negotiation.
 .It Va v6pmtud_blackhole_mss
 MSS to try for IPv6 if PMTU blackhole detection is turned on.
 See
 .Va pmtud_blackhole_detection .
 .El
 .Sh ERRORS
 A socket operation may fail with one of the following errors returned:
 .Bl -tag -width Er
 .It Bq Er EISCONN
 when trying to establish a connection on a socket which
 already has one;
 .It Bo Er ENOBUFS Bc or Bo Er ENOMEM Bc
 when the system runs out of memory for
 an internal data structure;
 .It Bq Er ETIMEDOUT
 when a connection was dropped
 due to excessive retransmissions;
 .It Bq Er ECONNRESET
 when the remote peer
 forces the connection to be closed;
 .It Bq Er ECONNREFUSED
 when the remote
 peer actively refuses connection establishment (usually because
 no process is listening to the port);
 .It Bq Er EADDRINUSE
 when an attempt
 is made to create a socket with a port which has already been
 allocated;
 .It Bq Er EADDRNOTAVAIL
 when an attempt is made to create a
 socket with a network address for which no network interface
 exists;
 .It Bq Er EAFNOSUPPORT
 when an attempt is made to bind or connect a socket to a multicast
 address.
 .It Bq Er EINVAL
 when trying to change TCP function blocks at an invalid point in the session;
 .It Bq Er ENOENT
 when trying to use a TCP function block that is not available;
 .El
 .Sh SEE ALSO
 .Xr getsockopt 2 ,
 .Xr socket 2 ,
 .Xr stats 3 ,
 .Xr sysctl 3 ,
 .Xr blackhole 4 ,
 .Xr inet 4 ,
 .Xr intro 4 ,
 .Xr ip 4 ,
 .Xr ktls 4 ,
 .Xr mod_cc 4 ,
 .Xr siftr 4 ,
 .Xr syncache 4 ,
 .Xr tcp_bbr 4 ,
 .Xr tcp_rack 4 ,
 .Xr setkey 8 ,
 .Xr sysctl 8 ,
 .Xr tcp_functions 9
 .Rs
 .%A "V. Jacobson"
 .%A "B. Braden"
 .%A "D. Borman"
 .%T "TCP Extensions for High Performance"
 .%O "RFC 1323"
 .Re
 .Rs
 .%A "D. Borman"
 .%A "B. Braden"
 .%A "V. Jacobson"
 .%A "R. Scheffenegger"
 .%T "TCP Extensions for High Performance"
 .%O "RFC 7323"
 .Re
 .Rs
 .%A "A. Heffernan"
 .%T "Protection of BGP Sessions via the TCP MD5 Signature Option"
 .%O "RFC 2385"
 .Re
 .Rs
 .%A "K. Ramakrishnan"
 .%A "S. Floyd"
 .%A "D. Black"
 .%T "The Addition of Explicit Congestion Notification (ECN) to IP"
 .%O "RFC 3168"
 .Re
 .Sh HISTORY
 The
 .Tn TCP
 protocol appeared in
 .Bx 4.2 .
 The RFC 1323 extensions for window scaling and timestamps were added
 in
 .Bx 4.4 .
 The
 .Dv TCP_INFO
 option was introduced in
 .Tn Linux 2.6
 and is
 .Em subject to change .
diff --git a/sys/netinet/tcp_input.c b/sys/netinet/tcp_input.c
index ba11503955dc..3673bc1e0b98 100644
--- a/sys/netinet/tcp_input.c
+++ b/sys/netinet/tcp_input.c
@@ -1,4083 +1,4098 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
  *	The Regents of the University of California.  All rights reserved.
  * Copyright (c) 2007-2008,2010
  *	Swinburne University of Technology, Melbourne, Australia.
  * Copyright (c) 2009-2010 Lawrence Stewart <lstewart@freebsd.org>
  * Copyright (c) 2010 The FreeBSD Foundation
  * Copyright (c) 2010-2011 Juniper Networks, Inc.
  * All rights reserved.
  *
  * Portions of this software were developed at the Centre for Advanced Internet
  * Architectures, Swinburne University of Technology, by Lawrence Stewart,
  * James Healy and David Hayes, made possible in part by a grant from the Cisco
  * University Research Program Fund at Community Foundation Silicon Valley.
  *
  * Portions of this software were developed at the Centre for Advanced
  * Internet Architectures, Swinburne University of Technology, Melbourne,
  * Australia by David Hayes under sponsorship from the FreeBSD Foundation.
  *
  * Portions of this software were developed by Robert N. M. Watson under
  * contract to Juniper Networks, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)tcp_input.c	8.12 (Berkeley) 5/24/95
  */
 
 #include <sys/cdefs.h>
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_ipsec.h"
 #include "opt_rss.h"
 
 #include <sys/param.h>
 #include <sys/arb.h>
 #include <sys/kernel.h>
 #ifdef TCP_HHOOK
 #include <sys/hhook.h>
 #endif
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/proc.h>		/* for proc0 declaration */
 #include <sys/protosw.h>
 #include <sys/qmath.h>
 #include <sys/sdt.h>
 #include <sys/signalvar.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sysctl.h>
 #include <sys/syslog.h>
 #include <sys/systm.h>
 #include <sys/stats.h>
 
 #include <machine/cpu.h>	/* before tcp_seq.h, for tcp_random18() */
 
 #include <vm/uma.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/route.h>
 #include <net/rss_config.h>
 #include <net/vnet.h>
 
 #define TCPSTATES		/* for logging */
 
 #include <netinet/in.h>
 #include <netinet/in_kdtrace.h>
 #include <netinet/in_pcb.h>
 #include <netinet/in_rss.h>
 #include <netinet/in_systm.h>
 #include <netinet/ip.h>
 #include <netinet/ip_icmp.h>	/* required for icmp_var.h */
 #include <netinet/icmp_var.h>	/* for ICMP_BANDLIM */
 #include <netinet/ip_var.h>
 #include <netinet/ip_options.h>
 #include <netinet/ip6.h>
 #include <netinet/icmp6.h>
 #include <netinet6/in6_pcb.h>
 #include <netinet6/in6_rss.h>
 #include <netinet6/in6_var.h>
 #include <netinet6/ip6_var.h>
 #include <netinet6/nd6.h>
 #include <netinet/tcp.h>
 #include <netinet/tcp_fsm.h>
 #include <netinet/tcp_seq.h>
 #include <netinet/tcp_timer.h>
 #include <netinet/tcp_var.h>
 #include <netinet/tcp_log_buf.h>
 #include <netinet6/tcp6_var.h>
 #include <netinet/tcpip.h>
 #include <netinet/cc/cc.h>
 #include <netinet/tcp_fastopen.h>
 #ifdef TCPPCAP
 #include <netinet/tcp_pcap.h>
 #endif
 #include <netinet/tcp_syncache.h>
 #ifdef TCP_OFFLOAD
 #include <netinet/tcp_offload.h>
 #endif
 #include <netinet/tcp_ecn.h>
 #include <netinet/udp.h>
 
 #include <netipsec/ipsec_support.h>
 
 #include <machine/in_cksum.h>
 
 #include <security/mac/mac_framework.h>
 
 const int tcprexmtthresh = 3;
 
 VNET_DEFINE(int, tcp_log_in_vain) = 0;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_in_vain, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(tcp_log_in_vain), 0,
     "Log all incoming TCP segments to closed ports");
 
 VNET_DEFINE(int, blackhole) = 0;
 #define	V_blackhole		VNET(blackhole)
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, blackhole, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(blackhole), 0,
     "Do not send RST on segments to closed ports");
 
 VNET_DEFINE(bool, blackhole_local) = false;
 #define	V_blackhole_local	VNET(blackhole_local)
 SYSCTL_BOOL(_net_inet_tcp, OID_AUTO, blackhole_local, CTLFLAG_VNET |
     CTLFLAG_RW, &VNET_NAME(blackhole_local), false,
     "Enforce net.inet.tcp.blackhole for locally originated packets");
 
 VNET_DEFINE(int, tcp_delack_enabled) = 1;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, delayed_ack, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(tcp_delack_enabled), 0,
     "Delay ACK to try and piggyback it onto a data packet");
 
 VNET_DEFINE(int, drop_synfin) = 0;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, drop_synfin, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(drop_synfin), 0,
     "Drop TCP packets with SYN+FIN set");
 
-VNET_DEFINE(int, tcp_do_prr_conservative) = 0;
-SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_prr_conservative, CTLFLAG_VNET | CTLFLAG_RW,
-    &VNET_NAME(tcp_do_prr_conservative), 0,
-    "Do conservative Proportional Rate Reduction");
-
 VNET_DEFINE(int, tcp_do_prr) = 1;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_prr, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(tcp_do_prr), 1,
     "Enable Proportional Rate Reduction per RFC 6937");
 
 VNET_DEFINE(int, tcp_do_lrd) = 0;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_lrd, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(tcp_do_lrd), 1,
     "Perform Lost Retransmission Detection");
 
 VNET_DEFINE(int, tcp_do_newcwv) = 0;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, newcwv, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(tcp_do_newcwv), 0,
     "Enable New Congestion Window Validation per RFC7661");
 
 VNET_DEFINE(int, tcp_do_rfc3042) = 1;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3042, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(tcp_do_rfc3042), 0,
     "Enable RFC 3042 (Limited Transmit)");
 
 VNET_DEFINE(int, tcp_do_rfc3390) = 1;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3390, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(tcp_do_rfc3390), 0,
     "Enable RFC 3390 (Increasing TCP's Initial Congestion Window)");
 
 VNET_DEFINE(int, tcp_initcwnd_segments) = 10;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, initcwnd_segments,
     CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_initcwnd_segments), 0,
     "Slow-start flight size (initial congestion window) in number of segments");
 
 VNET_DEFINE(int, tcp_do_rfc3465) = 1;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3465, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(tcp_do_rfc3465), 0,
     "Enable RFC 3465 (Appropriate Byte Counting)");
 
 VNET_DEFINE(int, tcp_abc_l_var) = 2;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, abc_l_var, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(tcp_abc_l_var), 2,
     "Cap the max cwnd increment during slow-start to this number of segments");
 
 VNET_DEFINE(int, tcp_insecure_syn) = 0;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, insecure_syn, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(tcp_insecure_syn), 0,
     "Follow RFC793 instead of RFC5961 criteria for accepting SYN packets");
 
 VNET_DEFINE(int, tcp_insecure_rst) = 0;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, insecure_rst, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(tcp_insecure_rst), 0,
     "Follow RFC793 instead of RFC5961 criteria for accepting RST packets");
 
 VNET_DEFINE(int, tcp_recvspace) = 1024*64;
 #define	V_tcp_recvspace	VNET(tcp_recvspace)
 SYSCTL_INT(_net_inet_tcp, TCPCTL_RECVSPACE, recvspace, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(tcp_recvspace), 0, "Initial receive socket buffer size");
 
 VNET_DEFINE(int, tcp_do_autorcvbuf) = 1;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_auto, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(tcp_do_autorcvbuf), 0,
     "Enable automatic receive buffer sizing");
 
 VNET_DEFINE(int, tcp_autorcvbuf_max) = 2*1024*1024;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_max, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(tcp_autorcvbuf_max), 0,
     "Max size of automatic receive buffer");
 
 VNET_DEFINE(struct inpcbinfo, tcbinfo);
 
 /*
  * TCP statistics are stored in an array of counter(9)s, which size matches
  * size of struct tcpstat.  TCP running connection count is a regular array.
  */
 VNET_PCPUSTAT_DEFINE(struct tcpstat, tcpstat);
 SYSCTL_VNET_PCPUSTAT(_net_inet_tcp, TCPCTL_STATS, stats, struct tcpstat,
     tcpstat, "TCP statistics (struct tcpstat, netinet/tcp_var.h)");
 VNET_DEFINE(counter_u64_t, tcps_states[TCP_NSTATES]);
 SYSCTL_COUNTER_U64_ARRAY(_net_inet_tcp, TCPCTL_STATES, states, CTLFLAG_RD |
     CTLFLAG_VNET, &VNET_NAME(tcps_states)[0], TCP_NSTATES,
     "TCP connection counts by TCP state");
 
 /*
  * Kernel module interface for updating tcpstat.  The first argument is an index
  * into tcpstat treated as an array.
  */
 void
 kmod_tcpstat_add(int statnum, int val)
 {
 
 	counter_u64_add(VNET(tcpstat)[statnum], val);
 }
 
 /*
  * Make sure that we only start a SACK loss recovery when
  * receiving a duplicate ACK with a SACK block, and also
  * complete SACK loss recovery in case the other end
  * reneges.
  */
 static bool inline
 tcp_is_sack_recovery(struct tcpcb *tp, struct tcpopt *to)
 {
 	return ((tp->t_flags & TF_SACK_PERMIT) &&
 		((to->to_flags & TOF_SACK) ||
 		(!TAILQ_EMPTY(&tp->snd_holes))));
 }
 
 #ifdef TCP_HHOOK
 /*
  * Wrapper for the TCP established input helper hook.
  */
 void
 hhook_run_tcp_est_in(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to)
 {
 	struct tcp_hhook_data hhook_data;
 
 	if (V_tcp_hhh[HHOOK_TCP_EST_IN]->hhh_nhooks > 0) {
 		hhook_data.tp = tp;
 		hhook_data.th = th;
 		hhook_data.to = to;
 
 		hhook_run_hooks(V_tcp_hhh[HHOOK_TCP_EST_IN], &hhook_data,
 		    &tp->t_osd);
 	}
 }
 #endif
 
 /*
  * CC wrapper hook functions
  */
 void
 cc_ack_received(struct tcpcb *tp, struct tcphdr *th, uint16_t nsegs,
     uint16_t type)
 {
 #ifdef STATS
 	int32_t gput;
 #endif
 
 	INP_WLOCK_ASSERT(tptoinpcb(tp));
 
 	tp->t_ccv.nsegs = nsegs;
 	tp->t_ccv.bytes_this_ack = BYTES_THIS_ACK(tp, th);
 	if ((!V_tcp_do_newcwv && (tp->snd_cwnd <= tp->snd_wnd)) ||
 	    (V_tcp_do_newcwv && (tp->snd_cwnd <= tp->snd_wnd) &&
 	     (tp->snd_cwnd < (tcp_compute_pipe(tp) * 2))))
 		tp->t_ccv.flags |= CCF_CWND_LIMITED;
 	else
 		tp->t_ccv.flags &= ~CCF_CWND_LIMITED;
 
 	if (type == CC_ACK) {
 #ifdef STATS
 		stats_voi_update_abs_s32(tp->t_stats, VOI_TCP_CALCFRWINDIFF,
 		    ((int32_t)tp->snd_cwnd) - tp->snd_wnd);
 		if (!IN_RECOVERY(tp->t_flags))
 			stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_ACKLEN,
 			   tp->t_ccv.bytes_this_ack / (tcp_maxseg(tp) * nsegs));
 		if ((tp->t_flags & TF_GPUTINPROG) &&
 		    SEQ_GEQ(th->th_ack, tp->gput_ack)) {
 			/*
 			 * Compute goodput in bits per millisecond.
 			 */
 			gput = (((int64_t)SEQ_SUB(th->th_ack, tp->gput_seq)) << 3) /
 			    max(1, tcp_ts_getticks() - tp->gput_ts);
 			stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_GPUT,
 			    gput);
 			/*
 			 * XXXLAS: This is a temporary hack, and should be
 			 * chained off VOI_TCP_GPUT when stats(9) grows an API
 			 * to deal with chained VOIs.
 			 */
 			if (tp->t_stats_gput_prev > 0)
 				stats_voi_update_abs_s32(tp->t_stats,
 				    VOI_TCP_GPUT_ND,
 				    ((gput - tp->t_stats_gput_prev) * 100) /
 				    tp->t_stats_gput_prev);
 			tp->t_flags &= ~TF_GPUTINPROG;
 			tp->t_stats_gput_prev = gput;
 		}
 #endif /* STATS */
 		if (tp->snd_cwnd > tp->snd_ssthresh) {
 			tp->t_bytes_acked += tp->t_ccv.bytes_this_ack;
 			if (tp->t_bytes_acked >= tp->snd_cwnd) {
 				tp->t_bytes_acked -= tp->snd_cwnd;
 				tp->t_ccv.flags |= CCF_ABC_SENTAWND;
 			}
 		} else {
 				tp->t_ccv.flags &= ~CCF_ABC_SENTAWND;
 				tp->t_bytes_acked = 0;
 		}
 	}
 
 	if (CC_ALGO(tp)->ack_received != NULL) {
 		/* XXXLAS: Find a way to live without this */
 		tp->t_ccv.curack = th->th_ack;
 		CC_ALGO(tp)->ack_received(&tp->t_ccv, type);
 	}
 #ifdef STATS
 	stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_LCWIN, tp->snd_cwnd);
 #endif
 }
 
 void
 cc_conn_init(struct tcpcb *tp)
 {
 	struct hc_metrics_lite metrics;
 	struct inpcb *inp = tptoinpcb(tp);
 	u_int maxseg;
 	int rtt;
 
 	INP_WLOCK_ASSERT(inp);
 
 	tcp_hc_get(&inp->inp_inc, &metrics);
 	maxseg = tcp_maxseg(tp);
 
 	if (tp->t_srtt == 0 && (rtt = metrics.rmx_rtt)) {
 		tp->t_srtt = rtt;
 		TCPSTAT_INC(tcps_usedrtt);
 		if (metrics.rmx_rttvar) {
 			tp->t_rttvar = metrics.rmx_rttvar;
 			TCPSTAT_INC(tcps_usedrttvar);
 		} else {
 			/* default variation is +- 1 rtt */
 			tp->t_rttvar =
 			    tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE;
 		}
 		TCPT_RANGESET(tp->t_rxtcur,
 		    ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1,
 		    tp->t_rttmin, TCPTV_REXMTMAX);
 	}
 	if (metrics.rmx_ssthresh) {
 		/*
 		 * There's some sort of gateway or interface
 		 * buffer limit on the path.  Use this to set
 		 * the slow start threshold, but set the
 		 * threshold to no less than 2*mss.
 		 */
 		tp->snd_ssthresh = max(2 * maxseg, metrics.rmx_ssthresh);
 		TCPSTAT_INC(tcps_usedssthresh);
 	}
 
 	/*
 	 * Set the initial slow-start flight size.
 	 *
 	 * If a SYN or SYN/ACK was lost and retransmitted, we have to
 	 * reduce the initial CWND to one segment as congestion is likely
 	 * requiring us to be cautious.
 	 */
 	if (tp->snd_cwnd == 1)
 		tp->snd_cwnd = maxseg;		/* SYN(-ACK) lost */
 	else
 		tp->snd_cwnd = tcp_compute_initwnd(maxseg);
 
 	if (CC_ALGO(tp)->conn_init != NULL)
 		CC_ALGO(tp)->conn_init(&tp->t_ccv);
 }
 
 void inline
 cc_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type)
 {
 	INP_WLOCK_ASSERT(tptoinpcb(tp));
 
 #ifdef STATS
 	stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_CSIG, type);
 #endif
 
 	switch(type) {
 	case CC_NDUPACK:
 		if (!IN_FASTRECOVERY(tp->t_flags)) {
 			tp->snd_recover = tp->snd_max;
 			if (tp->t_flags2 & TF2_ECN_PERMIT)
 				tp->t_flags2 |= TF2_ECN_SND_CWR;
 		}
 		break;
 	case CC_ECN:
 		if (!IN_CONGRECOVERY(tp->t_flags) ||
 		    /*
 		     * Allow ECN reaction on ACK to CWR, if
 		     * that data segment was also CE marked.
 		     */
 		    SEQ_GEQ(th->th_ack, tp->snd_recover)) {
 			EXIT_CONGRECOVERY(tp->t_flags);
 			TCPSTAT_INC(tcps_ecn_rcwnd);
 			tp->snd_recover = tp->snd_max + 1;
 			if (tp->t_flags2 & TF2_ECN_PERMIT)
 				tp->t_flags2 |= TF2_ECN_SND_CWR;
 		}
 		break;
 	case CC_RTO:
 		tp->t_dupacks = 0;
 		tp->t_bytes_acked = 0;
 		EXIT_RECOVERY(tp->t_flags);
 		if (tp->t_flags2 & TF2_ECN_PERMIT)
 			tp->t_flags2 |= TF2_ECN_SND_CWR;
 		break;
 	case CC_RTO_ERR:
 		TCPSTAT_INC(tcps_sndrexmitbad);
 		/* RTO was unnecessary, so reset everything. */
 		tp->snd_cwnd = tp->snd_cwnd_prev;
 		tp->snd_ssthresh = tp->snd_ssthresh_prev;
 		tp->snd_recover = tp->snd_recover_prev;
 		if (tp->t_flags & TF_WASFRECOVERY)
 			ENTER_FASTRECOVERY(tp->t_flags);
 		if (tp->t_flags & TF_WASCRECOVERY)
 			ENTER_CONGRECOVERY(tp->t_flags);
 		tp->snd_nxt = tp->snd_max;
 		tp->t_flags &= ~TF_PREVVALID;
 		tp->t_badrxtwin = 0;
 		break;
 	}
 
 	if (CC_ALGO(tp)->cong_signal != NULL) {
 		if (th != NULL)
 			tp->t_ccv.curack = th->th_ack;
 		CC_ALGO(tp)->cong_signal(&tp->t_ccv, type);
 	}
 }
 
 void inline
 cc_post_recovery(struct tcpcb *tp, struct tcphdr *th)
 {
 	INP_WLOCK_ASSERT(tptoinpcb(tp));
 
 	/* XXXLAS: KASSERT that we're in recovery? */
 
 	if (CC_ALGO(tp)->post_recovery != NULL) {
 		tp->t_ccv.curack = th->th_ack;
 		CC_ALGO(tp)->post_recovery(&tp->t_ccv);
 	}
 	/* XXXLAS: EXIT_RECOVERY ? */
 	tp->t_bytes_acked = 0;
 	tp->sackhint.delivered_data = 0;
 	tp->sackhint.prr_out = 0;
 }
 
 /*
  * Indicate whether this ack should be delayed.  We can delay the ack if
  * following conditions are met:
  *	- There is no delayed ack timer in progress.
  *	- Our last ack wasn't a 0-sized window. We never want to delay
  *	  the ack that opens up a 0-sized window.
  *	- LRO wasn't used for this segment. We make sure by checking that the
  *	  segment size is not larger than the MSS.
  */
 #define DELAY_ACK(tp, tlen)						\
 	((!tcp_timer_active(tp, TT_DELACK) &&				\
 	    (tp->t_flags & TF_RXWIN0SENT) == 0) &&			\
 	    (tlen <= tp->t_maxseg) &&					\
 	    (V_tcp_delack_enabled || (tp->t_flags & TF_NEEDSYN)))
 
 void inline
 cc_ecnpkt_handler_flags(struct tcpcb *tp, uint16_t flags, uint8_t iptos)
 {
 	INP_WLOCK_ASSERT(tptoinpcb(tp));
 
 	if (CC_ALGO(tp)->ecnpkt_handler != NULL) {
 		switch (iptos & IPTOS_ECN_MASK) {
 		case IPTOS_ECN_CE:
 			tp->t_ccv.flags |= CCF_IPHDR_CE;
 			break;
 		case IPTOS_ECN_ECT0:
 			/* FALLTHROUGH */
 		case IPTOS_ECN_ECT1:
 			/* FALLTHROUGH */
 		case IPTOS_ECN_NOTECT:
 			tp->t_ccv.flags &= ~CCF_IPHDR_CE;
 			break;
 		}
 
 		if (flags & TH_CWR)
 			tp->t_ccv.flags |= CCF_TCPHDR_CWR;
 		else
 			tp->t_ccv.flags &= ~CCF_TCPHDR_CWR;
 
 		CC_ALGO(tp)->ecnpkt_handler(&tp->t_ccv);
 
 		if (tp->t_ccv.flags & CCF_ACKNOW) {
 			tcp_timer_activate(tp, TT_DELACK, tcp_delacktime);
 			tp->t_flags |= TF_ACKNOW;
 		}
 	}
 }
 
 void inline
 cc_ecnpkt_handler(struct tcpcb *tp, struct tcphdr *th, uint8_t iptos)
 {
 	cc_ecnpkt_handler_flags(tp, tcp_get_flags(th), iptos);
 }
 
 /*
  * TCP input handling is split into multiple parts:
  *   tcp6_input is a thin wrapper around tcp_input for the extended
  *	ip6_protox[] call format in ip6_input
  *   tcp_input handles primary segment validation, inpcb lookup and
  *	SYN processing on listen sockets
  *   tcp_do_segment processes the ACK and text of the segment for
  *	establishing, established and closing connections
  */
 #ifdef INET6
 int
 tcp6_input_with_port(struct mbuf **mp, int *offp, int proto, uint16_t port)
 {
 	struct mbuf *m;
 	struct in6_ifaddr *ia6;
 	struct ip6_hdr *ip6;
 
 	m = *mp;
 	if (m->m_len < *offp + sizeof(struct tcphdr)) {
 		m = m_pullup(m, *offp + sizeof(struct tcphdr));
 		if (m == NULL) {
 			*mp = m;
 			TCPSTAT_INC(tcps_rcvshort);
 			return (IPPROTO_DONE);
 		}
 	}
 
 	/*
 	 * draft-itojun-ipv6-tcp-to-anycast
 	 * better place to put this in?
 	 */
 	ip6 = mtod(m, struct ip6_hdr *);
 	ia6 = in6ifa_ifwithaddr(&ip6->ip6_dst, 0 /* XXX */, false);
 	if (ia6 && (ia6->ia6_flags & IN6_IFF_ANYCAST)) {
 		icmp6_error(m, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_ADDR,
 			    (caddr_t)&ip6->ip6_dst - (caddr_t)ip6);
 		*mp = NULL;
 		return (IPPROTO_DONE);
 	}
 
 	*mp = m;
 	return (tcp_input_with_port(mp, offp, proto, port));
 }
 
 int
 tcp6_input(struct mbuf **mp, int *offp, int proto)
 {
 
 	return(tcp6_input_with_port(mp, offp, proto, 0));
 }
 #endif /* INET6 */
 
 int
 tcp_input_with_port(struct mbuf **mp, int *offp, int proto, uint16_t port)
 {
 	struct mbuf *m = *mp;
 	struct tcphdr *th = NULL;
 	struct ip *ip = NULL;
 	struct inpcb *inp = NULL;
 	struct tcpcb *tp = NULL;
 	struct socket *so = NULL;
 	u_char *optp = NULL;
 	int off0;
 	int optlen = 0;
 #ifdef INET
 	int len;
 	uint8_t ipttl;
 #endif
 	int tlen = 0, off;
 	int drop_hdrlen;
 	int thflags;
 	int rstreason = 0;	/* For badport_bandlim accounting purposes */
 	int lookupflag;
 	uint8_t iptos;
 	struct m_tag *fwd_tag = NULL;
 #ifdef INET6
 	struct ip6_hdr *ip6 = NULL;
 	int isipv6;
 #else
 	const void *ip6 = NULL;
 #endif /* INET6 */
 	struct tcpopt to;		/* options in this segment */
 	char *s = NULL;			/* address and port logging */
 
 	NET_EPOCH_ASSERT();
 
 #ifdef INET6
 	isipv6 = (mtod(m, struct ip *)->ip_v == 6) ? 1 : 0;
 #endif
 
 	off0 = *offp;
 	m = *mp;
 	*mp = NULL;
 	to.to_flags = 0;
 	TCPSTAT_INC(tcps_rcvtotal);
 
 #ifdef INET6
 	if (isipv6) {
 		ip6 = mtod(m, struct ip6_hdr *);
 		th = (struct tcphdr *)((caddr_t)ip6 + off0);
 		tlen = sizeof(*ip6) + ntohs(ip6->ip6_plen) - off0;
 		if (port)
 			goto skip6_csum;
 		if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID_IPV6) {
 			if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR)
 				th->th_sum = m->m_pkthdr.csum_data;
 			else
 				th->th_sum = in6_cksum_pseudo(ip6, tlen,
 				    IPPROTO_TCP, m->m_pkthdr.csum_data);
 			th->th_sum ^= 0xffff;
 		} else
 			th->th_sum = in6_cksum(m, IPPROTO_TCP, off0, tlen);
 		if (th->th_sum) {
 			TCPSTAT_INC(tcps_rcvbadsum);
 			goto drop;
 		}
 	skip6_csum:
 		/*
 		 * Be proactive about unspecified IPv6 address in source.
 		 * As we use all-zero to indicate unbounded/unconnected pcb,
 		 * unspecified IPv6 address can be used to confuse us.
 		 *
 		 * Note that packets with unspecified IPv6 destination is
 		 * already dropped in ip6_input.
 		 */
 		KASSERT(!IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_dst),
 		    ("%s: unspecified destination v6 address", __func__));
 		if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) {
 			IP6STAT_INC(ip6s_badscope); /* XXX */
 			goto drop;
 		}
 		iptos = IPV6_TRAFFIC_CLASS(ip6);
 	}
 #endif
 #if defined(INET) && defined(INET6)
 	else
 #endif
 #ifdef INET
 	{
 		/*
 		 * Get IP and TCP header together in first mbuf.
 		 * Note: IP leaves IP header in first mbuf.
 		 */
 		if (off0 > sizeof (struct ip)) {
 			ip_stripoptions(m);
 			off0 = sizeof(struct ip);
 		}
 		if (m->m_len < sizeof (struct tcpiphdr)) {
 			if ((m = m_pullup(m, sizeof (struct tcpiphdr)))
 			    == NULL) {
 				TCPSTAT_INC(tcps_rcvshort);
 				return (IPPROTO_DONE);
 			}
 		}
 		ip = mtod(m, struct ip *);
 		th = (struct tcphdr *)((caddr_t)ip + off0);
 		tlen = ntohs(ip->ip_len) - off0;
 
 		iptos = ip->ip_tos;
 		if (port)
 			goto skip_csum;
 		if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) {
 			if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR)
 				th->th_sum = m->m_pkthdr.csum_data;
 			else
 				th->th_sum = in_pseudo(ip->ip_src.s_addr,
 				    ip->ip_dst.s_addr,
 				    htonl(m->m_pkthdr.csum_data + tlen +
 				    IPPROTO_TCP));
 			th->th_sum ^= 0xffff;
 		} else {
 			struct ipovly *ipov = (struct ipovly *)ip;
 
 			/*
 			 * Checksum extended TCP header and data.
 			 */
 			len = off0 + tlen;
 			ipttl = ip->ip_ttl;
 			bzero(ipov->ih_x1, sizeof(ipov->ih_x1));
 			ipov->ih_len = htons(tlen);
 			th->th_sum = in_cksum(m, len);
 			/* Reset length for SDT probes. */
 			ip->ip_len = htons(len);
 			/* Reset TOS bits */
 			ip->ip_tos = iptos;
 			/* Re-initialization for later version check */
 			ip->ip_ttl = ipttl;
 			ip->ip_v = IPVERSION;
 			ip->ip_hl = off0 >> 2;
 		}
 	skip_csum:
 		if (th->th_sum && (port == 0)) {
 			TCPSTAT_INC(tcps_rcvbadsum);
 			goto drop;
 		}
 		KASSERT(ip->ip_dst.s_addr != INADDR_ANY,
 		    ("%s: unspecified destination v4 address", __func__));
 		if (__predict_false(ip->ip_src.s_addr == INADDR_ANY)) {
 			IPSTAT_INC(ips_badaddr);
 			goto drop;
 		}
 	}
 #endif /* INET */
 
 	/*
 	 * Check that TCP offset makes sense,
 	 * pull out TCP options and adjust length.		XXX
 	 */
 	off = th->th_off << 2;
 	if (off < sizeof (struct tcphdr) || off > tlen) {
 		TCPSTAT_INC(tcps_rcvbadoff);
 		goto drop;
 	}
 	tlen -= off;	/* tlen is used instead of ti->ti_len */
 	if (off > sizeof (struct tcphdr)) {
 #ifdef INET6
 		if (isipv6) {
 			if (m->m_len < off0 + off) {
 				m = m_pullup(m, off0 + off);
 				if (m == NULL) {
 					TCPSTAT_INC(tcps_rcvshort);
 					return (IPPROTO_DONE);
 				}
 			}
 			ip6 = mtod(m, struct ip6_hdr *);
 			th = (struct tcphdr *)((caddr_t)ip6 + off0);
 		}
 #endif
 #if defined(INET) && defined(INET6)
 		else
 #endif
 #ifdef INET
 		{
 			if (m->m_len < sizeof(struct ip) + off) {
 				if ((m = m_pullup(m, sizeof (struct ip) + off))
 				    == NULL) {
 					TCPSTAT_INC(tcps_rcvshort);
 					return (IPPROTO_DONE);
 				}
 				ip = mtod(m, struct ip *);
 				th = (struct tcphdr *)((caddr_t)ip + off0);
 			}
 		}
 #endif
 		optlen = off - sizeof (struct tcphdr);
 		optp = (u_char *)(th + 1);
 	}
 	thflags = tcp_get_flags(th);
 
 	/*
 	 * Convert TCP protocol specific fields to host format.
 	 */
 	tcp_fields_to_host(th);
 
 	/*
 	 * Delay dropping TCP, IP headers, IPv6 ext headers, and TCP options.
 	 */
 	drop_hdrlen = off0 + off;
 
 	/*
 	 * Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain.
 	 */
         if (
 #ifdef INET6
 	    (isipv6 && (m->m_flags & M_IP6_NEXTHOP))
 #ifdef INET
 	    || (!isipv6 && (m->m_flags & M_IP_NEXTHOP))
 #endif
 #endif
 #if defined(INET) && !defined(INET6)
 	    (m->m_flags & M_IP_NEXTHOP)
 #endif
 	    )
 		fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL);
 
 	/*
 	 * For initial SYN packets we don't need write lock on matching
 	 * PCB, be it a listening one or a synchronized one.  The packet
 	 * shall not modify its state.
 	 */
 	lookupflag = INPLOOKUP_WILDCARD |
 	    ((thflags & (TH_ACK|TH_SYN)) == TH_SYN ?
 	    INPLOOKUP_RLOCKPCB : INPLOOKUP_WLOCKPCB);
 findpcb:
 #ifdef INET6
 	if (isipv6 && fwd_tag != NULL) {
 		struct sockaddr_in6 *next_hop6;
 
 		next_hop6 = (struct sockaddr_in6 *)(fwd_tag + 1);
 		/*
 		 * Transparently forwarded. Pretend to be the destination.
 		 * Already got one like this?
 		 */
 		inp = in6_pcblookup_mbuf(&V_tcbinfo,
 		    &ip6->ip6_src, th->th_sport, &ip6->ip6_dst, th->th_dport,
 		    lookupflag & ~INPLOOKUP_WILDCARD, m->m_pkthdr.rcvif, m);
 		if (!inp) {
 			/*
 			 * It's new.  Try to find the ambushing socket.
 			 * Because we've rewritten the destination address,
 			 * any hardware-generated hash is ignored.
 			 */
 			inp = in6_pcblookup(&V_tcbinfo, &ip6->ip6_src,
 			    th->th_sport, &next_hop6->sin6_addr,
 			    next_hop6->sin6_port ? ntohs(next_hop6->sin6_port) :
 			    th->th_dport, lookupflag, m->m_pkthdr.rcvif);
 		}
 	} else if (isipv6) {
 		inp = in6_pcblookup_mbuf(&V_tcbinfo, &ip6->ip6_src,
 		    th->th_sport, &ip6->ip6_dst, th->th_dport, lookupflag,
 		    m->m_pkthdr.rcvif, m);
 	}
 #endif /* INET6 */
 #if defined(INET6) && defined(INET)
 	else
 #endif
 #ifdef INET
 	if (fwd_tag != NULL) {
 		struct sockaddr_in *next_hop;
 
 		next_hop = (struct sockaddr_in *)(fwd_tag+1);
 		/*
 		 * Transparently forwarded. Pretend to be the destination.
 		 * already got one like this?
 		 */
 		inp = in_pcblookup_mbuf(&V_tcbinfo, ip->ip_src, th->th_sport,
 		    ip->ip_dst, th->th_dport, lookupflag & ~INPLOOKUP_WILDCARD,
 		    m->m_pkthdr.rcvif, m);
 		if (!inp) {
 			/*
 			 * It's new.  Try to find the ambushing socket.
 			 * Because we've rewritten the destination address,
 			 * any hardware-generated hash is ignored.
 			 */
 			inp = in_pcblookup(&V_tcbinfo, ip->ip_src,
 			    th->th_sport, next_hop->sin_addr,
 			    next_hop->sin_port ? ntohs(next_hop->sin_port) :
 			    th->th_dport, lookupflag, m->m_pkthdr.rcvif);
 		}
 	} else
 		inp = in_pcblookup_mbuf(&V_tcbinfo, ip->ip_src,
 		    th->th_sport, ip->ip_dst, th->th_dport, lookupflag,
 		    m->m_pkthdr.rcvif, m);
 #endif /* INET */
 
 	/*
 	 * If the INPCB does not exist then all data in the incoming
 	 * segment is discarded and an appropriate RST is sent back.
 	 * XXX MRT Send RST using which routing table?
 	 */
 	if (inp == NULL) {
 		if (rstreason != 0) {
 			/* We came here after second (safety) lookup. */
 			MPASS((lookupflag & INPLOOKUP_WILDCARD) == 0);
 			goto dropwithreset;
 		}
 		/*
 		 * Log communication attempts to ports that are not
 		 * in use.
 		 */
 		if ((V_tcp_log_in_vain == 1 && (thflags & TH_SYN)) ||
 		    V_tcp_log_in_vain == 2) {
 			if ((s = tcp_log_vain(NULL, th, (void *)ip, ip6)))
 				log(LOG_INFO, "%s; %s: Connection attempt "
 				    "to closed port\n", s, __func__);
 		}
 		/*
 		 * When blackholing do not respond with a RST but
 		 * completely ignore the segment and drop it.
 		 */
 		if (((V_blackhole == 1 && (thflags & TH_SYN)) ||
 		    V_blackhole == 2) && (V_blackhole_local || (
 #ifdef INET6
 		    isipv6 ? !in6_localaddr(&ip6->ip6_src) :
 #endif
 #ifdef INET
 		    !in_localip(ip->ip_src)
 #else
 		    true
 #endif
 		    )))
 			goto dropunlock;
 
 		rstreason = BANDLIM_RST_CLOSEDPORT;
 		goto dropwithreset;
 	}
 	INP_LOCK_ASSERT(inp);
 
 	if ((inp->inp_flowtype == M_HASHTYPE_NONE) &&
 	    !SOLISTENING(inp->inp_socket)) {
 		if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) {
 			inp->inp_flowid = m->m_pkthdr.flowid;
 			inp->inp_flowtype = M_HASHTYPE_GET(m);
 #ifdef	RSS
 		} else {
 			  /* assign flowid by software RSS hash */
 #ifdef INET6
 			  if (isipv6) {
 				rss_proto_software_hash_v6(&inp->in6p_faddr,
 							   &inp->in6p_laddr,
 							   inp->inp_fport,
 							   inp->inp_lport,
 							   IPPROTO_TCP,
 							   &inp->inp_flowid,
 							   &inp->inp_flowtype);
 			  } else
 #endif	/* INET6 */
 			  {
 				rss_proto_software_hash_v4(inp->inp_faddr,
 							   inp->inp_laddr,
 							   inp->inp_fport,
 							   inp->inp_lport,
 							   IPPROTO_TCP,
 							   &inp->inp_flowid,
 							   &inp->inp_flowtype);
 			  }
 #endif	/* RSS */
 		}
 	}
 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
 #ifdef INET6
 	if (isipv6 && IPSEC_ENABLED(ipv6) &&
 	    IPSEC_CHECK_POLICY(ipv6, m, inp) != 0) {
 		goto dropunlock;
 	}
 #ifdef INET
 	else
 #endif
 #endif /* INET6 */
 #ifdef INET
 	if (IPSEC_ENABLED(ipv4) &&
 	    IPSEC_CHECK_POLICY(ipv4, m, inp) != 0) {
 		goto dropunlock;
 	}
 #endif /* INET */
 #endif /* IPSEC */
 
 	/*
 	 * Check the minimum TTL for socket.
 	 */
 	if (inp->inp_ip_minttl != 0) {
 #ifdef INET6
 		if (isipv6) {
 			if (inp->inp_ip_minttl > ip6->ip6_hlim)
 				goto dropunlock;
 		} else
 #endif
 		if (inp->inp_ip_minttl > ip->ip_ttl)
 			goto dropunlock;
 	}
 
 	tp = intotcpcb(inp);
 	switch (tp->t_state) {
 	case TCPS_TIME_WAIT:
 		/*
 		 * A previous connection in TIMEWAIT state is supposed to catch
 		 * stray or duplicate segments arriving late.  If this segment
 		 * was a legitimate new connection attempt, the old INPCB gets
 		 * removed and we can try again to find a listening socket.
 		 */
 		tcp_dooptions(&to, optp, optlen,
 		    (thflags & TH_SYN) ? TO_SYN : 0);
 		/*
 		 * tcp_twcheck unlocks the inp always, and frees the m if fails.
 		 */
 		if (tcp_twcheck(inp, &to, th, m, tlen))
 			goto findpcb;
 		return (IPPROTO_DONE);
 	case TCPS_CLOSED:
 		/*
 		 * The TCPCB may no longer exist if the connection is winding
 		 * down or it is in the CLOSED state.  Either way we drop the
 		 * segment and send an appropriate response.
 		 */
 		rstreason = BANDLIM_RST_CLOSEDPORT;
 		goto dropwithreset;
 	}
 
 	if ((tp->t_port != port) && (tp->t_state > TCPS_LISTEN)) {
 		rstreason = BANDLIM_RST_CLOSEDPORT;
 		goto dropwithreset;
 	}
 
 #ifdef TCP_OFFLOAD
 	if (tp->t_flags & TF_TOE) {
 		tcp_offload_input(tp, m);
 		m = NULL;	/* consumed by the TOE driver */
 		goto dropunlock;
 	}
 #endif
 
 #ifdef MAC
 	if (mac_inpcb_check_deliver(inp, m))
 		goto dropunlock;
 #endif
 	so = inp->inp_socket;
 	KASSERT(so != NULL, ("%s: so == NULL", __func__));
 	/*
 	 * When the socket is accepting connections (the INPCB is in LISTEN
 	 * state) we look into the SYN cache if this is a new connection
 	 * attempt or the completion of a previous one.
 	 */
 	KASSERT(tp->t_state == TCPS_LISTEN || !SOLISTENING(so),
 	    ("%s: so accepting but tp %p not listening", __func__, tp));
 	if (tp->t_state == TCPS_LISTEN && SOLISTENING(so)) {
 		struct in_conninfo inc;
 
 		bzero(&inc, sizeof(inc));
 #ifdef INET6
 		if (isipv6) {
 			inc.inc_flags |= INC_ISIPV6;
 			if (inp->inp_inc.inc_flags & INC_IPV6MINMTU)
 				inc.inc_flags |= INC_IPV6MINMTU;
 			inc.inc6_faddr = ip6->ip6_src;
 			inc.inc6_laddr = ip6->ip6_dst;
 		} else
 #endif
 		{
 			inc.inc_faddr = ip->ip_src;
 			inc.inc_laddr = ip->ip_dst;
 		}
 		inc.inc_fport = th->th_sport;
 		inc.inc_lport = th->th_dport;
 		inc.inc_fibnum = so->so_fibnum;
 
 		/*
 		 * Check for an existing connection attempt in syncache if
 		 * the flag is only ACK.  A successful lookup creates a new
 		 * socket appended to the listen queue in SYN_RECEIVED state.
 		 */
 		if ((thflags & (TH_RST|TH_ACK|TH_SYN)) == TH_ACK) {
 			/*
 			 * Parse the TCP options here because
 			 * syncookies need access to the reflected
 			 * timestamp.
 			 */
 			tcp_dooptions(&to, optp, optlen, 0);
 			/*
 			 * NB: syncache_expand() doesn't unlock inp.
 			 */
 			rstreason = syncache_expand(&inc, &to, th, &so, m, port);
 			if (rstreason < 0) {
 				/*
 				 * A failing TCP MD5 signature comparison
 				 * must result in the segment being dropped
 				 * and must not produce any response back
 				 * to the sender.
 				 */
 				goto dropunlock;
 			} else if (rstreason == 0) {
 				/*
 				 * No syncache entry, or ACK was not for our
 				 * SYN/ACK.  Do our protection against double
 				 * ACK.  If peer sent us 2 ACKs, then for the
 				 * first one syncache_expand() successfully
 				 * converted syncache entry into a socket,
 				 * while we were waiting on the inpcb lock.  We
 				 * don't want to sent RST for the second ACK,
 				 * so we perform second lookup without wildcard
 				 * match, hoping to find the new socket.  If
 				 * the ACK is stray indeed, rstreason would
 				 * hint the above code that the lookup was a
 				 * second attempt.
 				 *
 				 * NB: syncache did its own logging
 				 * of the failure cause.
 				 */
 				INP_WUNLOCK(inp);
 				rstreason = BANDLIM_RST_OPENPORT;
 				lookupflag &= ~INPLOOKUP_WILDCARD;
 				goto findpcb;
 			}
 tfo_socket_result:
 			if (so == NULL) {
 				/*
 				 * We completed the 3-way handshake
 				 * but could not allocate a socket
 				 * either due to memory shortage,
 				 * listen queue length limits or
 				 * global socket limits.  Send RST
 				 * or wait and have the remote end
 				 * retransmit the ACK for another
 				 * try.
 				 */
 				if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
 					log(LOG_DEBUG, "%s; %s: Listen socket: "
 					    "Socket allocation failed due to "
 					    "limits or memory shortage, %s\n",
 					    s, __func__,
 					    V_tcp_sc_rst_sock_fail ?
 					    "sending RST" : "try again");
 				if (V_tcp_sc_rst_sock_fail) {
 					rstreason = BANDLIM_UNLIMITED;
 					goto dropwithreset;
 				} else
 					goto dropunlock;
 			}
 			/*
 			 * Socket is created in state SYN_RECEIVED.
 			 * Unlock the listen socket, lock the newly
 			 * created socket and update the tp variable.
 			 * If we came here via jump to tfo_socket_result,
 			 * then listening socket is read-locked.
 			 */
 			INP_UNLOCK(inp);	/* listen socket */
 			inp = sotoinpcb(so);
 			/*
 			 * New connection inpcb is already locked by
 			 * syncache_expand().
 			 */
 			INP_WLOCK_ASSERT(inp);
 			tp = intotcpcb(inp);
 			KASSERT(tp->t_state == TCPS_SYN_RECEIVED,
 			    ("%s: ", __func__));
 			/*
 			 * Process the segment and the data it
 			 * contains.  tcp_do_segment() consumes
 			 * the mbuf chain and unlocks the inpcb.
 			 */
 			TCP_PROBE5(receive, NULL, tp, m, tp, th);
 			tp->t_fb->tfb_tcp_do_segment(tp, m, th, drop_hdrlen,
 			    tlen, iptos);
 			return (IPPROTO_DONE);
 		}
 		/*
 		 * Segment flag validation for new connection attempts:
 		 *
 		 * Our (SYN|ACK) response was rejected.
 		 * Check with syncache and remove entry to prevent
 		 * retransmits.
 		 *
 		 * NB: syncache_chkrst does its own logging of failure
 		 * causes.
 		 */
 		if (thflags & TH_RST) {
 			syncache_chkrst(&inc, th, m, port);
 			goto dropunlock;
 		}
 		/*
 		 * We can't do anything without SYN.
 		 */
 		if ((thflags & TH_SYN) == 0) {
 			if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
 				log(LOG_DEBUG, "%s; %s: Listen socket: "
 				    "SYN is missing, segment ignored\n",
 				    s, __func__);
 			TCPSTAT_INC(tcps_badsyn);
 			goto dropunlock;
 		}
 		/*
 		 * (SYN|ACK) is bogus on a listen socket.
 		 */
 		if (thflags & TH_ACK) {
 			if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
 				log(LOG_DEBUG, "%s; %s: Listen socket: "
 				    "SYN|ACK invalid, segment rejected\n",
 				    s, __func__);
 			syncache_badack(&inc, port);	/* XXX: Not needed! */
 			TCPSTAT_INC(tcps_badsyn);
 			rstreason = BANDLIM_RST_OPENPORT;
 			goto dropwithreset;
 		}
 		/*
 		 * If the drop_synfin option is enabled, drop all
 		 * segments with both the SYN and FIN bits set.
 		 * This prevents e.g. nmap from identifying the
 		 * TCP/IP stack.
 		 * XXX: Poor reasoning.  nmap has other methods
 		 * and is constantly refining its stack detection
 		 * strategies.
 		 * XXX: This is a violation of the TCP specification
 		 * and was used by RFC1644.
 		 */
 		if ((thflags & TH_FIN) && V_drop_synfin) {
 			if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
 				log(LOG_DEBUG, "%s; %s: Listen socket: "
 				    "SYN|FIN segment ignored (based on "
 				    "sysctl setting)\n", s, __func__);
 			TCPSTAT_INC(tcps_badsyn);
 			goto dropunlock;
 		}
 		/*
 		 * Segment's flags are (SYN) or (SYN|FIN).
 		 *
 		 * TH_PUSH, TH_URG, TH_ECE, TH_CWR are ignored
 		 * as they do not affect the state of the TCP FSM.
 		 * The data pointed to by TH_URG and th_urp is ignored.
 		 */
 		KASSERT((thflags & (TH_RST|TH_ACK)) == 0,
 		    ("%s: Listen socket: TH_RST or TH_ACK set", __func__));
 		KASSERT(thflags & (TH_SYN),
 		    ("%s: Listen socket: TH_SYN not set", __func__));
 		INP_RLOCK_ASSERT(inp);
 #ifdef INET6
 		/*
 		 * If deprecated address is forbidden,
 		 * we do not accept SYN to deprecated interface
 		 * address to prevent any new inbound connection from
 		 * getting established.
 		 * When we do not accept SYN, we send a TCP RST,
 		 * with deprecated source address (instead of dropping
 		 * it).  We compromise it as it is much better for peer
 		 * to send a RST, and RST will be the final packet
 		 * for the exchange.
 		 *
 		 * If we do not forbid deprecated addresses, we accept
 		 * the SYN packet.  RFC2462 does not suggest dropping
 		 * SYN in this case.
 		 * If we decipher RFC2462 5.5.4, it says like this:
 		 * 1. use of deprecated addr with existing
 		 *    communication is okay - "SHOULD continue to be
 		 *    used"
 		 * 2. use of it with new communication:
 		 *   (2a) "SHOULD NOT be used if alternate address
 		 *        with sufficient scope is available"
 		 *   (2b) nothing mentioned otherwise.
 		 * Here we fall into (2b) case as we have no choice in
 		 * our source address selection - we must obey the peer.
 		 *
 		 * The wording in RFC2462 is confusing, and there are
 		 * multiple description text for deprecated address
 		 * handling - worse, they are not exactly the same.
 		 * I believe 5.5.4 is the best one, so we follow 5.5.4.
 		 */
 		if (isipv6 && !V_ip6_use_deprecated) {
 			struct in6_ifaddr *ia6;
 
 			ia6 = in6ifa_ifwithaddr(&ip6->ip6_dst, 0 /* XXX */, false);
 			if (ia6 != NULL &&
 			    (ia6->ia6_flags & IN6_IFF_DEPRECATED)) {
 				if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
 				    log(LOG_DEBUG, "%s; %s: Listen socket: "
 					"Connection attempt to deprecated "
 					"IPv6 address rejected\n",
 					s, __func__);
 				rstreason = BANDLIM_RST_OPENPORT;
 				goto dropwithreset;
 			}
 		}
 #endif /* INET6 */
 		/*
 		 * Basic sanity checks on incoming SYN requests:
 		 *   Don't respond if the destination is a link layer
 		 *	broadcast according to RFC1122 4.2.3.10, p. 104.
 		 *   If it is from this socket it must be forged.
 		 *   Don't respond if the source or destination is a
 		 *	global or subnet broad- or multicast address.
 		 *   Note that it is quite possible to receive unicast
 		 *	link-layer packets with a broadcast IP address. Use
 		 *	in_broadcast() to find them.
 		 */
 		if (m->m_flags & (M_BCAST|M_MCAST)) {
 			if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
 			    log(LOG_DEBUG, "%s; %s: Listen socket: "
 				"Connection attempt from broad- or multicast "
 				"link layer address ignored\n", s, __func__);
 			goto dropunlock;
 		}
 #ifdef INET6
 		if (isipv6) {
 			if (th->th_dport == th->th_sport &&
 			    IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &ip6->ip6_src)) {
 				if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
 				    log(LOG_DEBUG, "%s; %s: Listen socket: "
 					"Connection attempt to/from self "
 					"ignored\n", s, __func__);
 				goto dropunlock;
 			}
 			if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) ||
 			    IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) {
 				if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
 				    log(LOG_DEBUG, "%s; %s: Listen socket: "
 					"Connection attempt from/to multicast "
 					"address ignored\n", s, __func__);
 				goto dropunlock;
 			}
 		}
 #endif
 #if defined(INET) && defined(INET6)
 		else
 #endif
 #ifdef INET
 		{
 			if (th->th_dport == th->th_sport &&
 			    ip->ip_dst.s_addr == ip->ip_src.s_addr) {
 				if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
 				    log(LOG_DEBUG, "%s; %s: Listen socket: "
 					"Connection attempt from/to self "
 					"ignored\n", s, __func__);
 				goto dropunlock;
 			}
 			if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) ||
 			    IN_MULTICAST(ntohl(ip->ip_src.s_addr)) ||
 			    ip->ip_src.s_addr == htonl(INADDR_BROADCAST) ||
 			    in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) {
 				if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
 				    log(LOG_DEBUG, "%s; %s: Listen socket: "
 					"Connection attempt from/to broad- "
 					"or multicast address ignored\n",
 					s, __func__);
 				goto dropunlock;
 			}
 		}
 #endif
 		/*
 		 * SYN appears to be valid.  Create compressed TCP state
 		 * for syncache.
 		 */
 		TCP_PROBE3(debug__input, tp, th, m);
 		tcp_dooptions(&to, optp, optlen, TO_SYN);
 		if ((so = syncache_add(&inc, &to, th, inp, so, m, NULL, NULL,
 		    iptos, port)) != NULL)
 			goto tfo_socket_result;
 
 		/*
 		 * Entry added to syncache and mbuf consumed.
 		 * Only the listen socket is unlocked by syncache_add().
 		 */
 		return (IPPROTO_DONE);
 	} else if (tp->t_state == TCPS_LISTEN) {
 		/*
 		 * When a listen socket is torn down the SO_ACCEPTCONN
 		 * flag is removed first while connections are drained
 		 * from the accept queue in a unlock/lock cycle of the
 		 * ACCEPT_LOCK, opening a race condition allowing a SYN
 		 * attempt go through unhandled.
 		 */
 		goto dropunlock;
 	}
 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
 	if (tp->t_flags & TF_SIGNATURE) {
 		tcp_dooptions(&to, optp, optlen, thflags);
 		if ((to.to_flags & TOF_SIGNATURE) == 0) {
 			TCPSTAT_INC(tcps_sig_err_nosigopt);
 			goto dropunlock;
 		}
 		if (!TCPMD5_ENABLED() ||
 		    TCPMD5_INPUT(m, th, to.to_signature) != 0)
 			goto dropunlock;
 	}
 #endif
 	TCP_PROBE5(receive, NULL, tp, m, tp, th);
 
 	/*
 	 * Segment belongs to a connection in SYN_SENT, ESTABLISHED or later
 	 * state.  tcp_do_segment() always consumes the mbuf chain, unlocks
 	 * the inpcb, and unlocks pcbinfo.
 	 *
 	 * XXXGL: in case of a pure SYN arriving on existing connection
 	 * TCP stacks won't need to modify the PCB, they would either drop
 	 * the segment silently, or send a challenge ACK.  However, we try
 	 * to upgrade the lock, because calling convention for stacks is
 	 * write-lock on PCB.  If upgrade fails, drop the SYN.
 	 */
 	if ((lookupflag & INPLOOKUP_RLOCKPCB) && INP_TRY_UPGRADE(inp) == 0)
 		goto dropunlock;
 
 	tp->t_fb->tfb_tcp_do_segment(tp, m, th, drop_hdrlen, tlen, iptos);
 	return (IPPROTO_DONE);
 
 dropwithreset:
 	TCP_PROBE5(receive, NULL, tp, m, tp, th);
 
 	if (inp != NULL) {
 		tcp_dropwithreset(m, th, tp, tlen, rstreason);
 		INP_UNLOCK(inp);
 	} else
 		tcp_dropwithreset(m, th, NULL, tlen, rstreason);
 	m = NULL;	/* mbuf chain got consumed. */
 	goto drop;
 
 dropunlock:
 	if (m != NULL)
 		TCP_PROBE5(receive, NULL, tp, m, tp, th);
 
 	if (inp != NULL)
 		INP_UNLOCK(inp);
 
 drop:
 	if (s != NULL)
 		free(s, M_TCPLOG);
 	if (m != NULL)
 		m_freem(m);
 	return (IPPROTO_DONE);
 }
 
 /*
  * Automatic sizing of receive socket buffer.  Often the send
  * buffer size is not optimally adjusted to the actual network
  * conditions at hand (delay bandwidth product).  Setting the
  * buffer size too small limits throughput on links with high
  * bandwidth and high delay (eg. trans-continental/oceanic links).
  *
  * On the receive side the socket buffer memory is only rarely
  * used to any significant extent.  This allows us to be much
  * more aggressive in scaling the receive socket buffer.  For
  * the case that the buffer space is actually used to a large
  * extent and we run out of kernel memory we can simply drop
  * the new segments; TCP on the sender will just retransmit it
  * later.  Setting the buffer size too big may only consume too
  * much kernel memory if the application doesn't read() from
  * the socket or packet loss or reordering makes use of the
  * reassembly queue.
  *
  * The criteria to step up the receive buffer one notch are:
  *  1. Application has not set receive buffer size with
  *     SO_RCVBUF. Setting SO_RCVBUF clears SB_AUTOSIZE.
  *  2. the number of bytes received during 1/2 of an sRTT
  *     is at least 3/8 of the current socket buffer size.
  *  3. receive buffer size has not hit maximal automatic size;
  *
  * If all of the criteria are met we increaset the socket buffer
  * by a 1/2 (bounded by the max). This allows us to keep ahead
  * of slow-start but also makes it so our peer never gets limited
  * by our rwnd which we then open up causing a burst.
  *
  * This algorithm does two steps per RTT at most and only if
  * we receive a bulk stream w/o packet losses or reorderings.
  * Shrinking the buffer during idle times is not necessary as
  * it doesn't consume any memory when idle.
  *
  * TODO: Only step up if the application is actually serving
  * the buffer to better manage the socket buffer resources.
  */
 int
 tcp_autorcvbuf(struct mbuf *m, struct tcphdr *th, struct socket *so,
     struct tcpcb *tp, int tlen)
 {
 	int newsize = 0;
 
 	if (V_tcp_do_autorcvbuf && (so->so_rcv.sb_flags & SB_AUTOSIZE) &&
 	    tp->t_srtt != 0 && tp->rfbuf_ts != 0 &&
 	    TCP_TS_TO_TICKS(tcp_ts_getticks() - tp->rfbuf_ts) >
 	    ((tp->t_srtt >> TCP_RTT_SHIFT)/2)) {
 		if (tp->rfbuf_cnt > ((so->so_rcv.sb_hiwat / 2)/ 4 * 3) &&
 		    so->so_rcv.sb_hiwat < V_tcp_autorcvbuf_max) {
 			newsize = min((so->so_rcv.sb_hiwat + (so->so_rcv.sb_hiwat/2)), V_tcp_autorcvbuf_max);
 		}
 		TCP_PROBE6(receive__autoresize, NULL, tp, m, tp, th, newsize);
 
 		/* Start over with next RTT. */
 		tp->rfbuf_ts = 0;
 		tp->rfbuf_cnt = 0;
 	} else {
 		tp->rfbuf_cnt += tlen;	/* add up */
 	}
 	return (newsize);
 }
 
 int
 tcp_input(struct mbuf **mp, int *offp, int proto)
 {
 	return(tcp_input_with_port(mp, offp, proto, 0));
 }
 
 static void
 tcp_handle_wakeup(struct tcpcb *tp)
 {
 
 	INP_WLOCK_ASSERT(tptoinpcb(tp));
 
 	if (tp->t_flags & TF_WAKESOR) {
 		struct socket *so = tptosocket(tp);
 
 		tp->t_flags &= ~TF_WAKESOR;
 		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
 		sorwakeup_locked(so);
 	}
 }
 
 void
 tcp_do_segment(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
     int drop_hdrlen, int tlen, uint8_t iptos)
 {
 	uint16_t thflags;
-	int acked, ourfinisacked, needoutput = 0, sack_changed;
+	int acked, ourfinisacked, needoutput = 0;
+	sackstatus_t sack_changed;
 	int rstreason, todrop, win, incforsyn = 0;
 	uint32_t tiwin;
 	uint16_t nsegs;
 	char *s;
 	struct inpcb *inp = tptoinpcb(tp);
 	struct socket *so = tptosocket(tp);
 	struct in_conninfo *inc = &inp->inp_inc;
 	struct mbuf *mfree;
 	struct tcpopt to;
 	int tfo_syn;
 	u_int maxseg;
 
 	thflags = tcp_get_flags(th);
 	tp->sackhint.last_sack_ack = 0;
-	sack_changed = 0;
+	sack_changed = SACK_NOCHANGE;
 	nsegs = max(1, m->m_pkthdr.lro_nsegs);
 
 	NET_EPOCH_ASSERT();
 	INP_WLOCK_ASSERT(inp);
 	KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN",
 	    __func__));
 	KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT",
 	    __func__));
 
 #ifdef TCPPCAP
 	/* Save segment, if requested. */
 	tcp_pcap_add(th, m, &(tp->t_inpkts));
 #endif
 	TCP_LOG_EVENT(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_IN, 0,
 	    tlen, NULL, true);
 
 	if ((thflags & TH_SYN) && (thflags & TH_FIN) && V_drop_synfin) {
 		if ((s = tcp_log_addrs(inc, th, NULL, NULL))) {
 			log(LOG_DEBUG, "%s; %s: "
 			    "SYN|FIN segment ignored (based on "
 			    "sysctl setting)\n", s, __func__);
 			free(s, M_TCPLOG);
 		}
 		goto drop;
 	}
 
 	/*
 	 * If a segment with the ACK-bit set arrives in the SYN-SENT state
 	 * check SEQ.ACK first.
 	 */
 	if ((tp->t_state == TCPS_SYN_SENT) && (thflags & TH_ACK) &&
 	    (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) {
 		rstreason = BANDLIM_UNLIMITED;
 		tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
 		goto dropwithreset;
 	}
 
 	/*
 	 * Segment received on connection.
 	 * Reset idle time and keep-alive timer.
 	 * XXX: This should be done after segment
 	 * validation to ignore broken/spoofed segs.
 	 */
 	if  (tp->t_idle_reduce &&
 	     (tp->snd_max == tp->snd_una) &&
 	     ((ticks - tp->t_rcvtime) >= tp->t_rxtcur))
 		cc_after_idle(tp);
 	tp->t_rcvtime = ticks;
 
 	if (thflags & TH_FIN)
 		tcp_log_end_status(tp, TCP_EI_STATUS_CLIENT_FIN);
 	/*
 	 * Scale up the window into a 32-bit value.
 	 * For the SYN_SENT state the scale is zero.
 	 */
 	tiwin = th->th_win << tp->snd_scale;
 #ifdef STATS
 	stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_FRWIN, tiwin);
 #endif
 
 	/*
 	 * TCP ECN processing.
 	 */
 	if (tcp_ecn_input_segment(tp, thflags, tlen,
 	    tcp_packets_this_ack(tp, th->th_ack),
 	    iptos))
 		cc_cong_signal(tp, th, CC_ECN);
 
 	/*
 	 * Parse options on any incoming segment.
 	 */
 	tcp_dooptions(&to, (u_char *)(th + 1),
 	    (th->th_off << 2) - sizeof(struct tcphdr),
 	    (thflags & TH_SYN) ? TO_SYN : 0);
 
 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
 	if ((tp->t_flags & TF_SIGNATURE) != 0 &&
 	    (to.to_flags & TOF_SIGNATURE) == 0) {
 		TCPSTAT_INC(tcps_sig_err_sigopt);
 		/* XXX: should drop? */
 	}
 #endif
 	/*
 	 * If echoed timestamp is later than the current time,
 	 * fall back to non RFC1323 RTT calculation.  Normalize
 	 * timestamp if syncookies were used when this connection
 	 * was established.
 	 */
 	if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) {
 		to.to_tsecr -= tp->ts_offset;
 		if (TSTMP_GT(to.to_tsecr, tcp_ts_getticks()))
 			to.to_tsecr = 0;
 		else if (tp->t_rxtshift == 1 &&
 			 tp->t_flags & TF_PREVVALID &&
 			 tp->t_badrxtwin != 0 &&
 			 TSTMP_LT(to.to_tsecr, tp->t_badrxtwin))
 			cc_cong_signal(tp, th, CC_RTO_ERR);
 	}
 	/*
 	 * Process options only when we get SYN/ACK back. The SYN case
 	 * for incoming connections is handled in tcp_syncache.
 	 * According to RFC1323 the window field in a SYN (i.e., a <SYN>
 	 * or <SYN,ACK>) segment itself is never scaled.
 	 * XXX this is traditional behavior, may need to be cleaned up.
 	 */
 	if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) {
 		/* Handle parallel SYN for ECN */
 		tcp_ecn_input_parallel_syn(tp, thflags, iptos);
 		if ((to.to_flags & TOF_SCALE) &&
 		    (tp->t_flags & TF_REQ_SCALE) &&
 		    !(tp->t_flags & TF_NOOPT)) {
 			tp->t_flags |= TF_RCVD_SCALE;
 			tp->snd_scale = to.to_wscale;
 		} else
 			tp->t_flags &= ~TF_REQ_SCALE;
 		/*
 		 * Initial send window.  It will be updated with
 		 * the next incoming segment to the scaled value.
 		 */
 		tp->snd_wnd = th->th_win;
 		if ((to.to_flags & TOF_TS) &&
 		    (tp->t_flags & TF_REQ_TSTMP) &&
 		    !(tp->t_flags & TF_NOOPT)) {
 			tp->t_flags |= TF_RCVD_TSTMP;
 			tp->ts_recent = to.to_tsval;
 			tp->ts_recent_age = tcp_ts_getticks();
 		} else
 			tp->t_flags &= ~TF_REQ_TSTMP;
 		if (to.to_flags & TOF_MSS)
 			tcp_mss(tp, to.to_mss);
 		if ((tp->t_flags & TF_SACK_PERMIT) &&
 		    (!(to.to_flags & TOF_SACKPERM) ||
 		    (tp->t_flags & TF_NOOPT)))
 			tp->t_flags &= ~TF_SACK_PERMIT;
 		if (IS_FASTOPEN(tp->t_flags)) {
 			if ((to.to_flags & TOF_FASTOPEN) &&
 			    !(tp->t_flags & TF_NOOPT)) {
 				uint16_t mss;
 
 				if (to.to_flags & TOF_MSS)
 					mss = to.to_mss;
 				else
 					if ((inp->inp_vflag & INP_IPV6) != 0)
 						mss = TCP6_MSS;
 					else
 						mss = TCP_MSS;
 				tcp_fastopen_update_cache(tp, mss,
 				    to.to_tfo_len, to.to_tfo_cookie);
 			} else
 				tcp_fastopen_disable_path(tp);
 		}
 	}
 
 	/*
 	 * If timestamps were negotiated during SYN/ACK and a
 	 * segment without a timestamp is received, silently drop
 	 * the segment, unless it is a RST segment or missing timestamps are
 	 * tolerated.
 	 * See section 3.2 of RFC 7323.
 	 */
 	if ((tp->t_flags & TF_RCVD_TSTMP) && !(to.to_flags & TOF_TS)) {
 		if (((thflags & TH_RST) != 0) || V_tcp_tolerate_missing_ts) {
 			if ((s = tcp_log_addrs(inc, th, NULL, NULL))) {
 				log(LOG_DEBUG, "%s; %s: Timestamp missing, "
 				    "segment processed normally\n",
 				    s, __func__);
 				free(s, M_TCPLOG);
 			}
 		} else {
 			if ((s = tcp_log_addrs(inc, th, NULL, NULL))) {
 				log(LOG_DEBUG, "%s; %s: Timestamp missing, "
 				    "segment silently dropped\n", s, __func__);
 				free(s, M_TCPLOG);
 			}
 			goto drop;
 		}
 	}
 	/*
 	 * If timestamps were not negotiated during SYN/ACK and a
 	 * segment with a timestamp is received, ignore the
 	 * timestamp and process the packet normally.
 	 * See section 3.2 of RFC 7323.
 	 */
 	if (!(tp->t_flags & TF_RCVD_TSTMP) && (to.to_flags & TOF_TS)) {
 		if ((s = tcp_log_addrs(inc, th, NULL, NULL))) {
 			log(LOG_DEBUG, "%s; %s: Timestamp not expected, "
 			    "segment processed normally\n", s, __func__);
 			free(s, M_TCPLOG);
 		}
 	}
 
 	/*
 	 * Header prediction: check for the two common cases
 	 * of a uni-directional data xfer.  If the packet has
 	 * no control flags, is in-sequence, the window didn't
 	 * change and we're not retransmitting, it's a
 	 * candidate.  If the length is zero and the ack moved
 	 * forward, we're the sender side of the xfer.  Just
 	 * free the data acked & wake any higher level process
 	 * that was blocked waiting for space.  If the length
 	 * is non-zero and the ack didn't move, we're the
 	 * receiver side.  If we're getting packets in-order
 	 * (the reassembly queue is empty), add the data to
 	 * the socket buffer and note that we need a delayed ack.
 	 * Make sure that the hidden state-flags are also off.
 	 * Since we check for TCPS_ESTABLISHED first, it can only
 	 * be TH_NEEDSYN.
 	 */
 	if (tp->t_state == TCPS_ESTABLISHED &&
 	    th->th_seq == tp->rcv_nxt &&
 	    (thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK &&
 	    tp->snd_nxt == tp->snd_max &&
 	    tiwin && tiwin == tp->snd_wnd &&
 	    ((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) &&
 	    SEGQ_EMPTY(tp) &&
 	    ((to.to_flags & TOF_TS) == 0 ||
 	     TSTMP_GEQ(to.to_tsval, tp->ts_recent)) ) {
 		/*
 		 * If last ACK falls within this segment's sequence numbers,
 		 * record the timestamp.
 		 * NOTE that the test is modified according to the latest
 		 * proposal of the tcplw@cray.com list (Braden 1993/04/26).
 		 */
 		if ((to.to_flags & TOF_TS) != 0 &&
 		    SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
 			tp->ts_recent_age = tcp_ts_getticks();
 			tp->ts_recent = to.to_tsval;
 		}
 
 		if (tlen == 0) {
 			if (SEQ_GT(th->th_ack, tp->snd_una) &&
 			    SEQ_LEQ(th->th_ack, tp->snd_max) &&
 			    !IN_RECOVERY(tp->t_flags) &&
 			    (to.to_flags & TOF_SACK) == 0 &&
 			    TAILQ_EMPTY(&tp->snd_holes)) {
 				/*
 				 * This is a pure ack for outstanding data.
 				 */
 				TCPSTAT_INC(tcps_predack);
 
 				/*
 				 * "bad retransmit" recovery without timestamps.
 				 */
 				if ((to.to_flags & TOF_TS) == 0 &&
 				    tp->t_rxtshift == 1 &&
 				    tp->t_flags & TF_PREVVALID &&
 				    tp->t_badrxtwin != 0 &&
 				    TSTMP_LT(ticks, tp->t_badrxtwin)) {
 					cc_cong_signal(tp, th, CC_RTO_ERR);
 				}
 
 				/*
 				 * Recalculate the transmit timer / rtt.
 				 *
 				 * Some boxes send broken timestamp replies
 				 * during the SYN+ACK phase, ignore
 				 * timestamps of 0 or we could calculate a
 				 * huge RTT and blow up the retransmit timer.
 				 */
 				if ((to.to_flags & TOF_TS) != 0 &&
 				    to.to_tsecr) {
 					uint32_t t;
 
 					t = tcp_ts_getticks() - to.to_tsecr;
 					if (!tp->t_rttlow || tp->t_rttlow > t)
 						tp->t_rttlow = t;
 					tcp_xmit_timer(tp,
 					    TCP_TS_TO_TICKS(t) + 1);
 				} else if (tp->t_rtttime &&
 				    SEQ_GT(th->th_ack, tp->t_rtseq)) {
 					if (!tp->t_rttlow ||
 					    tp->t_rttlow > ticks - tp->t_rtttime)
 						tp->t_rttlow = ticks - tp->t_rtttime;
 					tcp_xmit_timer(tp,
 							ticks - tp->t_rtttime);
 				}
 				acked = BYTES_THIS_ACK(tp, th);
 
 #ifdef TCP_HHOOK
 				/* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */
 				hhook_run_tcp_est_in(tp, th, &to);
 #endif
 
 				TCPSTAT_ADD(tcps_rcvackpack, nsegs);
 				TCPSTAT_ADD(tcps_rcvackbyte, acked);
 				sbdrop(&so->so_snd, acked);
 				if (SEQ_GT(tp->snd_una, tp->snd_recover) &&
 				    SEQ_LEQ(th->th_ack, tp->snd_recover))
 					tp->snd_recover = th->th_ack - 1;
 
 				/*
 				 * Let the congestion control algorithm update
 				 * congestion control related information. This
 				 * typically means increasing the congestion
 				 * window.
 				 */
 				cc_ack_received(tp, th, nsegs, CC_ACK);
 
 				tp->snd_una = th->th_ack;
 				/*
 				 * Pull snd_wl2 up to prevent seq wrap relative
 				 * to th_ack.
 				 */
 				tp->snd_wl2 = th->th_ack;
 				tp->t_dupacks = 0;
 				m_freem(m);
 
 				/*
 				 * If all outstanding data are acked, stop
 				 * retransmit timer, otherwise restart timer
 				 * using current (possibly backed-off) value.
 				 * If process is waiting for space,
 				 * wakeup/selwakeup/signal.  If data
 				 * are ready to send, let tcp_output
 				 * decide between more output or persist.
 				 */
 				TCP_PROBE3(debug__input, tp, th, m);
 				/*
 				 * Clear t_acktime if remote side has ACKd
 				 * all data in the socket buffer.
 				 * Otherwise, update t_acktime if we received
 				 * a sufficiently large ACK.
 				 */
 				if (sbavail(&so->so_snd) == 0)
 					tp->t_acktime = 0;
 				else if (acked > 1)
 					tp->t_acktime = ticks;
 				if (tp->snd_una == tp->snd_max)
 					tcp_timer_activate(tp, TT_REXMT, 0);
 				else if (!tcp_timer_active(tp, TT_PERSIST))
 					tcp_timer_activate(tp, TT_REXMT,
 					    TP_RXTCUR(tp));
 				sowwakeup(so);
 				if (sbavail(&so->so_snd))
 					(void) tcp_output(tp);
 				goto check_delack;
 			}
 		} else if (th->th_ack == tp->snd_una &&
 		    tlen <= sbspace(&so->so_rcv)) {
 			int newsize = 0;	/* automatic sockbuf scaling */
 
 			/*
 			 * This is a pure, in-sequence data packet with
 			 * nothing on the reassembly queue and we have enough
 			 * buffer space to take it.
 			 */
 			/* Clean receiver SACK report if present */
 			if ((tp->t_flags & TF_SACK_PERMIT) && tp->rcv_numsacks)
 				tcp_clean_sackreport(tp);
 			TCPSTAT_INC(tcps_preddat);
 			tp->rcv_nxt += tlen;
 			if (tlen &&
 			    ((tp->t_flags2 & TF2_FBYTES_COMPLETE) == 0) &&
 			    (tp->t_fbyte_in == 0)) {
 				tp->t_fbyte_in = ticks;
 				if (tp->t_fbyte_in == 0)
 					tp->t_fbyte_in = 1;
 				if (tp->t_fbyte_out && tp->t_fbyte_in)
 					tp->t_flags2 |= TF2_FBYTES_COMPLETE;
 			}
 			/*
 			 * Pull snd_wl1 up to prevent seq wrap relative to
 			 * th_seq.
 			 */
 			tp->snd_wl1 = th->th_seq;
 			/*
 			 * Pull rcv_up up to prevent seq wrap relative to
 			 * rcv_nxt.
 			 */
 			tp->rcv_up = tp->rcv_nxt;
 			TCPSTAT_ADD(tcps_rcvpack, nsegs);
 			TCPSTAT_ADD(tcps_rcvbyte, tlen);
 			TCP_PROBE3(debug__input, tp, th, m);
 
 			newsize = tcp_autorcvbuf(m, th, so, tp, tlen);
 
 			/* Add data to socket buffer. */
 			SOCKBUF_LOCK(&so->so_rcv);
 			if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
 				m_freem(m);
 			} else {
 				/*
 				 * Set new socket buffer size.
 				 * Give up when limit is reached.
 				 */
 				if (newsize)
 					if (!sbreserve_locked(so, SO_RCV,
 					    newsize, NULL))
 						so->so_rcv.sb_flags &= ~SB_AUTOSIZE;
 				m_adj(m, drop_hdrlen);	/* delayed header drop */
 				sbappendstream_locked(&so->so_rcv, m, 0);
 			}
 			/* NB: sorwakeup_locked() does an implicit unlock. */
 			sorwakeup_locked(so);
 			if (DELAY_ACK(tp, tlen)) {
 				tp->t_flags |= TF_DELACK;
 			} else {
 				tp->t_flags |= TF_ACKNOW;
 				tcp_output(tp);
 			}
 			goto check_delack;
 		}
 	}
 
 	/*
 	 * Calculate amount of space in receive window,
 	 * and then do TCP input processing.
 	 * Receive window is amount of space in rcv queue,
 	 * but not less than advertised window.
 	 */
 	win = sbspace(&so->so_rcv);
 	if (win < 0)
 		win = 0;
 	tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
 
 	switch (tp->t_state) {
 	/*
 	 * If the state is SYN_RECEIVED:
 	 *	if seg contains an ACK, but not for our SYN/ACK, send a RST.
 	 */
 	case TCPS_SYN_RECEIVED:
 		if (thflags & TH_RST) {
 			/* Handle RST segments later. */
 			break;
 		}
 		if ((thflags & TH_ACK) &&
 		    (SEQ_LEQ(th->th_ack, tp->snd_una) ||
 		     SEQ_GT(th->th_ack, tp->snd_max))) {
 				rstreason = BANDLIM_RST_OPENPORT;
 				tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
 				goto dropwithreset;
 		}
 		if (IS_FASTOPEN(tp->t_flags)) {
 			/*
 			 * When a TFO connection is in SYN_RECEIVED, the
 			 * only valid packets are the initial SYN, a
 			 * retransmit/copy of the initial SYN (possibly with
 			 * a subset of the original data), a valid ACK, a
 			 * FIN, or a RST.
 			 */
 			if ((thflags & (TH_SYN|TH_ACK)) == (TH_SYN|TH_ACK)) {
 				rstreason = BANDLIM_RST_OPENPORT;
 				tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
 				goto dropwithreset;
 			} else if (thflags & TH_SYN) {
 				/* non-initial SYN is ignored */
 				if ((tcp_timer_active(tp, TT_DELACK) ||
 				     tcp_timer_active(tp, TT_REXMT)))
 					goto drop;
 			} else if (!(thflags & (TH_ACK|TH_FIN|TH_RST))) {
 				goto drop;
 			}
 		}
 		break;
 
 	/*
 	 * If the state is SYN_SENT:
 	 *	if seg contains a RST with valid ACK (SEQ.ACK has already
 	 *	    been verified), then drop the connection.
 	 *	if seg contains a RST without an ACK, drop the seg.
 	 *	if seg does not contain SYN, then drop the seg.
 	 * Otherwise this is an acceptable SYN segment
 	 *	initialize tp->rcv_nxt and tp->irs
 	 *	if seg contains ack then advance tp->snd_una
 	 *	if seg contains an ECE and ECN support is enabled, the stream
 	 *	    is ECN capable.
 	 *	if SYN has been acked change to ESTABLISHED else SYN_RCVD state
 	 *	arrange for segment to be acked (eventually)
 	 *	continue processing rest of data/controls, beginning with URG
 	 */
 	case TCPS_SYN_SENT:
 		if ((thflags & (TH_ACK|TH_RST)) == (TH_ACK|TH_RST)) {
 			TCP_PROBE5(connect__refused, NULL, tp,
 			    m, tp, th);
 			tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
 			tp = tcp_drop(tp, ECONNREFUSED);
 		}
 		if (thflags & TH_RST)
 			goto drop;
 		if (!(thflags & TH_SYN))
 			goto drop;
 
 		tp->irs = th->th_seq;
 		tcp_rcvseqinit(tp);
 		if (thflags & TH_ACK) {
 			int tfo_partial_ack = 0;
 
 			TCPSTAT_INC(tcps_connects);
 			soisconnected(so);
 #ifdef MAC
 			mac_socketpeer_set_from_mbuf(m, so);
 #endif
 			/* Do window scaling on this connection? */
 			if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
 				(TF_RCVD_SCALE|TF_REQ_SCALE)) {
 				tp->rcv_scale = tp->request_r_scale;
 			}
 			tp->rcv_adv += min(tp->rcv_wnd,
 			    TCP_MAXWIN << tp->rcv_scale);
 			tp->snd_una++;		/* SYN is acked */
 			/*
 			 * If not all the data that was sent in the TFO SYN
 			 * has been acked, resend the remainder right away.
 			 */
 			if (IS_FASTOPEN(tp->t_flags) &&
 			    (tp->snd_una != tp->snd_max)) {
 				tp->snd_nxt = th->th_ack;
 				tfo_partial_ack = 1;
 			}
 			/*
 			 * If there's data, delay ACK; if there's also a FIN
 			 * ACKNOW will be turned on later.
 			 */
 			if (DELAY_ACK(tp, tlen) && tlen != 0 && !tfo_partial_ack)
 				tcp_timer_activate(tp, TT_DELACK,
 				    tcp_delacktime);
 			else
 				tp->t_flags |= TF_ACKNOW;
 
 			tcp_ecn_input_syn_sent(tp, thflags, iptos);
 
 			/*
 			 * Received <SYN,ACK> in SYN_SENT[*] state.
 			 * Transitions:
 			 *	SYN_SENT  --> ESTABLISHED
 			 *	SYN_SENT* --> FIN_WAIT_1
 			 */
 			tp->t_starttime = ticks;
 			if (tp->t_flags & TF_NEEDFIN) {
 				tp->t_acktime = ticks;
 				tcp_state_change(tp, TCPS_FIN_WAIT_1);
 				tp->t_flags &= ~TF_NEEDFIN;
 				thflags &= ~TH_SYN;
 			} else {
 				tcp_state_change(tp, TCPS_ESTABLISHED);
 				TCP_PROBE5(connect__established, NULL, tp,
 				    m, tp, th);
 				cc_conn_init(tp);
 				tcp_timer_activate(tp, TT_KEEP,
 				    TP_KEEPIDLE(tp));
 			}
 		} else {
 			/*
 			 * Received initial SYN in SYN-SENT[*] state =>
 			 * simultaneous open.
 			 * If it succeeds, connection is * half-synchronized.
 			 * Otherwise, do 3-way handshake:
 			 *        SYN-SENT -> SYN-RECEIVED
 			 *        SYN-SENT* -> SYN-RECEIVED*
 			 */
 			tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN | TF_SONOTCONN);
 			tcp_timer_activate(tp, TT_REXMT, 0);
 			tcp_state_change(tp, TCPS_SYN_RECEIVED);
 		}
 
 		/*
 		 * Advance th->th_seq to correspond to first data byte.
 		 * If data, trim to stay within window,
 		 * dropping FIN if necessary.
 		 */
 		th->th_seq++;
 		if (tlen > tp->rcv_wnd) {
 			todrop = tlen - tp->rcv_wnd;
 			m_adj(m, -todrop);
 			tlen = tp->rcv_wnd;
 			thflags &= ~TH_FIN;
 			TCPSTAT_INC(tcps_rcvpackafterwin);
 			TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop);
 		}
 		tp->snd_wl1 = th->th_seq - 1;
 		tp->rcv_up = th->th_seq;
 		/*
 		 * Client side of transaction: already sent SYN and data.
 		 * If the remote host used T/TCP to validate the SYN,
 		 * our data will be ACK'd; if so, enter normal data segment
 		 * processing in the middle of step 5, ack processing.
 		 * Otherwise, goto step 6.
 		 */
 		if (thflags & TH_ACK)
 			goto process_ACK;
 
 		goto step6;
 	}
 
 	/*
 	 * States other than LISTEN or SYN_SENT.
 	 * First check the RST flag and sequence number since reset segments
 	 * are exempt from the timestamp and connection count tests.  This
 	 * fixes a bug introduced by the Stevens, vol. 2, p. 960 bugfix
 	 * below which allowed reset segments in half the sequence space
 	 * to fall though and be processed (which gives forged reset
 	 * segments with a random sequence number a 50 percent chance of
 	 * killing a connection).
 	 * Then check timestamp, if present.
 	 * Then check the connection count, if present.
 	 * Then check that at least some bytes of segment are within
 	 * receive window.  If segment begins before rcv_nxt,
 	 * drop leading data (and SYN); if nothing left, just ack.
 	 */
 	if (thflags & TH_RST) {
 		/*
 		 * RFC5961 Section 3.2
 		 *
 		 * - RST drops connection only if SEG.SEQ == RCV.NXT.
 		 * - If RST is in window, we send challenge ACK.
 		 *
 		 * Note: to take into account delayed ACKs, we should
 		 *   test against last_ack_sent instead of rcv_nxt.
 		 * Note 2: we handle special case of closed window, not
 		 *   covered by the RFC.
 		 */
 		if ((SEQ_GEQ(th->th_seq, tp->last_ack_sent) &&
 		    SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) ||
 		    (tp->rcv_wnd == 0 && tp->last_ack_sent == th->th_seq)) {
 			KASSERT(tp->t_state != TCPS_SYN_SENT,
 			    ("%s: TH_RST for TCPS_SYN_SENT th %p tp %p",
 			    __func__, th, tp));
 
 			if (V_tcp_insecure_rst ||
 			    tp->last_ack_sent == th->th_seq) {
 				TCPSTAT_INC(tcps_drops);
 				/* Drop the connection. */
 				switch (tp->t_state) {
 				case TCPS_SYN_RECEIVED:
 					so->so_error = ECONNREFUSED;
 					goto close;
 				case TCPS_ESTABLISHED:
 				case TCPS_FIN_WAIT_1:
 				case TCPS_FIN_WAIT_2:
 				case TCPS_CLOSE_WAIT:
 				case TCPS_CLOSING:
 				case TCPS_LAST_ACK:
 					so->so_error = ECONNRESET;
 				close:
 					/* FALLTHROUGH */
 				default:
 					tcp_log_end_status(tp, TCP_EI_STATUS_CLIENT_RST);
 					tp = tcp_close(tp);
 				}
 			} else {
 				TCPSTAT_INC(tcps_badrst);
 				/* Send challenge ACK. */
 				tcp_respond(tp, mtod(m, void *), th, m,
 				    tp->rcv_nxt, tp->snd_nxt, TH_ACK);
 				tp->last_ack_sent = tp->rcv_nxt;
 				m = NULL;
 			}
 		}
 		goto drop;
 	}
 
 	/*
 	 * RFC5961 Section 4.2
 	 * Send challenge ACK for any SYN in synchronized state.
 	 */
 	if ((thflags & TH_SYN) && tp->t_state != TCPS_SYN_SENT &&
 	    tp->t_state != TCPS_SYN_RECEIVED) {
 		TCPSTAT_INC(tcps_badsyn);
 		if (V_tcp_insecure_syn &&
 		    SEQ_GEQ(th->th_seq, tp->last_ack_sent) &&
 		    SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) {
 			tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
 			tp = tcp_drop(tp, ECONNRESET);
 			rstreason = BANDLIM_UNLIMITED;
 		} else {
 			tcp_ecn_input_syn_sent(tp, thflags, iptos);
 			/* Send challenge ACK. */
 			tcp_respond(tp, mtod(m, void *), th, m, tp->rcv_nxt,
 			    tp->snd_nxt, TH_ACK);
 			tp->last_ack_sent = tp->rcv_nxt;
 			m = NULL;
 		}
 		goto drop;
 	}
 
 	/*
 	 * RFC 1323 PAWS: If we have a timestamp reply on this segment
 	 * and it's less than ts_recent, drop it.
 	 */
 	if ((to.to_flags & TOF_TS) != 0 && tp->ts_recent &&
 	    TSTMP_LT(to.to_tsval, tp->ts_recent)) {
 		/* Check to see if ts_recent is over 24 days old.  */
 		if (tcp_ts_getticks() - tp->ts_recent_age > TCP_PAWS_IDLE) {
 			/*
 			 * Invalidate ts_recent.  If this segment updates
 			 * ts_recent, the age will be reset later and ts_recent
 			 * will get a valid value.  If it does not, setting
 			 * ts_recent to zero will at least satisfy the
 			 * requirement that zero be placed in the timestamp
 			 * echo reply when ts_recent isn't valid.  The
 			 * age isn't reset until we get a valid ts_recent
 			 * because we don't want out-of-order segments to be
 			 * dropped when ts_recent is old.
 			 */
 			tp->ts_recent = 0;
 		} else {
 			TCPSTAT_INC(tcps_rcvduppack);
 			TCPSTAT_ADD(tcps_rcvdupbyte, tlen);
 			TCPSTAT_INC(tcps_pawsdrop);
 			if (tlen)
 				goto dropafterack;
 			goto drop;
 		}
 	}
 
 	/*
 	 * In the SYN-RECEIVED state, validate that the packet belongs to
 	 * this connection before trimming the data to fit the receive
 	 * window.  Check the sequence number versus IRS since we know
 	 * the sequence numbers haven't wrapped.  This is a partial fix
 	 * for the "LAND" DoS attack.
 	 */
 	if (tp->t_state == TCPS_SYN_RECEIVED && SEQ_LT(th->th_seq, tp->irs)) {
 		rstreason = BANDLIM_RST_OPENPORT;
 		tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
 		goto dropwithreset;
 	}
 
 	todrop = tp->rcv_nxt - th->th_seq;
 	if (todrop > 0) {
 		if (thflags & TH_SYN) {
 			thflags &= ~TH_SYN;
 			th->th_seq++;
 			if (th->th_urp > 1)
 				th->th_urp--;
 			else
 				thflags &= ~TH_URG;
 			todrop--;
 		}
 		/*
 		 * Following if statement from Stevens, vol. 2, p. 960.
 		 */
 		if (todrop > tlen
 		    || (todrop == tlen && (thflags & TH_FIN) == 0)) {
 			/*
 			 * Any valid FIN must be to the left of the window.
 			 * At this point the FIN must be a duplicate or out
 			 * of sequence; drop it.
 			 */
 			thflags &= ~TH_FIN;
 
 			/*
 			 * Send an ACK to resynchronize and drop any data.
 			 * But keep on processing for RST or ACK.
 			 */
 			tp->t_flags |= TF_ACKNOW;
 			todrop = tlen;
 			TCPSTAT_INC(tcps_rcvduppack);
 			TCPSTAT_ADD(tcps_rcvdupbyte, todrop);
 		} else {
 			TCPSTAT_INC(tcps_rcvpartduppack);
 			TCPSTAT_ADD(tcps_rcvpartdupbyte, todrop);
 		}
 		/*
 		 * DSACK - add SACK block for dropped range
 		 */
 		if ((todrop > 0) && (tp->t_flags & TF_SACK_PERMIT)) {
 			tcp_update_sack_list(tp, th->th_seq,
 			    th->th_seq + todrop);
 			/*
 			 * ACK now, as the next in-sequence segment
 			 * will clear the DSACK block again
 			 */
 			tp->t_flags |= TF_ACKNOW;
 		}
 		drop_hdrlen += todrop;	/* drop from the top afterwards */
 		th->th_seq += todrop;
 		tlen -= todrop;
 		if (th->th_urp > todrop)
 			th->th_urp -= todrop;
 		else {
 			thflags &= ~TH_URG;
 			th->th_urp = 0;
 		}
 	}
 
 	/*
 	 * If new data are received on a connection after the
 	 * user processes are gone, then RST the other end.
 	 */
 	if ((tp->t_flags & TF_CLOSED) && tlen) {
 		if ((s = tcp_log_addrs(inc, th, NULL, NULL))) {
 			log(LOG_DEBUG, "%s; %s: %s: Received %d bytes of data "
 			    "after socket was closed, "
 			    "sending RST and removing tcpcb\n",
 			    s, __func__, tcpstates[tp->t_state], tlen);
 			free(s, M_TCPLOG);
 		}
 		tcp_log_end_status(tp, TCP_EI_STATUS_DATA_A_CLOSE);
 		/* tcp_close will kill the inp pre-log the Reset */
 		tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST);
 		tp = tcp_close(tp);
 		TCPSTAT_INC(tcps_rcvafterclose);
 		rstreason = BANDLIM_UNLIMITED;
 		goto dropwithreset;
 	}
 
 	/*
 	 * If segment ends after window, drop trailing data
 	 * (and PUSH and FIN); if nothing left, just ACK.
 	 */
 	todrop = (th->th_seq + tlen) - (tp->rcv_nxt + tp->rcv_wnd);
 	if (todrop > 0) {
 		TCPSTAT_INC(tcps_rcvpackafterwin);
 		if (todrop >= tlen) {
 			TCPSTAT_ADD(tcps_rcvbyteafterwin, tlen);
 			/*
 			 * If window is closed can only take segments at
 			 * window edge, and have to drop data and PUSH from
 			 * incoming segments.  Continue processing, but
 			 * remember to ack.  Otherwise, drop segment
 			 * and ack.
 			 */
 			if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) {
 				tp->t_flags |= TF_ACKNOW;
 				TCPSTAT_INC(tcps_rcvwinprobe);
 			} else
 				goto dropafterack;
 		} else
 			TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop);
 		m_adj(m, -todrop);
 		tlen -= todrop;
 		thflags &= ~(TH_PUSH|TH_FIN);
 	}
 
 	/*
 	 * If last ACK falls within this segment's sequence numbers,
 	 * record its timestamp.
 	 * NOTE:
 	 * 1) That the test incorporates suggestions from the latest
 	 *    proposal of the tcplw@cray.com list (Braden 1993/04/26).
 	 * 2) That updating only on newer timestamps interferes with
 	 *    our earlier PAWS tests, so this check should be solely
 	 *    predicated on the sequence space of this segment.
 	 * 3) That we modify the segment boundary check to be
 	 *        Last.ACK.Sent <= SEG.SEQ + SEG.Len
 	 *    instead of RFC1323's
 	 *        Last.ACK.Sent < SEG.SEQ + SEG.Len,
 	 *    This modified check allows us to overcome RFC1323's
 	 *    limitations as described in Stevens TCP/IP Illustrated
 	 *    Vol. 2 p.869. In such cases, we can still calculate the
 	 *    RTT correctly when RCV.NXT == Last.ACK.Sent.
 	 */
 	if ((to.to_flags & TOF_TS) != 0 &&
 	    SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
 	    SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
 		((thflags & (TH_SYN|TH_FIN)) != 0))) {
 		tp->ts_recent_age = tcp_ts_getticks();
 		tp->ts_recent = to.to_tsval;
 	}
 
 	/*
 	 * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN
 	 * flag is on (half-synchronized state), then queue data for
 	 * later processing; else drop segment and return.
 	 */
 	if ((thflags & TH_ACK) == 0) {
 		if (tp->t_state == TCPS_SYN_RECEIVED ||
 		    (tp->t_flags & TF_NEEDSYN)) {
 			if (tp->t_state == TCPS_SYN_RECEIVED &&
 			    IS_FASTOPEN(tp->t_flags)) {
 				tp->snd_wnd = tiwin;
 				cc_conn_init(tp);
 			}
 			goto step6;
 		} else if (tp->t_flags & TF_ACKNOW)
 			goto dropafterack;
 		else
 			goto drop;
 	}
 
 	/*
 	 * Ack processing.
 	 */
 	switch (tp->t_state) {
 	/*
 	 * In SYN_RECEIVED state, the ack ACKs our SYN, so enter
 	 * ESTABLISHED state and continue processing.
 	 * The ACK was checked above.
 	 */
 	case TCPS_SYN_RECEIVED:
 
 		TCPSTAT_INC(tcps_connects);
 		if (tp->t_flags & TF_SONOTCONN) {
 			/*
 			 * Usually SYN_RECEIVED had been created from a LISTEN,
 			 * and solisten_enqueue() has already marked the socket
 			 * layer as connected.  If it didn't, which can happen
 			 * only with an accept_filter(9), then the tp is marked
 			 * with TF_SONOTCONN.  The other reason for this mark
 			 * to be set is a simultaneous open, a SYN_RECEIVED
 			 * that had been created from SYN_SENT.
 			 */
 			tp->t_flags &= ~TF_SONOTCONN;
 			soisconnected(so);
 		}
 		/* Do window scaling? */
 		if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
 			(TF_RCVD_SCALE|TF_REQ_SCALE)) {
 			tp->rcv_scale = tp->request_r_scale;
 		}
 		tp->snd_wnd = tiwin;
 		/*
 		 * Make transitions:
 		 *      SYN-RECEIVED  -> ESTABLISHED
 		 *      SYN-RECEIVED* -> FIN-WAIT-1
 		 */
 		tp->t_starttime = ticks;
 		if (IS_FASTOPEN(tp->t_flags) && tp->t_tfo_pending) {
 			tcp_fastopen_decrement_counter(tp->t_tfo_pending);
 			tp->t_tfo_pending = NULL;
 		}
 		if (tp->t_flags & TF_NEEDFIN) {
 			tp->t_acktime = ticks;
 			tcp_state_change(tp, TCPS_FIN_WAIT_1);
 			tp->t_flags &= ~TF_NEEDFIN;
 		} else {
 			tcp_state_change(tp, TCPS_ESTABLISHED);
 			TCP_PROBE5(accept__established, NULL, tp,
 			    m, tp, th);
 			/*
 			 * TFO connections call cc_conn_init() during SYN
 			 * processing.  Calling it again here for such
 			 * connections is not harmless as it would undo the
 			 * snd_cwnd reduction that occurs when a TFO SYN|ACK
 			 * is retransmitted.
 			 */
 			if (!IS_FASTOPEN(tp->t_flags))
 				cc_conn_init(tp);
 			tcp_timer_activate(tp, TT_KEEP, TP_KEEPIDLE(tp));
 		}
 		/*
 		 * Account for the ACK of our SYN prior to
 		 * regular ACK processing below, except for
 		 * simultaneous SYN, which is handled later.
 		 */
 		if (SEQ_GT(th->th_ack, tp->snd_una) && !(tp->t_flags & TF_NEEDSYN))
 			incforsyn = 1;
 		/*
 		 * If segment contains data or ACK, will call tcp_reass()
 		 * later; if not, do so now to pass queued data to user.
 		 */
 		if (tlen == 0 && (thflags & TH_FIN) == 0) {
 			(void) tcp_reass(tp, (struct tcphdr *)0, NULL, 0,
 			    (struct mbuf *)0);
 			tcp_handle_wakeup(tp);
 		}
 		tp->snd_wl1 = th->th_seq - 1;
 		/* FALLTHROUGH */
 
 	/*
 	 * In ESTABLISHED state: drop duplicate ACKs; ACK out of range
 	 * ACKs.  If the ack is in the range
 	 *	tp->snd_una < th->th_ack <= tp->snd_max
 	 * then advance tp->snd_una to th->th_ack and drop
 	 * data from the retransmission queue.  If this ACK reflects
 	 * more up to date window information we update our window information.
 	 */
 	case TCPS_ESTABLISHED:
 	case TCPS_FIN_WAIT_1:
 	case TCPS_FIN_WAIT_2:
 	case TCPS_CLOSE_WAIT:
 	case TCPS_CLOSING:
 	case TCPS_LAST_ACK:
 		if (SEQ_GT(th->th_ack, tp->snd_max)) {
 			TCPSTAT_INC(tcps_rcvacktoomuch);
 			goto dropafterack;
 		}
 		if (tcp_is_sack_recovery(tp, &to)) {
 			if (((sack_changed = tcp_sack_doack(tp, &to, th->th_ack)) != 0) &&
 			    (tp->t_flags & TF_LRD)) {
 				tcp_sack_lost_retransmission(tp, th);
 			}
 		} else
 			/*
 			 * Reset the value so that previous (valid) value
 			 * from the last ack with SACK doesn't get used.
 			 */
 			tp->sackhint.sacked_bytes = 0;
 
 #ifdef TCP_HHOOK
 		/* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */
 		hhook_run_tcp_est_in(tp, th, &to);
 #endif
 
 		if (SEQ_LEQ(th->th_ack, tp->snd_una)) {
 			maxseg = tcp_maxseg(tp);
 			if (tlen == 0 &&
 			    (tiwin == tp->snd_wnd ||
 			    (tp->t_flags & TF_SACK_PERMIT))) {
 				/*
 				 * If this is the first time we've seen a
 				 * FIN from the remote, this is not a
 				 * duplicate and it needs to be processed
 				 * normally.  This happens during a
 				 * simultaneous close.
 				 */
 				if ((thflags & TH_FIN) &&
 				    (TCPS_HAVERCVDFIN(tp->t_state) == 0)) {
 					tp->t_dupacks = 0;
 					break;
 				}
 				TCPSTAT_INC(tcps_rcvdupack);
 				/*
 				 * If we have outstanding data (other than
 				 * a window probe), this is a completely
 				 * duplicate ack (ie, window info didn't
 				 * change and FIN isn't set),
 				 * the ack is the biggest we've
 				 * seen and we've seen exactly our rexmt
 				 * threshold of them, assume a packet
 				 * has been dropped and retransmit it.
 				 * Kludge snd_nxt & the congestion
 				 * window so we send only this one
 				 * packet.
 				 *
 				 * We know we're losing at the current
 				 * window size so do congestion avoidance
 				 * (set ssthresh to half the current window
 				 * and pull our congestion window back to
 				 * the new ssthresh).
 				 *
 				 * Dup acks mean that packets have left the
 				 * network (they're now cached at the receiver)
 				 * so bump cwnd by the amount in the receiver
 				 * to keep a constant cwnd packets in the
 				 * network.
 				 *
 				 * When using TCP ECN, notify the peer that
 				 * we reduced the cwnd.
 				 */
 				/*
 				 * Following 2 kinds of acks should not affect
 				 * dupack counting:
 				 * 1) Old acks
 				 * 2) Acks with SACK but without any new SACK
 				 * information in them. These could result from
 				 * any anomaly in the network like a switch
 				 * duplicating packets or a possible DoS attack.
 				 */
 				if (th->th_ack != tp->snd_una ||
 				    (tcp_is_sack_recovery(tp, &to) &&
-				    !sack_changed))
+				    (sack_changed == SACK_NOCHANGE)))
 					break;
 				else if (!tcp_timer_active(tp, TT_REXMT))
 					tp->t_dupacks = 0;
 				else if (++tp->t_dupacks > tcprexmtthresh ||
 				     IN_FASTRECOVERY(tp->t_flags)) {
 					cc_ack_received(tp, th, nsegs,
 					    CC_DUPACK);
 					if (V_tcp_do_prr &&
-					    IN_FASTRECOVERY(tp->t_flags)) {
-						tcp_do_prr_ack(tp, th, &to);
+					    IN_FASTRECOVERY(tp->t_flags) &&
+					    (tp->t_flags & TF_SACK_PERMIT)) {
+						tcp_do_prr_ack(tp, th, &to, sack_changed);
 					} else if (tcp_is_sack_recovery(tp, &to) &&
 					    IN_FASTRECOVERY(tp->t_flags)) {
 						int awnd;
 
 						/*
 						 * Compute the amount of data in flight first.
 						 * We can inject new data into the pipe iff
 						 * we have less than 1/2 the original window's
 						 * worth of data in flight.
 						 */
 						if (V_tcp_do_newsack)
 							awnd = tcp_compute_pipe(tp);
 						else
 							awnd = (tp->snd_nxt - tp->snd_fack) +
 								tp->sackhint.sack_bytes_rexmit;
 
 						if (awnd < tp->snd_ssthresh) {
 							tp->snd_cwnd += maxseg;
 							if (tp->snd_cwnd > tp->snd_ssthresh)
 								tp->snd_cwnd = tp->snd_ssthresh;
 						}
 					} else
 						tp->snd_cwnd += maxseg;
 					(void) tcp_output(tp);
 					goto drop;
 				} else if (tp->t_dupacks == tcprexmtthresh ||
 					    (tp->t_flags & TF_SACK_PERMIT &&
 					     V_tcp_do_newsack &&
 					     tp->sackhint.sacked_bytes >
 					     (tcprexmtthresh - 1) * maxseg)) {
 enter_recovery:
 					/*
 					 * Above is the RFC6675 trigger condition of
 					 * more than (dupthresh-1)*maxseg sacked data.
 					 * If the count of holes in the
 					 * scoreboard is >= dupthresh, we could
 					 * also enter loss recovery, but don't
 					 * have that value readily available.
 					 */
 					tp->t_dupacks = tcprexmtthresh;
 					tcp_seq onxt = tp->snd_nxt;
 
 					/*
 					 * If we're doing sack, or prr, check
 					 * to see if we're already in sack
 					 * recovery. If we're not doing sack,
 					 * check to see if we're in newreno
 					 * recovery.
 					 */
 					if (V_tcp_do_prr ||
 					    (tp->t_flags & TF_SACK_PERMIT)) {
 						if (IN_FASTRECOVERY(tp->t_flags)) {
 							tp->t_dupacks = 0;
 							break;
 						}
 					} else {
 						if (SEQ_LEQ(th->th_ack,
 						    tp->snd_recover)) {
 							tp->t_dupacks = 0;
 							break;
 						}
 					}
 					/* Congestion signal before ack. */
 					cc_cong_signal(tp, th, CC_NDUPACK);
 					cc_ack_received(tp, th, nsegs,
 					    CC_DUPACK);
 					tcp_timer_activate(tp, TT_REXMT, 0);
 					tp->t_rtttime = 0;
 					if (V_tcp_do_prr) {
 						/*
 						 * snd_ssthresh is already updated by
 						 * cc_cong_signal.
 						 */
 						if (tcp_is_sack_recovery(tp, &to)) {
+							/*
+							 * Exclude Limited Transmit
+							 * segments here
+							 */
 							tp->sackhint.prr_delivered =
-							    tp->sackhint.sacked_bytes;
+							    maxseg;
 						} else {
 							tp->sackhint.prr_delivered =
 							    imin(tp->snd_max - tp->snd_una,
 							    imin(INT_MAX / 65536,
 								tp->t_dupacks) * maxseg);
 						}
 						tp->sackhint.recover_fs = max(1,
 						    tp->snd_nxt - tp->snd_una);
 					}
 					if (tcp_is_sack_recovery(tp, &to)) {
 						TCPSTAT_INC(
 						    tcps_sack_recovery_episode);
 						tp->snd_recover = tp->snd_nxt;
 						tp->snd_cwnd = maxseg;
 						(void) tcp_output(tp);
 						if (SEQ_GT(th->th_ack, tp->snd_una))
 							goto resume_partialack;
 						goto drop;
 					}
 					tp->snd_nxt = th->th_ack;
 					tp->snd_cwnd = maxseg;
 					(void) tcp_output(tp);
 					KASSERT(tp->snd_limited <= 2,
 					    ("%s: tp->snd_limited too big",
 					    __func__));
 					tp->snd_cwnd = tp->snd_ssthresh +
 					     maxseg *
 					     (tp->t_dupacks - tp->snd_limited);
 					if (SEQ_GT(onxt, tp->snd_nxt))
 						tp->snd_nxt = onxt;
 					goto drop;
 				} else if (V_tcp_do_rfc3042) {
 					/*
 					 * Process first and second duplicate
 					 * ACKs. Each indicates a segment
 					 * leaving the network, creating room
 					 * for more. Make sure we can send a
 					 * packet on reception of each duplicate
 					 * ACK by increasing snd_cwnd by one
 					 * segment. Restore the original
 					 * snd_cwnd after packet transmission.
 					 */
 					cc_ack_received(tp, th, nsegs,
 					    CC_DUPACK);
 					uint32_t oldcwnd = tp->snd_cwnd;
 					tcp_seq oldsndmax = tp->snd_max;
 					u_int sent;
 					int avail;
 
 					KASSERT(tp->t_dupacks == 1 ||
 					    tp->t_dupacks == 2,
 					    ("%s: dupacks not 1 or 2",
 					    __func__));
 					if (tp->t_dupacks == 1)
 						tp->snd_limited = 0;
 					tp->snd_cwnd =
 					    (tp->snd_nxt - tp->snd_una) +
 					    (tp->t_dupacks - tp->snd_limited) *
 					    maxseg;
 					/*
 					 * Only call tcp_output when there
 					 * is new data available to be sent
 					 * or we need to send an ACK.
 					 */
 					SOCKBUF_LOCK(&so->so_snd);
 					avail = sbavail(&so->so_snd) -
 					    (tp->snd_nxt - tp->snd_una);
 					SOCKBUF_UNLOCK(&so->so_snd);
 					if (avail > 0 || tp->t_flags & TF_ACKNOW)
 						(void) tcp_output(tp);
 					sent = tp->snd_max - oldsndmax;
 					if (sent > maxseg) {
 						KASSERT((tp->t_dupacks == 2 &&
 						    tp->snd_limited == 0) ||
 						   (sent == maxseg + 1 &&
 						    tp->t_flags & TF_SENTFIN),
 						    ("%s: sent too much",
 						    __func__));
 						tp->snd_limited = 2;
 					} else if (sent > 0)
 						++tp->snd_limited;
 					tp->snd_cwnd = oldcwnd;
 					goto drop;
 				}
 			}
 			break;
 		} else {
 			/*
 			 * This ack is advancing the left edge, reset the
 			 * counter.
 			 */
 			tp->t_dupacks = 0;
 			/*
 			 * If this ack also has new SACK info, increment the
 			 * counter as per rfc6675. The variable
 			 * sack_changed tracks all changes to the SACK
 			 * scoreboard, including when partial ACKs without
 			 * SACK options are received, and clear the scoreboard
 			 * from the left side. Such partial ACKs should not be
 			 * counted as dupacks here.
 			 */
 			if (tcp_is_sack_recovery(tp, &to) &&
-			    sack_changed) {
+			    (sack_changed != SACK_NOCHANGE)) {
 				tp->t_dupacks++;
 				/* limit overhead by setting maxseg last */
 				if (!IN_FASTRECOVERY(tp->t_flags) &&
 				    (tp->sackhint.sacked_bytes >
 				    ((tcprexmtthresh - 1) *
 				    (maxseg = tcp_maxseg(tp))))) {
 					goto enter_recovery;
 				}
 			}
 		}
 
 resume_partialack:
 		KASSERT(SEQ_GT(th->th_ack, tp->snd_una),
 		    ("%s: th_ack <= snd_una", __func__));
 
 		/*
 		 * If the congestion window was inflated to account
 		 * for the other side's cached packets, retract it.
 		 */
 		if (IN_FASTRECOVERY(tp->t_flags)) {
 			if (SEQ_LT(th->th_ack, tp->snd_recover)) {
 				if (tp->t_flags & TF_SACK_PERMIT)
 					if (V_tcp_do_prr && to.to_flags & TOF_SACK) {
 						tcp_timer_activate(tp, TT_REXMT, 0);
 						tp->t_rtttime = 0;
-						tcp_do_prr_ack(tp, th, &to);
+						tcp_do_prr_ack(tp, th, &to, sack_changed);
 						tp->t_flags |= TF_ACKNOW;
 						(void) tcp_output(tp);
 					} else
 						tcp_sack_partialack(tp, th);
 				else
 					tcp_newreno_partial_ack(tp, th);
 			} else
 				cc_post_recovery(tp, th);
 		} else if (IN_CONGRECOVERY(tp->t_flags)) {
 			if (SEQ_LT(th->th_ack, tp->snd_recover)) {
 				if (V_tcp_do_prr) {
 					tp->sackhint.delivered_data = BYTES_THIS_ACK(tp, th);
 					tp->snd_fack = th->th_ack;
-					tcp_do_prr_ack(tp, th, &to);
+					/*
+					 * During ECN cwnd reduction
+					 * always use PRR-SSRB
+					 */
+					tcp_do_prr_ack(tp, th, &to, SACK_CHANGE);
 					(void) tcp_output(tp);
 				}
 			} else
 				cc_post_recovery(tp, th);
 		}
 		/*
 		 * If we reach this point, ACK is not a duplicate,
 		 *     i.e., it ACKs something we sent.
 		 */
 		if (tp->t_flags & TF_NEEDSYN) {
 			/*
 			 * T/TCP: Connection was half-synchronized, and our
 			 * SYN has been ACK'd (so connection is now fully
 			 * synchronized).  Go to non-starred state,
 			 * increment snd_una for ACK of SYN, and check if
 			 * we can do window scaling.
 			 */
 			tp->t_flags &= ~TF_NEEDSYN;
 			tp->snd_una++;
 			/* Do window scaling? */
 			if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
 				(TF_RCVD_SCALE|TF_REQ_SCALE)) {
 				tp->rcv_scale = tp->request_r_scale;
 				/* Send window already scaled. */
 			}
 		}
 
 process_ACK:
 		INP_WLOCK_ASSERT(inp);
 
 		/*
 		 * Adjust for the SYN bit in sequence space,
 		 * but don't account for it in cwnd calculations.
 		 * This is for the SYN_RECEIVED, non-simultaneous
 		 * SYN case. SYN_SENT and simultaneous SYN are
 		 * treated elsewhere.
 		 */
 		if (incforsyn)
 			tp->snd_una++;
 		acked = BYTES_THIS_ACK(tp, th);
 		KASSERT(acked >= 0, ("%s: acked unexepectedly negative "
 		    "(tp->snd_una=%u, th->th_ack=%u, tp=%p, m=%p)", __func__,
 		    tp->snd_una, th->th_ack, tp, m));
 		TCPSTAT_ADD(tcps_rcvackpack, nsegs);
 		TCPSTAT_ADD(tcps_rcvackbyte, acked);
 
 		/*
 		 * If we just performed our first retransmit, and the ACK
 		 * arrives within our recovery window, then it was a mistake
 		 * to do the retransmit in the first place.  Recover our
 		 * original cwnd and ssthresh, and proceed to transmit where
 		 * we left off.
 		 */
 		if (tp->t_rxtshift == 1 &&
 		    tp->t_flags & TF_PREVVALID &&
 		    tp->t_badrxtwin != 0 &&
 		    to.to_flags & TOF_TS &&
 		    to.to_tsecr != 0 &&
 		    TSTMP_LT(to.to_tsecr, tp->t_badrxtwin))
 			cc_cong_signal(tp, th, CC_RTO_ERR);
 
 		/*
 		 * If we have a timestamp reply, update smoothed
 		 * round trip time.  If no timestamp is present but
 		 * transmit timer is running and timed sequence
 		 * number was acked, update smoothed round trip time.
 		 * Since we now have an rtt measurement, cancel the
 		 * timer backoff (cf., Phil Karn's retransmit alg.).
 		 * Recompute the initial retransmit timer.
 		 *
 		 * Some boxes send broken timestamp replies
 		 * during the SYN+ACK phase, ignore
 		 * timestamps of 0 or we could calculate a
 		 * huge RTT and blow up the retransmit timer.
 		 */
 		if ((to.to_flags & TOF_TS) != 0 && to.to_tsecr) {
 			uint32_t t;
 
 			t = tcp_ts_getticks() - to.to_tsecr;
 			if (!tp->t_rttlow || tp->t_rttlow > t)
 				tp->t_rttlow = t;
 			tcp_xmit_timer(tp, TCP_TS_TO_TICKS(t) + 1);
 		} else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq)) {
 			if (!tp->t_rttlow || tp->t_rttlow > ticks - tp->t_rtttime)
 				tp->t_rttlow = ticks - tp->t_rtttime;
 			tcp_xmit_timer(tp, ticks - tp->t_rtttime);
 		}
 
 		SOCKBUF_LOCK(&so->so_snd);
 		/*
 		 * Clear t_acktime if remote side has ACKd all data in the
 		 * socket buffer and FIN (if applicable).
 		 * Otherwise, update t_acktime if we received a sufficiently
 		 * large ACK.
 		 */
 		if ((tp->t_state <= TCPS_CLOSE_WAIT &&
 		    acked == sbavail(&so->so_snd)) ||
 		    acked > sbavail(&so->so_snd))
 			tp->t_acktime = 0;
 		else if (acked > 1)
 			tp->t_acktime = ticks;
 
 		/*
 		 * If all outstanding data is acked, stop retransmit
 		 * timer and remember to restart (more output or persist).
 		 * If there is more data to be acked, restart retransmit
 		 * timer, using current (possibly backed-off) value.
 		 */
 		if (th->th_ack == tp->snd_max) {
 			tcp_timer_activate(tp, TT_REXMT, 0);
 			needoutput = 1;
 		} else if (!tcp_timer_active(tp, TT_PERSIST))
 			tcp_timer_activate(tp, TT_REXMT, TP_RXTCUR(tp));
 
 		/*
 		 * If no data (only SYN) was ACK'd,
 		 *    skip rest of ACK processing.
 		 */
 		if (acked == 0) {
 			SOCKBUF_UNLOCK(&so->so_snd);
 			goto step6;
 		}
 
 		/*
 		 * Let the congestion control algorithm update congestion
 		 * control related information. This typically means increasing
 		 * the congestion window.
 		 */
 		cc_ack_received(tp, th, nsegs, CC_ACK);
 
 		if (acked > sbavail(&so->so_snd)) {
 			if (tp->snd_wnd >= sbavail(&so->so_snd))
 				tp->snd_wnd -= sbavail(&so->so_snd);
 			else
 				tp->snd_wnd = 0;
 			mfree = sbcut_locked(&so->so_snd,
 			    (int)sbavail(&so->so_snd));
 			ourfinisacked = 1;
 		} else {
 			mfree = sbcut_locked(&so->so_snd, acked);
 			if (tp->snd_wnd >= (uint32_t) acked)
 				tp->snd_wnd -= acked;
 			else
 				tp->snd_wnd = 0;
 			ourfinisacked = 0;
 		}
 		/* NB: sowwakeup_locked() does an implicit unlock. */
 		sowwakeup_locked(so);
 		m_freem(mfree);
 		/* Detect una wraparound. */
 		if (!IN_RECOVERY(tp->t_flags) &&
 		    SEQ_GT(tp->snd_una, tp->snd_recover) &&
 		    SEQ_LEQ(th->th_ack, tp->snd_recover))
 			tp->snd_recover = th->th_ack - 1;
 		/* XXXLAS: Can this be moved up into cc_post_recovery? */
 		if (IN_RECOVERY(tp->t_flags) &&
 		    SEQ_GEQ(th->th_ack, tp->snd_recover)) {
 			EXIT_RECOVERY(tp->t_flags);
 		}
 		tp->snd_una = th->th_ack;
 		if (tp->t_flags & TF_SACK_PERMIT) {
 			if (SEQ_GT(tp->snd_una, tp->snd_recover))
 				tp->snd_recover = tp->snd_una;
 		}
 		if (SEQ_LT(tp->snd_nxt, tp->snd_una))
 			tp->snd_nxt = tp->snd_una;
 
 		switch (tp->t_state) {
 		/*
 		 * In FIN_WAIT_1 STATE in addition to the processing
 		 * for the ESTABLISHED state if our FIN is now acknowledged
 		 * then enter FIN_WAIT_2.
 		 */
 		case TCPS_FIN_WAIT_1:
 			if (ourfinisacked) {
 				/*
 				 * If we can't receive any more
 				 * data, then closing user can proceed.
 				 * Starting the timer is contrary to the
 				 * specification, but if we don't get a FIN
 				 * we'll hang forever.
 				 */
 				if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
 					soisdisconnected(so);
 					tcp_timer_activate(tp, TT_2MSL,
 					    (tcp_fast_finwait2_recycle ?
 					    tcp_finwait2_timeout :
 					    TP_MAXIDLE(tp)));
 				}
 				tcp_state_change(tp, TCPS_FIN_WAIT_2);
 			}
 			break;
 
 		/*
 		 * In CLOSING STATE in addition to the processing for
 		 * the ESTABLISHED state if the ACK acknowledges our FIN
 		 * then enter the TIME-WAIT state, otherwise ignore
 		 * the segment.
 		 */
 		case TCPS_CLOSING:
 			if (ourfinisacked) {
 				tcp_twstart(tp);
 				m_freem(m);
 				return;
 			}
 			break;
 
 		/*
 		 * In LAST_ACK, we may still be waiting for data to drain
 		 * and/or to be acked, as well as for the ack of our FIN.
 		 * If our FIN is now acknowledged, delete the TCB,
 		 * enter the closed state and return.
 		 */
 		case TCPS_LAST_ACK:
 			if (ourfinisacked) {
 				tp = tcp_close(tp);
 				goto drop;
 			}
 			break;
 		}
 	}
 
 step6:
 	INP_WLOCK_ASSERT(inp);
 
 	/*
 	 * Update window information.
 	 * Don't look at window if no ACK: TAC's send garbage on first SYN.
 	 */
 	if ((thflags & TH_ACK) &&
 	    (SEQ_LT(tp->snd_wl1, th->th_seq) ||
 	    (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) ||
 	     (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) {
 		/* keep track of pure window updates */
 		if (tlen == 0 &&
 		    tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd)
 			TCPSTAT_INC(tcps_rcvwinupd);
 		tp->snd_wnd = tiwin;
 		tp->snd_wl1 = th->th_seq;
 		tp->snd_wl2 = th->th_ack;
 		if (tp->snd_wnd > tp->max_sndwnd)
 			tp->max_sndwnd = tp->snd_wnd;
 		needoutput = 1;
 	}
 
 	/*
 	 * Process segments with URG.
 	 */
 	if ((thflags & TH_URG) && th->th_urp &&
 	    TCPS_HAVERCVDFIN(tp->t_state) == 0) {
 		/*
 		 * This is a kludge, but if we receive and accept
 		 * random urgent pointers, we'll crash in
 		 * soreceive.  It's hard to imagine someone
 		 * actually wanting to send this much urgent data.
 		 */
 		SOCKBUF_LOCK(&so->so_rcv);
 		if (th->th_urp + sbavail(&so->so_rcv) > sb_max) {
 			th->th_urp = 0;			/* XXX */
 			thflags &= ~TH_URG;		/* XXX */
 			SOCKBUF_UNLOCK(&so->so_rcv);	/* XXX */
 			goto dodata;			/* XXX */
 		}
 		/*
 		 * If this segment advances the known urgent pointer,
 		 * then mark the data stream.  This should not happen
 		 * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since
 		 * a FIN has been received from the remote side.
 		 * In these states we ignore the URG.
 		 *
 		 * According to RFC961 (Assigned Protocols),
 		 * the urgent pointer points to the last octet
 		 * of urgent data.  We continue, however,
 		 * to consider it to indicate the first octet
 		 * of data past the urgent section as the original
 		 * spec states (in one of two places).
 		 */
 		if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) {
 			tp->rcv_up = th->th_seq + th->th_urp;
 			so->so_oobmark = sbavail(&so->so_rcv) +
 			    (tp->rcv_up - tp->rcv_nxt) - 1;
 			if (so->so_oobmark == 0)
 				so->so_rcv.sb_state |= SBS_RCVATMARK;
 			sohasoutofband(so);
 			tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA);
 		}
 		SOCKBUF_UNLOCK(&so->so_rcv);
 		/*
 		 * Remove out of band data so doesn't get presented to user.
 		 * This can happen independent of advancing the URG pointer,
 		 * but if two URG's are pending at once, some out-of-band
 		 * data may creep in... ick.
 		 */
 		if (th->th_urp <= (uint32_t)tlen &&
 		    !(so->so_options & SO_OOBINLINE)) {
 			/* hdr drop is delayed */
 			tcp_pulloutofband(so, th, m, drop_hdrlen);
 		}
 	} else {
 		/*
 		 * If no out of band data is expected,
 		 * pull receive urgent pointer along
 		 * with the receive window.
 		 */
 		if (SEQ_GT(tp->rcv_nxt, tp->rcv_up))
 			tp->rcv_up = tp->rcv_nxt;
 	}
 dodata:							/* XXX */
 	INP_WLOCK_ASSERT(inp);
 
 	/*
 	 * Process the segment text, merging it into the TCP sequencing queue,
 	 * and arranging for acknowledgment of receipt if necessary.
 	 * This process logically involves adjusting tp->rcv_wnd as data
 	 * is presented to the user (this happens in tcp_usrreq.c,
 	 * case PRU_RCVD).  If a FIN has already been received on this
 	 * connection then we just ignore the text.
 	 */
 	tfo_syn = ((tp->t_state == TCPS_SYN_RECEIVED) &&
 		   IS_FASTOPEN(tp->t_flags));
 	if ((tlen || (thflags & TH_FIN) || (tfo_syn && tlen > 0)) &&
 	    TCPS_HAVERCVDFIN(tp->t_state) == 0) {
 		tcp_seq save_start = th->th_seq;
 		tcp_seq save_rnxt  = tp->rcv_nxt;
 		int     save_tlen  = tlen;
 		m_adj(m, drop_hdrlen);	/* delayed header drop */
 		/*
 		 * Insert segment which includes th into TCP reassembly queue
 		 * with control block tp.  Set thflags to whether reassembly now
 		 * includes a segment with FIN.  This handles the common case
 		 * inline (segment is the next to be received on an established
 		 * connection, and the queue is empty), avoiding linkage into
 		 * and removal from the queue and repetition of various
 		 * conversions.
 		 * Set DELACK for segments received in order, but ack
 		 * immediately when segments are out of order (so
 		 * fast retransmit can work).
 		 */
 		if (th->th_seq == tp->rcv_nxt &&
 		    SEGQ_EMPTY(tp) &&
 		    (TCPS_HAVEESTABLISHED(tp->t_state) ||
 		     tfo_syn)) {
 			if (DELAY_ACK(tp, tlen) || tfo_syn)
 				tp->t_flags |= TF_DELACK;
 			else
 				tp->t_flags |= TF_ACKNOW;
 			tp->rcv_nxt += tlen;
 			if (tlen &&
 			    ((tp->t_flags2 & TF2_FBYTES_COMPLETE) == 0) &&
 			    (tp->t_fbyte_in == 0)) {
 				tp->t_fbyte_in = ticks;
 				if (tp->t_fbyte_in == 0)
 					tp->t_fbyte_in = 1;
 				if (tp->t_fbyte_out && tp->t_fbyte_in)
 					tp->t_flags2 |= TF2_FBYTES_COMPLETE;
 			}
 			thflags = tcp_get_flags(th) & TH_FIN;
 			TCPSTAT_INC(tcps_rcvpack);
 			TCPSTAT_ADD(tcps_rcvbyte, tlen);
 			SOCKBUF_LOCK(&so->so_rcv);
 			if (so->so_rcv.sb_state & SBS_CANTRCVMORE)
 				m_freem(m);
 			else
 				sbappendstream_locked(&so->so_rcv, m, 0);
 			tp->t_flags |= TF_WAKESOR;
 		} else {
 			/*
 			 * XXX: Due to the header drop above "th" is
 			 * theoretically invalid by now.  Fortunately
 			 * m_adj() doesn't actually frees any mbufs
 			 * when trimming from the head.
 			 */
 			tcp_seq temp = save_start;
 
 			thflags = tcp_reass(tp, th, &temp, &tlen, m);
 			tp->t_flags |= TF_ACKNOW;
 		}
 		if ((tp->t_flags & TF_SACK_PERMIT) &&
 		    (save_tlen > 0) &&
 		    TCPS_HAVEESTABLISHED(tp->t_state)) {
 			if ((tlen == 0) && (SEQ_LT(save_start, save_rnxt))) {
 				/*
 				 * DSACK actually handled in the fastpath
 				 * above.
 				 */
 				tcp_update_sack_list(tp, save_start,
 				    save_start + save_tlen);
 			} else if ((tlen > 0) && SEQ_GT(tp->rcv_nxt, save_rnxt)) {
 				if ((tp->rcv_numsacks >= 1) &&
 				    (tp->sackblks[0].end == save_start)) {
 					/*
 					 * Partial overlap, recorded at todrop
 					 * above.
 					 */
 					tcp_update_sack_list(tp,
 					    tp->sackblks[0].start,
 					    tp->sackblks[0].end);
 				} else {
 					tcp_update_dsack_list(tp, save_start,
 					    save_start + save_tlen);
 				}
 			} else if (tlen >= save_tlen) {
 				/* Update of sackblks. */
 				tcp_update_dsack_list(tp, save_start,
 				    save_start + save_tlen);
 			} else if (tlen > 0) {
 				tcp_update_dsack_list(tp, save_start,
 				    save_start + tlen);
 			}
 		}
 		tcp_handle_wakeup(tp);
 #if 0
 		/*
 		 * Note the amount of data that peer has sent into
 		 * our window, in order to estimate the sender's
 		 * buffer size.
 		 * XXX: Unused.
 		 */
 		if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt))
 			len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt);
 		else
 			len = so->so_rcv.sb_hiwat;
 #endif
 	} else {
 		m_freem(m);
 		thflags &= ~TH_FIN;
 	}
 
 	/*
 	 * If FIN is received ACK the FIN and let the user know
 	 * that the connection is closing.
 	 */
 	if (thflags & TH_FIN) {
 		if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
 			/* The socket upcall is handled by socantrcvmore. */
 			socantrcvmore(so);
 			/*
 			 * If connection is half-synchronized
 			 * (ie NEEDSYN flag on) then delay ACK,
 			 * so it may be piggybacked when SYN is sent.
 			 * Otherwise, since we received a FIN then no
 			 * more input can be expected, send ACK now.
 			 */
 			if (tp->t_flags & TF_NEEDSYN)
 				tp->t_flags |= TF_DELACK;
 			else
 				tp->t_flags |= TF_ACKNOW;
 			tp->rcv_nxt++;
 		}
 		switch (tp->t_state) {
 		/*
 		 * In SYN_RECEIVED and ESTABLISHED STATES
 		 * enter the CLOSE_WAIT state.
 		 */
 		case TCPS_SYN_RECEIVED:
 			tp->t_starttime = ticks;
 			/* FALLTHROUGH */
 		case TCPS_ESTABLISHED:
 			tcp_state_change(tp, TCPS_CLOSE_WAIT);
 			break;
 
 		/*
 		 * If still in FIN_WAIT_1 STATE FIN has not been acked so
 		 * enter the CLOSING state.
 		 */
 		case TCPS_FIN_WAIT_1:
 			tcp_state_change(tp, TCPS_CLOSING);
 			break;
 
 		/*
 		 * In FIN_WAIT_2 state enter the TIME_WAIT state,
 		 * starting the time-wait timer, turning off the other
 		 * standard timers.
 		 */
 		case TCPS_FIN_WAIT_2:
 			tcp_twstart(tp);
 			return;
 		}
 	}
 	TCP_PROBE3(debug__input, tp, th, m);
 
 	/*
 	 * Return any desired output.
 	 */
 	if (needoutput || (tp->t_flags & TF_ACKNOW))
 		(void) tcp_output(tp);
 
 check_delack:
 	INP_WLOCK_ASSERT(inp);
 
 	if (tp->t_flags & TF_DELACK) {
 		tp->t_flags &= ~TF_DELACK;
 		tcp_timer_activate(tp, TT_DELACK, tcp_delacktime);
 	}
 	INP_WUNLOCK(inp);
 	return;
 
 dropafterack:
 	/*
 	 * Generate an ACK dropping incoming segment if it occupies
 	 * sequence space, where the ACK reflects our state.
 	 *
 	 * We can now skip the test for the RST flag since all
 	 * paths to this code happen after packets containing
 	 * RST have been dropped.
 	 *
 	 * In the SYN-RECEIVED state, don't send an ACK unless the
 	 * segment we received passes the SYN-RECEIVED ACK test.
 	 * If it fails send a RST.  This breaks the loop in the
 	 * "LAND" DoS attack, and also prevents an ACK storm
 	 * between two listening ports that have been sent forged
 	 * SYN segments, each with the source address of the other.
 	 */
 	if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) &&
 	    (SEQ_GT(tp->snd_una, th->th_ack) ||
 	     SEQ_GT(th->th_ack, tp->snd_max)) ) {
 		rstreason = BANDLIM_RST_OPENPORT;
 		tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
 		goto dropwithreset;
 	}
 	TCP_PROBE3(debug__input, tp, th, m);
 	tp->t_flags |= TF_ACKNOW;
 	(void) tcp_output(tp);
 	INP_WUNLOCK(inp);
 	m_freem(m);
 	return;
 
 dropwithreset:
 	if (tp != NULL) {
 		tcp_dropwithreset(m, th, tp, tlen, rstreason);
 		INP_WUNLOCK(inp);
 	} else
 		tcp_dropwithreset(m, th, NULL, tlen, rstreason);
 	return;
 
 drop:
 	/*
 	 * Drop space held by incoming segment and return.
 	 */
 	TCP_PROBE3(debug__input, tp, th, m);
 	if (tp != NULL) {
 		INP_WUNLOCK(inp);
 	}
 	m_freem(m);
 }
 
 /*
  * Issue RST and make ACK acceptable to originator of segment.
  * The mbuf must still include the original packet header.
  * tp may be NULL.
  */
 void
 tcp_dropwithreset(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp,
     int tlen, int rstreason)
 {
 #ifdef INET
 	struct ip *ip;
 #endif
 #ifdef INET6
 	struct ip6_hdr *ip6;
 #endif
 
 	if (tp != NULL) {
 		INP_LOCK_ASSERT(tptoinpcb(tp));
 	}
 
 	/* Don't bother if destination was broadcast/multicast. */
 	if ((tcp_get_flags(th) & TH_RST) || m->m_flags & (M_BCAST|M_MCAST))
 		goto drop;
 #ifdef INET6
 	if (mtod(m, struct ip *)->ip_v == 6) {
 		ip6 = mtod(m, struct ip6_hdr *);
 		if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) ||
 		    IN6_IS_ADDR_MULTICAST(&ip6->ip6_src))
 			goto drop;
 		/* IPv6 anycast check is done at tcp6_input() */
 	}
 #endif
 #if defined(INET) && defined(INET6)
 	else
 #endif
 #ifdef INET
 	{
 		ip = mtod(m, struct ip *);
 		if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) ||
 		    IN_MULTICAST(ntohl(ip->ip_src.s_addr)) ||
 		    ip->ip_src.s_addr == htonl(INADDR_BROADCAST) ||
 		    in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif))
 			goto drop;
 	}
 #endif
 
 	/* Perform bandwidth limiting. */
 	if (badport_bandlim(rstreason) < 0)
 		goto drop;
 
 	/* tcp_respond consumes the mbuf chain. */
 	if (tcp_get_flags(th) & TH_ACK) {
 		tcp_respond(tp, mtod(m, void *), th, m, (tcp_seq)0,
 		    th->th_ack, TH_RST);
 	} else {
 		if (tcp_get_flags(th) & TH_SYN)
 			tlen++;
 		if (tcp_get_flags(th) & TH_FIN)
 			tlen++;
 		tcp_respond(tp, mtod(m, void *), th, m, th->th_seq+tlen,
 		    (tcp_seq)0, TH_RST|TH_ACK);
 	}
 	return;
 drop:
 	m_freem(m);
 }
 
 /*
  * Parse TCP options and place in tcpopt.
  */
 void
 tcp_dooptions(struct tcpopt *to, u_char *cp, int cnt, int flags)
 {
 	int opt, optlen;
 
 	to->to_flags = 0;
 	for (; cnt > 0; cnt -= optlen, cp += optlen) {
 		opt = cp[0];
 		if (opt == TCPOPT_EOL)
 			break;
 		if (opt == TCPOPT_NOP)
 			optlen = 1;
 		else {
 			if (cnt < 2)
 				break;
 			optlen = cp[1];
 			if (optlen < 2 || optlen > cnt)
 				break;
 		}
 		switch (opt) {
 		case TCPOPT_MAXSEG:
 			if (optlen != TCPOLEN_MAXSEG)
 				continue;
 			if (!(flags & TO_SYN))
 				continue;
 			to->to_flags |= TOF_MSS;
 			bcopy((char *)cp + 2,
 			    (char *)&to->to_mss, sizeof(to->to_mss));
 			to->to_mss = ntohs(to->to_mss);
 			break;
 		case TCPOPT_WINDOW:
 			if (optlen != TCPOLEN_WINDOW)
 				continue;
 			if (!(flags & TO_SYN))
 				continue;
 			to->to_flags |= TOF_SCALE;
 			to->to_wscale = min(cp[2], TCP_MAX_WINSHIFT);
 			break;
 		case TCPOPT_TIMESTAMP:
 			if (optlen != TCPOLEN_TIMESTAMP)
 				continue;
 			to->to_flags |= TOF_TS;
 			bcopy((char *)cp + 2,
 			    (char *)&to->to_tsval, sizeof(to->to_tsval));
 			to->to_tsval = ntohl(to->to_tsval);
 			bcopy((char *)cp + 6,
 			    (char *)&to->to_tsecr, sizeof(to->to_tsecr));
 			to->to_tsecr = ntohl(to->to_tsecr);
 			break;
 		case TCPOPT_SIGNATURE:
 			/*
 			 * In order to reply to a host which has set the
 			 * TCP_SIGNATURE option in its initial SYN, we have
 			 * to record the fact that the option was observed
 			 * here for the syncache code to perform the correct
 			 * response.
 			 */
 			if (optlen != TCPOLEN_SIGNATURE)
 				continue;
 			to->to_flags |= TOF_SIGNATURE;
 			to->to_signature = cp + 2;
 			break;
 		case TCPOPT_SACK_PERMITTED:
 			if (optlen != TCPOLEN_SACK_PERMITTED)
 				continue;
 			if (!(flags & TO_SYN))
 				continue;
 			if (!V_tcp_do_sack)
 				continue;
 			to->to_flags |= TOF_SACKPERM;
 			break;
 		case TCPOPT_SACK:
 			if (optlen <= 2 || (optlen - 2) % TCPOLEN_SACK != 0)
 				continue;
 			if (flags & TO_SYN)
 				continue;
 			to->to_flags |= TOF_SACK;
 			to->to_nsacks = (optlen - 2) / TCPOLEN_SACK;
 			to->to_sacks = cp + 2;
 			TCPSTAT_INC(tcps_sack_rcv_blocks);
 			break;
 		case TCPOPT_FAST_OPEN:
 			/*
 			 * Cookie length validation is performed by the
 			 * server side cookie checking code or the client
 			 * side cookie cache update code.
 			 */
 			if (!(flags & TO_SYN))
 				continue;
 			if (!V_tcp_fastopen_client_enable &&
 			    !V_tcp_fastopen_server_enable)
 				continue;
 			to->to_flags |= TOF_FASTOPEN;
 			to->to_tfo_len = optlen - 2;
 			to->to_tfo_cookie = to->to_tfo_len ? cp + 2 : NULL;
 			break;
 		default:
 			continue;
 		}
 	}
 }
 
 /*
  * Pull out of band byte out of a segment so
  * it doesn't appear in the user's data queue.
  * It is still reflected in the segment length for
  * sequencing purposes.
  */
 void
 tcp_pulloutofband(struct socket *so, struct tcphdr *th, struct mbuf *m,
     int off)
 {
 	int cnt = off + th->th_urp - 1;
 
 	while (cnt >= 0) {
 		if (m->m_len > cnt) {
 			char *cp = mtod(m, caddr_t) + cnt;
 			struct tcpcb *tp = sototcpcb(so);
 
 			INP_WLOCK_ASSERT(tptoinpcb(tp));
 
 			tp->t_iobc = *cp;
 			tp->t_oobflags |= TCPOOB_HAVEDATA;
 			bcopy(cp+1, cp, (unsigned)(m->m_len - cnt - 1));
 			m->m_len--;
 			if (m->m_flags & M_PKTHDR)
 				m->m_pkthdr.len--;
 			return;
 		}
 		cnt -= m->m_len;
 		m = m->m_next;
 		if (m == NULL)
 			break;
 	}
 	panic("tcp_pulloutofband");
 }
 
 /*
  * Collect new round-trip time estimate
  * and update averages and current timeout.
  */
 void
 tcp_xmit_timer(struct tcpcb *tp, int rtt)
 {
 	int delta;
 
 	INP_WLOCK_ASSERT(tptoinpcb(tp));
 
 	TCPSTAT_INC(tcps_rttupdated);
 	if (tp->t_rttupdated < UCHAR_MAX)
 		tp->t_rttupdated++;
 #ifdef STATS
 	stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT,
 	    imax(0, rtt * 1000 / hz));
 #endif
 	if ((tp->t_srtt != 0) && (tp->t_rxtshift <= TCP_RTT_INVALIDATE)) {
 		/*
 		 * srtt is stored as fixed point with 5 bits after the
 		 * binary point (i.e., scaled by 8).  The following magic
 		 * is equivalent to the smoothing algorithm in rfc793 with
 		 * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed
 		 * point).  Adjust rtt to origin 0.
 		 */
 		delta = ((rtt - 1) << TCP_DELTA_SHIFT)
 			- (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT));
 
 		if ((tp->t_srtt += delta) <= 0)
 			tp->t_srtt = 1;
 
 		/*
 		 * We accumulate a smoothed rtt variance (actually, a
 		 * smoothed mean difference), then set the retransmit
 		 * timer to smoothed rtt + 4 times the smoothed variance.
 		 * rttvar is stored as fixed point with 4 bits after the
 		 * binary point (scaled by 16).  The following is
 		 * equivalent to rfc793 smoothing with an alpha of .75
 		 * (rttvar = rttvar*3/4 + |delta| / 4).  This replaces
 		 * rfc793's wired-in beta.
 		 */
 		if (delta < 0)
 			delta = -delta;
 		delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT);
 		if ((tp->t_rttvar += delta) <= 0)
 			tp->t_rttvar = 1;
 	} else {
 		/*
 		 * No rtt measurement yet - use the unsmoothed rtt.
 		 * Set the variance to half the rtt (so our first
 		 * retransmit happens at 3*rtt).
 		 */
 		tp->t_srtt = rtt << TCP_RTT_SHIFT;
 		tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1);
 	}
 	tp->t_rtttime = 0;
 	tp->t_rxtshift = 0;
 
 	/*
 	 * the retransmit should happen at rtt + 4 * rttvar.
 	 * Because of the way we do the smoothing, srtt and rttvar
 	 * will each average +1/2 tick of bias.  When we compute
 	 * the retransmit timer, we want 1/2 tick of rounding and
 	 * 1 extra tick because of +-1/2 tick uncertainty in the
 	 * firing of the timer.  The bias will give us exactly the
 	 * 1.5 tick we need.  But, because the bias is
 	 * statistical, we have to test that we don't drop below
 	 * the minimum feasible timer (which is 2 ticks).
 	 */
 	TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp),
 		      max(tp->t_rttmin, rtt + 2), TCPTV_REXMTMAX);
 
 	/*
 	 * We received an ack for a packet that wasn't retransmitted;
 	 * it is probably safe to discard any error indications we've
 	 * received recently.  This isn't quite right, but close enough
 	 * for now (a route might have failed after we sent a segment,
 	 * and the return path might not be symmetrical).
 	 */
 	tp->t_softerror = 0;
 }
 
 /*
  * Determine a reasonable value for maxseg size.
  * If the route is known, check route for mtu.
  * If none, use an mss that can be handled on the outgoing interface
  * without forcing IP to fragment.  If no route is found, route has no mtu,
  * or the destination isn't local, use a default, hopefully conservative
  * size (usually 512 or the default IP max size, but no more than the mtu
  * of the interface), as we can't discover anything about intervening
  * gateways or networks.  We also initialize the congestion/slow start
  * window to be a single segment if the destination isn't local.
  * While looking at the routing entry, we also initialize other path-dependent
  * parameters from pre-set or cached values in the routing entry.
  *
  * NOTE that resulting t_maxseg doesn't include space for TCP options or
  * IP options, e.g. IPSEC data, since length of this data may vary, and
  * thus it is calculated for every segment separately in tcp_output().
  *
  * NOTE that this routine is only called when we process an incoming
  * segment, or an ICMP need fragmentation datagram. Outgoing SYN/ACK MSS
  * settings are handled in tcp_mssopt().
  */
 void
 tcp_mss_update(struct tcpcb *tp, int offer, int mtuoffer,
     struct hc_metrics_lite *metricptr, struct tcp_ifcap *cap)
 {
 	int mss = 0;
 	uint32_t maxmtu = 0;
 	struct inpcb *inp = tptoinpcb(tp);
 	struct hc_metrics_lite metrics;
 #ifdef INET6
 	int isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0;
 	size_t min_protoh = isipv6 ?
 			    sizeof (struct ip6_hdr) + sizeof (struct tcphdr) :
 			    sizeof (struct tcpiphdr);
 #else
 	 size_t min_protoh = sizeof(struct tcpiphdr);
 #endif
 
 	INP_WLOCK_ASSERT(inp);
 
 	if (tp->t_port)
 		min_protoh += V_tcp_udp_tunneling_overhead;
 	if (mtuoffer != -1) {
 		KASSERT(offer == -1, ("%s: conflict", __func__));
 		offer = mtuoffer - min_protoh;
 	}
 
 	/* Initialize. */
 #ifdef INET6
 	if (isipv6) {
 		maxmtu = tcp_maxmtu6(&inp->inp_inc, cap);
 		tp->t_maxseg = V_tcp_v6mssdflt;
 	}
 #endif
 #if defined(INET) && defined(INET6)
 	else
 #endif
 #ifdef INET
 	{
 		maxmtu = tcp_maxmtu(&inp->inp_inc, cap);
 		tp->t_maxseg = V_tcp_mssdflt;
 	}
 #endif
 
 	/*
 	 * No route to sender, stay with default mss and return.
 	 */
 	if (maxmtu == 0) {
 		/*
 		 * In case we return early we need to initialize metrics
 		 * to a defined state as tcp_hc_get() would do for us
 		 * if there was no cache hit.
 		 */
 		if (metricptr != NULL)
 			bzero(metricptr, sizeof(struct hc_metrics_lite));
 		return;
 	}
 
 	/* What have we got? */
 	switch (offer) {
 		case 0:
 			/*
 			 * Offer == 0 means that there was no MSS on the SYN
 			 * segment, in this case we use tcp_mssdflt as
 			 * already assigned to t_maxseg above.
 			 */
 			offer = tp->t_maxseg;
 			break;
 
 		case -1:
 			/*
 			 * Offer == -1 means that we didn't receive SYN yet.
 			 */
 			/* FALLTHROUGH */
 
 		default:
 			/*
 			 * Prevent DoS attack with too small MSS. Round up
 			 * to at least minmss.
 			 */
 			offer = max(offer, V_tcp_minmss);
 	}
 
 	/*
 	 * rmx information is now retrieved from tcp_hostcache.
 	 */
 	tcp_hc_get(&inp->inp_inc, &metrics);
 	if (metricptr != NULL)
 		bcopy(&metrics, metricptr, sizeof(struct hc_metrics_lite));
 
 	/*
 	 * If there's a discovered mtu in tcp hostcache, use it.
 	 * Else, use the link mtu.
 	 */
 	if (metrics.rmx_mtu)
 		mss = min(metrics.rmx_mtu, maxmtu) - min_protoh;
 	else {
 #ifdef INET6
 		if (isipv6) {
 			mss = maxmtu - min_protoh;
 			if (!V_path_mtu_discovery &&
 			    !in6_localaddr(&inp->in6p_faddr))
 				mss = min(mss, V_tcp_v6mssdflt);
 		}
 #endif
 #if defined(INET) && defined(INET6)
 		else
 #endif
 #ifdef INET
 		{
 			mss = maxmtu - min_protoh;
 			if (!V_path_mtu_discovery &&
 			    !in_localaddr(inp->inp_faddr))
 				mss = min(mss, V_tcp_mssdflt);
 		}
 #endif
 		/*
 		 * XXX - The above conditional (mss = maxmtu - min_protoh)
 		 * probably violates the TCP spec.
 		 * The problem is that, since we don't know the
 		 * other end's MSS, we are supposed to use a conservative
 		 * default.  But, if we do that, then MTU discovery will
 		 * never actually take place, because the conservative
 		 * default is much less than the MTUs typically seen
 		 * on the Internet today.  For the moment, we'll sweep
 		 * this under the carpet.
 		 *
 		 * The conservative default might not actually be a problem
 		 * if the only case this occurs is when sending an initial
 		 * SYN with options and data to a host we've never talked
 		 * to before.  Then, they will reply with an MSS value which
 		 * will get recorded and the new parameters should get
 		 * recomputed.  For Further Study.
 		 */
 	}
 	mss = min(mss, offer);
 
 	/*
 	 * Sanity check: make sure that maxseg will be large
 	 * enough to allow some data on segments even if the
 	 * all the option space is used (40bytes).  Otherwise
 	 * funny things may happen in tcp_output.
 	 *
 	 * XXXGL: shouldn't we reserve space for IP/IPv6 options?
 	 */
 	mss = max(mss, 64);
 
 	tp->t_maxseg = mss;
 }
 
 void
 tcp_mss(struct tcpcb *tp, int offer)
 {
 	int mss;
 	uint32_t bufsize;
 	struct inpcb *inp = tptoinpcb(tp);
 	struct socket *so;
 	struct hc_metrics_lite metrics;
 	struct tcp_ifcap cap;
 
 	KASSERT(tp != NULL, ("%s: tp == NULL", __func__));
 
 	bzero(&cap, sizeof(cap));
 	tcp_mss_update(tp, offer, -1, &metrics, &cap);
 
 	mss = tp->t_maxseg;
 
 	/*
 	 * If there's a pipesize, change the socket buffer to that size,
 	 * don't change if sb_hiwat is different than default (then it
 	 * has been changed on purpose with setsockopt).
 	 * Make the socket buffers an integral number of mss units;
 	 * if the mss is larger than the socket buffer, decrease the mss.
 	 */
 	so = inp->inp_socket;
 	SOCKBUF_LOCK(&so->so_snd);
 	if ((so->so_snd.sb_hiwat == V_tcp_sendspace) && metrics.rmx_sendpipe)
 		bufsize = metrics.rmx_sendpipe;
 	else
 		bufsize = so->so_snd.sb_hiwat;
 	if (bufsize < mss)
 		mss = bufsize;
 	else {
 		bufsize = roundup(bufsize, mss);
 		if (bufsize > sb_max)
 			bufsize = sb_max;
 		if (bufsize > so->so_snd.sb_hiwat)
 			(void)sbreserve_locked(so, SO_SND, bufsize, NULL);
 	}
 	SOCKBUF_UNLOCK(&so->so_snd);
 	/*
 	 * Sanity check: make sure that maxseg will be large
 	 * enough to allow some data on segments even if the
 	 * all the option space is used (40bytes).  Otherwise
 	 * funny things may happen in tcp_output.
 	 *
 	 * XXXGL: shouldn't we reserve space for IP/IPv6 options?
 	 */
 	tp->t_maxseg = max(mss, 64);
 
 	SOCKBUF_LOCK(&so->so_rcv);
 	if ((so->so_rcv.sb_hiwat == V_tcp_recvspace) && metrics.rmx_recvpipe)
 		bufsize = metrics.rmx_recvpipe;
 	else
 		bufsize = so->so_rcv.sb_hiwat;
 	if (bufsize > mss) {
 		bufsize = roundup(bufsize, mss);
 		if (bufsize > sb_max)
 			bufsize = sb_max;
 		if (bufsize > so->so_rcv.sb_hiwat)
 			(void)sbreserve_locked(so, SO_RCV, bufsize, NULL);
 	}
 	SOCKBUF_UNLOCK(&so->so_rcv);
 
 	/* Check the interface for TSO capabilities. */
 	if (cap.ifcap & CSUM_TSO) {
 		tp->t_flags |= TF_TSO;
 		tp->t_tsomax = cap.tsomax;
 		tp->t_tsomaxsegcount = cap.tsomaxsegcount;
 		tp->t_tsomaxsegsize = cap.tsomaxsegsize;
 	}
 }
 
 /*
  * Determine the MSS option to send on an outgoing SYN.
  */
 int
 tcp_mssopt(struct in_conninfo *inc)
 {
 	int mss = 0;
 	uint32_t thcmtu = 0;
 	uint32_t maxmtu = 0;
 	size_t min_protoh;
 
 	KASSERT(inc != NULL, ("tcp_mssopt with NULL in_conninfo pointer"));
 
 #ifdef INET6
 	if (inc->inc_flags & INC_ISIPV6) {
 		mss = V_tcp_v6mssdflt;
 		maxmtu = tcp_maxmtu6(inc, NULL);
 		min_protoh = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
 	}
 #endif
 #if defined(INET) && defined(INET6)
 	else
 #endif
 #ifdef INET
 	{
 		mss = V_tcp_mssdflt;
 		maxmtu = tcp_maxmtu(inc, NULL);
 		min_protoh = sizeof(struct tcpiphdr);
 	}
 #endif
 #if defined(INET6) || defined(INET)
 	thcmtu = tcp_hc_getmtu(inc); /* IPv4 and IPv6 */
 #endif
 
 	if (maxmtu && thcmtu)
 		mss = min(maxmtu, thcmtu) - min_protoh;
 	else if (maxmtu || thcmtu)
 		mss = max(maxmtu, thcmtu) - min_protoh;
 
 	return (mss);
 }
 
 void
-tcp_do_prr_ack(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to)
+tcp_do_prr_ack(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to, sackstatus_t sack_changed)
 {
 	int snd_cnt = 0, limit = 0, del_data = 0, pipe = 0;
 	int maxseg = tcp_maxseg(tp);
 
 	INP_WLOCK_ASSERT(tptoinpcb(tp));
 
 	/*
 	 * Compute the amount of data that this ACK is indicating
 	 * (del_data) and an estimate of how many bytes are in the
 	 * network.
 	 */
 	if (tcp_is_sack_recovery(tp, to) ||
 	    (IN_CONGRECOVERY(tp->t_flags) &&
 	     !IN_FASTRECOVERY(tp->t_flags))) {
 		del_data = tp->sackhint.delivered_data;
 		if (V_tcp_do_newsack)
 			pipe = tcp_compute_pipe(tp);
 		else
 			pipe = (tp->snd_nxt - tp->snd_fack) +
 				tp->sackhint.sack_bytes_rexmit;
 	} else {
 		if (tp->sackhint.prr_delivered < (tcprexmtthresh * maxseg +
 					     tp->snd_recover - tp->snd_una))
 			del_data = maxseg;
 		pipe = imax(0, tp->snd_max - tp->snd_una -
 			    imin(INT_MAX / 65536, tp->t_dupacks) * maxseg);
 	}
 	tp->sackhint.prr_delivered += del_data;
 	/*
 	 * Proportional Rate Reduction
 	 */
 	if (pipe >= tp->snd_ssthresh) {
 		if (tp->sackhint.recover_fs == 0)
 			tp->sackhint.recover_fs =
 			    imax(1, tp->snd_nxt - tp->snd_una);
 		snd_cnt = howmany((long)tp->sackhint.prr_delivered *
 			    tp->snd_ssthresh, tp->sackhint.recover_fs) -
 			    tp->sackhint.prr_out;
 	} else {
-		if (V_tcp_do_prr_conservative || (del_data == 0))
+		/*
+		 * PRR 6937bis heuristic:
+		 * - A partial ack without SACK block beneath snd_recover
+		 * indicates further loss.
+		 * - An SACK scoreboard update adding a new hole indicates
+		 * further loss, so be conservative and send at most one
+		 * segment.
+		 * - Prevent ACK splitting attacks, by being conservative
+		 * when no new data is acked.
+		 */
+		if ((sack_changed == SACK_NEWLOSS) || (del_data == 0))
 			limit = tp->sackhint.prr_delivered -
 				tp->sackhint.prr_out;
 		else
 			limit = imax(tp->sackhint.prr_delivered -
 				    tp->sackhint.prr_out, del_data) +
 				    maxseg;
 		snd_cnt = imin((tp->snd_ssthresh - pipe), limit);
 	}
 	snd_cnt = imax(snd_cnt, 0) / maxseg;
 	/*
 	 * Send snd_cnt new data into the network in response to this ack.
 	 * If there is going to be a SACK retransmission, adjust snd_cwnd
 	 * accordingly.
 	 */
 	if (IN_FASTRECOVERY(tp->t_flags)) {
 		if (tcp_is_sack_recovery(tp, to)) {
 			tp->snd_cwnd = tp->snd_nxt - tp->snd_recover +
 					    tp->sackhint.sack_bytes_rexmit +
 					    (snd_cnt * maxseg);
 		} else {
 			tp->snd_cwnd = (tp->snd_max - tp->snd_una) +
 					    (snd_cnt * maxseg);
 		}
 	} else if (IN_CONGRECOVERY(tp->t_flags))
 		tp->snd_cwnd = pipe - del_data + (snd_cnt * maxseg);
 	tp->snd_cwnd = imax(maxseg, tp->snd_cwnd);
 }
 
 /*
  * On a partial ack arrives, force the retransmission of the
  * next unacknowledged segment.  Do not clear tp->t_dupacks.
  * By setting snd_nxt to ti_ack, this forces retransmission timer to
  * be started again.
  */
 void
 tcp_newreno_partial_ack(struct tcpcb *tp, struct tcphdr *th)
 {
 	tcp_seq onxt = tp->snd_nxt;
 	uint32_t ocwnd = tp->snd_cwnd;
 	u_int maxseg = tcp_maxseg(tp);
 
 	INP_WLOCK_ASSERT(tptoinpcb(tp));
 
 	tcp_timer_activate(tp, TT_REXMT, 0);
 	tp->t_rtttime = 0;
 	tp->snd_nxt = th->th_ack;
 	/*
 	 * Set snd_cwnd to one segment beyond acknowledged offset.
 	 * (tp->snd_una has not yet been updated when this function is called.)
 	 */
 	tp->snd_cwnd = maxseg + BYTES_THIS_ACK(tp, th);
 	tp->t_flags |= TF_ACKNOW;
 	(void) tcp_output(tp);
 	tp->snd_cwnd = ocwnd;
 	if (SEQ_GT(onxt, tp->snd_nxt))
 		tp->snd_nxt = onxt;
 	/*
 	 * Partial window deflation.  Relies on fact that tp->snd_una
 	 * not updated yet.
 	 */
 	if (tp->snd_cwnd > BYTES_THIS_ACK(tp, th))
 		tp->snd_cwnd -= BYTES_THIS_ACK(tp, th);
 	else
 		tp->snd_cwnd = 0;
 	tp->snd_cwnd += maxseg;
 }
 
 int
 tcp_compute_pipe(struct tcpcb *tp)
 {
 	if (tp->t_fb->tfb_compute_pipe == NULL) {
 		return (tp->snd_max - tp->snd_una +
 			tp->sackhint.sack_bytes_rexmit -
 			tp->sackhint.sacked_bytes -
 			tp->sackhint.lost_bytes);
 	} else {
 		return((*tp->t_fb->tfb_compute_pipe)(tp));
 	}
 }
 
 uint32_t
 tcp_compute_initwnd(uint32_t maxseg)
 {
 	/*
 	 * Calculate the Initial Window, also used as Restart Window
 	 *
 	 * RFC5681 Section 3.1 specifies the default conservative values.
 	 * RFC3390 specifies slightly more aggressive values.
 	 * RFC6928 increases it to ten segments.
 	 * Support for user specified value for initial flight size.
 	 */
 	if (V_tcp_initcwnd_segments)
 		return min(V_tcp_initcwnd_segments * maxseg,
 		    max(2 * maxseg, V_tcp_initcwnd_segments * 1460));
 	else if (V_tcp_do_rfc3390)
 		return min(4 * maxseg, max(2 * maxseg, 4380));
 	else {
 		/* Per RFC5681 Section 3.1 */
 		if (maxseg > 2190)
 			return (2 * maxseg);
 		else if (maxseg > 1095)
 			return (3 * maxseg);
 		else
 			return (4 * maxseg);
 	}
 }
diff --git a/sys/netinet/tcp_sack.c b/sys/netinet/tcp_sack.c
index 589b0c424acb..891053c872dd 100644
--- a/sys/netinet/tcp_sack.c
+++ b/sys/netinet/tcp_sack.c
@@ -1,1135 +1,1143 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
  *	The Regents of the University of California.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)tcp_sack.c	8.12 (Berkeley) 5/24/95
  */
 
 /*-
  *	@@(#)COPYRIGHT	1.1 (NRL) 17 January 1995
  *
  * NRL grants permission for redistribution and use in source and binary
  * forms, with or without modification, of the software and documentation
  * created at NRL provided that the following conditions are met:
  *
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgements:
  *	This product includes software developed by the University of
  *	California, Berkeley and its contributors.
  *	This product includes software developed at the Information
  *	Technology Division, US Naval Research Laboratory.
  * 4. Neither the name of the NRL nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
  * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
  * PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL NRL OR
  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
  * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  * The views and conclusions contained in the software and documentation
  * are those of the authors and should not be interpreted as representing
  * official policies, either expressed or implied, of the US Naval
  * Research Laboratory (NRL).
  */
 
 #include <sys/cdefs.h>
 #include "opt_inet.h"
 #include "opt_inet6.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/sysctl.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/proc.h>		/* for proc0 declaration */
 #include <sys/protosw.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/syslog.h>
 #include <sys/systm.h>
 
 #include <machine/cpu.h>	/* before tcp_seq.h, for tcp_random18() */
 
 #include <vm/uma.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/route.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_systm.h>
 #include <netinet/ip.h>
 #include <netinet/in_var.h>
 #include <netinet/in_pcb.h>
 #include <netinet/ip_var.h>
 #include <netinet/ip6.h>
 #include <netinet/icmp6.h>
 #include <netinet6/nd6.h>
 #include <netinet6/ip6_var.h>
 #include <netinet6/in6_pcb.h>
 #include <netinet/tcp.h>
 #include <netinet/tcp_fsm.h>
 #include <netinet/tcp_seq.h>
 #include <netinet/tcp_timer.h>
 #include <netinet/tcp_var.h>
 #include <netinet/tcpip.h>
 #include <netinet/cc/cc.h>
 
 #include <machine/in_cksum.h>
 
 VNET_DECLARE(struct uma_zone *, sack_hole_zone);
 #define	V_sack_hole_zone		VNET(sack_hole_zone)
 
 SYSCTL_NODE(_net_inet_tcp, OID_AUTO, sack, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "TCP SACK");
 
 VNET_DEFINE(int, tcp_do_sack) = 1;
 SYSCTL_INT(_net_inet_tcp_sack, OID_AUTO, enable, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(tcp_do_sack), 0,
     "Enable/Disable TCP SACK support");
 
 VNET_DEFINE(int, tcp_do_newsack) = 1;
 SYSCTL_INT(_net_inet_tcp_sack, OID_AUTO, revised, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(tcp_do_newsack), 0,
     "Use revised SACK loss recovery per RFC 6675");
 
 VNET_DEFINE(int, tcp_sack_maxholes) = 128;
 SYSCTL_INT(_net_inet_tcp_sack, OID_AUTO, maxholes, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(tcp_sack_maxholes), 0,
     "Maximum number of TCP SACK holes allowed per connection");
 
 VNET_DEFINE(int, tcp_sack_globalmaxholes) = 65536;
 SYSCTL_INT(_net_inet_tcp_sack, OID_AUTO, globalmaxholes, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(tcp_sack_globalmaxholes), 0,
     "Global maximum number of TCP SACK holes");
 
 VNET_DEFINE(int, tcp_sack_globalholes) = 0;
 SYSCTL_INT(_net_inet_tcp_sack, OID_AUTO, globalholes, CTLFLAG_VNET | CTLFLAG_RD,
     &VNET_NAME(tcp_sack_globalholes), 0,
     "Global number of TCP SACK holes currently allocated");
 
 int
 tcp_dsack_block_exists(struct tcpcb *tp)
 {
 	/* Return true if a DSACK block exists */
 	if (tp->rcv_numsacks == 0)
 		return (0);
 	if (SEQ_LEQ(tp->sackblks[0].end, tp->rcv_nxt))
 		return(1);
 	return (0);
 }
 
 /*
  * This function will find overlaps with the currently stored sackblocks
  * and add any overlap as a dsack block upfront
  */
 void
 tcp_update_dsack_list(struct tcpcb *tp, tcp_seq rcv_start, tcp_seq rcv_end)
 {
 	struct sackblk head_blk,mid_blk,saved_blks[MAX_SACK_BLKS];
 	int i, j, n, identical;
 	tcp_seq start, end;
 
 	INP_WLOCK_ASSERT(tptoinpcb(tp));
 
 	KASSERT(SEQ_LT(rcv_start, rcv_end), ("rcv_start < rcv_end"));
 
 	if (SEQ_LT(rcv_end, tp->rcv_nxt) ||
 	    ((rcv_end == tp->rcv_nxt) &&
 	     (tp->rcv_numsacks > 0 ) &&
 	     (tp->sackblks[0].end == tp->rcv_nxt))) {
 		saved_blks[0].start = rcv_start;
 		saved_blks[0].end = rcv_end;
 	} else {
 		saved_blks[0].start = saved_blks[0].end = 0;
 	}
 
 	head_blk.start = head_blk.end = 0;
 	mid_blk.start = rcv_start;
 	mid_blk.end = rcv_end;
 	identical = 0;
 
 	for (i = 0; i < tp->rcv_numsacks; i++) {
 		start = tp->sackblks[i].start;
 		end = tp->sackblks[i].end;
 		if (SEQ_LT(rcv_end, start)) {
 			/* pkt left to sack blk */
 			continue;
 		}
 		if (SEQ_GT(rcv_start, end)) {
 			/* pkt right to sack blk */
 			continue;
 		}
 		if (SEQ_GT(tp->rcv_nxt, end)) {
 			if ((SEQ_MAX(rcv_start, start) != SEQ_MIN(rcv_end, end)) &&
 			    (SEQ_GT(head_blk.start, SEQ_MAX(rcv_start, start)) ||
 			    (head_blk.start == head_blk.end))) {
 				head_blk.start = SEQ_MAX(rcv_start, start);
 				head_blk.end = SEQ_MIN(rcv_end, end);
 			}
 			continue;
 		}
 		if (((head_blk.start == head_blk.end) ||
 		     SEQ_LT(start, head_blk.start)) &&
 		     (SEQ_GT(end, rcv_start) &&
 		      SEQ_LEQ(start, rcv_end))) {
 			head_blk.start = start;
 			head_blk.end = end;
 		}
 		mid_blk.start = SEQ_MIN(mid_blk.start, start);
 		mid_blk.end = SEQ_MAX(mid_blk.end, end);
 		if ((mid_blk.start == start) &&
 		    (mid_blk.end == end))
 			identical = 1;
 	}
 	if (SEQ_LT(head_blk.start, head_blk.end)) {
 		/* store overlapping range */
 		saved_blks[0].start = SEQ_MAX(rcv_start, head_blk.start);
 		saved_blks[0].end   = SEQ_MIN(rcv_end, head_blk.end);
 	}
 	n = 1;
 	/*
 	 * Second, if not ACKed, store the SACK block that
 	 * overlaps with the DSACK block unless it is identical
 	 */
 	if ((SEQ_LT(tp->rcv_nxt, mid_blk.end) &&
 	    !((mid_blk.start == saved_blks[0].start) &&
 	    (mid_blk.end == saved_blks[0].end))) ||
 	    identical == 1) {
 		saved_blks[n].start = mid_blk.start;
 		saved_blks[n++].end = mid_blk.end;
 	}
 	for (j = 0; (j < tp->rcv_numsacks) && (n < MAX_SACK_BLKS); j++) {
 		if (((SEQ_LT(tp->sackblks[j].end, mid_blk.start) ||
 		      SEQ_GT(tp->sackblks[j].start, mid_blk.end)) &&
 		    (SEQ_GT(tp->sackblks[j].start, tp->rcv_nxt))))
 		saved_blks[n++] = tp->sackblks[j];
 	}
 	j = 0;
 	for (i = 0; i < n; i++) {
 		/* we can end up with a stale initial entry */
 		if (SEQ_LT(saved_blks[i].start, saved_blks[i].end)) {
 			tp->sackblks[j++] = saved_blks[i];
 		}
 	}
 	tp->rcv_numsacks = j;
 }
 
 /*
  * This function is called upon receipt of new valid data (while not in
  * header prediction mode), and it updates the ordered list of sacks.
  */
 void
 tcp_update_sack_list(struct tcpcb *tp, tcp_seq rcv_start, tcp_seq rcv_end)
 {
 	/*
 	 * First reported block MUST be the most recent one.  Subsequent
 	 * blocks SHOULD be in the order in which they arrived at the
 	 * receiver.  These two conditions make the implementation fully
 	 * compliant with RFC 2018.
 	 */
 	struct sackblk head_blk, saved_blks[MAX_SACK_BLKS];
 	int num_head, num_saved, i;
 
 	INP_WLOCK_ASSERT(tptoinpcb(tp));
 
 	/* Check arguments. */
 	KASSERT(SEQ_LEQ(rcv_start, rcv_end), ("rcv_start <= rcv_end"));
 
 	if ((rcv_start == rcv_end) &&
 	    (tp->rcv_numsacks >= 1) &&
 	    (rcv_end == tp->sackblks[0].end)) {
 		/* retaining DSACK block below rcv_nxt (todrop) */
 		head_blk = tp->sackblks[0];
 	} else {
 		/* SACK block for the received segment. */
 		head_blk.start = rcv_start;
 		head_blk.end = rcv_end;
 	}
 
 	/*
 	 * Merge updated SACK blocks into head_blk, and save unchanged SACK
 	 * blocks into saved_blks[].  num_saved will have the number of the
 	 * saved SACK blocks.
 	 */
 	num_saved = 0;
 	for (i = 0; i < tp->rcv_numsacks; i++) {
 		tcp_seq start = tp->sackblks[i].start;
 		tcp_seq end = tp->sackblks[i].end;
 		if (SEQ_GEQ(start, end) || SEQ_LEQ(start, tp->rcv_nxt)) {
 			/*
 			 * Discard this SACK block.
 			 */
 		} else if (SEQ_LEQ(head_blk.start, end) &&
 			   SEQ_GEQ(head_blk.end, start)) {
 			/*
 			 * Merge this SACK block into head_blk.  This SACK
 			 * block itself will be discarded.
 			 */
 			/*
 			 * |-|
 			 *   |---|  merge
 			 *
 			 *     |-|
 			 * |---|    merge
 			 *
 			 * |-----|
 			 *   |-|    DSACK smaller
 			 *
 			 *   |-|
 			 * |-----|  DSACK smaller
 			 */
 			if (head_blk.start == end)
 				head_blk.start = start;
 			else if (head_blk.end == start)
 				head_blk.end = end;
 			else {
 				if (SEQ_LT(head_blk.start, start)) {
 					tcp_seq temp = start;
 					start = head_blk.start;
 					head_blk.start = temp;
 				}
 				if (SEQ_GT(head_blk.end, end)) {
 					tcp_seq temp = end;
 					end = head_blk.end;
 					head_blk.end = temp;
 				}
 				if ((head_blk.start != start) ||
 				    (head_blk.end != end)) {
 					if ((num_saved >= 1) &&
 					   SEQ_GEQ(saved_blks[num_saved-1].start, start) &&
 					   SEQ_LEQ(saved_blks[num_saved-1].end, end))
 						num_saved--;
 					saved_blks[num_saved].start = start;
 					saved_blks[num_saved].end = end;
 					num_saved++;
 				}
 			}
 		} else {
 			/*
 			 * This block supercedes the prior block
 			 */
 			if ((num_saved >= 1) &&
 			   SEQ_GEQ(saved_blks[num_saved-1].start, start) &&
 			   SEQ_LEQ(saved_blks[num_saved-1].end, end))
 				num_saved--;
 			/*
 			 * Save this SACK block.
 			 */
 			saved_blks[num_saved].start = start;
 			saved_blks[num_saved].end = end;
 			num_saved++;
 		}
 	}
 
 	/*
 	 * Update SACK list in tp->sackblks[].
 	 */
 	num_head = 0;
 	if (SEQ_LT(rcv_start, rcv_end)) {
 		/*
 		 * The received data segment is an out-of-order segment.  Put
 		 * head_blk at the top of SACK list.
 		 */
 		tp->sackblks[0] = head_blk;
 		num_head = 1;
 		/*
 		 * If the number of saved SACK blocks exceeds its limit,
 		 * discard the last SACK block.
 		 */
 		if (num_saved >= MAX_SACK_BLKS)
 			num_saved--;
 	}
 	if ((rcv_start == rcv_end) &&
 	    (rcv_start == tp->sackblks[0].end)) {
 		num_head = 1;
 	}
 	if (num_saved > 0) {
 		/*
 		 * Copy the saved SACK blocks back.
 		 */
 		bcopy(saved_blks, &tp->sackblks[num_head],
 		      sizeof(struct sackblk) * num_saved);
 	}
 
 	/* Save the number of SACK blocks. */
 	tp->rcv_numsacks = num_head + num_saved;
 }
 
 void
 tcp_clean_dsack_blocks(struct tcpcb *tp)
 {
 	struct sackblk saved_blks[MAX_SACK_BLKS];
 	int num_saved, i;
 
 	INP_WLOCK_ASSERT(tptoinpcb(tp));
 	/*
 	 * Clean up any DSACK blocks that
 	 * are in our queue of sack blocks.
 	 *
 	 */
 	num_saved = 0;
 	for (i = 0; i < tp->rcv_numsacks; i++) {
 		tcp_seq start = tp->sackblks[i].start;
 		tcp_seq end = tp->sackblks[i].end;
 		if (SEQ_GEQ(start, end) || SEQ_LEQ(start, tp->rcv_nxt)) {
 			/*
 			 * Discard this D-SACK block.
 			 */
 			continue;
 		}
 		/*
 		 * Save this SACK block.
 		 */
 		saved_blks[num_saved].start = start;
 		saved_blks[num_saved].end = end;
 		num_saved++;
 	}
 	if (num_saved > 0) {
 		/*
 		 * Copy the saved SACK blocks back.
 		 */
 		bcopy(saved_blks, &tp->sackblks[0],
 		      sizeof(struct sackblk) * num_saved);
 	}
 	tp->rcv_numsacks = num_saved;
 }
 
 /*
  * Delete all receiver-side SACK information.
  */
 void
 tcp_clean_sackreport(struct tcpcb *tp)
 {
 	int i;
 
 	INP_WLOCK_ASSERT(tptoinpcb(tp));
 	tp->rcv_numsacks = 0;
 	for (i = 0; i < MAX_SACK_BLKS; i++)
 		tp->sackblks[i].start = tp->sackblks[i].end=0;
 }
 
 /*
  * Allocate struct sackhole.
  */
 static struct sackhole *
 tcp_sackhole_alloc(struct tcpcb *tp, tcp_seq start, tcp_seq end)
 {
 	struct sackhole *hole;
 
 	if (tp->snd_numholes >= V_tcp_sack_maxholes ||
 	    V_tcp_sack_globalholes >= V_tcp_sack_globalmaxholes) {
 		TCPSTAT_INC(tcps_sack_sboverflow);
 		return NULL;
 	}
 
 	hole = (struct sackhole *)uma_zalloc(V_sack_hole_zone, M_NOWAIT);
 	if (hole == NULL)
 		return NULL;
 
 	hole->start = start;
 	hole->end = end;
 	hole->rxmit = start;
 
 	tp->snd_numholes++;
 	atomic_add_int(&V_tcp_sack_globalholes, 1);
 
 	return hole;
 }
 
 /*
  * Free struct sackhole.
  */
 static void
 tcp_sackhole_free(struct tcpcb *tp, struct sackhole *hole)
 {
 
 	uma_zfree(V_sack_hole_zone, hole);
 
 	tp->snd_numholes--;
 	atomic_subtract_int(&V_tcp_sack_globalholes, 1);
 
 	KASSERT(tp->snd_numholes >= 0, ("tp->snd_numholes >= 0"));
 	KASSERT(V_tcp_sack_globalholes >= 0, ("tcp_sack_globalholes >= 0"));
 }
 
 /*
  * Insert new SACK hole into scoreboard.
  */
 static struct sackhole *
 tcp_sackhole_insert(struct tcpcb *tp, tcp_seq start, tcp_seq end,
     struct sackhole *after)
 {
 	struct sackhole *hole;
 
 	/* Allocate a new SACK hole. */
 	hole = tcp_sackhole_alloc(tp, start, end);
 	if (hole == NULL)
 		return NULL;
 
 	/* Insert the new SACK hole into scoreboard. */
 	if (after != NULL)
 		TAILQ_INSERT_AFTER(&tp->snd_holes, after, hole, scblink);
 	else
 		TAILQ_INSERT_TAIL(&tp->snd_holes, hole, scblink);
 
 	/* Update SACK hint. */
 	if (tp->sackhint.nexthole == NULL)
 		tp->sackhint.nexthole = hole;
 
 	return hole;
 }
 
 /*
  * Remove SACK hole from scoreboard.
  */
 static void
 tcp_sackhole_remove(struct tcpcb *tp, struct sackhole *hole)
 {
 
 	/* Update SACK hint. */
 	if (tp->sackhint.nexthole == hole)
 		tp->sackhint.nexthole = TAILQ_NEXT(hole, scblink);
 
 	/* Remove this SACK hole. */
 	TAILQ_REMOVE(&tp->snd_holes, hole, scblink);
 
 	/* Free this SACK hole. */
 	tcp_sackhole_free(tp, hole);
 }
 
 /*
  * Process cumulative ACK and the TCP SACK option to update the scoreboard.
  * tp->snd_holes is an ordered list of holes (oldest to newest, in terms of
  * the sequence space).
- * Returns 1 if incoming ACK has previously unknown SACK information,
- * 0 otherwise.
+ * Returns SACK_NEWLOSS if incoming ACK indicates ongoing loss (hole split, new hole),
+ * SACK_CHANGE if incoming ACK has previously unknown SACK information,
+ * SACK_NOCHANGE otherwise.
  */
-int
+sackstatus_t
 tcp_sack_doack(struct tcpcb *tp, struct tcpopt *to, tcp_seq th_ack)
 {
 	struct sackhole *cur, *temp;
 	struct sackblk sack, sack_blocks[TCP_MAX_SACK + 1], *sblkp;
-	int i, j, num_sack_blks, sack_changed;
+	int i, j, num_sack_blks;
+	sackstatus_t sack_changed;
 	int delivered_data, left_edge_delta;
 
 	tcp_seq loss_hiack = 0;
 	int loss_thresh = 0;
 	int loss_sblks = 0;
 	int notlost_bytes = 0;
 
 	INP_WLOCK_ASSERT(tptoinpcb(tp));
 
 	num_sack_blks = 0;
-	sack_changed = 0;
+	sack_changed = SACK_NOCHANGE;
 	delivered_data = 0;
 	left_edge_delta = 0;
 	/*
 	 * If SND.UNA will be advanced by SEG.ACK, and if SACK holes exist,
 	 * treat [SND.UNA, SEG.ACK) as if it is a SACK block.
 	 * Account changes to SND.UNA always in delivered data.
 	 */
 	if (SEQ_LT(tp->snd_una, th_ack) && !TAILQ_EMPTY(&tp->snd_holes)) {
 		left_edge_delta = th_ack - tp->snd_una;
 		sack_blocks[num_sack_blks].start = tp->snd_una;
 		sack_blocks[num_sack_blks++].end = th_ack;
 		/*
 		 * Pulling snd_fack forward if we got here
 		 * due to DSACK blocks
 		 */
 		if (SEQ_LT(tp->snd_fack, th_ack)) {
 			delivered_data += th_ack - tp->snd_una;
 			tp->snd_fack = th_ack;
-			sack_changed = 1;
+			sack_changed = SACK_CHANGE;
 		}
 	}
 	/*
 	 * Append received valid SACK blocks to sack_blocks[], but only if we
 	 * received new blocks from the other side.
 	 */
 	if (to->to_flags & TOF_SACK) {
 		for (i = 0; i < to->to_nsacks; i++) {
 			bcopy((to->to_sacks + i * TCPOLEN_SACK),
 			    &sack, sizeof(sack));
 			sack.start = ntohl(sack.start);
 			sack.end = ntohl(sack.end);
 			if (SEQ_GT(sack.end, sack.start) &&
 			    SEQ_GT(sack.start, tp->snd_una) &&
 			    SEQ_GT(sack.start, th_ack) &&
 			    SEQ_LT(sack.start, tp->snd_max) &&
 			    SEQ_GT(sack.end, tp->snd_una) &&
 			    SEQ_LEQ(sack.end, tp->snd_max)) {
 				sack_blocks[num_sack_blks++] = sack;
 			} else if (SEQ_LEQ(sack.start, th_ack) &&
 			    SEQ_LEQ(sack.end, th_ack)) {
 				/*
 				 * Its a D-SACK block.
 				 */
 				tcp_record_dsack(tp, sack.start, sack.end, 0);
 			}
 		}
 	}
 	/*
 	 * Return if SND.UNA is not advanced and no valid SACK block is
 	 * received.
 	 */
 	if (num_sack_blks == 0)
 		return (sack_changed);
 
 	/*
 	 * Sort the SACK blocks so we can update the scoreboard with just one
 	 * pass. The overhead of sorting up to 4+1 elements is less than
 	 * making up to 4+1 passes over the scoreboard.
 	 */
 	for (i = 0; i < num_sack_blks; i++) {
 		for (j = i + 1; j < num_sack_blks; j++) {
 			if (SEQ_GT(sack_blocks[i].end, sack_blocks[j].end)) {
 				sack = sack_blocks[i];
 				sack_blocks[i] = sack_blocks[j];
 				sack_blocks[j] = sack;
 			}
 		}
 	}
 	if (TAILQ_EMPTY(&tp->snd_holes)) {
 		/*
 		 * Empty scoreboard. Need to initialize snd_fack (it may be
 		 * uninitialized or have a bogus value). Scoreboard holes
 		 * (from the sack blocks received) are created later below
 		 * (in the logic that adds holes to the tail of the
 		 * scoreboard).
 		 */
 		tp->snd_fack = SEQ_MAX(tp->snd_una, th_ack);
 		tp->sackhint.sacked_bytes = 0;	/* reset */
 		tp->sackhint.hole_bytes = 0;
 	}
 	/*
 	 * In the while-loop below, incoming SACK blocks (sack_blocks[]) and
 	 * SACK holes (snd_holes) are traversed from their tails with just
 	 * one pass in order to reduce the number of compares especially when
 	 * the bandwidth-delay product is large.
 	 *
 	 * Note: Typically, in the first RTT of SACK recovery, the highest
 	 * three or four SACK blocks with the same ack number are received.
 	 * In the second RTT, if retransmitted data segments are not lost,
 	 * the highest three or four SACK blocks with ack number advancing
 	 * are received.
 	 */
 	sblkp = &sack_blocks[num_sack_blks - 1];	/* Last SACK block */
 	tp->sackhint.last_sack_ack = sblkp->end;
 	if (SEQ_LT(tp->snd_fack, sblkp->start)) {
 		/*
 		 * The highest SACK block is beyond fack.  First,
 		 * check if there was a successful Rescue Retransmission,
 		 * and move this hole left. With normal holes, snd_fack
 		 * is always to the right of the end.
 		 */
 		if (((temp = TAILQ_LAST(&tp->snd_holes, sackhole_head)) != NULL) &&
 		    SEQ_LEQ(tp->snd_fack,temp->end)) {
 			tp->sackhint.hole_bytes -= temp->end - temp->start;
 			temp->start = SEQ_MAX(tp->snd_fack, SEQ_MAX(tp->snd_una, th_ack));
 			temp->end = sblkp->start;
 			temp->rxmit = temp->start;
 			delivered_data += sblkp->end - sblkp->start;
 			tp->sackhint.hole_bytes += temp->end - temp->start;
 			KASSERT(tp->sackhint.hole_bytes >= 0,
 			    ("sackhint hole bytes >= 0"));
 			tp->snd_fack = sblkp->end;
-			sack_changed = 1;
 			sblkp--;
+			sack_changed = SACK_NEWLOSS;
 		} else {
 			/*
 			 * Append a new SACK hole at the tail.  If the
 			 * second or later highest SACK blocks are also
 			 * beyond the current fack, they will be inserted
 			 * by way of hole splitting in the while-loop below.
 			 */
 			temp = tcp_sackhole_insert(tp, tp->snd_fack,sblkp->start,NULL);
 			if (temp != NULL) {
 				delivered_data += sblkp->end - sblkp->start;
 				tp->sackhint.hole_bytes += temp->end - temp->start;
 				tp->snd_fack = sblkp->end;
-				sack_changed = 1;
 				/* Go to the previous sack block. */
 				sblkp--;
+				sack_changed = SACK_CHANGE;
 			} else {
 				/*
 				 * We failed to add a new hole based on the current
 				 * sack block.  Skip over all the sack blocks that
 				 * fall completely to the right of snd_fack and
 				 * proceed to trim the scoreboard based on the
 				 * remaining sack blocks.  This also trims the
 				 * scoreboard for th_ack (which is sack_blocks[0]).
 				 */
 				while (sblkp >= sack_blocks &&
 				       SEQ_LT(tp->snd_fack, sblkp->start))
 					sblkp--;
 				if (sblkp >= sack_blocks &&
 				    SEQ_LT(tp->snd_fack, sblkp->end)) {
 					delivered_data += sblkp->end - tp->snd_fack;
 					tp->snd_fack = sblkp->end;
-					sack_changed = 1;
+					/*
+					 * While the Scoreboard didn't change in
+					 * size, we only ended up here because
+					 * some SACK data had to be dismissed.
+					 */
+					sack_changed = SACK_NEWLOSS;
 				}
 			}
 		}
 	} else if (SEQ_LT(tp->snd_fack, sblkp->end)) {
 		/* fack is advanced. */
 		delivered_data += sblkp->end - tp->snd_fack;
 		tp->snd_fack = sblkp->end;
-		sack_changed = 1;
+		sack_changed = SACK_CHANGE;
 	}
 	cur = TAILQ_LAST(&tp->snd_holes, sackhole_head); /* Last SACK hole. */
 	loss_hiack = tp->snd_fack;
 
 	/*
 	 * Since the incoming sack blocks are sorted, we can process them
 	 * making one sweep of the scoreboard.
 	 */
 	while (cur != NULL) {
 		if (!(sblkp >= sack_blocks)) {
 			if (((loss_sblks >= tcprexmtthresh) ||
 			    (loss_thresh > (tcprexmtthresh-1)*tp->t_maxseg))) 
 				break;
 			loss_thresh += loss_hiack - cur->end;
 			loss_hiack = cur->start;
 			loss_sblks++;
 			if (!((loss_sblks >= tcprexmtthresh) ||
 			    (loss_thresh > (tcprexmtthresh-1)*tp->t_maxseg))) {
 				notlost_bytes += cur->end - cur->start;
 			} else {
 				break;
 			}
 			cur = TAILQ_PREV(cur, sackhole_head, scblink);
 			continue;
 		}
 		if (SEQ_GEQ(sblkp->start, cur->end)) {
 			/*
 			 * SACKs data beyond the current hole.  Go to the
 			 * previous sack block.
 			 */
 			sblkp--;
 			continue;
 		}
 		if (SEQ_LEQ(sblkp->end, cur->start)) {
 			/*
 			 * SACKs data before the current hole.  Go to the
 			 * previous hole.
 			 */
 			loss_thresh += loss_hiack - cur->end;
 			loss_hiack = cur->start;
 			loss_sblks++;
 			if (!((loss_sblks >= tcprexmtthresh) ||
 			    (loss_thresh > (tcprexmtthresh-1)*tp->t_maxseg)))
 				notlost_bytes += cur->end - cur->start;
 			cur = TAILQ_PREV(cur, sackhole_head, scblink);
 			continue;
 		}
 		tp->sackhint.sack_bytes_rexmit -=
 		    (SEQ_MIN(cur->rxmit, cur->end) - cur->start);
 		KASSERT(tp->sackhint.sack_bytes_rexmit >= 0,
 		    ("sackhint bytes rtx >= 0"));
-		sack_changed = 1;
+		sack_changed = SACK_CHANGE;
 		if (SEQ_LEQ(sblkp->start, cur->start)) {
 			/* Data acks at least the beginning of hole. */
 			if (SEQ_GEQ(sblkp->end, cur->end)) {
 				/* Acks entire hole, so delete hole. */
 				delivered_data += (cur->end - cur->start);
 				temp = cur;
 				cur = TAILQ_PREV(cur, sackhole_head, scblink);
 				tp->sackhint.hole_bytes -= temp->end - temp->start;
 				tcp_sackhole_remove(tp, temp);
 				/*
 				 * The sack block may ack all or part of the
 				 * next hole too, so continue onto the next
 				 * hole.
 				 */
 				continue;
 			} else {
 				/* Move start of hole forward. */
 				delivered_data += (sblkp->end - cur->start);
 				tp->sackhint.hole_bytes -= sblkp->end - cur->start;
 				cur->start = sblkp->end;
 				cur->rxmit = SEQ_MAX(cur->rxmit, cur->start);
 			}
 		} else {
 			/* Data acks at least the end of hole. */
 			if (SEQ_GEQ(sblkp->end, cur->end)) {
 				/* Move end of hole backward. */
 				delivered_data += (cur->end - sblkp->start);
 				tp->sackhint.hole_bytes -= cur->end - sblkp->start;
 				cur->end = sblkp->start;
 				cur->rxmit = SEQ_MIN(cur->rxmit, cur->end);
 				if ((tp->t_flags & TF_LRD) && SEQ_GEQ(cur->rxmit, cur->end))
 					cur->rxmit = tp->snd_recover;
 			} else {
 				/*
 				 * ACKs some data in middle of a hole; need
 				 * to split current hole
 				 */
 				temp = tcp_sackhole_insert(tp, sblkp->end,
 				    cur->end, cur);
+				sack_changed = SACK_NEWLOSS;
 				if (temp != NULL) {
 					if (SEQ_GT(cur->rxmit, temp->rxmit)) {
 						temp->rxmit = cur->rxmit;
 						tp->sackhint.sack_bytes_rexmit +=
 						    (SEQ_MIN(temp->rxmit,
 						    temp->end) - temp->start);
 					}
 					tp->sackhint.hole_bytes -= sblkp->end - sblkp->start;
 					loss_thresh += loss_hiack - temp->end;
 					loss_hiack = temp->start;
 					loss_sblks++;
 					if (!((loss_sblks >= tcprexmtthresh) ||
 					    (loss_thresh > (tcprexmtthresh-1)*tp->t_maxseg)))
 						notlost_bytes += temp->end - temp->start;
 					cur->end = sblkp->start;
 					cur->rxmit = SEQ_MIN(cur->rxmit,
 					    cur->end);
 					if ((tp->t_flags & TF_LRD) && SEQ_GEQ(cur->rxmit, cur->end))
 						cur->rxmit = tp->snd_recover;
 					delivered_data += (sblkp->end - sblkp->start);
 				}
 			}
 		}
 		tp->sackhint.sack_bytes_rexmit +=
 		    (SEQ_MIN(cur->rxmit, cur->end) - cur->start);
 		/*
 		 * Testing sblkp->start against cur->start tells us whether
 		 * we're done with the sack block or the sack hole.
 		 * Accordingly, we advance one or the other.
 		 */
 		if (SEQ_LEQ(sblkp->start, cur->start)) {
 			loss_thresh += loss_hiack - cur->end;
 			loss_hiack = cur->start;
 			loss_sblks++;
 			if (!((loss_sblks >= tcprexmtthresh) ||
 			    (loss_thresh > (tcprexmtthresh-1)*tp->t_maxseg)))
 				notlost_bytes += cur->end - cur->start;
 			cur = TAILQ_PREV(cur, sackhole_head, scblink);
 		} else {
 			sblkp--;
 		}
 	}
 
 	KASSERT(!(TAILQ_EMPTY(&tp->snd_holes) && (tp->sackhint.hole_bytes != 0)),
 	    ("SACK scoreboard empty, but accounting non-zero\n"));
 
 	KASSERT(notlost_bytes <= tp->sackhint.hole_bytes,
 	    ("SACK: more bytes marked notlost than in scoreboard holes"));
 
 	if (!(to->to_flags & TOF_SACK))
 		/*
 		 * If this ACK did not contain any
 		 * SACK blocks, any only moved the
 		 * left edge right, it is a pure
 		 * cumulative ACK. Do not count
 		 * DupAck for this. Also required
 		 * for RFC6675 rescue retransmission.
 		 */
-		sack_changed = 0;
+		sack_changed = SACK_NOCHANGE;
 	tp->sackhint.delivered_data = delivered_data;
 	tp->sackhint.sacked_bytes += delivered_data - left_edge_delta;
 	tp->sackhint.lost_bytes = tp->sackhint.hole_bytes - notlost_bytes;
 	KASSERT((delivered_data >= 0), ("delivered_data < 0"));
 	KASSERT((tp->sackhint.sacked_bytes >= 0), ("sacked_bytes < 0"));
 	return (sack_changed);
 }
 
 /*
  * Free all SACK holes to clear the scoreboard.
  */
 void
 tcp_free_sackholes(struct tcpcb *tp)
 {
 	struct sackhole *q;
 
 	INP_WLOCK_ASSERT(tptoinpcb(tp));
 	while ((q = TAILQ_FIRST(&tp->snd_holes)) != NULL)
 		tcp_sackhole_remove(tp, q);
 	tp->sackhint.sack_bytes_rexmit = 0;
 	tp->sackhint.delivered_data = 0;
 	tp->sackhint.sacked_bytes = 0;
 	tp->sackhint.hole_bytes = 0;
 	tp->sackhint.lost_bytes = 0;
 
 	KASSERT(tp->snd_numholes == 0, ("tp->snd_numholes == 0"));
 	KASSERT(tp->sackhint.nexthole == NULL,
 		("tp->sackhint.nexthole == NULL"));
 }
 
 /*
  * Partial ack handling within a sack recovery episode.  Keeping this very
  * simple for now.  When a partial ack is received, force snd_cwnd to a value
  * that will allow the sender to transmit no more than 2 segments.  If
  * necessary, a better scheme can be adopted at a later point, but for now,
  * the goal is to prevent the sender from bursting a large amount of data in
  * the midst of sack recovery.
  */
 void
 tcp_sack_partialack(struct tcpcb *tp, struct tcphdr *th)
 {
 	struct sackhole *temp;
 	int num_segs = 1;
 	u_int maxseg = tcp_maxseg(tp);
 
 	INP_WLOCK_ASSERT(tptoinpcb(tp));
 	tcp_timer_activate(tp, TT_REXMT, 0);
 	tp->t_rtttime = 0;
 	/* Send one or 2 segments based on how much new data was acked. */
 	if ((BYTES_THIS_ACK(tp, th) / maxseg) >= 2)
 		num_segs = 2;
 	tp->snd_cwnd = (tp->sackhint.sack_bytes_rexmit +
 	    (tp->snd_nxt - tp->snd_recover) + num_segs * maxseg);
 	if (tp->snd_cwnd > tp->snd_ssthresh)
 		tp->snd_cwnd = tp->snd_ssthresh;
 	tp->t_flags |= TF_ACKNOW;
 	/*
 	 * RFC6675 rescue retransmission
 	 * Add a hole between th_ack (snd_una is not yet set) and snd_max,
 	 * if this was a pure cumulative ACK and no data was send beyond
 	 * recovery point. Since the data in the socket has not been freed
 	 * at this point, we check if the scoreboard is empty, and the ACK
 	 * delivered some new data, indicating a full ACK. Also, if the
 	 * recovery point is still at snd_max, we are probably application
 	 * limited. However, this inference might not always be true. The
 	 * rescue retransmission may rarely be slightly premature
 	 * compared to RFC6675.
 	 * The corresponding ACK+SACK will cause any further outstanding
 	 * segments to be retransmitted. This addresses a corner case, when
 	 * the trailing packets of a window are lost and no further data
 	 * is available for sending.
 	 */
 	if ((V_tcp_do_newsack) &&
 	    SEQ_LT(th->th_ack, tp->snd_recover) &&
 	    TAILQ_EMPTY(&tp->snd_holes) &&
 	    (tp->sackhint.delivered_data > 0)) {
 		/*
 		 * Exclude FIN sequence space in
 		 * the hole for the rescue retransmission,
 		 * and also don't create a hole, if only
 		 * the ACK for a FIN is outstanding.
 		 */
 		tcp_seq highdata = tp->snd_max;
 		if (tp->t_flags & TF_SENTFIN)
 			highdata--;
 		highdata = SEQ_MIN(highdata, tp->snd_recover);
 		if (th->th_ack != highdata) {
 			tp->snd_fack = th->th_ack;
 			if ((temp = tcp_sackhole_insert(tp, SEQ_MAX(th->th_ack,
 			    highdata - maxseg), highdata, NULL)) != NULL)
 				tp->sackhint.hole_bytes += temp->end -
 								temp->start;
 		}
 	}
 	(void) tcp_output(tp);
 }
 
 #if 0
 /*
  * Debug version of tcp_sack_output() that walks the scoreboard.  Used for
  * now to sanity check the hint.
  */
 static struct sackhole *
 tcp_sack_output_debug(struct tcpcb *tp, int *sack_bytes_rexmt)
 {
 	struct sackhole *p;
 
 	INP_WLOCK_ASSERT(tptoinpcb(tp));
 	*sack_bytes_rexmt = 0;
 	TAILQ_FOREACH(p, &tp->snd_holes, scblink) {
 		if (SEQ_LT(p->rxmit, p->end)) {
 			if (SEQ_LT(p->rxmit, tp->snd_una)) {/* old SACK hole */
 				continue;
 			}
 			*sack_bytes_rexmt += (p->rxmit - p->start);
 			break;
 		}
 		*sack_bytes_rexmt += (SEQ_MIN(p->rxmit, p->end) - p->start);
 	}
 	return (p);
 }
 #endif
 
 /*
  * Returns the next hole to retransmit and the number of retransmitted bytes
  * from the scoreboard.  We store both the next hole and the number of
  * retransmitted bytes as hints (and recompute these on the fly upon SACK/ACK
  * reception).  This avoids scoreboard traversals completely.
  *
  * The loop here will traverse *at most* one link.  Here's the argument.  For
  * the loop to traverse more than 1 link before finding the next hole to
  * retransmit, we would need to have at least 1 node following the current
  * hint with (rxmit == end).  But, for all holes following the current hint,
  * (start == rxmit), since we have not yet retransmitted from them.
  * Therefore, in order to traverse more 1 link in the loop below, we need to
  * have at least one node following the current hint with (start == rxmit ==
  * end).  But that can't happen, (start == end) means that all the data in
  * that hole has been sacked, in which case, the hole would have been removed
  * from the scoreboard.
  */
 struct sackhole *
 tcp_sack_output(struct tcpcb *tp, int *sack_bytes_rexmt)
 {
 	struct sackhole *hole = NULL;
 
 	INP_WLOCK_ASSERT(tptoinpcb(tp));
 	*sack_bytes_rexmt = tp->sackhint.sack_bytes_rexmit;
 	hole = tp->sackhint.nexthole;
 	if (hole == NULL)
 		return (hole);
 	if (SEQ_GEQ(hole->rxmit, hole->end)) {
 		for (;;) {
 			hole = TAILQ_NEXT(hole, scblink);
 			if (hole == NULL)
 				return (hole);
 			if (SEQ_LT(hole->rxmit, hole->end)) {
 				tp->sackhint.nexthole = hole;
 				break;
 			}
 		}
 	}
 	KASSERT(SEQ_LT(hole->start, hole->end), ("%s: hole.start >= hole.end", __func__));
 	if (!(V_tcp_do_newsack)) {
 		KASSERT(SEQ_LT(hole->start, tp->snd_fack), ("%s: hole.start >= snd.fack", __func__));
 		KASSERT(SEQ_LT(hole->end, tp->snd_fack), ("%s: hole.end >= snd.fack", __func__));
 		KASSERT(SEQ_LT(hole->rxmit, tp->snd_fack), ("%s: hole.rxmit >= snd.fack", __func__));
 		if (SEQ_GEQ(hole->start, hole->end) ||
 		    SEQ_GEQ(hole->start, tp->snd_fack) ||
 		    SEQ_GEQ(hole->end, tp->snd_fack) ||
 		    SEQ_GEQ(hole->rxmit, tp->snd_fack)) {
 			log(LOG_CRIT,"tcp: invalid SACK hole (%u-%u,%u) vs fwd ack %u, ignoring.\n",
 					hole->start, hole->end, hole->rxmit, tp->snd_fack);
 			return (NULL);
 		}
 	}
 	return (hole);
 }
 
 /*
  * After a timeout, the SACK list may be rebuilt.  This SACK information
  * should be used to avoid retransmitting SACKed data.  This function
  * traverses the SACK list to see if snd_nxt should be moved forward.
  */
 void
 tcp_sack_adjust(struct tcpcb *tp)
 {
 	struct sackhole *p, *cur = TAILQ_FIRST(&tp->snd_holes);
 
 	INP_WLOCK_ASSERT(tptoinpcb(tp));
 	if (cur == NULL)
 		return; /* No holes */
 	if (SEQ_GEQ(tp->snd_nxt, tp->snd_fack))
 		return; /* We're already beyond any SACKed blocks */
 	/*-
 	 * Two cases for which we want to advance snd_nxt:
 	 * i) snd_nxt lies between end of one hole and beginning of another
 	 * ii) snd_nxt lies between end of last hole and snd_fack
 	 */
 	while ((p = TAILQ_NEXT(cur, scblink)) != NULL) {
 		if (SEQ_LT(tp->snd_nxt, cur->end))
 			return;
 		if (SEQ_GEQ(tp->snd_nxt, p->start))
 			cur = p;
 		else {
 			tp->snd_nxt = p->start;
 			return;
 		}
 	}
 	if (SEQ_LT(tp->snd_nxt, cur->end))
 		return;
 	tp->snd_nxt = tp->snd_fack;
 }
 
 /*
  * Lost Retransmission Detection
  * Check is FACK is beyond the rexmit of the leftmost hole.
  * If yes, we restart sending from still existing holes,
  * and adjust cwnd via the congestion control module.
  */
 void
 tcp_sack_lost_retransmission(struct tcpcb *tp, struct tcphdr *th)
 {
 	struct sackhole *temp;
 
 	if (IN_RECOVERY(tp->t_flags) &&
 	    SEQ_GT(tp->snd_fack, tp->snd_recover) &&
 	    ((temp = TAILQ_FIRST(&tp->snd_holes)) != NULL) &&
 	    SEQ_GEQ(temp->rxmit, temp->end) &&
 	    SEQ_GEQ(tp->snd_fack, temp->rxmit)) {
 		TCPSTAT_INC(tcps_sack_lostrexmt);
 		/*
 		 * Start retransmissions from the first hole, and
 		 * subsequently all other remaining holes, including
 		 * those, which had been sent completely before.
 		 */
 		tp->sackhint.nexthole = temp;
 		TAILQ_FOREACH(temp, &tp->snd_holes, scblink) {
 			if (SEQ_GEQ(tp->snd_fack, temp->rxmit) &&
 			    SEQ_GEQ(temp->rxmit, temp->end))
 				temp->rxmit = temp->start;
 		}
 		/*
 		 * Remember the old ssthresh, to deduct the beta factor used
 		 * by the CC module. Finally, set cwnd to ssthresh just
 		 * prior to invoking another cwnd reduction by the CC
 		 * module, to not shrink it excessively.
 		 */
 		tp->snd_cwnd = tp->snd_ssthresh;
 		/*
 		 * Formally exit recovery, and let the CC module adjust
 		 * ssthresh as intended.
 		 */
 		EXIT_RECOVERY(tp->t_flags);
 		cc_cong_signal(tp, th, CC_NDUPACK);
 		/*
 		 * For PRR, adjust recover_fs as if this new reduction
 		 * initialized this variable.
 		 * cwnd will be adjusted by SACK or PRR processing
 		 * subsequently, only set it to a safe value here.
 		 */
 		tp->snd_cwnd = tcp_maxseg(tp);
 		tp->sackhint.recover_fs = (tp->snd_max - tp->snd_una) -
 					    tp->sackhint.recover_fs;
 	}
 }
diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h
index c6e24b187e0f..2236a1385b44 100644
--- a/sys/netinet/tcp_var.h
+++ b/sys/netinet/tcp_var.h
@@ -1,1613 +1,1620 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1982, 1986, 1993, 1994, 1995
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)tcp_var.h	8.4 (Berkeley) 5/24/95
  */
 
 #ifndef _NETINET_TCP_VAR_H_
 #define _NETINET_TCP_VAR_H_
 
 #include <netinet/tcp.h>
 #include <netinet/tcp_fsm.h>
 
 #ifdef _KERNEL
 #include "opt_kern_tls.h"
 #include <net/vnet.h>
 #include <sys/mbuf.h>
 #include <sys/ktls.h>
 #endif
 
 #define TCP_END_BYTE_INFO 8	/* Bytes that makeup the "end information array" */
 /* Types of ending byte info */
 #define TCP_EI_EMPTY_SLOT	0
 #define TCP_EI_STATUS_CLIENT_FIN	0x1
 #define TCP_EI_STATUS_CLIENT_RST	0x2
 #define TCP_EI_STATUS_SERVER_FIN	0x3
 #define TCP_EI_STATUS_SERVER_RST	0x4
 #define TCP_EI_STATUS_RETRAN		0x5
 #define TCP_EI_STATUS_PROGRESS		0x6
 #define TCP_EI_STATUS_PERSIST_MAX	0x7
 #define TCP_EI_STATUS_KEEP_MAX		0x8
 #define TCP_EI_STATUS_DATA_A_CLOSE	0x9
 #define TCP_EI_STATUS_RST_IN_FRONT	0xa
 #define TCP_EI_STATUS_2MSL		0xb
 #define TCP_EI_STATUS_MAX_VALUE		0xb
 
 #define TCP_TRK_REQ_LOG_NEW		0x01
 #define TCP_TRK_REQ_LOG_COMPLETE	0x02
 #define TCP_TRK_REQ_LOG_FREED		0x03
 #define TCP_TRK_REQ_LOG_ALLOCFAIL	0x04
 #define TCP_TRK_REQ_LOG_MOREYET	0x05
 #define TCP_TRK_REQ_LOG_FORCEFREE	0x06
 #define TCP_TRK_REQ_LOG_STALE		0x07
 #define TCP_TRK_REQ_LOG_SEARCH		0x08
 
 /************************************************/
 /* Status bits we track to assure no duplicates,
  * the bits here are not used by the code but
  * for human representation. To check a bit we
  * take and shift over by 1 minus the value (1-8).
  */
 /************************************************/
 #define TCP_EI_BITS_CLIENT_FIN	0x001
 #define TCP_EI_BITS_CLIENT_RST	0x002
 #define TCP_EI_BITS_SERVER_FIN	0x004
 #define TCP_EI_BITS_SERVER_RST	0x008
 #define TCP_EI_BITS_RETRAN	0x010
 #define TCP_EI_BITS_PROGRESS	0x020
 #define TCP_EI_BITS_PRESIST_MAX	0x040
 #define TCP_EI_BITS_KEEP_MAX	0x080
 #define TCP_EI_BITS_DATA_A_CLO  0x100
 #define TCP_EI_BITS_RST_IN_FR	0x200	/* a front state reset */
 #define TCP_EI_BITS_2MS_TIMER	0x400	/* 2 MSL timer expired */
 
 #if defined(_KERNEL) || defined(_WANT_TCPCB)
 #include <netinet/cc/cc.h>
 
 /* TCP segment queue entry */
 struct tseg_qent {
 	TAILQ_ENTRY(tseg_qent) tqe_q;
 	struct	mbuf   *tqe_m;		/* mbuf contains packet */
 	struct  mbuf   *tqe_last;	/* last mbuf in chain */
 	tcp_seq tqe_start;		/* TCP Sequence number start */
 	int	tqe_len;		/* TCP segment data length */
 	uint32_t tqe_flags;		/* The flags from tcp_get_flags() */
 	uint32_t tqe_mbuf_cnt;		/* Count of mbuf overhead */
 };
 TAILQ_HEAD(tsegqe_head, tseg_qent);
 
 struct sackblk {
 	tcp_seq start;		/* start seq no. of sack block */
 	tcp_seq end;		/* end seq no. */
 };
 
 struct sackhole {
 	tcp_seq start;		/* start seq no. of hole */
 	tcp_seq end;		/* end seq no. */
 	tcp_seq rxmit;		/* next seq. no in hole to be retransmitted */
 	TAILQ_ENTRY(sackhole) scblink;	/* scoreboard linkage */
 };
 
 struct sackhint {
 	struct sackhole	*nexthole;
 	int32_t		sack_bytes_rexmit;
 	tcp_seq		last_sack_ack;	/* Most recent/largest sacked ack */
 
 	int32_t		delivered_data; /* Newly acked data from last SACK */
 
 	int32_t		sacked_bytes;	/* Total sacked bytes reported by the
 					 * receiver via sack option
 					 */
 	uint32_t	recover_fs;	/* Flight Size at the start of Loss recovery */
 	uint32_t	prr_delivered;	/* Total bytes delivered using PRR */
 	uint32_t	prr_out;	/* Bytes sent during IN_RECOVERY */
 	int32_t		hole_bytes;	/* current number of bytes in scoreboard holes */
 	int32_t		lost_bytes;	/* number of rfc6675 IsLost() bytes */
 };
 
 #define SEGQ_EMPTY(tp) TAILQ_EMPTY(&(tp)->t_segq)
 
 STAILQ_HEAD(tcp_log_stailq, tcp_log_mem);
 
 #define TCP_TRK_TRACK_FLG_EMPTY 0x00	/* Available */
 #define TCP_TRK_TRACK_FLG_USED  0x01	/* In use */
 #define TCP_TRK_TRACK_FLG_OPEN  0x02	/* End is not valid (open range request) */
 #define TCP_TRK_TRACK_FLG_SEQV  0x04	/* We had a sendfile that touched it  */
 #define TCP_TRK_TRACK_FLG_COMP  0x08	/* Sendfile as placed the last bits (range req only) */
 #define TCP_TRK_TRACK_FLG_FSND	 0x10	/* First send has been done into the seq space */
 #define MAX_TCP_TRK_REQ 5		/* Max we will have at once */
 
 struct tcp_sendfile_track {
 	uint64_t timestamp;	/* User sent timestamp */
 	uint64_t start;		/* Start of sendfile offset */
 	uint64_t end;		/* End if not open-range req */
 	uint64_t localtime;	/* Time we actually got the req */
 	uint64_t deadline;	/* If in CU mode, deadline to delivery */
 	uint64_t first_send;	/* Time of first send in the range */
 	uint64_t cspr;		/* Client suggested pace rate */
 	uint64_t sent_at_fs;	/* What was t_sndbytes as we begun sending */
 	uint64_t rxt_at_fs;	/* What was t_snd_rxt_bytes as we begun sending */
 	tcp_seq start_seq;	/* First TCP Seq assigned */
 	tcp_seq end_seq;	/* If range req last seq */
 	uint32_t flags;		/* Type of request open etc */
 	uint32_t sbcc_at_s;	/* When we allocate what is the sb_cc */
 	uint32_t hint_maxseg;	/* Client hinted maxseg */
 	uint32_t hybrid_flags;	/* Hybrid flags on this request */
 };
 
 
 /*
  * Change Query responses for a stack switch we create a structure
  * that allows query response from the new stack to the old, if
  * supported.
  *
  * There are three queries currently defined.
  *  - sendmap
  *  - timers
  *  - rack_times
  *
  * For the sendmap query the caller fills in the
  * req and the req_param as the first seq (usually
  * snd_una). When the response comes back indicating
  * that there was data (return value 1), then the caller
  * can build a sendmap entry based on the range and the
  * times. The next query would then be done at the 
  * newly created sendmap_end. Repeated until sendmap_end == snd_max.
  *
  * Flags in sendmap_flags are defined below as well.
  *
  * For timers the standard PACE_TMR_XXXX flags are returned indicating
  * a pacing timer (possibly) and one other timer. If pacing timer then
  * the expiration timeout time in microseconds is in timer_pacing_to.
  * And the value used with whatever timer (if a flag is set) is in
  * timer_rxt. If no timers are running a 0 is returned and of
  * course no flags are set in timer_hpts_flags.
  *
  * The rack_times are a misc collection of information that
  * the old stack might possibly fill in. Of course its possible
  * that an old stack may not have a piece of information. If so
  * then setting that value to zero is advised. Setting any 
  * timestamp passed should only place a zero in it when it
  * is unfilled. This may mean that a time is off by a micro-second
  * but this is ok in the grand scheme of things.
  *
  * When switching stacks it is desireable to get as much information
  * from the old stack to the new stack as possible. Though not always
  * will the stack be compatible in the types of information. The
  * init() function needs to take care when it begins changing 
  * things such as inp_flags2 and the timer units to position these
  * changes at a point where it is unlikely they will fail after
  * making such changes. A stack optionally can have an "undo"
  * function  
  *
  * To transfer information to the old stack from the new in 
  * respect to LRO and the inp_flags2, the new stack should set
  * the inp_flags2 to what it supports. The old stack in its
  * fini() function should call the tcp_handle_orphaned_packets()
  * to clean up any packets. Note that a new stack should attempt
  */
 
 /* Query types */
 #define TCP_QUERY_SENDMAP	1
 #define TCP_QUERY_TIMERS_UP	2
 #define TCP_QUERY_RACK_TIMES	3
 
 /* Flags returned in sendmap_flags */
 #define SNDMAP_ACKED		0x000001/* The remote endpoint acked this */
 #define SNDMAP_OVERMAX		0x000008/* We have more retran's then we can fit */
 #define SNDMAP_SACK_PASSED	0x000010/* A sack was done above this block */
 #define SNDMAP_HAS_FIN		0x000040/* segment is sent with fin */
 #define SNDMAP_TLP		0x000080/* segment sent as tail-loss-probe */
 #define SNDMAP_HAS_SYN		0x000800/* SYN is on this guy */
 #define SNDMAP_HAD_PUSH		0x008000/* Push was sent on original send */
 #define SNDMAP_MASK  (SNDMAP_ACKED|SNDMAP_OVERMAX|SNDMAP_SACK_PASSED|SNDMAP_HAS_FIN\
 		      |SNDMAP_TLP|SNDMAP_HAS_SYN|SNDMAP_HAD_PUSH)
 #define SNDMAP_NRTX 3
 
 struct tcp_query_resp {
 	int req;
 	uint32_t req_param;
 	union {
 		struct {
 			tcp_seq sendmap_start;
 			tcp_seq sendmap_end;
 			int sendmap_send_cnt;
 			uint64_t sendmap_time[SNDMAP_NRTX];
 			uint64_t sendmap_ack_arrival;
 			int sendmap_flags;
 			uint32_t sendmap_r_rtr_bytes;
 			/* If FAS is available if not 0 */
 			uint32_t sendmap_fas;
 			uint8_t sendmap_dupacks;
 		};
 		struct {
 			uint32_t timer_hpts_flags;
 			uint32_t timer_pacing_to;
 			uint32_t timer_timer_exp;
 		};
 		struct {
 			/* Timestamps and rtt's */
 			uint32_t rack_reorder_ts;	/* Last uscts that reordering was seen */
 			uint32_t rack_num_dsacks;	/* Num of dsacks seen */
 			uint32_t rack_rxt_last_time; 	/* Last time a RXT/TLP or rack tmr  went off */
 			uint32_t rack_min_rtt;		/* never 0 smallest rtt seen */
 			uint32_t rack_rtt;		/* Last rtt used by rack */
 			uint32_t rack_tmit_time;	/* The time the rtt seg was tmited */
 			uint32_t rack_time_went_idle;	/* If in persist the time we went idle */
 			/* Prr data  */
 			uint32_t rack_sacked;
 			uint32_t rack_holes_rxt;
 			uint32_t rack_prr_delivered;
 			uint32_t rack_prr_recovery_fs;
 			uint32_t rack_prr_out;
 			uint32_t rack_prr_sndcnt;
 			/* TLP data */
 			uint16_t rack_tlp_cnt_out;	/* How many tlp's have been sent */
 			/* Various bits */
 			uint8_t  rack_tlp_out;		/* Is a TLP outstanding */
 			uint8_t  rack_srtt_measured;	/* The previous stack has measured srtt */
 			uint8_t  rack_in_persist;	/* Is the old stack in persists? */
 			uint8_t	 rack_wanted_output;	/* Did the prevous stack have a want output set */
 		};
 	};
 };
 
 #define TCP_TMR_GRANULARITY_TICKS	1	/* TCP timers are in ticks (msec if hz=1000)  */
 #define TCP_TMR_GRANULARITY_USEC	2	/* TCP timers are in microseconds */
 
 typedef enum {
 	TT_REXMT = 0,
 	TT_PERSIST,
 	TT_KEEP,
 	TT_2MSL,
 	TT_DELACK,
 	TT_N,
 } tt_which;
 
 typedef enum {
 	TT_PROCESSING = 0,
 	TT_PROCESSED,
 	TT_STARTING,
 	TT_STOPPING,
 } tt_what;
 
 /*
  * Tcp control block, one per tcp connection.
  */
 struct tcpcb {
 	struct inpcb t_inpcb;		/* embedded protocol independent cb */
 #define	t_start_zero	t_fb
 #define	t_zero_size	(sizeof(struct tcpcb) - \
 			    offsetof(struct tcpcb, t_start_zero))
 	struct tcp_function_block *t_fb;/* TCP function call block */
 	void	*t_fb_ptr;		/* Pointer to t_fb specific data */
 
 	struct callout t_callout;
 	sbintime_t t_timers[TT_N];
 	sbintime_t t_precisions[TT_N];
 
 	/* HPTS. Used by BBR and Rack stacks. See tcp_hpts.c for more info. */
 	TAILQ_ENTRY(tcpcb)	t_hpts;		/* linkage to HPTS ring */
 	STAILQ_HEAD(, mbuf)	t_inqueue;	/* HPTS input packets queue */
 	uint32_t t_hpts_request;	/* Current hpts request, zero if
 					 * fits in the pacing window. */
 	uint32_t t_hpts_slot;		/* HPTS wheel slot this tcb is. */
 	uint32_t t_hpts_drop_reas;	/* Reason we are dropping the pcb. */
 	uint32_t t_hpts_gencnt;
 	uint16_t t_hpts_cpu;		/* CPU chosen by hpts_cpuid(). */
 	uint16_t t_lro_cpu;		/* CPU derived from LRO. */
 #define	HPTS_CPU_NONE	((uint16_t)-1)
 	enum {
 		IHPTS_NONE = 0,
 		IHPTS_ONQUEUE,
 		IHPTS_MOVING,
 	} t_in_hpts;			/* Is it linked into HPTS? */
 
 	uint32_t t_maxseg:24,		/* maximum segment size */
 		_t_logstate:8;		/* State of "black box" logging */
 	uint32_t t_port:16,		/* Tunneling (over udp) port */
 		t_state:4,		/* state of this connection */
 		t_idle_reduce : 1,
 		t_delayed_ack: 7,	/* Delayed ack variable */
 		t_fin_is_rst: 1,	/* Are fin's treated as resets */
 		t_log_state_set: 1,
 		bits_spare : 2;
 	u_int	t_flags;
 	tcp_seq	snd_una;		/* sent but unacknowledged */
 	tcp_seq	snd_max;		/* highest sequence number sent;
 					 * used to recognize retransmits
 					 */
 	tcp_seq snd_nxt;		/* send next */
 	tcp_seq snd_up;			/* send urgent pointer */
 	uint32_t snd_wnd;		/* send window */
 	uint32_t snd_cwnd;		/* congestion-controlled window */
 	uint32_t ts_offset;		/* our timestamp offset */
 	uint32_t rfbuf_ts;		/* recv buffer autoscaling timestamp */
 	int	rcv_numsacks;		/* # distinct sack blks present */
 	u_int	t_tsomax;		/* TSO total burst length limit */
 	u_int	t_tsomaxsegcount;	/* TSO maximum segment count */
 	u_int	t_tsomaxsegsize;	/* TSO maximum segment size in bytes */
 	tcp_seq	rcv_nxt;		/* receive next */
 	tcp_seq	rcv_adv;		/* advertised window */
 	uint32_t rcv_wnd;		/* receive window */
 	u_int	t_flags2;		/* More tcpcb flags storage */
 	int	t_srtt;			/* smoothed round-trip time */
 	int	t_rttvar;		/* variance in round-trip time */
 	uint32_t ts_recent;		/* timestamp echo data */
 	u_char	snd_scale;		/* window scaling for send window */
 	u_char	rcv_scale;		/* window scaling for recv window */
 	u_char	snd_limited;		/* segments limited transmitted */
 	u_char	request_r_scale;	/* pending window scaling */
 	tcp_seq	last_ack_sent;
 	u_int	t_rcvtime;		/* inactivity time */
 	tcp_seq	rcv_up;			/* receive urgent pointer */
 	int	t_segqlen;		/* segment reassembly queue length */
 	uint32_t t_segqmbuflen;		/* total reassembly queue byte length */
 	struct	tsegqe_head t_segq;	/* segment reassembly queue */
 	uint32_t snd_ssthresh;		/* snd_cwnd size threshold for
 					 * for slow start exponential to
 					 * linear switch
 					 */
 	tcp_seq	snd_wl1;		/* window update seg seq number */
 	tcp_seq	snd_wl2;		/* window update seg ack number */
 
 	tcp_seq	irs;			/* initial receive sequence number */
 	tcp_seq	iss;			/* initial send sequence number */
 	u_int	t_acktime;		/* RACK and BBR incoming new data was acked */
 	u_int	t_sndtime;		/* time last data was sent */
 	u_int	ts_recent_age;		/* when last updated */
 	tcp_seq	snd_recover;		/* for use in NewReno Fast Recovery */
 	char	t_oobflags;		/* have some */
 	char	t_iobc;			/* input character */
 	uint8_t t_nic_ktls_xmit:1,	/* active nic ktls xmit sessions */
 		t_nic_ktls_xmit_dis:1,	/* disabled nic xmit ktls? */
 		t_nic_ktls_spare:6;	/* spare nic ktls */
 	int	t_rxtcur;		/* current retransmit value (ticks) */
 
 	int	t_rxtshift;		/* log(2) of rexmt exp. backoff */
 	u_int	t_rtttime;		/* RTT measurement start time */
 
 	tcp_seq	t_rtseq;		/* sequence number being timed */
 	u_int	t_starttime;		/* time connection was established */
 	u_int	t_fbyte_in;		/* ticks time first byte queued in */
 	u_int	t_fbyte_out;		/* ticks time first byte queued out */
 
 	u_int	t_pmtud_saved_maxseg;	/* pre-blackhole MSS */
 	int	t_blackhole_enter;	/* when to enter blackhole detection */
 	int	t_blackhole_exit;	/* when to exit blackhole detection */
 	u_int	t_rttmin;		/* minimum rtt allowed */
 
 	int	t_softerror;		/* possible error not yet reported */
 	uint32_t max_sndwnd;		/* largest window peer has offered */
 	uint32_t snd_cwnd_prev;		/* cwnd prior to retransmit */
 	uint32_t snd_ssthresh_prev;	/* ssthresh prior to retransmit */
 	tcp_seq	snd_recover_prev;	/* snd_recover prior to retransmit */
 	int	t_sndzerowin;		/* zero-window updates sent */
 	int	snd_numholes;		/* number of holes seen by sender */
 	u_int	t_badrxtwin;		/* window for retransmit recovery */
 	TAILQ_HEAD(sackhole_head, sackhole) snd_holes;
 					/* SACK scoreboard (sorted) */
 	tcp_seq	snd_fack;		/* last seq number(+1) sack'd by rcv'r*/
 	struct sackblk sackblks[MAX_SACK_BLKS]; /* seq nos. of sack blocks */
 	struct sackhint	sackhint;	/* SACK scoreboard hint */
 	int	t_rttlow;		/* smallest observerved RTT */
 	int	rfbuf_cnt;		/* recv buffer autoscaling byte count */
 	struct toedev	*tod;		/* toedev handling this connection */
 	int	t_sndrexmitpack;	/* retransmit packets sent */
 	int	t_rcvoopack;		/* out-of-order packets received */
 	void	*t_toe;			/* TOE pcb pointer */
 	struct cc_algo	*t_cc;		/* congestion control algorithm */
 	struct cc_var	t_ccv;		/* congestion control specific vars */
 	int	t_bytes_acked;		/* # bytes acked during current RTT */
 	u_int	t_maxunacktime;
 	u_int	t_keepinit;		/* time to establish connection */
 	u_int	t_keepidle;		/* time before keepalive probes begin */
 	u_int	t_keepintvl;		/* interval between keepalives */
 	u_int	t_keepcnt;		/* number of keepalives before close */
 	int	t_dupacks;		/* consecutive dup acks recd */
 	int	t_lognum;		/* Number of log entries */
 	int	t_loglimit;		/* Maximum number of log entries */
 	uint32_t t_rcep;		/* Number of received CE marked pkts */
 	uint32_t t_scep;		/* Synced number of delivered CE pkts */
 	int64_t	t_pacing_rate;		/* bytes / sec, -1 => unlimited */
 	struct tcp_log_stailq t_logs;	/* Log buffer */
 	struct tcp_log_id_node *t_lin;
 	struct tcp_log_id_bucket *t_lib;
 	const char *t_output_caller;	/* Function that called tcp_output */
 	struct statsblob *t_stats;	/* Per-connection stats */
 	/* Should these be a pointer to the arrays or an array? */
 	uint32_t t_logsn;		/* Log "serial number" */
 	uint32_t gput_ts;		/* Time goodput measurement started */
 	tcp_seq gput_seq;		/* Outbound measurement seq */
 	tcp_seq gput_ack;		/* Inbound measurement ack */
 	int32_t t_stats_gput_prev;	/* XXXLAS: Prev gput measurement */
 	uint32_t t_maxpeakrate;		/* max peak rate set by user, bytes/s */
 	uint32_t t_sndtlppack;		/* tail loss probe packets sent */
 	uint64_t t_sndtlpbyte;		/* total tail loss probe bytes sent */
 	uint64_t t_sndbytes;		/* total bytes sent */
 	uint64_t t_snd_rxt_bytes;	/* total bytes retransmitted */
 	uint32_t t_dsack_bytes;		/* dsack bytes received */
 	uint32_t t_dsack_tlp_bytes;	/* dsack bytes received for TLPs sent */
 	uint32_t t_dsack_pack;		/* dsack packets we have eceived */
 	uint8_t t_tmr_granularity;	/* Granularity of all timers srtt etc */
 	uint8_t t_rttupdated;		/* number of times rtt sampled */
 	/* TCP Fast Open */
 	uint8_t t_tfo_client_cookie_len; /* TFO client cookie length */
 	uint32_t t_end_info_status;	/* Status flag of end info */
 	unsigned int *t_tfo_pending;	/* TFO server pending counter */
 	union {
 		uint8_t client[TCP_FASTOPEN_MAX_COOKIE_LEN];
 		uint64_t server;
 	} t_tfo_cookie;			/* TCP Fast Open cookie to send */
 	union {
 		uint8_t t_end_info_bytes[TCP_END_BYTE_INFO];
 		uint64_t t_end_info;
 	};
 	struct osd	t_osd;		/* storage for Khelp module data */
 	uint8_t _t_logpoint;	/* Used when a BB log points is enabled */
 	/*
 	 * Keep all #ifdef'ed components at the end of the structure!
 	 * This is important to minimize problems when compiling modules
 	 * using this structure from within the modules' directory.
 	 */
 #ifdef TCP_REQUEST_TRK
 	/* Response tracking addons. */
 	uint8_t t_tcpreq_req;	/* Request count */
 	uint8_t t_tcpreq_open;	/* Number of open range requests */
 	uint8_t t_tcpreq_closed;	/* Number of closed range requests */
 	uint32_t tcp_hybrid_start;	/* Num of times we started hybrid pacing */
 	uint32_t tcp_hybrid_stop;	/* Num of times we stopped hybrid pacing */
 	uint32_t tcp_hybrid_error;	/* Num of times we failed to start hybrid pacing */
 	struct tcp_sendfile_track t_tcpreq_info[MAX_TCP_TRK_REQ];
 #endif
 #ifdef TCP_ACCOUNTING
 	uint64_t tcp_cnt_counters[TCP_NUM_CNT_COUNTERS];
 	uint64_t tcp_proc_time[TCP_NUM_CNT_COUNTERS];
 #endif
 #ifdef TCPPCAP
 	struct mbufq t_inpkts;		/* List of saved input packets. */
 	struct mbufq t_outpkts;		/* List of saved output packets. */
 #endif
 };
 #endif	/* _KERNEL || _WANT_TCPCB */
 
 #ifdef _KERNEL
 struct tcptemp {
 	u_char	tt_ipgen[40]; /* the size must be of max ip header, now IPv6 */
 	struct	tcphdr tt_t;
 };
 
+/* SACK scoreboard update status */
+typedef enum {
+	SACK_NOCHANGE = 0,
+	SACK_CHANGE,
+	SACK_NEWLOSS
+} sackstatus_t;
+
 /* Enable TCP/UDP tunneling port */
 #define TCP_TUNNELING_PORT_MIN		0
 #define TCP_TUNNELING_PORT_MAX		65535
 #define TCP_TUNNELING_PORT_DEFAULT	0
 
 /* Enable TCP/UDP tunneling port */
 #define TCP_TUNNELING_OVERHEAD_MIN	sizeof(struct udphdr)
 #define TCP_TUNNELING_OVERHEAD_MAX	1024
 #define TCP_TUNNELING_OVERHEAD_DEFAULT	TCP_TUNNELING_OVERHEAD_MIN
 
 /* Minimum map entries limit value, if set */
 #define TCP_MIN_MAP_ENTRIES_LIMIT	128
 
 /*
  * TODO: We yet need to brave plowing in
  * to tcp_input() and the pru_usrreq() block.
  * Right now these go to the old standards which
  * are somewhat ok, but in the long term may
  * need to be changed. If we do tackle tcp_input()
  * then we need to get rid of the tcp_do_segment()
  * function below.
  */
 /* Flags for tcp functions */
 #define	TCP_FUNC_BEING_REMOVED	0x01   	/* Can no longer be referenced */
 #define	TCP_FUNC_OUTPUT_CANDROP	0x02   	/* tfb_tcp_output may ask tcp_drop */
 
 /**
  * If defining the optional tcp_timers, in the
  * tfb_tcp_timer_stop call you must use the
  * callout_async_drain() function with the
  * tcp_timer_discard callback. You should check
  * the return of callout_async_drain() and if 0
  * increment tt_draincnt. Since the timer sub-system
  * does not know your callbacks you must provide a
  * stop_all function that loops through and calls
  * tcp_timer_stop() with each of your defined timers.
  *
  * Adding a tfb_tcp_handoff_ok function allows the socket
  * option to change stacks to query you even if the
  * connection is in a later stage. You return 0 to
  * say you can take over and run your stack, you return
  * non-zero (an error number) to say no you can't.
  * If the function is undefined you can only change
  * in the early states (before connect or listen).
  *
  * tfb_tcp_fb_init is used to allow the new stack to
  * setup its control block. Among the things it must
  * do is:
  * a) Make sure that the inp_flags2 is setup correctly
  *    for LRO. There are two flags that the previous
  *    stack may have set INP_MBUF_ACKCMP and 
  *    INP_SUPPORTS_MBUFQ. If the new stack does not
  *    support these it *should* clear the flags.
  * b) Make sure that the timers are in the proper
  *    granularity that the stack wants. The stack
  *    should check the t_tmr_granularity field. Currently
  *    there are two values that it may hold 
  *    TCP_TMR_GRANULARITY_TICKS and TCP_TMR_GRANULARITY_USEC.
  *    Use the functions tcp_timer_convert(tp, granularity);
  *    to move the timers to the correct format for your stack.
  *
  * The new stack may also optionally query the tfb_chg_query
  * function if the old stack has one. The new stack may ask
  * for one of three entries and can also state to the old
  * stack its support for the INP_MBUF_ACKCMP and 
  * INP_SUPPORTS_MBUFQ. This is important since if there are
  * queued ack's without that statement the old stack will
  * be forced to discard the queued acks. The requests that
  * can be made for information by the new stacks are:
  *
  * Note also that the tfb_tcp_fb_init() when called can
  * determine if a query is needed by looking at the 
  * value passed in the ptr. The ptr is designed to be
  * set in with any allocated memory, but the address
  * of the condtion (ptr == &tp->t_fb_ptr) will be
  * true if this is not a stack switch but the initial
  * setup of a tcb (which means no query would be needed).
  * If, however, the value is not t_fb_ptr, then the caller
  * is in the middle of a stack switch and is the new stack.
  * A query would be appropriate (if the new stack support 
  * the query mechanism).
  *
  * TCP_QUERY_SENDMAP - Query of outstanding data.
  * TCP_QUERY_TIMERS_UP	- Query about running timers.
  * TCP_SUPPORTED_LRO - Declaration in req_param of 
  *                     the inp_flags2 supported by 
  *                     the new stack.
  * TCP_QUERY_RACK_TIMES	- Enquire about various timestamps
  *                        and states the old stack may be in.
  * 
  * tfb_tcp_fb_fini is changed to add a flag to tell
  * the old stack if the tcb is being destroyed or
  * not. A one in the flag means the TCB is being
  * destroyed, a zero indicates its transitioning to
  * another stack (via socket option). The
  * tfb_tcp_fb_fini() function itself should not change timers
  * or inp_flags2 (the tfb_tcp_fb_init() must do that). However
  * if the old stack supports the LRO mbuf queuing, and the new
  * stack does not communicate via chg messages that it too does,
  * it must assume it does not and free any queued mbufs.
  *
  */
 struct tcp_function_block {
 	char tfb_tcp_block_name[TCP_FUNCTION_NAME_LEN_MAX];
 	int	(*tfb_tcp_output)(struct tcpcb *);
 	void	(*tfb_tcp_do_segment)(struct tcpcb *, struct mbuf *,
 		    struct tcphdr *, int, int, uint8_t);
 	int      (*tfb_do_segment_nounlock)(struct tcpcb *, struct mbuf *,
 		    struct tcphdr *, int, int, uint8_t, int, struct timeval *);
 	int     (*tfb_do_queued_segments)(struct tcpcb *, int);
 	int     (*tfb_tcp_ctloutput)(struct tcpcb *, struct sockopt *);
 	/* Optional memory allocation/free routine */
 	int	(*tfb_tcp_fb_init)(struct tcpcb *, void **);
 	void	(*tfb_tcp_fb_fini)(struct tcpcb *, int);
 	/* Optional timers, must define all if you define one */
 	int	(*tfb_tcp_timer_stop_all)(struct tcpcb *);
 	void	(*tfb_tcp_rexmit_tmr)(struct tcpcb *);
 	int	(*tfb_tcp_handoff_ok)(struct tcpcb *);
 	void	(*tfb_tcp_mtu_chg)(struct tcpcb *tp);
 	int	(*tfb_pru_options)(struct tcpcb *, int);
 	void	(*tfb_hwtls_change)(struct tcpcb *, int);
 	int	(*tfb_chg_query)(struct tcpcb *, struct tcp_query_resp *);
 	void	(*tfb_switch_failed)(struct tcpcb *);
 	bool	(*tfb_early_wake_check)(struct tcpcb *);
 	int     (*tfb_compute_pipe)(struct tcpcb *tp);
 	volatile uint32_t tfb_refcnt;
 	uint32_t  tfb_flags;
 	uint8_t	tfb_id;
 };
 
 struct tcp_function {
 	TAILQ_ENTRY(tcp_function)	tf_next;
 	char				tf_name[TCP_FUNCTION_NAME_LEN_MAX];
 	struct tcp_function_block	*tf_fb;
 };
 
 TAILQ_HEAD(tcp_funchead, tcp_function);
 
 struct tcpcb * tcp_drop(struct tcpcb *, int);
 
 #ifdef _NETINET_IN_PCB_H_
 #define	intotcpcb(inp)	__containerof((inp), struct tcpcb, t_inpcb)
 #define	sototcpcb(so)	intotcpcb(sotoinpcb(so))
 #define	tptoinpcb(tp)	(&(tp)->t_inpcb)
 #define	tptosocket(tp)	(tp)->t_inpcb.inp_socket
 
 /*
  * tcp_output()
  * Handles tcp_drop request from advanced stacks and reports that inpcb is
  * gone with negative return code.
  * Drop in replacement for the default stack.
  */
 static inline int
 tcp_output(struct tcpcb *tp)
 {
 	struct inpcb *inp = tptoinpcb(tp);
 	int rv;
 
 	INP_WLOCK_ASSERT(inp);
 
 	rv = tp->t_fb->tfb_tcp_output(tp);
 	if (rv < 0) {
 		KASSERT(tp->t_fb->tfb_flags & TCP_FUNC_OUTPUT_CANDROP,
 		    ("TCP stack %s requested tcp_drop(%p)",
 		    tp->t_fb->tfb_tcp_block_name, tp));
 		tp = tcp_drop(tp, -rv);
 		if (tp)
 			INP_WUNLOCK(inp);
 	}
 
 	return (rv);
 }
 
 /*
  * tcp_output_unlock()
  * Always returns unlocked, handles drop request from advanced stacks.
  * Always returns positive error code.
  */
 static inline int
 tcp_output_unlock(struct tcpcb *tp)
 {
 	struct inpcb *inp = tptoinpcb(tp);
 	int rv;
 
 	INP_WLOCK_ASSERT(inp);
 
 	rv = tp->t_fb->tfb_tcp_output(tp);
 	if (rv < 0) {
 		KASSERT(tp->t_fb->tfb_flags & TCP_FUNC_OUTPUT_CANDROP,
 		    ("TCP stack %s requested tcp_drop(%p)",
 		    tp->t_fb->tfb_tcp_block_name, tp));
 		rv = -rv;
 		tp = tcp_drop(tp, rv);
 		if (tp)
 			INP_WUNLOCK(inp);
 	} else
 		INP_WUNLOCK(inp);
 
 	return (rv);
 }
 
 /*
  * tcp_output_nodrop()
  * Always returns locked.  It is caller's responsibility to run tcp_drop()!
  * Useful in syscall implementations, when we want to perform some logging
  * and/or tracing with tcpcb before calling tcp_drop().  To be used with
  * tcp_unlock_or_drop() later.
  *
  * XXXGL: maybe don't allow stacks to return a drop request at certain
  * TCP states? Why would it do in connect(2)? In recv(2)?
  */
 static inline int
 tcp_output_nodrop(struct tcpcb *tp)
 {
 	int rv;
 
 	INP_WLOCK_ASSERT(tptoinpcb(tp));
 
 	rv = tp->t_fb->tfb_tcp_output(tp);
 	KASSERT(rv >= 0 || tp->t_fb->tfb_flags & TCP_FUNC_OUTPUT_CANDROP,
 	    ("TCP stack %s requested tcp_drop(%p)",
 	    tp->t_fb->tfb_tcp_block_name, tp));
 	return (rv);
 }
 
 /*
  * tcp_unlock_or_drop()
  * Handle return code from tfb_tcp_output() after we have logged/traced,
  * to be used with tcp_output_nodrop().
  */
 static inline int
 tcp_unlock_or_drop(struct tcpcb *tp, int tcp_output_retval)
 {
 	struct inpcb *inp = tptoinpcb(tp);
 
 	INP_WLOCK_ASSERT(inp);
 
         if (tcp_output_retval < 0) {
                 tcp_output_retval = -tcp_output_retval;
                 if (tcp_drop(tp, tcp_output_retval) != NULL)
                         INP_WUNLOCK(inp);
         } else
 		INP_WUNLOCK(inp);
 
 	return (tcp_output_retval);
 }
 #endif	/* _NETINET_IN_PCB_H_ */
 
 static int inline
 tcp_packets_this_ack(struct tcpcb *tp, tcp_seq ack)
 {
 	return ((ack - tp->snd_una) / tp->t_maxseg +
 		((((ack - tp->snd_una) % tp->t_maxseg) != 0) ? 1 : 0));
 }
 #endif	/* _KERNEL */
 
 /*
  * Flags and utility macros for the t_flags field.
  */
 #define	TF_ACKNOW	0x00000001	/* ack peer immediately */
 #define	TF_DELACK	0x00000002	/* ack, but try to delay it */
 #define	TF_NODELAY	0x00000004	/* don't delay packets to coalesce */
 #define	TF_NOOPT	0x00000008	/* don't use tcp options */
 #define	TF_SENTFIN	0x00000010	/* have sent FIN */
 #define	TF_REQ_SCALE	0x00000020	/* have/will request window scaling */
 #define	TF_RCVD_SCALE	0x00000040	/* other side has requested scaling */
 #define	TF_REQ_TSTMP	0x00000080	/* have/will request timestamps */
 #define	TF_RCVD_TSTMP	0x00000100	/* a timestamp was received in SYN */
 #define	TF_SACK_PERMIT	0x00000200	/* other side said I could SACK */
 #define	TF_NEEDSYN	0x00000400	/* send SYN (implicit state) */
 #define	TF_NEEDFIN	0x00000800	/* send FIN (implicit state) */
 #define	TF_NOPUSH	0x00001000	/* don't push */
 #define	TF_PREVVALID	0x00002000	/* saved values for bad rxmit valid
 					 * Note: accessing and restoring from
 					 * these may only be done in the 1st
 					 * RTO recovery round (t_rxtshift == 1)
 					 */
 #define	TF_WAKESOR	0x00004000	/* wake up receive socket */
 #define	TF_GPUTINPROG	0x00008000	/* Goodput measurement in progress */
 #define	TF_MORETOCOME	0x00010000	/* More data to be appended to sock */
 #define	TF_SONOTCONN	0x00020000	/* needs soisconnected() on ESTAB */
 #define	TF_LASTIDLE	0x00040000	/* connection was previously idle */
 #define	TF_RXWIN0SENT	0x00080000	/* sent a receiver win 0 in response */
 #define	TF_FASTRECOVERY	0x00100000	/* in NewReno Fast Recovery */
 #define	TF_WASFRECOVERY	0x00200000	/* was in NewReno Fast Recovery */
 #define	TF_SIGNATURE	0x00400000	/* require MD5 digests (RFC2385) */
 #define	TF_FORCEDATA	0x00800000	/* force out a byte */
 #define	TF_TSO		0x01000000	/* TSO enabled on this connection */
 #define	TF_TOE		0x02000000	/* this connection is offloaded */
 #define	TF_CLOSED	0x04000000	/* close(2) called on socket */
 #define	TF_UNUSED1	0x08000000	/* unused */
 #define	TF_LRD		0x10000000	/* Lost Retransmission Detection */
 #define	TF_CONGRECOVERY	0x20000000	/* congestion recovery mode */
 #define	TF_WASCRECOVERY	0x40000000	/* was in congestion recovery */
 #define	TF_FASTOPEN	0x80000000	/* TCP Fast Open indication */
 
 #define	IN_FASTRECOVERY(t_flags)	(t_flags & TF_FASTRECOVERY)
 #define	ENTER_FASTRECOVERY(t_flags)	t_flags |= TF_FASTRECOVERY
 #define	EXIT_FASTRECOVERY(t_flags)	t_flags &= ~TF_FASTRECOVERY
 
 #define	IN_CONGRECOVERY(t_flags)	(t_flags & TF_CONGRECOVERY)
 #define	ENTER_CONGRECOVERY(t_flags)	t_flags |= TF_CONGRECOVERY
 #define	EXIT_CONGRECOVERY(t_flags)	t_flags &= ~TF_CONGRECOVERY
 
 #define	IN_RECOVERY(t_flags) (t_flags & (TF_CONGRECOVERY | TF_FASTRECOVERY))
 #define	ENTER_RECOVERY(t_flags) t_flags |= (TF_CONGRECOVERY | TF_FASTRECOVERY)
 #define	EXIT_RECOVERY(t_flags) t_flags &= ~(TF_CONGRECOVERY | TF_FASTRECOVERY)
 
 #if defined(_KERNEL) && !defined(TCP_RFC7413)
 #define	IS_FASTOPEN(t_flags)		(false)
 #else
 #define	IS_FASTOPEN(t_flags)		(t_flags & TF_FASTOPEN)
 #endif
 
 #define	BYTES_THIS_ACK(tp, th)	(th->th_ack - tp->snd_una)
 
 /*
  * Flags for the t_oobflags field.
  */
 #define	TCPOOB_HAVEDATA	0x01
 #define	TCPOOB_HADDATA	0x02
 
 /*
  * Flags for the extended TCP flags field, t_flags2
  */
 #define	TF2_PLPMTU_BLACKHOLE	0x00000001 /* Possible PLPMTUD Black Hole. */
 #define	TF2_PLPMTU_PMTUD	0x00000002 /* Allowed to attempt PLPMTUD. */
 #define	TF2_PLPMTU_MAXSEGSNT	0x00000004 /* Last seg sent was full seg. */
 #define	TF2_LOG_AUTO		0x00000008 /* Session is auto-logging. */
 #define	TF2_DROP_AF_DATA	0x00000010 /* Drop after all data ack'd */
 #define	TF2_ECN_PERMIT		0x00000020 /* connection ECN-ready */
 #define	TF2_ECN_SND_CWR		0x00000040 /* ECN CWR in queue */
 #define	TF2_ECN_SND_ECE		0x00000080 /* ECN ECE in queue */
 #define	TF2_ACE_PERMIT		0x00000100 /* Accurate ECN mode */
 #define	TF2_HPTS_CPU_SET	0x00000200 /* t_hpts_cpu is not random */
 #define	TF2_FBYTES_COMPLETE	0x00000400 /* We have first bytes in and out */
 #define	TF2_ECN_USE_ECT1	0x00000800 /* Use ECT(1) marking on session */
 #define TF2_TCP_ACCOUNTING	0x00001000 /* Do TCP accounting */
 #define	TF2_HPTS_CALLS		0x00002000 /* tcp_output() called via HPTS */
 #define	TF2_MBUF_L_ACKS		0x00004000 /* large mbufs for ack compression */
 #define	TF2_MBUF_ACKCMP		0x00008000 /* mbuf ack compression ok */
 #define	TF2_SUPPORTS_MBUFQ	0x00010000 /* Supports the mbuf queue method */
 #define	TF2_MBUF_QUEUE_READY	0x00020000 /* Inputs can be queued */
 #define	TF2_DONT_SACK_QUEUE	0x00040000 /* Don't wake on sack */
 #define	TF2_CANNOT_DO_ECN	0x00080000 /* The stack does not do ECN */
 
 /*
  * Structure to hold TCP options that are only used during segment
  * processing (in tcp_input), but not held in the tcpcb.
  * It's basically used to reduce the number of parameters
  * to tcp_dooptions and tcp_addoptions.
  * The binary order of the to_flags is relevant for packing of the
  * options in tcp_addoptions.
  */
 struct tcpopt {
 	u_int32_t	to_flags;	/* which options are present */
 #define	TOF_MSS		0x0001		/* maximum segment size */
 #define	TOF_SCALE	0x0002		/* window scaling */
 #define	TOF_SACKPERM	0x0004		/* SACK permitted */
 #define	TOF_TS		0x0010		/* timestamp */
 #define	TOF_SIGNATURE	0x0040		/* TCP-MD5 signature option (RFC2385) */
 #define	TOF_SACK	0x0080		/* Peer sent SACK option */
 #define	TOF_FASTOPEN	0x0100		/* TCP Fast Open (TFO) cookie */
 #define	TOF_MAXOPT	0x0200
 	u_int32_t	to_tsval;	/* new timestamp */
 	u_int32_t	to_tsecr;	/* reflected timestamp */
 	u_char		*to_sacks;	/* pointer to the first SACK blocks */
 	u_char		*to_signature;	/* pointer to the TCP-MD5 signature */
 	u_int8_t	*to_tfo_cookie; /* pointer to the TFO cookie */
 	u_int16_t	to_mss;		/* maximum segment size */
 	u_int8_t	to_wscale;	/* window scaling */
 	u_int8_t	to_nsacks;	/* number of SACK blocks */
 	u_int8_t	to_tfo_len;	/* TFO cookie length */
 	u_int32_t	to_spare;	/* UTO */
 };
 
 /*
  * Flags for tcp_dooptions.
  */
 #define	TO_SYN		0x01		/* parse SYN-only options */
 
 struct hc_metrics_lite {	/* must stay in sync with hc_metrics */
 	uint32_t	rmx_mtu;	/* MTU for this path */
 	uint32_t	rmx_ssthresh;	/* outbound gateway buffer limit */
 	uint32_t	rmx_rtt;	/* estimated round trip time */
 	uint32_t	rmx_rttvar;	/* estimated rtt variance */
 	uint32_t	rmx_cwnd;	/* congestion window */
 	uint32_t	rmx_sendpipe;   /* outbound delay-bandwidth product */
 	uint32_t	rmx_recvpipe;   /* inbound delay-bandwidth product */
 };
 
 /*
  * Used by tcp_maxmtu() to communicate interface specific features
  * and limits at the time of connection setup.
  */
 struct tcp_ifcap {
 	int	ifcap;
 	u_int	tsomax;
 	u_int	tsomaxsegcount;
 	u_int	tsomaxsegsize;
 };
 
 #ifndef _NETINET_IN_PCB_H_
 struct in_conninfo;
 #endif /* _NETINET_IN_PCB_H_ */
 
 /*
  * The smoothed round-trip time and estimated variance
  * are stored as fixed point numbers scaled by the values below.
  * For convenience, these scales are also used in smoothing the average
  * (smoothed = (1/scale)sample + ((scale-1)/scale)smoothed).
  * With these scales, srtt has 3 bits to the right of the binary point,
  * and thus an "ALPHA" of 0.875.  rttvar has 2 bits to the right of the
  * binary point, and is smoothed with an ALPHA of 0.75.
  */
 #define	TCP_RTT_SCALE		32	/* multiplier for srtt; 3 bits frac. */
 #define	TCP_RTT_SHIFT		5	/* shift for srtt; 3 bits frac. */
 #define	TCP_RTTVAR_SCALE	16	/* multiplier for rttvar; 2 bits */
 #define	TCP_RTTVAR_SHIFT	4	/* shift for rttvar; 2 bits */
 #define	TCP_DELTA_SHIFT		2	/* see tcp_input.c */
 
 /*
  * The initial retransmission should happen at rtt + 4 * rttvar.
  * Because of the way we do the smoothing, srtt and rttvar
  * will each average +1/2 tick of bias.  When we compute
  * the retransmit timer, we want 1/2 tick of rounding and
  * 1 extra tick because of +-1/2 tick uncertainty in the
  * firing of the timer.  The bias will give us exactly the
  * 1.5 tick we need.  But, because the bias is
  * statistical, we have to test that we don't drop below
  * the minimum feasible timer (which is 2 ticks).
  * This version of the macro adapted from a paper by Lawrence
  * Brakmo and Larry Peterson which outlines a problem caused
  * by insufficient precision in the original implementation,
  * which results in inappropriately large RTO values for very
  * fast networks.
  */
 #define	TCP_REXMTVAL(tp) \
 	max((tp)->t_rttmin, (((tp)->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT))  \
 	  + (tp)->t_rttvar) >> TCP_DELTA_SHIFT)
 
 /*
  * TCP statistics.
  * Many of these should be kept per connection,
  * but that's inconvenient at the moment.
  */
 struct	tcpstat {
 	uint64_t tcps_connattempt;	/* connections initiated */
 	uint64_t tcps_accepts;		/* connections accepted */
 	uint64_t tcps_connects;		/* connections established */
 	uint64_t tcps_drops;		/* connections dropped */
 	uint64_t tcps_conndrops;	/* embryonic connections dropped */
 	uint64_t tcps_minmssdrops;	/* average minmss too low drops */
 	uint64_t tcps_closed;		/* conn. closed (includes drops) */
 	uint64_t tcps_segstimed;	/* segs where we tried to get rtt */
 	uint64_t tcps_rttupdated;	/* times we succeeded */
 	uint64_t tcps_delack;		/* delayed acks sent */
 	uint64_t tcps_timeoutdrop;	/* conn. dropped in rxmt timeout */
 	uint64_t tcps_rexmttimeo;	/* retransmit timeouts */
 	uint64_t tcps_persisttimeo;	/* persist timeouts */
 	uint64_t tcps_keeptimeo;	/* keepalive timeouts */
 	uint64_t tcps_keepprobe;	/* keepalive probes sent */
 	uint64_t tcps_keepdrops;	/* connections dropped in keepalive */
 	uint64_t tcps_progdrops;	/* drops due to no progress */
 
 	uint64_t tcps_sndtotal;		/* total packets sent */
 	uint64_t tcps_sndpack;		/* data packets sent */
 	uint64_t tcps_sndbyte;		/* data bytes sent */
 	uint64_t tcps_sndrexmitpack;	/* data packets retransmitted */
 	uint64_t tcps_sndrexmitbyte;	/* data bytes retransmitted */
 	uint64_t tcps_sndrexmitbad;	/* unnecessary packet retransmissions */
 	uint64_t tcps_sndacks;		/* ack-only packets sent */
 	uint64_t tcps_sndprobe;		/* window probes sent */
 	uint64_t tcps_sndurg;		/* packets sent with URG only */
 	uint64_t tcps_sndwinup;		/* window update-only packets sent */
 	uint64_t tcps_sndctrl;		/* control (SYN|FIN|RST) packets sent */
 
 	uint64_t tcps_rcvtotal;		/* total packets received */
 	uint64_t tcps_rcvpack;		/* packets received in sequence */
 	uint64_t tcps_rcvbyte;		/* bytes received in sequence */
 	uint64_t tcps_rcvbadsum;	/* packets received with ccksum errs */
 	uint64_t tcps_rcvbadoff;	/* packets received with bad offset */
 	uint64_t tcps_rcvreassfull;	/* packets dropped for no reass space */
 	uint64_t tcps_rcvshort;		/* packets received too short */
 	uint64_t tcps_rcvduppack;	/* duplicate-only packets received */
 	uint64_t tcps_rcvdupbyte;	/* duplicate-only bytes received */
 	uint64_t tcps_rcvpartduppack;	/* packets with some duplicate data */
 	uint64_t tcps_rcvpartdupbyte;	/* dup. bytes in part-dup. packets */
 	uint64_t tcps_rcvoopack;	/* out-of-order packets received */
 	uint64_t tcps_rcvoobyte;	/* out-of-order bytes received */
 	uint64_t tcps_rcvpackafterwin;	/* packets with data after window */
 	uint64_t tcps_rcvbyteafterwin;	/* bytes rcvd after window */
 	uint64_t tcps_rcvafterclose;	/* packets rcvd after "close" */
 	uint64_t tcps_rcvwinprobe;	/* rcvd window probe packets */
 	uint64_t tcps_rcvdupack;	/* rcvd duplicate acks */
 	uint64_t tcps_rcvacktoomuch;	/* rcvd acks for unsent data */
 	uint64_t tcps_rcvackpack;	/* rcvd ack packets */
 	uint64_t tcps_rcvackbyte;	/* bytes acked by rcvd acks */
 	uint64_t tcps_rcvwinupd;	/* rcvd window update packets */
 	uint64_t tcps_pawsdrop;		/* segments dropped due to PAWS */
 	uint64_t tcps_predack;		/* times hdr predict ok for acks */
 	uint64_t tcps_preddat;		/* times hdr predict ok for data pkts */
 	uint64_t tcps_pcbcachemiss;
 	uint64_t tcps_cachedrtt;	/* times cached RTT in route updated */
 	uint64_t tcps_cachedrttvar;	/* times cached rttvar updated */
 	uint64_t tcps_cachedssthresh;	/* times cached ssthresh updated */
 	uint64_t tcps_usedrtt;		/* times RTT initialized from route */
 	uint64_t tcps_usedrttvar;	/* times RTTVAR initialized from rt */
 	uint64_t tcps_usedssthresh;	/* times ssthresh initialized from rt*/
 	uint64_t tcps_persistdrop;	/* timeout in persist state */
 	uint64_t tcps_badsyn;		/* bogus SYN, e.g. premature ACK */
 	uint64_t tcps_mturesent;	/* resends due to MTU discovery */
 	uint64_t tcps_listendrop;	/* listen queue overflows */
 	uint64_t tcps_badrst;		/* ignored RSTs in the window */
 
 	uint64_t tcps_sc_added;		/* entry added to syncache */
 	uint64_t tcps_sc_retransmitted;	/* syncache entry was retransmitted */
 	uint64_t tcps_sc_dupsyn;	/* duplicate SYN packet */
 	uint64_t tcps_sc_dropped;	/* could not reply to packet */
 	uint64_t tcps_sc_completed;	/* successful extraction of entry */
 	uint64_t tcps_sc_bucketoverflow;/* syncache per-bucket limit hit */
 	uint64_t tcps_sc_cacheoverflow;	/* syncache cache limit hit */
 	uint64_t tcps_sc_reset;		/* RST removed entry from syncache */
 	uint64_t tcps_sc_stale;		/* timed out or listen socket gone */
 	uint64_t tcps_sc_aborted;	/* syncache entry aborted */
 	uint64_t tcps_sc_badack;	/* removed due to bad ACK */
 	uint64_t tcps_sc_unreach;	/* ICMP unreachable received */
 	uint64_t tcps_sc_zonefail;	/* zalloc() failed */
 	uint64_t tcps_sc_sendcookie;	/* SYN cookie sent */
 	uint64_t tcps_sc_recvcookie;	/* SYN cookie received */
 
 	uint64_t tcps_hc_added;		/* entry added to hostcache */
 	uint64_t tcps_hc_bucketoverflow;/* hostcache per bucket limit hit */
 
 	uint64_t tcps_finwait2_drops;    /* Drop FIN_WAIT_2 connection after time limit */
 
 	/* SACK related stats */
 	uint64_t tcps_sack_recovery_episode; /* SACK recovery episodes */
 	uint64_t tcps_sack_rexmits;	    /* SACK rexmit segments   */
 	uint64_t tcps_sack_rexmit_bytes;    /* SACK rexmit bytes      */
 	uint64_t tcps_sack_rcv_blocks;	    /* SACK blocks (options) received */
 	uint64_t tcps_sack_send_blocks;	    /* SACK blocks (options) sent     */
 	uint64_t tcps_sack_lostrexmt;	    /* SACK lost retransmission recovered */
 	uint64_t tcps_sack_sboverflow;	    /* times scoreboard overflowed */
 
 	/* ECN related stats */
 	uint64_t tcps_ecn_rcvce;		/* ECN Congestion Experienced */
 	uint64_t tcps_ecn_rcvect0;		/* ECN Capable Transport */
 	uint64_t tcps_ecn_rcvect1;		/* ECN Capable Transport */
 	uint64_t tcps_ecn_shs;		/* ECN successful handshakes */
 	uint64_t tcps_ecn_rcwnd;	/* # times ECN reduced the cwnd */
 
 	/* TCP_SIGNATURE related stats */
 	uint64_t tcps_sig_rcvgoodsig;	/* Total matching signature received */
 	uint64_t tcps_sig_rcvbadsig;	/* Total bad signature received */
 	uint64_t tcps_sig_err_buildsig;	/* Failed to make signature */
 	uint64_t tcps_sig_err_sigopt;	/* No signature expected by socket */
 	uint64_t tcps_sig_err_nosigopt;	/* No signature provided by segment */
 
 	/* Path MTU Discovery Black Hole Detection related stats */
 	uint64_t tcps_pmtud_blackhole_activated;	 /* Black Hole Count */
 	uint64_t tcps_pmtud_blackhole_activated_min_mss; /* BH at min MSS Count */
 	uint64_t tcps_pmtud_blackhole_failed;		 /* Black Hole Failure Count */
 
 	uint64_t tcps_tunneled_pkts;	/* Packets encap's in UDP received */
 	uint64_t tcps_tunneled_errs;	/* Packets that had errors that were UDP encaped */
 
 	/* Dsack related stats */
 	uint64_t tcps_dsack_count;	/* Number of ACKs arriving with DSACKs */
 	uint64_t tcps_dsack_bytes;	/* Number of bytes DSACK'ed no TLP */
 	uint64_t tcps_dsack_tlp_bytes;	/* Number of bytes DSACK'ed due to TLPs */
 
 	/* TCPS_TIME_WAIT usage stats */
 	uint64_t tcps_tw_recycles;	/* Times time-wait was recycled. */
 	uint64_t tcps_tw_resets;	/* Times time-wait sent a reset. */
 	uint64_t tcps_tw_responds;	/* Times time-wait sent a valid ack. */
 
 	/* Accurate ECN Handshake stats */
 	uint64_t tcps_ace_nect;		/* ACE SYN packet with Non-ECT */
 	uint64_t tcps_ace_ect1;		/* ACE SYN packet with ECT1 */
 	uint64_t tcps_ace_ect0;		/* ACE SYN packet with ECT0 */
 	uint64_t tcps_ace_ce;		/* ACE SYN packet with CE */
 
 	/* ECN related stats */
 	uint64_t tcps_ecn_sndect0;		/* ECN Capable Transport */
 	uint64_t tcps_ecn_sndect1;		/* ECN Capable Transport */
 
 	/*
 	 * BBR and Rack implement TLP's these values count TLP bytes in
 	 * two catagories, bytes that were retransmitted and bytes that
 	 * were newly transmited. Both types can serve as TLP's but they
 	 * are accounted differently.
 	 */
 	uint64_t tcps_tlpresends;	/* number of tlp resends */
 	uint64_t tcps_tlpresend_bytes;	/* number of bytes resent by tlp */
 
 
 	uint64_t _pad[4];		/* 4 TBD placeholder for STABLE */
 };
 
 #define	tcps_rcvmemdrop	tcps_rcvreassfull	/* compat */
 
 #ifdef _KERNEL
 #define	TI_UNLOCKED	1
 #define	TI_RLOCKED	2
 #include <sys/counter.h>
 
 VNET_PCPUSTAT_DECLARE(struct tcpstat, tcpstat);	/* tcp statistics */
 /*
  * In-kernel consumers can use these accessor macros directly to update
  * stats.
  */
 #define	TCPSTAT_ADD(name, val)	\
     VNET_PCPUSTAT_ADD(struct tcpstat, tcpstat, name, (val))
 #define	TCPSTAT_INC(name)	TCPSTAT_ADD(name, 1)
 
 /*
  * Kernel module consumers must use this accessor macro.
  */
 void	kmod_tcpstat_add(int statnum, int val);
 #define	KMOD_TCPSTAT_ADD(name, val)					\
     kmod_tcpstat_add(offsetof(struct tcpstat, name) / sizeof(uint64_t), val)
 #define	KMOD_TCPSTAT_INC(name)	KMOD_TCPSTAT_ADD(name, 1)
 
 /*
  * Running TCP connection count by state.
  */
 VNET_DECLARE(counter_u64_t, tcps_states[TCP_NSTATES]);
 #define	V_tcps_states	VNET(tcps_states)
 #define	TCPSTATES_INC(state)	counter_u64_add(V_tcps_states[state], 1)
 #define	TCPSTATES_DEC(state)	counter_u64_add(V_tcps_states[state], -1)
 
 /*
  * TCP specific helper hook point identifiers.
  */
 #define	HHOOK_TCP_EST_IN		0
 #define	HHOOK_TCP_EST_OUT		1
 #define	HHOOK_TCP_LAST			HHOOK_TCP_EST_OUT
 
 struct tcp_hhook_data {
 	struct tcpcb	*tp;
 	struct tcphdr	*th;
 	struct tcpopt	*to;
 	uint32_t	len;
 	int		tso;
 	tcp_seq		curack;
 };
 #ifdef TCP_HHOOK
 void hhook_run_tcp_est_out(struct tcpcb *tp,
 	struct tcphdr *th, struct tcpopt *to,
 	uint32_t len, int tso);
 #endif
 #endif
 
 /*
  * TCB structure exported to user-land via sysctl(3).
  *
  * Fields prefixed with "xt_" are unique to the export structure, and fields
  * with "t_" or other prefixes match corresponding fields of 'struct tcpcb'.
  *
  * Legend:
  * (s) - used by userland utilities in src
  * (p) - used by utilities in ports
  * (3) - is known to be used by third party software not in ports
  * (n) - no known usage
  *
  * Evil hack: declare only if in_pcb.h and sys/socketvar.h have been
  * included.  Not all of our clients do.
  */
 #if defined(_NETINET_IN_PCB_H_) && defined(_SYS_SOCKETVAR_H_)
 struct xtcpcb {
 	ksize_t	xt_len;		/* length of this structure */
 	struct xinpcb	xt_inp;
 	char		xt_stack[TCP_FUNCTION_NAME_LEN_MAX];	/* (s) */
 	char		xt_logid[TCP_LOG_ID_LEN];	/* (s) */
 	char		xt_cc[TCP_CA_NAME_MAX];	/* (s) */
 	int64_t		spare64[6];
 	int32_t		t_state;		/* (s,p) */
 	uint32_t	t_flags;		/* (s,p) */
 	int32_t		t_sndzerowin;		/* (s) */
 	int32_t		t_sndrexmitpack;	/* (s) */
 	int32_t		t_rcvoopack;		/* (s) */
 	int32_t		t_rcvtime;		/* (s) */
 	int32_t		tt_rexmt;		/* (s) */
 	int32_t		tt_persist;		/* (s) */
 	int32_t		tt_keep;		/* (s) */
 	int32_t		tt_2msl;		/* (s) */
 	int32_t		tt_delack;		/* (s) */
 	int32_t		t_logstate;		/* (3) */
 	uint32_t	t_snd_cwnd;		/* (s) */
 	uint32_t	t_snd_ssthresh;		/* (s) */
 	uint32_t	t_maxseg;		/* (s) */
 	uint32_t	t_rcv_wnd;		/* (s) */
 	uint32_t	t_snd_wnd;		/* (s) */
 	uint32_t	xt_ecn;			/* (s) */
 	uint32_t	t_dsack_bytes;		/* (n) */
 	uint32_t	t_dsack_tlp_bytes;	/* (n) */
 	uint32_t	t_dsack_pack;		/* (n) */
 	uint16_t	xt_encaps_port;		/* (s) */
 	int16_t		spare16;
 	int32_t		spare32[22];
 } __aligned(8);
 
 #ifdef _KERNEL
 void	tcp_inptoxtp(const struct inpcb *, struct xtcpcb *);
 #endif
 #endif
 
 /*
  * TCP function information (name-to-id mapping, aliases, and refcnt)
  * exported to user-land via sysctl(3).
  */
 struct tcp_function_info {
 	uint32_t	tfi_refcnt;
 	uint8_t		tfi_id;
 	char		tfi_name[TCP_FUNCTION_NAME_LEN_MAX];
 	char		tfi_alias[TCP_FUNCTION_NAME_LEN_MAX];
 };
 
 /*
  * Identifiers for TCP sysctl nodes
  */
 #define	TCPCTL_DO_RFC1323	1	/* use RFC-1323 extensions */
 #define	TCPCTL_MSSDFLT		3	/* MSS default */
 #define TCPCTL_STATS		4	/* statistics */
 #define	TCPCTL_RTTDFLT		5	/* default RTT estimate */
 #define	TCPCTL_KEEPIDLE		6	/* keepalive idle timer */
 #define	TCPCTL_KEEPINTVL	7	/* interval to send keepalives */
 #define	TCPCTL_SENDSPACE	8	/* send buffer space */
 #define	TCPCTL_RECVSPACE	9	/* receive buffer space */
 #define	TCPCTL_KEEPINIT		10	/* timeout for establishing syn */
 #define	TCPCTL_PCBLIST		11	/* list of all outstanding PCBs */
 #define	TCPCTL_DELACKTIME	12	/* time before sending delayed ACK */
 #define	TCPCTL_V6MSSDFLT	13	/* MSS default for IPv6 */
 #define	TCPCTL_SACK		14	/* Selective Acknowledgement,rfc 2018 */
 #define	TCPCTL_DROP		15	/* drop tcp connection */
 #define	TCPCTL_STATES		16	/* connection counts by TCP state */
 
 #ifdef _KERNEL
 #ifdef SYSCTL_DECL
 SYSCTL_DECL(_net_inet_tcp);
 SYSCTL_DECL(_net_inet_tcp_sack);
 MALLOC_DECLARE(M_TCPLOG);
 #endif
 
 VNET_DECLARE(int, tcp_log_in_vain);
 #define	V_tcp_log_in_vain		VNET(tcp_log_in_vain)
 
 /*
  * Global TCP tunables shared between different stacks.
  * Please keep the list sorted.
  */
 VNET_DECLARE(int, drop_synfin);
 VNET_DECLARE(int, path_mtu_discovery);
 VNET_DECLARE(int, tcp_abc_l_var);
 VNET_DECLARE(int, tcp_autorcvbuf_max);
 VNET_DECLARE(int, tcp_autosndbuf_inc);
 VNET_DECLARE(int, tcp_autosndbuf_max);
 VNET_DECLARE(int, tcp_delack_enabled);
 VNET_DECLARE(int, tcp_do_autorcvbuf);
 VNET_DECLARE(int, tcp_do_autosndbuf);
 VNET_DECLARE(int, tcp_do_ecn);
 VNET_DECLARE(int, tcp_do_lrd);
 VNET_DECLARE(int, tcp_do_prr);
 VNET_DECLARE(int, tcp_do_prr_conservative);
 VNET_DECLARE(int, tcp_do_newcwv);
 VNET_DECLARE(int, tcp_do_rfc1323);
 VNET_DECLARE(int, tcp_tolerate_missing_ts);
 VNET_DECLARE(int, tcp_do_rfc3042);
 VNET_DECLARE(int, tcp_do_rfc3390);
 VNET_DECLARE(int, tcp_do_rfc3465);
 VNET_DECLARE(int, tcp_do_newsack);
 VNET_DECLARE(int, tcp_do_sack);
 VNET_DECLARE(int, tcp_do_tso);
 VNET_DECLARE(int, tcp_ecn_maxretries);
 VNET_DECLARE(int, tcp_initcwnd_segments);
 VNET_DECLARE(int, tcp_insecure_rst);
 VNET_DECLARE(int, tcp_insecure_syn);
 VNET_DECLARE(uint32_t, tcp_map_entries_limit);
 VNET_DECLARE(uint32_t, tcp_map_split_limit);
 VNET_DECLARE(int, tcp_minmss);
 VNET_DECLARE(int, tcp_mssdflt);
 #ifdef STATS
 VNET_DECLARE(int, tcp_perconn_stats_dflt_tpl);
 VNET_DECLARE(int, tcp_perconn_stats_enable);
 #endif /* STATS */
 VNET_DECLARE(int, tcp_recvspace);
 VNET_DECLARE(int, tcp_retries);
 VNET_DECLARE(int, tcp_sack_globalholes);
 VNET_DECLARE(int, tcp_sack_globalmaxholes);
 VNET_DECLARE(int, tcp_sack_maxholes);
 VNET_DECLARE(int, tcp_sc_rst_sock_fail);
 VNET_DECLARE(int, tcp_sendspace);
 VNET_DECLARE(int, tcp_udp_tunneling_overhead);
 VNET_DECLARE(int, tcp_udp_tunneling_port);
 VNET_DECLARE(struct inpcbinfo, tcbinfo);
 
 #define	V_tcp_do_lrd			VNET(tcp_do_lrd)
 #define	V_tcp_do_prr			VNET(tcp_do_prr)
-#define	V_tcp_do_prr_conservative	VNET(tcp_do_prr_conservative)
 #define	V_tcp_do_newcwv			VNET(tcp_do_newcwv)
 #define	V_drop_synfin			VNET(drop_synfin)
 #define	V_path_mtu_discovery		VNET(path_mtu_discovery)
 #define	V_tcbinfo			VNET(tcbinfo)
 #define	V_tcp_abc_l_var			VNET(tcp_abc_l_var)
 #define	V_tcp_autorcvbuf_max		VNET(tcp_autorcvbuf_max)
 #define	V_tcp_autosndbuf_inc		VNET(tcp_autosndbuf_inc)
 #define	V_tcp_autosndbuf_max		VNET(tcp_autosndbuf_max)
 #define	V_tcp_delack_enabled		VNET(tcp_delack_enabled)
 #define	V_tcp_do_autorcvbuf		VNET(tcp_do_autorcvbuf)
 #define	V_tcp_do_autosndbuf		VNET(tcp_do_autosndbuf)
 #define	V_tcp_do_ecn			VNET(tcp_do_ecn)
 #define	V_tcp_do_rfc1323		VNET(tcp_do_rfc1323)
 #define	V_tcp_tolerate_missing_ts	VNET(tcp_tolerate_missing_ts)
 #define V_tcp_ts_offset_per_conn	VNET(tcp_ts_offset_per_conn)
 #define	V_tcp_do_rfc3042		VNET(tcp_do_rfc3042)
 #define	V_tcp_do_rfc3390		VNET(tcp_do_rfc3390)
 #define	V_tcp_do_rfc3465		VNET(tcp_do_rfc3465)
 #define	V_tcp_do_newsack		VNET(tcp_do_newsack)
 #define	V_tcp_do_sack			VNET(tcp_do_sack)
 #define	V_tcp_do_tso			VNET(tcp_do_tso)
 #define	V_tcp_ecn_maxretries		VNET(tcp_ecn_maxretries)
 #define	V_tcp_initcwnd_segments		VNET(tcp_initcwnd_segments)
 #define	V_tcp_insecure_rst		VNET(tcp_insecure_rst)
 #define	V_tcp_insecure_syn		VNET(tcp_insecure_syn)
 #define	V_tcp_map_entries_limit		VNET(tcp_map_entries_limit)
 #define	V_tcp_map_split_limit		VNET(tcp_map_split_limit)
 #define	V_tcp_minmss			VNET(tcp_minmss)
 #define	V_tcp_mssdflt			VNET(tcp_mssdflt)
 #ifdef STATS
 #define	V_tcp_perconn_stats_dflt_tpl	VNET(tcp_perconn_stats_dflt_tpl)
 #define	V_tcp_perconn_stats_enable	VNET(tcp_perconn_stats_enable)
 #endif /* STATS */
 #define	V_tcp_recvspace			VNET(tcp_recvspace)
 #define	V_tcp_retries			VNET(tcp_retries)
 #define	V_tcp_sack_globalholes		VNET(tcp_sack_globalholes)
 #define	V_tcp_sack_globalmaxholes	VNET(tcp_sack_globalmaxholes)
 #define	V_tcp_sack_maxholes		VNET(tcp_sack_maxholes)
 #define	V_tcp_sc_rst_sock_fail		VNET(tcp_sc_rst_sock_fail)
 #define	V_tcp_sendspace			VNET(tcp_sendspace)
 #define	V_tcp_udp_tunneling_overhead	VNET(tcp_udp_tunneling_overhead)
 #define	V_tcp_udp_tunneling_port	VNET(tcp_udp_tunneling_port)
 
 #ifdef TCP_HHOOK
 VNET_DECLARE(struct hhook_head *, tcp_hhh[HHOOK_TCP_LAST + 1]);
 #define	V_tcp_hhh		VNET(tcp_hhh)
 #endif
 
 int	 tcp_addoptions(struct tcpopt *, u_char *);
 struct tcpcb *
 	 tcp_close(struct tcpcb *);
 void	 tcp_discardcb(struct tcpcb *);
 void	 tcp_twstart(struct tcpcb *);
 int	 tcp_ctloutput(struct socket *, struct sockopt *);
 void	 tcp_fini(void *);
 char	*tcp_log_addrs(struct in_conninfo *, struct tcphdr *, const void *,
 	    const void *);
 char	*tcp_log_vain(struct in_conninfo *, struct tcphdr *, const void *,
 	    const void *);
 int	 tcp_reass(struct tcpcb *, struct tcphdr *, tcp_seq *, int *,
 	    struct mbuf *);
 void	 tcp_reass_global_init(void);
 void	 tcp_reass_flush(struct tcpcb *);
 void	 tcp_dooptions(struct tcpopt *, u_char *, int, int);
 void	tcp_dropwithreset(struct mbuf *, struct tcphdr *,
 		     struct tcpcb *, int, int);
 void	tcp_pulloutofband(struct socket *,
 		     struct tcphdr *, struct mbuf *, int);
 void	tcp_xmit_timer(struct tcpcb *, int);
 void	tcp_newreno_partial_ack(struct tcpcb *, struct tcphdr *);
 void	cc_ack_received(struct tcpcb *tp, struct tcphdr *th,
 			    uint16_t nsegs, uint16_t type);
 void 	cc_conn_init(struct tcpcb *tp);
 void 	cc_post_recovery(struct tcpcb *tp, struct tcphdr *th);
 void    cc_ecnpkt_handler(struct tcpcb *tp, struct tcphdr *th, uint8_t iptos);
 void	cc_ecnpkt_handler_flags(struct tcpcb *tp, uint16_t flags, uint8_t iptos);
 void	cc_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type);
 #ifdef TCP_HHOOK
 void	hhook_run_tcp_est_in(struct tcpcb *tp,
 			    struct tcphdr *th, struct tcpopt *to);
 #endif
 
 int	 tcp_input(struct mbuf **, int *, int);
 int	 tcp_autorcvbuf(struct mbuf *, struct tcphdr *, struct socket *,
 	    struct tcpcb *, int);
 int	 tcp_input_with_port(struct mbuf **, int *, int, uint16_t);
 void	tcp_do_segment(struct tcpcb *, struct mbuf *, struct tcphdr *, int,
     int, uint8_t);
 
 int register_tcp_functions(struct tcp_function_block *blk, int wait);
 int register_tcp_functions_as_names(struct tcp_function_block *blk,
     int wait, const char *names[], int *num_names);
 int register_tcp_functions_as_name(struct tcp_function_block *blk,
     const char *name, int wait);
 int deregister_tcp_functions(struct tcp_function_block *blk, bool quiesce,
     bool force);
 struct tcp_function_block *find_and_ref_tcp_functions(struct tcp_function_set *fs);
 int find_tcp_function_alias(struct tcp_function_block *blk, struct tcp_function_set *fs);
 uint32_t tcp_get_srtt(struct tcpcb *tp, int granularity);
 void tcp_switch_back_to_default(struct tcpcb *tp);
 struct tcp_function_block *
 find_and_ref_tcp_fb(struct tcp_function_block *fs);
 int tcp_default_ctloutput(struct tcpcb *tp, struct sockopt *sopt);
 int tcp_ctloutput_set(struct inpcb *inp, struct sockopt *sopt);
 void tcp_log_socket_option(struct tcpcb *tp, uint32_t option_num,
     uint32_t option_val, int err);
 
 
 extern counter_u64_t tcp_inp_lro_direct_queue;
 extern counter_u64_t tcp_inp_lro_wokeup_queue;
 extern counter_u64_t tcp_inp_lro_compressed;
 extern counter_u64_t tcp_inp_lro_locks_taken;
 extern counter_u64_t tcp_extra_mbuf;
 extern counter_u64_t tcp_would_have_but;
 extern counter_u64_t tcp_comp_total;
 extern counter_u64_t tcp_uncomp_total;
 extern counter_u64_t tcp_bad_csums;
 
 #ifdef TCP_SAD_DETECTION
 /* Various SACK attack thresholds */
 extern int32_t tcp_force_detection;
 extern int32_t tcp_sad_limit;
 extern int32_t tcp_sack_to_ack_thresh;
 extern int32_t tcp_sack_to_move_thresh;
 extern int32_t tcp_restoral_thresh;
 extern int32_t tcp_sad_decay_val;
 extern int32_t tcp_sad_pacing_interval;
 extern int32_t tcp_sad_low_pps;
 extern int32_t tcp_map_minimum;
 extern int32_t tcp_attack_on_turns_on_logging;
 #endif
 extern uint32_t tcp_ack_war_time_window;
 extern uint32_t tcp_ack_war_cnt;
 
 uint32_t tcp_maxmtu(struct in_conninfo *, struct tcp_ifcap *);
 uint32_t tcp_maxmtu6(struct in_conninfo *, struct tcp_ifcap *);
 void	 tcp6_use_min_mtu(struct tcpcb *);
 u_int	 tcp_maxseg(const struct tcpcb *);
 u_int	 tcp_fixed_maxseg(const struct tcpcb *);
 void	 tcp_mss_update(struct tcpcb *, int, int, struct hc_metrics_lite *,
 	    struct tcp_ifcap *);
 void	 tcp_mss(struct tcpcb *, int);
 int	 tcp_mssopt(struct in_conninfo *);
 struct tcpcb *
 	 tcp_newtcpcb(struct inpcb *);
 int	 tcp_default_output(struct tcpcb *);
 void	 tcp_state_change(struct tcpcb *, int);
 void	 tcp_respond(struct tcpcb *, void *,
 	    struct tcphdr *, struct mbuf *, tcp_seq, tcp_seq, uint16_t);
 bool	 tcp_twcheck(struct inpcb *, struct tcpopt *, struct tcphdr *,
 	    struct mbuf *, int);
 void	 tcp_setpersist(struct tcpcb *);
 void	 tcp_record_dsack(struct tcpcb *tp, tcp_seq start, tcp_seq end, int tlp);
 struct tcptemp *
 	 tcpip_maketemplate(struct inpcb *);
 void	 tcpip_fillheaders(struct inpcb *, uint16_t, void *, void *);
 void	 tcp_timer_activate(struct tcpcb *, tt_which, u_int);
 bool	 tcp_timer_active(struct tcpcb *, tt_which);
 void	 tcp_timer_stop(struct tcpcb *);
 int	 inp_to_cpuid(struct inpcb *inp);
 /*
  * All tcp_hc_* functions are IPv4 and IPv6 (via in_conninfo)
  */
 void	 tcp_hc_init(void);
 #ifdef VIMAGE
 void	 tcp_hc_destroy(void);
 #endif
 void	 tcp_hc_get(struct in_conninfo *, struct hc_metrics_lite *);
 uint32_t tcp_hc_getmtu(struct in_conninfo *);
 void	 tcp_hc_updatemtu(struct in_conninfo *, uint32_t);
 void	 tcp_hc_update(struct in_conninfo *, struct hc_metrics_lite *);
 void 	 cc_after_idle(struct tcpcb *tp);
 
 extern	struct protosw tcp_protosw;		/* shared for TOE */
 extern	struct protosw tcp6_protosw;		/* shared for TOE */
 
 uint32_t tcp_new_ts_offset(struct in_conninfo *);
 tcp_seq	 tcp_new_isn(struct in_conninfo *);
 
-int	 tcp_sack_doack(struct tcpcb *, struct tcpopt *, tcp_seq);
+sackstatus_t
+	 tcp_sack_doack(struct tcpcb *, struct tcpopt *, tcp_seq);
 int	 tcp_dsack_block_exists(struct tcpcb *);
 void	 tcp_update_dsack_list(struct tcpcb *, tcp_seq, tcp_seq);
 void	 tcp_update_sack_list(struct tcpcb *tp, tcp_seq rcv_laststart, tcp_seq rcv_lastend);
 void	 tcp_clean_dsack_blocks(struct tcpcb *tp);
 void	 tcp_clean_sackreport(struct tcpcb *tp);
 void	 tcp_sack_adjust(struct tcpcb *tp);
 struct sackhole *tcp_sack_output(struct tcpcb *tp, int *sack_bytes_rexmt);
-void	 tcp_do_prr_ack(struct tcpcb *, struct tcphdr *, struct tcpopt *);
+void	 tcp_do_prr_ack(struct tcpcb *, struct tcphdr *, struct tcpopt *, sackstatus_t);
 void	 tcp_lost_retransmission(struct tcpcb *, struct tcphdr *);
 void	 tcp_sack_partialack(struct tcpcb *, struct tcphdr *);
 void	 tcp_free_sackholes(struct tcpcb *tp);
 void	 tcp_sack_lost_retransmission(struct tcpcb *, struct tcphdr *);
 int	 tcp_newreno(struct tcpcb *, struct tcphdr *);
 int	 tcp_compute_pipe(struct tcpcb *);
 uint32_t tcp_compute_initwnd(uint32_t);
 void	 tcp_sndbuf_autoscale(struct tcpcb *, struct socket *, uint32_t);
 int	 tcp_stats_sample_rollthedice(struct tcpcb *tp, void *seed_bytes,
     size_t seed_len);
 int tcp_can_enable_pacing(void);
 void tcp_decrement_paced_conn(void);
 void tcp_change_time_units(struct tcpcb *, int);
 void tcp_handle_orphaned_packets(struct tcpcb *);
 
 struct mbuf *
 	 tcp_m_copym(struct mbuf *m, int32_t off0, int32_t *plen,
 	   int32_t seglimit, int32_t segsize, struct sockbuf *sb, bool hw_tls);
 
 int	tcp_stats_init(void);
 void tcp_log_end_status(struct tcpcb *tp, uint8_t status);
 #ifdef TCP_REQUEST_TRK
 void tcp_req_free_a_slot(struct tcpcb *tp, struct tcp_sendfile_track *ent);
 struct tcp_sendfile_track *
 tcp_req_find_a_req_that_is_completed_by(struct tcpcb *tp, tcp_seq th_ack, int *ip);
 int tcp_req_check_for_comp(struct tcpcb *tp, tcp_seq ack_point);
 int
 tcp_req_is_entry_comp(struct tcpcb *tp, struct tcp_sendfile_track *ent, tcp_seq ack_point);
 struct tcp_sendfile_track *
 tcp_req_find_req_for_seq(struct tcpcb *tp, tcp_seq seq);
 void
 tcp_req_log_req_info(struct tcpcb *tp,
     struct tcp_sendfile_track *req, uint16_t slot,
     uint8_t val, uint64_t offset, uint64_t nbytes);
 
 uint32_t
 tcp_estimate_tls_overhead(struct socket *so, uint64_t tls_usr_bytes);
 void
 tcp_req_alloc_req(struct tcpcb *tp, union tcp_log_userdata *user,
     uint64_t ts);
 
 struct tcp_sendfile_track *
 tcp_req_alloc_req_full(struct tcpcb *tp, struct tcp_snd_req *req, uint64_t ts, int rec_dups);
 
 
 #endif
 #ifdef TCP_ACCOUNTING
 int tcp_do_ack_accounting(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to, uint32_t tiwin, int mss);
 #endif
 
 static inline void
 tcp_lro_features_off(struct tcpcb *tp)
 {
 	tp->t_flags2 &= ~(TF2_SUPPORTS_MBUFQ|
 	    TF2_MBUF_QUEUE_READY|
 	    TF2_DONT_SACK_QUEUE|
 	    TF2_MBUF_ACKCMP|
 	    TF2_MBUF_L_ACKS);
 }
 
 static inline void
 tcp_fields_to_host(struct tcphdr *th)
 {
 
 	th->th_seq = ntohl(th->th_seq);
 	th->th_ack = ntohl(th->th_ack);
 	th->th_win = ntohs(th->th_win);
 	th->th_urp = ntohs(th->th_urp);
 }
 
 static inline void
 tcp_fields_to_net(struct tcphdr *th)
 {
 
 	th->th_seq = htonl(th->th_seq);
 	th->th_ack = htonl(th->th_ack);
 	th->th_win = htons(th->th_win);
 	th->th_urp = htons(th->th_urp);
 }
 
 static inline uint16_t
 tcp_get_flags(const struct tcphdr *th)
 {
         return (((uint16_t)th->th_x2 << 8) | th->th_flags);
 }
 
 static inline void
 tcp_set_flags(struct tcphdr *th, uint16_t flags)
 {
         th->th_x2    = (flags >> 8) & 0x0f;
         th->th_flags = flags & 0xff;
 }
 
 static inline void
 tcp_account_for_send(struct tcpcb *tp, uint32_t len, uint8_t is_rxt,
     uint8_t is_tlp, bool hw_tls)
 {
 	if (is_tlp) {
 		tp->t_sndtlppack++;
 		tp->t_sndtlpbyte += len;
 	}
 	/* To get total bytes sent you must add t_snd_rxt_bytes to t_sndbytes */
 	if (is_rxt)
 		tp->t_snd_rxt_bytes += len;
 	else
 		tp->t_sndbytes += len;
 
 #ifdef KERN_TLS
 	if (hw_tls && is_rxt && len != 0) {
 		uint64_t rexmit_percent = (1000ULL * tp->t_snd_rxt_bytes) / (10ULL * (tp->t_snd_rxt_bytes + tp->t_sndbytes));
 		if (rexmit_percent > ktls_ifnet_max_rexmit_pct)
 			ktls_disable_ifnet(tp);
 	}
 #endif
 
 }
 #endif /* _KERNEL */
 
 #endif /* _NETINET_TCP_VAR_H_ */