diff --git a/share/man/man4/tcp.4 b/share/man/man4/tcp.4
index 16cf02184516..d01505e58427 100644
--- a/share/man/man4/tcp.4
+++ b/share/man/man4/tcp.4
@@ -1,799 +1,847 @@
 .\" Copyright (c) 1983, 1991, 1993
 .\"	The Regents of the University of California.
 .\" Copyright (c) 2010-2011 The FreeBSD Foundation
 .\" All rights reserved.
 .\"
 .\" Portions of this documentation were written at the Centre for Advanced
 .\" Internet Architectures, Swinburne University of Technology, Melbourne,
 .\" Australia by David Hayes under sponsorship from the FreeBSD Foundation.
 .\"
 .\" Redistribution and use in source and binary forms, with or without
 .\" modification, are permitted provided that the following conditions
 .\" are met:
 .\" 1. Redistributions of source code must retain the above copyright
 .\"    notice, this list of conditions and the following disclaimer.
 .\" 2. Redistributions in binary form must reproduce the above copyright
 .\"    notice, this list of conditions and the following disclaimer in the
 .\"    documentation and/or other materials provided with the distribution.
 .\" 3. Neither the name of the University nor the names of its contributors
 .\"    may be used to endorse or promote products derived from this software
 .\"    without specific prior written permission.
 .\"
 .\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 .\" ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 .\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 .\" SUCH DAMAGE.
 .\"
 .\"     From: @(#)tcp.4	8.1 (Berkeley) 6/5/93
 .\" $FreeBSD$
 .\"
-.Dd February 13, 2021
+.Dd April 8, 2021
 .Dt TCP 4
 .Os
 .Sh NAME
 .Nm tcp
 .Nd Internet Transmission Control Protocol
 .Sh SYNOPSIS
 .In sys/types.h
 .In sys/socket.h
 .In netinet/in.h
 .In netinet/tcp.h
 .Ft int
 .Fn socket AF_INET SOCK_STREAM 0
 .Sh DESCRIPTION
 The
 .Tn TCP
 protocol provides reliable, flow-controlled, two-way
 transmission of data.
 It is a byte-stream protocol used to
 support the
 .Dv SOCK_STREAM
 abstraction.
 .Tn TCP
 uses the standard
 Internet address format and, in addition, provides a per-host
 collection of
 .Dq "port addresses" .
 Thus, each address is composed
 of an Internet address specifying the host and network,
 with a specific
 .Tn TCP
 port on the host identifying the peer entity.
 .Pp
 Sockets utilizing the
 .Tn TCP
 protocol are either
 .Dq active
 or
 .Dq passive .
 Active sockets initiate connections to passive
 sockets.
 By default,
 .Tn TCP
 sockets are created active; to create a
 passive socket, the
 .Xr listen 2
 system call must be used
 after binding the socket with the
 .Xr bind 2
 system call.
 Only passive sockets may use the
 .Xr accept 2
 call to accept incoming connections.
 Only active sockets may use the
 .Xr connect 2
 call to initiate connections.
 .Pp
 Passive sockets may
 .Dq underspecify
 their location to match
 incoming connection requests from multiple networks.
 This technique, termed
 .Dq "wildcard addressing" ,
 allows a single
 server to provide service to clients on multiple networks.
 To create a socket which listens on all networks, the Internet
 address
 .Dv INADDR_ANY
 must be bound.
 The
 .Tn TCP
 port may still be specified
 at this time; if the port is not specified, the system will assign one.
 Once a connection has been established, the socket's address is
 fixed by the peer entity's location.
 The address assigned to the
 socket is the address associated with the network interface
 through which packets are being transmitted and received.
 Normally, this address corresponds to the peer entity's network.
 .Pp
 .Tn TCP
 supports a number of socket options which can be set with
 .Xr setsockopt 2
 and tested with
 .Xr getsockopt 2 :
 .Bl -tag -width ".Dv TCP_FUNCTION_BLK"
 .It Dv TCP_INFO
 Information about a socket's underlying TCP session may be retrieved
 by passing the read-only option
 .Dv TCP_INFO
 to
 .Xr getsockopt 2 .
 It accepts a single argument: a pointer to an instance of
 .Vt "struct tcp_info" .
 .Pp
 This API is subject to change; consult the source to determine
 which fields are currently filled out by this option.
 .Fx
 specific additions include
 send window size,
 receive window size,
 and
 bandwidth-controlled window space.
 .It Dv TCP_CCALGOOPT
 Set or query congestion control algorithm specific parameters.
 See
 .Xr mod_cc 4
 for details.
 .It Dv TCP_CONGESTION
 Select or query the congestion control algorithm that TCP will use for the
 connection.
 See
 .Xr mod_cc 4
 for details.
 .It Dv TCP_FUNCTION_BLK
 Select or query the set of functions that TCP will use for this connection.
 This allows a user to select an alternate TCP stack.
 The alternate TCP stack must already be loaded in the kernel.
 To list the available TCP stacks, see
 .Va functions_available
 in the
 .Sx MIB Variables
 section further down.
 To list the default TCP stack, see
 .Va functions_default
 in the
 .Sx MIB Variables
 section.
 .It Dv TCP_KEEPINIT
 This
 .Xr setsockopt 2
 option accepts a per-socket timeout argument of
 .Vt "u_int"
 in seconds, for new, non-established
 .Tn TCP
 connections.
 For the global default in milliseconds see
 .Va keepinit
 in the
 .Sx MIB Variables
 section further down.
 .It Dv TCP_KEEPIDLE
 This
 .Xr setsockopt 2
 option accepts an argument of
 .Vt "u_int"
 for the amount of time, in seconds, that the connection must be idle
 before keepalive probes (if enabled) are sent for the connection of this
 socket.
 If set on a listening socket, the value is inherited by the newly created
 socket upon
 .Xr accept 2 .
 For the global default in milliseconds see
 .Va keepidle
 in the
 .Sx MIB Variables
 section further down.
 .It Dv TCP_KEEPINTVL
 This
 .Xr setsockopt 2
 option accepts an argument of
 .Vt "u_int"
 to set the per-socket interval, in seconds, between keepalive probes sent
 to a peer.
 If set on a listening socket, the value is inherited by the newly created
 socket upon
 .Xr accept 2 .
 For the global default in milliseconds see
 .Va keepintvl
 in the
 .Sx MIB Variables
 section further down.
 .It Dv TCP_KEEPCNT
 This
 .Xr setsockopt 2
 option accepts an argument of
 .Vt "u_int"
 and allows a per-socket tuning of the number of probes sent, with no response,
 before the connection will be dropped.
 If set on a listening socket, the value is inherited by the newly created
 socket upon
 .Xr accept 2 .
 For the global default see the
 .Va keepcnt
 in the
 .Sx MIB Variables
 section further down.
 .It Dv TCP_NODELAY
 Under most circumstances,
 .Tn TCP
 sends data when it is presented;
 when outstanding data has not yet been acknowledged, it gathers
 small amounts of output to be sent in a single packet once
 an acknowledgement is received.
 For a small number of clients, such as window systems
 that send a stream of mouse events which receive no replies,
 this packetization may cause significant delays.
 The boolean option
 .Dv TCP_NODELAY
 defeats this algorithm.
 .It Dv TCP_MAXSEG
 By default, a sender- and
 .No receiver- Ns Tn TCP
 will negotiate among themselves to determine the maximum segment size
 to be used for each connection.
 The
 .Dv TCP_MAXSEG
 option allows the user to determine the result of this negotiation,
 and to reduce it if desired.
 .It Dv TCP_NOOPT
 .Tn TCP
 usually sends a number of options in each packet, corresponding to
 various
 .Tn TCP
 extensions which are provided in this implementation.
 The boolean option
 .Dv TCP_NOOPT
 is provided to disable
 .Tn TCP
 option use on a per-connection basis.
 .It Dv TCP_NOPUSH
 By convention, the
 .No sender- Ns Tn TCP
 will set the
 .Dq push
 bit, and begin transmission immediately (if permitted) at the end of
 every user call to
 .Xr write 2
 or
 .Xr writev 2 .
 When this option is set to a non-zero value,
 .Tn TCP
 will delay sending any data at all until either the socket is closed,
 or the internal send buffer is filled.
 .It Dv TCP_MD5SIG
 This option enables the use of MD5 digests (also known as TCP-MD5)
 on writes to the specified socket.
 Outgoing traffic is digested;
 digests on incoming traffic are verified.
 When this option is enabled on a socket, all inbound and outgoing
 TCP segments must be signed with MD5 digests.
 .Pp
 One common use for this in a
 .Fx
 router deployment is to enable
 based routers to interwork with Cisco equipment at peering points.
 Support for this feature conforms to RFC 2385.
 .Pp
 In order for this option to function correctly, it is necessary for the
 administrator to add a tcp-md5 key entry to the system's security
 associations database (SADB) using the
 .Xr setkey 8
 utility.
 This entry can only be specified on a per-host basis at this time.
 .Pp
 If an SADB entry cannot be found for the destination,
 the system does not send any outgoing segments and drops any inbound segments.
 .It Dv TCP_STATS
 Manage collection of connection level statistics using the
 .Xr stats 3
 framework.
 .Pp
 Each dropped segment is taken into account in the TCP protocol statistics.
 .It Dv TCP_TXTLS_ENABLE
 Enable in-kernel Transport Layer Security (TLS) for data written to this
 socket.
 See
 .Xr ktls 4
 for more details.
 .It Dv TCP_TXTLS_MODE
 The integer argument can be used to get or set the current TLS transmit mode
 of a socket.
 See
 .Xr ktls 4
 for more details.
 .It Dv TCP_RXTLS_ENABLE
 Enable in-kernel TLS for data read from this socket.
 See
 .Xr ktls 4
 for more details.
 .It Dv TCP_REUSPORT_LB_NUMA
 Changes NUMA affinity filtering for an established TCP listen
 socket.
 This option takes a single integer argument which specifies
 the NUMA domain to filter on for this listen socket.
 The argument can also have the follwing special values:
 .Bl -tag -width "Dv TCP_REUSPORT_LB_NUMA"
 .It Dv TCP_REUSPORT_LB_NUMA_NODOM
 Remove NUMA filtering for this listen socket.
 .It Dv TCP_REUSPORT_LB_NUMA_CURDOM
 Filter traffic associated with the domain where the calling thread is
 currently executing.
 This is typically used after a process or thread inherits a listen
 socket from its parent, and sets its CPU affinity to a particular core.
 .El
 .El
 .Pp
 The option level for the
 .Xr setsockopt 2
 call is the protocol number for
 .Tn TCP ,
 available from
 .Xr getprotobyname 3 ,
 or
 .Dv IPPROTO_TCP .
 All options are declared in
 .In netinet/tcp.h .
 .Pp
 Options at the
 .Tn IP
 transport level may be used with
 .Tn TCP ;
 see
 .Xr ip 4 .
 Incoming connection requests that are source-routed are noted,
 and the reverse source route is used in responding.
 .Pp
 The default congestion control algorithm for
 .Tn TCP
 is
 .Xr cc_newreno 4 .
 Other congestion control algorithms can be made available using the
 .Xr mod_cc 4
 framework.
 .Ss MIB Variables
 The
 .Tn TCP
 protocol implements a number of variables in the
 .Va net.inet.tcp
 branch of the
 .Xr sysctl 3
 MIB.
 .Bl -tag -width ".Va TCPCTL_DO_RFC1323"
 .It Dv TCPCTL_DO_RFC1323
 .Pq Va rfc1323
 Implement the window scaling and timestamp options of RFC 1323/RFC 7323
 (default is true).
 .It Va tolerate_missing_ts
 Tolerate the missing of timestamps (RFC 1323/RFC 7323) for
 .Tn TCP
 segments belonging to
 .Tn TCP
 connections for which support of
 .Tn TCP
 timestamps has been negotiated.
 (default is 0, i.e., the missing of timestamps is not tolerated).
 .It Dv TCPCTL_MSSDFLT
 .Pq Va mssdflt
 The default value used for the maximum segment size
 .Pq Dq MSS
 when no advice to the contrary is received from MSS negotiation.
 .It Dv TCPCTL_SENDSPACE
 .Pq Va sendspace
 Maximum
 .Tn TCP
 send window.
 .It Dv TCPCTL_RECVSPACE
 .Pq Va recvspace
 Maximum
 .Tn TCP
 receive window.
 .It Va log_in_vain
 Log any connection attempts to ports where there is not a socket
 accepting connections.
 The value of 1 limits the logging to
 .Tn SYN
 (connection establishment) packets only.
 That of 2 results in any
 .Tn TCP
 packets to closed ports being logged.
 Any value unlisted above disables the logging
 (default is 0, i.e., the logging is disabled).
 .It Va msl
 The Maximum Segment Lifetime, in milliseconds, for a packet.
 .It Va keepinit
 Timeout, in milliseconds, for new, non-established
 .Tn TCP
 connections.
 The default is 75000 msec.
 .It Va keepidle
 Amount of time, in milliseconds, that the connection must be idle
 before keepalive probes (if enabled) are sent.
 The default is 7200000 msec (2 hours).
 .It Va keepintvl
 The interval, in milliseconds, between keepalive probes sent to remote
 machines, when no response is received on a
 .Va keepidle
 probe.
 The default is 75000 msec.
 .It Va keepcnt
 Number of probes sent, with no response, before a connection
 is dropped.
 The default is 8 packets.
 .It Va always_keepalive
 Assume that
 .Dv SO_KEEPALIVE
 is set on all
 .Tn TCP
 connections, the kernel will
 periodically send a packet to the remote host to verify the connection
 is still up.
 .It Va icmp_may_rst
 Certain
 .Tn ICMP
 unreachable messages may abort connections in
 .Tn SYN-SENT
 state.
 .It Va do_tcpdrain
 Flush packets in the
 .Tn TCP
 reassembly queue if the system is low on mbufs.
 .It Va blackhole
 If enabled, disable sending of RST when a connection is attempted
 to a port where there is not a socket accepting connections.
 See
 .Xr blackhole 4 .
 .It Va delayed_ack
 Delay ACK to try and piggyback it onto a data packet.
 .It Va delacktime
 Maximum amount of time, in milliseconds, before a delayed ACK is sent.
 .It Va path_mtu_discovery
 Enable Path MTU Discovery.
 .It Va tcbhashsize
 Size of the
 .Tn TCP
 control-block hash table
 (read-only).
 This may be tuned using the kernel option
 .Dv TCBHASHSIZE
 or by setting
 .Va net.inet.tcp.tcbhashsize
 in the
 .Xr loader 8 .
 .It Va pcbcount
 Number of active process control blocks
 (read-only).
 .It Va syncookies
 Determines whether or not
 .Tn SYN
 cookies should be generated for outbound
 .Tn SYN-ACK
 packets.
 .Tn SYN
 cookies are a great help during
 .Tn SYN
 flood attacks, and are enabled by default.
 (See
 .Xr syncookies 4 . )
 .It Va isn_reseed_interval
 The interval (in seconds) specifying how often the secret data used in
 RFC 1948 initial sequence number calculations should be reseeded.
 By default, this variable is set to zero, indicating that
 no reseeding will occur.
 Reseeding should not be necessary, and will break
 .Dv TIME_WAIT
 recycling for a few minutes.
 .It Va reass.cursegments
 The current total number of segments present in all reassembly queues.
 .It Va reass.maxsegments
 The maximum limit on the total number of segments across all reassembly
 queues.
 The limit can be adjusted as a tunable.
 .It Va reass.maxqueuelen
 The maximum number of segments allowed in each reassembly queue.
 By default, the system chooses a limit based on each TCP connection's
 receive buffer size and maximum segment size (MSS).
 The actual limit applied to a session's reassembly queue will be the lower of
 the system-calculated automatic limit and the user-specified
 .Va reass.maxqueuelen
 limit.
 .It Va rexmit_initial , rexmit_min , rexmit_slop
 Adjust the retransmit timer calculation for
 .Tn TCP .
 The slop is
 typically added to the raw calculation to take into account
 occasional variances that the
 .Tn SRTT
 (smoothed round-trip time)
 is unable to accommodate, while the minimum specifies an
 absolute minimum.
 While a number of
 .Tn TCP
 RFCs suggest a 1
 second minimum, these RFCs tend to focus on streaming behavior,
 and fail to deal with the fact that a 1 second minimum has severe
 detrimental effects over lossy interactive connections, such
 as a 802.11b wireless link, and over very fast but lossy
 connections for those cases not covered by the fast retransmit
 code.
 For this reason, we use 200ms of slop and a near-0
 minimum, which gives us an effective minimum of 200ms (similar to
 .Tn Linux ) .
 The initial value is used before an RTT measurement has been performed.
 .It Va initcwnd_segments
 Enable the ability to specify initial congestion window in number of segments.
 The default value is 10 as suggested by RFC 6928.
 Changing the value on fly would not affect connections using congestion window
 from the hostcache.
 Caution:
 This regulates the burst of packets allowed to be sent in the first RTT.
 The value should be relative to the link capacity.
 Start with small values for lower-capacity links.
 Large bursts can cause buffer overruns and packet drops if routers have small
 buffers or the link is experiencing congestion.
 .It Va newcwd
 Enable the New Congestion Window Validation mechanism as described in RFC 7661.
 This gently reduces the congestion window during periods, where TCP is
 application limited and the network bandwidth is not utilized completely.
 That prevents self-inflicted packet losses once the application starts to
 transmit data at a higher speed.
 .It Va do_prr
 Perform SACK loss recovery using the Proportional Rate Reduction (PRR) algorithm
 described in RFC6937.
 This improves the effectiveness of retransmissions particular in environments
 with ACK thinning or burst loss events, as chances to run out of the ACK clock
 are reduced, preventing lengthy and performance reducing RTO based loss recovery
 (default is true).
 .It Va do_prr_conservative
 While doing Proportional Rate Reduction, remain strictly in a packet conserving
 mode, sending only one new packet for each ACK received.
 Helpful when a misconfigured token bucket traffic policer causes persistent
 high losses leading to RTO, but reduces PRR effectiveness in more common settings
 (default is false).
 .It Va rfc6675_pipe
 Calculate the bytes in flight using the algorithm described in RFC 6675, and
 is also an improvement when Proportional Rate Reduction is enabled.
 Also enables two other mechanisms from RFC6675.
 Rescue Retransmission helps timely loss recovery, when the trailing segments
 of a transmission are lost, while no additional data is ready to be sent.
 In case a partial ACK without a SACK block is received during SACK loss
 recovery, the trailing segment is immediately resent, rather than waiting
 for a Retransmission timeout.
 SACK loss recovery is also engaged, once two segments plus one byte are
 SACKed - even if no traditional duplicate ACKs were seen.
 .It Va rfc3042
 Enable the Limited Transmit algorithm as described in RFC 3042.
 It helps avoid timeouts on lossy links and also when the congestion window
 is small, as happens on short transfers.
 .It Va rfc3390
 Enable support for RFC 3390, which allows for a variable-sized
 starting congestion window on new connections, depending on the
 maximum segment size.
 This helps throughput in general, but
 particularly affects short transfers and high-bandwidth large
 propagation-delay connections.
 .It Va sack.enable
 Enable support for RFC 2018, TCP Selective Acknowledgment option,
 which allows the receiver to inform the sender about all successfully
 arrived segments, allowing the sender to retransmit the missing segments
 only.
 .It Va sack.maxholes
 Maximum number of SACK holes per connection.
 Defaults to 128.
 .It Va sack.globalmaxholes
 Maximum number of SACK holes per system, across all connections.
 Defaults to 65536.
 .It Va maxtcptw
 When a TCP connection enters the
 .Dv TIME_WAIT
 state, its associated socket structure is freed, since it is of
 negligible size and use, and a new structure is allocated to contain a
 minimal amount of information necessary for sustaining a connection in
 this state, called the compressed TCP TIME_WAIT state.
 Since this structure is smaller than a socket structure, it can save
 a significant amount of system memory.
 The
 .Va net.inet.tcp.maxtcptw
 MIB variable controls the maximum number of these structures allocated.
 By default, it is initialized to
 .Va kern.ipc.maxsockets
 / 5.
 .It Va nolocaltimewait
 Suppress creating of compressed TCP TIME_WAIT states for connections in
 which both endpoints are local.
 .It Va fast_finwait2_recycle
 Recycle
 .Tn TCP
 .Dv FIN_WAIT_2
 connections faster when the socket is marked as
 .Dv SBS_CANTRCVMORE
 (no user process has the socket open, data received on
 the socket cannot be read).
 The timeout used here is
 .Va finwait2_timeout .
 .It Va finwait2_timeout
 Timeout to use for fast recycling of
 .Tn TCP
 .Dv FIN_WAIT_2
 connections.
 Defaults to 60 seconds.
 .It Va ecn.enable
 Enable support for TCP Explicit Congestion Notification (ECN).
 ECN allows a TCP sender to reduce the transmission rate in order to
 avoid packet drops.
-Settings:
 .Bl -tag -compact
 .It 0
 Disable ECN.
 .It 1
 Allow incoming connections to request ECN.
 Outgoing connections will request ECN.
 .It 2
 Allow incoming connections to request ECN.
 Outgoing connections will not request ECN.
+(default)
 .El
 .It Va ecn.maxretries
 Number of retries (SYN or SYN/ACK retransmits) before disabling ECN on a
 specific connection.
 This is needed to help with connection establishment
 when a broken firewall is in the network path.
 .It Va pmtud_blackhole_detection
 Enable automatic path MTU blackhole detection.
 In case of retransmits of MSS sized segments,
 the OS will lower the MSS to check if it's an MTU problem.
 If the current MSS is greater than the configured value to try
 .Po Va net.inet.tcp.pmtud_blackhole_mss
 and
 .Va net.inet.tcp.v6pmtud_blackhole_mss
 .Pc ,
 it will be set to this value, otherwise,
 the MSS will be set to the default values
 .Po Va net.inet.tcp.mssdflt
 and
 .Va net.inet.tcp.v6mssdflt
 .Pc .
 Settings:
 .Bl -tag -compact
 .It 0
 Disable path MTU blackhole detection.
 .It 1
 Enable path MTU blackhole detection for IPv4 and IPv6.
 .It 2
 Enable path MTU blackhole detection only for IPv4.
 .It 3
 Enable path MTU blackhole detection only for IPv6.
 .El
 .It Va pmtud_blackhole_mss
 MSS to try for IPv4 if PMTU blackhole detection is turned on.
 .It Va v6pmtud_blackhole_mss
 MSS to try for IPv6 if PMTU blackhole detection is turned on.
+.It Va hostcache.enable
+The TCP host cache is used to cache connection details and metrics to
+improve future performance of connections between the same hosts.
+At the completion of a TCP connection, a host will cache information
+for the connection for some defined period of time.
+.Bl -tag -compact
+.It 0
+Disable the host cache.
+.It 1
+Enable the host cache. (default)
+.It Va hostcache.purgenow
+Immediately purge all entries once set to any value.
+Setting this to 2 will also reseed the hash salt.
+.It Va hostcache.purge
+Expire all entries on next pruning of host cache entries.
+Any non-zero setting will be reset to zero, once the purge
+is running.
+.Bl -tag -compact
+.It 0
+Do not purge all entries when pruning the host cache. (default)
+.It 1
+Purge all entries when doing the next pruning.
+.It 2
+Purge all entries, and also reseed the hash salt.
+.It Va hostcache.prune
+Time in seconds between pruning expired host cache entries.
+Defaults to 300 (5 minutes).
+.It Va hostcache.expire
+Time in seconds that an entry is kept in the
+host cache after it was last accessed.
+Defaults to 3600 (1 hour).
+.It Va hostcache.count
+The current number of entries in the host cache.
+.It Va hostcache.bucketlimit
+The maximum number of entries for the same hash.
+Defaults to 30.
+.It Va hostcache.hashsize
+Size of TCP hostcache hashtable.
+This number has to be a power of two, or will be rejected.
+Defaults to 512.
+.It Va hostcache.cachelimit
+Overall entry limit for hostcache.
+Defaults to hashsize * bucketlimit.
+.It Va hostcache.histo
+Provide a histogram of the hostcache hash utilization.
+.It Va hostcache.list
+Provide a complete list of all current entries in the host
+cache.
 .It Va functions_available
 List of available TCP function blocks (TCP stacks).
 .It Va functions_default
 The default TCP function block (TCP stack).
 .It Va functions_inherit_listen_socket_stack
 Determines whether to inherit listen socket's tcp stack or use the current
 system default tcp stack, as defined by
 .Va functions_default .
 Default is true.
 .It Va insecure_rst
 Use criteria defined in RFC793 instead of RFC5961 for accepting RST segments.
 Default is false.
 .It Va insecure_syn
 Use criteria defined in RFC793 instead of RFC5961 for accepting SYN segments.
 Default is false.
 .It Va ts_offset_per_conn
 When initializing the TCP timestamps, use a per connection offset instead of a
 per host pair offset.
 Default is to use per connection offsets as recommended in RFC 7323.
 .It Va perconn_stats_enable
 Controls the default collection of statistics for all connections using the
 .Xr stats 3
 framework.
 0 disables, 1 enables, 2 enables random sampling across log id connection
 groups with all connections in a group receiving the same setting.
 .It Va perconn_stats_sample_rates
 A CSV list of template_spec=percent key-value pairs which controls the per
 template sampling rates when
 .Xr stats 3
 sampling is enabled.
 .El
 .Sh ERRORS
 A socket operation may fail with one of the following errors returned:
 .Bl -tag -width Er
 .It Bq Er EISCONN
 when trying to establish a connection on a socket which
 already has one;
 .It Bo Er ENOBUFS Bc or Bo Er ENOMEM Bc
 when the system runs out of memory for
 an internal data structure;
 .It Bq Er ETIMEDOUT
 when a connection was dropped
 due to excessive retransmissions;
 .It Bq Er ECONNRESET
 when the remote peer
 forces the connection to be closed;
 .It Bq Er ECONNREFUSED
 when the remote
 peer actively refuses connection establishment (usually because
 no process is listening to the port);
 .It Bq Er EADDRINUSE
 when an attempt
 is made to create a socket with a port which has already been
 allocated;
 .It Bq Er EADDRNOTAVAIL
 when an attempt is made to create a
 socket with a network address for which no network interface
 exists;
 .It Bq Er EAFNOSUPPORT
 when an attempt is made to bind or connect a socket to a multicast
 address.
 .It Bq Er EINVAL
 when trying to change TCP function blocks at an invalid point in the session;
 .It Bq Er ENOENT
 when trying to use a TCP function block that is not available;
 .El
 .Sh SEE ALSO
 .Xr getsockopt 2 ,
 .Xr socket 2 ,
 .Xr stats 3 ,
 .Xr sysctl 3 ,
 .Xr blackhole 4 ,
 .Xr inet 4 ,
 .Xr intro 4 ,
 .Xr ip 4 ,
 .Xr ktls 4 ,
 .Xr mod_cc 4 ,
 .Xr siftr 4 ,
 .Xr syncache 4 ,
 .Xr tcp_bbr 4 ,
 .Xr setkey 8 ,
 .Xr tcp_functions 9
 .Rs
 .%A "V. Jacobson"
 .%A "B. Braden"
 .%A "D. Borman"
 .%T "TCP Extensions for High Performance"
 .%O "RFC 1323"
 .Re
 .Rs
 .%A "D. Borman"
 .%A "B. Braden"
 .%A "V. Jacobson"
 .%A "R. Scheffenegger"
 .%T "TCP Extensions for High Performance"
 .%O "RFC 7323"
 .Re
 .Rs
 .%A "A. Heffernan"
 .%T "Protection of BGP Sessions via the TCP MD5 Signature Option"
 .%O "RFC 2385"
 .Re
 .Rs
 .%A "K. Ramakrishnan"
 .%A "S. Floyd"
 .%A "D. Black"
 .%T "The Addition of Explicit Congestion Notification (ECN) to IP"
 .%O "RFC 3168"
 .Re
 .Sh HISTORY
 The
 .Tn TCP
 protocol appeared in
 .Bx 4.2 .
 The RFC 1323 extensions for window scaling and timestamps were added
 in
 .Bx 4.4 .
 The
 .Dv TCP_INFO
 option was introduced in
 .Tn Linux 2.6
 and is
 .Em subject to change .
diff --git a/sys/netinet/tcp_hostcache.c b/sys/netinet/tcp_hostcache.c
index dfd3cf6ee260..7bc79b781a30 100644
--- a/sys/netinet/tcp_hostcache.c
+++ b/sys/netinet/tcp_hostcache.c
@@ -1,869 +1,874 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 2002 Andre Oppermann, Internet Business Solutions AG
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. The name of the author may not be used to endorse or promote
  *    products derived from this software without specific prior written
  *    permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 /*
  * The tcp_hostcache moves the tcp-specific cached metrics from the routing
  * table to a dedicated structure indexed by the remote IP address.  It keeps
  * information on the measured TCP parameters of past TCP sessions to allow
  * better initial start values to be used with later connections to/from the
  * same source.  Depending on the network parameters (delay, max MTU,
  * congestion window) between local and remote sites, this can lead to
  * significant speed-ups for new TCP connections after the first one.
  *
  * Due to the tcp_hostcache, all TCP-specific metrics information in the
  * routing table have been removed.  The inpcb no longer keeps a pointer to
  * the routing entry, and protocol-initiated route cloning has been removed
  * as well.  With these changes, the routing table has gone back to being
  * more lightweight and only carries information related to packet forwarding.
  *
  * tcp_hostcache is designed for multiple concurrent access in SMP
  * environments and high contention.  All bucket rows have their own lock and
  * thus multiple lookups and modifies can be done at the same time as long as
  * they are in different bucket rows.  If a request for insertion of a new
  * record can't be satisfied, it simply returns an empty structure.  Nobody
  * and nothing outside of tcp_hostcache.c will ever point directly to any
  * entry in the tcp_hostcache.  All communication is done in an
  * object-oriented way and only functions of tcp_hostcache will manipulate
  * hostcache entries.  Otherwise, we are unable to achieve good behaviour in
  * concurrent access situations.  Since tcp_hostcache is only caching
  * information, there are no fatal consequences if we either can't satisfy
  * any particular request or have to drop/overwrite an existing entry because
  * of bucket limit memory constraints.
  */
 
 /*
  * Many thanks to jlemon for basic structure of tcp_syncache which is being
  * followed here.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet6.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
+#include <sys/hash.h>
 #include <sys/jail.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/malloc.h>
 #include <sys/proc.h>
 #include <sys/sbuf.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sysctl.h>
 
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_pcb.h>
 #include <netinet/tcp.h>
 #include <netinet/tcp_var.h>
 
 #include <vm/uma.h>
 
 /* Declares struct hc_qhead, the tail-queue head type that links hc_metrics. */
 TAILQ_HEAD(hc_qhead, hc_metrics);
 
 /*
  * One row ("bucket") of the hostcache hash table.  Every row carries its own
  * mutex, so different rows can be looked up and modified independently.
  */
 struct hc_head {
 	struct hc_qhead	hch_bucket;	/* entries that hash to this row */
 	u_int		hch_length;	/* number of entries on hch_bucket */
 	struct mtx	hch_mtx;	/* taken via THC_LOCK around row access */
 };
 
 /*
  * A single hostcache entry: the cached TCP metrics for one remote address.
  * Entries live on the tail queue of exactly one bucket row (rmx_head).
  * tcp_hc_insert() zeroes the entry and then fills in either ip4 or ip6, so
  * the unused address field stays all-zero; the list sysctl relies on
  * ip4.s_addr being zero to recognise IPv6 entries.
  */
 struct hc_metrics {
 	/* housekeeping */
 	TAILQ_ENTRY(hc_metrics) rmx_q;
 	struct		hc_head *rmx_head; /* head of bucket tail queue */
 	struct		in_addr ip4;	/* IP address */
 	struct		in6_addr ip6;	/* IP6 address */
 	uint32_t	ip6_zoneid;	/* IPv6 scope zone id */
 	/* endpoint specific values for tcp */
 	uint32_t	rmx_mtu;	/* MTU for this path */
 	uint32_t	rmx_ssthresh;	/* outbound gateway buffer limit */
 	uint32_t	rmx_rtt;	/* estimated round trip time */
 	uint32_t	rmx_rttvar;	/* estimated rtt variance */
 	uint32_t	rmx_cwnd;	/* congestion window */
 	uint32_t	rmx_sendpipe;	/* outbound delay-bandwidth product */
 	uint32_t	rmx_recvpipe;	/* inbound delay-bandwidth product */
 	/* TCP hostcache internal data */
 	int		rmx_expire;	/* lifetime for object */
 #ifdef	TCP_HC_COUNTERS
 	u_long		rmx_hits;	/* number of hits */
 	u_long		rmx_updates;	/* number of updates */
 #endif
 };
 
 /*
  * Per-vnet hostcache state: the hash table, the allocator for its entries,
  * and the limits and timers that govern it.
  */
 struct tcp_hostcache {
 	struct hc_head	*hashbase;	/* array of hashsize bucket rows */
 	uma_zone_t	zone;		/* UMA zone for struct hc_metrics */
 	u_int		hashsize;	/* number of bucket rows */
 	u_int		hashmask;	/* hashsize - 1, masks hash values */
+	u_int		hashsalt;	/* seed passed to jenkins_hash32() */
 	u_int		bucket_limit;	/* max. entries per bucket row */
 	u_int		cache_count;	/* current entries, updated atomically */
 	u_int		cache_limit;	/* max. entries overall */
 	int		expire;		/* lifetime given to an entry on use */
 	int		prune;		/* seconds between purge runs */
 	int		purgeall;	/* non-zero: purge all on next run;
 					   2 also re-seeds hashsalt */
 };
 
 /*
  * Arbitrary default values.  The two time values are in seconds and are
  * parenthesized so they expand safely inside any larger expression.
  */
 #define TCP_HOSTCACHE_HASHSIZE		512
 #define TCP_HOSTCACHE_BUCKETLIMIT	30
 #define TCP_HOSTCACHE_EXPIRE		(60*60)	/* one hour */
 #define TCP_HOSTCACHE_PRUNE		(5*60)	/* every 5 minutes */
 
 /* The hostcache instance; defined per vnet and accessed as V_tcp_hostcache. */
 VNET_DEFINE_STATIC(struct tcp_hostcache, tcp_hostcache);
 #define	V_tcp_hostcache		VNET(tcp_hostcache)
 
 /* Callout that periodically runs tcp_hc_purge() for this vnet. */
 VNET_DEFINE_STATIC(struct callout, tcp_hc_callout);
 #define	V_tcp_hc_callout	VNET(tcp_hc_callout)
 
 /* Forward declarations of the file-local helpers and sysctl handlers. */
 static struct hc_metrics *tcp_hc_lookup(struct in_conninfo *, bool);
 static struct hc_metrics *tcp_hc_insert(struct in_conninfo *);
 static int sysctl_tcp_hc_list(SYSCTL_HANDLER_ARGS);
 static int sysctl_tcp_hc_histo(SYSCTL_HANDLER_ARGS);
 static int sysctl_tcp_hc_purgenow(SYSCTL_HANDLER_ARGS);
 static void tcp_hc_purge_internal(int);
 static void tcp_hc_purge(void *);
 
 /* Parent node for all net.inet.tcp.hostcache.* sysctls declared below. */
 static SYSCTL_NODE(_net_inet_tcp, OID_AUTO, hostcache,
     CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "TCP Host cache");
 
 /*
  * Master switch, on by default.  The lookup, insert, get and update
  * functions in this file all return early when it is zero.
  */
 VNET_DEFINE(int, tcp_use_hostcache) = 1;
 #define V_tcp_use_hostcache  VNET(tcp_use_hostcache)
 SYSCTL_INT(_net_inet_tcp_hostcache, OID_AUTO, enable, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(tcp_use_hostcache), 0,
     "Enable the TCP hostcache");
 
 /*
  * cachelimit, hashsize and bucketlimit are fetched as tunables in
  * tcp_hc_init() and are exported read-only here.
  */
 SYSCTL_UINT(_net_inet_tcp_hostcache, OID_AUTO, cachelimit, CTLFLAG_VNET | CTLFLAG_RDTUN,
     &VNET_NAME(tcp_hostcache.cache_limit), 0,
     "Overall entry limit for hostcache");
 
 SYSCTL_UINT(_net_inet_tcp_hostcache, OID_AUTO, hashsize, CTLFLAG_VNET | CTLFLAG_RDTUN,
     &VNET_NAME(tcp_hostcache.hashsize), 0,
     "Size of TCP hostcache hashtable");
 
 SYSCTL_UINT(_net_inet_tcp_hostcache, OID_AUTO, bucketlimit,
     CTLFLAG_VNET | CTLFLAG_RDTUN, &VNET_NAME(tcp_hostcache.bucket_limit), 0,
     "Per-bucket hash limit for hostcache");
 
 /* Read-only view of the number of entries currently cached. */
 SYSCTL_UINT(_net_inet_tcp_hostcache, OID_AUTO, count, CTLFLAG_VNET | CTLFLAG_RD,
      &VNET_NAME(tcp_hostcache.cache_count), 0,
     "Current number of entries in hostcache");
 
 /* Lifetime assigned to an entry whenever it is inserted or looked up. */
 SYSCTL_INT(_net_inet_tcp_hostcache, OID_AUTO, expire, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(tcp_hostcache.expire), 0,
     "Expire time of TCP hostcache entries");
 
 /*
  * Interval of the purge callout (multiplied by hz when the callout is armed);
  * also the amount by which each purge run ages the surviving entries.
  */
 SYSCTL_INT(_net_inet_tcp_hostcache, OID_AUTO, prune, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(tcp_hostcache.prune), 0,
     "Time between purge runs");
 
 /*
  * net.inet.tcp.hostcache.purge: a non-zero value makes the next periodic
  * tcp_hc_purge() run drop every entry; the value 2 additionally re-seeds the
  * hash salt.  tcp_hc_purge() resets the knob to 0 once it has acted on it.
  */
 SYSCTL_INT(_net_inet_tcp_hostcache, OID_AUTO, purge, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(tcp_hostcache.purgeall), 0,
     "Expire all entries on next purge run");
 
 /* Text dump of every cached entry; produced by sysctl_tcp_hc_list(). */
 SYSCTL_PROC(_net_inet_tcp_hostcache, OID_AUTO, list,
     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_SKIP | CTLFLAG_MPSAFE,
     0, 0, sysctl_tcp_hc_list, "A",
     "List of all hostcache entries");
 
 /* Text histogram of bucket row lengths; produced by sysctl_tcp_hc_histo(). */
 SYSCTL_PROC(_net_inet_tcp_hostcache, OID_AUTO, histo,
     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_SKIP | CTLFLAG_MPSAFE,
     0, 0, sysctl_tcp_hc_histo, "A",
     "Print a histogram of hostcache hashbucket utilization");
 
 /*
  * Writing any integer purges the cache immediately; writing 2 also re-seeds
  * the hash salt (see sysctl_tcp_hc_purgenow()).
  */
 SYSCTL_PROC(_net_inet_tcp_hostcache, OID_AUTO, purgenow,
     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
     NULL, 0, sysctl_tcp_hc_purgenow, "I",
     "Immediately purge all entries");
 
 /* malloc(9) type used for the hash table allocated in tcp_hc_init(). */
 static MALLOC_DEFINE(M_HOSTCACHE, "hostcache", "TCP hostcache");
 
+/* Use jenkins_hash32(), as in other parts of the tcp stack */
 /*
  * Row index for an IPv4 address: hash its single 32-bit word, seeded with
  * the hostcache salt, and mask the result down to the table size.
  */
 #define HOSTCACHE_HASH(ip) \
-	(((ip)->s_addr ^ ((ip)->s_addr >> 7) ^ ((ip)->s_addr >> 17)) &	\
-	  V_tcp_hostcache.hashmask)
+	(jenkins_hash32((uint32_t *)(ip), 1, V_tcp_hostcache.hashsalt) & \
+	 V_tcp_hostcache.hashmask)
 
-/* XXX: What is the recommended hash to get good entropy for IPv6 addresses? */
 /*
  * Row index for an IPv6 address: same scheme as HOSTCACHE_HASH, applied to
  * the four 32-bit words of the address.
  */
 #define HOSTCACHE_HASH6(ip6)				\
-	(((ip6)->s6_addr32[0] ^				\
-	  (ip6)->s6_addr32[1] ^				\
-	  (ip6)->s6_addr32[2] ^				\
-	  (ip6)->s6_addr32[3]) &			\
+	(jenkins_hash32((uint32_t *)&((ip6)->s6_addr32[0]), 4, \
+	 V_tcp_hostcache.hashsalt) & \
 	 V_tcp_hostcache.hashmask)
 
 /* Lock and unlock a bucket row; lp is a pointer to the row's hch_mtx. */
 #define THC_LOCK(lp)		mtx_lock(lp)
 #define THC_UNLOCK(lp)		mtx_unlock(lp)
 
 /*
  * Set up the hostcache of the current vnet: load the defaults, apply the
  * tunables, allocate and initialize the bucket rows, create the UMA zone for
  * the entries and arm the periodic purge callout.
  */
 void
 tcp_hc_init(void)
 {
 	u_int cache_limit;
 	int i;
 
 	/*
 	 * Initialize hostcache structures.
 	 */
 	atomic_store_int(&V_tcp_hostcache.cache_count, 0);
 	V_tcp_hostcache.hashsize = TCP_HOSTCACHE_HASHSIZE;
 	V_tcp_hostcache.bucket_limit = TCP_HOSTCACHE_BUCKETLIMIT;
 	V_tcp_hostcache.expire = TCP_HOSTCACHE_EXPIRE;
 	V_tcp_hostcache.prune = TCP_HOSTCACHE_PRUNE;
 	/* Random seed mixed into HOSTCACHE_HASH and HOSTCACHE_HASH6. */
+	V_tcp_hostcache.hashsalt = arc4random();
 
 	/*
 	 * hashmask below is only a usable index mask when hashsize is a power
 	 * of 2, so any other tunable value is replaced by the default.
 	 * NOTE(review): powerof2() presumably also accepts 0, which would give
 	 * an empty table with an all-ones mask -- confirm and reject if so.
 	 */
 	TUNABLE_INT_FETCH("net.inet.tcp.hostcache.hashsize",
 	    &V_tcp_hostcache.hashsize);
 	if (!powerof2(V_tcp_hostcache.hashsize)) {
 		printf("WARNING: hostcache hash size is not a power of 2.\n");
 		V_tcp_hostcache.hashsize = TCP_HOSTCACHE_HASHSIZE; /* default */
 	}
 	V_tcp_hostcache.hashmask = V_tcp_hostcache.hashsize - 1;
 
 	TUNABLE_INT_FETCH("net.inet.tcp.hostcache.bucketlimit",
 	    &V_tcp_hostcache.bucket_limit);
 
 	/*
 	 * The cachelimit tunable may lower the overall limit, but it is
 	 * clamped so it never exceeds hashsize * bucket_limit.
 	 */
 	cache_limit = V_tcp_hostcache.hashsize * V_tcp_hostcache.bucket_limit;
 	V_tcp_hostcache.cache_limit = cache_limit;
 	TUNABLE_INT_FETCH("net.inet.tcp.hostcache.cachelimit",
 	    &V_tcp_hostcache.cache_limit);
 	if (V_tcp_hostcache.cache_limit > cache_limit)
 		V_tcp_hostcache.cache_limit = cache_limit;
 
 	/*
 	 * Allocate the hash table.
 	 */
 	V_tcp_hostcache.hashbase = (struct hc_head *)
 	    malloc(V_tcp_hostcache.hashsize * sizeof(struct hc_head),
 		   M_HOSTCACHE, M_WAITOK | M_ZERO);
 
 	/*
 	 * Initialize the hash buckets.
 	 */
 	for (i = 0; i < V_tcp_hostcache.hashsize; i++) {
 		TAILQ_INIT(&V_tcp_hostcache.hashbase[i].hch_bucket);
 		V_tcp_hostcache.hashbase[i].hch_length = 0;
 		mtx_init(&V_tcp_hostcache.hashbase[i].hch_mtx, "tcp_hc_entry",
 			  NULL, MTX_DEF);
 	}
 
 	/*
 	 * Allocate the hostcache entries.
 	 * The zone is capped at cache_limit items.
 	 */
 	V_tcp_hostcache.zone =
 	    uma_zcreate("hostcache", sizeof(struct hc_metrics),
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
 	uma_zone_set_max(V_tcp_hostcache.zone, V_tcp_hostcache.cache_limit);
 
 	/*
 	 * Set up periodic cache cleanup.
 	 * The current vnet is handed to tcp_hc_purge() as its argument.
 	 */
 	callout_init(&V_tcp_hc_callout, 1);
 	callout_reset(&V_tcp_hc_callout, V_tcp_hostcache.prune * hz,
 	    tcp_hc_purge, curvnet);
 }
 
 #ifdef VIMAGE
 /*
  * Tear down the hostcache of the current vnet.  The purge callout is drained
  * first, then every entry is released, and finally the UMA zone, the per-row
  * mutexes and the hash table itself are destroyed.
  */
 void
 tcp_hc_destroy(void)
 {
 	int row;
 
 	/* Stop the periodic purge before touching anything it uses. */
 	callout_drain(&V_tcp_hc_callout);
 
 	/* Drop every cached entry, expired or not. */
 	tcp_hc_purge_internal(1);
 
 	/* All entries are gone, so the zone can be destroyed. */
 	uma_zdestroy(V_tcp_hostcache.zone);
 
 	/* Finally release the row locks and the table that holds them. */
 	row = 0;
 	while (row < V_tcp_hostcache.hashsize) {
 		mtx_destroy(&V_tcp_hostcache.hashbase[row].hch_mtx);
 		row++;
 	}
 	free(V_tcp_hostcache.hashbase, M_HOSTCACHE);
 }
 #endif
 
 /*
  * Internal function: find the hostcache entry that matches the foreign
  * address of 'inc'.
  *
  * Returns NULL when the hostcache is disabled or no entry matches; no lock
  * is held on return in that case.  Otherwise the matching entry is returned
  * with its expiry timer refreshed and its bucket row still locked, and the
  * caller has to unlock the row once it is done with the entry.
  *
  * 'update' only selects which per-entry counter is bumped when the kernel
  * is built with TCP_HC_COUNTERS.
  */
 static struct hc_metrics *
 tcp_hc_lookup(struct in_conninfo *inc, bool update)
 {
 	struct hc_metrics *cand;
 	struct hc_head *row;
 	int slot;
 
 	if (!V_tcp_use_hostcache)
 		return NULL;
 
 	KASSERT(inc != NULL, ("tcp_hc_lookup with NULL in_conninfo pointer"));
 
 	/* The foreign address alone selects the bucket row. */
 	if ((inc->inc_flags & INC_ISIPV6) != 0)
 		slot = HOSTCACHE_HASH6(&inc->inc6_faddr);
 	else
 		slot = HOSTCACHE_HASH(&inc->inc_faddr);
 	row = &V_tcp_hostcache.hashbase[slot];
 
 	/* Held across the return when an entry is found. */
 	THC_LOCK(&row->hch_mtx);
 
 	/* Walk the row; the cursor ends up NULL when nothing matches. */
 	TAILQ_FOREACH(cand, &row->hch_bucket, rmx_q) {
 		if ((inc->inc_flags & INC_ISIPV6) != 0) {
 			/* XXX: check ip6_zoneid */
 			if (memcmp(&inc->inc6_faddr, &cand->ip6,
 			    sizeof(inc->inc6_faddr)) == 0)
 				break;
 		} else if (memcmp(&inc->inc_faddr, &cand->ip4,
 		    sizeof(inc->inc_faddr)) == 0)
 			break;
 	}
 
 	if (cand == NULL) {
 		/* Miss: nothing to hand out, so drop the row lock here. */
 		THC_UNLOCK(&row->hch_mtx);
 		return (NULL);
 	}
 
 #ifdef	TCP_HC_COUNTERS
 	if (update)
 		cand->rmx_updates++;
 	else
 		cand->rmx_hits++;
 #endif
 	/* Any hit restarts the lifetime of the entry. */
 	cand->rmx_expire = V_tcp_hostcache.expire;
 
 	return (cand);
 }
 
 /*
  * Internal function: insert an entry into the hostcache or return NULL if
  * unable to allocate a new one.
  *
  * If an entry has been returned, the caller becomes responsible for
  * unlocking the bucket row after he is done reading/modifying the entry.
  */
 static struct hc_metrics *
 tcp_hc_insert(struct in_conninfo *inc)
 {
 	int hash;
 	struct hc_head *hc_head;
 	struct hc_metrics *hc_entry;
 
 	if (!V_tcp_use_hostcache)
 		return NULL;
 
 	KASSERT(inc != NULL, ("tcp_hc_insert with NULL in_conninfo pointer"));
 
 	/*
 	 * Hash the foreign ip address.
 	 */
 	if (inc->inc_flags & INC_ISIPV6)
 		hash = HOSTCACHE_HASH6(&inc->inc6_faddr);
 	else
 		hash = HOSTCACHE_HASH(&inc->inc_faddr);
 
 	hc_head = &V_tcp_hostcache.hashbase[hash];
 
 	/*
 	 * Acquire lock for this bucket row; we release the lock if we don't
 	 * find an entry, otherwise the caller has to unlock after he is
 	 * done.
 	 */
 	THC_LOCK(&hc_head->hch_mtx);
 
 	/*
 	 * If the bucket limit is reached, reuse the least-used element.
 	 */
 	if (hc_head->hch_length >= V_tcp_hostcache.bucket_limit ||
 	    atomic_load_int(&V_tcp_hostcache.cache_count) >= V_tcp_hostcache.cache_limit) {
 		hc_entry = TAILQ_LAST(&hc_head->hch_bucket, hc_qhead);
 		/*
 		 * At first we were dropping the last element, just to
 		 * reacquire it in the next two lines again, which isn't very
 		 * efficient.  Instead just reuse the least used element.
 		 * We may drop something that is still "in-use" but we can be
 		 * "lossy".
 		 * Just give up if this bucket row is empty and we don't have
 		 * anything to replace.
 		 */
 		if (hc_entry == NULL) {
 			THC_UNLOCK(&hc_head->hch_mtx);
 			return NULL;
 		}
 		TAILQ_REMOVE(&hc_head->hch_bucket, hc_entry, rmx_q);
 		KASSERT(V_tcp_hostcache.hashbase[hash].hch_length > 0 &&
 			V_tcp_hostcache.hashbase[hash].hch_length <=
 			V_tcp_hostcache.bucket_limit,
 			("tcp_hostcache: bucket length range violated at %u: %u",
 			hash, V_tcp_hostcache.hashbase[hash].hch_length));
 		V_tcp_hostcache.hashbase[hash].hch_length--;
 		atomic_subtract_int(&V_tcp_hostcache.cache_count, 1);
 		TCPSTAT_INC(tcps_hc_bucketoverflow);
 #if 0
 		uma_zfree(V_tcp_hostcache.zone, hc_entry);
 #endif
 	} else {
 		/*
 		 * Allocate a new entry, or balk if not possible.
 		 */
 		hc_entry = uma_zalloc(V_tcp_hostcache.zone, M_NOWAIT);
 		if (hc_entry == NULL) {
 			THC_UNLOCK(&hc_head->hch_mtx);
 			return NULL;
 		}
 	}
 
 	/*
 	 * Initialize basic information of hostcache entry.
 	 */
 	bzero(hc_entry, sizeof(*hc_entry));
 	if (inc->inc_flags & INC_ISIPV6) {
 		hc_entry->ip6 = inc->inc6_faddr;
 		hc_entry->ip6_zoneid = inc->inc6_zoneid;
 	} else
 		hc_entry->ip4 = inc->inc_faddr;
 	hc_entry->rmx_head = hc_head;
 	hc_entry->rmx_expire = V_tcp_hostcache.expire;
 
 	/*
 	 * Put it upfront.
 	 */
 	TAILQ_INSERT_HEAD(&hc_head->hch_bucket, hc_entry, rmx_q);
 	V_tcp_hostcache.hashbase[hash].hch_length++;
 	KASSERT(V_tcp_hostcache.hashbase[hash].hch_length <
 		V_tcp_hostcache.bucket_limit,
 		("tcp_hostcache: bucket length too high at %u: %u",
 		hash, V_tcp_hostcache.hashbase[hash].hch_length));
 	atomic_add_int(&V_tcp_hostcache.cache_count, 1);
 	TCPSTAT_INC(tcps_hc_added);
 
 	return hc_entry;
 }
 
 /*
  * External function: copy the cached metrics for the foreign address of
  * 'inc' into the caller-supplied hc_metrics_lite.  The whole structure is
  * zeroed when the hostcache is disabled or holds no entry for that address.
  */
 void
 tcp_hc_get(struct in_conninfo *inc, struct hc_metrics_lite *hc_metrics_lite)
 {
 	struct hc_metrics *cached;
 
 	/* Only consult the cache when it is enabled. */
 	cached = NULL;
 	if (V_tcp_use_hostcache)
 		cached = tcp_hc_lookup(inc, false);
 
 	/* Disabled or miss: report "nothing known". */
 	if (cached == NULL) {
 		bzero(hc_metrics_lite, sizeof(*hc_metrics_lite));
 		return;
 	}
 
 	/* Hit: the bucket row is locked while the values are copied out. */
 	hc_metrics_lite->rmx_mtu = cached->rmx_mtu;
 	hc_metrics_lite->rmx_ssthresh = cached->rmx_ssthresh;
 	hc_metrics_lite->rmx_rtt = cached->rmx_rtt;
 	hc_metrics_lite->rmx_rttvar = cached->rmx_rttvar;
 	hc_metrics_lite->rmx_cwnd = cached->rmx_cwnd;
 	hc_metrics_lite->rmx_sendpipe = cached->rmx_sendpipe;
 	hc_metrics_lite->rmx_recvpipe = cached->rmx_recvpipe;
 
 	/* Release the row that tcp_hc_lookup() left locked. */
 	THC_UNLOCK(&cached->rmx_head->hch_mtx);
 }
 
 /*
  * External function: return the path MTU cached for the foreign address of
  * 'inc'.  The result is 0 when the hostcache is disabled, when there is no
  * entry for that address, or when the entry carries no MTU.
  */
 uint32_t
 tcp_hc_getmtu(struct in_conninfo *inc)
 {
 	struct hc_metrics *cached;
 	uint32_t path_mtu;
 
 	if (!V_tcp_use_hostcache)
 		return 0;
 
 	path_mtu = 0;
 	cached = tcp_hc_lookup(inc, false);
 	if (cached != NULL) {
 		/* Read under the row lock, then release it. */
 		path_mtu = cached->rmx_mtu;
 		THC_UNLOCK(&cached->rmx_head->hch_mtx);
 	}
 	return path_mtu;
 }
 
 /*
  * External function: record 'mtu' as the path MTU for the foreign address
  * of 'inc'.  An entry is created when none exists yet; if that fails, or if
  * the hostcache is disabled, the call does nothing.
  */
 void
 tcp_hc_updatemtu(struct in_conninfo *inc, uint32_t mtu)
 {
 	struct hc_metrics *ent;
 
 	if (!V_tcp_use_hostcache)
 		return;
 
 	/* Use the existing entry, or fall back to creating one. */
 	ent = tcp_hc_lookup(inc, true);
 	if (ent == NULL)
 		ent = tcp_hc_insert(inc);
 	if (ent == NULL)
 		return;
 
 	/* From here on the bucket row of 'ent' is locked. */
 	ent->rmx_mtu = mtu;
 
 	/* Move the entry to the front so the next lookup finds it sooner. */
 	TAILQ_REMOVE(&ent->rmx_head->hch_bucket, ent, rmx_q);
 	TAILQ_INSERT_HEAD(&ent->rmx_head->hch_bucket, ent, rmx_q);
 
 	THC_UNLOCK(&ent->rmx_head->hch_mtx);
 }
 
 /*
  * External function: update the TCP metrics of an entry in the hostcache.
  * Creates a new entry if none was found.
  */
 void
 tcp_hc_update(struct in_conninfo *inc, struct hc_metrics_lite *hcml)
 {
 	struct hc_metrics *hc_entry;
 
 	if (!V_tcp_use_hostcache)
 		return;
 
 	hc_entry = tcp_hc_lookup(inc, true);
 	if (hc_entry == NULL) {
 		hc_entry = tcp_hc_insert(inc);
 		if (hc_entry == NULL)
 			return;
 	}
 
 	if (hcml->rmx_rtt != 0) {
 		if (hc_entry->rmx_rtt == 0)
 			hc_entry->rmx_rtt = hcml->rmx_rtt;
 		else
 			hc_entry->rmx_rtt = ((uint64_t)hc_entry->rmx_rtt +
 			    (uint64_t)hcml->rmx_rtt) / 2;
 		TCPSTAT_INC(tcps_cachedrtt);
 	}
 	if (hcml->rmx_rttvar != 0) {
 	        if (hc_entry->rmx_rttvar == 0)
 			hc_entry->rmx_rttvar = hcml->rmx_rttvar;
 		else
 			hc_entry->rmx_rttvar = ((uint64_t)hc_entry->rmx_rttvar +
 			    (uint64_t)hcml->rmx_rttvar) / 2;
 		TCPSTAT_INC(tcps_cachedrttvar);
 	}
 	if (hcml->rmx_ssthresh != 0) {
 		if (hc_entry->rmx_ssthresh == 0)
 			hc_entry->rmx_ssthresh = hcml->rmx_ssthresh;
 		else
 			hc_entry->rmx_ssthresh =
 			    (hc_entry->rmx_ssthresh + hcml->rmx_ssthresh) / 2;
 		TCPSTAT_INC(tcps_cachedssthresh);
 	}
 	if (hcml->rmx_cwnd != 0) {
 		if (hc_entry->rmx_cwnd == 0)
 			hc_entry->rmx_cwnd = hcml->rmx_cwnd;
 		else
 			hc_entry->rmx_cwnd = ((uint64_t)hc_entry->rmx_cwnd +
 			    (uint64_t)hcml->rmx_cwnd) / 2;
 		/* TCPSTAT_INC(tcps_cachedcwnd); */
 	}
 	if (hcml->rmx_sendpipe != 0) {
 		if (hc_entry->rmx_sendpipe == 0)
 			hc_entry->rmx_sendpipe = hcml->rmx_sendpipe;
 		else
 			hc_entry->rmx_sendpipe =
 			    ((uint64_t)hc_entry->rmx_sendpipe +
 			    (uint64_t)hcml->rmx_sendpipe) /2;
 		/* TCPSTAT_INC(tcps_cachedsendpipe); */
 	}
 	if (hcml->rmx_recvpipe != 0) {
 		if (hc_entry->rmx_recvpipe == 0)
 			hc_entry->rmx_recvpipe = hcml->rmx_recvpipe;
 		else
 			hc_entry->rmx_recvpipe =
 			    ((uint64_t)hc_entry->rmx_recvpipe +
 			    (uint64_t)hcml->rmx_recvpipe) /2;
 		/* TCPSTAT_INC(tcps_cachedrecvpipe); */
 	}
 
 	TAILQ_REMOVE(&hc_entry->rmx_head->hch_bucket, hc_entry, rmx_q);
 	TAILQ_INSERT_HEAD(&hc_entry->rmx_head->hch_bucket, hc_entry, rmx_q);
 	THC_UNLOCK(&hc_entry->rmx_head->hch_mtx);
 }
 
 /*
  * Sysctl function: prints the list and values of all hostcache entries in
  * unsorted order.
  *
  * Returns EPERM for callers in a jail without its own vnet.  A size-only
  * query (no output buffer) is answered with an estimate of
  * (cache_count + 1) lines of 'linesize' bytes.  Otherwise the table is
  * walked row by row and one text line is emitted per entry.
  */
 static int
 sysctl_tcp_hc_list(SYSCTL_HANDLER_ARGS)
 {
 	/* Per-entry size estimate used for the length query and the sbuf. */
 	const int linesize = 128;
 	struct sbuf sb;
 	int i, error, len;
 	struct hc_metrics *hc_entry;
 	char ip4buf[INET_ADDRSTRLEN];
 #ifdef INET6
 	char ip6buf[INET6_ADDRSTRLEN];
 #endif
 
 	if (jailed_without_vnet(curthread->td_ucred) != 0)
 		return (EPERM);
 
 	/* Optimize Buffer length query by sbin/sysctl */
 	if (req->oldptr == NULL) {
 		len = (atomic_load_int(&V_tcp_hostcache.cache_count) + 1) *
 			linesize;
 		return (SYSCTL_OUT(req, NULL, len));
 	}
 
 	error = sysctl_wire_old_buffer(req, 0);
 	if (error != 0) {
 		return(error);
 	}
 
 	/* Use a buffer sized for one full bucket */
 	sbuf_new_for_sysctl(&sb, NULL, V_tcp_hostcache.bucket_limit *
 		linesize, req);
 
 	/* Column header; the HITS/UPD columns exist only with TCP_HC_COUNTERS. */
 	sbuf_printf(&sb,
 		"\nIP address        MTU  SSTRESH      RTT   RTTVAR "
 		"    CWND SENDPIPE RECVPIPE "
 #ifdef	TCP_HC_COUNTERS
 		"HITS  UPD  "
 #endif
 		"EXP\n");
 	sbuf_drain(&sb);
 
 /* Divide by 1000, rounding to the nearest integer. */
 #define msec(u) (((u) + 500) / 1000)
 	for (i = 0; i < V_tcp_hostcache.hashsize; i++) {
 		THC_LOCK(&V_tcp_hostcache.hashbase[i].hch_mtx);
 		TAILQ_FOREACH(hc_entry, &V_tcp_hostcache.hashbase[i].hch_bucket,
 			      rmx_q) {
 			/*
 			 * An entry whose ip4 address is zero is printed as
 			 * an IPv6 entry (or as "IPv6?" without INET6).
 			 */
 			sbuf_printf(&sb,
 			    "%-15s %5u %8u %6lums %6lums %8u %8u %8u "
 #ifdef	TCP_HC_COUNTERS
 			    "%4lu %4lu "
 #endif
 			    "%4i\n",
 			    hc_entry->ip4.s_addr ?
 			        inet_ntoa_r(hc_entry->ip4, ip4buf) :
 #ifdef INET6
 				ip6_sprintf(ip6buf, &hc_entry->ip6),
 #else
 				"IPv6?",
 #endif
 			    hc_entry->rmx_mtu,
 			    hc_entry->rmx_ssthresh,
 			    msec((u_long)hc_entry->rmx_rtt *
 				(RTM_RTTUNIT / (hz * TCP_RTT_SCALE))),
 			    msec((u_long)hc_entry->rmx_rttvar *
 				(RTM_RTTUNIT / (hz * TCP_RTTVAR_SCALE))),
 			    hc_entry->rmx_cwnd,
 			    hc_entry->rmx_sendpipe,
 			    hc_entry->rmx_recvpipe,
 #ifdef	TCP_HC_COUNTERS
 			    hc_entry->rmx_hits,
 			    hc_entry->rmx_updates,
 #endif
 			    hc_entry->rmx_expire);
 		}
 		THC_UNLOCK(&V_tcp_hostcache.hashbase[i].hch_mtx);
 		/*
 		 * Flush after the row lock has been released; presumably so
 		 * that the output is never copied out with the mutex held.
 		 */
 		sbuf_drain(&sb);
 	}
 #undef msec
 	error = sbuf_finish(&sb);
 	sbuf_delete(&sb);
 	return(error);
 }
 
 /*
  * Sysctl function: report, for every possible bucket row length from 0 to
  * bucket_limit, how many rows of the hash table currently have that length.
  *
  * Returns EPERM for callers in a jail without its own vnet and ENOMEM when
  * the temporary counter array cannot be allocated.
  */
 static int
 sysctl_tcp_hc_histo(SYSCTL_HANDLER_ARGS)
 {
 	const int linesize = 50;
 	struct sbuf sb;
 	int *tally;
 	int n, error;
 	u_int len;
 
 	if (jailed_without_vnet(curthread->td_ucred) != 0)
 		return (EPERM);
 
 	/* One zeroed counter per possible length, 0..bucket_limit inclusive. */
 	tally = (int *)malloc(sizeof(int) * (V_tcp_hostcache.bucket_limit + 1),
 	    M_TEMP, M_NOWAIT|M_ZERO);
 	if (tally == NULL)
 		return(ENOMEM);
 
 	/* Row lengths are sampled without taking the row locks. */
 	n = 0;
 	while (n < V_tcp_hostcache.hashsize) {
 		len = V_tcp_hostcache.hashbase[n].hch_length;
 		KASSERT(len <= V_tcp_hostcache.bucket_limit,
 			("tcp_hostcache: bucket limit exceeded at %u: %u",
 			n, len));
 		tally[len]++;
 		n++;
 	}
 
 	/* Use a buffer for 16 lines */
 	sbuf_new_for_sysctl(&sb, NULL, 16 * linesize, req);
 
 	sbuf_printf(&sb, "\nLength\tCount\n");
 	for (n = 0; n <= V_tcp_hostcache.bucket_limit; n++)
 		sbuf_printf(&sb, "%u\t%u\n", n, tally[n]);
 
 	error = sbuf_finish(&sb);
 	sbuf_delete(&sb);
 	free(tally, M_TEMP);
 	return(error);
 }
 
 /*
  * Walk every bucket row and free the entries that are due.  With 'all'
  * non-zero every entry is freed; otherwise only entries whose lifetime has
  * run out are freed and the lifetime of the others is reduced by the prune
  * interval.
  *
  * Caller has to make sure the curvnet is set properly.
  */
 static void
 tcp_hc_purge_internal(int all)
 {
 	struct hc_metrics *cur, *following;
 	int row;
 
 	row = 0;
 	while (row < V_tcp_hostcache.hashsize) {
 		THC_LOCK(&V_tcp_hostcache.hashbase[row].hch_mtx);
 		TAILQ_FOREACH_SAFE(cur,
 		    &V_tcp_hostcache.hashbase[row].hch_bucket, rmx_q,
 		    following) {
 			KASSERT(V_tcp_hostcache.hashbase[row].hch_length > 0 &&
 				V_tcp_hostcache.hashbase[row].hch_length <=
 				V_tcp_hostcache.bucket_limit,
 				("tcp_hostcache: bucket length out of range at %u: %u",
 				row, V_tcp_hostcache.hashbase[row].hch_length));
 			if (!all && cur->rmx_expire > 0) {
 				/* Still alive: just age it. */
 				cur->rmx_expire -= V_tcp_hostcache.prune;
 				continue;
 			}
 			/* Unlink and free, then fix up both counters. */
 			TAILQ_REMOVE(&V_tcp_hostcache.hashbase[row].hch_bucket,
 			    cur, rmx_q);
 			uma_zfree(V_tcp_hostcache.zone, cur);
 			V_tcp_hostcache.hashbase[row].hch_length--;
 			atomic_subtract_int(&V_tcp_hostcache.cache_count, 1);
 		}
 		THC_UNLOCK(&V_tcp_hostcache.hashbase[row].hch_mtx);
 		row++;
 	}
 }
 
 /*
  * Expire and purge (old|all) entries in the tcp_hostcache.  Runs
  * periodically from the callout.
  *
  * 'arg' is the vnet the callout was armed for.  A non-zero purgeall knob
  * turns this run into a full purge and is reset to 0; the value 2 also
  * draws a new hash salt first.  The callout re-arms itself at the end.
  */
 static void
 tcp_hc_purge(void *arg)
 {
 	CURVNET_SET((struct vnet *) arg);
 	int all = 0;
 
 	if (V_tcp_hostcache.purgeall) {
 		/*
 		 * NOTE(review): the salt changes before the old entries are
 		 * removed and without any row lock held; lookups or inserts
 		 * running concurrently may hash with either salt -- confirm
 		 * this is acceptable for a lossy cache.
 		 */
+		if (V_tcp_hostcache.purgeall == 2)
+			V_tcp_hostcache.hashsalt = arc4random();
 		all = 1;
 		V_tcp_hostcache.purgeall = 0;
 	}
 
 	tcp_hc_purge_internal(all);
 
 	/* Schedule the next run one prune interval from now. */
 	callout_reset(&V_tcp_hc_callout, V_tcp_hostcache.prune * hz,
 	    tcp_hc_purge, arg);
 	CURVNET_RESTORE();
 }
 
 /*
  * Expire and purge all entries in hostcache immediately.
  *
  * Reads always report 0.  Any written value triggers the purge; the value 2
  * additionally draws a new hash salt before the entries are removed.  The
  * periodic purge callout is restarted afterwards.
  */
 static int
 sysctl_tcp_hc_purgenow(SYSCTL_HANDLER_ARGS)
 {
 	int error, val;
 
 	val = 0;
 	error = sysctl_handle_int(oidp, &val, 0, req);
 	/* Nothing more to do on error or when no new value was supplied. */
 	if (error || !req->newptr)
 		return (error);
 
+	if (val == 2)
+		V_tcp_hostcache.hashsalt = arc4random();
 	tcp_hc_purge_internal(1);
 
 	/* Restart the purge timer from now. */
 	callout_reset(&V_tcp_hc_callout, V_tcp_hostcache.prune * hz,
 	    tcp_hc_purge, curvnet);
 
 	return (0);
 }