Index: stable/12/sys/modules/tcp/rack/Makefile
===================================================================
--- stable/12/sys/modules/tcp/rack/Makefile	(revision 362879)
+++ stable/12/sys/modules/tcp/rack/Makefile	(revision 362880)
@@ -1,23 +1,23 @@
 #
 # $FreeBSD$
 #
 
 .PATH: ${.CURDIR}/../../../netinet/tcp_stacks
 
 STACKNAME=	rack
 KMOD=	tcp_${STACKNAME}
-SRCS=	rack.c sack_filter.c
+SRCS=	rack.c sack_filter.c rack_bbr_common.c
 
 SRCS+=	opt_inet.h opt_inet6.h opt_ipsec.h
 SRCS+=	opt_tcpdebug.h
 SRCS+=	opt_kern_tls.h
 
 #
 # Enable full debugging
 #
 #CFLAGS += -g
 
 CFLAGS+=	-DMODNAME=${KMOD}
 CFLAGS+=	-DSTACKNAME=${STACKNAME}
 
 .include <bsd.kmod.mk>
Index: stable/12/sys/netinet/in_pcb.h
===================================================================
--- stable/12/sys/netinet/in_pcb.h	(revision 362879)
+++ stable/12/sys/netinet/in_pcb.h	(revision 362880)
@@ -1,897 +1,899 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1982, 1986, 1990, 1993
  *	The Regents of the University of California.
  * Copyright (c) 2010-2011 Juniper Networks, Inc.
  * All rights reserved.
  *
  * Portions of this software were developed by Robert N. M. Watson under
  * contract to Juniper Networks, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)in_pcb.h	8.1 (Berkeley) 6/10/93
  * $FreeBSD$
  */
 
 #ifndef _NETINET_IN_PCB_H_
 #define _NETINET_IN_PCB_H_
 
 #include <sys/queue.h>
 #include <sys/epoch.h>
 #include <sys/_lock.h>
 #include <sys/_mutex.h>
 #include <sys/_rwlock.h>
 #include <net/route.h>
 
 #ifdef _KERNEL
 #include <sys/lock.h>
 #include <sys/rwlock.h>
 #include <net/vnet.h>
 #include <net/if.h>
 #include <net/if_var.h>
 #include <vm/uma.h>
 #endif
 #include <sys/ck.h>
 
 #define	in6pcb		inpcb	/* for KAME src sync over BSD*'s */
 #define	in6p_sp		inp_sp	/* for KAME src sync over BSD*'s */
 
 /*
  * struct inpcb is the common protocol control block structure used in most
  * IP transport protocols.
  *
  * Pointers to local and foreign host table entries, local and foreign socket
  * numbers, and pointers up (to a socket structure) and down (to a
  * protocol-specific control block) are stored here.
  */
 CK_LIST_HEAD(inpcbhead, inpcb);			/* head of a list of inpcb */
 CK_LIST_HEAD(inpcbporthead, inpcbport);		/* head of a list of inpcbport */
 CK_LIST_HEAD(inpcblbgrouphead, inpcblbgroup);	/* head of a list of inpcblbgroup */
 typedef	uint64_t	inp_gen_t;	/* generation count (inp_gencnt, xig_gen) */
 
 /*
  * PCB with AF_INET6 null bind'ed laddr can receive AF_INET input packet.
  * So, AF_INET6 null laddr is also used as AF_INET null laddr, by utilizing
  * the following structure.
  */
 struct in_addr_4in6 {
 	u_int32_t	ia46_pad32[3];	/* 12 bytes of padding before the v4 address */
 	struct	in_addr	ia46_addr4;	/* IPv4 address, placed after the padding */
 };
 
 /* One address slot that can be read either as IPv4-in-IPv6 or as IPv6. */
 union in_dependaddr {
 	struct in_addr_4in6 id46_addr;	/* padded IPv4 view */
 	struct in6_addr	id6_addr;	/* IPv6 view */
 };
 
 /*
  * NOTE: ipv6 addrs should be 64-bit aligned, per RFC 2553.  in_conninfo has
  * some extra padding to accomplish this.
  * NOTE 2: tcp_syncache.c uses first 5 32-bit words, which identify fport,
  * lport, faddr to generate hash, so these fields shouldn't be moved.
  */
 struct in_endpoints {
 	u_int16_t	ie_fport;		/* foreign port */
 	u_int16_t	ie_lport;		/* local port */
 	/* protocol dependent part, local and foreign addr */
 	union in_dependaddr ie_dependfaddr;	/* foreign host table entry */
 	union in_dependaddr ie_dependladdr;	/* local host table entry */
 	/*
 	 * Shorthand member paths into the two unions above: ie_* select the
 	 * IPv4 view, ie6_* select the IPv6 view.
 	 */
 #define	ie_faddr	ie_dependfaddr.id46_addr.ia46_addr4
 #define	ie_laddr	ie_dependladdr.id46_addr.ia46_addr4
 #define	ie6_faddr	ie_dependfaddr.id6_addr
 #define	ie6_laddr	ie_dependladdr.id6_addr
 	u_int32_t	ie6_zoneid;		/* scope zone id */
 };
 
 /*
  * XXX The defines for inc_* are hacks and should be changed to direct
  * references.
  */
 struct in_conninfo {
 	u_int8_t	inc_flags;	/* INC_* flags, see below */
 	u_int8_t	inc_len;
 	u_int16_t	inc_fibnum;	/* XXX was pad, 16 bits is plenty */
 	/* protocol dependent part */
 	struct	in_endpoints inc_ie;
 };
 
 /*
  * Flags for inc_flags.
  */
 #define	INC_ISIPV6	0x01
 #define	INC_IPV6MINMTU	0x02
 
 /*
  * Shorthand for the in_endpoints members nested in inc_ie.  Each expands to
  * a member path, so it is used as "inc->inc_fport" or "inc.inc_fport".
  */
 #define	inc_fport	inc_ie.ie_fport
 #define	inc_lport	inc_ie.ie_lport
 #define	inc_faddr	inc_ie.ie_faddr
 #define	inc_laddr	inc_ie.ie_laddr
 #define	inc6_faddr	inc_ie.ie6_faddr
 #define	inc6_laddr	inc_ie.ie6_laddr
 #define	inc6_zoneid	inc_ie.ie6_zoneid
 
 #if defined(_KERNEL) || defined(_WANT_INPCB)
 /*
  * struct inpcb captures the network layer state for TCP, UDP, and raw IPv4 and
  * IPv6 sockets.  In the case of TCP and UDP, further per-connection state is
  * hung off of inp_ppcb most of the time.  Almost all fields of struct inpcb
  * are static after creation or protected by a per-inpcb rwlock, inp_lock.  A
  * few fields are protected by multiple locks as indicated in the locking notes
  * below.  For these fields, all of the listed locks must be write-locked for
  * any modifications.  However, these fields can be safely read while any one of
  * the listed locks are read-locked.  This model can permit greater concurrency
  * for read operations.  For example, connections can be looked up while only
  * holding a read lock on the global pcblist lock.  This is important for
  * performance when attempting to find the connection for a packet given its IP
  * and port tuple.
  *
  * One noteworthy exception is that the global pcbinfo lock follows a different
  * set of rules in relation to the inp_list field.  Rather than being
  * write-locked for modifications and read-locked for list iterations, it must
  * be read-locked during modifications and write-locked during list iterations.
  * This ensures that the relatively rare global list iterations safely walk a
  * stable snapshot of connections while allowing more common list modifications
  * to safely grab the pcblist lock just while adding or removing a connection
  * from the global list.
  *
  * Key:
  * (b) - Protected by the hpts lock.
  * (c) - Constant after initialization
  * (e) - Protected by the net_epoch_preempt epoch
  * (g) - Protected by the pcbgroup lock
  * (i) - Protected by the inpcb lock
  * (p) - Protected by the pcbinfo lock for the inpcb
  * (l) - Protected by the pcblist lock for the inpcb
  * (h) - Protected by the pcbhash lock for the inpcb
  * (s) - Protected by another subsystem's locks
  * (x) - Undefined locking
  * 
  * Notes on the tcp_hpts:
  * 
  * First Hpts lock order is
  * 1) INP_WLOCK()
  * 2) HPTS_LOCK() i.e. hpts->pmtx 
  *
  * To insert a TCB on the hpts you *must* be holding the INP_WLOCK(). 
  * You may check the inp->inp_in_hpts flag without the hpts lock. 
  * The hpts is the only one that will clear this flag holding 
  * only the hpts lock. This means that in your tcp_output()
  * routine when you test for the inp_in_hpts flag to be 1 
  * it may be transitioning to 0 (by the hpts). 
  * That's ok since that will just mean an extra call to tcp_output 
  * that most likely will find the call you executed
  * (when the mis-match occurred) will have put the TCB back 
  * on the hpts and it will return. If your
  * call did not add the inp back to the hpts then you will either
  * over-send or the cwnd will block you from sending more.
  *
  * Note you should also be holding the INP_WLOCK() when you
  * call the remove from the hpts as well. Though usually
  * you are either doing this from a timer, where you need and have
  * the INP_WLOCK() or from destroying your TCB where again
  * you should already have the INP_WLOCK().
  *
  * The inp_hpts_cpu, inp_hpts_cpu_set, inp_input_cpu and 
  * inp_input_cpu_set fields are controlled completely by
  * the hpts. Do not ever set these. The inp_hpts_cpu_set
  * and inp_input_cpu_set fields indicate if the hpts has
  * setup the respective cpu field. It is advised if this
  * field is 0, to enqueue the packet with the appropriate
  * hpts_immediate() call. If the _set field is 1, then
  * you may compare the inp_*_cpu field to the curcpu and
  * may want to again insert onto the hpts if these fields
  * are not equal (i.e. you are not on the expected CPU).
  *
  * A note on inp_hpts_calls and inp_input_calls, these
  * flags are set when the hpts calls either the output
  * or do_segment routines respectively. If the routine
  * being called wants to use this, then it needs to
  * clear the flag before returning. The hpts will not
  * clear the flag. The flags can be used to tell if
  * the hpts is the function calling the respective
  * routine.
  *
  * A few other notes:
  *
  * When a read lock is held, stability of the field is guaranteed; to write
  * to a field, a write lock must generally be held.
  *
  * netinet/netinet6-layer code should not assume that the inp_socket pointer
  * is safe to dereference without inp_lock being held, even for protocols
  * other than TCP (where the inpcb persists during TIMEWAIT even after the
  * socket has been freed), or there may be close(2)-related races.
  *
  * The inp_vflag field is overloaded, and would otherwise ideally be (c).
  *
  * TODO:  Currently only the TCP stack is leveraging the global pcbinfo lock
  * read-lock usage during modification, this model can be applied to other
  * protocols (especially SCTP).
  */
 /* Forward declarations: only pointers to these types are stored below. */
 struct icmp6_filter;
 struct inpcbpolicy;
 struct m_snd_tag;
 struct inpcb {
 	/* Cache line #1 (amd64) */
 	CK_LIST_ENTRY(inpcb) inp_hash;	/* [w](h/i) [r](e/i)  hash list */
 	CK_LIST_ENTRY(inpcb) inp_pcbgrouphash;	/* (g/i) hash list */
 	struct rwlock	inp_lock;
 	/* Cache line #2 (amd64) */
 	/*
 	 * inp_zero_size is the byte count from inp_hpts to the end of the
 	 * structure.  Presumably this is the region cleared when an inpcb is
 	 * (re)initialised -- TODO confirm against the allocator.
 	 */
 #define	inp_start_zero	inp_hpts
 #define	inp_zero_size	(sizeof(struct inpcb) - \
 			    offsetof(struct inpcb, inp_start_zero))
 	TAILQ_ENTRY(inpcb) inp_hpts;	/* pacing out queue next lock(b) */
 
 	uint32_t inp_hpts_request;	/* Current hpts request, zero if
 					 * fits in the pacing window (i&b). */
 	/*
 	 * Note the next fields are protected by a
 	 * different lock (hpts-lock). This means that
 	 * they must correspond in size to the smallest
 	 * protectable bit field (uint8_t on x86, and
 	 * other platforms potentially uint32_t?). Also
 	 * since CPU switches can occur at different times the two
 	 * fields can *not* be collapsed into a single bit field.
 	 */
 #if defined(__amd64__) || defined(__i386__)	
 	volatile uint8_t inp_in_hpts; /* on output hpts (lock b) */
 	volatile uint8_t inp_in_input; /* on input hpts (lock b) */
 #else
 	volatile uint32_t inp_in_hpts; /* on output hpts (lock b) */
 	volatile uint32_t inp_in_input; /* on input hpts (lock b) */
 #endif
 	volatile uint16_t  inp_hpts_cpu; /* Lock (i) */
 	u_int	inp_refcount;		/* (i) refcount */
 	int	inp_flags;		/* (i) generic IP/datagram flags */
 	int	inp_flags2;		/* (i) generic IP/datagram flags #2*/
 	volatile uint16_t  inp_input_cpu; /* Lock (i) */
 	volatile uint8_t inp_hpts_cpu_set :1,  /* on output hpts (i) */
 			 inp_input_cpu_set : 1,	/* on input hpts (i) */
 			 inp_hpts_calls :1,	/* (i) from output hpts */
 			 inp_input_calls :1,	/* (i) from input hpts */
 			 inp_spare_bits2 : 4;
 	uint8_t inp_spare_byte;		/* Compiler hole */
 	void	*inp_ppcb;		/* (i) pointer to per-protocol pcb */
 	struct	socket *inp_socket;	/* (i) back pointer to socket */
 	uint32_t 	 inp_hptsslot;	/* Hpts wheel slot this tcb is Lock(i&b) */
 	uint32_t         inp_hpts_drop_reas;	/* reason we are dropping the PCB (lock i&b) */
 	TAILQ_ENTRY(inpcb) inp_input;	/* pacing in  queue next lock(b) */
 	struct	inpcbinfo *inp_pcbinfo;	/* (c) PCB list info */
 	struct	inpcbgroup *inp_pcbgroup; /* (g/i) PCB group list */
 	CK_LIST_ENTRY(inpcb) inp_pcbgroup_wild; /* (g/i/h) group wildcard entry */
 	struct	ucred	*inp_cred;	/* (c) cache of socket cred */
 	u_int32_t inp_flow;		/* (i) IPv6 flow information */
 	u_char	inp_vflag;		/* (i) IP version flag (v4/v6) */
 	u_char	inp_ip_ttl;		/* (i) time to live proto */
 	u_char	inp_ip_p;		/* (c) protocol proto */
 	u_char	inp_ip_minttl;		/* (i) minimum TTL or drop */
 	uint32_t inp_flowid;		/* (x) flow id / queue id */
 	struct m_snd_tag *inp_snd_tag;	/* (i) send tag for outgoing mbufs */
 	uint32_t inp_flowtype;		/* (x) M_HASHTYPE value */
 	uint32_t inp_rss_listen_bucket;	/* (x) overridden RSS listen bucket */
 
 	/* Local and foreign ports, local and foreign addr. */
 	struct	in_conninfo inp_inc;	/* (i) list for PCB's local port */
 
 	/* MAC and IPSEC policy information. */
 	struct	label *inp_label;	/* (i) MAC label */
 	struct	inpcbpolicy *inp_sp;    /* (s) for IPSEC */
 
 	/*
 	 * Protocol-dependent part; options.  These are two separate
 	 * anonymous structs (not a union), so both sets of members exist in
 	 * every inpcb.
 	 */
 	struct {
 		u_char	inp_ip_tos;		/* (i) type of service proto */
 		struct mbuf		*inp_options;	/* (i) IP options */
 		struct ip_moptions	*inp_moptions;	/* (i) mcast options */
 	};
 	struct {
 		/* (i) IP options */
 		struct mbuf		*in6p_options;
 		/* (i) IP6 options for outgoing packets */
 		struct ip6_pktopts	*in6p_outputopts;
 		/* (i) IP multicast options */
 		struct ip6_moptions	*in6p_moptions;
 		/* (i) ICMPv6 code type filter */
 		struct icmp6_filter	*in6p_icmp6filt;
 		/* (i) IPV6_CHECKSUM setsockopt */
 		int	in6p_cksum;
 		short	in6p_hops;
 	};
 	CK_LIST_ENTRY(inpcb) inp_portlist;	/* (i/h) */
 	struct	inpcbport *inp_phd;	/* (i/h) head of this list */
 	inp_gen_t	inp_gencnt;	/* (c) generation count */
 	void		*spare_ptr;	/* Spare pointer. */
 	rt_gen_t	inp_rt_cookie;	/* generation for route entry */
 	union {				/* cached L3 information */
 		struct route inp_route;
 		struct route_in6 inp_route6;
 	};
 	CK_LIST_ENTRY(inpcb) inp_list;	/* (p/l) list for all PCBs for proto */
 	                                /* (e[r]) for list iteration */
 	                                /* (p[w]/l) for addition/removal */
 	struct epoch_context inp_epoch_ctx;
 };
 #endif	/* _KERNEL || _WANT_INPCB */
 
 /*
  * Shorthand member paths for struct inpcb.  Each expands to a path below
  * inp_inc (or inp_pcbinfo), so it is written as "inp->inp_fport".
  */
 #define	inp_fport	inp_inc.inc_fport
 #define	inp_lport	inp_inc.inc_lport
 #define	inp_faddr	inp_inc.inc_faddr
 #define	inp_laddr	inp_inc.inc_laddr
 
 #define	in6p_faddr	inp_inc.inc6_faddr
 #define	in6p_laddr	inp_inc.inc6_laddr
 #define	in6p_zoneid	inp_inc.inc6_zoneid
 #define	in6p_flowinfo	inp_flow
 
 /* Dereferences inp_pcbinfo, so that pointer must be valid when used. */
 #define	inp_vnet	inp_pcbinfo->ipi_vnet
 
 /*
  * The range of the generation count, as used in this implementation, is 9e19.
  * We would have to create 300 billion connections per second for this number
  * to roll over in a year.  This seems sufficiently unlikely that we simply
  * don't concern ourselves with that possibility.
  */
 
 /*
  * Interface exported to userland by various protocols which use inpcbs.  Hack
  * alert -- only define if struct xsocket is in scope.
  * Fields prefixed with "xi_" are unique to this structure, and the rest
  * match fields in the struct inpcb, to ease coding and porting.
  *
  * Legend:
  * (s) - used by userland utilities in src
  * (p) - used by utilities in ports
  * (3) - is known to be used by third party software not in ports
  * (n) - no known usage
  */
 #ifdef _SYS_SOCKETVAR_H_
 /*
  * Userland-visible snapshot of an inpcb.  Field widths are fixed and the
  * structure is 8-byte aligned; spare members are reserved space.
  */
 struct xinpcb {
 	ksize_t		xi_len;			/* length of this structure */
 	struct xsocket	xi_socket;		/* (s,p) */
 	struct in_conninfo inp_inc;		/* (s,p) */
 	uint64_t	inp_gencnt;		/* (s,p) */
 	kvaddr_t	inp_ppcb;		/* (s) netstat(1) */
 	int64_t		inp_spare64[4];
 	uint32_t	inp_flow;		/* (s) */
 	uint32_t	inp_flowid;		/* (s) */
 	uint32_t	inp_flowtype;		/* (s) */
 	int32_t		inp_flags;		/* (s,p) */
 	int32_t		inp_flags2;		/* (s) */
 	/* NOTE(review): uint32_t in struct inpcb, int32_t here. */
 	int32_t		inp_rss_listen_bucket;	/* (n) */
 	int32_t		in6p_cksum;		/* (n) */
 	int32_t		inp_spare32[4];
 	/* NOTE(review): short in struct inpcb, uint16_t here. */
 	uint16_t	in6p_hops;		/* (n) */
 	uint8_t		inp_ip_tos;		/* (n) */
 	int8_t		pad8;
 	uint8_t		inp_vflag;		/* (s,p) */
 	uint8_t		inp_ip_ttl;		/* (n) */
 	uint8_t		inp_ip_p;		/* (n) */
 	uint8_t		inp_ip_minttl;		/* (n) */
 	int8_t		inp_spare8[4];
 } __aligned(8);
 
 /*
  * Userland-visible record carrying the PCB count and generation counts;
  * fixed layout, 8-byte aligned, with reserved spare members.
  */
 struct xinpgen {
 	ksize_t	xig_len;	/* length of this structure */
 	u_int		xig_count;	/* number of PCBs at this time */
 	uint32_t	_xig_spare32;
 	inp_gen_t	xig_gen;	/* generation count at this time */
 	so_gen_t	xig_sogen;	/* socket generation count this time */
 	uint64_t	_xig_spare64[4];
 } __aligned(8);
 #ifdef	_KERNEL
 void	in_pcbtoxinpcb(const struct inpcb *, struct xinpcb *);
 #endif
 #endif /* _SYS_SOCKETVAR_H_ */
 
 /*
  * One entry of the local-port hash (see ipi_porthashbase): a port number
  * plus the list of inpcbs linked to it through inp_portlist / inp_phd.
  */
 struct inpcbport {
 	struct epoch_context phd_epoch_ctx;
 	CK_LIST_ENTRY(inpcbport) phd_hash;	/* linkage in an inpcbporthead */
 	struct inpcbhead phd_pcblist;		/* inpcbs attached to this entry */
 	u_short phd_port;
 };
 
 /*
  * Counted array of inpcb pointers tied to a pcbinfo.  The pointer array is
  * a trailing zero-length array, so the storage for il_inp_list must be
  * allocated together with the structure.
  */
 struct in_pcblist {
 	int il_count;			/* presumably entries in il_inp_list -- confirm */
 	struct epoch_context il_epoch_ctx;
 	struct inpcbinfo *il_pcbinfo;
 	struct inpcb *il_inp_list[0];	/* zero-length trailing array */
 };
 
 /*-
  * Global data structure for each high-level protocol (UDP, TCP, ...) in both
  * IPv4 and IPv6.  Holds inpcb lists and information for managing them.
  *
  * Each pcbinfo is protected by three locks: ipi_lock, ipi_hash_lock and
  * ipi_list_lock:
  *  - ipi_lock covering the global pcb list stability during loop iteration,
  *  - ipi_hash_lock covering the hashed lookup tables,
  *  - ipi_list_lock covering mutable global fields (such as the global
  *    pcb list)
  *
  * The lock order is:
  *
  *    ipi_lock (before)
  *        inpcb locks (before)
  *            ipi_list locks (before)
  *                {ipi_hash_lock, pcbgroup locks}
  *
  * Locking key:
  *
  * (c) Constant or nearly constant after initialisation
  * (e) - Protected by the net_epoch_preempt epoch
  * (g) Locked by ipi_lock
  * (l) Locked by ipi_list_lock
  * (h) Read using either net_epoch_preempt or inpcb lock; write requires both ipi_hash_lock and inpcb lock
  * (p) Protected by one or more pcbgroup locks
  * (x) Synchronisation properties poorly defined
  */
 struct inpcbinfo {
 	/*
 	 * Global lock protecting inpcb list modification
 	 * (a mutex; see the INP_INFO_* macros).
 	 */
 	struct mtx		 ipi_lock;
 
 	/*
 	 * Global list of inpcbs on the protocol.
 	 */
 	struct inpcbhead	*ipi_listhead;		/* [r](e) [w](g/l) */
 	u_int			 ipi_count;		/* (l) */
 
 	/*
 	 * Generation count -- incremented each time a connection is allocated
 	 * or freed.
 	 */
 	u_quad_t		 ipi_gencnt;		/* (l) */
 
 	/*
 	 * Fields associated with port lookup and allocation.
 	 */
 	u_short			 ipi_lastport;		/* (x) */
 	u_short			 ipi_lastlow;		/* (x) */
 	u_short			 ipi_lasthi;		/* (x) */
 
 	/*
 	 * UMA zone from which inpcbs are allocated for this protocol.
 	 */
 	struct	uma_zone	*ipi_zone;		/* (c) */
 
 	/*
 	 * Connection groups associated with this protocol.  These fields are
 	 * constant, but pcbgroup structures themselves are protected by
 	 * per-pcbgroup locks.
 	 */
 	struct inpcbgroup	*ipi_pcbgroups;		/* (c) */
 	u_int			 ipi_npcbgroups;	/* (c) */
 	u_int			 ipi_hashfields;	/* (c) IPI_HASHFIELDS_* */
 
 	/*
 	 * Global lock protecting modification non-pcbgroup hash lookup tables.
 	 */
 	struct mtx		 ipi_hash_lock;
 
 	/*
 	 * Global hash of inpcbs, hashed by local and foreign addresses and
 	 * port numbers.
 	 */
 	struct inpcbhead	*ipi_hashbase;		/* (h) */
 	u_long			 ipi_hashmask;		/* (h) */
 
 	/*
 	 * Global hash of inpcbs, hashed by only local port number.
 	 */
 	struct inpcbporthead	*ipi_porthashbase;	/* (h) */
 	u_long			 ipi_porthashmask;	/* (h) */
 
 	/*
 	 * List of wildcard inpcbs for use with pcbgroups.  In the past, was
 	 * per-pcbgroup but is now global.  All pcbgroup locks must be held
 	 * to modify the list, so any is sufficient to read it.
 	 */
 	struct inpcbhead	*ipi_wildbase;		/* (p) */
 	u_long			 ipi_wildmask;		/* (p) */
 
 	/*
 	 * Load balance groups used for the SO_REUSEPORT_LB option,
 	 * hashed by local port.
 	 */
 	struct	inpcblbgrouphead *ipi_lbgrouphashbase;	/* (h) */
 	u_long			 ipi_lbgrouphashmask;	/* (h) */
 
 	/*
 	 * Pointer to network stack instance
 	 */
 	struct vnet		*ipi_vnet;		/* (c) */
 
 	/*
 	 * Two spare pointers for general use.
 	 */
 	void 			*ipi_pspare[2];
 
 	/*
 	 * Global lock protecting global inpcb list, inpcb count, etc.
 	 */
 	struct rwlock		 ipi_list_lock;
 };
 
 #ifdef _KERNEL
 /*
  * Connection groups hold sets of connections that have similar CPU/thread
  * affinity.  Each connection belongs to exactly one connection group.
  */
 /* Aligned to CACHE_LINE_SIZE (see the attribute on the closing brace). */
 struct inpcbgroup {
 	/*
 	 * Per-connection group hash of inpcbs, hashed by local and foreign
 	 * addresses and port numbers.
 	 */
 	struct inpcbhead	*ipg_hashbase;		/* (c) */
 	u_long			 ipg_hashmask;		/* (c) */
 
 	/*
 	 * Notional affinity of this pcbgroup.
 	 */
 	u_int			 ipg_cpu;		/* (p) */
 
 	/*
 	 * Per-connection group lock, not to be confused with ipi_lock.
 	 * Protects the hash table hung off the group, but also the global
 	 * wildcard list in inpcbinfo.
 	 */
 	struct mtx		 ipg_lock;
 } __aligned(CACHE_LINE_SIZE);
 
 /*
  * Load balance groups used for the SO_REUSEPORT_LB socket option. Each group
  * (or unique address:port combination) can be re-used at most
  * INPCBLBGROUP_SIZMAX (256) times. The inpcbs are stored in il_inp which
  * is dynamically resized as processes bind/unbind to that specific group.
  */
 struct inpcblbgroup {
 	CK_LIST_ENTRY(inpcblbgroup) il_list;	/* linkage in an inpcblbgrouphead */
 	struct epoch_context il_epoch_ctx;
 	uint16_t	il_lport;			/* (c) */
 	u_char		il_vflag;			/* (c) */
 	u_char		il_pad;
 	uint32_t	il_pad2;
 	union in_dependaddr il_dependladdr;		/* (c) */
 	/* Shorthand for the IPv4 / IPv6 views of il_dependladdr. */
 #define	il_laddr	il_dependladdr.id46_addr.ia46_addr4
 #define	il6_laddr	il_dependladdr.id6_addr
 	uint32_t	il_inpsiz; /* max count in il_inp[] (h) */
 	uint32_t	il_inpcnt; /* cur count in il_inp[] (h) */
 	/* Flexible array member: storage follows the structure. */
 	struct inpcb	*il_inp[];			/* (h) */
 };
 
 /*
  * Wrappers around the per-inpcb rwlock, inp_lock.
  *
  * INP_LOCK_INIT() initialises the lock with RW_RECURSE | RW_DUPOK and hands
  * 't' to rw_init_flags() as its second argument; the 'd' parameter is
  * accepted but does not appear in the expansion.
  */
 #define INP_LOCK_INIT(inp, d, t) \
 	rw_init_flags(&(inp)->inp_lock, (t), RW_RECURSE |  RW_DUPOK)
 #define INP_LOCK_DESTROY(inp)	rw_destroy(&(inp)->inp_lock)
 #define INP_RLOCK(inp)		rw_rlock(&(inp)->inp_lock)
 #define INP_WLOCK(inp)		rw_wlock(&(inp)->inp_lock)
 #define INP_TRY_RLOCK(inp)	rw_try_rlock(&(inp)->inp_lock)
 #define INP_TRY_WLOCK(inp)	rw_try_wlock(&(inp)->inp_lock)
 #define INP_RUNLOCK(inp)	rw_runlock(&(inp)->inp_lock)
 #define INP_WUNLOCK(inp)	rw_wunlock(&(inp)->inp_lock)
 #define	INP_TRY_UPGRADE(inp)	rw_try_upgrade(&(inp)->inp_lock)
 #define	INP_DOWNGRADE(inp)	rw_downgrade(&(inp)->inp_lock)
 #define	INP_WLOCKED(inp)	rw_wowned(&(inp)->inp_lock)
 /* Assertions on the lock state, expressed through rw_assert(). */
 #define	INP_LOCK_ASSERT(inp)	rw_assert(&(inp)->inp_lock, RA_LOCKED)
 #define	INP_RLOCK_ASSERT(inp)	rw_assert(&(inp)->inp_lock, RA_RLOCKED)
 #define	INP_WLOCK_ASSERT(inp)	rw_assert(&(inp)->inp_lock, RA_WLOCKED)
 #define	INP_UNLOCK_ASSERT(inp)	rw_assert(&(inp)->inp_lock, RA_UNLOCKED)
 
 /*
  * These locking functions are for inpcb consumers outside of sys/netinet,
  * more specifically, they were added for the benefit of TOE drivers. The
  * macros are reserved for use by the stack.
  */
 void inp_wlock(struct inpcb *);
 void inp_wunlock(struct inpcb *);
 void inp_rlock(struct inpcb *);
 void inp_runlock(struct inpcb *);
 
 /*
  * Lock-state assertions: real functions when INVARIANT_SUPPORT is defined,
  * otherwise macros that expand to an empty statement.
  */
 #ifdef INVARIANT_SUPPORT
 void inp_lock_assert(struct inpcb *);
 void inp_unlock_assert(struct inpcb *);
 #else
 #define	inp_lock_assert(inp)	do {} while (0)
 #define	inp_unlock_assert(inp)	do {} while (0)
 #endif
 
 /*
  * Function-call accessors for inpcb state.  Only the prototypes are
  * visible in this header; the implementations live elsewhere.
  */
 /* Takes a callback plus an opaque argument that is passed to it. */
 void	inp_apply_all(void (*func)(struct inpcb *, void *), void *arg);
 int 	inp_ip_tos_get(const struct inpcb *inp);
 void 	inp_ip_tos_set(struct inpcb *inp, int val);
 struct socket *
 	inp_inpcbtosocket(struct inpcb *inp);
 struct tcpcb *
 	inp_inpcbtotcpcb(struct inpcb *inp);
 /* Results are returned through the four pointer parameters. */
 void 	inp_4tuple_get(struct inpcb *inp, uint32_t *laddr, uint16_t *lp,
 		uint32_t *faddr, uint16_t *fp);
 int	inp_so_options(const struct inpcb *inp);
 
 #endif /* _KERNEL */
 
 /*
  * pcbinfo "info" lock.  The write side is the ipi_lock mutex, initialised
  * as a recursive default mutex.  The read side takes no lock: the *_ET
  * variants enter or exit the network epoch with the caller's tracker and
  * do not use 'ipi' at all.
  */
 #define INP_INFO_LOCK_INIT(ipi, d) \
 	mtx_init(&(ipi)->ipi_lock, (d), NULL, MTX_DEF| MTX_RECURSE)
 #define INP_INFO_LOCK_DESTROY(ipi)  mtx_destroy(&(ipi)->ipi_lock)
 #define INP_INFO_RLOCK_ET(ipi, et)	NET_EPOCH_ENTER_ET((et))
 #define INP_INFO_WLOCK(ipi) mtx_lock(&(ipi)->ipi_lock)
 #define INP_INFO_TRY_WLOCK(ipi)	mtx_trylock(&(ipi)->ipi_lock)
 #define INP_INFO_WLOCKED(ipi)	mtx_owned(&(ipi)->ipi_lock)
 #define INP_INFO_RUNLOCK_ET(ipi, et)	NET_EPOCH_EXIT_ET((et))
 /*
  * NOTE(review): this expands to a reference to inp_et, but no member of
  * that name is declared in struct inpcb in this header -- confirm whether
  * the macro is still usable or is dead code.
  */
 #define INP_INFO_RUNLOCK_TP(ipi, tp)	NET_EPOCH_EXIT_ET(*(tp)->t_inpcb->inp_et)
 #define INP_INFO_WUNLOCK(ipi)	mtx_unlock(&(ipi)->ipi_lock)
 /* Satisfied by being inside the epoch or by owning ipi_lock. */
 #define	INP_INFO_LOCK_ASSERT(ipi)	MPASS(in_epoch(net_epoch_preempt) || mtx_owned(&(ipi)->ipi_lock))
 #define INP_INFO_RLOCK_ASSERT(ipi)	MPASS(in_epoch(net_epoch_preempt))
 #define INP_INFO_WLOCK_ASSERT(ipi)	mtx_assert(&(ipi)->ipi_lock, MA_OWNED)
 #define INP_INFO_WUNLOCK_ASSERT(ipi)	\
 	mtx_assert(&(ipi)->ipi_lock, MA_NOTOWNED)
 /* Requires being outside the epoch and not owning ipi_lock. */
 #define INP_INFO_UNLOCK_ASSERT(ipi)	MPASS(!in_epoch(net_epoch_preempt) && !mtx_owned(&(ipi)->ipi_lock))
 
 /*
  * Wrappers around the pcbinfo list rwlock, ipi_list_lock.  The lock is
  * initialised with no extra flags; 'd' is passed to rw_init_flags() as its
  * second argument.
  */
 #define INP_LIST_LOCK_INIT(ipi, d) \
         rw_init_flags(&(ipi)->ipi_list_lock, (d), 0)
 #define INP_LIST_LOCK_DESTROY(ipi)  rw_destroy(&(ipi)->ipi_list_lock)
 #define INP_LIST_RLOCK(ipi)     rw_rlock(&(ipi)->ipi_list_lock)
 #define INP_LIST_WLOCK(ipi)     rw_wlock(&(ipi)->ipi_list_lock)
 #define INP_LIST_TRY_RLOCK(ipi) rw_try_rlock(&(ipi)->ipi_list_lock)
 #define INP_LIST_TRY_WLOCK(ipi) rw_try_wlock(&(ipi)->ipi_list_lock)
 #define INP_LIST_TRY_UPGRADE(ipi)       rw_try_upgrade(&(ipi)->ipi_list_lock)
 #define INP_LIST_RUNLOCK(ipi)   rw_runlock(&(ipi)->ipi_list_lock)
 #define INP_LIST_WUNLOCK(ipi)   rw_wunlock(&(ipi)->ipi_list_lock)
 /* Assertions on the list lock state, expressed through rw_assert(). */
 #define INP_LIST_LOCK_ASSERT(ipi) \
 	rw_assert(&(ipi)->ipi_list_lock, RA_LOCKED)
 #define INP_LIST_RLOCK_ASSERT(ipi) \
 	rw_assert(&(ipi)->ipi_list_lock, RA_RLOCKED)
 #define INP_LIST_WLOCK_ASSERT(ipi) \
 	rw_assert(&(ipi)->ipi_list_lock, RA_WLOCKED)
 #define INP_LIST_UNLOCK_ASSERT(ipi) \
 	rw_assert(&(ipi)->ipi_list_lock, RA_UNLOCKED)
 
 /*
  * pcbinfo hash lock.  Writers take the ipi_hash_lock mutex; readers enter
  * the net_epoch_preempt epoch instead of taking a lock.
  *
  * INP_HASH_RLOCK() expands to a declaration of a local tracker named
  * inp_hash_et followed by a statement, so it must appear where a
  * declaration is legal and at most once per scope; INP_HASH_RUNLOCK()
  * refers to that same variable.  The *_ET variants use a caller-supplied
  * tracker and carry no such restriction.
  *
  * Every macro expands to an expression or statement WITHOUT a trailing
  * semicolon, so each can be used as the body of an unbraced if/else.
  */
 #define	INP_HASH_LOCK_INIT(ipi, d) mtx_init(&(ipi)->ipi_hash_lock, (d), NULL, MTX_DEF)
 #define	INP_HASH_LOCK_DESTROY(ipi)	mtx_destroy(&(ipi)->ipi_hash_lock)
 #define	INP_HASH_RLOCK(ipi)		struct epoch_tracker inp_hash_et; epoch_enter_preempt(net_epoch_preempt, &inp_hash_et)
 #define	INP_HASH_RLOCK_ET(ipi, et)		epoch_enter_preempt(net_epoch_preempt, &(et))
 #define	INP_HASH_WLOCK(ipi)		mtx_lock(&(ipi)->ipi_hash_lock)
 #define	INP_HASH_RUNLOCK(ipi)		NET_EPOCH_EXIT_ET(inp_hash_et)
 #define	INP_HASH_RUNLOCK_ET(ipi, et)		NET_EPOCH_EXIT_ET((et))
 #define	INP_HASH_WUNLOCK(ipi)		mtx_unlock(&(ipi)->ipi_hash_lock)
 /* Satisfied by being inside the epoch or by owning ipi_hash_lock. */
 #define	INP_HASH_LOCK_ASSERT(ipi)	MPASS(in_epoch(net_epoch_preempt) || mtx_owned(&(ipi)->ipi_hash_lock))
 #define	INP_HASH_WLOCK_ASSERT(ipi)	mtx_assert(&(ipi)->ipi_hash_lock, MA_OWNED)
 
 /*
  * Wrappers around the per-pcbgroup mutex, ipg_lock.  The mutex is
  * initialised with MTX_DEF | MTX_DUPOK.
  */
 #define	INP_GROUP_LOCK_INIT(ipg, d)	mtx_init(&(ipg)->ipg_lock, (d), NULL, \
 					    MTX_DEF | MTX_DUPOK)
 #define	INP_GROUP_LOCK_DESTROY(ipg)	mtx_destroy(&(ipg)->ipg_lock)
 
 #define	INP_GROUP_LOCK(ipg)		mtx_lock(&(ipg)->ipg_lock)
 #define	INP_GROUP_LOCK_ASSERT(ipg)	mtx_assert(&(ipg)->ipg_lock, MA_OWNED)
 #define	INP_GROUP_UNLOCK(ipg)		mtx_unlock(&(ipg)->ipg_lock)
 
 /*
  * Hash helpers for the pcb lookup tables.
  *
  * INP_PCBLBGROUP_PKTHASH() folds the foreign address with its own upper
  * 16 bits and with ntohs() of the XOR of the two ports.  INP_PCBHASH() is
  * that same value reduced by 'mask'.  INP_PCBPORTHASH() is ntohs() of the
  * local port reduced by 'mask'.  INP6_PCBHASHKEY() picks 32-bit word 3 of
  * the address that 'faddr' points to.
  *
  * 'faddr' is expanded twice by the first two macros, so it must not have
  * side effects.
  */
 #define	INP_PCBLBGROUP_PKTHASH(faddr, lport, fport) \
 	((faddr) ^ ((faddr) >> 16) ^ ntohs((lport) ^ (fport)))
 #define	INP_PCBHASH(faddr, lport, fport, mask) \
 	(INP_PCBLBGROUP_PKTHASH(faddr, lport, fport) & (mask))
 #define	INP_PCBPORTHASH(lport, mask) \
 	(ntohs((lport)) & (mask))
 #define	INP6_PCBHASHKEY(faddr)	((faddr)->s6_addr32[3])
 
 /*
  * Flags for inp_vflag -- historically version flags only
  */
 #define	INP_IPV4	0x1
 #define	INP_IPV6	0x2
 #define	INP_IPV6PROTO	0x4		/* opened under IPv6 protocol */
 
 /*
  * Flags for inp_flags.
  */
 #define	INP_RECVOPTS		0x00000001 /* receive incoming IP options */
 #define	INP_RECVRETOPTS		0x00000002 /* receive IP options for reply */
 #define	INP_RECVDSTADDR		0x00000004 /* receive IP dst address */
 #define	INP_HDRINCL		0x00000008 /* user supplies entire IP header */
 #define	INP_HIGHPORT		0x00000010 /* user wants "high" port binding */
 #define	INP_LOWPORT		0x00000020 /* user wants "low" port binding */
 #define	INP_ANONPORT		0x00000040 /* port chosen for user */
 #define	INP_RECVIF		0x00000080 /* receive incoming interface */
 #define	INP_MTUDISC		0x00000100 /* user can do MTU discovery */
 /* bit 0x00000200 is unused: it was INP_FAITH */
 #define	INP_RECVTTL		0x00000400 /* receive incoming IP TTL */
 #define	INP_DONTFRAG		0x00000800 /* don't fragment packet */
 #define	INP_BINDANY		0x00001000 /* allow bind to any address */
 #define	INP_INHASHLIST		0x00002000 /* in_pcbinshash() has been called */
 #define	INP_RECVTOS		0x00004000 /* receive incoming IP TOS */
 #define	IN6P_IPV6_V6ONLY	0x00008000 /* restrict AF_INET6 socket for v6 */
 #define	IN6P_PKTINFO		0x00010000 /* receive IP6 dst and I/F */
 #define	IN6P_HOPLIMIT		0x00020000 /* receive hoplimit */
 #define	IN6P_HOPOPTS		0x00040000 /* receive hop-by-hop options */
 #define	IN6P_DSTOPTS		0x00080000 /* receive dst options after rthdr */
 #define	IN6P_RTHDR		0x00100000 /* receive routing header */
 #define	IN6P_RTHDRDSTOPTS	0x00200000 /* receive dstoptions before rthdr */
 #define	IN6P_TCLASS		0x00400000 /* receive traffic class value */
 #define	IN6P_AUTOFLOWLABEL	0x00800000 /* attach flowlabel automatically */
 #define	INP_TIMEWAIT		0x01000000 /* in TIMEWAIT, ppcb is tcptw */
 #define	INP_ONESBCAST		0x02000000 /* send all-ones broadcast */
 #define	INP_DROPPED		0x04000000 /* protocol drop flag */
 #define	INP_SOCKREF		0x08000000 /* strong socket reference */
 #define	INP_RESERVED_0          0x10000000 /* reserved field */
 #define	INP_RESERVED_1          0x20000000 /* reserved field */
 #define	IN6P_RFC2292		0x40000000 /* used RFC2292 API on the socket */
 #define	IN6P_MTU		0x80000000 /* receive path MTU */
 
 /*
  * Union of the INP_RECV* bits and of every IN6P_* bit above except
  * IN6P_IPV6_V6ONLY.
  */
 #define	INP_CONTROLOPTS							\
 	(INP_RECVOPTS | INP_RECVRETOPTS | INP_RECVDSTADDR |		\
 	 INP_RECVIF | INP_RECVTTL | INP_RECVTOS |			\
 	 IN6P_PKTINFO | IN6P_HOPLIMIT | IN6P_HOPOPTS |			\
 	 IN6P_DSTOPTS | IN6P_RTHDR | IN6P_RTHDRDSTOPTS |		\
 	 IN6P_TCLASS | IN6P_AUTOFLOWLABEL |				\
 	 IN6P_RFC2292 | IN6P_MTU)
 
 /*
  * Flags for inp_flags2.
  */
 #define	INP_2UNUSED1		0x00000001
 #define	INP_2UNUSED2		0x00000002
 #define	INP_PCBGROUPWILD	0x00000004 /* in pcbgroup wildcard list */
 #define	INP_REUSEPORT		0x00000008 /* SO_REUSEPORT option is set */
 #define	INP_FREED		0x00000010 /* inp itself is not valid */
 #define	INP_REUSEADDR		0x00000020 /* SO_REUSEADDR option is set */
 #define	INP_BINDMULTI		0x00000040 /* IP_BINDMULTI option is set */
 #define	INP_RSS_BUCKET_SET	0x00000080 /* IP_RSS_LISTEN_BUCKET is set */
 #define	INP_RECVFLOWID		0x00000100 /* populate recv datagram with flow info */
 #define	INP_RECVRSSBUCKETID	0x00000200 /* populate recv datagram with bucket id */
 #define	INP_RATE_LIMIT_CHANGED	0x00000400 /* rate limit needs attention */
 #define	INP_ORIGDSTADDR		0x00000800 /* receive IP dst address/port */
 #define INP_CANNOT_DO_ECN	0x00001000 /* The stack does not do ECN */
 #define	INP_REUSEPORT_LB	0x00002000 /* SO_REUSEPORT_LB option is set */
-
+#define INP_SUPPORTS_MBUFQ	0x00004000 /* Supports the mbuf queue method of LRO */
+#define INP_MBUF_QUEUE_READY	0x00008000 /* The transport is pacing, inputs can be queued */
+#define INP_DONT_SACK_QUEUE	0x00010000 /* If a sack arrives do not wake me */
 /*
  * Flags passed to in_pcblookup*() functions.
  */
 #define	INPLOOKUP_WILDCARD	0x00000001	/* Allow wildcard sockets. */
 #define	INPLOOKUP_RLOCKPCB	0x00000002	/* Return inpcb read-locked. */
 #define	INPLOOKUP_WLOCKPCB	0x00000004	/* Return inpcb write-locked. */
 
 #define	INPLOOKUP_MASK	(INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB | \
 			    INPLOOKUP_WLOCKPCB)
 
 #define	sotoinpcb(so)	((struct inpcb *)(so)->so_pcb)
 #define	sotoin6pcb(so)	sotoinpcb(so) /* for KAME src sync over BSD*'s */
 
 #define	INP_SOCKAF(so) so->so_proto->pr_domain->dom_family
 
 #define	INP_CHECK_SOCKAF(so, af)	(INP_SOCKAF(so) == af)
 
 /*
  * Constants for pcbinfo.ipi_hashfields.
  */
 #define	IPI_HASHFIELDS_NONE	0
 #define	IPI_HASHFIELDS_2TUPLE	1
 #define	IPI_HASHFIELDS_4TUPLE	2
 
 #ifdef _KERNEL
 VNET_DECLARE(int, ipport_reservedhigh);
 VNET_DECLARE(int, ipport_reservedlow);
 VNET_DECLARE(int, ipport_lowfirstauto);
 VNET_DECLARE(int, ipport_lowlastauto);
 VNET_DECLARE(int, ipport_firstauto);
 VNET_DECLARE(int, ipport_lastauto);
 VNET_DECLARE(int, ipport_hifirstauto);
 VNET_DECLARE(int, ipport_hilastauto);
 VNET_DECLARE(int, ipport_randomized);
 VNET_DECLARE(int, ipport_randomcps);
 VNET_DECLARE(int, ipport_randomtime);
 VNET_DECLARE(int, ipport_stoprandom);
 VNET_DECLARE(int, ipport_tcpallocs);
 
 #define	V_ipport_reservedhigh	VNET(ipport_reservedhigh)
 #define	V_ipport_reservedlow	VNET(ipport_reservedlow)
 #define	V_ipport_lowfirstauto	VNET(ipport_lowfirstauto)
 #define	V_ipport_lowlastauto	VNET(ipport_lowlastauto)
 #define	V_ipport_firstauto	VNET(ipport_firstauto)
 #define	V_ipport_lastauto	VNET(ipport_lastauto)
 #define	V_ipport_hifirstauto	VNET(ipport_hifirstauto)
 #define	V_ipport_hilastauto	VNET(ipport_hilastauto)
 #define	V_ipport_randomized	VNET(ipport_randomized)
 #define	V_ipport_randomcps	VNET(ipport_randomcps)
 #define	V_ipport_randomtime	VNET(ipport_randomtime)
 #define	V_ipport_stoprandom	VNET(ipport_stoprandom)
 #define	V_ipport_tcpallocs	VNET(ipport_tcpallocs)
 
 void	in_pcbinfo_destroy(struct inpcbinfo *);
 void	in_pcbinfo_init(struct inpcbinfo *, const char *, struct inpcbhead *,
 	    int, int, char *, uma_init, u_int);
 
 int	in_pcbbind_check_bindmulti(const struct inpcb *ni,
 	    const struct inpcb *oi);
 
 struct inpcbgroup *
 	in_pcbgroup_byhash(struct inpcbinfo *, u_int, uint32_t);
 struct inpcbgroup *
 	in_pcbgroup_byinpcb(struct inpcb *);
 struct inpcbgroup *
 	in_pcbgroup_bytuple(struct inpcbinfo *, struct in_addr, u_short,
 	    struct in_addr, u_short);
 void	in_pcbgroup_destroy(struct inpcbinfo *);
 int	in_pcbgroup_enabled(struct inpcbinfo *);
 void	in_pcbgroup_init(struct inpcbinfo *, u_int, int);
 void	in_pcbgroup_remove(struct inpcb *);
 void	in_pcbgroup_update(struct inpcb *);
 void	in_pcbgroup_update_mbuf(struct inpcb *, struct mbuf *);
 
 void	in_pcbpurgeif0(struct inpcbinfo *, struct ifnet *);
 int	in_pcballoc(struct socket *, struct inpcbinfo *);
 int	in_pcbbind(struct inpcb *, struct sockaddr *, struct ucred *);
 int	in_pcb_lport_dest(struct inpcb *inp, struct sockaddr *lsa,
 	    u_short *lportp, struct sockaddr *fsa, u_short fport,
 	    struct ucred *cred, int lookupflags);
 int	in_pcb_lport(struct inpcb *, struct in_addr *, u_short *,
 	    struct ucred *, int);
 int	in_pcbbind_setup(struct inpcb *, struct sockaddr *, in_addr_t *,
 	    u_short *, struct ucred *);
 int	in_pcbconnect(struct inpcb *, struct sockaddr *, struct ucred *);
 int	in_pcbconnect_mbuf(struct inpcb *, struct sockaddr *, struct ucred *,
 	    struct mbuf *, bool);
 int	in_pcbconnect_setup(struct inpcb *, struct sockaddr *, in_addr_t *,
 	    u_short *, in_addr_t *, u_short *, struct inpcb **,
 	    struct ucred *);
 void	in_pcbdetach(struct inpcb *);
 void	in_pcbdisconnect(struct inpcb *);
 void	in_pcbdrop(struct inpcb *);
 void	in_pcbfree(struct inpcb *);
 int	in_pcbinshash(struct inpcb *);
 int	in_pcbinshash_mbuf(struct inpcb *, struct mbuf *);
 int	in_pcbladdr(struct inpcb *, struct in_addr *, struct in_addr *,
 	    struct ucred *);
 struct inpcb *
 	in_pcblookup_local(struct inpcbinfo *,
 	    struct in_addr, u_short, int, struct ucred *);
 struct inpcb *
 	in_pcblookup(struct inpcbinfo *, struct in_addr, u_int,
 	    struct in_addr, u_int, int, struct ifnet *);
 struct inpcb *
 	in_pcblookup_mbuf(struct inpcbinfo *, struct in_addr, u_int,
 	    struct in_addr, u_int, int, struct ifnet *, struct mbuf *);
 void	in_pcbnotifyall(struct inpcbinfo *pcbinfo, struct in_addr,
 	    int, struct inpcb *(*)(struct inpcb *, int));
 void	in_pcbref(struct inpcb *);
 void	in_pcbrehash(struct inpcb *);
 void	in_pcbrehash_mbuf(struct inpcb *, struct mbuf *);
 int	in_pcbrele(struct inpcb *);
 int	in_pcbrele_rlocked(struct inpcb *);
 int	in_pcbrele_wlocked(struct inpcb *);
 void	in_pcblist_rele_rlocked(epoch_context_t ctx);
 void	in_losing(struct inpcb *);
 void	in_pcbsetsolabel(struct socket *so);
 int	in_getpeeraddr(struct socket *so, struct sockaddr **nam);
 int	in_getsockaddr(struct socket *so, struct sockaddr **nam);
 struct sockaddr *
 	in_sockaddr(in_port_t port, struct in_addr *addr);
 void	in_pcbsosetlabel(struct socket *so);
 #ifdef RATELIMIT
 int	in_pcbattach_txrtlmt(struct inpcb *, struct ifnet *, uint32_t, uint32_t, uint32_t);
 void	in_pcbdetach_txrtlmt(struct inpcb *);
 int	in_pcbmodify_txrtlmt(struct inpcb *, uint32_t);
 int	in_pcbquery_txrtlmt(struct inpcb *, uint32_t *);
 int	in_pcbquery_txrlevel(struct inpcb *, uint32_t *);
 void	in_pcboutput_txrtlmt(struct inpcb *, struct ifnet *, struct mbuf *);
 void	in_pcboutput_eagain(struct inpcb *);
 #endif
 #endif /* _KERNEL */
 
 #endif /* !_NETINET_IN_PCB_H_ */
Index: stable/12/sys/netinet/tcp.h
===================================================================
--- stable/12/sys/netinet/tcp.h	(revision 362879)
+++ stable/12/sys/netinet/tcp.h	(revision 362880)
@@ -1,341 +1,354 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1982, 1986, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)tcp.h	8.1 (Berkeley) 6/10/93
  * $FreeBSD$
  */
 
 #ifndef _NETINET_TCP_H_
 #define _NETINET_TCP_H_
 
 #include <sys/cdefs.h>
 #include <sys/types.h>
 
 #if __BSD_VISIBLE
 
 typedef	u_int32_t tcp_seq;
 
 #define tcp6_seq	tcp_seq	/* for KAME src sync over BSD*'s */
 #define tcp6hdr		tcphdr	/* for KAME src sync over BSD*'s */
 
 /*
  * TCP header.
  * Per RFC 793, September, 1981.
  */
 struct tcphdr {
 	u_short	th_sport;		/* source port */
 	u_short	th_dport;		/* destination port */
 	tcp_seq	th_seq;			/* sequence number */
 	tcp_seq	th_ack;			/* acknowledgement number */
 #if BYTE_ORDER == LITTLE_ENDIAN
 	u_char	th_x2:4,		/* (unused) */
 		th_off:4;		/* data offset */
 #endif
 #if BYTE_ORDER == BIG_ENDIAN
 	u_char	th_off:4,		/* data offset */
 		th_x2:4;		/* (unused) */
 #endif
 	u_char	th_flags;
 #define	TH_FIN	0x01
 #define	TH_SYN	0x02
 #define	TH_RST	0x04
 #define	TH_PUSH	0x08
 #define	TH_ACK	0x10
 #define	TH_URG	0x20
 #define	TH_ECE	0x40
 #define	TH_CWR	0x80
 #define	TH_AE	0x100			/* maps into th_x2 */
 #define	TH_FLAGS	(TH_FIN|TH_SYN|TH_RST|TH_PUSH|TH_ACK|TH_URG|TH_ECE|TH_CWR)
 #define	PRINT_TH_FLAGS	"\20\1FIN\2SYN\3RST\4PUSH\5ACK\6URG\7ECE\10CWR\11AE"
 
 	u_short	th_win;			/* window */
 	u_short	th_sum;			/* checksum */
 	u_short	th_urp;			/* urgent pointer */
 };
 
 #define	TCPOPT_EOL		0
 #define	   TCPOLEN_EOL			1
 #define	TCPOPT_PAD		0		/* padding after EOL */
 #define	   TCPOLEN_PAD			1
 #define	TCPOPT_NOP		1
 #define	   TCPOLEN_NOP			1
 #define	TCPOPT_MAXSEG		2
 #define    TCPOLEN_MAXSEG		4
 #define TCPOPT_WINDOW		3
 #define    TCPOLEN_WINDOW		3
 #define TCPOPT_SACK_PERMITTED	4
 #define    TCPOLEN_SACK_PERMITTED	2
 #define TCPOPT_SACK		5
 #define	   TCPOLEN_SACKHDR		2
 #define    TCPOLEN_SACK			8	/* 2*sizeof(tcp_seq) */
 #define TCPOPT_TIMESTAMP	8
 #define    TCPOLEN_TIMESTAMP		10
 #define    TCPOLEN_TSTAMP_APPA		(TCPOLEN_TIMESTAMP+2) /* appendix A */
 #define	TCPOPT_SIGNATURE	19		/* Keyed MD5: RFC 2385 */
 #define	   TCPOLEN_SIGNATURE		18
 #define	TCPOPT_FAST_OPEN	34
 #define	   TCPOLEN_FAST_OPEN_EMPTY	2
 
 /* Miscellaneous constants */
 #define	MAX_SACK_BLKS	6	/* Max # SACK blocks stored at receiver side */
 #define	TCP_MAX_SACK	4	/* MAX # SACKs sent in any segment */
 
 
 /*
  * The default maximum segment size (MSS) to be used for new TCP connections
  * when path MTU discovery is not enabled.
  *
  * RFC879 derives the default MSS from the largest datagram size hosts are
  * minimally required to handle directly or through IP reassembly minus the
  * size of the IP and TCP header.  With IPv6 the minimum MTU is specified
  * in RFC2460.
  *
  * For IPv4 the MSS is 576 - sizeof(struct tcpiphdr)
  * For IPv6 the MSS is IPV6_MMTU - sizeof(struct ip6_hdr) - sizeof(struct tcphdr)
  *
  * We use explicit numerical definition here to avoid header pollution.
  */
 #define	TCP_MSS		536
 #define	TCP6_MSS	1220
 
 /*
  * Limit the lowest MSS we accept for path MTU discovery and the TCP SYN MSS
  * option.  Allowing low values of MSS can consume significant resources and
  * be used to mount a resource exhaustion attack.
  * Connections requesting lower MSS values will be rounded up to this value
  * and the IP_DF flag will be cleared to allow fragmentation along the path.
  *
  * See tcp_subr.c tcp_minmss SYSCTL declaration for more comments.  Setting
  * it to "0" disables the minmss check.
  *
  * The default value is fine for TCP across the Internet's smallest official
  * link MTU (256 bytes for AX.25 packet radio).  However, a connection is very
  * unlikely to come across such low MTU interfaces these days (anno domini 2003).
  */
 #define	TCP_MINMSS 216
 
 #define	TCP_MAXWIN	65535	/* largest value for (unscaled) window */
 #define	TTCP_CLIENT_SND_WND	4096	/* dflt send window for T/TCP client */
 
 #define TCP_MAX_WINSHIFT	14	/* maximum window shift */
 
 #define TCP_MAXBURST		4	/* maximum segments in a burst */
 
 #define TCP_MAXHLEN	(0xf<<2)	/* max length of header in bytes */
 #define TCP_MAXOLEN	(TCP_MAXHLEN - sizeof(struct tcphdr))
 					/* max space left for options */
 
 #define TCP_FASTOPEN_MIN_COOKIE_LEN	4	/* Per RFC7413 */
 #define TCP_FASTOPEN_MAX_COOKIE_LEN	16	/* Per RFC7413 */
 #define TCP_FASTOPEN_PSK_LEN		16	/* Same as TCP_FASTOPEN_KEY_LEN */
 #endif /* __BSD_VISIBLE */
 
 /*
  * User-settable options (used with setsockopt).  These are discrete
  * values and are not masked together.  Some values appear to be
  * bitmasks for historical reasons.
  */
 #define	TCP_NODELAY	1	/* don't delay send to coalesce packets */
 #if __BSD_VISIBLE
 #define	TCP_MAXSEG	2	/* set maximum segment size */
 #define TCP_NOPUSH	4	/* don't push last block of write */
 #define TCP_NOOPT	8	/* don't use TCP options */
 #define TCP_MD5SIG	16	/* use MD5 digests (RFC2385) */
 #define	TCP_INFO	32	/* retrieve tcp_info structure */
 #define	TCP_LOG		34	/* configure event logging for connection */
 #define	TCP_LOGBUF	35	/* retrieve event log for connection */
 #define	TCP_LOGID	36	/* configure log ID to correlate connections */
 #define	TCP_LOGDUMP	37	/* dump connection log events to device */
 #define	TCP_LOGDUMPID	38	/* dump events from connections with same ID to
 				   device */
 #define	TCP_CONGESTION	64	/* get/set congestion control algorithm */
 #define	TCP_CCALGOOPT	65	/* get/set cc algorithm specific options */
 #define TCP_DELACK  	72	/* socket option for delayed ack */
 #define	TCP_KEEPINIT	128	/* N, time to establish connection */
 #define	TCP_KEEPIDLE	256	/* L,N,X start keeplives after this period */
 #define	TCP_KEEPINTVL	512	/* L,N interval between keepalives */
 #define	TCP_KEEPCNT	1024	/* L,N number of keepalives before close */
 #define	TCP_FASTOPEN	1025	/* enable TFO / was created via TFO */
 #define	TCP_PCAP_OUT	2048	/* number of output packets to keep */
 #define	TCP_PCAP_IN	4096	/* number of input packets to keep */
 #define TCP_FUNCTION_BLK 8192	/* Set the tcp function pointers to the specified stack */
 /* Options for Rack and BBR */
 #define TCP_RACK_PROP	      1051 /* RACK proportional rate reduction (bool) */
 #define TCP_RACK_TLP_REDUCE   1052 /* RACK TLP cwnd reduction (bool) */
 #define TCP_RACK_PACE_REDUCE  1053 /* RACK Pacing reduction factor (divisor) */
 #define TCP_RACK_PACE_MAX_SEG 1054 /* Max segments in a pace */
 #define TCP_RACK_PACE_ALWAYS  1055 /* Use the always pace method */
 #define TCP_RACK_PROP_RATE    1056 /* The proportional reduction rate */
 #define TCP_RACK_PRR_SENDALOT 1057 /* Allow PRR to send more than one seg */
 #define TCP_RACK_MIN_TO       1058 /* Minimum time between rack t-o's in ms */
 #define TCP_RACK_EARLY_RECOV  1059 /* Should recovery happen early (bool) */
 #define TCP_RACK_EARLY_SEG    1060 /* If early recovery max segments */
 #define TCP_RACK_REORD_THRESH 1061 /* RACK reorder threshold (shift amount) */
 #define TCP_RACK_REORD_FADE   1062 /* Does reordering fade after ms time */
 #define TCP_RACK_TLP_THRESH   1063 /* RACK TLP theshold i.e. srtt+(srtt/N) */
 #define TCP_RACK_PKT_DELAY    1064 /* RACK added ms i.e. rack-rtt + reord + N */
 #define TCP_RACK_TLP_INC_VAR  1065 /* Does TLP include rtt variance in t-o */
-#define TCP_RACK_SESS_CWV     1066 /* Enable RFC7611 cwnd validation on sess */
 #define TCP_BBR_IWINTSO	      1067 /* Initial TSO window for BBRs first sends */
-#define TCP_BBR_RECFORCE      1068 /* Enter recovery force out a segment disregard pacer */
+#define TCP_BBR_RECFORCE      1068 /* Enter recovery force out a segment disregard pacer no longer valid */
 #define TCP_BBR_STARTUP_PG    1069 /* Startup pacing gain */
 #define TCP_BBR_DRAIN_PG      1070 /* Drain pacing gain */
 #define TCP_BBR_RWND_IS_APP   1071 /* Rwnd limited is considered app limited */
 #define TCP_BBR_PROBE_RTT_INT 1072 /* How long in useconds between probe-rtt */
 #define TCP_BBR_ONE_RETRAN    1073 /* Is only one segment allowed out during retran */
 #define TCP_BBR_STARTUP_LOSS_EXIT 1074	/* Do we exit a loss during startup if not 20% incr */
 #define TCP_BBR_USE_LOWGAIN   1075 /* lower the gain in PROBE_BW enable */
-#define TCP_BBR_LOWGAIN_THRESH 1076 /* How many cycles do we stay in lowgain */
-#define TCP_BBR_LOWGAIN_HALF  1077 /* Do we halfstep lowgain down */
-#define TCP_BBR_LOWGAIN_FD    1078 /* Do we force a drain when lowgain in place */
+#define TCP_BBR_LOWGAIN_THRESH 1076 /* Unused after 2.3 morphs to TSLIMITS >= 2.3 */
+#define TCP_BBR_TSLIMITS 1076	   /* Do we use experimental Timestamp limiting for our algo */
+#define TCP_BBR_LOWGAIN_HALF  1077 /* Unused after 2.3 */
+#define TCP_BBR_PACE_OH        1077 /* Reused in 4.2 for pacing overhead setting */
+#define TCP_BBR_LOWGAIN_FD    1078 /* Unused after 2.3 */
+#define TCP_BBR_HOLD_TARGET 1078	/* For 4.3 on */
 #define TCP_BBR_USEDEL_RATE   1079 /* Enable use of delivery rate for loss recovery */
 #define TCP_BBR_MIN_RTO       1080 /* Min RTO in milliseconds */
 #define TCP_BBR_MAX_RTO	      1081 /* Max RTO in milliseconds */
 #define TCP_BBR_REC_OVER_HPTS 1082 /* Recovery override htps settings 0/1/3 */
-#define TCP_BBR_UNLIMITED     1083 /* Does BBR, in non-recovery not use cwnd */
+#define TCP_BBR_UNLIMITED     1083 /* Not used before 2.3 and morphs to algorithm >= 2.3 */
+#define TCP_BBR_ALGORITHM     1083 /* What measurement algo does BBR use netflix=0, google=1 */
 #define TCP_BBR_DRAIN_INC_EXTRA 1084 /* Does the 3/4 drain target include the extra gain */
 #define TCP_BBR_STARTUP_EXIT_EPOCH 1085 /* what epoch gets us out of startup */
 #define TCP_BBR_PACE_PER_SEC   1086
 #define TCP_BBR_PACE_DEL_TAR   1087
 #define TCP_BBR_PACE_SEG_MAX   1088
 #define TCP_BBR_PACE_SEG_MIN   1089
 #define TCP_BBR_PACE_CROSS     1090
 #define TCP_RACK_IDLE_REDUCE_HIGH 1092  /* Reduce the highest cwnd seen to IW on idle */
-#define TCP_RACK_IDLE_REDUCE_HIGH 1092  /* Reduce the highest cwnd seen to IW on idle */
 #define TCP_RACK_MIN_PACE      1093 	/* Do we enforce rack min pace time */
 #define TCP_RACK_MIN_PACE_SEG  1094	/* If so what is the seg threshould */
+#define TCP_RACK_GP_INCREASE   1094	/* After 4.1 its the GP increase */
 #define TCP_RACK_TLP_USE       1095
 #define TCP_BBR_ACK_COMP_ALG   1096 	/* Not used */
+#define TCP_BBR_TMR_PACE_OH    1096	/* Recycled in 4.2 */
 #define TCP_BBR_EXTRA_GAIN     1097
 #define TCP_BBR_RACK_RTT_USE   1098	/* what RTT should we use 0, 1, or 2? */
 #define TCP_BBR_RETRAN_WTSO    1099
 #define TCP_DATA_AFTER_CLOSE   1100
 #define TCP_BBR_PROBE_RTT_GAIN 1101
 #define TCP_BBR_PROBE_RTT_LEN  1102
+#define TCP_BBR_SEND_IWND_IN_TSO 1103	/* Do we burst out whole iwin size chunks at start? */
+#define TCP_BBR_USE_RACK_CHEAT 1104	/* Do we use the rack cheat for pacing rxt's */
+#define TCP_BBR_HDWR_PACE      1105	/* Enable/disable hardware pacing */
+#define TCP_BBR_UTTER_MAX_TSO  1106	/* Do we enforce an utter max TSO size */
+#define TCP_BBR_EXTRA_STATE    1107	/* Special exit-persist catch up */
+#define TCP_BBR_FLOOR_MIN_TSO  1108     /* The min tso size */
+#define TCP_BBR_MIN_TOPACEOUT  1109	/* Do we suspend pacing until */
+#define TCP_BBR_TSTMP_RAISES   1110	/* Can a timestamp measurement raise the b/w */
+#define TCP_BBR_POLICER_DETECT 1111	/* Turn on/off google mode policer detection */
 
 
 /* Start of reserved space for third-party user-settable options. */
 #define	TCP_VENDOR	SO_VENDOR
 
 #define	TCP_CA_NAME_MAX	16	/* max congestion control name length */
 
 #define	TCPI_OPT_TIMESTAMPS	0x01
 #define	TCPI_OPT_SACK		0x02
 #define	TCPI_OPT_WSCALE		0x04
 #define	TCPI_OPT_ECN		0x08
 #define	TCPI_OPT_TOE		0x10
 
 /* Maximum length of log ID. */
 #define TCP_LOG_ID_LEN	64
 
 /*
  * The TCP_INFO socket option comes from the Linux 2.6 TCP API, and permits
  * the caller to query certain information about the state of a TCP
  * connection.  We provide an overlapping set of fields with the Linux
  * implementation, but since this is a fixed size structure, room has been
  * left for growth.  In order to maximize potential future compatibility with
  * the Linux API, the same variable names and order have been adopted, and
  * padding left to make room for omitted fields in case they are added later.
  *
  * XXX: This is currently an unstable ABI/API, in that it is expected to
  * change.
  */
 struct tcp_info {
 	u_int8_t	tcpi_state;		/* TCP FSM state. */
 	u_int8_t	__tcpi_ca_state;
 	u_int8_t	__tcpi_retransmits;
 	u_int8_t	__tcpi_probes;
 	u_int8_t	__tcpi_backoff;
 	u_int8_t	tcpi_options;		/* Options enabled on conn. */
 	u_int8_t	tcpi_snd_wscale:4,	/* RFC1323 send shift value. */
 			tcpi_rcv_wscale:4;	/* RFC1323 recv shift value. */
 
 	u_int32_t	tcpi_rto;		/* Retransmission timeout (usec). */
 	u_int32_t	__tcpi_ato;
 	u_int32_t	tcpi_snd_mss;		/* Max segment size for send. */
 	u_int32_t	tcpi_rcv_mss;		/* Max segment size for receive. */
 
 	u_int32_t	__tcpi_unacked;
 	u_int32_t	__tcpi_sacked;
 	u_int32_t	__tcpi_lost;
 	u_int32_t	__tcpi_retrans;
 	u_int32_t	__tcpi_fackets;
 
 	/* Times; measurements in usecs. */
 	u_int32_t	__tcpi_last_data_sent;
 	u_int32_t	__tcpi_last_ack_sent;	/* Also unimpl. on Linux? */
 	u_int32_t	tcpi_last_data_recv;	/* Time since last recv data. */
 	u_int32_t	__tcpi_last_ack_recv;
 
 	/* Metrics; variable units. */
 	u_int32_t	__tcpi_pmtu;
 	u_int32_t	__tcpi_rcv_ssthresh;
 	u_int32_t	tcpi_rtt;		/* Smoothed RTT in usecs. */
 	u_int32_t	tcpi_rttvar;		/* RTT variance in usecs. */
 	u_int32_t	tcpi_snd_ssthresh;	/* Slow start threshold. */
 	u_int32_t	tcpi_snd_cwnd;		/* Send congestion window. */
 	u_int32_t	__tcpi_advmss;
 	u_int32_t	__tcpi_reordering;
 
 	u_int32_t	__tcpi_rcv_rtt;
 	u_int32_t	tcpi_rcv_space;		/* Advertised recv window. */
 
 	/* FreeBSD extensions to tcp_info. */
 	u_int32_t	tcpi_snd_wnd;		/* Advertised send window. */
 	u_int32_t	tcpi_snd_bwnd;		/* No longer used. */
 	u_int32_t	tcpi_snd_nxt;		/* Next egress seqno */
 	u_int32_t	tcpi_rcv_nxt;		/* Next ingress seqno */
 	u_int32_t	tcpi_toe_tid;		/* HWTID for TOE endpoints */
 	u_int32_t	tcpi_snd_rexmitpack;	/* Retransmitted packets */
 	u_int32_t	tcpi_rcv_ooopack;	/* Out-of-order packets */
 	u_int32_t	tcpi_snd_zerowin;	/* Zero-sized windows sent */
 	
 	/* Padding to grow without breaking ABI. */
 	u_int32_t	__tcpi_pad[26];		/* Padding. */
 };
 
 /*
  * If this structure is provided when setting the TCP_FASTOPEN socket
  * option, and the enable member is non-zero, a subsequent connect will use
  * pre-shared key (PSK) mode using the provided key.
  */
 struct tcp_fastopen {
 	int enable;
 	uint8_t psk[TCP_FASTOPEN_PSK_LEN];
 };
 #endif
 #define TCP_FUNCTION_NAME_LEN_MAX 32
 
 struct tcp_function_set {
 	char function_set_name[TCP_FUNCTION_NAME_LEN_MAX];
 	uint32_t pcbcnt;
 };
 
 #endif /* !_NETINET_TCP_H_ */
Index: stable/12/sys/netinet/tcp_hpts.c
===================================================================
--- stable/12/sys/netinet/tcp_hpts.c	(revision 362879)
+++ stable/12/sys/netinet/tcp_hpts.c	(revision 362880)
@@ -1,1902 +1,1975 @@
 /*-
  * Copyright (c) 2016-2018 Netflix, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  */
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_tcpdebug.h"
 /**
  * Some notes about usage.
  *
  * The tcp_hpts system is designed to provide a high precision timer
  * system for tcp. Its main purpose is to provide a mechanism for 
  * pacing packets out onto the wire. It can be used in two ways
  * by a given TCP stack (and those two methods can be used simultaneously).
  *
- * First, and probably the main thing its used by Rack and BBR for, it can
+ * First, and probably the main thing it is used for by Rack and BBR, it can
  * be used to call tcp_output() of a transport stack at some time in the future.
  * The normal way this is done is that tcp_output() of the stack schedules
  * itself to be called again by calling tcp_hpts_insert(tcpcb, slot). The
  * slot is the time from now that the stack wants to be called but it
  * must be converted to tcp_hpts's notion of slot. This is done with
  * one of the macros HPTS_MS_TO_SLOTS or HPTS_USEC_TO_SLOTS. So a typical
  * call from the tcp_output() routine might look like:
  *
  * tcp_hpts_insert(tp, HPTS_USEC_TO_SLOTS(550));
  *
  * The above would schedule tcp_ouput() to be called in 550 useconds.
  * Note that if using this mechanism the stack will want to add near
  * its top a check to prevent unwanted calls (from user land or the
  * arrival of incoming ack's). So it would add something like:
  *
  * if (inp->inp_in_hpts)
  *    return;
  *
  * to prevent output processing until the time alotted has gone by.
  * Of course this is a bare bones example and the stack will probably
  * have more consideration then just the above.
- *
- * Now the tcp_hpts system will call tcp_output in one of two forms, 
- * it will first check to see if the stack as defined a 
- * tfb_tcp_output_wtime() function, if so that is the routine it
- * will call, if that function is not defined then it will call the
- * tfb_tcp_output() function. The only difference between these
- * two calls is that the former passes the time in to the function
- * so the function does not have to access the time (which tcp_hpts
- * already has). What these functions do is of course totally up
- * to the individual tcp stack.
- *
+ * 
  * Now the second function (actually two functions I guess :D)
  * the tcp_hpts system provides is the  ability to either abort 
- * a connection (later) or process  input on a connection. 
- * Why would you want to do this? To keep processor locality.
+ * a connection (later) or process input on a connection. 
+ * Why would you want to do this? To keep processor locality
+ * and/or not have to worry about untangling any recursive
+ * locks. The input function now is hooked to the new LRO
+ * system as well. 
  *
- * So in order to use the input redirection function the
- * stack changes its tcp_do_segment() routine to instead
- * of process the data call the function:
+ * In order to use the input redirection function the
+ * tcp stack must define an input function for 
+ * tfb_do_queued_segments(). This function understands
+ * how to dequeue an array of packets that were input and
+ * knows how to call the correct processing routine. 
  *
- * tcp_queue_pkt_to_input()
- *
- * You will note that the arguments to this function look
- * a lot like tcp_do_segments's arguments. This function
- * will assure that the tcp_hpts system will
- * call the functions tfb_tcp_hpts_do_segment() from the
- * correct CPU. Note that multiple calls can get pushed
- * into the tcp_hpts system this will be indicated by
- * the next to last argument to tfb_tcp_hpts_do_segment()
- * (nxt_pkt). If nxt_pkt is a 1 then another packet is
- * coming. If nxt_pkt is a 0 then this is the last call
- * that the tcp_hpts system has available for the tcp stack.
+ * Locking in this is important as well so most likely the 
+ * stack will need to define the tfb_do_segment_nounlock()
+ * splitting tfb_do_segment() into two parts. The main processing
+ * part that does not unlock the INP and returns a value of 1 or 0.
+ * It returns 0 if all is well and the lock was not released. It
+ * returns 1 if we had to destroy the TCB (a reset received etc).
+ * The remains of tfb_do_segment() then become just a simple call
+ * to the tfb_do_segment_nounlock() function and check the return
+ * code and possibly unlock.
  * 
- * The other point of the input system is to be able to safely
- * drop a tcp connection without worrying about the recursive 
- * locking that may be occuring on the INP_WLOCK. So if
+ * The stack must also set the flag on the INP that it supports this
+ * feature i.e. INP_SUPPORTS_MBUFQ. The LRO code recognizes
+ * this flag as well and will queue packets when it is set.
+ * There are other flags as well INP_MBUF_QUEUE_READY and
+ * INP_DONT_SACK_QUEUE. The first flag tells the LRO code
+ * that we are in the pacer for output so there is no
+ * need to wake up the hpts system to get immediate
+ * input. The second tells the LRO code that it is okay
+ * if a SACK arrives you can still defer input and let
+ * the current hpts timer run (this is usually set when
+ * a rack timer is up so we know SACK's are happening
+ * on the connection already and don't want to wakeup yet).
+ *
+ * There is a common version of this function within the rack_bbr_common
+ * code, i.e. ctf_do_queued_segments(). This function
+ * knows how to take the input queue of packets from 
+ * tp->t_in_pkts and process them digging out 
+ * all the arguments, calling any bpf tap and 
+ * calling into tfb_do_segment_nounlock(). The common
+ * function (ctf_do_queued_segments())  requires that 
+ * you have defined the tfb_do_segment_nounlock() as
+ * described above.
+ *
+ * The second feature of the input side of hpts is the
+ * dropping of a connection. This is due to the way that
+ * locking may have occurred on the INP_WLOCK. So if
  * a stack wants to drop a connection it calls:
  *
  *     tcp_set_inp_to_drop(tp, ETIMEDOUT)
  * 
  * To schedule the tcp_hpts system to call 
  * 
  *    tcp_drop(tp, drop_reason)
  *
  * at a future point. This is quite handy to prevent locking
  * issues when dropping connections.
  *
  */
 
 #include <sys/param.h>
 #include <sys/bus.h>
 #include <sys/interrupt.h>
 #include <sys/module.h>
 #include <sys/kernel.h>
 #include <sys/hhook.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/proc.h>		/* for proc0 declaration */
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sysctl.h>
 #include <sys/systm.h>
 #include <sys/refcount.h>
 #include <sys/sched.h>
 #include <sys/queue.h>
 #include <sys/smp.h>
 #include <sys/counter.h>
 #include <sys/time.h>
 #include <sys/kthread.h>
 #include <sys/kern_prefetch.h>
 
 #include <vm/uma.h>
 
 #include <net/route.h>
 #include <net/vnet.h>
 
 #define TCPSTATES		/* for logging */
 
 #include <netinet/in.h>
 #include <netinet/in_kdtrace.h>
 #include <netinet/in_pcb.h>
 #include <netinet/ip.h>
 #include <netinet/ip_icmp.h>	/* required for icmp_var.h */
 #include <netinet/icmp_var.h>	/* for ICMP_BANDLIM */
 #include <netinet/ip_var.h>
 #include <netinet/ip6.h>
 #include <netinet6/in6_pcb.h>
 #include <netinet6/ip6_var.h>
 #include <netinet/tcp.h>
 #include <netinet/tcp_fsm.h>
 #include <netinet/tcp_seq.h>
 #include <netinet/tcp_timer.h>
 #include <netinet/tcp_var.h>
 #include <netinet/tcpip.h>
 #include <netinet/cc/cc.h>
 #include <netinet/tcp_hpts.h>
+#include <netinet/tcp_log_buf.h>
 
 #ifdef tcpdebug
 #include <netinet/tcp_debug.h>
 #endif				/* tcpdebug */
 #ifdef tcp_offload
 #include <netinet/tcp_offload.h>
 #endif
 
 #include "opt_rss.h"
 
 MALLOC_DEFINE(M_TCPHPTS, "tcp_hpts", "TCP hpts");
 #ifdef RSS
-#include <net/netisr.h>
-#include <net/rss_config.h>
 static int tcp_bind_threads = 1;
 #else
 static int tcp_bind_threads = 0;
 #endif
 TUNABLE_INT("net.inet.tcp.bind_hptss", &tcp_bind_threads);
 
-static uint32_t tcp_hpts_logging_size = DEFAULT_HPTS_LOG;
-
-TUNABLE_INT("net.inet.tcp.hpts_logging_sz", &tcp_hpts_logging_size);
-
 static struct tcp_hptsi tcp_pace;
+static int hpts_does_tp_logging = 0;
 
 static void tcp_wakehpts(struct tcp_hpts_entry *p);
 static void tcp_wakeinput(struct tcp_hpts_entry *p);
 static void tcp_input_data(struct tcp_hpts_entry *hpts, struct timeval *tv);
-static void tcp_hptsi(struct tcp_hpts_entry *hpts, struct timeval *ctick);
+static void tcp_hptsi(struct tcp_hpts_entry *hpts);
 static void tcp_hpts_thread(void *ctx);
 static void tcp_init_hptsi(void *st);
 
 int32_t tcp_min_hptsi_time = DEFAULT_MIN_SLEEP;
 static int32_t tcp_hpts_callout_skip_swi = 0;
 
 SYSCTL_NODE(_net_inet_tcp, OID_AUTO, hpts, CTLFLAG_RW, 0, "TCP Hpts controls");
 
 #define	timersub(tvp, uvp, vvp)						\
 	do {								\
 		(vvp)->tv_sec = (tvp)->tv_sec - (uvp)->tv_sec;		\
 		(vvp)->tv_usec = (tvp)->tv_usec - (uvp)->tv_usec;	\
 		if ((vvp)->tv_usec < 0) {				\
 			(vvp)->tv_sec--;				\
 			(vvp)->tv_usec += 1000000;			\
 		}							\
 	} while (0)
 
-static int32_t logging_on = 0;
-static int32_t hpts_sleep_max = (NUM_OF_HPTSI_SLOTS - 2);
 static int32_t tcp_hpts_precision = 120;
 
 SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, precision, CTLFLAG_RW,
     &tcp_hpts_precision, 120,
     "Value for PRE() precision of callout");
 
-SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, logging, CTLFLAG_RW,
-    &logging_on, 0,
-    "Turn on logging if compiled in");
+counter_u64_t hpts_hopelessly_behind;
 
+SYSCTL_COUNTER_U64(_net_inet_tcp_hpts, OID_AUTO, hopeless, CTLFLAG_RD,
+    &hpts_hopelessly_behind,
+    "Number of times hpts could not catch up and was behind hopelessly");
+
 counter_u64_t hpts_loops;
 
 SYSCTL_COUNTER_U64(_net_inet_tcp_hpts, OID_AUTO, loops, CTLFLAG_RD,
     &hpts_loops, "Number of times hpts had to loop to catch up");
 
+
 counter_u64_t back_tosleep;
 
 SYSCTL_COUNTER_U64(_net_inet_tcp_hpts, OID_AUTO, no_tcbsfound, CTLFLAG_RD,
     &back_tosleep, "Number of times hpts found no tcbs");
 
-static int32_t in_newts_every_tcb = 0;
+counter_u64_t combined_wheel_wrap;
 
-SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, in_tsperpcb, CTLFLAG_RW,
-    &in_newts_every_tcb, 0,
-    "Do we have a new cts every tcb we process for input");
-static int32_t in_ts_percision = 0;
+SYSCTL_COUNTER_U64(_net_inet_tcp_hpts, OID_AUTO, comb_wheel_wrap, CTLFLAG_RD,
+    &combined_wheel_wrap, "Number of times the wheel lagged enough to have an insert see wrap");
 
-SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, in_tspercision, CTLFLAG_RW,
-    &in_ts_percision, 0,
-    "Do we use percise timestamp for clients on input");
-static int32_t out_newts_every_tcb = 0;
+counter_u64_t wheel_wrap;
 
-SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, out_tsperpcb, CTLFLAG_RW,
-    &out_newts_every_tcb, 0,
-    "Do we have a new cts every tcb we process for output");
+SYSCTL_COUNTER_U64(_net_inet_tcp_hpts, OID_AUTO, wheel_wrap, CTLFLAG_RD,
+    &wheel_wrap, "Number of times the wheel lagged enough to have an insert see wrap");
+
 static int32_t out_ts_percision = 0;
 
 SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, out_tspercision, CTLFLAG_RW,
     &out_ts_percision, 0,
     "Do we use a percise timestamp for every output cts");
+SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, logging, CTLFLAG_RW,
+    &hpts_does_tp_logging, 0,
+    "Do we add to any tp that has logging on pacer logs");
 
-SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, maxsleep, CTLFLAG_RW,
+static int32_t max_pacer_loops = 10;
+SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, loopmax, CTLFLAG_RW,
+    &max_pacer_loops, 10,
+    "What is the maximum number of times the pacer will loop trying to catch up");
+
+#define HPTS_MAX_SLEEP_ALLOWED (NUM_OF_HPTSI_SLOTS/2)
+
+static uint32_t hpts_sleep_max = HPTS_MAX_SLEEP_ALLOWED;
+
+
+static int
+sysctl_net_inet_tcp_hpts_max_sleep(SYSCTL_HANDLER_ARGS)
+{
+	int error;
+	uint32_t new;
+	/* Validate on a local copy so a rejected write leaves hpts_sleep_max untouched. */
+	new = hpts_sleep_max;
+	error = sysctl_handle_int(oidp, &new, 0, req);
+	if (error == 0 && req->newptr) {
+		if ((new < (NUM_OF_HPTSI_SLOTS / 4)) ||
+		    (new > HPTS_MAX_SLEEP_ALLOWED)) 
+			error = EINVAL;	/* outside [NUM_OF_HPTSI_SLOTS / 4, HPTS_MAX_SLEEP_ALLOWED] */
+		else
+			hpts_sleep_max = new;
+	}
+	return (error);
+}
+
+SYSCTL_PROC(_net_inet_tcp_hpts, OID_AUTO, maxsleep,
+    CTLTYPE_UINT | CTLFLAG_RW,
     &hpts_sleep_max, 0,
-    "The maximum time the hpts will sleep <1 - 254>");
+    &sysctl_net_inet_tcp_hpts_max_sleep, "IU",
+    "Maximum time hpts will sleep");
 
 SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, minsleep, CTLFLAG_RW,
     &tcp_min_hptsi_time, 0,
     "The minimum time the hpts must sleep before processing more slots");
 
 SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, skip_swi, CTLFLAG_RW,
     &tcp_hpts_callout_skip_swi, 0,
     "Do we have the callout call directly to the hpts?");
 
 static void
-__tcp_hpts_log_it(struct tcp_hpts_entry *hpts, struct inpcb *inp, int event, uint32_t slot,
-    uint32_t ticknow, int32_t line)
+tcp_hpts_log(struct tcp_hpts_entry *hpts, struct tcpcb *tp, struct timeval *tv,
+	     int ticks_to_run, int idx)
 {
-	struct hpts_log *pl;
-
-	HPTS_MTX_ASSERT(hpts);
-	if (hpts->p_log == NULL)
-		return;
-	pl = &hpts->p_log[hpts->p_log_at];
-	hpts->p_log_at++;
-	if (hpts->p_log_at >= hpts->p_logsize) {
-		hpts->p_log_at = 0;
-		hpts->p_log_wrapped = 1;
-	}
-	pl->inp = inp;
-	if (inp) {
-		pl->t_paceslot = inp->inp_hptsslot;
-		pl->t_hptsreq = inp->inp_hpts_request;
-		pl->p_onhpts = inp->inp_in_hpts;
-		pl->p_oninput = inp->inp_in_input;
-	} else {
-		pl->t_paceslot = 0;
-		pl->t_hptsreq = 0;
-		pl->p_onhpts = 0;
-		pl->p_oninput = 0;
-	}
-	pl->is_notempty = 1;
-	pl->event = event;
-	pl->line = line;
-	pl->cts = tcp_get_usecs(NULL);
-	pl->p_curtick = hpts->p_curtick;
-	pl->p_prevtick = hpts->p_prevtick;
-	pl->p_on_queue_cnt = hpts->p_on_queue_cnt;
-	pl->ticknow = ticknow;
-	pl->slot_req = slot;
-	pl->p_nxt_slot = hpts->p_nxt_slot;
-	pl->p_cur_slot = hpts->p_cur_slot;
-	pl->p_hpts_sleep_time = hpts->p_hpts_sleep_time;
-	pl->p_flags = (hpts->p_cpu & 0x7f);
-	pl->p_flags <<= 7;
-	pl->p_flags |= (hpts->p_num & 0x7f);
-	pl->p_flags <<= 2;
-	if (hpts->p_hpts_active) {
-		pl->p_flags |= HPTS_HPTS_ACTIVE;
-	}
+	union tcp_log_stackspecific log;
+	
+	memset(&log.u_bbr, 0, sizeof(log.u_bbr));
+	log.u_bbr.flex1 = hpts->p_nxt_slot;
+	log.u_bbr.flex2 = hpts->p_cur_slot;
+	log.u_bbr.flex3 = hpts->p_prev_slot;
+	log.u_bbr.flex4 = idx;
+	log.u_bbr.flex5 = hpts->p_curtick;
+	log.u_bbr.flex6 = hpts->p_on_queue_cnt;
+	log.u_bbr.use_lt_bw = 1;
+	log.u_bbr.inflight = ticks_to_run;
+	log.u_bbr.applimited = hpts->overidden_sleep;
+	log.u_bbr.delivered = hpts->saved_curtick;
+	log.u_bbr.timeStamp = tcp_tv_to_usectick(tv);
+	log.u_bbr.epoch = hpts->saved_curslot;
+	log.u_bbr.lt_epoch = hpts->saved_prev_slot;
+	log.u_bbr.pkts_out = hpts->p_delayed_by;
+	log.u_bbr.lost = hpts->p_hpts_sleep_time;
+	log.u_bbr.cur_del_rate = hpts->p_runningtick;
+	TCP_LOG_EVENTP(tp, NULL,
+		       &tp->t_inpcb->inp_socket->so_rcv,
+		       &tp->t_inpcb->inp_socket->so_snd,
+		       BBR_LOG_HPTSDIAG, 0,
+		       0, &log, false, tv);
 }
 
-#define tcp_hpts_log_it(a, b, c, d, e) __tcp_hpts_log_it(a, b, c, d, e, __LINE__)
-
 static void
 hpts_timeout_swi(void *arg)
 {
 	struct tcp_hpts_entry *hpts;
 
 	hpts = (struct tcp_hpts_entry *)arg;
 	swi_sched(hpts->ie_cookie, 0);
 }
 
 static void
 hpts_timeout_dir(void *arg)
 {
 	tcp_hpts_thread(arg);
 }
 
 static inline void
 hpts_sane_pace_remove(struct tcp_hpts_entry *hpts, struct inpcb *inp, struct hptsh *head, int clear)
 {
 #ifdef INVARIANTS
 	if (mtx_owned(&hpts->p_mtx) == 0) {
 		/* We don't own the mutex? */
 		panic("%s: hpts:%p inp:%p no hpts mutex", __FUNCTION__, hpts, inp);
 	}
 	if (hpts->p_cpu != inp->inp_hpts_cpu) {
 		/* It is not the right cpu/mutex? */
 		panic("%s: hpts:%p inp:%p incorrect CPU", __FUNCTION__, hpts, inp);
 	}
 	if (inp->inp_in_hpts == 0) {
 		/* We are not on the hpts? */
 		panic("%s: hpts:%p inp:%p not on the hpts?", __FUNCTION__, hpts, inp);
 	}
-	if (TAILQ_EMPTY(head) &&
-	    (hpts->p_on_queue_cnt != 0)) {
-		/* We should not be empty with a queue count */
-		panic("%s hpts:%p hpts bucket empty but cnt:%d",
-		    __FUNCTION__, hpts, hpts->p_on_queue_cnt);
-	}
 #endif
 	TAILQ_REMOVE(head, inp, inp_hpts);
 	hpts->p_on_queue_cnt--;
 	if (hpts->p_on_queue_cnt < 0) {
 		/* Count should not go negative .. */
 #ifdef INVARIANTS
 		panic("Hpts goes negative inp:%p hpts:%p",
 		    inp, hpts);
 #endif
 		hpts->p_on_queue_cnt = 0;
 	}
 	if (clear) {
 		inp->inp_hpts_request = 0;
 		inp->inp_in_hpts = 0;
 	}
 }
 
 static inline void
 hpts_sane_pace_insert(struct tcp_hpts_entry *hpts, struct inpcb *inp, struct hptsh *head, int line, int noref)
 {
 #ifdef INVARIANTS
 	if (mtx_owned(&hpts->p_mtx) == 0) {
 		/* We don't own the mutex? */
 		panic("%s: hpts:%p inp:%p no hpts mutex", __FUNCTION__, hpts, inp);
 	}
 	if (hpts->p_cpu != inp->inp_hpts_cpu) {
 		/* It is not the right cpu/mutex? */
 		panic("%s: hpts:%p inp:%p incorrect CPU", __FUNCTION__, hpts, inp);
 	}
 	if ((noref == 0) && (inp->inp_in_hpts == 1)) {
 		/* We are already on the hpts? */
 		panic("%s: hpts:%p inp:%p already on the hpts?", __FUNCTION__, hpts, inp);
 	}
 #endif
 	TAILQ_INSERT_TAIL(head, inp, inp_hpts);
 	inp->inp_in_hpts = 1;
 	hpts->p_on_queue_cnt++;
 	if (noref == 0) {
 		in_pcbref(inp);
 	}
 }
 
 static inline void
 hpts_sane_input_remove(struct tcp_hpts_entry *hpts, struct inpcb *inp, int clear)
 {
 #ifdef INVARIANTS
 	if (mtx_owned(&hpts->p_mtx) == 0) {
 		/* We don't own the mutex? */
 		panic("%s: hpts:%p inp:%p no hpts mutex", __FUNCTION__, hpts, inp);
 	}
 	if (hpts->p_cpu != inp->inp_input_cpu) {
 		/* It is not the right cpu/mutex? */
 		panic("%s: hpts:%p inp:%p incorrect CPU", __FUNCTION__, hpts, inp);
 	}
 	if (inp->inp_in_input == 0) {
 		/* We are not on the input hpts? */
 		panic("%s: hpts:%p inp:%p not on the input hpts?", __FUNCTION__, hpts, inp);
 	}
 #endif
 	TAILQ_REMOVE(&hpts->p_input, inp, inp_input);
 	hpts->p_on_inqueue_cnt--;
 	if (hpts->p_on_inqueue_cnt < 0) {
 #ifdef INVARIANTS
 		panic("Hpts in goes negative inp:%p hpts:%p",
 		    inp, hpts);
 #endif
 		hpts->p_on_inqueue_cnt = 0;
 	}
 #ifdef INVARIANTS
 	if (TAILQ_EMPTY(&hpts->p_input) &&
 	    (hpts->p_on_inqueue_cnt != 0)) {
 		/* We should not be empty with a queue count */
 		panic("%s hpts:%p in_hpts input empty but cnt:%d",
 		    __FUNCTION__, hpts, hpts->p_on_inqueue_cnt);
 	}
 #endif
 	if (clear)
 		inp->inp_in_input = 0;
 }
 
 static inline void
 hpts_sane_input_insert(struct tcp_hpts_entry *hpts, struct inpcb *inp, int line)
 {
 #ifdef INVARIANTS
 	if (mtx_owned(&hpts->p_mtx) == 0) {
 		/* We don't own the mutex? */
 		panic("%s: hpts:%p inp:%p no hpts mutex", __FUNCTION__, hpts, inp);
 	}
 	if (hpts->p_cpu != inp->inp_input_cpu) {
 		/* It is not the right cpu/mutex? */
 		panic("%s: hpts:%p inp:%p incorrect CPU", __FUNCTION__, hpts, inp);
 	}
 	if (inp->inp_in_input == 1) {
 		/* We are already on the input hpts? */
 		panic("%s: hpts:%p inp:%p already on the input hpts?", __FUNCTION__, hpts, inp);
 	}
 #endif
 	TAILQ_INSERT_TAIL(&hpts->p_input, inp, inp_input);
 	inp->inp_in_input = 1;
 	hpts->p_on_inqueue_cnt++;
 	in_pcbref(inp);
 }
 
-static int
-sysctl_tcp_hpts_log(SYSCTL_HANDLER_ARGS)
-{
-	struct tcp_hpts_entry *hpts;
-	size_t sz;
-	int32_t logging_was, i;
-	int32_t error = 0;
-
-	/*
-	 * HACK: Turn off logging so no locks are required this really needs
-	 * a memory barrier :)
-	 */
-	logging_was = logging_on;
-	logging_on = 0;
-	if (!req->oldptr) {
-		/* How much? */
-		sz = 0;
-		for (i = 0; i < tcp_pace.rp_num_hptss; i++) {
-			hpts = tcp_pace.rp_ent[i];
-			if (hpts->p_log == NULL)
-				continue;
-			sz += (sizeof(struct hpts_log) * hpts->p_logsize);
-		}
-		error = SYSCTL_OUT(req, 0, sz);
-	} else {
-		for (i = 0; i < tcp_pace.rp_num_hptss; i++) {
-			hpts = tcp_pace.rp_ent[i];
-			if (hpts->p_log == NULL)
-				continue;
-			if (hpts->p_log_wrapped)
-				sz = (sizeof(struct hpts_log) * hpts->p_logsize);
-			else
-				sz = (sizeof(struct hpts_log) * hpts->p_log_at);
-			error = SYSCTL_OUT(req, hpts->p_log, sz);
-		}
-	}
-	logging_on = logging_was;
-	return error;
-}
-
-SYSCTL_PROC(_net_inet_tcp_hpts, OID_AUTO, log, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE,
-    0, 0, sysctl_tcp_hpts_log, "A", "tcp hptsi log");
-
-
 static void
 tcp_wakehpts(struct tcp_hpts_entry *hpts)
 {
 	HPTS_MTX_ASSERT(hpts);
-	swi_sched(hpts->ie_cookie, 0);
-	if (hpts->p_hpts_active == 2) {
-		/* Rare sleeping on a ENOBUF */
-		wakeup_one(hpts);
+	if (hpts->p_hpts_wake_scheduled == 0) {
+		hpts->p_hpts_wake_scheduled = 1;
+		swi_sched(hpts->ie_cookie, 0);
 	}
 }
 
 static void
 tcp_wakeinput(struct tcp_hpts_entry *hpts)
 {
 	HPTS_MTX_ASSERT(hpts);
-	swi_sched(hpts->ie_cookie, 0);
-	if (hpts->p_hpts_active == 2) {
-		/* Rare sleeping on a ENOBUF */
-		wakeup_one(hpts);
+	if (hpts->p_hpts_wake_scheduled == 0) {
+		hpts->p_hpts_wake_scheduled = 1;
+		swi_sched(hpts->ie_cookie, 0);
 	}
 }
 
 struct tcp_hpts_entry *
 tcp_cur_hpts(struct inpcb *inp)
 {
 	int32_t hpts_num;
 	struct tcp_hpts_entry *hpts;
 
 	hpts_num = inp->inp_hpts_cpu;
 	hpts = tcp_pace.rp_ent[hpts_num];
 	return (hpts);
 }
 
 struct tcp_hpts_entry *
 tcp_hpts_lock(struct inpcb *inp)
 {
 	struct tcp_hpts_entry *hpts;
 	int32_t hpts_num;
 
 again:
 	hpts_num = inp->inp_hpts_cpu;
 	hpts = tcp_pace.rp_ent[hpts_num];
 #ifdef INVARIANTS
 	if (mtx_owned(&hpts->p_mtx)) {
 		panic("Hpts:%p owns mtx prior-to lock line:%d",
 		    hpts, __LINE__);
 	}
 #endif
 	mtx_lock(&hpts->p_mtx);
 	if (hpts_num != inp->inp_hpts_cpu) {
 		mtx_unlock(&hpts->p_mtx);
 		goto again;
 	}
 	return (hpts);
 }
 
 struct tcp_hpts_entry *
 tcp_input_lock(struct inpcb *inp)
 {
 	struct tcp_hpts_entry *hpts;
 	int32_t hpts_num;
 
 again:
 	hpts_num = inp->inp_input_cpu;
 	hpts = tcp_pace.rp_ent[hpts_num];
 #ifdef INVARIANTS
 	if (mtx_owned(&hpts->p_mtx)) {
 		panic("Hpts:%p owns mtx prior-to lock line:%d",
 		    hpts, __LINE__);
 	}
 #endif
 	mtx_lock(&hpts->p_mtx);
 	if (hpts_num != inp->inp_input_cpu) {
 		mtx_unlock(&hpts->p_mtx);
 		goto again;
 	}
 	return (hpts);
 }
 
 static void
 tcp_remove_hpts_ref(struct inpcb *inp, struct tcp_hpts_entry *hpts, int line)
 {
 	int32_t add_freed;
 
 	if (inp->inp_flags2 & INP_FREED) {
 		/*
 		 * Need to play a special trick so that in_pcbrele_wlocked
 		 * does not return 1 when it really should have returned 0.
 		 */
 		add_freed = 1;
 		inp->inp_flags2 &= ~INP_FREED;
 	} else {
 		add_freed = 0;
 	}
 #ifndef INP_REF_DEBUG
 	if (in_pcbrele_wlocked(inp)) {
 		/*
 		 * This should not happen. We have the inpcb referred to by
 		 * the main socket (why we are called) and the hpts. It
 		 * should always return 0.
 		 */
 		panic("inpcb:%p release ret 1",
 		    inp);
 	}
 #else
 	if (__in_pcbrele_wlocked(inp, line)) {
 		/*
 		 * This should not happen. We have the inpcb referred to by
 		 * the main socket (why we are called) and the hpts. It
 		 * should always return 0.
 		 */
 		panic("inpcb:%p release ret 1",
 		    inp);
 	}
 #endif
 	if (add_freed) {
 		inp->inp_flags2 |= INP_FREED;
 	}
 }
 
 static void
 tcp_hpts_remove_locked_output(struct tcp_hpts_entry *hpts, struct inpcb *inp, int32_t flags, int32_t line)
 {
 	if (inp->inp_in_hpts) {
 		hpts_sane_pace_remove(hpts, inp, &hpts->p_hptss[inp->inp_hptsslot], 1);
 		tcp_remove_hpts_ref(inp, hpts, line);
 	}
 }
 
 static void
 tcp_hpts_remove_locked_input(struct tcp_hpts_entry *hpts, struct inpcb *inp, int32_t flags, int32_t line)
 {
 	HPTS_MTX_ASSERT(hpts);
 	if (inp->inp_in_input) {
 		hpts_sane_input_remove(hpts, inp, 1);
 		tcp_remove_hpts_ref(inp, hpts, line);
 	}
 }
 
 /*
  * Called normally with the INP_LOCKED but it
  * does not matter, the hpts lock is the key
  * but the lock order allows us to hold the
  * INP lock and then get the hpts lock.
  *
  * Valid values in the flags are
  * HPTS_REMOVE_OUTPUT - remove from the output of the hpts.
  * HPTS_REMOVE_INPUT - remove from the input of the hpts.
- * Note that you can or both values together and get two
- * actions.
+ * Note that you can use one or both values together 
+ * and get two actions.
  */
 void
 __tcp_hpts_remove(struct inpcb *inp, int32_t flags, int32_t line)
 {
 	struct tcp_hpts_entry *hpts;
 
 	INP_WLOCK_ASSERT(inp);
 	if (flags & HPTS_REMOVE_OUTPUT) {
 		hpts = tcp_hpts_lock(inp);
 		tcp_hpts_remove_locked_output(hpts, inp, flags, line);
 		mtx_unlock(&hpts->p_mtx);
 	}
 	if (flags & HPTS_REMOVE_INPUT) {
 		hpts = tcp_input_lock(inp);
 		tcp_hpts_remove_locked_input(hpts, inp, flags, line);
 		mtx_unlock(&hpts->p_mtx);
 	}
 }
 
 static inline int
-hpts_tick(struct tcp_hpts_entry *hpts, int32_t plus)
+hpts_tick(uint32_t wheel_tick, uint32_t plus)
 {
-	return ((hpts->p_prevtick + plus) % NUM_OF_HPTSI_SLOTS);
+	/*
+	 * Given a slot on the wheel, what slot
+	 * is that plus ticks out?
+	 */
+	KASSERT(wheel_tick < NUM_OF_HPTSI_SLOTS, ("Invalid tick %u not on wheel", wheel_tick));
+	return ((wheel_tick + plus) % NUM_OF_HPTSI_SLOTS);
 }
 
+static inline int
+tick_to_wheel(uint32_t cts_in_wticks)
+{
+	/*
+	 * Map a timestamp expressed in wheel ticks (10 usec increments)
+	 * onto a slot index in [0, NUM_OF_HPTSI_SLOTS) of the wheel.
+	 */
+	return (cts_in_wticks % NUM_OF_HPTSI_SLOTS);
+}
+
+static inline int
+hpts_ticks_diff(int prev_tick, int tick_now)
+{
+	/*
+	 * Forward distance, in slots, from prev_tick to tick_now;
+	 * both are expected to already be positions on the wheel.
+	 */
+	if (tick_now > prev_tick)
+		return (tick_now - prev_tick);
+	else if (tick_now == prev_tick)
+		/*
+		 * Special case: equal ticks are treated as the whole
+		 * wheel less one slot, never as a distance of zero.
+		 */
+		return (NUM_OF_HPTSI_SLOTS - 1);
+	else
+		return ((NUM_OF_HPTSI_SLOTS - prev_tick) + tick_now);
+}
+
+/*
+ * Given the current time mapped onto the wheel (wheel_tick),
+ * return the maximum distance forward that an insert may be
+ * placed without wrapping past either p_prev_slot or
+ * p_runningtick, depending on the hpts state. If target_tick
+ * is non-NULL it is filled with the tick location.
+ * A return of 0 means the wheel has wrapped; target_tick (when
+ * given) is then set to p_nxt_slot.
+ *
+ * Note if you do not give this function the current time
+ * (that you think it is) mapped to the wheel then the results
+ * will not be what you expect and could lead to invalid inserts.
+ */
+static inline int32_t
+max_ticks_available(struct tcp_hpts_entry *hpts, uint32_t wheel_tick, uint32_t *target_tick)
+{
+	uint32_t dis_to_travel, end_tick, pacer_to_now, avail_on_wheel;
+
+	if ((hpts->p_hpts_active == 1) &&
+	    (hpts->p_wheel_complete == 0)) {
+		end_tick = hpts->p_runningtick;
+		/* Back up one tick */
+		if (end_tick == 0)
+			end_tick = NUM_OF_HPTSI_SLOTS - 1;
+		else
+			end_tick--;
+		if (target_tick)
+			*target_tick = end_tick;
+	} else {
+		/*
+		 * For the case where we are
+		 * not active, or we have
+		 * completed the pass over
+		 * the wheel, we can use the
+		 * prev tick and subtract one from it. This puts us
+		 * as far out as possible on the wheel.
+		 */
+		end_tick = hpts->p_prev_slot;
+		if (end_tick == 0)
+			end_tick = NUM_OF_HPTSI_SLOTS - 1;
+		else
+			end_tick--;
+		if (target_tick)
+			*target_tick = end_tick;
+		/*
+		 * Now we have close to the full wheel left minus the
+		 * time it has been since the pacer went to sleep. Note
+		 * that wheel_tick, passed in, should be the current time
+		 * from the perspective of the caller, mapped to the wheel.
+		 */
+		if (hpts->p_prev_slot != wheel_tick)
+			dis_to_travel = hpts_ticks_diff(hpts->p_prev_slot, wheel_tick);
+		else
+			dis_to_travel = 1;
+		/*
+		 * dis_to_travel in this case is the space from when the
+		 * pacer stopped (p_prev_slot) and where our wheel_tick
+		 * is now. To know how many slots we can put it in we
+		 * subtract from the wheel size. We would not want
+		 * to place something after p_prev_slot or it will
+		 * get ran too soon.
+		 */
+		return (NUM_OF_HPTSI_SLOTS - dis_to_travel);
+	}
+	/*
+	 * So how many slots are open between p_runningtick -> p_cur_slot
+	 * that is what is currently un-available for insertion. Special
+	 * case when we are at the last slot, this gets 1, so that
+	 * the answer to how many slots are available is all but 1.
+	 */
+	if (hpts->p_runningtick == hpts->p_cur_slot)
+		dis_to_travel = 1;
+	else
+		dis_to_travel = hpts_ticks_diff(hpts->p_runningtick, hpts->p_cur_slot);
+	/*
+	 * How long has the pacer been running?
+	 */
+	if (hpts->p_cur_slot != wheel_tick) {
+		/* The pacer is a bit late */
+		pacer_to_now = hpts_ticks_diff(hpts->p_cur_slot, wheel_tick);
+	} else {
+		/* The pacer is right on time, now == pacers start time */
+		pacer_to_now = 0;
+	}
+	/*
+	 * To get the number left we can insert into we simply
+	 * subtract the distance the pacer has to run from how
+	 * many slots there are.
+	 */
+	avail_on_wheel = NUM_OF_HPTSI_SLOTS - dis_to_travel;
+	/*
+	 * Now how many of those we will eat due to the pacer's
+	 * time (p_cur_slot) of start being behind the
+	 * real time (wheel_tick)?
+	 */
+	if (avail_on_wheel <= pacer_to_now) {
+		/*
+		 * Wheel wrap, we can't fit on the wheel; that is
+		 * unusual, the system must be way overloaded!
+		 * Insert into the assured tick and return special "0".
+		 */
+		counter_u64_add(combined_wheel_wrap, 1);
+		if (target_tick)
+			*target_tick = hpts->p_nxt_slot;
+		return (0);
+	} else {
+		/*
+		 * We know how many slots are open
+		 * on the wheel (the reverse of what
+		 * is left to run). Take away the time
+		 * the pacer started to now (wheel_tick)
+		 * and that tells you how many slots are
+		 * open that can be inserted into that won't
+		 * be touched by the pacer until later.
+		 */
+		return (avail_on_wheel - pacer_to_now);
+	}
+}
+
 static int
 tcp_queue_to_hpts_immediate_locked(struct inpcb *inp, struct tcp_hpts_entry *hpts, int32_t line, int32_t noref)
 {
-	int32_t need_wake = 0;
-	uint32_t ticknow = 0;
-
+	uint32_t need_wake = 0;
+	
 	HPTS_MTX_ASSERT(hpts);
 	if (inp->inp_in_hpts == 0) {
 		/* Ok we need to set it on the hpts in the current slot */
-		if (hpts->p_hpts_active == 0) {
-			/* A sleeping hpts we want in next slot to run */
-			if (logging_on) {
-				tcp_hpts_log_it(hpts, inp, HPTSLOG_INSERT_SLEEPER, 0,
-				    hpts_tick(hpts, 1));
-			}
-			inp->inp_hptsslot = hpts_tick(hpts, 1);
-			inp->inp_hpts_request = 0;
-			if (logging_on) {
-				tcp_hpts_log_it(hpts, inp, HPTSLOG_SLEEP_BEFORE, 1, ticknow);
-			}
-			need_wake = 1;
+		inp->inp_hpts_request = 0;
+		if ((hpts->p_hpts_active == 0) ||
+		    (hpts->p_wheel_complete)) {
+			/*
+			 * A sleeping hpts we want in next slot to run 
+			 * note that in this state p_prev_slot == p_cur_slot
+			 */
+			inp->inp_hptsslot = hpts_tick(hpts->p_prev_slot, 1);
+			if ((hpts->p_on_min_sleep == 0) && (hpts->p_hpts_active == 0))
+				need_wake = 1;
 		} else if ((void *)inp == hpts->p_inp) {
 			/*
+			 * The hpts system is running and the caller
+			 * was awoken by the hpts system. 
 			 * We can't allow you to go into the same slot we
-			 * are in. We must put you out.
+			 * are in (we don't want a loop :-D).
 			 */
 			inp->inp_hptsslot = hpts->p_nxt_slot;
 		} else
-			inp->inp_hptsslot = hpts->p_cur_slot;
+			inp->inp_hptsslot = hpts->p_runningtick;
 		hpts_sane_pace_insert(hpts, inp, &hpts->p_hptss[inp->inp_hptsslot], line, noref);
-		inp->inp_hpts_request = 0;
-		if (logging_on) {
-			tcp_hpts_log_it(hpts, inp, HPTSLOG_IMMEDIATE, 0, 0);
-		}
 		if (need_wake) {
 			/*
 			 * Activate the hpts if it is sleeping and its
 			 * timeout is not 1.
 			 */
-			if (logging_on) {
-				tcp_hpts_log_it(hpts, inp, HPTSLOG_WAKEUP_HPTS, 0, ticknow);
-			}
 			hpts->p_direct_wake = 1;
 			tcp_wakehpts(hpts);
 		}
 	}
 	return (need_wake);
 }
 
 int
 __tcp_queue_to_hpts_immediate(struct inpcb *inp, int32_t line)
 {
 	int32_t ret;
 	struct tcp_hpts_entry *hpts;
 
 	INP_WLOCK_ASSERT(inp);
 	hpts = tcp_hpts_lock(inp);
 	ret = tcp_queue_to_hpts_immediate_locked(inp, hpts, line, 0);
 	mtx_unlock(&hpts->p_mtx);
 	return (ret);
 }
 
+#ifdef INVARIANTS
 static void
-tcp_hpts_insert_locked(struct tcp_hpts_entry *hpts, struct inpcb *inp, uint32_t slot, uint32_t cts, int32_t line,
-    struct hpts_diag *diag, int32_t noref)
+check_if_slot_would_be_wrong(struct tcp_hpts_entry *hpts, struct inpcb *inp, uint32_t inp_hptsslot, int line)
 {
-	int32_t need_new_to = 0;
-	int32_t need_wakeup = 0;
-	uint32_t largest_slot;
-	uint32_t ticknow = 0;
-	uint32_t slot_calc;
+	/*
+	 * Sanity checks for the pacer with invariants 
+	 * on insert.
+	 */
+	if (inp_hptsslot >= NUM_OF_HPTSI_SLOTS)
+		panic("hpts:%p inp:%p slot:%d > max",
+		      hpts, inp, inp_hptsslot);
+	if ((hpts->p_hpts_active) &&
+	    (hpts->p_wheel_complete == 0)) {
+		/* 
+		 * If the pacer is processing an arc
+		 * of the wheel, we need to make
+		 * sure we are not inserting within
+		 * that arc.
+		 */
+		int distance, yet_to_run;
 
+		distance = hpts_ticks_diff(hpts->p_runningtick, inp_hptsslot);
+		if (hpts->p_runningtick != hpts->p_cur_slot)
+			yet_to_run = hpts_ticks_diff(hpts->p_runningtick, hpts->p_cur_slot);
+		else
+			yet_to_run = 0;	/* processing last slot */
+		if (yet_to_run > distance) {
+			panic("hpts:%p inp:%p slot:%d distance:%d yet_to_run:%d rs:%d cs:%d",
+			      hpts, inp, inp_hptsslot,
+			      distance, yet_to_run,
+			      hpts->p_runningtick, hpts->p_cur_slot);
+		}
+	}
+}
+#endif
+
+static void
+tcp_hpts_insert_locked(struct tcp_hpts_entry *hpts, struct inpcb *inp, uint32_t slot, int32_t line,
+		       struct hpts_diag *diag, struct timeval *tv)
+{
+	uint32_t need_new_to = 0;
+	uint32_t wheel_cts, last_tick;
+	int32_t wheel_tick, maxticks;
+	int8_t need_wakeup = 0;
+
 	HPTS_MTX_ASSERT(hpts);
 	if (diag) {
 		memset(diag, 0, sizeof(struct hpts_diag));
 		diag->p_hpts_active = hpts->p_hpts_active;
+		diag->p_prev_slot = hpts->p_prev_slot;
+		diag->p_runningtick = hpts->p_runningtick;
 		diag->p_nxt_slot = hpts->p_nxt_slot;
 		diag->p_cur_slot = hpts->p_cur_slot;
+		diag->p_curtick = hpts->p_curtick;
+		diag->p_lasttick = hpts->p_lasttick;
 		diag->slot_req = slot;
+		diag->p_on_min_sleep = hpts->p_on_min_sleep;
+		diag->hpts_sleep_time = hpts->p_hpts_sleep_time;
 	}
-	if ((inp->inp_in_hpts == 0) || noref) {
-		inp->inp_hpts_request = slot;
+	if (inp->inp_in_hpts == 0) {
 		if (slot == 0) {
 			/* Immediate */
-			tcp_queue_to_hpts_immediate_locked(inp, hpts, line, noref);
+			tcp_queue_to_hpts_immediate_locked(inp, hpts, line, 0);
 			return;
 		}
-		if (hpts->p_hpts_active) {
-			/*
-			 * Its slot - 1 since nxt_slot is the next tick that
-			 * will go off since the hpts is awake
-			 */
-			if (logging_on) {
-				tcp_hpts_log_it(hpts, inp, HPTSLOG_INSERT_NORMAL, slot, 0);
-			}
-			/*
-			 * We want to make sure that we don't place a inp in
-			 * the range of p_cur_slot <-> p_nxt_slot. If we
-			 * take from p_nxt_slot to the end, plus p_cur_slot
-			 * and then take away 2, we will know how many is
-			 * the max slots we can use.
-			 */
-			if (hpts->p_nxt_slot > hpts->p_cur_slot) {
-				/*
-				 * Non-wrap case nxt_slot <-> cur_slot we
-				 * don't want to land in. So the diff gives
-				 * us what is taken away from the number of
-				 * slots.
+		/* Get the current time relative to the wheel */
+		wheel_cts = tcp_tv_to_hptstick(tv);
+		/* Map it onto the wheel */
+		wheel_tick = tick_to_wheel(wheel_cts);
+		/* Now what's the max we can place it at? */
+		maxticks = max_ticks_available(hpts, wheel_tick, &last_tick);
+		if (diag) {
+			diag->wheel_tick = wheel_tick;
+			diag->maxticks = maxticks;
+			diag->wheel_cts = wheel_cts;
+		}
+		if (maxticks == 0) {
+			/* The pacer is in a wheel wrap behind, yikes! */
+			if (slot > 1) {
+				/* 
+				 * Reduce by 1 to prevent a forever loop in
+				 * case something else is wrong. Note this
+				 * probably does not hurt: if the pacer really
+				 * is this far behind, we will be more than
+				 * 1 second late calling anyway.
 				 */
-				largest_slot = NUM_OF_HPTSI_SLOTS - (hpts->p_nxt_slot - hpts->p_cur_slot);
-			} else if (hpts->p_nxt_slot == hpts->p_cur_slot) {
-				largest_slot = NUM_OF_HPTSI_SLOTS - 2;
-			} else {
-				/*
-				 * Wrap case so the diff gives us the number
-				 * of slots that we can land in.
-				 */
-				largest_slot = hpts->p_cur_slot - hpts->p_nxt_slot;
+				slot--;
 			}
-			/*
-			 * We take away two so we never have a problem (20
-			 * usec's) out of 1024000 usecs
-			 */
-			largest_slot -= 2;
-			if (inp->inp_hpts_request > largest_slot) {
-				/*
-				 * Restrict max jump of slots and remember
-				 * leftover
-				 */
-				slot = largest_slot;
-				inp->inp_hpts_request -= largest_slot;
-			} else {
-				/* This one will run when we hit it */
-				inp->inp_hpts_request = 0;
-			}
-			if (hpts->p_nxt_slot == hpts->p_cur_slot)
-				slot_calc = (hpts->p_nxt_slot + slot) % NUM_OF_HPTSI_SLOTS;
-			else
-				slot_calc = (hpts->p_nxt_slot + slot - 1) % NUM_OF_HPTSI_SLOTS;
-			if (slot_calc == hpts->p_cur_slot) {
+			inp->inp_hptsslot = last_tick;
+			inp->inp_hpts_request = slot;
+		} else 	if (maxticks >= slot) {
+			/* It all fits on the wheel */
+			inp->inp_hpts_request = 0;
+			inp->inp_hptsslot = hpts_tick(wheel_tick, slot);
+		} else {
+			/* It does not fit */
+			inp->inp_hpts_request = slot - maxticks;
+			inp->inp_hptsslot = last_tick;
+		}
+		if (diag) {
+			diag->slot_remaining = inp->inp_hpts_request;
+			diag->inp_hptsslot = inp->inp_hptsslot;
+		}
 #ifdef INVARIANTS
-				/* TSNH */
-				panic("Hpts:%p impossible slot calculation slot_calc:%u slot:%u largest:%u\n",
-				    hpts, slot_calc, slot, largest_slot);
+		check_if_slot_would_be_wrong(hpts, inp, inp->inp_hptsslot, line);
 #endif
-				if (slot_calc)
-					slot_calc--;
-				else
-					slot_calc = NUM_OF_HPTSI_SLOTS - 1;
-			}
-			inp->inp_hptsslot = slot_calc;
-			if (diag) {
-				diag->inp_hptsslot = inp->inp_hptsslot;
-			}
-		} else {
+		hpts_sane_pace_insert(hpts, inp, &hpts->p_hptss[inp->inp_hptsslot], line, 0);
+		if ((hpts->p_hpts_active == 0) &&
+		    (inp->inp_hpts_request == 0) &&
+		    (hpts->p_on_min_sleep == 0)) {
 			/*
-			 * The hpts is sleeping, we need to figure out where
+			 * The hpts is sleeping and not on a minimum
+			 * sleep time, we need to figure out where
 			 * it will wake up at and if we need to reschedule
 			 * its time-out.
 			 */
 			uint32_t have_slept, yet_to_sleep;
-			uint32_t slot_now;
-			struct timeval tv;
 
-			ticknow = tcp_gethptstick(&tv);
-			slot_now = ticknow % NUM_OF_HPTSI_SLOTS;
-			/*
-			 * The user wants to be inserted at (slot_now +
-			 * slot) % NUM_OF_HPTSI_SLOTS, so lets set that up.
-			 */
-			largest_slot = NUM_OF_HPTSI_SLOTS - 2;
-			if (inp->inp_hpts_request > largest_slot) {
-				/* Adjust the residual in inp_hpts_request */
-				slot = largest_slot;
-				inp->inp_hpts_request -= largest_slot;
-			} else {
-				/* No residual it all fits */
-				inp->inp_hpts_request = 0;
-			}
-			inp->inp_hptsslot = (slot_now + slot) % NUM_OF_HPTSI_SLOTS;
-			if (diag) {
-				diag->slot_now = slot_now;
-				diag->inp_hptsslot = inp->inp_hptsslot;
-				diag->p_on_min_sleep = hpts->p_on_min_sleep;
-			}
-			if (logging_on) {
-				tcp_hpts_log_it(hpts, inp, HPTSLOG_INSERT_SLEEPER, slot, ticknow);
-			}
 			/* Now do we need to restart the hpts's timer? */
-			if (TSTMP_GT(ticknow, hpts->p_curtick))
-				have_slept = ticknow - hpts->p_curtick;
-			else
-				have_slept = 0;
-			if (have_slept < hpts->p_hpts_sleep_time) {
-				/* This should be what happens */
+			have_slept = hpts_ticks_diff(hpts->p_prev_slot, wheel_tick);
+			if (have_slept < hpts->p_hpts_sleep_time)
 				yet_to_sleep = hpts->p_hpts_sleep_time - have_slept;
-			} else {
+			else {
 				/* We are over-due */
 				yet_to_sleep = 0;
 				need_wakeup = 1;
 			}
 			if (diag) {
 				diag->have_slept = have_slept;
 				diag->yet_to_sleep = yet_to_sleep;
-				diag->hpts_sleep_time = hpts->p_hpts_sleep_time;
 			}
-			if ((hpts->p_on_min_sleep == 0) && (yet_to_sleep > slot)) {
+			if (yet_to_sleep &&
+			    (yet_to_sleep > slot)) {
 				/*
-				 * We need to reschedule the hptss time-out.
+				 * We need to reschedule the hpts's time-out.
 				 */
 				hpts->p_hpts_sleep_time = slot;
 				need_new_to = slot * HPTS_TICKS_PER_USEC;
 			}
 		}
-		hpts_sane_pace_insert(hpts, inp, &hpts->p_hptss[inp->inp_hptsslot], line, noref);
-		if (logging_on) {
-			tcp_hpts_log_it(hpts, inp, HPTSLOG_INSERTED, slot, ticknow);
-		}
 		/*
 		 * Now how far is the hpts sleeping to? if active is 1, its
 		 * up and ticking we do nothing, otherwise we may need to
 		 * reschedule its callout if need_new_to is set from above.
 		 */
 		if (need_wakeup) {
-			if (logging_on) {
-				tcp_hpts_log_it(hpts, inp, HPTSLOG_RESCHEDULE, 1, 0);
-			}
 			hpts->p_direct_wake = 1;
 			tcp_wakehpts(hpts);
 			if (diag) {
 				diag->need_new_to = 0;
 				diag->co_ret = 0xffff0000;
 			}
 		} else if (need_new_to) {
 			int32_t co_ret;
 			struct timeval tv;
 			sbintime_t sb;
 
 			tv.tv_sec = 0;
 			tv.tv_usec = 0;
 			while (need_new_to > HPTS_USEC_IN_SEC) {
 				tv.tv_sec++;
 				need_new_to -= HPTS_USEC_IN_SEC;
 			}
 			tv.tv_usec = need_new_to;
 			sb = tvtosbt(tv);
 			if (tcp_hpts_callout_skip_swi == 0) {
 				co_ret = callout_reset_sbt_on(&hpts->co, sb, 0,
 				    hpts_timeout_swi, hpts, hpts->p_cpu,
 				    (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision)));
 			} else {
 				co_ret = callout_reset_sbt_on(&hpts->co, sb, 0,
 				    hpts_timeout_dir, hpts,
 				    hpts->p_cpu,
 				    C_PREL(tcp_hpts_precision));
 			}
 			if (diag) {
 				diag->need_new_to = need_new_to;
 				diag->co_ret = co_ret;
 			}
 		}
 	} else {
 #ifdef INVARIANTS
 		panic("Hpts:%p tp:%p already on hpts and add?", hpts, inp);
 #endif
 	}
 }
 
 uint32_t
-tcp_hpts_insert_diag(struct inpcb *inp, uint32_t slot, int32_t line, struct hpts_diag *diag){
+tcp_hpts_insert_diag(struct inpcb *inp, uint32_t slot, int32_t line, struct hpts_diag *diag)
+{
 	struct tcp_hpts_entry *hpts;
-	uint32_t slot_on, cts;
+	uint32_t slot_on;
 	struct timeval tv;
 
 	/*
 	 * We now return the next-slot the hpts will be on, beyond its
 	 * current run (if up) or where it was when it stopped if it is
 	 * sleeping.
 	 */
 	INP_WLOCK_ASSERT(inp);
 	hpts = tcp_hpts_lock(inp);
-	if (in_ts_percision)
-		microuptime(&tv);
-	else
-		getmicrouptime(&tv);
-	cts = tcp_tv_to_usectick(&tv);
-	tcp_hpts_insert_locked(hpts, inp, slot, cts, line, diag, 0);
+	microuptime(&tv);
+	tcp_hpts_insert_locked(hpts, inp, slot, line, diag, &tv);
 	slot_on = hpts->p_nxt_slot;
 	mtx_unlock(&hpts->p_mtx);
 	return (slot_on);
 }
 
 uint32_t
 __tcp_hpts_insert(struct inpcb *inp, uint32_t slot, int32_t line){
 	return (tcp_hpts_insert_diag(inp, slot, line, NULL));
 }
-
 int
 __tcp_queue_to_input_locked(struct inpcb *inp, struct tcp_hpts_entry *hpts, int32_t line)
 {
 	int32_t retval = 0;
 
 	HPTS_MTX_ASSERT(hpts);
 	if (inp->inp_in_input == 0) {
 		/* Ok we need to set it on the hpts in the current slot */
 		hpts_sane_input_insert(hpts, inp, line);
 		retval = 1;
 		if (hpts->p_hpts_active == 0) {
 			/*
 			 * Activate the hpts if it is sleeping.
 			 */
-			if (logging_on) {
-				tcp_hpts_log_it(hpts, inp, HPTSLOG_WAKEUP_INPUT, 0, 0);
-			}
 			retval = 2;
 			hpts->p_direct_wake = 1;
 			tcp_wakeinput(hpts);
 		}
 	} else if (hpts->p_hpts_active == 0) {
 		retval = 4;
 		hpts->p_direct_wake = 1;
 		tcp_wakeinput(hpts);
 	}
 	return (retval);
 }
 
-void
-tcp_queue_pkt_to_input(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
-    int32_t tlen, int32_t drop_hdrlen, uint8_t iptos)
-{
-	/* Setup packet for input first */
-	INP_WLOCK_ASSERT(tp->t_inpcb);
-	m->m_pkthdr.pace_thoff = (uint16_t) ((caddr_t)th - mtod(m, caddr_t));
-	m->m_pkthdr.pace_tlen = (uint16_t) tlen;
-	m->m_pkthdr.pace_drphdrlen = drop_hdrlen;
-	m->m_pkthdr.pace_tos = iptos;
-	m->m_pkthdr.pace_lock = (curthread->td_epochnest != 0);
-	if (tp->t_in_pkt == NULL) {
-		tp->t_in_pkt = m;
-		tp->t_tail_pkt = m;
-	} else {
-		tp->t_tail_pkt->m_nextpkt = m;
-		tp->t_tail_pkt = m;
-	}
-}
-
-
 int32_t
-__tcp_queue_to_input(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
-    int32_t tlen, int32_t drop_hdrlen, uint8_t iptos, int32_t line){
+__tcp_queue_to_input(struct inpcb *inp, int line)
+{
 	struct tcp_hpts_entry *hpts;
 	int32_t ret;
 
-	tcp_queue_pkt_to_input(tp, m, th, tlen, drop_hdrlen, iptos);
-	hpts = tcp_input_lock(tp->t_inpcb);
-	ret = __tcp_queue_to_input_locked(tp->t_inpcb, hpts, line);
+	hpts = tcp_input_lock(inp);
+	ret = __tcp_queue_to_input_locked(inp, hpts, line);
 	mtx_unlock(&hpts->p_mtx);
 	return (ret);
 }
 
 void
 __tcp_set_inp_to_drop(struct inpcb *inp, uint16_t reason, int32_t line)
 {
 	struct tcp_hpts_entry *hpts;
 	struct tcpcb *tp;
 
 	tp = intotcpcb(inp);
 	hpts = tcp_input_lock(tp->t_inpcb);
 	if (inp->inp_in_input == 0) {
 		/* Ok we need to set it on the hpts in the current slot */
 		hpts_sane_input_insert(hpts, inp, line);
 		if (hpts->p_hpts_active == 0) {
 			/*
 			 * Activate the hpts if it is sleeping.
 			 */
 			hpts->p_direct_wake = 1;
 			tcp_wakeinput(hpts);
 		}
 	} else if (hpts->p_hpts_active == 0) {
 		hpts->p_direct_wake = 1;
 		tcp_wakeinput(hpts);
 	}
 	inp->inp_hpts_drop_reas = reason;
 	mtx_unlock(&hpts->p_mtx);
 }
 
 static uint16_t
 hpts_random_cpu(struct inpcb *inp){
 	/*
 	 * No flow type set distribute the load randomly.
 	 */
 	uint16_t cpuid;
 	uint32_t ran;
 
 	/*
 	 * If one has been set use it i.e. we want both in and out on the
 	 * same hpts.
 	 */
 	if (inp->inp_input_cpu_set) {
 		return (inp->inp_input_cpu);
 	} else if (inp->inp_hpts_cpu_set) {
 		return (inp->inp_hpts_cpu);
 	}
 	/* Nothing set use a random number */
 	ran = arc4random();
 	cpuid = (ran & 0xffff) % mp_ncpus;
 	return (cpuid);
 }
 
 static uint16_t
 hpts_cpuid(struct inpcb *inp){
 	u_int cpuid;
 
 
 	/*
 	 * If one has been set use it i.e. we want both in and out on the
 	 * same hpts.
 	 */
 	if (inp->inp_input_cpu_set) {
 		return (inp->inp_input_cpu);
 	} else if (inp->inp_hpts_cpu_set) {
 		return (inp->inp_hpts_cpu);
 	}
 	/* If one is set the other must be the same */
 #ifdef	RSS
 	cpuid = rss_hash2cpuid(inp->inp_flowid, inp->inp_flowtype);
 	if (cpuid == NETISR_CPUID_NONE)
 		return (hpts_random_cpu(inp));
 	else
 		return (cpuid);
 #else
 	/*
 	 * We don't have a flowid -> cpuid mapping, so cheat and just map
 	 * unknown cpuids to curcpu.  Not the best, but apparently better
 	 * than defaulting to swi 0.
 	 */
 	if (inp->inp_flowtype != M_HASHTYPE_NONE) {
 		cpuid = inp->inp_flowid % mp_ncpus;
 		return (cpuid);
 	}
 	cpuid = hpts_random_cpu(inp);
 	return (cpuid);
 #endif
 }
 
+static void
+tcp_drop_in_pkts(struct tcpcb *tp)
+{
+	struct mbuf *m, *n;
+	
+	m = tp->t_in_pkt;
+	if (m)
+		n = m->m_nextpkt;
+	else
+		n = NULL;
+	tp->t_in_pkt = NULL;
+	while (m) {
+		m_freem(m);
+		m = n;
+		if (m)
+			n = m->m_nextpkt;
+	}
+}
+
 /*
  * Do NOT try to optimize the processing of inp's
  * by first pulling off all the inp's into a temporary
  * list (e.g. TAILQ_CONCAT). If you do that the subtle
  * interactions of switching CPU's will kill because of
  * problems in the linked list manipulation. Basically
  * you would switch cpu's with the hpts mutex locked
  * but then while you were processing one of the inp's
  * some other one that you switch will get a new
  * packet on the different CPU. It will insert it
- * on the new hptss input list. Creating a temporary
+ * on the new hpts's input list. Creating a temporary
  * link in the inp will not fix it either, since
  * the other hpts will be doing the same thing and
  * you will both end up using the temporary link.
  *
  * You will die in an ASSERT for tailq corruption if you
  * run INVARIANTS or you will die horribly without
  * INVARIANTS in some unknown way with a corrupt linked
  * list.
  */
 static void
 tcp_input_data(struct tcp_hpts_entry *hpts, struct timeval *tv)
 {
-	struct mbuf *m, *n;
 	struct tcpcb *tp;
 	struct inpcb *inp;
 	uint16_t drop_reason;
 	int16_t set_cpu;
 	uint32_t did_prefetch = 0;
-	int32_t ti_locked = TI_UNLOCKED;
+	int dropped;
 	struct epoch_tracker et;
 
 	HPTS_MTX_ASSERT(hpts);
+#ifndef VIMAGE
+	INP_INFO_RLOCK_ET(&V_tcbinfo, et);
+#endif
 	while ((inp = TAILQ_FIRST(&hpts->p_input)) != NULL) {
 		HPTS_MTX_ASSERT(hpts);
 		hpts_sane_input_remove(hpts, inp, 0);
 		if (inp->inp_input_cpu_set == 0) {
 			set_cpu = 1;
 		} else {
 			set_cpu = 0;
 		}
 		hpts->p_inp = inp;
 		drop_reason = inp->inp_hpts_drop_reas;
 		inp->inp_in_input = 0;
 		mtx_unlock(&hpts->p_mtx);
-		CURVNET_SET(inp->inp_vnet);
-		if (drop_reason) {
-			INP_INFO_RLOCK_ET(&V_tcbinfo, et);
-			ti_locked = TI_RLOCKED;
-		} else {
-			ti_locked = TI_UNLOCKED;
-		}
 		INP_WLOCK(inp);
+#ifdef VIMAGE
+		CURVNET_SET(inp->inp_vnet);
+		INP_INFO_RLOCK_ET(&V_tcbinfo, et);
+#endif
 		if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) ||
 		    (inp->inp_flags2 & INP_FREED)) {
 out:
 			hpts->p_inp = NULL;
-			if (ti_locked == TI_RLOCKED) {
-				INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
-			}
 			if (in_pcbrele_wlocked(inp) == 0) {
 				INP_WUNLOCK(inp);
 			}
-			ti_locked = TI_UNLOCKED;
+#ifdef VIMAGE
+			INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
 			CURVNET_RESTORE();
+#endif
 			mtx_lock(&hpts->p_mtx);
 			continue;
 		}
 		tp = intotcpcb(inp);
 		if ((tp == NULL) || (tp->t_inpcb == NULL)) {
 			goto out;
 		}
 		if (drop_reason) {
 			/* This tcb is being destroyed for drop_reason */
-			m = tp->t_in_pkt;
-			if (m)
-				n = m->m_nextpkt;
-			else
-				n = NULL;
-			tp->t_in_pkt = NULL;
-			while (m) {
-				m_freem(m);
-				m = n;
-				if (m)
-					n = m->m_nextpkt;
-			}
+			tcp_drop_in_pkts(tp);
 			tp = tcp_drop(tp, drop_reason);
-			INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
 			if (tp == NULL) {
 				INP_WLOCK(inp);
 			}
 			if (in_pcbrele_wlocked(inp) == 0)
 				INP_WUNLOCK(inp);
+#ifdef VIMAGE
+			INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
 			CURVNET_RESTORE();
+#endif
 			mtx_lock(&hpts->p_mtx);
 			continue;
 		}
 		if (set_cpu) {
 			/*
 			 * Setup so the next time we will move to the right
 			 * CPU. This should be a rare event. It will
 			 * sometimes happens when we are the client side
 			 * (usually not the server). Somehow tcp_output()
 			 * gets called before the tcp_do_segment() sets the
 			 * intial state. This means the r_cpu and r_hpts_cpu
 			 * is 0. We get on the hpts, and then tcp_input()
 			 * gets called setting up the r_cpu to the correct
 			 * value. The hpts goes off and sees the mis-match.
 			 * We simply correct it here and the CPU will switch
 			 * to the new hpts nextime the tcb gets added to the
 			 * the hpts (not this time) :-)
 			 */
 			tcp_set_hpts(inp);
 		}
-		m = tp->t_in_pkt;
-		n = NULL;
-		if (m != NULL &&
-		    (m->m_pkthdr.pace_lock == TI_RLOCKED ||
-		    tp->t_state != TCPS_ESTABLISHED)) {
-			ti_locked = TI_RLOCKED;
-			INP_INFO_RLOCK_ET(&V_tcbinfo, et);
-			m = tp->t_in_pkt;
-		}
-		if (in_newts_every_tcb) {
-			if (in_ts_percision)
-				microuptime(tv);
-			else
-				getmicrouptime(tv);
-		}
 		if (tp->t_fb_ptr != NULL) {
 			kern_prefetch(tp->t_fb_ptr, &did_prefetch);
 			did_prefetch = 1;
 		}
-		/* Any input work to do, if so do it first */
-		if ((m != NULL) && (m == tp->t_in_pkt)) {
-			struct tcphdr *th;
-			int32_t tlen, drop_hdrlen, nxt_pkt;
-			uint8_t iptos;
-
-			n = m->m_nextpkt;
-			tp->t_in_pkt = tp->t_tail_pkt = NULL;
-			while (m) {
-				th = (struct tcphdr *)(mtod(m, caddr_t)+m->m_pkthdr.pace_thoff);
-				tlen = m->m_pkthdr.pace_tlen;
-				drop_hdrlen = m->m_pkthdr.pace_drphdrlen;
-				iptos = m->m_pkthdr.pace_tos;
-				m->m_nextpkt = NULL;
-				if (n)
-					nxt_pkt = 1;
-				else
-					nxt_pkt = 0;
-				inp->inp_input_calls = 1;
-				if (tp->t_fb->tfb_tcp_hpts_do_segment) {
-					/* Use the hpts specific do_segment */
-					(*tp->t_fb->tfb_tcp_hpts_do_segment) (m, th, inp->inp_socket,
-					    tp, drop_hdrlen,
-					    tlen, iptos, nxt_pkt, tv);
-				} else {
-					/* Use the default do_segment */
-					(*tp->t_fb->tfb_tcp_do_segment) (m, th, inp->inp_socket,
-					    tp, drop_hdrlen,
-						tlen, iptos);
-				}
-				if (ti_locked == TI_RLOCKED)
-					INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
-				/*
-				 * Do segment returns unlocked we need the
-				 * lock again but we also need some kasserts
-				 * here.
-				 */
-				INP_INFO_WUNLOCK_ASSERT(&V_tcbinfo);
-				INP_UNLOCK_ASSERT(inp);
-				m = n;
-				if (m)
-					n = m->m_nextpkt;
-				if (m != NULL &&
-				    m->m_pkthdr.pace_lock == TI_RLOCKED) {
-					INP_INFO_RLOCK_ET(&V_tcbinfo, et);
-					ti_locked = TI_RLOCKED;
-				} else
-					ti_locked = TI_UNLOCKED;
+		if ((inp->inp_flags2 & INP_SUPPORTS_MBUFQ) && tp->t_in_pkt) {
+			if (inp->inp_in_input)
+				tcp_hpts_remove(inp, HPTS_REMOVE_INPUT);
+			dropped = (*tp->t_fb->tfb_do_queued_segments)(inp->inp_socket, tp, 0);
+			if (dropped) {
+				/* Re-acquire the wlock so we can release the reference */
 				INP_WLOCK(inp);
-				/*
-				 * Since we have an opening here we must
-				 * re-check if the tcb went away while we
-				 * were getting the lock(s).
-				 */
-				if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) ||
-				    (inp->inp_flags2 & INP_FREED)) {
-					while (m) {
-						m_freem(m);
-						m = n;
-						if (m)
-							n = m->m_nextpkt;
-					}
-					goto out;
-				}
-				/*
-				 * Now that we hold the INP lock, check if
-				 * we need to upgrade our lock.
-				 */
-				if (ti_locked == TI_UNLOCKED &&
-				    (tp->t_state != TCPS_ESTABLISHED)) {
-					ti_locked = TI_RLOCKED;
-					INP_INFO_RLOCK_ET(&V_tcbinfo, et);
-				}
-			}	/** end while(m) */
-		}		/** end if ((m != NULL)  && (m == tp->t_in_pkt)) */
+			}
+		} else if (tp->t_in_pkt) {
+			/* 
+			 * We reach here only if we had a 
+			 * stack that supported INP_SUPPORTS_MBUFQ
+			 * and then somehow switched to a stack that
+			 * does not. The packets are basically stranded
+			 * and would hang with the connection until
+			 * cleanup without this code. It's not the
+			 * best way but I know of no other way to
+			 * handle it since the stack needs functions
+			 * it does not have to handle queued packets.
+			 */
+			tcp_drop_in_pkts(tp);
+		}
 		if (in_pcbrele_wlocked(inp) == 0)
 			INP_WUNLOCK(inp);
-		if (ti_locked == TI_RLOCKED)
-			INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
-		INP_INFO_WUNLOCK_ASSERT(&V_tcbinfo);
 		INP_UNLOCK_ASSERT(inp);
-		ti_locked = TI_UNLOCKED;
+#ifdef VIMAGE
+		INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
+		CURVNET_RESTORE();
+#endif
 		mtx_lock(&hpts->p_mtx);
 		hpts->p_inp = NULL;
-		CURVNET_RESTORE();
 	}
+#ifndef VIMAGE
+	INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
+	INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
+#endif
 }
 
-static int
-tcp_hpts_est_run(struct tcp_hpts_entry *hpts)
-{
-	int32_t ticks_to_run;
-
-	if (hpts->p_prevtick && (SEQ_GT(hpts->p_curtick, hpts->p_prevtick))) {
-		ticks_to_run = hpts->p_curtick - hpts->p_prevtick;
-		if (ticks_to_run >= (NUM_OF_HPTSI_SLOTS - 1)) {
-			ticks_to_run = NUM_OF_HPTSI_SLOTS - 2;
-		}
-	} else {
-		if (hpts->p_prevtick == hpts->p_curtick) {
-			/* This happens when we get woken up right away */
-			return (-1);
-		}
-		ticks_to_run = 1;
-	}
-	/* Set in where we will be when we catch up */
-	hpts->p_nxt_slot = (hpts->p_cur_slot + ticks_to_run) % NUM_OF_HPTSI_SLOTS;
-	if (hpts->p_nxt_slot == hpts->p_cur_slot) {
-		panic("Impossible math -- hpts:%p p_nxt_slot:%d p_cur_slot:%d ticks_to_run:%d",
-		    hpts, hpts->p_nxt_slot, hpts->p_cur_slot, ticks_to_run);
-	}
-	return (ticks_to_run);
-}
-
 static void
-tcp_hptsi(struct tcp_hpts_entry *hpts, struct timeval *ctick)
+tcp_hptsi(struct tcp_hpts_entry *hpts)
 {
+	struct epoch_tracker et;
 	struct tcpcb *tp;
 	struct inpcb *inp = NULL, *ninp;
 	struct timeval tv;
-	int32_t ticks_to_run, i, error, tick_now, interum_tick;
+	int32_t ticks_to_run, i, error;
 	int32_t paced_cnt = 0;
+	int32_t loop_cnt = 0;
 	int32_t did_prefetch = 0;
 	int32_t prefetch_ninp = 0;
 	int32_t prefetch_tp = 0;
-	uint32_t cts;
+	int32_t wrap_loop_cnt = 0;
 	int16_t set_cpu;
 
 	HPTS_MTX_ASSERT(hpts);
-	hpts->p_curtick = tcp_tv_to_hptstick(ctick);
-	cts = tcp_tv_to_usectick(ctick);
-	memcpy(&tv, ctick, sizeof(struct timeval));
-	hpts->p_cur_slot = hpts_tick(hpts, 1);
+	/* record previous info for any logging */
+	hpts->saved_lasttick = hpts->p_lasttick;
+	hpts->saved_curtick = hpts->p_curtick;
+	hpts->saved_curslot = hpts->p_cur_slot;
+	hpts->saved_prev_slot = hpts->p_prev_slot;
 
-	/* Figure out if we had missed ticks */
+	hpts->p_lasttick = hpts->p_curtick;
+	hpts->p_curtick = tcp_gethptstick(&tv);
+	hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick);
+	if ((hpts->p_on_queue_cnt == 0) ||
+	    (hpts->p_lasttick == hpts->p_curtick)) {
+		/* 
+		 * No time has yet passed, 
+		 * or nothing to do.
+		 */
+		hpts->p_prev_slot = hpts->p_cur_slot;
+		hpts->p_lasttick = hpts->p_curtick;
+		goto no_run;
+	}
 again:
+	hpts->p_wheel_complete = 0;
 	HPTS_MTX_ASSERT(hpts);
-	ticks_to_run = tcp_hpts_est_run(hpts);
-	if (!TAILQ_EMPTY(&hpts->p_input)) {
-		tcp_input_data(hpts, &tv);
+	ticks_to_run = hpts_ticks_diff(hpts->p_prev_slot, hpts->p_cur_slot);
+	if (((hpts->p_curtick - hpts->p_lasttick) > ticks_to_run) &&
+	    (hpts->p_on_queue_cnt != 0)) {
+		/* 
+		 * Wheel wrap is occurring, basically we
+		 * are behind and the distance between
+		 * run's has spread so much it has exceeded
+		 * the time on the wheel (1.024 seconds). This
+		 * is ugly and should NOT be happening. We
+		 * need to run the entire wheel. We last processed
+		 * p_prev_slot, so that needs to be the last slot
+		 * we run. The next slot after that should be our
+		 * reserved first slot for new, and then starts
+		 * the running position. Now the problem is the
+		 * reserved "not to yet" place does not exist
+		 * and there may be inp's in there that need
+		 * running. We can merge those into the
+		 * first slot at the head.
+		 */
+		wrap_loop_cnt++;
+		hpts->p_nxt_slot = hpts_tick(hpts->p_prev_slot, 1);
+		hpts->p_runningtick = hpts_tick(hpts->p_prev_slot, 2);
+		/* 
+		 * Adjust p_cur_slot to be where we are starting from
+		 * hopefully we will catch up (fat chance if something
+		 * is broken this bad :( )
+		 */
+		hpts->p_cur_slot = hpts->p_prev_slot;
+		/*
+		 * The next slot has guys to run too, and that would
+		 * be where we would normally start, lets move them into
+		 * the next slot (p_prev_slot + 2) so that we will
+		 * run them, the extra 10usecs of late (by being
+		 * put behind) does not really matter in this situation.
+		 */
+#ifdef INVARIANTS
+		/* 
+		 * To prevent a panic we need to update the inpslot to the
+		 * new location. This is safe since it takes both the
+		 * INP lock and the pacer mutex to change the inp_hptsslot.
+		 */
+		TAILQ_FOREACH(inp, &hpts->p_hptss[hpts->p_nxt_slot], inp_hpts) {
+			inp->inp_hptsslot = hpts->p_runningtick;
+		}
+#endif
+		TAILQ_CONCAT(&hpts->p_hptss[hpts->p_runningtick],
+			     &hpts->p_hptss[hpts->p_nxt_slot], inp_hpts);
+		ticks_to_run = NUM_OF_HPTSI_SLOTS - 1;
+		counter_u64_add(wheel_wrap, 1);
+	} else {
+		/* 
+		 * Nxt slot is always one after p_runningtick though
+		 * it's not usually used unless we are doing wheel wrap.
+		 */
+		hpts->p_nxt_slot = hpts->p_prev_slot;
+		hpts->p_runningtick = hpts_tick(hpts->p_prev_slot, 1);
 	}
 #ifdef INVARIANTS
 	if (TAILQ_EMPTY(&hpts->p_input) &&
 	    (hpts->p_on_inqueue_cnt != 0)) {
 		panic("tp:%p in_hpts input empty but cnt:%d",
-		    hpts, hpts->p_on_inqueue_cnt);
+		      hpts, hpts->p_on_inqueue_cnt);
 	}
 #endif
 	HPTS_MTX_ASSERT(hpts);
-	/* Reset the ticks to run and time if we need too */
-	interum_tick = tcp_gethptstick(&tv);
-	if (interum_tick != hpts->p_curtick) {
-		/* Save off the new time we execute to */
-		*ctick = tv;
-		hpts->p_curtick = interum_tick;
-		cts = tcp_tv_to_usectick(&tv);
-		hpts->p_cur_slot = hpts_tick(hpts, 1);
-		ticks_to_run = tcp_hpts_est_run(hpts);
-	}
-	if (ticks_to_run == -1) {
-		goto no_run;
-	}
-	if (logging_on) {
-		tcp_hpts_log_it(hpts, inp, HPTSLOG_SETTORUN, ticks_to_run, 0);
-	}
 	if (hpts->p_on_queue_cnt == 0) {
 		goto no_one;
 	}
 	HPTS_MTX_ASSERT(hpts);
+#ifndef VIMAGE
+	INP_INFO_RLOCK_ET(&V_tcbinfo, et);
+#endif
 	for (i = 0; i < ticks_to_run; i++) {
 		/*
 		 * Calculate our delay, if there are no extra ticks there
-		 * was not any
+		 * was not any (i.e. if ticks_to_run == 1, no delay).
 		 */
 		hpts->p_delayed_by = (ticks_to_run - (i + 1)) * HPTS_TICKS_PER_USEC;
 		HPTS_MTX_ASSERT(hpts);
-		while ((inp = TAILQ_FIRST(&hpts->p_hptss[hpts->p_cur_slot])) != NULL) {
+		while ((inp = TAILQ_FIRST(&hpts->p_hptss[hpts->p_runningtick])) != NULL) {
 			/* For debugging */
-			if (logging_on) {
-				tcp_hpts_log_it(hpts, inp, HPTSLOG_HPTSI, ticks_to_run, i);
-			}
 			hpts->p_inp = inp;
 			paced_cnt++;
-			if (hpts->p_cur_slot != inp->inp_hptsslot) {
+#ifdef INVARIANTS
+			if (hpts->p_runningtick != inp->inp_hptsslot) {
 				panic("Hpts:%p inp:%p slot mis-aligned %u vs %u",
-				    hpts, inp, hpts->p_cur_slot, inp->inp_hptsslot);
+				      hpts, inp, hpts->p_runningtick, inp->inp_hptsslot);
 			}
+#endif
 			/* Now pull it */
 			if (inp->inp_hpts_cpu_set == 0) {
 				set_cpu = 1;
 			} else {
 				set_cpu = 0;
 			}
-			hpts_sane_pace_remove(hpts, inp, &hpts->p_hptss[hpts->p_cur_slot], 0);
-			if ((ninp = TAILQ_FIRST(&hpts->p_hptss[hpts->p_cur_slot])) != NULL) {
+			hpts_sane_pace_remove(hpts, inp, &hpts->p_hptss[hpts->p_runningtick], 0);
+			if ((ninp = TAILQ_FIRST(&hpts->p_hptss[hpts->p_runningtick])) != NULL) {
 				/* We prefetch the next inp if possible */
 				kern_prefetch(ninp, &prefetch_ninp);
 				prefetch_ninp = 1;
 			}
 			if (inp->inp_hpts_request) {
 				/*
 				 * This guy is deferred out further in time
-				 * then our wheel had on it. Push him back
-				 * on the wheel.
+				 * than our wheel had available on it.
+				 * Push him back on the wheel or run it
+				 * depending.
 				 */
-				int32_t remaining_slots;
-
+				uint32_t maxticks, last_tick, remaining_slots;
+				
 				remaining_slots = ticks_to_run - (i + 1);
 				if (inp->inp_hpts_request > remaining_slots) {
 					/*
-					 * Keep INVARIANTS happy by clearing
-					 * the flag
+					 * How far out can we go?
 					 */
-					tcp_hpts_insert_locked(hpts, inp, inp->inp_hpts_request, cts, __LINE__, NULL, 1);
+					maxticks = max_ticks_available(hpts, hpts->p_cur_slot, &last_tick);
+					if (maxticks >= inp->inp_hpts_request) {
+						/* we can place it finally to be processed  */
+						inp->inp_hptsslot = hpts_tick(hpts->p_runningtick, inp->inp_hpts_request);
+						inp->inp_hpts_request = 0;
+					} else {
+						/* Work off some more time */
+						inp->inp_hptsslot = last_tick;
+						inp->inp_hpts_request-= maxticks;
+					}
+					hpts_sane_pace_insert(hpts, inp, &hpts->p_hptss[inp->inp_hptsslot], __LINE__, 1);
 					hpts->p_inp = NULL;
 					continue;
 				}
 				inp->inp_hpts_request = 0;
+				/* Fall through, we will do it now */
 			}
 			/*
-			 * We clear the hpts flag here after dealing with
+			 * We clear the hpts flag here after dealing with	
 			 * remaining slots. This way anyone looking with the
 			 * TCB lock will see its on the hpts until just
 			 * before we unlock.
 			 */
 			inp->inp_in_hpts = 0;
 			mtx_unlock(&hpts->p_mtx);
 			INP_WLOCK(inp);
 			if (in_pcbrele_wlocked(inp)) {
 				mtx_lock(&hpts->p_mtx);
-				if (logging_on)
-					tcp_hpts_log_it(hpts, hpts->p_inp, HPTSLOG_INP_DONE, 0, 1);
 				hpts->p_inp = NULL;
 				continue;
 			}
-			if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
-out_now:
+			if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) ||
+			    (inp->inp_flags2 & INP_FREED)) {
+			out_now:
 #ifdef INVARIANTS
 				if (mtx_owned(&hpts->p_mtx)) {
 					panic("Hpts:%p owns mtx prior-to lock line:%d",
-					    hpts, __LINE__);
+					      hpts, __LINE__);
 				}
 #endif
 				INP_WUNLOCK(inp);
 				mtx_lock(&hpts->p_mtx);
-				if (logging_on)
-					tcp_hpts_log_it(hpts, hpts->p_inp, HPTSLOG_INP_DONE, 0, 3);
 				hpts->p_inp = NULL;
 				continue;
 			}
 			tp = intotcpcb(inp);
 			if ((tp == NULL) || (tp->t_inpcb == NULL)) {
 				goto out_now;
 			}
 			if (set_cpu) {
 				/*
 				 * Setup so the next time we will move to
 				 * the right CPU. This should be a rare
 				 * event. It will sometimes happens when we
 				 * are the client side (usually not the
 				 * server). Somehow tcp_output() gets called
 				 * before the tcp_do_segment() sets the
 				 * intial state. This means the r_cpu and
 				 * r_hpts_cpu is 0. We get on the hpts, and
 				 * then tcp_input() gets called setting up
 				 * the r_cpu to the correct value. The hpts
 				 * goes off and sees the mis-match. We
 				 * simply correct it here and the CPU will
 				 * switch to the new hpts nextime the tcb
 				 * gets added to the the hpts (not this one)
 				 * :-)
 				 */
 				tcp_set_hpts(inp);
 			}
-			if (out_newts_every_tcb) {
-				struct timeval sv;
-
-				if (out_ts_percision)
-					microuptime(&sv);
-				else
-					getmicrouptime(&sv);
-				cts = tcp_tv_to_usectick(&sv);
-			}
+#ifdef VIMAGE
 			CURVNET_SET(inp->inp_vnet);
+			INP_INFO_RLOCK_ET(&V_tcbinfo, et);
+#endif
+			/* Lets do any logging that we might want to */
+			if (hpts_does_tp_logging && (tp->t_logstate != TCP_LOG_STATE_OFF)) {
+				tcp_hpts_log(hpts, tp, &tv, ticks_to_run, i);
+			}
 			/*
 			 * There is a hole here, we get the refcnt on the
 			 * inp so it will still be preserved but to make
 			 * sure we can get the INP we need to hold the p_mtx
 			 * above while we pull out the tp/inp,  as long as
 			 * fini gets the lock first we are assured of having
 			 * a sane INP we can lock and test.
 			 */
 #ifdef INVARIANTS
 			if (mtx_owned(&hpts->p_mtx)) {
 				panic("Hpts:%p owns mtx before tcp-output:%d",
-				    hpts, __LINE__);
+				      hpts, __LINE__);
 			}
 #endif
 			if (tp->t_fb_ptr != NULL) {
 				kern_prefetch(tp->t_fb_ptr, &did_prefetch);
 				did_prefetch = 1;
 			}
-			inp->inp_hpts_calls = 1;
-			if (tp->t_fb->tfb_tcp_output_wtime != NULL) {
-				error = (*tp->t_fb->tfb_tcp_output_wtime) (tp, &tv);
-			} else {
-				error = tp->t_fb->tfb_tcp_output(tp);
+			if ((inp->inp_flags2 & INP_SUPPORTS_MBUFQ) && tp->t_in_pkt) {
+				error = (*tp->t_fb->tfb_do_queued_segments)(inp->inp_socket, tp, 0);
+				if (error) {
+					/* The input killed the connection */
+					goto skip_pacing;
+				}
 			}
+			inp->inp_hpts_calls = 1;
+			error = tp->t_fb->tfb_tcp_output(tp);
+			inp->inp_hpts_calls = 0;
 			if (ninp && ninp->inp_ppcb) {
 				/*
 				 * If we have a nxt inp, see if we can
 				 * prefetch its ppcb. Note this may seem
 				 * "risky" since we have no locks (other
 				 * than the previous inp) and there no
 				 * assurance that ninp was not pulled while
 				 * we were processing inp and freed. If this
 				 * occured it could mean that either:
 				 *
 				 * a) Its NULL (which is fine we won't go
 				 * here) <or> b) Its valid (which is cool we
 				 * will prefetch it) <or> c) The inp got
 				 * freed back to the slab which was
 				 * reallocated. Then the piece of memory was
 				 * re-used and something else (not an
 				 * address) is in inp_ppcb. If that occurs
 				 * we don't crash, but take a TLB shootdown
 				 * performance hit (same as if it was NULL
 				 * and we tried to pre-fetch it).
 				 *
 				 * Considering that the likelyhood of <c> is
 				 * quite rare we will take a risk on doing
 				 * this. If performance drops after testing
 				 * we can always take this out. NB: the
 				 * kern_prefetch on amd64 actually has
 				 * protection against a bad address now via
 				 * the DMAP_() tests. This will prevent the
 				 * TLB hit, and instead if <c> occurs just
 				 * cause us to load cache with a useless
 				 * address (to us).
 				 */
 				kern_prefetch(ninp->inp_ppcb, &prefetch_tp);
 				prefetch_tp = 1;
 			}
 			INP_WUNLOCK(inp);
-			INP_UNLOCK_ASSERT(inp);
+		skip_pacing:
+#ifdef VIMAGE
+			INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
 			CURVNET_RESTORE();
+#endif
+			INP_UNLOCK_ASSERT(inp);
 #ifdef INVARIANTS
 			if (mtx_owned(&hpts->p_mtx)) {
 				panic("Hpts:%p owns mtx prior-to lock line:%d",
-				    hpts, __LINE__);
+				      hpts, __LINE__);
 			}
 #endif
 			mtx_lock(&hpts->p_mtx);
-			if (logging_on)
-				tcp_hpts_log_it(hpts, hpts->p_inp, HPTSLOG_INP_DONE, 0, 4);
 			hpts->p_inp = NULL;
 		}
 		HPTS_MTX_ASSERT(hpts);
 		hpts->p_inp = NULL;
-		hpts->p_cur_slot++;
-		if (hpts->p_cur_slot >= NUM_OF_HPTSI_SLOTS) {
-			hpts->p_cur_slot = 0;
+		hpts->p_runningtick++;
+		if (hpts->p_runningtick >= NUM_OF_HPTSI_SLOTS) {
+			hpts->p_runningtick = 0;
 		}
 	}
+#ifndef VIMAGE
+	INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
+#endif
 no_one:
 	HPTS_MTX_ASSERT(hpts);
-	hpts->p_prevtick = hpts->p_curtick;
 	hpts->p_delayed_by = 0;
 	/*
 	 * Check to see if we took an excess amount of time and need to run
 	 * more ticks (if we did not hit eno-bufs).
 	 */
-	/* Re-run any input that may be there */
-	(void)tcp_gethptstick(&tv);
-	if (!TAILQ_EMPTY(&hpts->p_input)) {
-		tcp_input_data(hpts, &tv);
-	}
 #ifdef INVARIANTS
 	if (TAILQ_EMPTY(&hpts->p_input) &&
 	    (hpts->p_on_inqueue_cnt != 0)) {
 		panic("tp:%p in_hpts input empty but cnt:%d",
-		    hpts, hpts->p_on_inqueue_cnt);
+		      hpts, hpts->p_on_inqueue_cnt);
 	}
 #endif
-	tick_now = tcp_gethptstick(&tv);
-	if (SEQ_GT(tick_now, hpts->p_prevtick)) {
-		struct timeval res;
-
-		/* Did we really spend a full tick or more in here? */
-		timersub(&tv, ctick, &res);
-		if (res.tv_sec || (res.tv_usec >= HPTS_TICKS_PER_USEC)) {
+	hpts->p_prev_slot = hpts->p_cur_slot;
+	hpts->p_lasttick = hpts->p_curtick;
+	if (loop_cnt > max_pacer_loops) {	    
+		/*
+		 * Something is seriously slow; we have
+		 * looped through processing the wheel
+		 * and by the time we cleared the
+		 * needs to run max_pacer_loops time
+		 * we still needed to run. That means
+		 * the system is hopelessly behind and
+		 * can never catch up :(
+		 *
+		 * We will just lie to this thread
+		 * and let it think p_curtick is
+		 * correct. When it next awakens
+		 * it will find itself further behind.
+		 */
+		counter_u64_add(hpts_hopelessly_behind, 1);
+		goto no_run;
+	}
+	hpts->p_curtick = tcp_gethptstick(&tv);
+	hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick);
+	if ((wrap_loop_cnt < 2) &&
+	    (hpts->p_lasttick != hpts->p_curtick)) {
+		counter_u64_add(hpts_loops, 1);
+		loop_cnt++;
+		goto again;
+	}
+no_run:
+	/*
+	 * Set flag to tell that we are done for
+	 * any slot input that happens during
+	 * input.
+	 */
+	hpts->p_wheel_complete = 1;
+	/* 
+	 * Run any input that may be there not covered
+	 * in running data.
+	 */
+	if (!TAILQ_EMPTY(&hpts->p_input)) {
+		tcp_input_data(hpts, &tv);
+		/*
+		 * Now did we spend too long running
+		 * input and need to run more ticks?
+		 */
+		KASSERT(hpts->p_prev_slot == hpts->p_cur_slot,
+			("H:%p p_prev_slot:%u not equal to p_cur_slot:%u", hpts,
+			 hpts->p_prev_slot, hpts->p_cur_slot));
+		KASSERT(hpts->p_lasttick == hpts->p_curtick,
+			("H:%p p_lasttick:%u not equal to p_curtick:%u", hpts,
+			 hpts->p_lasttick, hpts->p_curtick));
+		hpts->p_curtick = tcp_gethptstick(&tv);
+		if (hpts->p_lasttick != hpts->p_curtick) {
 			counter_u64_add(hpts_loops, 1);
-			if (logging_on) {
-				tcp_hpts_log_it(hpts, inp, HPTSLOG_TOLONG, (uint32_t) res.tv_usec, tick_now);
-			}
-			*ctick = res;
-			hpts->p_curtick = tick_now;
+			hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick);
 			goto again;
 		}
 	}
-no_run:
 	{
 		uint32_t t = 0, i, fnd = 0;
 
-		if (hpts->p_on_queue_cnt) {
-
-
+		if ((hpts->p_on_queue_cnt) && (wrap_loop_cnt < 2)) {
 			/*
 			 * Find next slot that is occupied and use that to
 			 * be the sleep time.
 			 */
-			for (i = 1, t = hpts->p_nxt_slot; i < NUM_OF_HPTSI_SLOTS; i++) {
+			for (i = 0, t = hpts_tick(hpts->p_cur_slot, 1); i < NUM_OF_HPTSI_SLOTS; i++) {
 				if (TAILQ_EMPTY(&hpts->p_hptss[t]) == 0) {
 					fnd = 1;
 					break;
 				}
 				t = (t + 1) % NUM_OF_HPTSI_SLOTS;
 			}
 			if (fnd) {
-				hpts->p_hpts_sleep_time = i;
+				hpts->p_hpts_sleep_time = min((i + 1), hpts_sleep_max);
 			} else {
-				counter_u64_add(back_tosleep, 1);
 #ifdef INVARIANTS
-				panic("Hpts:%p cnt:%d but non found", hpts, hpts->p_on_queue_cnt);
+				panic("Hpts:%p cnt:%d but none found", hpts, hpts->p_on_queue_cnt);
 #endif
+				counter_u64_add(back_tosleep, 1);
 				hpts->p_on_queue_cnt = 0;
 				goto non_found;
 			}
-			t++;
+		} else if (wrap_loop_cnt >= 2) {
+			/* Special case handling */
+			hpts->p_hpts_sleep_time = tcp_min_hptsi_time;
 		} else {
-			/* No one on the wheel sleep for all but 2 slots  */
-non_found:
-			if (hpts_sleep_max == 0)
-				hpts_sleep_max = 1;
-			hpts->p_hpts_sleep_time = min((NUM_OF_HPTSI_SLOTS - 2), hpts_sleep_max);
-			t = 0;
+			/* No one on the wheel sleep for all but 400 slots or sleep max  */
+		non_found:
+			hpts->p_hpts_sleep_time = hpts_sleep_max;
 		}
-		if (logging_on) {
-			tcp_hpts_log_it(hpts, inp, HPTSLOG_SLEEPSET, t, (hpts->p_hpts_sleep_time * HPTS_TICKS_PER_USEC));
-		}
 	}
 }
 
 void
 __tcp_set_hpts(struct inpcb *inp, int32_t line)
 {
 	struct tcp_hpts_entry *hpts;
 
 	INP_WLOCK_ASSERT(inp);
 	hpts = tcp_hpts_lock(inp);
 	if ((inp->inp_in_hpts == 0) &&
 	    (inp->inp_hpts_cpu_set == 0)) {
 		inp->inp_hpts_cpu = hpts_cpuid(inp);
 		inp->inp_hpts_cpu_set = 1;
 	}
 	mtx_unlock(&hpts->p_mtx);
 	hpts = tcp_input_lock(inp);
 	if ((inp->inp_input_cpu_set == 0) &&
 	    (inp->inp_in_input == 0)) {
 		inp->inp_input_cpu = hpts_cpuid(inp);
 		inp->inp_input_cpu_set = 1;
 	}
 	mtx_unlock(&hpts->p_mtx);
 }
 
 uint16_t
 tcp_hpts_delayedby(struct inpcb *inp){
 	return (tcp_pace.rp_ent[inp->inp_hpts_cpu]->p_delayed_by);
 }
 
 static void
 tcp_hpts_thread(void *ctx)
 {
 	struct tcp_hpts_entry *hpts;
 	struct timeval tv;
 	sbintime_t sb;
 
 	hpts = (struct tcp_hpts_entry *)ctx;
 	mtx_lock(&hpts->p_mtx);
 	if (hpts->p_direct_wake) {
 		/* Signaled by input */
-		if (logging_on)
-			tcp_hpts_log_it(hpts, NULL, HPTSLOG_AWAKE, 1, 1);
 		callout_stop(&hpts->co);
 	} else {
 		/* Timed out */
 		if (callout_pending(&hpts->co) ||
 		    !callout_active(&hpts->co)) {
-			if (logging_on)
-				tcp_hpts_log_it(hpts, NULL, HPTSLOG_AWAKE, 2, 2);
 			mtx_unlock(&hpts->p_mtx);
 			return;
 		}
 		callout_deactivate(&hpts->co);
-		if (logging_on)
-			tcp_hpts_log_it(hpts, NULL, HPTSLOG_AWAKE, 3, 3);
 	}
+	hpts->p_hpts_wake_scheduled = 0;
 	hpts->p_hpts_active = 1;
-	(void)tcp_gethptstick(&tv);
-	tcp_hptsi(hpts, &tv);
+	tcp_hptsi(hpts);
 	HPTS_MTX_ASSERT(hpts);
 	tv.tv_sec = 0;
 	tv.tv_usec = hpts->p_hpts_sleep_time * HPTS_TICKS_PER_USEC;
 	if (tcp_min_hptsi_time && (tv.tv_usec < tcp_min_hptsi_time)) {
+		hpts->overidden_sleep = tv.tv_usec;
 		tv.tv_usec = tcp_min_hptsi_time;
 		hpts->p_on_min_sleep = 1;
 	} else {
 		/* Clear the min sleep flag */
+		hpts->overidden_sleep = 0;
 		hpts->p_on_min_sleep = 0;
 	}
 	hpts->p_hpts_active = 0;
 	sb = tvtosbt(tv);
 	if (tcp_hpts_callout_skip_swi == 0) {
 		callout_reset_sbt_on(&hpts->co, sb, 0,
 		    hpts_timeout_swi, hpts, hpts->p_cpu,
 		    (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision)));
 	} else {
 		callout_reset_sbt_on(&hpts->co, sb, 0,
 		    hpts_timeout_dir, hpts,
 		    hpts->p_cpu,
 		    C_PREL(tcp_hpts_precision));
 	}
 	hpts->p_direct_wake = 0;
 	mtx_unlock(&hpts->p_mtx);
 }
 
 #undef	timersub
 
 static void
 tcp_init_hptsi(void *st)
 {
 	int32_t i, j, error, bound = 0, created = 0;
 	size_t sz, asz;
 	struct timeval tv;
 	sbintime_t sb;
 	struct tcp_hpts_entry *hpts;
 	char unit[16];
 	uint32_t ncpus = mp_ncpus ? mp_ncpus : MAXCPU;
 
 	tcp_pace.rp_proc = NULL;
 	tcp_pace.rp_num_hptss = ncpus;
+	hpts_hopelessly_behind = counter_u64_alloc(M_WAITOK);
 	hpts_loops = counter_u64_alloc(M_WAITOK);
 	back_tosleep = counter_u64_alloc(M_WAITOK);
-
+	combined_wheel_wrap = counter_u64_alloc(M_WAITOK);
+	wheel_wrap = counter_u64_alloc(M_WAITOK);
 	sz = (tcp_pace.rp_num_hptss * sizeof(struct tcp_hpts_entry *));
 	tcp_pace.rp_ent = malloc(sz, M_TCPHPTS, M_WAITOK | M_ZERO);
 	asz = sizeof(struct hptsh) * NUM_OF_HPTSI_SLOTS;
 	for (i = 0; i < tcp_pace.rp_num_hptss; i++) {
 		tcp_pace.rp_ent[i] = malloc(sizeof(struct tcp_hpts_entry),
 		    M_TCPHPTS, M_WAITOK | M_ZERO);
 		tcp_pace.rp_ent[i]->p_hptss = malloc(asz,
 		    M_TCPHPTS, M_WAITOK);
 		hpts = tcp_pace.rp_ent[i];
 		/*
 		 * Init all the hpts structures that are not specifically
 		 * zero'd by the allocations. Also lets attach them to the
 		 * appropriate sysctl block as well.
 		 */
 		mtx_init(&hpts->p_mtx, "tcp_hpts_lck",
 		    "hpts", MTX_DEF | MTX_DUPOK);
 		TAILQ_INIT(&hpts->p_input);
 		for (j = 0; j < NUM_OF_HPTSI_SLOTS; j++) {
 			TAILQ_INIT(&hpts->p_hptss[j]);
 		}
 		sysctl_ctx_init(&hpts->hpts_ctx);
 		sprintf(unit, "%d", i);
 		hpts->hpts_root = SYSCTL_ADD_NODE(&hpts->hpts_ctx,
 		    SYSCTL_STATIC_CHILDREN(_net_inet_tcp_hpts),
 		    OID_AUTO,
 		    unit,
 		    CTLFLAG_RW, 0,
 		    "");
 		SYSCTL_ADD_INT(&hpts->hpts_ctx,
 		    SYSCTL_CHILDREN(hpts->hpts_root),
 		    OID_AUTO, "in_qcnt", CTLFLAG_RD,
 		    &hpts->p_on_inqueue_cnt, 0,
 		    "Count TCB's awaiting input processing");
 		SYSCTL_ADD_INT(&hpts->hpts_ctx,
 		    SYSCTL_CHILDREN(hpts->hpts_root),
 		    OID_AUTO, "out_qcnt", CTLFLAG_RD,
 		    &hpts->p_on_queue_cnt, 0,
 		    "Count TCB's awaiting output processing");
-		SYSCTL_ADD_UINT(&hpts->hpts_ctx,
+		SYSCTL_ADD_U16(&hpts->hpts_ctx,
 		    SYSCTL_CHILDREN(hpts->hpts_root),
 		    OID_AUTO, "active", CTLFLAG_RD,
 		    &hpts->p_hpts_active, 0,
 		    "Is the hpts active");
 		SYSCTL_ADD_UINT(&hpts->hpts_ctx,
 		    SYSCTL_CHILDREN(hpts->hpts_root),
 		    OID_AUTO, "curslot", CTLFLAG_RD,
 		    &hpts->p_cur_slot, 0,
-		    "What the current slot is if active");
+		    "What the current running pacers goal");
 		SYSCTL_ADD_UINT(&hpts->hpts_ctx,
 		    SYSCTL_CHILDREN(hpts->hpts_root),
-		    OID_AUTO, "curtick", CTLFLAG_RD,
-		    &hpts->p_curtick, 0,
-		    "What the current tick on if active");
+		    OID_AUTO, "runtick", CTLFLAG_RD,
+		    &hpts->p_runningtick, 0,
+		    "What the running pacers current slot is");
 		SYSCTL_ADD_UINT(&hpts->hpts_ctx,
 		    SYSCTL_CHILDREN(hpts->hpts_root),
-		    OID_AUTO, "logsize", CTLFLAG_RD,
-		    &hpts->p_logsize, 0,
-		    "Hpts logging buffer size");
-		hpts->p_hpts_sleep_time = NUM_OF_HPTSI_SLOTS - 2;
+		    OID_AUTO, "curtick", CTLFLAG_RD,
+		    &hpts->p_curtick, 0,
+		    "What the running pacers last tick mapped to the wheel was");
+		hpts->p_hpts_sleep_time = hpts_sleep_max;
 		hpts->p_num = i;
-		hpts->p_prevtick = hpts->p_curtick = tcp_gethptstick(&tv);
-		hpts->p_prevtick -= 1;
-		hpts->p_prevtick %= NUM_OF_HPTSI_SLOTS;
+		hpts->p_curtick = tcp_gethptstick(&tv);
+		hpts->p_prev_slot = hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick);
 		hpts->p_cpu = 0xffff;
-		hpts->p_nxt_slot = 1;
-		hpts->p_logsize = tcp_hpts_logging_size;
-		if (hpts->p_logsize) {
-			sz = (sizeof(struct hpts_log) * hpts->p_logsize);
-			hpts->p_log = malloc(sz, M_TCPHPTS, M_WAITOK | M_ZERO);
-		}
+		hpts->p_nxt_slot = hpts_tick(hpts->p_cur_slot, 1);
 		callout_init(&hpts->co, 1);
 	}
 	/*
 	 * Now lets start ithreads to handle the hptss.
 	 */
 	CPU_FOREACH(i) {
 		hpts = tcp_pace.rp_ent[i];
 		hpts->p_cpu = i;
 		error = swi_add(&hpts->ie, "hpts",
 		    tcp_hpts_thread, (void *)hpts,
 		    SWI_NET, INTR_MPSAFE, &hpts->ie_cookie);
 		if (error) {
 			panic("Can't add hpts:%p i:%d err:%d",
 			    hpts, i, error);
 		}
 		created++;
 		if (tcp_bind_threads) {
 			if (intr_event_bind(hpts->ie, i) == 0)
 				bound++;
 		}
 		tv.tv_sec = 0;
 		tv.tv_usec = hpts->p_hpts_sleep_time * HPTS_TICKS_PER_USEC;
 		sb = tvtosbt(tv);
 		if (tcp_hpts_callout_skip_swi == 0) {
 			callout_reset_sbt_on(&hpts->co, sb, 0,
 			    hpts_timeout_swi, hpts, hpts->p_cpu,
 			    (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision)));
 		} else {
 			callout_reset_sbt_on(&hpts->co, sb, 0,
 			    hpts_timeout_dir, hpts,
 			    hpts->p_cpu,
 			    C_PREL(tcp_hpts_precision));
 		}
 	}
 	printf("TCP Hpts created %d swi interrupt thread and bound %d\n",
 	    created, bound);
 	return;
 }
 
 SYSINIT(tcphptsi, SI_SUB_KTHREAD_IDLE, SI_ORDER_ANY, tcp_init_hptsi, NULL);
 MODULE_VERSION(tcphpts, 1);
Index: stable/12/sys/netinet/tcp_hpts.h
===================================================================
--- stable/12/sys/netinet/tcp_hpts.h	(revision 362879)
+++ stable/12/sys/netinet/tcp_hpts.h	(revision 362880)
@@ -1,304 +1,268 @@
 /*-
  * Copyright (c) 2016-2018 Netflix, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #ifndef __tcp_hpts_h__
 #define __tcp_hpts_h__
 
 /*
  * The hpts uses a 102400 wheel. The wheel
  * defines the time in 10 usec increments (102400 x 10).
  * This gives a range of 10usec - 1024ms to place
  * an entry within. If the user requests more than
  * 1.024 second, a remaineder is attached and the hpts
  * when seeing the remainder will re-insert the
  * inpcb forward in time from where it is until
  * the remainder is zero.
  */
 
 #define NUM_OF_HPTSI_SLOTS 102400
 
 TAILQ_HEAD(hptsh, inpcb);
 
 /* Number of useconds in a hpts tick */
 #define HPTS_TICKS_PER_USEC 10
-#define HPTS_MS_TO_SLOTS(x) (x * 100)
+#define HPTS_MS_TO_SLOTS(x) ((x * 100) + 1)
 #define HPTS_USEC_TO_SLOTS(x) ((x+9) /10)
 #define HPTS_USEC_IN_SEC 1000000
 #define HPTS_MSEC_IN_SEC 1000
 #define HPTS_USEC_IN_MSEC 1000
 
-#define DEFAULT_HPTS_LOG 3072
 
-/*
- * Log flags consist of
- *  7f      7f         1            1 bits
- * p_cpu | p_num | INPUT_ACTIVE | HPTS_ACTIVE
- *
- * So for example cpu 10, number 10 would with
- * input active would show up as:
- * p_flags = 0001010 0001010 1 0
- *  <or>
- * p_flags = 0x142a
- */
-#define HPTS_HPTS_ACTIVE 0x01
-#define HPTS_INPUT_ACTIVE 0x02
-
-#define HPTSLOG_IMMEDIATE	1
-#define HPTSLOG_INSERT_NORMAL	2
-#define HPTSLOG_INSERT_SLEEPER	3
-#define HPTSLOG_SLEEP_AFTER	4
-#define HPTSLOG_SLEEP_BEFORE	5
-#define HPTSLOG_INSERTED	6
-#define HPTSLOG_WAKEUP_HPTS	7
-#define HPTSLOG_SETTORUN	8
-#define HPTSLOG_HPTSI		9
-#define HPTSLOG_TOLONG		10
-#define HPTSLOG_AWAKENS	11
-#define HPTSLOG_TIMESOUT	12
-#define HPTSLOG_SLEEPSET	13
-#define HPTSLOG_WAKEUP_INPUT	14
-#define HPTSLOG_RESCHEDULE     15
-#define HPTSLOG_AWAKE		16
-#define HPTSLOG_INP_DONE	17
-
-struct hpts_log {
-	struct inpcb *inp;
-	int32_t event;
-	uint32_t cts;
-	int32_t line;
-	uint32_t ticknow;
-	uint32_t t_paceslot;
-	uint32_t t_hptsreq;
-	uint32_t p_curtick;
-	uint32_t p_prevtick;
-	uint32_t slot_req;
-	uint32_t p_on_queue_cnt;
-	uint32_t p_nxt_slot;
-	uint32_t p_cur_slot;
-	uint32_t p_hpts_sleep_time;
-	uint16_t p_flags;
-	uint8_t p_onhpts;
-	uint8_t p_oninput;
-	uint8_t is_notempty;
-};
-
 struct hpts_diag {
-	uint32_t p_hpts_active;
-	uint32_t p_nxt_slot;
-	uint32_t p_cur_slot;
-	uint32_t slot_req;
-	uint32_t inp_hptsslot;
-	uint32_t slot_now;
-	uint32_t have_slept;
-	uint32_t hpts_sleep_time;
-	uint32_t yet_to_sleep;
-	uint32_t need_new_to;
-	int32_t co_ret;
-	uint8_t p_on_min_sleep;
+	uint32_t p_hpts_active; 	/* bbr->flex7 x */
+	uint32_t p_nxt_slot;		/* bbr->flex1 x */
+	uint32_t p_cur_slot;		/* bbr->flex2 x */
+	uint32_t p_prev_slot;		/* bbr->delivered */
+	uint32_t p_runningtick;		/* bbr->inflight */
+	uint32_t slot_req;		/* bbr->flex3 x */
+	uint32_t inp_hptsslot;		/* bbr->flex4 x */
+	uint32_t slot_remaining;	/* bbr->flex5 x */
+	uint32_t have_slept;		/* bbr->epoch x */
+	uint32_t hpts_sleep_time;	/* bbr->applimited x */
+	uint32_t yet_to_sleep;		/* bbr->lt_epoch x */
+	uint32_t need_new_to;		/* bbr->flex6 x  */
+	uint32_t wheel_tick;		/* bbr->bw_inuse x */
+	uint32_t maxticks;		/* bbr->delRate x */
+	uint32_t wheel_cts;		/* bbr->rttProp x */
+	int32_t co_ret; 		/* bbr->pkts_out x */
+	uint32_t p_curtick;		/* upper bbr->cur_del_rate */
+	uint32_t p_lasttick;		/* lower bbr->cur_del_rate */
+	uint8_t p_on_min_sleep; 	/* bbr->flex8 x */
 };
 
+/* Magic flags to tell what's cooking on the pacing wheel */
+#define PACE_TMR_DELACK 0x01	/* Delayed ack timer running */
+#define PACE_TMR_RACK   0x02	/* RACK timer running */
+#define PACE_TMR_TLP    0x04	/* TLP timer running */
+#define PACE_TMR_RXT    0x08	/* Retransmit timer running */
+#define PACE_TMR_PERSIT 0x10	/* Persist timer running */
+#define PACE_TMR_KEEP   0x20	/* Keep alive timer running */
+#define PACE_PKT_OUTPUT 0x40	/* Output Packets being paced */
+#define PACE_TMR_MASK   (PACE_TMR_KEEP|PACE_TMR_PERSIT|PACE_TMR_RXT|PACE_TMR_TLP|PACE_TMR_RACK|PACE_TMR_DELACK)
+
 #ifdef _KERNEL
 /* Each hpts has its own p_mtx which is used for locking */
 struct tcp_hpts_entry {
 	/* Cache line 0x00 */
 	struct mtx p_mtx;	/* Mutex for hpts */
-	uint32_t p_hpts_active; /* Flag that says hpts is awake  */
-	uint32_t p_curtick;	/* Current tick in 10 us the hpts is at */
-	uint32_t p_prevtick;	/* Previous tick in 10 us the hpts ran */
+	uint16_t p_hpts_active; /* Flag that says hpts is awake  */
+	uint8_t p_hpts_wake_scheduled;	/* Have we scheduled a wakeup? */
+	uint8_t p_wheel_complete; /* have we completed the wheel arc walk? */
+	uint32_t p_curtick;	/* Tick in 10 us the hpts is going to */
+	uint32_t p_runningtick; /* Current tick we are at if we are running */
+	uint32_t p_prev_slot;	/* Previous slot we were on */
 	uint32_t p_cur_slot;	/* Current slot in wheel hpts is draining */
 	uint32_t p_nxt_slot;	/* The next slot outside the current range of
 				 * slots that the hpts is running on. */
 	int32_t p_on_queue_cnt;	/* Count on queue in this hpts */
-	uint32_t enobuf_cnt;
-	uint16_t p_log_at;
+	uint32_t p_lasttick;	/* Last tick before the current one */
 	uint8_t p_direct_wake :1, /* boolean */
-		p_log_wrapped :1, /* boolean */
-		p_on_min_sleep:1; /* boolean */
-	uint8_t p_fill;
+		p_on_min_sleep:1, /* boolean */
+		p_avail:6; 
+	uint8_t p_fill[3];	  /* Fill to 32 bits */
 	/* Cache line 0x40 */
 	void *p_inp;
 	struct hptsh p_input;	/* For the tcp-input runner */
 	/* Hptsi wheel */
 	struct hptsh *p_hptss;
-	struct hpts_log *p_log;
-	uint32_t p_logsize;
 	int32_t p_on_inqueue_cnt; /* Count on input queue in this hpts */
 	uint32_t hit_no_enobuf;
 	uint32_t p_dyn_adjust;
 	uint32_t p_hpts_sleep_time;	/* Current sleep interval having a max
 					 * of 255ms */
+	uint32_t overidden_sleep;	/* what was overridden by min-sleep, for logging */
+	uint32_t saved_lasttick;	/* for logging */
+	uint32_t saved_curtick;		/* for logging */
+	uint32_t saved_curslot;		/* for logging */
+	uint32_t saved_prev_slot;       /* for logging */
 	uint32_t p_delayed_by;	/* How much were we delayed by */
 	/* Cache line 0x80 */
 	struct sysctl_ctx_list hpts_ctx;
 	struct sysctl_oid *hpts_root;
 	struct intr_event *ie;
 	void *ie_cookie;
 	uint16_t p_num;		/* The hpts number one per cpu */
 	uint16_t p_cpu;		/* The hpts CPU */
 	/* There is extra space in here */
 	/* Cache line 0x100 */
 	struct callout co __aligned(CACHE_LINE_SIZE);
 }               __aligned(CACHE_LINE_SIZE);
 
 struct tcp_hptsi {
 	struct proc *rp_proc;	/* Process structure for hpts */
 	struct tcp_hpts_entry **rp_ent;	/* Array of hptss */
 	uint32_t rp_num_hptss;	/* Number of hpts threads */
 };
 
 #endif
 
 #define HPTS_REMOVE_INPUT  0x01
 #define HPTS_REMOVE_OUTPUT 0x02
 #define HPTS_REMOVE_ALL    (HPTS_REMOVE_INPUT | HPTS_REMOVE_OUTPUT)
 
 /*
  * When using the hpts, a TCP stack must make sure
  * that once a INP_DROPPED flag is applied to a INP
  * that it does not expect tcp_output() to ever be
  * called by the hpts. The hpts will *not* call
  * any output (or input) functions on a TCB that
  * is in the DROPPED state.
  *
  * This implies final ACK's and RST's that might
  * be sent when a TCB is still around must be
  * sent from a routine like tcp_respond().
  */
 #define DEFAULT_MIN_SLEEP 250	/* How many usec's is default for hpts sleep
 				 * this determines min granularity of the
 				 * hpts. If 0, granularity is 10useconds at
 				 * the cost of more CPU (context switching). */
 #ifdef _KERNEL
 #define HPTS_MTX_ASSERT(hpts) mtx_assert(&(hpts)->p_mtx, MA_OWNED)
 struct tcp_hpts_entry *tcp_hpts_lock(struct inpcb *inp);
 struct tcp_hpts_entry *tcp_input_lock(struct inpcb *inp);
 int __tcp_queue_to_hpts_immediate(struct inpcb *inp, int32_t line);
 #define tcp_queue_to_hpts_immediate(a)__tcp_queue_to_hpts_immediate(a, __LINE__)
 
 struct tcp_hpts_entry *tcp_cur_hpts(struct inpcb *inp);
 #define tcp_hpts_remove(a, b) __tcp_hpts_remove(a, b, __LINE__)
 void __tcp_hpts_remove(struct inpcb *inp, int32_t flags, int32_t line);
 
 /*
  * To insert a TCB on the hpts you *must* be holding the
  * INP_WLOCK(). The hpts insert code will then acqurire
  * the hpts's lock and insert the TCB on the requested
  * slot possibly waking up the hpts if you are requesting
  * a time earlier than what the hpts is sleeping to (if
  * the hpts is sleeping). You may check the inp->inp_in_hpts
  * flag without the hpts lock. The hpts is the only one
  * that will clear this flag holding only the hpts lock. This
  * means that in your tcp_output() routine when you test for
  * it to be 1 (so you wont call output) it may be transitioning
  * to 0 (by the hpts). That will be fine since that will just
  * mean an extra call to tcp_output that most likely will find
  * the call you executed (when the mis-match occured) will have
  * put the TCB back on the hpts and it will return. If your
  * call did not add it back to the hpts then you will either
  * over-send or the cwnd will block you from sending more.
  *
  * Note you should also be holding the INP_WLOCK() when you
  * call the remove from the hpts as well. Thoug usually
  * you are either doing this from a timer, where you need
  * that INP_WLOCK() or from destroying your TCB where again
  * you should already have the INP_WLOCK().
  */
 uint32_t __tcp_hpts_insert(struct inpcb *inp, uint32_t slot, int32_t line);
 #define tcp_hpts_insert(a, b) __tcp_hpts_insert(a, b, __LINE__)
 
 uint32_t
 tcp_hpts_insert_diag(struct inpcb *inp, uint32_t slot, int32_t line, struct hpts_diag *diag);
 
 int
     __tcp_queue_to_input_locked(struct inpcb *inp, struct tcp_hpts_entry *hpts, int32_t line);
 #define tcp_queue_to_input_locked(a, b) __tcp_queue_to_input_locked(a, b, __LINE__);
-void
-tcp_queue_pkt_to_input(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
-    int32_t tlen, int32_t drop_hdrlen, uint8_t iptos);
 int
-__tcp_queue_to_input(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
-    int32_t tlen, int32_t drop_hdrlen, uint8_t iptos, int32_t line);
-#define tcp_queue_to_input(a, b, c, d, e, f, g) __tcp_queue_to_input(a, b, c, d, e, f, g, __LINE__)
+__tcp_queue_to_input(struct inpcb *inp, int32_t line);
+#define tcp_queue_to_input(a) __tcp_queue_to_input(a, __LINE__)
 
 uint16_t tcp_hpts_delayedby(struct inpcb *inp);
 
 void __tcp_set_hpts(struct inpcb *inp, int32_t line);
 #define tcp_set_hpts(a) __tcp_set_hpts(a, __LINE__)
 
 void __tcp_set_inp_to_drop(struct inpcb *inp, uint16_t reason, int32_t line);
 #define tcp_set_inp_to_drop(a, b) __tcp_set_inp_to_drop(a, b, __LINE__)
 
 extern int32_t tcp_min_hptsi_time;
 
 static __inline uint32_t
 tcp_tv_to_hptstick(struct timeval *sv)
 {
 	return ((sv->tv_sec * 100000) + (sv->tv_usec / 10));
 }
 
 static __inline uint32_t
 tcp_gethptstick(struct timeval *sv)
 {
 	struct timeval tv;
 
 	if (sv == NULL)
 		sv = &tv;
 	microuptime(sv);
 	return (tcp_tv_to_hptstick(sv));
 }
 
 static __inline uint32_t
 tcp_tv_to_usectick(struct timeval *sv)
 {
 	return ((uint32_t) ((sv->tv_sec * HPTS_USEC_IN_SEC) + sv->tv_usec));
 }
 
 static __inline uint32_t
 tcp_tv_to_mssectick(struct timeval *sv)
 {
 	return ((uint32_t) ((sv->tv_sec * HPTS_MSEC_IN_SEC) + (sv->tv_usec/HPTS_USEC_IN_MSEC)));
 }
 
 static __inline void
 tcp_hpts_unlock(struct tcp_hpts_entry *hpts)
 {
 	mtx_unlock(&hpts->p_mtx);
 }
 
 static __inline uint32_t
 tcp_get_usecs(struct timeval *tv)
 {
 	struct timeval tvd;
 
 	if (tv == NULL)
 		tv = &tvd;
 	microuptime(tv);
 	return (tcp_tv_to_usectick(tv));
 }
 
 #endif /* _KERNEL */
 #endif /* __tcp_hpts_h__ */
Index: stable/12/sys/netinet/tcp_log_buf.h
===================================================================
--- stable/12/sys/netinet/tcp_log_buf.h	(revision 362879)
+++ stable/12/sys/netinet/tcp_log_buf.h	(revision 362880)
@@ -1,369 +1,376 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2016-2018 Netflix, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #ifndef __tcp_log_buf_h__
 #define __tcp_log_buf_h__
 
 #define	TCP_LOG_REASON_LEN	32
 #define	TCP_LOG_BUF_VER		(6)
 
 /*
  * Because the (struct tcp_log_buffer) includes 8-byte uint64_t's, it requires
  * 8-byte alignment to work properly on all platforms. Therefore, we will
  * enforce 8-byte alignment for all the structures that may appear by
  * themselves (instead of being embedded in another structure) in a data
  * stream.
  */
 #define	ALIGN_TCP_LOG		__aligned(8)
 
 /* Information about the socketbuffer state. */
 struct tcp_log_sockbuf
 {
 	uint32_t	tls_sb_acc;	/* available chars (sb->sb_acc) */
 	uint32_t	tls_sb_ccc;	/* claimed chars (sb->sb_ccc) */
 	uint32_t	tls_sb_spare;	/* spare */
 };
 
 /* Optional, verbose information that may be appended to an event log. */
 struct tcp_log_verbose
 {
 #define	TCP_FUNC_LEN	32
 	char		tlv_snd_frm[TCP_FUNC_LEN]; /* tcp_output() caller */
 	char		tlv_trace_func[TCP_FUNC_LEN]; /* Function that
 							 generated trace */
 	uint32_t	tlv_trace_line;	/* Line number that generated trace */
 	uint8_t		_pad[4];
 } ALIGN_TCP_LOG;
 
 /* Internal RACK state variables. */
 struct tcp_log_rack
 {
 	uint32_t	tlr_rack_rtt;		/* rc_rack_rtt */
 	uint8_t		tlr_state;		/* Internal RACK state */
 	uint8_t		_pad[3];		/* Padding */
 };
 
 struct tcp_log_bbr {
 	uint64_t cur_del_rate;
 	uint64_t delRate;
 	uint64_t rttProp;
 	uint64_t bw_inuse;
 	uint32_t inflight;
 	uint32_t applimited;
 	uint32_t delivered;
 	uint32_t timeStamp;
 	uint32_t epoch;
 	uint32_t lt_epoch;
 	uint32_t pkts_out;
 	uint32_t flex1;
 	uint32_t flex2;
 	uint32_t flex3;
 	uint32_t flex4;
 	uint32_t flex5;
 	uint32_t flex6;
 	uint32_t lost;
 	uint16_t pacing_gain;
 	uint16_t cwnd_gain;
 	uint16_t flex7;
 	uint8_t bbr_state;
 	uint8_t bbr_substate;
 	uint8_t inhpts;
 	uint8_t ininput;
 	uint8_t use_lt_bw;
 	uint8_t flex8;
 	uint32_t pkt_epoch;
 };
 
 /* Per-stack stack-specific info. */
 union tcp_log_stackspecific
 {
 	struct tcp_log_rack u_rack;
 	struct tcp_log_bbr u_bbr;
 };
 
 struct tcp_log_buffer
 {
 	/* Event basics */
 	struct timeval	tlb_tv;		/* Timestamp of trace */
 	uint32_t	tlb_ticks;	/* Timestamp of trace */
 	uint32_t	tlb_sn;		/* Serial number */
 	uint8_t		tlb_stackid;	/* Stack ID */
 	uint8_t		tlb_eventid;	/* Event ID */
 	uint16_t	tlb_eventflags;	/* Flags for the record */
 #define	TLB_FLAG_RXBUF		0x0001	/* Includes receive buffer info */
 #define	TLB_FLAG_TXBUF		0x0002	/* Includes send buffer info */
 #define	TLB_FLAG_HDR		0x0004	/* Includes a TCP header */
 #define	TLB_FLAG_VERBOSE	0x0008	/* Includes function/line numbers */
 #define	TLB_FLAG_STACKINFO	0x0010	/* Includes stack-specific info */
 	int		tlb_errno;	/* Event error (if any) */
 
 	/* Internal session state */
 	struct tcp_log_sockbuf tlb_rxbuf; /* Receive buffer */
 	struct tcp_log_sockbuf tlb_txbuf; /* Send buffer */
 
 	int		tlb_state;	/* TCPCB t_state */
 	uint32_t	tlb_starttime;	/* TCPCB t_starttime */
 	uint32_t	tlb_iss;		/* TCPCB iss */
 	uint32_t	tlb_flags;	/* TCPCB flags */
 	uint32_t	tlb_snd_una;	/* TCPCB snd_una */
 	uint32_t	tlb_snd_max;	/* TCPCB snd_max */
 	uint32_t	tlb_snd_cwnd;	/* TCPCB snd_cwnd */
 	uint32_t	tlb_snd_nxt;	/* TCPCB snd_nxt */
 	uint32_t	tlb_snd_recover;/* TCPCB snd_recover */
 	uint32_t	tlb_snd_wnd;	/* TCPCB snd_wnd */
 	uint32_t	tlb_snd_ssthresh; /* TCPCB snd_ssthresh */
 	uint32_t	tlb_srtt;	/* TCPCB t_srtt */
 	uint32_t	tlb_rttvar;	/* TCPCB t_rttvar */
 	uint32_t	tlb_rcv_up;	/* TCPCB rcv_up */
 	uint32_t	tlb_rcv_adv;	/* TCPCB rcv_adv */
 	uint32_t	tlb_rcv_nxt;	/* TCPCB rcv_nxt */
 	tcp_seq		tlb_sack_newdata; /* TCPCB sack_newdata */
 	uint32_t       	tlb_rcv_wnd;	/* TCPCB rcv_wnd */
 	uint32_t	tlb_dupacks;	/* TCPCB t_dupacks */
 	int		tlb_segqlen;	/* TCPCB segqlen */
 	int		tlb_snd_numholes; /* TCPCB snd_numholes */
 	uint32_t 	tlb_flex1; /* Event specific information */
 	uint32_t 	tlb_flex2; /* Event specific information */
 	uint8_t		tlb_snd_scale:4, /* TCPCB snd_scale */
 			tlb_rcv_scale:4; /* TCPCB rcv_scale */
 	uint8_t		_pad[3];	/* Padding */
 
 	/* Per-stack info */
 	union tcp_log_stackspecific tlb_stackinfo;
 #define	tlb_rack	tlb_stackinfo.u_rack
 
 	/* The packet */
 	uint32_t	tlb_len;	/* The packet's data length */
 	struct tcphdr	tlb_th;		/* The TCP header */
 	uint8_t		tlb_opts[TCP_MAXOLEN]; /* The TCP options */
 
 	/* Verbose information (optional) */
 	struct tcp_log_verbose tlb_verbose[0];
 } ALIGN_TCP_LOG;
 
 enum tcp_log_events {
 	TCP_LOG_IN = 1,	/* Incoming packet                 1 */
 	TCP_LOG_OUT,	/* Transmit (without other event)  2 */
 	TCP_LOG_RTO,	/* Retransmit timeout              3 */
 	TCP_LOG_TF_ACK,	/* Transmit due to TF_ACK          4 */
 	TCP_LOG_BAD_RETRAN, /* Detected bad retransmission 5 */
 	TCP_LOG_PRR,	/* Doing PRR                       6 */
 	TCP_LOG_REORDER,/* Detected reorder                7 */
-	TCP_LOG_PACER,	/* Pacer sending a packet          8 */
+	TCP_LOG_HPTS,	/* Hpts sending a packet          8 */
 	BBR_LOG_BBRUPD,		/* We updated BBR info     9 */
 	BBR_LOG_BBRSND,		/* We did a slot calculation and sending is done 10 */
 	BBR_LOG_ACKCLEAR,	/* A ack clears all outstanding     11 */
 	BBR_LOG_INQUEUE,	/* The tcb had a packet input to it 12 */
 	BBR_LOG_TIMERSTAR,	/* Start a timer                    13 */
 	BBR_LOG_TIMERCANC,	/* Cancel a timer                   14 */
 	BBR_LOG_ENTREC,		/* Entered recovery                 15 */
 	BBR_LOG_EXITREC,	/* Exited recovery                  16 */
 	BBR_LOG_CWND,		/* Cwnd change                      17 */
 	BBR_LOG_BWSAMP,		/* LT B/W sample has been made      18 */
 	BBR_LOG_MSGSIZE,	/* We received a EMSGSIZE error     19 */
 	BBR_LOG_BBRRTT,		/* BBR RTT is updated               20 */
 	BBR_LOG_JUSTRET,	/* We just returned out of output   21 */
 	BBR_LOG_STATE,		/* A BBR state change occured       22 */
 	BBR_LOG_PKT_EPOCH,      /* A BBR packet epoch occured       23 */
 	BBR_LOG_PERSIST,        /* BBR changed to/from a persists   24 */
 	TCP_LOG_FLOWEND,        /* End of a flow                    25 */
 	BBR_LOG_RTO,            /* BBR's timeout includes BBR info  26 */
-	BBR_LOG_DOSEG_DONE,     /* pacer do_segment completes       27 */
-	BBR_LOG_EXIT_GAIN,      /* pacer do_segment completes       28 */
+	BBR_LOG_DOSEG_DONE,     /* hpts do_segment completes       27 */
+	BBR_LOG_EXIT_GAIN,      /* hpts do_segment completes       28 */
 	BBR_LOG_THRESH_CALC,    /* Doing threshold calculation      29 */
 	BBR_LOG_EXTRACWNDGAIN,	/* Removed                          30 */
 	TCP_LOG_USERSEND, 	/* User level sends data            31 */
-	UNUSED_32,	 	/* Unused                           32 */
-	UNUSED_33, 		/* Unused                           33 */
+	BBR_RSM_CLEARED,	/* RSM cleared of ACK flags         32 */
+	BBR_LOG_STATE_TARGET, 	/* Log of target at state           33 */
 	BBR_LOG_TIME_EPOCH, 	/* A timed based Epoch occured      34 */
 	BBR_LOG_TO_PROCESS,	/* A to was processed               35 */
 	BBR_LOG_BBRTSO, 	/* TSO update	                    36 */
-	BBR_LOG_PACERDIAG,	/* Pacer diag insert                37 */
+	BBR_LOG_HPTSDIAG,	/* Hpts diag insert                37 */
 	BBR_LOG_LOWGAIN,	/* Low gain accounting              38 */
 	BBR_LOG_PROGRESS,	/* Progress timer event             39 */
 	TCP_LOG_SOCKET_OPT,	/* A socket option is set	    40 */
 	BBR_LOG_TIMERPREP,	/* A BBR var to debug out TLP issues  41 */
 	BBR_LOG_ENOBUF_JMP,	/* We had a enobuf jump 42 */
-	BBR_LOG_PACING_CALC,	/* calc the pacing time 43 */
+	BBR_LOG_HPTSI_CALC,	/* calc the hptsi time 43 */
 	BBR_LOG_RTT_SHRINKS,	/* We had a log reduction of rttProp 44 */
 	BBR_LOG_BW_RED_EV,	/* B/W reduction events 45 */
 	BBR_LOG_REDUCE,		/* old bbr log reduce for 4.1 and earlier 46*/
 	TCP_LOG_RTT,		/* A rtt (in useconds) is being sampled and applied to the srtt algo 47 */
 	BBR_LOG_SETTINGS_CHG,   /* Settings changed for loss response 48 */
-	BBR_LOG_SRTT_GAIN_EVENT, /* SRTT gaining 49 */
+	BBR_LOG_SRTT_GAIN_EVENT, /* SRTT gaining -- now not used 49 */
 	TCP_LOG_REASS,		/* Reassembly buffer logging 50 */
-	TCP_LOG_END		/* End (keep at end)	            51 */
+	TCP_HDWR_TLS,		/* TCP Hardware TLS logs 51 */
+	BBR_LOG_HDWR_PACE,	/* TCP Hardware pacing log 52 */
+	BBR_LOG_TSTMP_VAL,	/* Temp debug timestamp validation 53 */
+	TCP_LOG_CONNEND,	/* End of connection 54 */
+	TCP_LOG_LRO,		/* LRO entry 55 */
+	TCP_SACK_FILTER_RES,	/* Results of SACK Filter 56 */
+	TCP_SAD_DETECTION,	/* Sack Attack Detection 57 */
+	TCP_LOG_END		/* End (keep at end)	   58 */
 };
 
 enum tcp_log_states {
 	TCP_LOG_STATE_CLEAR = -1,	/* Deactivate and clear tracing */
 	TCP_LOG_STATE_OFF = 0,		/* Pause */
 	TCP_LOG_STATE_TAIL=1,		/* Keep the trailing events */
 	TCP_LOG_STATE_HEAD=2,		/* Keep the leading events */
 	TCP_LOG_STATE_HEAD_AUTO=3,	/* Keep the leading events, and
 					   automatically dump them to the
 					   device  */
 	TCP_LOG_STATE_CONTINUAL=4,	/* Continually dump the data when full */
 	TCP_LOG_STATE_TAIL_AUTO=5,	/* Keep the trailing events, and
 					   automatically dump them when the
 					   session ends */
 };
 
 /* Use this if we don't know whether the operation succeeded. */
 #define	ERRNO_UNK	(-1)
 
 /*
  * If the user included dev/tcp_log/tcp_log_dev.h, then include our private
  * headers. Otherwise, there is no reason to pollute all the files with an
  * additional include.
  *
  * This structure is aligned to an 8-byte boundary to match the alignment
  * requirements of (struct tcp_log_buffer).
  */
 #ifdef __tcp_log_dev_h__
 struct tcp_log_header {
 	struct tcp_log_common_header tlh_common;
 #define	tlh_version	tlh_common.tlch_version
 #define	tlh_type	tlh_common.tlch_type
 #define	tlh_length	tlh_common.tlch_length
 	struct in_endpoints	tlh_ie;
 	struct timeval		tlh_offset;	/* Uptime -> UTC offset */
 	char			tlh_id[TCP_LOG_ID_LEN];
 	char			tlh_reason[TCP_LOG_REASON_LEN];
 	uint8_t		tlh_af;
 	uint8_t		_pad[7];
 } ALIGN_TCP_LOG;
 
 #ifdef _KERNEL
 struct tcp_log_dev_log_queue {
 	struct tcp_log_dev_queue tldl_common;
 	char			tldl_id[TCP_LOG_ID_LEN];
 	char			tldl_reason[TCP_LOG_REASON_LEN];
 	struct in_endpoints	tldl_ie;
 	struct tcp_log_stailq	tldl_entries;
 	int			tldl_count;
 	uint8_t			tldl_af;
 };
 #endif /* _KERNEL */
 #endif /* __tcp_log_dev_h__ */
 
 #ifdef _KERNEL
 
-#define	TCP_LOG_BUF_DEFAULT_SESSION_LIMIT	10000
-#define	TCP_LOG_BUF_DEFAULT_GLOBAL_LIMIT	1000000
+#define	TCP_LOG_BUF_DEFAULT_SESSION_LIMIT	5000
+#define	TCP_LOG_BUF_DEFAULT_GLOBAL_LIMIT	5000000
 
 /*
  * TCP_LOG_EVENT_VERBOSE: The same as TCP_LOG_EVENT, except it always
  * tries to record verbose information.
  */
 #define	TCP_LOG_EVENT_VERBOSE(tp, th, rxbuf, txbuf, eventid, errornum, len, stackinfo, th_hostorder, tv) \
 	do {								\
 		if (tp->t_logstate != TCP_LOG_STATE_OFF)		\
 			tcp_log_event_(tp, th, rxbuf, txbuf, eventid,	\
 	 	        errornum, len, stackinfo, th_hostorder,		\
 		        tp->t_output_caller, __func__, __LINE__, tv);	\
 	} while (0)
 
 /*
  * TCP_LOG_EVENT: This is a macro so we can capture function/line
  * information when needed.
  *
  * Prototype:
  * TCP_LOG_EVENT(struct tcpcb *tp, struct tcphdr *th, struct sockbuf *rxbuf, 
  *     struct sockbuf *txbuf, uint8_t eventid, int errornum,
  *     union tcp_log_stackspecific *stackinfo)
  *
  * tp is mandatory and must be write locked.
  * th is optional; if present, it will appear in the record.
  * rxbuf and txbuf are optional; if present, they will appear in the record.
  * eventid is mandatory.
  * errornum is mandatory (it indicates the success or failure of the
  *     operation associated with the event).
  * len indicates the length of the packet. If no packet, use 0.
  * stackinfo is optional; if present, it will appear in the record.
  */
 #ifdef TCP_LOG_FORCEVERBOSE
 #define	TCP_LOG_EVENT	TCP_LOG_EVENT_VERBOSE
 #else
 #define	TCP_LOG_EVENT(tp, th, rxbuf, txbuf, eventid, errornum, len, stackinfo, th_hostorder) \
 	do {								\
 		if (tcp_log_verbose)					\
 			TCP_LOG_EVENT_VERBOSE(tp, th, rxbuf, txbuf,	\
 			    eventid, errornum, len, stackinfo,		\
 			    th_hostorder, NULL);				\
 		else if (tp->t_logstate != TCP_LOG_STATE_OFF)		\
 			tcp_log_event_(tp, th, rxbuf, txbuf, eventid,	\
 			    errornum, len, stackinfo, th_hostorder,	\
 			    NULL, NULL, 0, NULL);				\
 	} while (0)
 #endif /* TCP_LOG_FORCEVERBOSE */
 #define	TCP_LOG_EVENTP(tp, th, rxbuf, txbuf, eventid, errornum, len, stackinfo, th_hostorder, tv) \
 	do {								\
 		if (tp->t_logstate != TCP_LOG_STATE_OFF)		\
 			tcp_log_event_(tp, th, rxbuf, txbuf, eventid,	\
 			    errornum, len, stackinfo, th_hostorder,	\
 			    NULL, NULL, 0, tv);				\
 	} while (0)
 
 
 #ifdef TCP_BLACKBOX
 extern bool tcp_log_verbose;
 void tcp_log_drain(struct tcpcb *tp);
 int tcp_log_dump_tp_logbuf(struct tcpcb *tp, char *reason, int how, bool force);
 void tcp_log_dump_tp_bucket_logbufs(struct tcpcb *tp, char *reason);
 struct tcp_log_buffer *tcp_log_event_(struct tcpcb *tp, struct tcphdr *th, struct sockbuf *rxbuf,
     struct sockbuf *txbuf, uint8_t eventid, int errornum, uint32_t len,
     union tcp_log_stackspecific *stackinfo, int th_hostorder,
     const char *output_caller, const char *func, int line, const struct timeval *tv);
 size_t tcp_log_get_id(struct tcpcb *tp, char *buf);
 u_int tcp_log_get_id_cnt(struct tcpcb *tp);
 int tcp_log_getlogbuf(struct sockopt *sopt, struct tcpcb *tp);
 void tcp_log_init(void);
 int tcp_log_set_id(struct tcpcb *tp, char *id);
 int tcp_log_state_change(struct tcpcb *tp, int state);
 void tcp_log_tcpcbinit(struct tcpcb *tp);
 void tcp_log_tcpcbfini(struct tcpcb *tp);
 void tcp_log_flowend(struct tcpcb *tp);
 #else /* !TCP_BLACKBOX */
 #define tcp_log_verbose	(false)
 
 static inline struct tcp_log_buffer *
 tcp_log_event_(struct tcpcb *tp, struct tcphdr *th, struct sockbuf *rxbuf,
     struct sockbuf *txbuf, uint8_t eventid, int errornum, uint32_t len,
     union tcp_log_stackspecific *stackinfo, int th_hostorder,
     const char *output_caller, const char *func, int line,
     const struct timeval *tv)
 {
 
 	return (NULL);
 }
 #endif /* TCP_BLACKBOX */
 
 #endif	/* _KERNEL */
 #endif	/* __tcp_log_buf_h__ */
Index: stable/12/sys/netinet/tcp_stacks/rack.c
===================================================================
--- stable/12/sys/netinet/tcp_stacks/rack.c	(revision 362879)
+++ stable/12/sys/netinet/tcp_stacks/rack.c	(revision 362880)
@@ -1,9311 +1,9288 @@
 /*-
  * Copyright (c) 2016-2018 Netflix, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_ipsec.h"
 #include "opt_tcpdebug.h"
 
 #include <sys/param.h>
 #include <sys/module.h>
 #include <sys/kernel.h>
 #ifdef TCP_HHOOK
 #include <sys/hhook.h>
 #endif
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/mbuf.h>
 #include <sys/proc.h>		/* for proc0 declaration */
+#ifdef NETFLIX_STATS
+#include <sys/qmath.h>
+#endif
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sysctl.h>
 #include <sys/systm.h>
+#include <sys/tree.h>
 #ifdef NETFLIX_STATS
-#include <sys/stats.h>
+#include <sys/stats.h> /* Must come after qmath.h and tree.h */
 #endif
 #include <sys/refcount.h>
 #include <sys/queue.h>
 #include <sys/smp.h>
 #include <sys/kthread.h>
 #include <sys/kern_prefetch.h>
 
 #include <vm/uma.h>
 
 #include <net/route.h>
 #include <net/vnet.h>
 
 #define TCPSTATES		/* for logging */
 
 #include <netinet/in.h>
 #include <netinet/in_kdtrace.h>
 #include <netinet/in_pcb.h>
 #include <netinet/ip.h>
 #include <netinet/ip_icmp.h>	/* required for icmp_var.h */
 #include <netinet/icmp_var.h>	/* for ICMP_BANDLIM */
 #include <netinet/ip_var.h>
 #include <netinet/ip6.h>
 #include <netinet6/in6_pcb.h>
 #include <netinet6/ip6_var.h>
-#include <netinet/tcp.h>
 #define	TCPOUTFLAGS
+#include <netinet/tcp.h>
 #include <netinet/tcp_fsm.h>
 #include <netinet/tcp_log_buf.h>
 #include <netinet/tcp_seq.h>
 #include <netinet/tcp_timer.h>
 #include <netinet/tcp_var.h>
 #include <netinet/tcp_hpts.h>
 #include <netinet/tcpip.h>
 #include <netinet/cc/cc.h>
-#ifdef NETFLIX_CWV
-#include <netinet/tcp_newcwv.h>
-#endif
 #include <netinet/tcp_fastopen.h>
 #ifdef TCPDEBUG
 #include <netinet/tcp_debug.h>
 #endif				/* TCPDEBUG */
 #ifdef TCP_OFFLOAD
 #include <netinet/tcp_offload.h>
 #endif
 #ifdef INET6
 #include <netinet6/tcp6_var.h>
 #endif
 
 #include <netipsec/ipsec_support.h>
 
 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
 #include <netipsec/ipsec.h>
 #include <netipsec/ipsec6.h>
 #endif				/* IPSEC */
 
 #include <netinet/udp.h>
 #include <netinet/udp_var.h>
 #include <machine/in_cksum.h>
 
 #ifdef MAC
 #include <security/mac/mac_framework.h>
 #endif
 #include "sack_filter.h"
 #include "tcp_rack.h"
 #include "rack_bbr_common.h"
 
 uma_zone_t rack_zone;
 uma_zone_t rack_pcb_zone;
 
 #ifndef TICKS2SBT
 #define	TICKS2SBT(__t)	(tick_sbt * ((sbintime_t)(__t)))
 #endif
 
 struct sysctl_ctx_list rack_sysctl_ctx;
 struct sysctl_oid *rack_sysctl_root;
 
+#ifndef TCPHPTS
+#error "missing option TCPHPTS in the build"	/* stop here with a clear diagnostic */
+#endif /* !TCPHPTS */
+
 #define CUM_ACKED 1
 #define SACKED 2
 
 /*
  * The RACK module incorporates a number of
  * TCP ideas that have been put out into the IETF
  * over the last few years:
  * - Matt Mathis's Rate Halving which slowly drops
  *    the congestion window so that the ack clock can
  *    be maintained during a recovery.
  * - Yuchung Cheng's RACK TCP (for which its named) that
  *    will stop us using the number of dup acks and instead
  *    use time as the gage of when we retransmit.
  * - Reorder Detection of RFC4737 and the Tail-Loss probe draft
  *    of Dukkipati et.al.
  * RACK depends on SACK, so if an endpoint arrives that
  * cannot do SACK the state machine below will shuttle the
  * connection back to using the "default" TCP stack that is
  * in FreeBSD.
  *
  * To implement RACK the original TCP stack was first decomposed
  * into a functional state machine with individual states
  * for each of the possible TCP connection states. The do_segement
  * functions role in life is to mandate the connection supports SACK
  * initially and then assure that the RACK state matches the conenction
  * state before calling the states do_segment function. Each
  * state is simplified due to the fact that the original do_segment
  * has been decomposed and we *know* what state we are in (no
  * switches on the state) and all tests for SACK are gone. This
  * greatly simplifies what each state does.
  *
  * TCP output is also over-written with a new version since it
  * must maintain the new rack scoreboard.
  *
  */
 static int32_t rack_precache = 1;
 static int32_t rack_tlp_thresh = 1;
 static int32_t rack_reorder_thresh = 2;
 static int32_t rack_reorder_fade = 60000;	/* 0 - never fade, def 60,000
 						 * - 60 seconds */
 static int32_t rack_pkt_delay = 1;
 static int32_t rack_inc_var = 0;/* For TLP */
 static int32_t rack_reduce_largest_on_idle = 0;
 static int32_t rack_min_pace_time = 0;
 static int32_t rack_min_pace_time_seg_req=6;
 static int32_t rack_early_recovery = 1;
 static int32_t rack_early_recovery_max_seg = 6;
 static int32_t rack_send_a_lot_in_prr = 1;
 static int32_t rack_min_to = 1;	/* Number of ms minimum timeout */
 static int32_t rack_tlp_in_recovery = 1;	/* Can we do TLP in recovery? */
 static int32_t rack_verbose_logging = 0;
 static int32_t rack_ignore_data_after_close = 1;
+static int32_t rack_map_entries_limit = 1024;
+static int32_t rack_map_split_limit = 256;
+
 /*
  * Currently regular tcp has a rto_min of 30ms
  * the backoff goes 12 times so that ends up
  * being a total of 122.850 seconds before a
  * connection is killed.
  */
 static int32_t rack_tlp_min = 10;
 static int32_t rack_rto_min = 30;	/* 30ms same as main freebsd */
 static int32_t rack_rto_max = 30000;	/* 30 seconds */
 static const int32_t rack_free_cache = 2;
 static int32_t rack_hptsi_segments = 40;
 static int32_t rack_rate_sample_method = USE_RTT_LOW;
 static int32_t rack_pace_every_seg = 1;
 static int32_t rack_delayed_ack_time = 200;	/* 200ms */
 static int32_t rack_slot_reduction = 4;
 static int32_t rack_lower_cwnd_at_tlp = 0;
 static int32_t rack_use_proportional_reduce = 0;
 static int32_t rack_proportional_rate = 10;
 static int32_t rack_tlp_max_resend = 2;
 static int32_t rack_limited_retran = 0;
 static int32_t rack_always_send_oldest = 0;
 static int32_t rack_sack_block_limit = 128;
 static int32_t rack_use_sack_filter = 1;
 static int32_t rack_tlp_threshold_use = TLP_USE_TWO_ONE;
-static uint32_t rack_map_split_limit = 0;	/* unlimited by default */
 
 /* Rack specific counters */
 counter_u64_t rack_badfr;
 counter_u64_t rack_badfr_bytes;
 counter_u64_t rack_rtm_prr_retran;
 counter_u64_t rack_rtm_prr_newdata;
 counter_u64_t rack_timestamp_mismatch;
 counter_u64_t rack_reorder_seen;
 counter_u64_t rack_paced_segments;
 counter_u64_t rack_unpaced_segments;
 counter_u64_t rack_saw_enobuf;
 counter_u64_t rack_saw_enetunreach;
 
 /* Tail loss probe counters */
 counter_u64_t rack_tlp_tot;
 counter_u64_t rack_tlp_newdata;
 counter_u64_t rack_tlp_retran;
 counter_u64_t rack_tlp_retran_bytes;
 counter_u64_t rack_tlp_retran_fail;
 counter_u64_t rack_to_tot;
 counter_u64_t rack_to_arm_rack;
 counter_u64_t rack_to_arm_tlp;
 counter_u64_t rack_to_alloc;
 counter_u64_t rack_to_alloc_hard;
 counter_u64_t rack_to_alloc_emerg;
+counter_u64_t rack_to_alloc_limited;
 counter_u64_t rack_alloc_limited_conns;
 counter_u64_t rack_split_limited;
 
 counter_u64_t rack_sack_proc_all;
 counter_u64_t rack_sack_proc_short;
 counter_u64_t rack_sack_proc_restart;
 counter_u64_t rack_runt_sacks;
 counter_u64_t rack_used_tlpmethod;
 counter_u64_t rack_used_tlpmethod2;
 counter_u64_t rack_enter_tlp_calc;
 counter_u64_t rack_input_idle_reduces;
 counter_u64_t rack_tlp_does_nada;
 
 /* Temp CPU counters */
 counter_u64_t rack_find_high;
 
 counter_u64_t rack_progress_drops;
 counter_u64_t rack_out_size[TCP_MSS_ACCT_SIZE];
 counter_u64_t rack_opts_arry[RACK_OPTS_SIZE];
 
+/*
+ * This was originally defined in tcp_timer.c, but is now reproduced here given
+ * the unification of the SYN and non-SYN retransmit timer exponents combined
+ * with wanting to retain previous behaviour for previously deployed stack
+ * versions.
+ */
+int	tcp_syn_backoff[TCP_MAXRXTSHIFT + 1] =
+    { 1, 1, 1, 1, 1, 2, 4, 8, 16, 32, 64, 64, 64 };
+
 static void
 rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick,  int event, int line);
 
 static int
 rack_process_ack(struct mbuf *m, struct tcphdr *th,
     struct socket *so, struct tcpcb *tp, struct tcpopt *to,
     uint32_t tiwin, int32_t tlen, int32_t * ofia, int32_t thflags, int32_t * ret_val);
 static int
 rack_process_data(struct mbuf *m, struct tcphdr *th,
     struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen,
     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
 static void
 rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack,
     struct tcphdr *th, uint16_t nsegs, uint16_t type, int32_t recovery);
 static struct rack_sendmap *rack_alloc(struct tcp_rack *rack);
 static struct rack_sendmap *rack_alloc_limit(struct tcp_rack *rack,
     uint8_t limit_type);
 static struct rack_sendmap *
 rack_check_recovery_mode(struct tcpcb *tp,
     uint32_t tsused);
 static void
 rack_cong_signal(struct tcpcb *tp, struct tcphdr *th,
     uint32_t type);
 static void rack_counter_destroy(void);
 static int
 rack_ctloutput(struct socket *so, struct sockopt *sopt,
     struct inpcb *inp, struct tcpcb *tp);
 static int32_t rack_ctor(void *mem, int32_t size, void *arg, int32_t how);
 static void
 rack_do_segment(struct mbuf *m, struct tcphdr *th,
     struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen,
     uint8_t iptos);
 static void rack_dtor(void *mem, int32_t size, void *arg);
 static void
 rack_earlier_retran(struct tcpcb *tp, struct rack_sendmap *rsm,
     uint32_t t, uint32_t cts);
 static struct rack_sendmap *
 rack_find_high_nonack(struct tcp_rack *rack,
     struct rack_sendmap *rsm);
 static struct rack_sendmap *rack_find_lowest_rsm(struct tcp_rack *rack);
 static void rack_free(struct tcp_rack *rack, struct rack_sendmap *rsm);
 static void rack_fini(struct tcpcb *tp, int32_t tcb_is_purged);
 static int
 rack_get_sockopt(struct socket *so, struct sockopt *sopt,
     struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack);
 static int32_t rack_handoff_ok(struct tcpcb *tp);
 static int32_t rack_init(struct tcpcb *tp);
 static void rack_init_sysctls(void);
 static void
 rack_log_ack(struct tcpcb *tp, struct tcpopt *to,
     struct tcphdr *th);
 static void
 rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len,
     uint32_t seq_out, uint8_t th_flags, int32_t err, uint32_t ts,
     uint8_t pass, struct rack_sendmap *hintrsm);
 static void
 rack_log_sack_passed(struct tcpcb *tp, struct tcp_rack *rack,
     struct rack_sendmap *rsm);
 static void rack_log_to_event(struct tcp_rack *rack, int32_t to_num);
 static int32_t rack_output(struct tcpcb *tp);
 static void
 rack_hpts_do_segment(struct mbuf *m, struct tcphdr *th,
     struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen,
     uint8_t iptos, int32_t nxt_pkt, struct timeval *tv);
 
 static uint32_t
 rack_proc_sack_blk(struct tcpcb *tp, struct tcp_rack *rack,
     struct sackblk *sack, struct tcpopt *to, struct rack_sendmap **prsm,
     uint32_t cts);
 static void rack_post_recovery(struct tcpcb *tp, struct tcphdr *th);
 static void rack_remxt_tmr(struct tcpcb *tp);
 static int
 rack_set_sockopt(struct socket *so, struct sockopt *sopt,
     struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack);
 static void rack_set_state(struct tcpcb *tp, struct tcp_rack *rack);
 static int32_t rack_stopall(struct tcpcb *tp);
 static void
 rack_timer_activate(struct tcpcb *tp, uint32_t timer_type,
     uint32_t delta);
 static int32_t rack_timer_active(struct tcpcb *tp, uint32_t timer_type);
 static void rack_timer_cancel(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int line);
 static void rack_timer_stop(struct tcpcb *tp, uint32_t timer_type);
 static uint32_t
 rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack,
     struct rack_sendmap *rsm, uint32_t ts, int32_t * lenp);
 static void
 rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack,
     struct rack_sendmap *rsm, uint32_t ts);
 static int
 rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack,
     struct rack_sendmap *rsm, struct tcpopt *to, uint32_t cts, int32_t ack_type);
 static int32_t tcp_addrack(module_t mod, int32_t type, void *data);
 static void
 rack_challenge_ack(struct mbuf *m, struct tcphdr *th,
     struct tcpcb *tp, int32_t * ret_val);
 static int
 rack_do_close_wait(struct mbuf *m, struct tcphdr *th,
     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
     int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
 static int
 rack_do_closing(struct mbuf *m, struct tcphdr *th,
     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
     int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
+static void rack_do_drop(struct mbuf *m, struct tcpcb *tp);
 static void
-rack_do_drop(struct mbuf *m, struct tcpcb *tp);
-static void
 rack_do_dropafterack(struct mbuf *m, struct tcpcb *tp,
     struct tcphdr *th, int32_t thflags, int32_t tlen, int32_t * ret_val);
 static void
 rack_do_dropwithreset(struct mbuf *m, struct tcpcb *tp,
-	struct tcphdr *th, int32_t rstreason, int32_t tlen);
+    struct tcphdr *th, int32_t rstreason, int32_t tlen);
 static int
 rack_do_established(struct mbuf *m, struct tcphdr *th,
     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
     int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
 static int
 rack_do_fastnewdata(struct mbuf *m, struct tcphdr *th,
     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
     int32_t tlen, uint32_t tiwin, int32_t nxt_pkt);
 static int
 rack_do_fin_wait_1(struct mbuf *m, struct tcphdr *th,
     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
     int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
 static int
 rack_do_fin_wait_2(struct mbuf *m, struct tcphdr *th,
     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
     int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
 static int
 rack_do_lastack(struct mbuf *m, struct tcphdr *th,
     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
     int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
 static int
 rack_do_syn_recv(struct mbuf *m, struct tcphdr *th,
     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
     int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
 static int
 rack_do_syn_sent(struct mbuf *m, struct tcphdr *th,
     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
     int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
 static int
 rack_drop_checks(struct tcpopt *to, struct mbuf *m,
     struct tcphdr *th, struct tcpcb *tp, int32_t * tlenp, int32_t * thf,
     int32_t * drop_hdrlen, int32_t * ret_val);
 static int
 rack_process_rst(struct mbuf *m, struct tcphdr *th,
     struct socket *so, struct tcpcb *tp);
 struct rack_sendmap *
 tcp_rack_output(struct tcpcb *tp, struct tcp_rack *rack,
     uint32_t tsused);
 static void tcp_rack_xmit_timer(struct tcp_rack *rack, int32_t rtt);
 static void
      tcp_rack_partialack(struct tcpcb *tp, struct tcphdr *th);
 
 static int
 rack_ts_check(struct mbuf *m, struct tcphdr *th,
     struct tcpcb *tp, int32_t tlen, int32_t thflags, int32_t * ret_val);
 
 int32_t rack_clear_counter=0;
 
 
 static int
 sysctl_rack_clear(SYSCTL_HANDLER_ARGS)
 {
 	uint32_t stat;
 	int32_t error;
 
 	error = SYSCTL_OUT(req, &rack_clear_counter, sizeof(uint32_t));
 	if (error || req->newptr == NULL)
 		return error;
 
 	error = SYSCTL_IN(req, &stat, sizeof(uint32_t));
 	if (error)
 		return (error);
 	if (stat == 1) {
 #ifdef INVARIANTS
 		printf("Clearing RACK counters\n");
 #endif
 		counter_u64_zero(rack_badfr);
 		counter_u64_zero(rack_badfr_bytes);
 		counter_u64_zero(rack_rtm_prr_retran);
 		counter_u64_zero(rack_rtm_prr_newdata);
 		counter_u64_zero(rack_timestamp_mismatch);
 		counter_u64_zero(rack_reorder_seen);
 		counter_u64_zero(rack_tlp_tot);
 		counter_u64_zero(rack_tlp_newdata);
 		counter_u64_zero(rack_tlp_retran);
 		counter_u64_zero(rack_tlp_retran_bytes);
 		counter_u64_zero(rack_tlp_retran_fail);
 		counter_u64_zero(rack_to_tot);
 		counter_u64_zero(rack_to_arm_rack);
 		counter_u64_zero(rack_to_arm_tlp);
 		counter_u64_zero(rack_paced_segments);
 		counter_u64_zero(rack_unpaced_segments);
 		counter_u64_zero(rack_saw_enobuf);
 		counter_u64_zero(rack_saw_enetunreach);
 		counter_u64_zero(rack_to_alloc_hard);
 		counter_u64_zero(rack_to_alloc_emerg);
 		counter_u64_zero(rack_sack_proc_all);
 		counter_u64_zero(rack_sack_proc_short);
 		counter_u64_zero(rack_sack_proc_restart);
 		counter_u64_zero(rack_to_alloc);
+		counter_u64_zero(rack_to_alloc_limited);
 		counter_u64_zero(rack_alloc_limited_conns);
 		counter_u64_zero(rack_split_limited);
 		counter_u64_zero(rack_find_high);
 		counter_u64_zero(rack_runt_sacks);
 		counter_u64_zero(rack_used_tlpmethod);
 		counter_u64_zero(rack_used_tlpmethod2);
 		counter_u64_zero(rack_enter_tlp_calc);
 		counter_u64_zero(rack_progress_drops);
 		counter_u64_zero(rack_tlp_does_nada);
 	}
 	rack_clear_counter = 0;
 	return (0);
 }
 
 
 
 static void
 rack_init_sysctls()
 {
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
+	    OID_AUTO, "map_limit", CTLFLAG_RW,
+	    &rack_map_entries_limit , 1024,
+	    "Is there a limit on how big the sendmap can grow? ");
+
+	SYSCTL_ADD_S32(&rack_sysctl_ctx,
+	    SYSCTL_CHILDREN(rack_sysctl_root),
+	    OID_AUTO, "map_splitlimit", CTLFLAG_RW,
+	    &rack_map_split_limit , 256,
+	    "Is there a limit on how much splitting a peer can do?");
+
+	SYSCTL_ADD_S32(&rack_sysctl_ctx,
+	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "rate_sample_method", CTLFLAG_RW,
 	    &rack_rate_sample_method , USE_RTT_LOW,
 	    "What method should we use for rate sampling 0=high, 1=low ");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "data_after_close", CTLFLAG_RW,
 	    &rack_ignore_data_after_close, 0,
 	    "Do we hold off sending a RST until all pending data is ack'd");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "tlpmethod", CTLFLAG_RW,
 	    &rack_tlp_threshold_use, TLP_USE_TWO_ONE,
 	    "What method do we do for TLP time calc 0=no-de-ack-comp, 1=ID, 2=2.1, 3=2.2");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "min_pace_time", CTLFLAG_RW,
 	    &rack_min_pace_time, 0,
 	    "Should we enforce a minimum pace time of 1ms");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "min_pace_segs", CTLFLAG_RW,
 	    &rack_min_pace_time_seg_req, 6,
 	    "How many segments have to be in the len to enforce min-pace-time");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "idle_reduce_high", CTLFLAG_RW,
 	    &rack_reduce_largest_on_idle, 0,
 	    "Should we reduce the largest cwnd seen to IW on idle reduction");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "bb_verbose", CTLFLAG_RW,
 	    &rack_verbose_logging, 0,
 	    "Should RACK black box logging be verbose");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "sackfiltering", CTLFLAG_RW,
 	    &rack_use_sack_filter, 1,
 	    "Do we use sack filtering?");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "delayed_ack", CTLFLAG_RW,
 	    &rack_delayed_ack_time, 200,
 	    "Delayed ack time (200ms)");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "tlpminto", CTLFLAG_RW,
 	    &rack_tlp_min, 10,
 	    "TLP minimum timeout per the specification (10ms)");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "precache", CTLFLAG_RW,
 	    &rack_precache, 0,
 	    "Where should we precache the mcopy (0 is not at all)");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "sblklimit", CTLFLAG_RW,
 	    &rack_sack_block_limit, 128,
 	    "When do we start paying attention to small sack blocks");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "send_oldest", CTLFLAG_RW,
 	    &rack_always_send_oldest, 1,
 	    "Should we always send the oldest TLP and RACK-TLP");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "rack_tlp_in_recovery", CTLFLAG_RW,
 	    &rack_tlp_in_recovery, 1,
 	    "Can we do a TLP during recovery?");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "rack_tlimit", CTLFLAG_RW,
 	    &rack_limited_retran, 0,
 	    "How many times can a rack timeout drive out sends");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "minrto", CTLFLAG_RW,
 	    &rack_rto_min, 0,
 	    "Minimum RTO in ms -- set with caution below 1000 due to TLP");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "maxrto", CTLFLAG_RW,
 	    &rack_rto_max, 0,
 	    "Maxiumum RTO in ms -- should be at least as large as min_rto");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "tlp_retry", CTLFLAG_RW,
 	    &rack_tlp_max_resend, 2,
 	    "How many times does TLP retry a single segment or multiple with no ACK");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "recovery_loss_prop", CTLFLAG_RW,
 	    &rack_use_proportional_reduce, 0,
 	    "Should we proportionaly reduce cwnd based on the number of losses ");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "recovery_prop", CTLFLAG_RW,
 	    &rack_proportional_rate, 10,
 	    "What percent reduction per loss");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "tlp_cwnd_flag", CTLFLAG_RW,
 	    &rack_lower_cwnd_at_tlp, 0,
 	    "When a TLP completes a retran should we enter recovery?");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "hptsi_reduces", CTLFLAG_RW,
 	    &rack_slot_reduction, 4,
 	    "When setting a slot should we reduce by divisor");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "hptsi_every_seg", CTLFLAG_RW,
 	    &rack_pace_every_seg, 1,
 	    "Should we pace out every segment hptsi");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "hptsi_seg_max", CTLFLAG_RW,
 	    &rack_hptsi_segments, 6,
 	    "Should we pace out only a limited size of segments");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "prr_sendalot", CTLFLAG_RW,
 	    &rack_send_a_lot_in_prr, 1,
 	    "Send a lot in prr");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "minto", CTLFLAG_RW,
 	    &rack_min_to, 1,
 	    "Minimum rack timeout in milliseconds");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "earlyrecoveryseg", CTLFLAG_RW,
 	    &rack_early_recovery_max_seg, 6,
 	    "Max segments in early recovery");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "earlyrecovery", CTLFLAG_RW,
 	    &rack_early_recovery, 1,
 	    "Do we do early recovery with rack");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "reorder_thresh", CTLFLAG_RW,
 	    &rack_reorder_thresh, 2,
 	    "What factor for rack will be added when seeing reordering (shift right)");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "rtt_tlp_thresh", CTLFLAG_RW,
 	    &rack_tlp_thresh, 1,
 	    "what divisor for TLP rtt/retran will be added (1=rtt, 2=1/2 rtt etc)");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "reorder_fade", CTLFLAG_RW,
 	    &rack_reorder_fade, 0,
 	    "Does reorder detection fade, if so how many ms (0 means never)");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "pktdelay", CTLFLAG_RW,
 	    &rack_pkt_delay, 1,
 	    "Extra RACK time (in ms) besides reordering thresh");
-	SYSCTL_ADD_U32(&rack_sysctl_ctx,
-	    SYSCTL_CHILDREN(rack_sysctl_root),
-	    OID_AUTO, "split_limit", CTLFLAG_RW,
-	    &rack_map_split_limit, 0,
-	    "Is there a limit on the number of map split entries (0=unlimited)");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "inc_var", CTLFLAG_RW,
 	    &rack_inc_var, 0,
 	    "Should rack add to the TLP timer the variance in rtt calculation");
 	rack_badfr = counter_u64_alloc(M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "badfr", CTLFLAG_RD,
 	    &rack_badfr, "Total number of bad FRs");
 	rack_badfr_bytes = counter_u64_alloc(M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "badfr_bytes", CTLFLAG_RD,
 	    &rack_badfr_bytes, "Total number of bad FRs");
 	rack_rtm_prr_retran = counter_u64_alloc(M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "prrsndret", CTLFLAG_RD,
 	    &rack_rtm_prr_retran,
 	    "Total number of prr based retransmits");
 	rack_rtm_prr_newdata = counter_u64_alloc(M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "prrsndnew", CTLFLAG_RD,
 	    &rack_rtm_prr_newdata,
 	    "Total number of prr based new transmits");
 	rack_timestamp_mismatch = counter_u64_alloc(M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "tsnf", CTLFLAG_RD,
 	    &rack_timestamp_mismatch,
 	    "Total number of timestamps that we could not find the reported ts");
 	rack_find_high = counter_u64_alloc(M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "findhigh", CTLFLAG_RD,
 	    &rack_find_high,
 	    "Total number of FIN causing find-high");
 	rack_reorder_seen = counter_u64_alloc(M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "reordering", CTLFLAG_RD,
 	    &rack_reorder_seen,
 	    "Total number of times we added delay due to reordering");
 	rack_tlp_tot = counter_u64_alloc(M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "tlp_to_total", CTLFLAG_RD,
 	    &rack_tlp_tot,
 	    "Total number of tail loss probe expirations");
 	rack_tlp_newdata = counter_u64_alloc(M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "tlp_new", CTLFLAG_RD,
 	    &rack_tlp_newdata,
 	    "Total number of tail loss probe sending new data");
 
 	rack_tlp_retran = counter_u64_alloc(M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "tlp_retran", CTLFLAG_RD,
 	    &rack_tlp_retran,
 	    "Total number of tail loss probe sending retransmitted data");
 	rack_tlp_retran_bytes = counter_u64_alloc(M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "tlp_retran_bytes", CTLFLAG_RD,
 	    &rack_tlp_retran_bytes,
 	    "Total bytes of tail loss probe sending retransmitted data");
 	rack_tlp_retran_fail = counter_u64_alloc(M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "tlp_retran_fail", CTLFLAG_RD,
 	    &rack_tlp_retran_fail,
 	    "Total number of tail loss probe sending retransmitted data that failed (wait for t3)");
 	rack_to_tot = counter_u64_alloc(M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "rack_to_tot", CTLFLAG_RD,
 	    &rack_to_tot,
 	    "Total number of times the rack to expired?");
 	rack_to_arm_rack = counter_u64_alloc(M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "arm_rack", CTLFLAG_RD,
 	    &rack_to_arm_rack,
 	    "Total number of times the rack timer armed?");
 	rack_to_arm_tlp = counter_u64_alloc(M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "arm_tlp", CTLFLAG_RD,
 	    &rack_to_arm_tlp,
 	    "Total number of times the tlp timer armed?");
 	rack_paced_segments = counter_u64_alloc(M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "paced", CTLFLAG_RD,
 	    &rack_paced_segments,
 	    "Total number of times a segment send caused hptsi");
 	rack_unpaced_segments = counter_u64_alloc(M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "unpaced", CTLFLAG_RD,
 	    &rack_unpaced_segments,
 	    "Total number of times a segment did not cause hptsi");
 	rack_saw_enobuf = counter_u64_alloc(M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "saw_enobufs", CTLFLAG_RD,
 	    &rack_saw_enobuf,
 	    "Total number of times a segment did not cause hptsi");
 	rack_saw_enetunreach = counter_u64_alloc(M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "saw_enetunreach", CTLFLAG_RD,
 	    &rack_saw_enetunreach,
 	    "Total number of times a segment did not cause hptsi");
 	rack_to_alloc = counter_u64_alloc(M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "allocs", CTLFLAG_RD,
 	    &rack_to_alloc,
 	    "Total allocations of tracking structures");
 	rack_to_alloc_hard = counter_u64_alloc(M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "allochard", CTLFLAG_RD,
 	    &rack_to_alloc_hard,
 	    "Total allocations done with sleeping the hard way");
 	rack_to_alloc_emerg = counter_u64_alloc(M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "allocemerg", CTLFLAG_RD,
 	    &rack_to_alloc_emerg,
 	    "Total allocations done from emergency cache");
+	rack_to_alloc_limited = counter_u64_alloc(M_WAITOK);
+	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
+	    SYSCTL_CHILDREN(rack_sysctl_root),
+	    OID_AUTO, "alloc_limited", CTLFLAG_RD,
+	    &rack_to_alloc_limited,
+	    "Total allocations dropped due to limit");
 	rack_alloc_limited_conns = counter_u64_alloc(M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "alloc_limited_conns", CTLFLAG_RD,
 	    &rack_alloc_limited_conns,
 	    "Connections with allocations dropped due to limit");
 	rack_split_limited = counter_u64_alloc(M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "split_limited", CTLFLAG_RD,
 	    &rack_split_limited,
 	    "Split allocations dropped due to limit");
 	rack_sack_proc_all = counter_u64_alloc(M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "sack_long", CTLFLAG_RD,
 	    &rack_sack_proc_all,
 	    "Total times we had to walk whole list for sack processing");
 
 	rack_sack_proc_restart = counter_u64_alloc(M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "sack_restart", CTLFLAG_RD,
 	    &rack_sack_proc_restart,
 	    "Total times we had to walk whole list due to a restart");
 	rack_sack_proc_short = counter_u64_alloc(M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "sack_short", CTLFLAG_RD,
 	    &rack_sack_proc_short,
 	    "Total times we took shortcut for sack processing");
 	rack_enter_tlp_calc = counter_u64_alloc(M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "tlp_calc_entered", CTLFLAG_RD,
 	    &rack_enter_tlp_calc,
 	    "Total times we called calc-tlp");
 	rack_used_tlpmethod = counter_u64_alloc(M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "hit_tlp_method", CTLFLAG_RD,
 	    &rack_used_tlpmethod,
 	    "Total number of runt sacks");
 	rack_used_tlpmethod2 = counter_u64_alloc(M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "hit_tlp_method2", CTLFLAG_RD,
 	    &rack_used_tlpmethod2,
 	    "Total number of runt sacks 2");
 	rack_runt_sacks = counter_u64_alloc(M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "runtsacks", CTLFLAG_RD,
 	    &rack_runt_sacks,
 	    "Total number of runt sacks");
 	rack_progress_drops = counter_u64_alloc(M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "prog_drops", CTLFLAG_RD,
 	    &rack_progress_drops,
 	    "Total number of progress drops");
 	rack_input_idle_reduces = counter_u64_alloc(M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "idle_reduce_oninput", CTLFLAG_RD,
 	    &rack_input_idle_reduces,
 	    "Total number of idle reductions on input");
 	rack_tlp_does_nada = counter_u64_alloc(M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "tlp_nada", CTLFLAG_RD,
 	    &rack_tlp_does_nada,
 	    "Total number of nada tlp calls");
 	COUNTER_ARRAY_ALLOC(rack_out_size, TCP_MSS_ACCT_SIZE, M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64_ARRAY(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "outsize", CTLFLAG_RD,
 	    rack_out_size, TCP_MSS_ACCT_SIZE, "MSS send sizes");
 	COUNTER_ARRAY_ALLOC(rack_opts_arry, RACK_OPTS_SIZE, M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64_ARRAY(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "opts", CTLFLAG_RD,
 	    rack_opts_arry, RACK_OPTS_SIZE, "RACK Option Stats");
 	SYSCTL_ADD_PROC(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "clear", CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE,
 	    &rack_clear_counter, 0, sysctl_rack_clear, "IU", "Clear counters");
 }
 
 static inline int32_t
 rack_progress_timeout_check(struct tcpcb *tp)
 {
+#ifdef NETFLIX_PROGRESS	/* otherwise this check always returns 0 */
 	if (tp->t_maxunacktime && tp->t_acktime && TSTMP_GT(ticks, tp->t_acktime)) {
 		if ((ticks - tp->t_acktime) >= tp->t_maxunacktime) {
 			/*
 			 * There is an assumption that the caller
 			 * will drop the connection so we will
 			 * increment the counters here.
 			 */
 			struct tcp_rack *rack;
 			rack = (struct tcp_rack *)tp->t_fb_ptr;
 			counter_u64_add(rack_progress_drops, 1);
-#ifdef NETFLIX_STATS
 			TCPSTAT_INC(tcps_progdrops);
-#endif
 			rack_log_progress_event(rack, tp, ticks, PROGRESS_DROP, __LINE__);
 			return (1);
 		}
 	}
+#endif /* NETFLIX_PROGRESS */
 	return (0);
 }
 
 
 static void
 rack_log_to_start(struct tcp_rack *rack, uint32_t cts, uint32_t to, int32_t slot, uint8_t which)
 {
 	if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
 		union tcp_log_stackspecific log;
 
 		memset(&log.u_bbr, 0, sizeof(log.u_bbr));
 		log.u_bbr.flex1 = TICKS_2_MSEC(rack->rc_tp->t_srtt >> TCP_RTT_SHIFT);
 		log.u_bbr.flex2 = to;
 		log.u_bbr.flex3 = rack->r_ctl.rc_hpts_flags;
 		log.u_bbr.flex4 = slot;
 		log.u_bbr.flex5 = rack->rc_inp->inp_hptsslot;
 		log.u_bbr.flex6 = rack->rc_tp->t_rxtcur;
 		log.u_bbr.flex8 = which;
 		log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
 		log.u_bbr.ininput = rack->rc_inp->inp_in_input;
 		TCP_LOG_EVENT(rack->rc_tp, NULL,
 		    &rack->rc_inp->inp_socket->so_rcv,
 		    &rack->rc_inp->inp_socket->so_snd,
 		    BBR_LOG_TIMERSTAR, 0,
 		    0, &log, false);
 	}
 }
 
 static void
 rack_log_to_event(struct tcp_rack *rack, int32_t to_num)
 {
 	if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
 		union tcp_log_stackspecific log;
 
 		memset(&log.u_bbr, 0, sizeof(log.u_bbr));
 		log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
 		log.u_bbr.ininput = rack->rc_inp->inp_in_input;
 		log.u_bbr.flex8 = to_num;
 		log.u_bbr.flex1 = rack->r_ctl.rc_rack_min_rtt;
 		log.u_bbr.flex2 = rack->rc_rack_rtt;
 		TCP_LOG_EVENT(rack->rc_tp, NULL,
 		    &rack->rc_inp->inp_socket->so_rcv,
 		    &rack->rc_inp->inp_socket->so_snd,
 		    BBR_LOG_RTO, 0,
 		    0, &log, false);
 	}
 }
 
 static void
 rack_log_rtt_upd(struct tcpcb *tp, struct tcp_rack *rack, int32_t t,
     uint32_t o_srtt, uint32_t o_var)
 {
 	if (tp->t_logstate != TCP_LOG_STATE_OFF) {
 		union tcp_log_stackspecific log;
 
 		memset(&log.u_bbr, 0, sizeof(log.u_bbr));
 		log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
 		log.u_bbr.ininput = rack->rc_inp->inp_in_input;
 		log.u_bbr.flex1 = t;
 		log.u_bbr.flex2 = o_srtt;
 		log.u_bbr.flex3 = o_var;
 		log.u_bbr.flex4 = rack->r_ctl.rack_rs.rs_rtt_lowest;
-		log.u_bbr.flex5 = rack->r_ctl.rack_rs.rs_rtt_highest;		
+		log.u_bbr.flex5 = rack->r_ctl.rack_rs.rs_rtt_highest;
 		log.u_bbr.flex6 = rack->r_ctl.rack_rs.rs_rtt_cnt;
 		log.u_bbr.rttProp = rack->r_ctl.rack_rs.rs_rtt_tot;
 		log.u_bbr.flex8 = rack->r_ctl.rc_rate_sample_method;
 		TCP_LOG_EVENT(tp, NULL,
 		    &rack->rc_inp->inp_socket->so_rcv,
 		    &rack->rc_inp->inp_socket->so_snd,
 		    BBR_LOG_BBRRTT, 0,
 		    0, &log, false);
 	}
 }
 
 static void
 rack_log_rtt_sample(struct tcp_rack *rack, uint32_t rtt)
 {
-	/* 
+	/*
 	 * Log the rtt sample we are
 	 * applying to the srtt algorithm in
 	 * useconds.
 	 */
 	if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
 		union tcp_log_stackspecific log;
 		struct timeval tv;
-		
+
+		memset(&log, 0, sizeof(log));
 		/* Convert our ms to a microsecond */
 		log.u_bbr.flex1 = rtt * 1000;
 		log.u_bbr.timeStamp = tcp_get_usecs(&tv);
 		TCP_LOG_EVENTP(rack->rc_tp, NULL,
 		    &rack->rc_inp->inp_socket->so_rcv,
 		    &rack->rc_inp->inp_socket->so_snd,
 		    TCP_LOG_RTT, 0,
 		    0, &log, false, &tv);
 	}
 }
 
 
 static inline void
 rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick,  int event, int line)
 {
 	if (rack_verbose_logging && (tp->t_logstate != TCP_LOG_STATE_OFF)) {
 		union tcp_log_stackspecific log;
 
 		memset(&log.u_bbr, 0, sizeof(log.u_bbr));
 		log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
 		log.u_bbr.ininput = rack->rc_inp->inp_in_input;
 		log.u_bbr.flex1 = line;
 		log.u_bbr.flex2 = tick;
 		log.u_bbr.flex3 = tp->t_maxunacktime;
 		log.u_bbr.flex4 = tp->t_acktime;
 		log.u_bbr.flex8 = event;
 		TCP_LOG_EVENT(tp, NULL,
 		    &rack->rc_inp->inp_socket->so_rcv,
 		    &rack->rc_inp->inp_socket->so_snd,
 		    BBR_LOG_PROGRESS, 0,
 		    0, &log, false);
 	}
 }
 
 static void
 rack_log_type_bbrsnd(struct tcp_rack *rack, uint32_t len, uint32_t slot, uint32_t cts)
 {
 	if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
 		union tcp_log_stackspecific log;
 
 		memset(&log.u_bbr, 0, sizeof(log.u_bbr));
 		log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
 		log.u_bbr.ininput = rack->rc_inp->inp_in_input;
 		log.u_bbr.flex1 = slot;
 		log.u_bbr.flex7 = (0x0000ffff & rack->r_ctl.rc_hpts_flags);
 		log.u_bbr.flex8 = rack->rc_in_persist;
 		TCP_LOG_EVENT(rack->rc_tp, NULL,
 		    &rack->rc_inp->inp_socket->so_rcv,
 		    &rack->rc_inp->inp_socket->so_snd,
 		    BBR_LOG_BBRSND, 0,
 		    0, &log, false);
 	}
 }
 
 static void
 rack_log_doseg_done(struct tcp_rack *rack, uint32_t cts, int32_t nxt_pkt, int32_t did_out, int way_out)
 {
 	if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
 		union tcp_log_stackspecific log;
+
+		memset(&log, 0, sizeof(log));	/* zero fields not set below */
 		log.u_bbr.flex1 = did_out;
 		log.u_bbr.flex2 = nxt_pkt;
 		log.u_bbr.flex3 = way_out;
 		log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags;
 		log.u_bbr.flex7 = rack->r_wanted_output;
 		log.u_bbr.flex8 = rack->rc_in_persist;
 		TCP_LOG_EVENT(rack->rc_tp, NULL,
 		    &rack->rc_inp->inp_socket->so_rcv,
 		    &rack->rc_inp->inp_socket->so_snd,
 		    BBR_LOG_DOSEG_DONE, 0,
 		    0, &log, false);
 	}
 }
 
 
 static void
 rack_log_type_just_return(struct tcp_rack *rack, uint32_t cts, uint32_t tlen, uint32_t slot, uint8_t hpts_calling)
 {
 	if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
 		union tcp_log_stackspecific log;
 
 		memset(&log.u_bbr, 0, sizeof(log.u_bbr));
 		log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
 		log.u_bbr.ininput = rack->rc_inp->inp_in_input;
 		log.u_bbr.flex1 = slot;
 		log.u_bbr.flex2 = rack->r_ctl.rc_hpts_flags;
 		log.u_bbr.flex7 = hpts_calling;
 		log.u_bbr.flex8 = rack->rc_in_persist;
 		TCP_LOG_EVENT(rack->rc_tp, NULL,
 		    &rack->rc_inp->inp_socket->so_rcv,
 		    &rack->rc_inp->inp_socket->so_snd,
 		    BBR_LOG_JUSTRET, 0,
 		    tlen, &log, false);
 	}
 }
 
 static void
 rack_log_to_cancel(struct tcp_rack *rack, int32_t hpts_removed, int line)
 {
 	if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
 		union tcp_log_stackspecific log;
 
 		memset(&log.u_bbr, 0, sizeof(log.u_bbr));
 		log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
 		log.u_bbr.ininput = rack->rc_inp->inp_in_input;
 		log.u_bbr.flex1 = line;
 		log.u_bbr.flex2 = 0;
 		log.u_bbr.flex3 = rack->r_ctl.rc_hpts_flags;
 		log.u_bbr.flex4 = 0;
 		log.u_bbr.flex6 = rack->rc_tp->t_rxtcur;
 		log.u_bbr.flex8 = hpts_removed;
 		TCP_LOG_EVENT(rack->rc_tp, NULL,
 		    &rack->rc_inp->inp_socket->so_rcv,
 		    &rack->rc_inp->inp_socket->so_snd,
 		    BBR_LOG_TIMERCANC, 0,
 		    0, &log, false);
 	}
 }
 
 static void
 rack_log_to_processing(struct tcp_rack *rack, uint32_t cts, int32_t ret, int32_t timers)
 {
 	if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
 		union tcp_log_stackspecific log;
 
 		memset(&log.u_bbr, 0, sizeof(log.u_bbr));
 		log.u_bbr.flex1 = timers;
 		log.u_bbr.flex2 = ret;
 		log.u_bbr.flex3 = rack->r_ctl.rc_timer_exp;
 		log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags;
 		log.u_bbr.flex5 = cts;
 		TCP_LOG_EVENT(rack->rc_tp, NULL,
 		    &rack->rc_inp->inp_socket->so_rcv,
 		    &rack->rc_inp->inp_socket->so_snd,
 		    BBR_LOG_TO_PROCESS, 0,
 		    0, &log, false);
 	}
 }
 
 static void
 rack_counter_destroy()
 {
 	counter_u64_free(rack_badfr);
 	counter_u64_free(rack_badfr_bytes);
 	counter_u64_free(rack_rtm_prr_retran);
 	counter_u64_free(rack_rtm_prr_newdata);
 	counter_u64_free(rack_timestamp_mismatch);
 	counter_u64_free(rack_reorder_seen);
 	counter_u64_free(rack_tlp_tot);
 	counter_u64_free(rack_tlp_newdata);
 	counter_u64_free(rack_tlp_retran);
 	counter_u64_free(rack_tlp_retran_bytes);
 	counter_u64_free(rack_tlp_retran_fail);
 	counter_u64_free(rack_to_tot);
 	counter_u64_free(rack_to_arm_rack);
 	counter_u64_free(rack_to_arm_tlp);
 	counter_u64_free(rack_paced_segments);
 	counter_u64_free(rack_unpaced_segments);
 	counter_u64_free(rack_saw_enobuf);
 	counter_u64_free(rack_saw_enetunreach);
 	counter_u64_free(rack_to_alloc_hard);
 	counter_u64_free(rack_to_alloc_emerg);
 	counter_u64_free(rack_sack_proc_all);
 	counter_u64_free(rack_sack_proc_short);
 	counter_u64_free(rack_sack_proc_restart);
 	counter_u64_free(rack_to_alloc);
+	counter_u64_free(rack_to_alloc_limited);
+	counter_u64_free(rack_split_limited);	/* XXX: rack_alloc_limited_conns is not freed here */
 	counter_u64_free(rack_find_high);
 	counter_u64_free(rack_runt_sacks);
 	counter_u64_free(rack_enter_tlp_calc);
 	counter_u64_free(rack_used_tlpmethod);
 	counter_u64_free(rack_used_tlpmethod2);
 	counter_u64_free(rack_progress_drops);
 	counter_u64_free(rack_input_idle_reduces);
 	counter_u64_free(rack_tlp_does_nada);
 	COUNTER_ARRAY_FREE(rack_out_size, TCP_MSS_ACCT_SIZE);
 	COUNTER_ARRAY_FREE(rack_opts_arry, RACK_OPTS_SIZE);
 }
 
 static struct rack_sendmap *
 rack_alloc(struct tcp_rack *rack)
 {
 	struct rack_sendmap *rsm;
 
 	rsm = uma_zalloc(rack_zone, M_NOWAIT);
 	if (rsm) {
-alloc_done:
-		counter_u64_add(rack_to_alloc, 1);
 		rack->r_ctl.rc_num_maps_alloced++;
+		counter_u64_add(rack_to_alloc, 1);	/* zone allocations only */
 		return (rsm);
 	}
 	if (rack->rc_free_cnt) {
 		counter_u64_add(rack_to_alloc_emerg, 1);
 		rsm = TAILQ_FIRST(&rack->r_ctl.rc_free);
 		TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_next);
 		rack->rc_free_cnt--;
-		goto alloc_done;
+		return (rsm);	/* reused from rc_free; not re-counted */
 	}
 	return (NULL);
 }
 
+static struct rack_sendmap *
+rack_alloc_full_limit(struct tcp_rack *rack)
+{
+	if ((rack_map_entries_limit > 0) &&	/* 0 disables the limit */
+	    (rack->r_ctl.rc_num_maps_alloced >= rack_map_entries_limit)) {
+		counter_u64_add(rack_to_alloc_limited, 1);
+		if (!rack->alloc_limit_reported) {	/* first hit only */
+			rack->alloc_limit_reported = 1;
+			counter_u64_add(rack_alloc_limited_conns, 1);
+		}
+		return (NULL);	/* at or over the limit: refuse */
+	}
+	return (rack_alloc(rack));	/* may still be NULL */
+}
+
 /* wrapper to allocate a sendmap entry, subject to a specific limit */
 static struct rack_sendmap *
 rack_alloc_limit(struct tcp_rack *rack, uint8_t limit_type)
 {
 	struct rack_sendmap *rsm;
 
 	if (limit_type) {
 		/* currently there is only one limit type */
 		if (rack_map_split_limit > 0 &&
 		    rack->r_ctl.rc_num_split_allocs >= rack_map_split_limit) {
 			counter_u64_add(rack_split_limited, 1);
 			if (!rack->alloc_limit_reported) {
 				rack->alloc_limit_reported = 1;
 				counter_u64_add(rack_alloc_limited_conns, 1);
 			}
 			return (NULL);
 		}
 	}
 
 	/* allocate and mark in the limit type, if set */
 	rsm = rack_alloc(rack);
 	if (rsm != NULL && limit_type) {
 		rsm->r_limit_type = limit_type;
 		rack->r_ctl.rc_num_split_allocs++;
 	}
 	return (rsm);
 }
 
 static void
 rack_free(struct tcp_rack *rack, struct rack_sendmap *rsm)
 {
 	if (rsm->r_limit_type) {
 		/* currently there is only one limit type */
 		rack->r_ctl.rc_num_split_allocs--;
 	}
-	rack->r_ctl.rc_num_maps_alloced--;
 	if (rack->r_ctl.rc_tlpsend == rsm)
 		rack->r_ctl.rc_tlpsend = NULL;
 	if (rack->r_ctl.rc_next == rsm)
 		rack->r_ctl.rc_next = NULL;
 	if (rack->r_ctl.rc_sacklast == rsm)
 		rack->r_ctl.rc_sacklast = NULL;
 	if (rack->rc_free_cnt < rack_free_cache) {
 		memset(rsm, 0, sizeof(struct rack_sendmap));
 		TAILQ_INSERT_TAIL(&rack->r_ctl.rc_free, rsm, r_next);
+		rsm->r_limit_type = 0;	/* XXX: already zeroed by memset above */
 		rack->rc_free_cnt++;
 		return;
 	}
+	rack->r_ctl.rc_num_maps_alloced--;	/* cached ones stay counted */
 	uma_zfree(rack_zone, rsm);
 }
 
 /*
  * CC wrapper hook functions
  */
 static void
 rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack, struct tcphdr *th, uint16_t nsegs,
     uint16_t type, int32_t recovery)
 {
 #ifdef NETFLIX_STATS
 	int32_t gput;
 #endif
-#ifdef NETFLIX_CWV
-	u_long old_cwnd = tp->snd_cwnd;
-#endif
 
 	INP_WLOCK_ASSERT(tp->t_inpcb);
+
 	tp->ccv->nsegs = nsegs;
 	tp->ccv->bytes_this_ack = BYTES_THIS_ACK(tp, th);
 	if ((recovery) && (rack->r_ctl.rc_early_recovery_segs)) {
 		uint32_t max;
 
 		max = rack->r_ctl.rc_early_recovery_segs * tp->t_maxseg;
 		if (tp->ccv->bytes_this_ack > max) {
 			tp->ccv->bytes_this_ack = max;
 		}
 	}
 	if (tp->snd_cwnd <= tp->snd_wnd)
 		tp->ccv->flags |= CCF_CWND_LIMITED;
 	else
 		tp->ccv->flags &= ~CCF_CWND_LIMITED;
 
 	if (type == CC_ACK) {
 #ifdef NETFLIX_STATS
 		stats_voi_update_abs_s32(tp->t_stats, VOI_TCP_CALCFRWINDIFF,
 		    ((int32_t) tp->snd_cwnd) - tp->snd_wnd);
 		if ((tp->t_flags & TF_GPUTINPROG) &&
 		    SEQ_GEQ(th->th_ack, tp->gput_ack)) {
 			gput = (((int64_t) (th->th_ack - tp->gput_seq)) << 3) /
 			    max(1, tcp_ts_getticks() - tp->gput_ts);
 			stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_GPUT,
 			    gput);
 			/*
 			 * XXXLAS: This is a temporary hack, and should be
 			 * chained off VOI_TCP_GPUT when stats(9) grows an
 			 * API to deal with chained VOIs.
 			 */
 			if (tp->t_stats_gput_prev > 0)
 				stats_voi_update_abs_s32(tp->t_stats,
 				    VOI_TCP_GPUT_ND,
 				    ((gput - tp->t_stats_gput_prev) * 100) /
 				    tp->t_stats_gput_prev);
 			tp->t_flags &= ~TF_GPUTINPROG;
 			tp->t_stats_gput_prev = gput;
-#ifdef NETFLIX_CWV
 			if (tp->t_maxpeakrate) {
 				/*
 				 * We update t_peakrate_thr. This gives us roughly
 				 * one update per round trip time.
 				 */
 				tcp_update_peakrate_thr(tp);
 			}
-#endif
 		}
 #endif
 		if (tp->snd_cwnd > tp->snd_ssthresh) {
 			tp->t_bytes_acked += min(tp->ccv->bytes_this_ack,
 			    nsegs * V_tcp_abc_l_var * tp->t_maxseg);
 			if (tp->t_bytes_acked >= tp->snd_cwnd) {
 				tp->t_bytes_acked -= tp->snd_cwnd;
 				tp->ccv->flags |= CCF_ABC_SENTAWND;
 			}
 		} else {
 			tp->ccv->flags &= ~CCF_ABC_SENTAWND;
 			tp->t_bytes_acked = 0;
 		}
 	}
 	if (CC_ALGO(tp)->ack_received != NULL) {
 		/* XXXLAS: Find a way to live without this */
 		tp->ccv->curack = th->th_ack;
 		CC_ALGO(tp)->ack_received(tp->ccv, type);
 	}
 #ifdef NETFLIX_STATS
 	stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_LCWIN, tp->snd_cwnd);
 #endif
 	if (rack->r_ctl.rc_rack_largest_cwnd < tp->snd_cwnd) {
 		rack->r_ctl.rc_rack_largest_cwnd = tp->snd_cwnd;
 	}
-#ifdef NETFLIX_CWV
-	if (tp->cwv_enabled) {
-		/*
-		 * Per RFC 7661: The behaviour in the non-validated phase is
-		 * specified as: o  A sender determines whether to increase
-		 * the cwnd based upon whether it is cwnd-limited (see
-		 * Section 4.5.3): * A sender that is cwnd-limited MAY use
-		 * the standard TCP method to increase cwnd (i.e., the
-		 * standard method permits a TCP sender that fully utilises
-		 * the cwnd to increase the cwnd each time it receives an
-		 * ACK). * A sender that is not cwnd-limited MUST NOT
-		 * increase the cwnd when ACK packets are received in this
-		 * phase (i.e., needs to avoid growing the cwnd when it has
-		 * not recently sent using the current size of cwnd).
-		 */
-		if ((tp->snd_cwnd > old_cwnd) &&
-		    (tp->cwv_cwnd_valid == 0) &&
-		    (!(tp->ccv->flags & CCF_CWND_LIMITED))) {
-			tp->snd_cwnd = old_cwnd;
-		}
-		/* Try to update pipeAck and NCWV state */
-		if (TCPS_HAVEESTABLISHED(tp->t_state) &&
-		    !IN_RECOVERY(tp->t_flags)) {
-			uint32_t data = sbavail(&(tp->t_inpcb->inp_socket->so_snd));
-
-			tcp_newcwv_update_pipeack(tp, data);
-		}
-	}
 	/* we enforce max peak rate if it is set. */
 	if (tp->t_peakrate_thr && tp->snd_cwnd > tp->t_peakrate_thr) {
 		tp->snd_cwnd = tp->t_peakrate_thr;
 	}
-#endif
 }
 
 static void
 tcp_rack_partialack(struct tcpcb *tp, struct tcphdr *th)
 {
 	struct tcp_rack *rack;
 
 	rack = (struct tcp_rack *)tp->t_fb_ptr;
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 	if (rack->r_ctl.rc_prr_sndcnt > 0)
 		rack->r_wanted_output++;
 }
 
 static void
 rack_post_recovery(struct tcpcb *tp, struct tcphdr *th)
 {
 	struct tcp_rack *rack;
 
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 	rack = (struct tcp_rack *)tp->t_fb_ptr;
 	if (CC_ALGO(tp)->post_recovery != NULL) {
 		tp->ccv->curack = th->th_ack;
 		CC_ALGO(tp)->post_recovery(tp->ccv);
 	}
 	/*
 	 * Here we can in theory adjust cwnd to be based on the number of
 	 * losses in the window (rack->r_ctl.rc_loss_count). This is done
 	 * based on the rack_use_proportional flag.
 	 */
 	if (rack->r_ctl.rc_prop_reduce && rack->r_ctl.rc_prop_rate) {
 		int32_t reduce;
 
 		reduce = (rack->r_ctl.rc_loss_count * rack->r_ctl.rc_prop_rate);
 		if (reduce > 50) {
 			reduce = 50;
 		}
 		tp->snd_cwnd -= ((reduce * tp->snd_cwnd) / 100);
 	} else {
 		if (tp->snd_cwnd > tp->snd_ssthresh) {
 			/* Drop us down to the ssthresh (1/2 cwnd at loss) */
 			tp->snd_cwnd = tp->snd_ssthresh;
 		}
 	}
 	if (rack->r_ctl.rc_prr_sndcnt > 0) {
 		/* Suck the next prr cnt back into cwnd */
 		tp->snd_cwnd += rack->r_ctl.rc_prr_sndcnt;
 		rack->r_ctl.rc_prr_sndcnt = 0;
 	}
+	tp->snd_recover = tp->snd_una;
 	EXIT_RECOVERY(tp->t_flags);
-
-
-#ifdef NETFLIX_CWV
-	if (tp->cwv_enabled) {
-		if ((tp->cwv_cwnd_valid == 0) &&
-		    (tp->snd_cwv.in_recovery))
-			tcp_newcwv_end_recovery(tp);
-	}
-#endif
 }
 
 static void
 rack_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type)
 {
 	struct tcp_rack *rack;
 
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 
 	rack = (struct tcp_rack *)tp->t_fb_ptr;
 	switch (type) {
 	case CC_NDUPACK:
 /*		rack->r_ctl.rc_ssthresh_set = 1;*/
 		if (!IN_FASTRECOVERY(tp->t_flags)) {
 			rack->r_ctl.rc_tlp_rtx_out = 0;
 			rack->r_ctl.rc_prr_delivered = 0;
 			rack->r_ctl.rc_prr_out = 0;
 			rack->r_ctl.rc_loss_count = 0;
 			rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg;
 			rack->r_ctl.rc_prr_recovery_fs = tp->snd_max - tp->snd_una;
 			tp->snd_recover = tp->snd_max;
 			if (tp->t_flags & TF_ECN_PERMIT)
 				tp->t_flags |= TF_ECN_SND_CWR;
 		}
 		break;
 	case CC_ECN:
 		if (!IN_CONGRECOVERY(tp->t_flags) ||
 		    /*
 		     * Allow ECN reaction on ACK to CWR, if
 		     * that data segment was also CE marked.
 		     */
 		    SEQ_GEQ(th->th_ack, tp->snd_recover)) {
 			EXIT_CONGRECOVERY(tp->t_flags);
 			TCPSTAT_INC(tcps_ecn_rcwnd);
 			tp->snd_recover = tp->snd_max + 1;
 			if (tp->t_flags & TF_ECN_PERMIT)
 				tp->t_flags |= TF_ECN_SND_CWR;
 		}
 		break;
 	case CC_RTO:
 		tp->t_dupacks = 0;
 		tp->t_bytes_acked = 0;
 		EXIT_RECOVERY(tp->t_flags);
 		tp->snd_ssthresh = max(2, min(tp->snd_wnd, tp->snd_cwnd) / 2 /
 		    tp->t_maxseg) * tp->t_maxseg;
 		tp->snd_cwnd = tp->t_maxseg;
 		break;
 	case CC_RTO_ERR:
 		TCPSTAT_INC(tcps_sndrexmitbad);
 		/* RTO was unnecessary, so reset everything. */
 		tp->snd_cwnd = tp->snd_cwnd_prev;
 		tp->snd_ssthresh = tp->snd_ssthresh_prev;
 		tp->snd_recover = tp->snd_recover_prev;
 		if (tp->t_flags & TF_WASFRECOVERY)
 			ENTER_FASTRECOVERY(tp->t_flags);
 		if (tp->t_flags & TF_WASCRECOVERY)
 			ENTER_CONGRECOVERY(tp->t_flags);
 		tp->snd_nxt = tp->snd_max;
 		tp->t_badrxtwin = 0;
 		break;
 	}
 
 	if (CC_ALGO(tp)->cong_signal != NULL) {
 		if (th != NULL)
 			tp->ccv->curack = th->th_ack;
 		CC_ALGO(tp)->cong_signal(tp->ccv, type);
 	}
-#ifdef NETFLIX_CWV
-	if (tp->cwv_enabled) {
-		if (tp->snd_cwv.in_recovery == 0 && IN_RECOVERY(tp->t_flags)) {
-			tcp_newcwv_enter_recovery(tp);
-		}
-		if (type == CC_RTO) {
-			tcp_newcwv_reset(tp);
-		}
-	}
-#endif
 }
 
 
 
 static inline void
 rack_cc_after_idle(struct tcpcb *tp, int reduce_largest)
 {
 	uint32_t i_cwnd;
 
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 
 #ifdef NETFLIX_STATS
 	TCPSTAT_INC(tcps_idle_restarts);
 	if (tp->t_state == TCPS_ESTABLISHED)
 		TCPSTAT_INC(tcps_idle_estrestarts);
 #endif
 	if (CC_ALGO(tp)->after_idle != NULL)
 		CC_ALGO(tp)->after_idle(tp->ccv);
 
 	if (tp->snd_cwnd == 1)
 		i_cwnd = tp->t_maxseg;		/* SYN(-ACK) lost */
 	else if (V_tcp_initcwnd_segments)
 		i_cwnd = min((V_tcp_initcwnd_segments * tp->t_maxseg),
 		    max(2 * tp->t_maxseg, V_tcp_initcwnd_segments * 1460));
 	else if (V_tcp_do_rfc3390)
 		i_cwnd = min(4 * tp->t_maxseg,
 		    max(2 * tp->t_maxseg, 4380));
 	else {
 		/* Per RFC5681 Section 3.1 */
 		if (tp->t_maxseg > 2190)
 			i_cwnd = 2 * tp->t_maxseg;
 		else if (tp->t_maxseg > 1095)
 			i_cwnd = 3 * tp->t_maxseg;
 		else
 			i_cwnd = 4 * tp->t_maxseg;
 	}
 	if (reduce_largest) {
 		/*
-		 * Do we reduce the largest cwnd to make 
+		 * Do we reduce the largest cwnd to make
 		 * rack play nice on restart hptsi wise?
 		 */
 		if (((struct tcp_rack *)tp->t_fb_ptr)->r_ctl.rc_rack_largest_cwnd  > i_cwnd)
 			((struct tcp_rack *)tp->t_fb_ptr)->r_ctl.rc_rack_largest_cwnd = i_cwnd;
 	}
 	/*
 	 * Being idle is no differnt than the initial window. If the cc
 	 * clamps it down below the initial window raise it to the initial
 	 * window.
 	 */
 	if (tp->snd_cwnd < i_cwnd) {
 		tp->snd_cwnd = i_cwnd;
 	}
 }
 
 
 /*
  * Indicate whether this ack should be delayed.  We can delay the ack if
  * following conditions are met:
  *	- There is no delayed ack timer in progress.
  *	- Our last ack wasn't a 0-sized window. We never want to delay
  *	  the ack that opens up a 0-sized window.
  *	- LRO wasn't used for this segment. We make sure by checking that the
  *	  segment size is not larger than the MSS.
  *	- Delayed acks are enabled or this is a half-synchronized T/TCP
  *	  connection.
  */
 #define DELAY_ACK(tp, tlen)			 \
 	(((tp->t_flags & TF_RXWIN0SENT) == 0) && \
 	((tp->t_flags & TF_DELACK) == 0) && 	 \
 	(tlen <= tp->t_maxseg) &&		 \
 	(tp->t_delayed_ack || (tp->t_flags & TF_NEEDSYN)))
 
 static inline void
 rack_calc_rwin(struct socket *so, struct tcpcb *tp)
 {
 	int32_t win;
 
 	/*
 	 * Calculate amount of space in receive window, and then do TCP
 	 * input processing. Receive window is amount of space in rcv queue,
 	 * but not less than advertised window.
 	 */
 	win = sbspace(&so->so_rcv);
 	if (win < 0)
 		win = 0;
 	tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
 }
 
 static void
 rack_do_drop(struct mbuf *m, struct tcpcb *tp)
 {
 	/*
 	 * Drop space held by incoming segment and return.
 	 */
 	if (tp != NULL)
 		INP_WUNLOCK(tp->t_inpcb);
 	if (m)
 		m_freem(m);
 }
 
 static void
-rack_do_dropwithreset(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th,
-    int32_t rstreason, int32_t tlen)
+rack_do_dropwithreset(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th, int32_t rstreason, int32_t tlen)
 {
 	if (tp != NULL) {
 		tcp_dropwithreset(m, th, tp, tlen, rstreason);
 		INP_WUNLOCK(tp->t_inpcb);
 	} else
 		tcp_dropwithreset(m, th, NULL, tlen, rstreason);
 }
 
 /*
  * Drop the incoming segment while arranging for an ACK to be sent.
  *
  * On return *ret_val tells the caller what became of the tcb:
  * 1 = we dropped it (and its lock), 0 = the TCB is still locked
  * and valid.
  */
 static void
 rack_do_dropafterack(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th, int32_t thflags, int32_t tlen, int32_t * ret_val)
 {
 	/*
 	 * Generate an ACK dropping incoming segment if it occupies sequence
 	 * space, where the ACK reflects our state.
 	 *
 	 * The RST flag needs no test here: every path that reaches this
 	 * code runs after packets containing RST have been dropped.
 	 *
 	 * In the SYN-RECEIVED state an ACK is only sent when the received
 	 * segment passes the SYN-RECEIVED ACK test; when it fails, a RST is
 	 * sent instead.  This breaks the loop in the "LAND" DoS attack, and
 	 * also prevents an ACK storm between two listening ports that have
 	 * been sent forged SYN segments, each with the source address of
 	 * the other.
 	 */
 	struct tcp_rack *rack;
 
 	if ((tp->t_state == TCPS_SYN_RECEIVED) && (thflags & TH_ACK)) {
 		if (SEQ_GT(tp->snd_una, th->th_ack) ||
 		    SEQ_GT(th->th_ack, tp->snd_max)) {
 			/* The ACK lies outside [snd_una, snd_max]: reset. */
 			*ret_val = 1;
 			rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
 			return;
 		}
 	}
 	*ret_val = 0;
 	rack = (struct tcp_rack *)tp->t_fb_ptr;
 	rack->r_wanted_output++;
 	tp->t_flags |= TF_ACKNOW;
 	if (m != NULL) {
 		m_freem(m);
 	}
 }
 
 
 /*
  * Process a segment for which the caller wants RST handling.
  *
  * Returns 1 when the connection was closed; on that path rack_do_drop()
  * is called with the mbuf and the tcpcb pointer returned by tcp_close().
  * Returns 0 otherwise: the mbuf is then either handed to tcp_respond()
  * for a challenge ACK or freed with m_freem().
  */
 static int
 rack_process_rst(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp)
 {
 	/*
 	 * RFC5961 Section 3.2
 	 *
 	 * - RST drops connection only if SEG.SEQ == RCV.NXT. - If RST is in
 	 * window, we send challenge ACK.
 	 *
 	 * Note: to take into account delayed ACKs, we should test against
 	 * last_ack_sent instead of rcv_nxt. Note 2: we handle special case
 	 * of closed window, not covered by the RFC.
 	 */
 	int dropped = 0;
 
 	if ((SEQ_GEQ(th->th_seq, (tp->last_ack_sent - 1)) &&
 	    SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) ||
 	    (tp->rcv_wnd == 0 && tp->last_ack_sent == th->th_seq)) {
 
 		INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
 		KASSERT(tp->t_state != TCPS_SYN_SENT,
 		    ("%s: TH_RST for TCPS_SYN_SENT th %p tp %p",
 		    __func__, th, tp));
 
 		/*
 		 * Accept the reset when insecure resets are enabled or the
 		 * sequence number equals last_ack_sent, rcv_nxt, or
 		 * last_ack_sent - 1; otherwise answer with a challenge ACK.
 		 */
 		if (V_tcp_insecure_rst ||
 		    (tp->last_ack_sent == th->th_seq) ||
 		    (tp->rcv_nxt == th->th_seq) ||
 		    ((tp->last_ack_sent - 1) == th->th_seq)) {
 			TCPSTAT_INC(tcps_drops);
 			/* Drop the connection. */
 			switch (tp->t_state) {
 			case TCPS_SYN_RECEIVED:
 				so->so_error = ECONNREFUSED;
 				goto close;
 			case TCPS_ESTABLISHED:
 			case TCPS_FIN_WAIT_1:
 			case TCPS_FIN_WAIT_2:
 			case TCPS_CLOSE_WAIT:
 			case TCPS_CLOSING:
 			case TCPS_LAST_ACK:
 				so->so_error = ECONNRESET;
 				/*
 				 * Shared tail: SYN_RECEIVED jumps here after
 				 * setting its own so_error.
 				 */
 		close:
 				tcp_state_change(tp, TCPS_CLOSED);
 				/* FALLTHROUGH */
 			default:
 				tp = tcp_close(tp);
 			}
 			dropped = 1;
 			/* rack_do_drop() copes with tp being NULL here. */
 			rack_do_drop(m, tp);
 		} else {
 			TCPSTAT_INC(tcps_badrst);
 			/* Send challenge ACK. */
 			tcp_respond(tp, mtod(m, void *), th, m,
 			    tp->rcv_nxt, tp->snd_nxt, TH_ACK);
 			tp->last_ack_sent = tp->rcv_nxt;
 		}
 	} else {
 		/* Sequence number is outside the accepted range: discard. */
 		m_freem(m);
 	}
 	return (dropped);
 }
 
 /*
  * On return *ret_val tells the caller what became of the tcb:
  * 1 = we dropped it (and its lock), 0 = the TCB is still locked
  * and valid.
  */
 static void
 rack_challenge_ack(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int32_t * ret_val)
 {
 	INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
 
 	TCPSTAT_INC(tcps_badsyn);
 	if (V_tcp_insecure_syn &&
 	    SEQ_GEQ(th->th_seq, tp->last_ack_sent) &&
 	    SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) {
 		/*
 		 * Insecure-syn handling is enabled and the sequence number
 		 * lies in [last_ack_sent, last_ack_sent + rcv_wnd): drop
 		 * the connection.
 		 */
 		tp = tcp_drop(tp, ECONNRESET);
 		*ret_val = 1;
 		rack_do_drop(m, tp);
 		return;
 	}
 	/* Send challenge ACK. */
 	tcp_respond(tp, mtod(m, void *), th, m, tp->rcv_nxt,
 	    tp->snd_nxt, TH_ACK);
 	tp->last_ack_sent = tp->rcv_nxt;
 	*ret_val = 0;
 	/* Called with no mbuf and no tcpcb, so nothing is freed or unlocked. */
 	rack_do_drop(NULL, NULL);
 }
 
 /*
  * rack_ts_check returns 1 when the caller should not proceed with the
  * segment and 0 when it may.  In the former case *ret_val holds what the
  * caller should itself return: 1 indicates that the TCB is unlocked and
  * probably dropped, 0 indicates the TCB is still valid and locked.
  */
 static int
 rack_ts_check(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int32_t tlen, int32_t thflags, int32_t * ret_val)
 {
 
 	/* Check to see if ts_recent is over 24 days old.  */
 	if (tcp_ts_getticks() - tp->ts_recent_age > TCP_PAWS_IDLE) {
 		/*
 		 * Invalidate ts_recent.  Should this segment update
 		 * ts_recent, the age is reset later and ts_recent gets a
 		 * valid value.  Should it not, a zero ts_recent at least
 		 * satisfies the requirement that zero be placed in the
 		 * timestamp echo reply when ts_recent isn't valid.  The age
 		 * is left alone until a valid ts_recent arrives because we
 		 * don't want out-of-order segments to be dropped when
 		 * ts_recent is old.
 		 */
 		tp->ts_recent = 0;
 		return (0);
 	}
 	/* ts_recent is not stale: account for the segment and drop it. */
 	TCPSTAT_INC(tcps_rcvduppack);
 	TCPSTAT_ADD(tcps_rcvdupbyte, tlen);
 	TCPSTAT_INC(tcps_pawsdrop);
 	*ret_val = 0;
 	if (tlen != 0) {
 		rack_do_dropafterack(m, tp, th, thflags, tlen, ret_val);
 	} else {
 		/* No payload: free the mbuf only, the lock stays held. */
 		rack_do_drop(m, NULL);
 	}
 	return (1);
 }
 
 /*
  * rack_drop_checks returns 1 for you should not proceed. It places
  * in ret_val what should be returned 1/0 by the caller. The 1 indicates
  * that the TCB is unlocked and probably dropped. The 0 indicates the
  * TCB is still valid and locked.
  *
  * The function trims the segment against rcv_nxt (leading duplicate
  * data) and against rcv_nxt + rcv_wnd (trailing data beyond the window).
  * The flags and length are worked on in local copies and written back
  * through thf and tlenp only when 0 is returned; *drop_hdrlen is
  * increased by the number of leading bytes to discard.
  */
 static int
-rack_drop_checks(struct tcpopt *to, struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int32_t * tlenp,  int32_t * thf, int32_t * drop_hdrlen, int32_t * ret_val)
+rack_drop_checks(struct tcpopt *to, struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int32_t * tlenp, int32_t * thf, int32_t * drop_hdrlen, int32_t * ret_val)
 {
 	int32_t todrop;
 	int32_t thflags;
 	int32_t tlen;
 
 	thflags = *thf;
 	tlen = *tlenp;
 	/* Positive when the segment starts before rcv_nxt. */
 	todrop = tp->rcv_nxt - th->th_seq;
 	if (todrop > 0) {
 		if (thflags & TH_SYN) {
 			/* Strip the SYN, which occupies one sequence number. */
 			thflags &= ~TH_SYN;
 			th->th_seq++;
 			if (th->th_urp > 1)
 				th->th_urp--;
 			else
 				thflags &= ~TH_URG;
 			todrop--;
 		}
 		/*
 		 * Following if statement from Stevens, vol. 2, p. 960.
 		 */
 		if (todrop > tlen
 		    || (todrop == tlen && (thflags & TH_FIN) == 0)) {
 			/*
 			 * Any valid FIN must be to the left of the window.
 			 * At this point the FIN must be a duplicate or out
 			 * of sequence; drop it.
 			 */
 			thflags &= ~TH_FIN;
 			/*
 			 * Send an ACK to resynchronize and drop any data.
 			 * But keep on processing for RST or ACK.
 			 */
 			tp->t_flags |= TF_ACKNOW;
 			todrop = tlen;
 			TCPSTAT_INC(tcps_rcvduppack);
 			TCPSTAT_ADD(tcps_rcvdupbyte, todrop);
 		} else {
 			TCPSTAT_INC(tcps_rcvpartduppack);
 			TCPSTAT_ADD(tcps_rcvpartdupbyte, todrop);
 		}
 		/*
 		 * DSACK - add SACK block for dropped range
 		 */
 		if (tp->t_flags & TF_SACK_PERMIT) {
 			tcp_update_sack_list(tp, th->th_seq,
 			    th->th_seq + todrop);
 			/*
 			 * ACK now, as the next in-sequence segment
 			 * will clear the DSACK block again
 			 */
 			tp->t_flags |= TF_ACKNOW;
 		}
 		*drop_hdrlen += todrop;	/* drop from the top afterwards */
 		th->th_seq += todrop;
 		tlen -= todrop;
 		if (th->th_urp > todrop)
 			th->th_urp -= todrop;
 		else {
 			thflags &= ~TH_URG;
 			th->th_urp = 0;
 		}
 	}
 	/*
 	 * If segment ends after window, drop trailing data (and PUSH and
 	 * FIN); if nothing left, just ACK.
 	 */
 	todrop = (th->th_seq + tlen) - (tp->rcv_nxt + tp->rcv_wnd);
 	if (todrop > 0) {
 		TCPSTAT_INC(tcps_rcvpackafterwin);
 		if (todrop >= tlen) {
 			TCPSTAT_ADD(tcps_rcvbyteafterwin, tlen);
 			/*
 			 * If window is closed can only take segments at
 			 * window edge, and have to drop data and PUSH from
 			 * incoming segments.  Continue processing, but
 			 * remember to ack.  Otherwise, drop segment and
 			 * ack.
 			 */
 			if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) {
 				tp->t_flags |= TF_ACKNOW;
 				TCPSTAT_INC(tcps_rcvwinprobe);
 			} else {
 				/*
 				 * Early exit: *thf and *tlenp are left
 				 * untouched, ret_val is set by the callee.
 				 */
 				rack_do_dropafterack(m, tp, th, thflags, tlen, ret_val);
 				return (1);
 			}
 		} else
 			TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop);
 		/* A negative count makes m_adj() trim from the tail. */
 		m_adj(m, -todrop);
 		tlen -= todrop;
 		thflags &= ~(TH_PUSH | TH_FIN);
 	}
 	*thf = thflags;
 	*tlenp = tlen;
 	return (0);
 }
 
 static struct rack_sendmap *
 rack_find_lowest_rsm(struct tcp_rack *rack)
 {
 	struct rack_sendmap *rsm;
 
 	/*
 	 * Scan the time-ordered transmit list for the first entry that has
 	 * not been acked, i.e. the one sent the longest time ago that is
 	 * still outstanding.  When every entry is acked (or the list is
 	 * empty) the loop runs off the end and rsm is NULL.
 	 */
 	TAILQ_FOREACH(rsm, &rack->r_ctl.rc_tmap, r_tnext) {
 		if ((rsm->r_flags & RACK_ACKED) == 0)
 			break;
 	}
 	return (rsm);
 }
 
 static struct rack_sendmap *
 rack_find_high_nonack(struct tcp_rack *rack, struct rack_sendmap *rsm)
 {
 	struct rack_sendmap *cur = rsm;
 
 	/*
 	 * Step backward through the sequence-ordered list, starting at rsm,
 	 * and stop at the first entry that is neither acked nor carrying a
 	 * FIN; that is the highest sequence still un-acked.  In theory the
 	 * caller's rsm should already have been that entry (which it was
 	 * not).
 	 */
 	counter_u64_add(rack_find_high, 1);
 	TAILQ_FOREACH_REVERSE_FROM(cur, &rack->r_ctl.rc_map, rack_head, r_next) {
 		if ((cur->r_flags & (RACK_ACKED | RACK_HAS_FIN)) == 0)
 			return (cur);
 	}
 	return (NULL);
 }
 
 
 /*
  * Compute the RACK timer threshold: srtt plus rc_pkt_delay, plus either
  * an extra fraction of srtt when reordering is currently considered
  * present or 1 otherwise.  The result is capped at
  * TICKS_2_MSEC(t_rxtcur) and at rack_rto_max.
  *
  * Side effect: rc_reorder_ts is cleared when the time since the last
  * observed reordering exceeds rc_reorder_fade.
  */
 static uint32_t
 rack_calc_thresh_rack(struct tcp_rack *rack, uint32_t srtt, uint32_t cts)
 {
 	int32_t lro;
 	uint32_t thresh;
 
 	/*
 	 * lro is the flag we use to determine if we have seen reordering.
 	 * If it gets set we have seen reordering. The reorder logic
 	 * works in one of two ways:
 	 *
 	 * If reorder-fade is configured, then we track the last time we saw
 	 * re-ordering occur. If we reach the point where enough time has
 	 * passed we no longer consider reordering to be occurring.
 	 *
 	 * Or if reorder-fade is 0, then once we see reordering we consider
 	 * the connection to always be subject to reordering and just set lro
 	 * to 1.
 	 *
 	 * In the end if lro is non-zero we add the extra time for
 	 * reordering in.
 	 */
 	if (srtt == 0)
 		srtt = 1;
 	if (rack->r_ctl.rc_reorder_ts) {
 		if (rack->r_ctl.rc_reorder_fade) {
 			if (SEQ_GEQ(cts, rack->r_ctl.rc_reorder_ts)) {
 				lro = cts - rack->r_ctl.rc_reorder_ts;
 				if (lro == 0) {
 					/*
 					 * No time has passed since the last
 					 * reorder, mark it as reordering.
 					 */
 					lro = 1;
 				}
 			} else {
 				/* Negative time? */
 				lro = 0;
 			}
 			if (lro > rack->r_ctl.rc_reorder_fade) {
 				/* Turn off reordering seen too */
 				rack->r_ctl.rc_reorder_ts = 0;
 				lro = 0;
 			}
 		} else {
 			/* Reordering does not fade */
 			lro = 1;
 		}
 	} else {
 		lro = 0;
 	}
 	thresh = srtt + rack->r_ctl.rc_pkt_delay;
 	if (lro) {
 		/* It must be set, if not you get 1/4 rtt */
 		if (rack->r_ctl.rc_reorder_shift)
 			thresh += (srtt >> rack->r_ctl.rc_reorder_shift);
 		else
 			thresh += (srtt >> 2);
 	} else {
 		thresh += 1;
 	}
 	/* We don't let the rack timeout be above a RTO */
-	
+
 	if (thresh > TICKS_2_MSEC(rack->rc_tp->t_rxtcur)) {
 		thresh = TICKS_2_MSEC(rack->rc_tp->t_rxtcur);
 	}
 	/* And we don't want it above the RTO max either */
 	if (thresh > rack_rto_max) {
 		thresh = rack_rto_max;
 	}
 	return (thresh);
 }
 
 /*
  * Compute the tail-loss-probe threshold for rsm.
  *
  * The base value is srtt + srtt / rc_tlp_threshold when rc_tlp_threshold
  * is set and 2 * srtt otherwise.  Depending on rack_tlp_threshold_use it
  * may be raised to srtt * 1.5 + rack_delayed_ack_time or extended by an
  * inter-packet gap.  The result is capped at TICKS_2_MSEC(t_rxtcur) and
  * rack_rto_max, and finally raised to at least rack_tlp_min.
  */
 static uint32_t
 rack_calc_thresh_tlp(struct tcpcb *tp, struct tcp_rack *rack,
 		     struct rack_sendmap *rsm, uint32_t srtt)
 {
 	struct rack_sendmap *prsm;
 	uint32_t thresh, len;
 	int maxseg;
-	
+
 	if (srtt == 0)
 		srtt = 1;
 	if (rack->r_ctl.rc_tlp_threshold)
 		thresh = srtt + (srtt / rack->r_ctl.rc_tlp_threshold);
 	else
 		thresh = (srtt * 2);
-	
+
 	/* Get the previous sent packet, if any  */
 	maxseg = tcp_maxseg(tp);
 	counter_u64_add(rack_enter_tlp_calc, 1);
 	len = rsm->r_end - rsm->r_start;
 	if (rack->rack_tlp_threshold_use == TLP_USE_ID) {
 		/* Exactly like the ID */
 		if (((tp->snd_max - tp->snd_una) - rack->r_ctl.rc_sacked + rack->r_ctl.rc_holes_rxt) <= maxseg) {
 			uint32_t alt_thresh;
 			/*
 			 * Compensate for delayed-ack with the d-ack time.
 			 */
 			counter_u64_add(rack_used_tlpmethod, 1);
 			alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time;
 			if (alt_thresh > thresh)
 				thresh = alt_thresh;
 		}
 	} else if (rack->rack_tlp_threshold_use == TLP_USE_TWO_ONE) {
 		/* 2.1 behavior */
 		prsm = TAILQ_PREV(rsm, rack_head, r_tnext);
 		if (prsm && (len <= maxseg)) {
 			/*
 			 * Two packets outstanding, thresh should be (2*srtt) +
 			 * possible inter-packet delay (if any).
 			 */
 			uint32_t inter_gap = 0;
 			int idx, nidx;
-			
+
 			counter_u64_add(rack_used_tlpmethod, 1);
 			/* idx: last send slot of rsm; nidx: last send slot of prsm. */
 			idx = rsm->r_rtr_cnt - 1;
 			nidx = prsm->r_rtr_cnt - 1;
 			/*
 			 * NOTE(review): the comparison indexes rsm with nidx
 			 * and prsm with idx, while the subtraction below
 			 * uses rsm[idx] and prsm[nidx].  The indices in the
 			 * comparison look swapped -- confirm.
 			 */
 			if (TSTMP_GEQ(rsm->r_tim_lastsent[nidx], prsm->r_tim_lastsent[idx])) {
 				/* Yes it was sent later (or at the same time) */
 				inter_gap = rsm->r_tim_lastsent[idx] - prsm->r_tim_lastsent[nidx];
 			}
 			thresh += inter_gap;
 		} else 	if (len <= maxseg) {
 			/*
 			 * Possibly compensate for delayed-ack.
 			 */
 			uint32_t alt_thresh;
-			
+
 			counter_u64_add(rack_used_tlpmethod2, 1);
 			alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time;
 			if (alt_thresh > thresh)
 				thresh = alt_thresh;
 		}
 	} else if (rack->rack_tlp_threshold_use == TLP_USE_TWO_TWO) {
 		/* 2.2 behavior */
 		if (len <= maxseg) {
 			uint32_t alt_thresh;
 			/*
 			 * Compensate for delayed-ack with the d-ack time.
 			 */
 			counter_u64_add(rack_used_tlpmethod, 1);
 			alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time;
 			if (alt_thresh > thresh)
 				thresh = alt_thresh;
 		}
 	}
  	/* Not above an RTO */
 	if (thresh > TICKS_2_MSEC(tp->t_rxtcur)) {
 		thresh = TICKS_2_MSEC(tp->t_rxtcur);
 	}
 	/* Not above a RTO max */
 	if (thresh > rack_rto_max) {
 		thresh = rack_rto_max;
 	}
 	/* Apply user supplied min TLP */
 	if (thresh < rack_tlp_min) {
 		thresh = rack_tlp_min;
 	}
 	return (thresh);
 }
 
 static struct rack_sendmap *
 rack_check_recovery_mode(struct tcpcb *tp, uint32_t tsused)
 {
 	/*
 	 * Decide whether we have to fall into recovery.  That is the case
 	 * when our oldest un-acked transmit is older than the RACK
 	 * threshold, i.e. past the time we should have had an ack.  The
 	 * over-due entry is returned, or NULL when nothing is over-due.
 	 */
 	struct tcp_rack *rack = (struct tcp_rack *)tp->t_fb_ptr;
 	struct rack_sendmap *rsm;
 	int32_t idx;
 	uint32_t srtt_cur, srtt, thresh;
 
 	if (TAILQ_EMPTY(&rack->r_ctl.rc_map))
 		return (NULL);
 
 	/* Use the smoothed rtt, bounded above by the rack rtt when known. */
 	srtt_cur = tp->t_srtt >> TCP_RTT_SHIFT;
 	srtt = TICKS_2_MSEC(srtt_cur);
 	if ((rack->rc_rack_rtt != 0) && (srtt > rack->rc_rack_rtt))
 		srtt = rack->rc_rack_rtt;
 
 	rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
 	if ((rsm != NULL) && (rsm->r_flags & RACK_ACKED)) {
 		/* Head is already acked, look for the first one that is not. */
 		rsm = rack_find_lowest_rsm(rack);
 	}
 	if (rsm == NULL)
 		return (NULL);
 
 	idx = rsm->r_rtr_cnt - 1;
 	thresh = rack_calc_thresh_rack(rack, srtt, tsused);
 	if ((tsused < rsm->r_tim_lastsent[idx]) ||
 	    ((tsused - rsm->r_tim_lastsent[idx]) < thresh)) {
 		/* Sent "in the future" or not yet past the threshold. */
 		return (NULL);
 	}
 	/* Ok if we reach here we are over-due */
 	rack->r_ctl.rc_rsm_start = rsm->r_start;
 	rack->r_ctl.rc_cwnd_at = tp->snd_cwnd;
 	rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh;
 	rack_cong_signal(tp, NULL, CC_NDUPACK);
 	return (rsm);
 }
 
 static uint32_t
 rack_get_persists_timer_val(struct tcpcb *tp, struct tcp_rack *rack)
 {
 	int32_t t;
 	int32_t tt;
 	uint32_t ret_val;
 
 	t = TICKS_2_MSEC((tp->t_srtt >> TCP_RTT_SHIFT) + ((tp->t_rttvar * 4) >> TCP_RTT_SHIFT));
 	TCPT_RANGESET(tt, t * tcp_backoff[tp->t_rxtshift],
 	    tcp_persmin, tcp_persmax);
 	if (tp->t_rxtshift < TCP_MAXRXTSHIFT)
 		tp->t_rxtshift++;
 	rack->r_ctl.rc_hpts_flags |= PACE_TMR_PERSIT;
 	ret_val = (uint32_t)tt;
 	return (ret_val);
 }
 
 /*
  * Pick which timer (persist, RXT, RACK or TLP) should run next and return
  * its timeout.  The matching PACE_TMR_* bit is OR-ed into rc_hpts_flags.
  * A return of 0 means no timer is to be started; any other path returns
  * at least 1.
  */
 static uint32_t
 rack_timer_start(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
 {
 	/*
 	 * Start the FR timer, we do this based on getting the first one in
 	 * the rc_tmap. Note that if its NULL we must stop the timer. in all
 	 * events we need to stop the running timer (if its running) before
 	 * starting the new one.
 	 */
 	uint32_t thresh, exp, to, srtt, time_since_sent;
 	uint32_t srtt_cur;
 	int32_t idx;
 	int32_t is_tlp_timer = 0;
 	struct rack_sendmap *rsm;
-	
+
 	if (rack->t_timers_stopped) {
 		/* All timers have been stopped none are to run */
 		return (0);
 	}
 	if (rack->rc_in_persist) {
 		/* We can't start any timer in persists */
 		return (rack_get_persists_timer_val(tp, rack));
 	}
-	if (tp->t_state < TCPS_ESTABLISHED)
-		goto activate_rxt;
 	rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
 	if (rsm == NULL) {
 		/* Nothing on the send map */
 		/*
 		 * activate_rxt is the common fallback for every case below
 		 * that decides against a RACK or TLP timer.
 		 */
 activate_rxt:
 		if (SEQ_LT(tp->snd_una, tp->snd_max) || sbavail(&(tp->t_inpcb->inp_socket->so_snd))) {
 			rack->r_ctl.rc_hpts_flags |= PACE_TMR_RXT;
 			to = TICKS_2_MSEC(tp->t_rxtcur);
 			if (to == 0)
 				to = 1;
 			return (to);
 		}
 		return (0);
 	}
 	if (rsm->r_flags & RACK_ACKED) {
 		rsm = rack_find_lowest_rsm(rack);
 		if (rsm == NULL) {
 			/* No lowest? */
 			goto activate_rxt;
 		}
 	}
 	/* Convert from ms to usecs */
 	if (rsm->r_flags & RACK_SACK_PASSED) {
 		if ((tp->t_flags & TF_SENTFIN) &&
 		    ((tp->snd_max - tp->snd_una) == 1) &&
 		    (rsm->r_flags & RACK_HAS_FIN)) {
 			/*
 			 * We don't start a rack timer if all we have is a
 			 * FIN outstanding.
 			 */
 			goto activate_rxt;
 		}
 		if (tp->t_srtt) {
 			srtt_cur = (tp->t_srtt >> TCP_RTT_SHIFT);
 			srtt = TICKS_2_MSEC(srtt_cur);
 		} else
 			srtt = RACK_INITIAL_RTO;
 
 		thresh = rack_calc_thresh_rack(rack, srtt, cts);
 		idx = rsm->r_rtr_cnt - 1;
 		/* Expiry is the last send time plus the threshold. */
 		exp = rsm->r_tim_lastsent[idx] + thresh;
 		if (SEQ_GEQ(exp, cts)) {
 			to = exp - cts;
 			if (to < rack->r_ctl.rc_min_to) {
 				to = rack->r_ctl.rc_min_to;
 			}
 		} else {
 			/* Already past the expiry: use the minimum timeout. */
 			to = rack->r_ctl.rc_min_to;
 		}
 	} else {
 		/* Ok we need to do a TLP not RACK */
 		if ((rack->rc_tlp_in_progress != 0) ||
 		    (rack->r_ctl.rc_tlp_rtx_out != 0)) {
 			/*
 			 * The previous send was a TLP or a tlp_rtx is in
 			 * process.
 			 */
 			goto activate_rxt;
 		}
+		if ((tp->snd_max - tp->snd_una) > tp->snd_wnd) {
+			/*
+			 * Peer collapsed rwnd, don't do TLP.
+			 */
+			goto activate_rxt;
+		}
 		rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_tmap, rack_sendmap, r_tnext);
 		if (rsm == NULL) {
 			/* We found no rsm to TLP with. */
 			goto activate_rxt;
 		}
 		if (rsm->r_flags & RACK_HAS_FIN) {
 			/* If its a FIN we dont do TLP */
 			rsm = NULL;
 			goto activate_rxt;
 		}
 		idx = rsm->r_rtr_cnt - 1;
-		if (TSTMP_GT(cts,  rsm->r_tim_lastsent[idx])) 
+		if (TSTMP_GT(cts,  rsm->r_tim_lastsent[idx]))
 			time_since_sent = cts - rsm->r_tim_lastsent[idx];
 		else
 			time_since_sent = 0;
 		is_tlp_timer = 1;
 		if (tp->t_srtt) {
 			srtt_cur = (tp->t_srtt >> TCP_RTT_SHIFT);
 			srtt = TICKS_2_MSEC(srtt_cur);
 		} else
 			srtt = RACK_INITIAL_RTO;
 		thresh = rack_calc_thresh_tlp(tp, rack, rsm, srtt);
 		/* Only the part of the threshold not yet elapsed remains. */
 		if (thresh > time_since_sent)
 			to = thresh - time_since_sent;
 		else
 			to = rack->r_ctl.rc_min_to;
 		if (to > TCPTV_REXMTMAX) {
 			/*
 			 * If the TLP time works out to larger than the max
 			 * RTO lets not do TLP.. just RTO.
 			 */
 			goto activate_rxt;
 		}
 		if (rsm->r_start != rack->r_ctl.rc_last_tlp_seq) {
 			/*
 			 * The tail is no longer the last one I did a probe
 			 * on
 			 */
 			rack->r_ctl.rc_tlp_seg_send_cnt = 0;
 			rack->r_ctl.rc_last_tlp_seq = rsm->r_start;
 		}
 	}
 	if (is_tlp_timer == 0) {
 		rack->r_ctl.rc_hpts_flags |= PACE_TMR_RACK;
 	} else {
 		if ((rack->r_ctl.rc_tlp_send_cnt > rack_tlp_max_resend) ||
 		    (rack->r_ctl.rc_tlp_seg_send_cnt > rack_tlp_max_resend)) {
 			/*
 			 * We have exceeded how many times we can retran the
 			 * current TLP timer, switch to the RTO timer.
 			 */
 			goto activate_rxt;
 		} else {
 			rack->r_ctl.rc_hpts_flags |= PACE_TMR_TLP;
 		}
 	}
 	if (to == 0)
 		to = 1;
 	return (to);
 }
 
 static void
 rack_enter_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
 {
 	if (rack->rc_in_persist != 0) {
 		/* Already persisting, nothing to change. */
 		return;
 	}
 	if (((tp->t_flags & TF_SENTFIN) == 0) &&
 	    (tp->snd_max - tp->snd_una) >= sbavail(&rack->rc_inp->inp_socket->so_snd)) {
 		/* Must need to send more data to enter persist */
 		return;
 	}
 	/* Note when we went idle, cancel running timers, reset the backoff. */
 	rack->r_ctl.rc_went_idle_time = cts;
 	rack_timer_cancel(tp, rack, cts, __LINE__);
 	tp->t_rxtshift = 0;
 	rack->rc_in_persist = 1;
 }
 
 static void
 rack_exit_persist(struct tcpcb *tp, struct tcp_rack *rack)
 {
 	/*
 	 * Leave persist mode: take the connection off the hpts when it is
 	 * queued there (clearing the pacer flags with it), then reset the
 	 * persist bookkeeping, the forced-data flag and the backoff shift.
 	 */
 	if (rack->rc_inp->inp_in_hpts != 0) {
 		tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT);
 		rack->r_ctl.rc_hpts_flags = 0;
 	}
 	tp->t_rxtshift = 0;
 	tp->t_flags &= ~TF_FORCEDATA;
 	rack->r_ctl.rc_went_idle_time = 0;
 	rack->rc_in_persist = 0;
 }
 
 /*
  * Work out which timer and/or pacing slot applies to this connection and
  * insert it into the hpts accordingly.
  *
  * Nothing is done when the inpcb is already on the hpts or the state is
  * CLOSED or LISTEN.  Otherwise the function may enter persist mode, asks
  * rack_timer_start() for a timeout, folds in a pending delayed ACK, falls
  * back to a keep-alive timeout when nothing else is pending, records the
  * expiry in rc_timer_exp and calls tcp_hpts_insert() with the pacing
  * slot or the timeout.
  *
  * slot is the pacing delay requested by the caller (0 for none);
  * frm_out_sbavail is compared against the outstanding byte count when
  * deciding whether to enter persist.
  */
 static void
 rack_start_hpts_timer(struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts, int32_t line,
     int32_t slot, uint32_t tot_len_this_send, int32_t frm_out_sbavail)
 {
 	struct inpcb *inp;
 	uint32_t delayed_ack = 0;
 	uint32_t hpts_timeout;
 	uint8_t stopped;
 	uint32_t left = 0;
 
 	inp = tp->t_inpcb;
 	if (inp->inp_in_hpts) {
 		/* A previous call is already set up */
 		return;
 	}
-	if (tp->t_state == TCPS_CLOSED) {
+
+	if ((tp->t_state == TCPS_CLOSED) ||
+	    (tp->t_state == TCPS_LISTEN)) {
 		return;
 	}
 	/*
 	 * If timers were stopped earlier and the recorded expiry is still
 	 * in the future, remember how much time was left on it.
 	 */
 	stopped = rack->rc_tmr_stopped;
 	if (stopped && TSTMP_GT(rack->r_ctl.rc_timer_exp, cts)) {
 		left = rack->r_ctl.rc_timer_exp - cts;
 	}
 	rack->r_ctl.rc_timer_exp = 0;
 	if (rack->rc_inp->inp_in_hpts == 0) {
 		rack->r_ctl.rc_hpts_flags = 0;
-	} 
+	}
 	if (slot) {
 		/* We are hptsi too */
 		rack->r_ctl.rc_hpts_flags |= PACE_PKT_OUTPUT;
 	} else if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) {
-		/* 
+		/*
 		 * We are still left on the hpts when the to goes
 		 * it will be for output.
 		 */
-		if (TSTMP_GT(cts, rack->r_ctl.rc_last_output_to))
-			slot = cts - rack->r_ctl.rc_last_output_to;
+		if (TSTMP_GT(rack->r_ctl.rc_last_output_to, cts))
+			slot = rack->r_ctl.rc_last_output_to - cts;
 		else
 			slot = 1;
 	}
 	if ((tp->snd_wnd == 0) && TCPS_HAVEESTABLISHED(tp->t_state)) {
 		/* No send window.. we must enter persist */
 		rack_enter_persist(tp, rack, cts);
 	} else if ((frm_out_sbavail &&
 		    (frm_out_sbavail > (tp->snd_max - tp->snd_una)) &&
 		    (tp->snd_wnd < tp->t_maxseg)) &&
 	    TCPS_HAVEESTABLISHED(tp->t_state)) {
 		/*
 		 * If we have no window or we can't send a segment (and have
 		 * data to send.. we cheat here and frm_out_sbavail is
 		 * passed in with the sbavail(sb) only from bbr_output) and
 		 * we are established, then we must enter persits (if not
 		 * already in persits).
 		 */
 		rack_enter_persist(tp, rack, cts);
 	}
 	hpts_timeout = rack_timer_start(tp, rack, cts);
 	if (tp->t_flags & TF_DELACK) {
-		delayed_ack = TICKS_2_MSEC(tcp_delacktime);
+		delayed_ack = tcp_delacktime;
 		rack->r_ctl.rc_hpts_flags |= PACE_TMR_DELACK;
 	}
 	/*
 	 * The delayed ACK wins only when it fires before the timer chosen
 	 * above (or no timer was chosen); otherwise its flag is dropped.
 	 */
 	if (delayed_ack && ((hpts_timeout == 0) ||
 			    (delayed_ack < hpts_timeout)))
 		hpts_timeout = delayed_ack;
-	else 
+	else
 		rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_DELACK;
 	/*
 	 * If no timers are going to run and we will fall off the hptsi
 	 * wheel, we resort to a keep-alive timer if its configured.
 	 */
 	if ((hpts_timeout == 0) &&
 	    (slot == 0)) {
 		if ((V_tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) &&
 		    (tp->t_state <= TCPS_CLOSING)) {
 			/*
 			 * Ok we have no timer (persists, rack, tlp, rxt  or
 			 * del-ack), we don't have segments being paced. So
 			 * all that is left is the keepalive timer.
 			 */
 			if (TCPS_HAVEESTABLISHED(tp->t_state)) {
 				/* Get the established keep-alive time */
 				hpts_timeout = TP_KEEPIDLE(tp);
 			} else {
 				/* Get the initial setup keep-alive time */
 				hpts_timeout = TP_KEEPINIT(tp);
 			}
 			rack->r_ctl.rc_hpts_flags |= PACE_TMR_KEEP;
 		}
 	}
 	if (left && (stopped & (PACE_TMR_KEEP | PACE_TMR_DELACK)) ==
 	    (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK)) {
 		/*
 		 * RACK, TLP, persists and RXT timers all are restartable
 		 * based on actions input .. i.e we received a packet (ack
 		 * or sack) and that changes things (rw, or snd_una etc).
 		 * Thus we can restart them with a new value. For
 		 * keep-alive, delayed_ack we keep track of what was left
 		 * and restart the timer with a smaller value.
 		 */
 		if (left < hpts_timeout)
 			hpts_timeout = left;
 	}
 	if (hpts_timeout) {
 		/*
 		 * Hack alert for now we can't time-out over 2,147,483
 		 * seconds (a bit more than 596 hours), which is probably ok
 		 * :).
 		 */
 		if (hpts_timeout > 0x7ffffffe)
 			hpts_timeout = 0x7ffffffe;
 		rack->r_ctl.rc_timer_exp = cts + hpts_timeout;
 	}
 	if (slot) {
 		rack->r_ctl.rc_last_output_to = cts + slot;
 		/* Insert for whichever comes first: pacing slot or timeout. */
 		if ((hpts_timeout == 0) || (hpts_timeout > slot)) {
 			if (rack->rc_inp->inp_in_hpts == 0)
 				tcp_hpts_insert(tp->t_inpcb, HPTS_MS_TO_SLOTS(slot));
 			rack_log_to_start(rack, cts, hpts_timeout, slot, 1);
 		} else {
 			/*
 			 * Arrange for the hpts to kick back in after the
 			 * t-o if the t-o does not cause a send.
 			 */
 			if (rack->rc_inp->inp_in_hpts == 0)
 				tcp_hpts_insert(tp->t_inpcb, HPTS_MS_TO_SLOTS(hpts_timeout));
 			rack_log_to_start(rack, cts, hpts_timeout, slot, 0);
 		}
 	} else if (hpts_timeout) {
 		if (rack->rc_inp->inp_in_hpts == 0)
 			tcp_hpts_insert(tp->t_inpcb, HPTS_MS_TO_SLOTS(hpts_timeout));
 		rack_log_to_start(rack, cts, hpts_timeout, slot, 0);
 	} else {
 		/* No timer starting */
 #ifdef INVARIANTS
 		/* With data outstanding some timer should have been armed. */
 		if (SEQ_GT(tp->snd_max, tp->snd_una)) {
 			panic("tp:%p rack:%p tlts:%d cts:%u slot:%u pto:%u -- no timer started?",
 			    tp, rack, tot_len_this_send, cts, slot, hpts_timeout);
 		}
 #endif
 	}
 	rack->rc_tmr_stopped = 0;
 	if (slot)
 		rack_log_type_bbrsnd(rack, tot_len_this_send, slot, cts);
 }
 
 /*
  * RACK Timer, here we simply do logging and house keeping.
  * the normal rack_output() function will call the
  * appropriate thing to check if we need to do a RACK retransmit.
  * We return 1, saying don't proceed with rack_output only
  * when all timers have been stopped (destroyed PCB?).
  *
  * 0 is also returned, without any further work, when cts is still
  * before rc_timer_exp.
  */
 static int
 rack_timeout_rack(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
 {
 	/*
 	 * This timer simply provides an internal trigger to send out data.
 	 * The check_recovery_mode call will see if there are needed
 	 * retransmissions, if so we will enter fast-recovery. The output
 	 * call may or may not do the same thing depending on sysctl
 	 * settings.
 	 */
 	struct rack_sendmap *rsm;
 	int32_t recovery;
 
 	if (tp->t_timers->tt_flags & TT_STOPPED) {
 		return (1);
 	}
 	if (TSTMP_LT(cts, rack->r_ctl.rc_timer_exp)) {
 		/* Its not time yet */
 		return (0);
 	}
 	rack_log_to_event(rack, RACK_TO_FRM_RACK);
 	/* Sample the recovery state before the check below can change it. */
 	recovery = IN_RECOVERY(tp->t_flags);
 	counter_u64_add(rack_to_tot, 1);
 	if (rack->r_state && (rack->r_state != tp->t_state))
 		rack_set_state(tp, rack);
 	rsm = rack_check_recovery_mode(tp, cts);
 	if (rsm) {
 		uint32_t rtt;
 
 		/* rtt is computed here but not read again in this function. */
 		rtt = rack->rc_rack_rtt;
 		if (rtt == 0)
 			rtt = 1;
 		if ((recovery == 0) &&
 		    (rack->r_ctl.rc_prr_sndcnt < tp->t_maxseg)) {
 			/*
 			 * The rack-timeout that enter's us into recovery
 			 * will force out one MSS and set us up so that we
 			 * can do one more send in 2*rtt (transitioning the
 			 * rack timeout into a rack-tlp).
 			 */
 			rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg;
 		} else if ((rack->r_ctl.rc_prr_sndcnt < tp->t_maxseg) &&
 		    ((rsm->r_end - rsm->r_start) > rack->r_ctl.rc_prr_sndcnt)) {
 			/*
-			 * When a rack timer goes, we have to send at 
+			 * When a rack timer goes, we have to send at
 			 * least one segment. They will be paced a min of 1ms
 			 * apart via the next rack timer (or further
 			 * if the rack timer dictates it).
 			 */
 			rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg;
 		}
 	} else {
 		/* This is a case that should happen rarely if ever */
 		counter_u64_add(rack_tlp_does_nada, 1);
 #ifdef TCP_BLACKBOX
 		tcp_log_dump_tp_logbuf(tp, "nada counter trips", M_NOWAIT, true);
 #endif
 		/* Nothing over-due: queue the head of the transmit map. */
 		rack->r_ctl.rc_resend = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
 	}
 	rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_RACK;
 	return (0);
 }
 
+/*
+ * Fold r_rsm into l_rsm and free r_rsm.  l_rsm takes over r_rsm's end
+ * sequence, retransmitted byte count and its FIN/TLP flags; r_rsm is
+ * unlinked from rc_map (and from rc_tmap when it is on it) and handed to
+ * rack_free().  Returns l_rsm.
+ */
+static struct rack_sendmap *
+rack_merge_rsm(struct tcp_rack *rack,
+	       struct rack_sendmap *l_rsm,
+	       struct rack_sendmap *r_rsm)
+{
+	/*
+	 * We are merging two ack'd RSM's,
+	 * the l_rsm is on the left (lower seq
+	 * values) and the r_rsm is on the right
+	 * (higher seq value). The simplest way
+	 * to merge these is to move the right
+	 * one into the left. I don't think there
+	 * is any reason we need to try to find
+	 * the oldest (or last oldest retransmitted).
+	 */
+	l_rsm->r_end = r_rsm->r_end;
+	if (r_rsm->r_rtr_bytes)
+		l_rsm->r_rtr_bytes += r_rsm->r_rtr_bytes;
+	if (r_rsm->r_in_tmap) {
+		/* This really should not happen */
+		TAILQ_REMOVE(&rack->r_ctl.rc_tmap, r_rsm, r_tnext);
+	}
+	/* Now the flags */
+	if (r_rsm->r_flags & RACK_HAS_FIN)
+		l_rsm->r_flags |= RACK_HAS_FIN;
+	if (r_rsm->r_flags & RACK_TLP)
+		l_rsm->r_flags |= RACK_TLP;
+	TAILQ_REMOVE(&rack->r_ctl.rc_map, r_rsm, r_next);
+	/*
+	 * If only the surviving entry carries a limit type, move it onto
+	 * the entry about to be freed before rack_free() sees it.
+	 */
+	if ((r_rsm->r_limit_type == 0) && (l_rsm->r_limit_type != 0)) {
+		/* Transfer the split limit to the map we free */
+		r_rsm->r_limit_type = l_rsm->r_limit_type;
+		l_rsm->r_limit_type = 0;
+	}
+	rack_free(rack, r_rsm);
+	return(l_rsm);
+}
+
 /*
  * TLP Timer, here we simply setup what segment we want to
  * have the TLP expire on, the normal rack_output() will then
  * send it out.
  *
  * We return 1, saying don't proceed with rack_output only
  * when all timers have been stopped (destroyed PCB?).
  *
  * 1 is also returned when rack_progress_timeout_check() reports a
  * timeout; in that case the inpcb is marked to be dropped with ETIMEDOUT.
  */
 static int
 rack_timeout_tlp(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
 {
 	/*
 	 * Tail Loss Probe.
 	 */
 	struct rack_sendmap *rsm = NULL;
 	struct socket *so;
 	/*
 	 * old_prr_snd stays 0 unless the in-recovery new-data path below
 	 * saves rc_prr_sndcnt into it; the restore path writes it back.
 	 */
 	uint32_t amm, old_prr_snd = 0;
 	uint32_t out, avail;
 
 	if (tp->t_timers->tt_flags & TT_STOPPED) {
 		return (1);
 	}
 	if (TSTMP_LT(cts, rack->r_ctl.rc_timer_exp)) {
 		/* Its not time yet */
 		return (0);
 	}
 	if (rack_progress_timeout_check(tp)) {
 		tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
 		return (1);
 	}
 	/*
 	 * A TLP timer has expired. We have been idle for 2 rtts. So we now
 	 * need to figure out how to force a full MSS segment out.
 	 */
 	rack_log_to_event(rack, RACK_TO_FRM_TLP);
 	counter_u64_add(rack_tlp_tot, 1);
 	if (rack->r_state && (rack->r_state != tp->t_state))
 		rack_set_state(tp, rack);
 	so = tp->t_inpcb->inp_socket;
 	/* avail: bytes in the send buffer; out: bytes sent but not acked. */
 	avail = sbavail(&so->so_snd);
 	out = tp->snd_max - tp->snd_una;
 	rack->rc_timer_up = 1;
 	/*
 	 * If we are in recovery we can jazz out a segment if new data is
 	 * present simply by setting rc_prr_sndcnt to a segment.
 	 */
 	if ((avail > out) &&
 	    ((rack_always_send_oldest == 0) || (TAILQ_EMPTY(&rack->r_ctl.rc_tmap)))) {
 		/* New data is available */
 		amm = avail - out;
 		if (amm > tp->t_maxseg) {
 			amm = tp->t_maxseg;
 		} else if ((amm < tp->t_maxseg) && ((tp->t_flags & TF_NODELAY) == 0)) {
 			/* not enough to fill a MTU and no-delay is off */
 			goto need_retran;
 		}
 		/* New data is only probed with when it fits in snd_wnd. */
 		if (IN_RECOVERY(tp->t_flags)) {
 			/* Unlikely */
 			old_prr_snd = rack->r_ctl.rc_prr_sndcnt;
 			if (out + amm <= tp->snd_wnd)
 				rack->r_ctl.rc_prr_sndcnt = amm;
 			else
 				goto need_retran;
 		} else {
 			/* Set the send-new override */
 			if (out + amm <= tp->snd_wnd)
 				rack->r_ctl.rc_tlp_new_data = amm;
 			else
 				goto need_retran;
 		}
 		rack->r_ctl.rc_tlp_seg_send_cnt = 0;
 		rack->r_ctl.rc_last_tlp_seq = tp->snd_max;
 		rack->r_ctl.rc_tlpsend = NULL;
 		counter_u64_add(rack_tlp_newdata, 1);
 		goto send;
 	}
 need_retran:
 	/*
 	 * Ok we need to arrange the last un-acked segment to be re-sent, or
 	 * optionally the first un-acked segment.
 	 */
 	if (rack_always_send_oldest)
 		rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
 	else {
 		rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_map, rack_sendmap, r_next);
 		if (rsm && (rsm->r_flags & (RACK_ACKED | RACK_HAS_FIN))) {
 			rsm = rack_find_high_nonack(rack, rsm);
 		}
 	}
 	if (rsm == NULL) {
 		counter_u64_add(rack_tlp_does_nada, 1);
 #ifdef TCP_BLACKBOX
 		tcp_log_dump_tp_logbuf(tp, "nada counter trips", M_NOWAIT, true);
 #endif
 		goto out;
 	}
 	if ((rsm->r_end - rsm->r_start) > tp->t_maxseg) {
 		/*
 		 * We need to split this the last segment in two.
 		 */
 		int32_t idx;
 		struct rack_sendmap *nrsm;
 
-		nrsm = rack_alloc(rack);
+		nrsm = rack_alloc_full_limit(rack);
 		if (nrsm == NULL) {
 			/*
 			 * No memory to split, we will just exit and punt
 			 * off to the RXT timer.
 			 */
 			counter_u64_add(rack_tlp_does_nada, 1);
 			goto out;
 		}
 		/* nrsm takes the final t_maxseg bytes, rsm keeps the rest. */
 		nrsm->r_start = (rsm->r_end - tp->t_maxseg);
 		nrsm->r_end = rsm->r_end;
 		nrsm->r_rtr_cnt = rsm->r_rtr_cnt;
 		nrsm->r_flags = rsm->r_flags;
 		nrsm->r_sndcnt = rsm->r_sndcnt;
 		nrsm->r_rtr_bytes = 0;
 		rsm->r_end = nrsm->r_start;
 		for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) {
 			nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx];
 		}
 		TAILQ_INSERT_AFTER(&rack->r_ctl.rc_map, rsm, nrsm, r_next);
 		if (rsm->r_in_tmap) {
 			TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
 			nrsm->r_in_tmap = 1;
 		}
 		/* The flags were copied to nrsm; the front half loses the FIN. */
 		rsm->r_flags &= (~RACK_HAS_FIN);
 		rsm = nrsm;
 	}
 	rack->r_ctl.rc_tlpsend = rsm;
 	rack->r_ctl.rc_tlp_rtx_out = 1;
 	if (rsm->r_start == rack->r_ctl.rc_last_tlp_seq) {
 		rack->r_ctl.rc_tlp_seg_send_cnt++;
 		tp->t_rxtshift++;
 	} else {
 		rack->r_ctl.rc_last_tlp_seq = rsm->r_start;
 		rack->r_ctl.rc_tlp_seg_send_cnt = 1;
 	}
 send:
 	rack->r_ctl.rc_tlp_send_cnt++;
 	if (rack->r_ctl.rc_tlp_send_cnt > rack_tlp_max_resend) {
 		/*
 		 * Can't [re]/transmit a segment we have not heard from the
 		 * peer in max times. We need the retransmit timer to take
 		 * over.
 		 */
 restore:
 		rack->r_ctl.rc_tlpsend = NULL;
 		if (rsm)
 			rsm->r_flags &= ~RACK_TLP;
 		rack->r_ctl.rc_prr_sndcnt = old_prr_snd;
 		counter_u64_add(rack_tlp_retran_fail, 1);
 		goto out;
 	} else if (rsm) {
 		rsm->r_flags |= RACK_TLP;
 	}
 	if (rsm && (rsm->r_start == rack->r_ctl.rc_last_tlp_seq) &&
 	    (rack->r_ctl.rc_tlp_seg_send_cnt > rack_tlp_max_resend)) {
 		/*
 		 * We don't want to send a single segment more than the max
 		 * either.
 		 */
 		goto restore;
 	}
 	rack->r_timer_override = 1;
 	rack->r_tlp_running = 1;
 	rack->rc_tlp_in_progress = 1;
 	rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_TLP;
 	return (0);
 	/* Exit without a probe set up: clear rc_timer_up and the TLP flag. */
 out:
 	rack->rc_timer_up = 0;
 	rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_TLP;
 	return (0);
 }
 
 /*
  * Delayed-ACK timer expiry.
  *
  * Swaps TF_DELACK for TF_ACKNOW on the tcpcb, bumps the tcps_delack
  * statistic and clears PACE_TMR_DELACK from the rack hpts flags. From
  * there the output routine will send the ack out.
  *
  * Returns 1, saying don't proceed, only if the timers are flagged
  * TT_STOPPED (destroyed PCB?); otherwise returns 0. The cts argument
  * is not used here.
  */
 static int
 rack_timeout_delack(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
 {
 	if (tp->t_timers->tt_flags & TT_STOPPED) {
 		return (1);
 	}
 	rack_log_to_event(rack, RACK_TO_FRM_DELACK);
 	tp->t_flags &= ~TF_DELACK;
 	tp->t_flags |= TF_ACKNOW;
 	TCPSTAT_INC(tcps_delack);
 	rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_DELACK;
 	return (0);
 }
 
 /*
  * Persist timer expiry.
  *
  * In the normal case this sets TF_FORCEDATA so that the output routine
  * will send the one byte window probe.
  *
  * Return values:
  *   1 - don't proceed: the timers are flagged TT_STOPPED (destroyed
  *       PCB?), the progress timeout check fired, or one of the two
  *       persist-drop conditions below was met. In the drop cases the
  *       inpcb has been marked to drop with ETIMEDOUT.
  *   0 - otherwise (including the case where we are not in persist at
  *       all, in which case nothing is done).
  *
  * The cts argument is not used here.
  */
 static int
 rack_timeout_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
 {
 	struct inpcb *inp;
 	int32_t retval = 0;
 
 	inp = tp->t_inpcb;
 
 	if (tp->t_timers->tt_flags & TT_STOPPED) {
 		return (1);
 	}
 	if (rack->rc_in_persist == 0)
 		return (0);
 	if (rack_progress_timeout_check(tp)) {
 		tcp_set_inp_to_drop(inp, ETIMEDOUT);
 		return (1);
 	}
 	/*
 	 * NOTE(review): inp was already handed to tcp_set_inp_to_drop()
 	 * above, so this assertion comes after the first use of inp.
 	 */
 	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
 	/*
 	 * Persistence timer into zero window. Force a byte to be output, if
 	 * possible.
 	 */
 	TCPSTAT_INC(tcps_persisttimeo);
 	/*
 	 * Hack: if the peer is dead/unreachable, we do not time out if the
 	 * window is closed.  After a full backoff, drop the connection if
 	 * the idle time (no responses to probes) reaches the maximum
 	 * backoff that we would use if retransmitting.
 	 */
 	if (tp->t_rxtshift == TCP_MAXRXTSHIFT &&
 	    (ticks - tp->t_rcvtime >= tcp_maxpersistidle ||
 	    ticks - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff)) {
 		TCPSTAT_INC(tcps_persistdrop);
 		retval = 1;
 		tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT);
 		goto out;
 	}
 	/* Nothing queued to send and nothing outstanding: leave persist. */
 	if ((sbavail(&rack->rc_inp->inp_socket->so_snd) == 0) &&
 	    tp->snd_una == tp->snd_max)
 		rack_exit_persist(tp, rack);
 	rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_PERSIT;
 	/*
 	 * If the user has closed the socket then drop a persisting
 	 * connection after a much reduced timeout.
 	 */
 	if (tp->t_state > TCPS_CLOSE_WAIT &&
 	    (ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) {
 		retval = 1;
 		TCPSTAT_INC(tcps_persistdrop);
 		tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT);
 		goto out;
 	}
 	tp->t_flags |= TF_FORCEDATA;
 out:
 	/* The persist event is logged on both the drop and the send paths. */
 	rack_log_to_event(rack, RACK_TO_FRM_PERSIST);
 	return (retval);
 }
 
 /*
  * Keepalive timer expiry.
  *
  * If a keepalive goes off, we had no other timers happening. This
  * routine always returns 1: it either bails because the timers are
  * flagged TT_STOPPED, marks the connection to be dropped with
  * ETIMEDOUT, or (when keepalives apply) sends a probe segment via
  * tcp_respond() and restarts the hpts timer.
  *
  * The connection is dropped when it has not reached ESTABLISHED, or
  * when keepalives apply and the time since tp->t_rcvtime has reached
  * TP_KEEPIDLE(tp) + TP_MAXIDLE(tp).
  */
 static int
 rack_timeout_keepalive(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
 {
 	struct tcptemp *t_template;
 	struct inpcb *inp;
 
 	if (tp->t_timers->tt_flags & TT_STOPPED) {
 		return (1);
 	}
 	rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_KEEP;
 	inp = tp->t_inpcb;
 	rack_log_to_event(rack, RACK_TO_FRM_KEEP);
 	/*
 	 * Keep-alive timer went off; send something or drop connection if
 	 * idle for too long.
 	 */
 	TCPSTAT_INC(tcps_keeptimeo);
 	if (tp->t_state < TCPS_ESTABLISHED)
 		goto dropit;
 	if ((V_tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) &&
 	    tp->t_state <= TCPS_CLOSING) {
 		if (ticks - tp->t_rcvtime >= TP_KEEPIDLE(tp) + TP_MAXIDLE(tp))
 			goto dropit;
 		/*
 		 * Send a packet designed to force a response if the peer is
 		 * up and reachable: either an ACK if the connection is
 		 * still alive, or an RST if the peer has closed the
 		 * connection due to timeout or reboot. Using sequence
 		 * number tp->snd_una-1 causes the transmitted zero-length
 		 * segment to lie outside the receive window; by the
 		 * protocol spec, this requires the correspondent TCP to
 		 * respond.
 		 */
 		TCPSTAT_INC(tcps_keepprobe);
 		t_template = tcpip_maketemplate(inp);
 		/* If no template could be built the probe is silently skipped. */
 		if (t_template) {
 			tcp_respond(tp, t_template->tt_ipgen,
 			    &t_template->tt_t, (struct mbuf *)NULL,
 			    tp->rcv_nxt, tp->snd_una - 1, 0);
 			free(t_template, M_TEMP);
 		}
 	}
 	/* Not dropping: restart the hpts timer before returning. */
 	rack_start_hpts_timer(rack, tp, cts, __LINE__, 0, 0, 0);
 	return (1);
 dropit:
 	TCPSTAT_INC(tcps_keepdrops);
 	tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT);
 	return (1);
 }
 
 /*
  * Retransmit helper function, clear up all the ack
  * flags and take care of important book keeping.
  *
  * In order, this:
  *  - cancels any running rack timer and logs the event;
  *  - calls rack_set_state() if rack->r_state is set and differs from
  *    tp->t_state;
  *  - walks rc_map, un-acking every entry (see the loop below);
  *  - zeroes rc_sacked, rc_tlp_rtx_out and rc_tlp_seg_send_cnt;
  *  - points rc_resend at the first entry of rc_map;
  *  - raises rc_prr_sndcnt to at least tp->t_maxseg;
  *  - sets r_timer_override.
  */
 static void
 rack_remxt_tmr(struct tcpcb *tp)
 {
 	/*
 	 * The retransmit timer went off, all sack'd blocks must be
 	 * un-acked.
 	 */
 	struct rack_sendmap *rsm, *trsm = NULL;
 	struct tcp_rack *rack;
 	int32_t cnt = 0;
 
 	rack = (struct tcp_rack *)tp->t_fb_ptr;
 	rack_timer_cancel(tp, rack, tcp_ts_getticks(), __LINE__);
 	rack_log_to_event(rack, RACK_TO_FRM_TMR);
 	if (rack->r_state && (rack->r_state != tp->t_state))
 		rack_set_state(tp, rack);
 	/*
 	 * Ideally we would like to be able to
 	 * mark SACK-PASS on anything not acked here.
 	 * However, if we do that we would burst out
 	 * all that data 1ms apart. This would be unwise,
 	 * so for now we will just let the normal rxt timer
 	 * and tlp timer take care of it.
 	 */
 	TAILQ_FOREACH(rsm, &rack->r_ctl.rc_map, r_next) {
 		if (rsm->r_flags & RACK_ACKED) {
 			/* NOTE(review): cnt is counted but never read in this function. */
 			cnt++;
 			rsm->r_sndcnt = 0;
 			if (rsm->r_in_tmap == 0) {
 				/*
 				 * We must re-add it back to the tlist. The
 				 * first one goes at the head, each later one
 				 * goes right after the previously re-added
 				 * entry (trsm).
 				 */
 				if (trsm == NULL) {
 					TAILQ_INSERT_HEAD(&rack->r_ctl.rc_tmap, rsm, r_tnext);
 				} else {
 					TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, trsm, rsm, r_tnext);
 				}
 				rsm->r_in_tmap = 1;
 				trsm = rsm;
 			}
 		}
 		/* The ack/sack-pass flags are cleared on every entry, acked or not. */
 		rsm->r_flags &= ~(RACK_ACKED | RACK_SACK_PASSED | RACK_WAS_SACKPASS);
 	}
 	/* Clear the count (we just un-acked them) */
 	rack->r_ctl.rc_sacked = 0;
 	/* Clear the tlp rtx mark */
 	rack->r_ctl.rc_tlp_rtx_out = 0;
 	rack->r_ctl.rc_tlp_seg_send_cnt = 0;
 	rack->r_ctl.rc_resend = TAILQ_FIRST(&rack->r_ctl.rc_map);
 	/* Setup so we send one segment */
 	if (rack->r_ctl.rc_prr_sndcnt < tp->t_maxseg)
 		rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg;
 	rack->r_timer_override = 1;
 }
 
 /*
  * Re-transmit timeout! If we drop the PCB we will return 1, otherwise
  * we will setup to retransmit the lowest seq number outstanding.
  *
  * Return values:
  *   1 - the timers are flagged TT_STOPPED, the progress timeout check
  *       fired, or t_rxtshift went past TCP_MAXRXTSHIFT; in the latter
  *       two cases the inpcb has been marked to drop.
  *   0 - otherwise, including the early exit when the connection has
  *       been established and nothing is outstanding.
  *
  * On the normal path this backs off t_rxtcur, runs rack_remxt_tmr(),
  * handles path MTU black-hole detection, optionally drops TCP options
  * on the third SYN, possibly discards the srtt estimate, and signals
  * CC_RTO to the congestion control. The cts argument is not used here.
  */
 static int
 rack_timeout_rxt(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
 {
 	int32_t rexmt;
 	struct inpcb *inp;
 	int32_t retval = 0;
 
 	inp = tp->t_inpcb;
 	if (tp->t_timers->tt_flags & TT_STOPPED) {
 		return (1);
 	}
 	if (rack_progress_timeout_check(tp)) {
 		tcp_set_inp_to_drop(inp, ETIMEDOUT);
 		return (1);
 	}
 	rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_RXT;
 	if (TCPS_HAVEESTABLISHED(tp->t_state) &&
 	    (tp->snd_una == tp->snd_max)) {
 		/* Nothing outstanding .. nothing to do */
 		return (0);
 	}
 	/*
 	 * Retransmission timer went off.  Message has not been acked within
 	 * retransmit interval.  Back off to a longer retransmit interval
 	 * and retransmit one segment.
 	 */
 	if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) {
 		tp->t_rxtshift = TCP_MAXRXTSHIFT;
 		TCPSTAT_INC(tcps_timeoutdrop);
 		retval = 1;
 		/* Report the recorded soft error if there is one, else ETIMEDOUT. */
 		tcp_set_inp_to_drop(rack->rc_inp,
 		    (tp->t_softerror ? (uint16_t) tp->t_softerror : ETIMEDOUT));
 		goto out;
 	}
 	rack_remxt_tmr(tp);
 	if (tp->t_state == TCPS_SYN_SENT) {
 		/*
 		 * If the SYN was retransmitted, indicate CWND to be limited
 		 * to 1 segment in cc_conn_init().
 		 */
 		tp->snd_cwnd = 1;
 	} else if (tp->t_rxtshift == 1) {
 		/*
 		 * first retransmit; record ssthresh and cwnd so they can be
 		 * recovered if this turns out to be a "bad" retransmit. A
 		 * retransmit is considered "bad" if an ACK for this segment
 		 * is received within RTT/2 interval; the assumption here is
 		 * that the ACK was already in flight.  See "On Estimating
 		 * End-to-End Network Path Properties" by Allman and Paxson
 		 * for more details.
 		 */
 		tp->snd_cwnd_prev = tp->snd_cwnd;
 		tp->snd_ssthresh_prev = tp->snd_ssthresh;
 		tp->snd_recover_prev = tp->snd_recover;
 		if (IN_FASTRECOVERY(tp->t_flags))
 			tp->t_flags |= TF_WASFRECOVERY;
 		else
 			tp->t_flags &= ~TF_WASFRECOVERY;
 		if (IN_CONGRECOVERY(tp->t_flags))
 			tp->t_flags |= TF_WASCRECOVERY;
 		else
 			tp->t_flags &= ~TF_WASCRECOVERY;
 		tp->t_badrxtwin = ticks + (tp->t_srtt >> (TCP_RTT_SHIFT + 1));
 		tp->t_flags |= TF_PREVVALID;
 	} else
 		tp->t_flags &= ~TF_PREVVALID;
 	TCPSTAT_INC(tcps_rexmttimeo);
 	/*
 	 * Compute the backed-off timeout: connections still in the SYN
 	 * states start from RACK_INITIAL_RTO, everything else from the
 	 * current TCP_REXMTVAL(). The result is then clamped into
 	 * t_rxtcur below.
 	 */
 	if ((tp->t_state == TCPS_SYN_SENT) ||
 	    (tp->t_state == TCPS_SYN_RECEIVED))
-		rexmt = MSEC_2_TICKS(RACK_INITIAL_RTO * tcp_backoff[tp->t_rxtshift]);
+		rexmt = MSEC_2_TICKS(RACK_INITIAL_RTO * tcp_syn_backoff[tp->t_rxtshift]);
 	else
 		rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift];
 	TCPT_RANGESET(tp->t_rxtcur, rexmt,
 	   max(MSEC_2_TICKS(rack_rto_min), rexmt),
 	   MSEC_2_TICKS(rack_rto_max));
 	/*
 	 * We enter the path for PLMTUD if connection is established or, if
 	 * connection is FIN_WAIT_1 status, reason for the last is that if
 	 * amount of data we send is very small, we could send it in couple
 	 * of packets and process straight to FIN. In that case we won't
 	 * catch ESTABLISHED state.
 	 */
 	if (V_tcp_pmtud_blackhole_detect && (((tp->t_state == TCPS_ESTABLISHED))
 	    || (tp->t_state == TCPS_FIN_WAIT_1))) {
 #ifdef INET6
 		int32_t isipv6;
 #endif
 
 		/*
 		 * Idea here is that at each stage of mtu probe (usually,
 		 * 1448 -> 1188 -> 524) should be given 2 chances to recover
 		 * before further clamping down. 'tp->t_rxtshift % 2 == 0'
 		 * should take care of that.
 		 */
 		if (((tp->t_flags2 & (TF2_PLPMTU_PMTUD | TF2_PLPMTU_MAXSEGSNT)) ==
 		    (TF2_PLPMTU_PMTUD | TF2_PLPMTU_MAXSEGSNT)) &&
 		    (tp->t_rxtshift >= 2 && tp->t_rxtshift < 6 &&
 		    tp->t_rxtshift % 2 == 0)) {
 			/*
 			 * Enter Path MTU Black-hole Detection mechanism: -
 			 * Disable Path MTU Discovery (IP "DF" bit). -
 			 * Reduce MTU to lower value than what we negotiated
 			 * with peer.
 			 */
 			if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) == 0) {
 				/* Record that we may have found a black hole. */
 				tp->t_flags2 |= TF2_PLPMTU_BLACKHOLE;
 				/* Keep track of previous MSS. */
 				tp->t_pmtud_saved_maxseg = tp->t_maxseg;
 			}
 
 			/*
 			 * Reduce the MSS to blackhole value or to the
 			 * default in an attempt to retransmit.
 			 */
 #ifdef INET6
 			isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) ? 1 : 0;
 			if (isipv6 &&
 			    tp->t_maxseg > V_tcp_v6pmtud_blackhole_mss) {
 				/* Use the sysctl tuneable blackhole MSS. */
 				tp->t_maxseg = V_tcp_v6pmtud_blackhole_mss;
 				TCPSTAT_INC(tcps_pmtud_blackhole_activated);
 			} else if (isipv6) {
 				/* Use the default MSS. */
 				tp->t_maxseg = V_tcp_v6mssdflt;
 				/*
 				 * Disable Path MTU Discovery when we switch
 				 * to minmss.
 				 */
 				tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
 				TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss);
 			}
 #endif
 #if defined(INET6) && defined(INET)
 			else
 #endif
 #ifdef INET
 			if (tp->t_maxseg > V_tcp_pmtud_blackhole_mss) {
 				/* Use the sysctl tuneable blackhole MSS. */
 				tp->t_maxseg = V_tcp_pmtud_blackhole_mss;
 				TCPSTAT_INC(tcps_pmtud_blackhole_activated);
 			} else {
 				/* Use the default MSS. */
 				tp->t_maxseg = V_tcp_mssdflt;
 				/*
 				 * Disable Path MTU Discovery when we switch
 				 * to minmss.
 				 */
 				tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
 				TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss);
 			}
 #endif
 		} else {
 			/*
 			 * If further retransmissions are still unsuccessful
 			 * with a lowered MTU, maybe this isn't a blackhole
 			 * and we restore the previous MSS and blackhole
 			 * detection flags. The limit '6' is determined by
 			 * giving each probe stage (1448, 1188, 524) 2
 			 * chances to recover.
 			 */
 			if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) &&
 			    (tp->t_rxtshift >= 6)) {
 				tp->t_flags2 |= TF2_PLPMTU_PMTUD;
 				tp->t_flags2 &= ~TF2_PLPMTU_BLACKHOLE;
 				tp->t_maxseg = tp->t_pmtud_saved_maxseg;
 				TCPSTAT_INC(tcps_pmtud_blackhole_failed);
 			}
 		}
 	}
 	/*
 	 * Disable RFC1323 and SACK if we haven't got any response to our
 	 * third SYN to work-around some broken terminal servers (most of
 	 * which have hopefully been retired) that have bad VJ header
 	 * compression code which trashes TCP segments containing
 	 * unknown-to-them TCP options.
 	 */
 	if (tcp_rexmit_drop_options && (tp->t_state == TCPS_SYN_SENT) &&
 	    (tp->t_rxtshift == 3))
 		tp->t_flags &= ~(TF_REQ_SCALE | TF_REQ_TSTMP | TF_SACK_PERMIT);
 	/*
 	 * If we backed off this far, our srtt estimate is probably bogus.
 	 * Clobber it so we'll take the next rtt measurement as our srtt;
 	 * move the current srtt into rttvar to keep the current retransmit
 	 * times until then.
 	 */
 	if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) {
 #ifdef INET6
 		if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0)
 			in6_losing(tp->t_inpcb);
 		else
 #endif
 			in_losing(tp->t_inpcb);
 		tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT);
 		tp->t_srtt = 0;
 	}
 	if (rack_use_sack_filter)
 		sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una);
 	tp->snd_recover = tp->snd_max;
 	tp->t_flags |= TF_ACKNOW;
 	/* Zero t_rtttime before telling congestion control about the RTO. */
 	tp->t_rtttime = 0;
 	rack_cong_signal(tp, NULL, CC_RTO);
 out:
 	return (retval);
 }
 
 /*
  * Dispatch whichever rack timer is pending in rc_hpts_flags.
  *
  * Return values:
  *   0 - no timer flag is set; or the socket is in LISTEN with paced
  *       output pending; or the timer has not expired yet (cts is
  *       before rc_timer_exp) and either paced output is pending or we
  *       were not called from the hpts (hpts_calling == 0).
  *   1 - the socket is in LISTEN without paced output pending; or the
  *       timer went off early with no output pending, in which case the
  *       inpcb is re-inserted into the hpts for the remaining time.
  *   otherwise - whatever the selected timeout handler returned.
  *
  * Once the timer is known to have expired, rc_tmr_stopped and all the
  * PACE_TMR_MASK bits are cleared and exactly one handler runs, picked
  * in the order DELACK, RACK, TLP, RXT, PERSIT, KEEP.
  */
 static int
 rack_process_timers(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, uint8_t hpts_calling)
 {
 	int32_t ret = 0;
 	int32_t timers = (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK);
 
 	if (timers == 0) {
 		return (0);
 	}
 	if (tp->t_state == TCPS_LISTEN) {
 		/* no timers on listen sockets */
 		if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)
 			return (0);
 		return (1);
 	}
 	if (TSTMP_LT(cts, rack->r_ctl.rc_timer_exp)) {
 		uint32_t left;
 
 		/*
 		 * In the early-exit cases below ret only carries a code
 		 * (-1, -2, -3) for the log entry; it is not what is returned.
 		 */
 		if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) {
 			ret = -1;
 			rack_log_to_processing(rack, cts, ret, 0);
 			return (0);
 		}
 		if (hpts_calling == 0) {
 			ret = -2;
 			rack_log_to_processing(rack, cts, ret, 0);
 			return (0);
 		}
 		/*
 		 * Ok our timer went off early and we are not paced false
 		 * alarm, go back to sleep.
 		 */
 		ret = -3;
 		left = rack->r_ctl.rc_timer_exp - cts;
 		tcp_hpts_insert(tp->t_inpcb, HPTS_MS_TO_SLOTS(left));
 		rack_log_to_processing(rack, cts, ret, left);
 		rack->rc_last_pto_set = 0;
 		return (1);
 	}
 	rack->rc_tmr_stopped = 0;
 	rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_MASK;
 	if (timers & PACE_TMR_DELACK) {
 		ret = rack_timeout_delack(tp, rack, cts);
 	} else if (timers & PACE_TMR_RACK) {
 		ret = rack_timeout_rack(tp, rack, cts);
 	} else if (timers & PACE_TMR_TLP) {
 		ret = rack_timeout_tlp(tp, rack, cts);
 	} else if (timers & PACE_TMR_RXT) {
 		ret = rack_timeout_rxt(tp, rack, cts);
 	} else if (timers & PACE_TMR_PERSIT) {
 		ret = rack_timeout_persist(tp, rack, cts);
 	} else if (timers & PACE_TMR_KEEP) {
 		ret = rack_timeout_keepalive(tp, rack, cts);
 	}
 	rack_log_to_processing(rack, cts, ret, timers);
 	return (ret);
 }
 
 /*
  * Cancel any pending rack timer.
  *
  * If paced output is pending and cts has reached rc_last_output_to the
  * inpcb is removed from the hpts. If any timer bit is set, the bits
  * are remembered in rc_tmr_stopped, the inpcb is removed from the hpts
  * when it is on it with no paced output pending, the cancel is logged
  * with the caller supplied line, and the timer bits are cleared.
  *
  * The tp argument is not used here.
  */
 static void
 rack_timer_cancel(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int line)
 {
 	/* Only used to tell the log whether we pulled ourselves off the hpts. */
 	uint8_t hpts_removed = 0;
 
 	if ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) &&
 	    TSTMP_GEQ(cts, rack->r_ctl.rc_last_output_to)) {
 		tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT);
 		hpts_removed = 1;
 	}
 	if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) {
 		rack->rc_tmr_stopped = rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK;
 		if (rack->rc_inp->inp_in_hpts &&
 		    ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0)) {
 			/*
 			 * Canceling timer's when we have no output being
 			 * paced. We also must remove ourselves from the
 			 * hpts.
 			 */
 			tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT);
 			hpts_removed = 1;
 		}
 		rack_log_to_cancel(rack, hpts_removed, line);
 		rack->r_ctl.rc_hpts_flags &= ~(PACE_TMR_MASK);
 	}
 }
 
 /*
  * Timer stop hook; intentionally does nothing and ignores both
  * arguments.
  */
 static void
 rack_timer_stop(struct tcpcb *tp, uint32_t timer_type)
 {
 	return;
 }
 
 /*
  * Record in the rack control block (t_timers_stopped) that all timers
  * have been stopped. Always returns 0.
  */
 static int
 rack_stopall(struct tcpcb *tp)
 {
 	struct tcp_rack *rack;
 	rack = (struct tcp_rack *)tp->t_fb_ptr;
 	rack->t_timers_stopped = 1;
 	return (0);
 }
 
 /*
  * Timer activate hook; intentionally does nothing and ignores all of
  * its arguments.
  */
 static void
 rack_timer_activate(struct tcpcb *tp, uint32_t timer_type, uint32_t delta)
 {
 	return;
 }
 
 /*
  * Timer active query hook; always answers 0 regardless of the
  * arguments.
  */
 static int
 rack_timer_active(struct tcpcb *tp, uint32_t timer_type)
 {
 	return (0);
 }
 
 /*
  * Suspend the persist, retransmit, keepalive and delayed-ack timers
  * (the TT_* timers) on this tcpcb. If the persist timer was active,
  * rc_in_persist is set first so that rack knows it starts out in
  * persist.
  */
 static void
 rack_stop_all_timers(struct tcpcb *tp)
 {
 	struct tcp_rack *rack;
 
 	/*
 	 * Assure no timers are running.
 	 */
 	if (tcp_timer_active(tp, TT_PERSIST)) {
 		/* We enter in persists, set the flag appropriately */
 		rack = (struct tcp_rack *)tp->t_fb_ptr;
 		rack->rc_in_persist = 1;
 	}
 	tcp_timer_suspend(tp, TT_PERSIST);
 	tcp_timer_suspend(tp, TT_REXMT);
 	tcp_timer_suspend(tp, TT_KEEP);
 	tcp_timer_suspend(tp, TT_DELACK);
 }
 
 /*
  * Account for one more transmission of the whole of rsm at time ts.
  *
  * Bumps the send counters (capping r_rtr_cnt at RACK_NUM_OF_RETRANS
  * and flagging RACK_OVERMAX when the cap is hit), records ts as the
  * latest send time, moves the entry to the tail of rc_tmap, and leaves
  * rc_next pointing at the entry following rsm in rc_map.
  *
  * The tp argument is not used here.
  */
 static void
 rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack,
     struct rack_sendmap *rsm, uint32_t ts)
 {
 	int32_t idx;
 
 	rsm->r_rtr_cnt++;
 	rsm->r_sndcnt++;
 	if (rsm->r_rtr_cnt > RACK_NUM_OF_RETRANS) {
 		rsm->r_rtr_cnt = RACK_NUM_OF_RETRANS;
 		rsm->r_flags |= RACK_OVERMAX;
 	}
 	/* Retransmitted bytes are only counted when no TLP is running. */
 	if ((rsm->r_rtr_cnt > 1) && (rack->r_tlp_running == 0)) {
 		rack->r_ctl.rc_holes_rxt += (rsm->r_end - rsm->r_start);
 		rsm->r_rtr_bytes += (rsm->r_end - rsm->r_start);
 	}
 	/* Once capped, the last slot is overwritten with the newest time. */
 	idx = rsm->r_rtr_cnt - 1;
 	rsm->r_tim_lastsent[idx] = ts;
 	if (rsm->r_flags & RACK_ACKED) {
 		/* Probably MTU discovery messing with us */
 		rsm->r_flags &= ~RACK_ACKED;
 		rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start);
 	}
 	if (rsm->r_in_tmap) {
 		TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext);
 	}
 	TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext);
 	rsm->r_in_tmap = 1;
 	if (rsm->r_flags & RACK_SACK_PASSED) {
 		/* We have retransmitted due to the SACK pass */
 		rsm->r_flags &= ~RACK_SACK_PASSED;
 		rsm->r_flags |= RACK_WAS_SACKPASS;
 	}
 	/* Update memory for next rtr */
 	rack->r_ctl.rc_next = TAILQ_NEXT(rsm, r_next);
 }
 
 
 /*
  * Record a (re-)transmission of *lenp bytes starting at rsm->r_start.
  *
  * Three outcomes:
  *  - The send covers rsm exactly: rsm is updated, *lenp is set to 0
  *    and 0 is returned.
  *  - The send runs past the end of rsm: rsm is updated, *lenp is set
  *    to the number of bytes left over and rsm->r_end (the sequence
  *    where the left over bytes start) is returned.
  *  - The send covers only the front of rsm: rsm is split in two, the
  *    front part is updated, *lenp is set to 0 and 0 is returned. If
  *    no entry can be allocated for the split nothing is updated, but
  *    *lenp is still set to 0 and 0 returned.
  */
 static uint32_t
 rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack,
     struct rack_sendmap *rsm, uint32_t ts, int32_t * lenp)
 {
 	/*
 	 * We (re-)transmitted starting at rsm->r_start for some length
 	 * (possibly less than r_end).
 	 */
 	struct rack_sendmap *nrsm;
 	uint32_t c_end;
 	int32_t len;
 	int32_t idx;
 
 	len = *lenp;
 	c_end = rsm->r_start + len;
 	if (SEQ_GEQ(c_end, rsm->r_end)) {
 		/*
 		 * We retransmitted the whole piece or more than the whole
 		 * slopping into the next rsm.
 		 */
 		rack_update_rsm(tp, rack, rsm, ts);
 		if (c_end == rsm->r_end) {
 			*lenp = 0;
 			return (0);
 		} else {
 			int32_t act_len;
 
 			/* Hangs over the end return whats left */
 			act_len = rsm->r_end - rsm->r_start;
 			*lenp = (len - act_len);
 			return (rsm->r_end);
 		}
 		/* We don't get out of this block. */
 	}
 	/*
 	 * Here we retransmitted less than the whole thing which means we
 	 * have to split this into what was transmitted and what was not.
 	 */
-	nrsm = rack_alloc(rack);
+	nrsm = rack_alloc_full_limit(rack);
 	if (nrsm == NULL) {
 		/*
 		 * We can't get memory, so lets not proceed.
 		 */
 		*lenp = 0;
 		return (0);
 	}
 	/*
 	 * So here we are going to take the original rsm and make it what we
 	 * retransmitted. nrsm will be the tail portion we did not
 	 * retransmit. For example say the chunk was 1, 11 (10 bytes). And
 	 * we retransmitted 5 bytes i.e. 1, 5. The original piece shrinks to
 	 * 1, 6 and the new piece will be 6, 11.
 	 */
 	nrsm->r_start = c_end;
 	nrsm->r_end = rsm->r_end;
 	nrsm->r_rtr_cnt = rsm->r_rtr_cnt;
 	nrsm->r_flags = rsm->r_flags;
 	nrsm->r_sndcnt = rsm->r_sndcnt;
 	nrsm->r_rtr_bytes = 0;
 	rsm->r_end = c_end;
 	/* The tail inherits the send-time history of the original. */
 	for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) {
 		nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx];
 	}
 	TAILQ_INSERT_AFTER(&rack->r_ctl.rc_map, rsm, nrsm, r_next);
 	if (rsm->r_in_tmap) {
 		TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
 		nrsm->r_in_tmap = 1;
 	}
 	/* nrsm copied the flags, so any FIN now stays with the tail only. */
 	rsm->r_flags &= (~RACK_HAS_FIN);
 	rack_update_rsm(tp, rack, rsm, ts);
 	*lenp = 0;
 	return (0);
 }
 
 
 /*
  * Record a transmission of len bytes starting at seq_out in the rack
  * send map.
  *
  * Nothing is recorded when err is non-zero, when the segment carries
  * RST, when the segment lies entirely at or below snd_una, or when
  * there is nothing to account for (len ends up 0). A send that starts
  * at snd_max gets a brand new entry appended to both rc_map and
  * rc_tmap; anything else is treated as a retransmission and the
  * matching entries are located (hintrsm first, then rc_next, then a
  * walk of rc_map) and updated, splitting entries where the send does
  * not line up with entry boundaries.
  *
  * The to and pass arguments are not used here.
  */
 static void
 rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len,
     uint32_t seq_out, uint8_t th_flags, int32_t err, uint32_t ts,
     uint8_t pass, struct rack_sendmap *hintrsm)
 {
 	struct tcp_rack *rack;
 	struct rack_sendmap *rsm, *nrsm;
 	register uint32_t snd_max, snd_una;
 	int32_t idx;
 
 	/*
 	 * Add to the RACK log of packets in flight or retransmitted. If
 	 * there is a TS option we will use the TS echoed, if not we will
 	 * grab a TS.
 	 *
 	 * Retransmissions will increment the count and move the ts to its
 	 * proper place. Note that if options do not include TS's then we
 	 * won't be able to effectively use the ACK for an RTT on a retran.
 	 *
 	 * Notes about r_start and r_end. Lets consider a send starting at
 	 * sequence 1 for 10 bytes. In such an example the r_start would be
 	 * 1 (starting sequence) but the r_end would be r_start+len i.e. 11.
 	 * This means that r_end is actually the first sequence for the next
 	 * slot (11).
 	 *
 	 */
 	/*
 	 * If err is set what do we do XXXrrs? should we not add the thing?
 	 * -- i.e. return if err != 0 or should we pretend we sent it? --
 	 * i.e. proceed with add ** do this for now.
 	 */
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 	if (err)
 		/*
 		 * We don't log errors -- we could but snd_max does not
 		 * advance in this case either.
 		 */
 		return;
 
 	if (th_flags & TH_RST) {
 		/*
 		 * We don't log resets and we return immediately from
 		 * sending
 		 */
 		return;
 	}
 	rack = (struct tcp_rack *)tp->t_fb_ptr;
 	snd_una = tp->snd_una;
 	if (SEQ_LEQ((seq_out + len), snd_una)) {
 		/* Are sending an old segment to induce an ack (keep-alive)? */
 		return;
 	}
 	if (SEQ_LT(seq_out, snd_una)) {
 		/* huh? should we panic? */
 		uint32_t end;
 
 		/* Trim off the part below snd_una and log only the rest. */
 		end = seq_out + len;
 		seq_out = snd_una;
 		len = end - seq_out;
 	}
 	snd_max = tp->snd_max;
 	if (th_flags & (TH_SYN | TH_FIN)) {
 		/*
 		 * The call to rack_log_output is made before bumping
 		 * snd_max. This means we can record one extra byte on a SYN
 		 * or FIN if seq_out is adding more on and a FIN is present
 		 * (and we are not resending).
 		 */
 		if (th_flags & TH_SYN)
 			len++;
 		if (th_flags & TH_FIN)
 			len++;
 		if (SEQ_LT(snd_max, tp->snd_nxt)) {
 			/*
 			 * The add/update as not been done for the FIN/SYN
 			 * yet.
 			 */
 			snd_max = tp->snd_nxt;
 		}
 	}
 	if (len == 0) {
 		/* We don't log zero window probes */
 		return;
 	}
 	rack->r_ctl.rc_time_last_sent = ts;
 	if (IN_RECOVERY(tp->t_flags)) {
 		rack->r_ctl.rc_prr_out += len;
 	}
 	/* First question is it a retransmission? */
 	if (seq_out == snd_max) {
 again:
 		/*
 		 * New data: build a fresh entry. This label is also the
 		 * target of the jump at the bottom for a send that ran from
 		 * old data on into new data.
 		 */
 		rsm = rack_alloc(rack);
 		if (rsm == NULL) {
 			/*
 			 * Hmm out of memory and the tcb got destroyed while
 			 * we tried to wait.
 			 */
-#ifdef INVARIANTS
-			panic("Out of memory when we should not be rack:%p", rack);
-#endif
 			return;
 		}
 		if (th_flags & TH_FIN) {
 			rsm->r_flags = RACK_HAS_FIN;
 		} else {
 			rsm->r_flags = 0;
 		}
 		rsm->r_tim_lastsent[0] = ts;
 		rsm->r_rtr_cnt = 1;
 		rsm->r_rtr_bytes = 0;
-		if (th_flags & TH_SYN) {
-			/* The data space is one beyond snd_una */
-			rsm->r_start = seq_out + 1;
-			rsm->r_end = rsm->r_start + (len - 1);
-		} else {
-			/* Normal case */
-			rsm->r_start = seq_out;
-			rsm->r_end = rsm->r_start + len;
-		}
+		rsm->r_start = seq_out;
+		rsm->r_end = rsm->r_start + len;
 		rsm->r_sndcnt = 0;
 		TAILQ_INSERT_TAIL(&rack->r_ctl.rc_map, rsm, r_next);
 		TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext);
 		rsm->r_in_tmap = 1;
 		return;
 	}
 	/*
 	 * If we reach here its a retransmission and we need to find it.
 	 */
 more:
 	/* The caller's hint is consumed on first use; later passes use rc_next. */
 	if (hintrsm && (hintrsm->r_start == seq_out)) {
 		rsm = hintrsm;
 		hintrsm = NULL;
 	} else if (rack->r_ctl.rc_next) {
 		/* We have a hint from a previous run */
 		rsm = rack->r_ctl.rc_next;
 	} else {
 		/* No hints sorry */
 		rsm = NULL;
 	}
 	if ((rsm) && (rsm->r_start == seq_out)) {
 		/*
 		 * We used rc_next or hintrsm  to retransmit, hopefully the
 		 * likely case.
 		 */
 		seq_out = rack_update_entry(tp, rack, rsm, ts, &len);
 		if (len == 0) {
 			return;
 		} else {
 			goto more;
 		}
 	}
 	/* Ok it was not the last pointer go through it the hard way. */
 	TAILQ_FOREACH(rsm, &rack->r_ctl.rc_map, r_next) {
 		if (rsm->r_start == seq_out) {
 			seq_out = rack_update_entry(tp, rack, rsm, ts, &len);
 			rack->r_ctl.rc_next = TAILQ_NEXT(rsm, r_next);
 			if (len == 0) {
 				return;
 			} else {
 				continue;
 			}
 		}
 		if (SEQ_GEQ(seq_out, rsm->r_start) && SEQ_LT(seq_out, rsm->r_end)) {
 			/* Transmitted within this piece */
 			/*
 			 * Ok we must split off the front and then let the
 			 * update do the rest
 			 */
-			nrsm = rack_alloc(rack);
+			nrsm = rack_alloc_full_limit(rack);
 			if (nrsm == NULL) {
-#ifdef INVARIANTS
-				panic("Ran out of memory that was preallocated? rack:%p", rack);
-#endif
 				rack_update_rsm(tp, rack, rsm, ts);
 				return;
 			}
 			/*
 			 * copy rsm to nrsm and then trim the front of rsm
 			 * to not include this part.
 			 */
 			nrsm->r_start = seq_out;
 			nrsm->r_end = rsm->r_end;
 			nrsm->r_rtr_cnt = rsm->r_rtr_cnt;
 			nrsm->r_flags = rsm->r_flags;
 			nrsm->r_sndcnt = rsm->r_sndcnt;
 			nrsm->r_rtr_bytes = 0;
 			for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) {
 				nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx];
 			}
 			rsm->r_end = nrsm->r_start;
 			TAILQ_INSERT_AFTER(&rack->r_ctl.rc_map, rsm, nrsm, r_next);
 			if (rsm->r_in_tmap) {
 				TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
 				nrsm->r_in_tmap = 1;
 			}
 			rsm->r_flags &= (~RACK_HAS_FIN);
 			seq_out = rack_update_entry(tp, rack, nrsm, ts, &len);
 			if (len == 0) {
 				return;
 			}
 		}
 	}
 	/*
 	 * Hmm not found in map did they retransmit both old and on into the
 	 * new?
 	 */
 	if (seq_out == tp->snd_max) {
 		goto again;
 	} else if (SEQ_LT(seq_out, tp->snd_max)) {
 #ifdef INVARIANTS
 		printf("seq_out:%u len:%d snd_una:%u snd_max:%u -- but rsm not found?\n",
 		    seq_out, len, tp->snd_una, tp->snd_max);
 		printf("Starting Dump of all rack entries\n");
 		TAILQ_FOREACH(rsm, &rack->r_ctl.rc_map, r_next) {
 			printf("rsm:%p start:%u end:%u\n",
 			    rsm, rsm->r_start, rsm->r_end);
 		}
 		printf("Dump complete\n");
 		panic("seq_out not found rack:%p tp:%p",
 		    rack, tp);
 #endif
 	} else {
 #ifdef INVARIANTS
 		/*
 		 * Hmm beyond sndmax? (only if we are using the new rtt-pack
 		 * flag)
 		 */
 		panic("seq_out:%u(%d) is beyond snd_max:%u tp:%p",
 		    seq_out, len, tp->snd_max, tp);
 #endif
 	}
 }
 
 /*
  * Record one of the RTT updates from an ack into
  * our sample structure.
  *
  * Tracks the lowest and highest rtt seen (either is simply taken
  * when the sample is still flagged RACK_RTT_EMPTY), adds rtt to the
  * running total and count, and marks the sample RACK_RTT_VALID.
  */
 static void
 tcp_rack_xmit_timer(struct tcp_rack *rack, int32_t rtt)
 {
 	if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) ||
 	    (rack->r_ctl.rack_rs.rs_rtt_lowest > rtt)) {
 		rack->r_ctl.rack_rs.rs_rtt_lowest = rtt;
 	}
 	if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) ||
 	    (rack->r_ctl.rack_rs.rs_rtt_highest < rtt)) {
 		rack->r_ctl.rack_rs.rs_rtt_highest = rtt;
 	}
 	/* Plain assignment: this replaces the flags rather than or-ing into them. */
 	rack->r_ctl.rack_rs.rs_flags = RACK_RTT_VALID;
 	rack->r_ctl.rack_rs.rs_rtt_tot += rtt;
 	rack->r_ctl.rack_rs.rs_rtt_cnt++;
 }
 
 /*
  * Collect new round-trip time estimate
  * and update averages and current timeout.
  *
  * The rtt fed into the smoothing is picked from the accumulated rack
  * sample according to rc_rate_sample_method (lowest, highest or
  * average). Nothing is done when the sample is flagged RACK_RTT_EMPTY
  * or the method is unknown (which panics under INVARIANTS). On a
  * successful update t_srtt, t_rttvar, t_rttbest and t_rxtcur are
  * recomputed and t_rxtshift and t_softerror are reset to 0.
  */
 static void
 tcp_rack_xmit_timer_commit(struct tcp_rack *rack, struct tcpcb *tp)
 {
 	int32_t delta;
 	uint32_t o_srtt, o_var;
 	int32_t rtt;
 
 	if (rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY)
 		/* No valid sample */
 		return;
 	if (rack->r_ctl.rc_rate_sample_method == USE_RTT_LOW) {
 		/* We are to use the lowest RTT seen in a single ack */
 		rtt = rack->r_ctl.rack_rs.rs_rtt_lowest;
 	} else if (rack->r_ctl.rc_rate_sample_method == USE_RTT_HIGH) {
 		/* We are to use the highest RTT seen in a single ack */
 		rtt = rack->r_ctl.rack_rs.rs_rtt_highest;
 	} else if (rack->r_ctl.rc_rate_sample_method == USE_RTT_AVG) {
 		/* We are to use the average RTT seen in a single ack */
 		rtt = (int32_t)(rack->r_ctl.rack_rs.rs_rtt_tot /
 				(uint64_t)rack->r_ctl.rack_rs.rs_rtt_cnt);
 	} else {
 #ifdef INVARIANTS
 		panic("Unknown rtt variant %d", rack->r_ctl.rc_rate_sample_method);
-#endif		
+#endif
 		return;
 	}
 	/* A zero sample is bumped to 1 before it is used. */
 	if (rtt == 0)
 		rtt = 1;
 	rack_log_rtt_sample(rack, rtt);
 	o_srtt = tp->t_srtt;
 	o_var = tp->t_rttvar;
 	/*
 	 * NOTE(review): this overwrites the rack argument with the pointer
 	 * held in the tcpcb; presumably they are the same -- confirm.
 	 */
 	rack = (struct tcp_rack *)tp->t_fb_ptr;
 	if (tp->t_srtt != 0) {
 		/*
 		 * srtt is stored as fixed point with 5 bits after the
 		 * binary point (i.e., scaled by 32 -- NOTE(review): this
 		 * assumes TCP_RTT_SHIFT is 5, confirm against its
 		 * definition).  The following magic is
 		 * equivalent to the smoothing algorithm in rfc793 with an
 		 * alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed point).
 		 * Adjust rtt to origin 0.
 		 */
 		delta = ((rtt - 1) << TCP_DELTA_SHIFT)
 		    - (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT));
 
 		tp->t_srtt += delta;
 		if (tp->t_srtt <= 0)
 			tp->t_srtt = 1;
 
 		/*
 		 * We accumulate a smoothed rtt variance (actually, a
 		 * smoothed mean difference), then set the retransmit timer
 		 * to smoothed rtt + 4 times the smoothed variance. rttvar
 		 * is stored as fixed point with 4 bits after the binary
 		 * point (scaled by 16).  The following is equivalent to
 		 * rfc793 smoothing with an alpha of .75 (rttvar =
 		 * rttvar*3/4 + |delta| / 4).  This replaces rfc793's
 		 * wired-in beta.
 		 */
 		if (delta < 0)
 			delta = -delta;
 		delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT);
 		tp->t_rttvar += delta;
 		if (tp->t_rttvar <= 0)
 			tp->t_rttvar = 1;
 		if (tp->t_rttbest > tp->t_srtt + tp->t_rttvar)
 			tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
 	} else {
 		/*
 		 * No rtt measurement yet - use the unsmoothed rtt. Set the
 		 * variance to half the rtt (so our first retransmit happens
 		 * at 3*rtt).
 		 */
 		tp->t_srtt = rtt << TCP_RTT_SHIFT;
 		tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1);
 		tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
 	}
 	TCPSTAT_INC(tcps_rttupdated);
 	rack_log_rtt_upd(tp, rack, rtt, o_srtt, o_var);
 	tp->t_rttupdated++;
 #ifdef NETFLIX_STATS
 	stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT, imax(0, rtt));
 #endif
 	tp->t_rxtshift = 0;
 
 	/*
 	 * the retransmit should happen at rtt + 4 * rttvar. Because of the
 	 * way we do the smoothing, srtt and rttvar will each average +1/2
 	 * tick of bias.  When we compute the retransmit timer, we want 1/2
 	 * tick of rounding and 1 extra tick because of +-1/2 tick
 	 * uncertainty in the firing of the timer.  The bias will give us
 	 * exactly the 1.5 tick we need.  But, because the bias is
 	 * statistical, we have to test that we don't drop below the minimum
 	 * feasible timer (which is 2 ticks).
 	 */
 	TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp),
 	   max(MSEC_2_TICKS(rack_rto_min), rtt + 2), MSEC_2_TICKS(rack_rto_max));
 	tp->t_softerror = 0;
 }
 
 /*
  * Handle an ack that turned out to be for an earlier transmission of
  * rsm rather than the last one we made, i.e. a false retransmit.
  *
  * Entries carrying a FIN or sent as a TLP are ignored. Otherwise, if
  * we are in recovery and that recovery was started by this rsm
  * (rc_rsm_start matches), recovery is exited and cwnd/ssthresh are
  * raised back to their recorded values. Re-ordering is noted when the
  * rsm had been retransmitted due to a SACK pass, and the bad
  * retransmit counters are bumped.
  *
  * The t argument is not used here.
  */
 static void
 rack_earlier_retran(struct tcpcb *tp, struct rack_sendmap *rsm,
     uint32_t t, uint32_t cts)
 {
 	/*
 	 * For this RSM, we acknowledged the data from a previous
 	 * transmission, not the last one we made. This means we did a false
 	 * retransmit.
 	 */
 	struct tcp_rack *rack;
 
 	if (rsm->r_flags & RACK_HAS_FIN) {
 		/*
 		 * The sending of the FIN often is multiple sent when we
 		 * have everything outstanding ack'd. We ignore this case
 		 * since its over now.
 		 */
 		return;
 	}
 	if (rsm->r_flags & RACK_TLP) {
 		/*
 		 * We expect TLP's to have this occur.
 		 */
 		return;
 	}
 	rack = (struct tcp_rack *)tp->t_fb_ptr;
 	/* should we undo cc changes and exit recovery? */
 	if (IN_RECOVERY(tp->t_flags)) {
 		if (rack->r_ctl.rc_rsm_start == rsm->r_start) {
 			/*
 			 * Undo what we ratcheted down and exit recovery if
 			 * possible
 			 */
 			EXIT_RECOVERY(tp->t_flags);
 			tp->snd_recover = tp->snd_una;
 			/* Only ever raise cwnd/ssthresh back up, never lower them. */
 			if (rack->r_ctl.rc_cwnd_at > tp->snd_cwnd)
 				tp->snd_cwnd = rack->r_ctl.rc_cwnd_at;
 			if (rack->r_ctl.rc_ssthresh_at > tp->snd_ssthresh)
 				tp->snd_ssthresh = rack->r_ctl.rc_ssthresh_at;
 		}
 	}
 	if (rsm->r_flags & RACK_WAS_SACKPASS) {
 		/*
 		 * We retransmitted based on a sack and the earlier
 		 * retransmission ack'd it - re-ordering is occurring.
 		 */
 		counter_u64_add(rack_reorder_seen, 1);
 		rack->r_ctl.rc_reorder_ts = cts;
 	}
 	counter_u64_add(rack_badfr, 1);
 	counter_u64_add(rack_badfr_bytes, (rsm->r_end - rsm->r_start));
 }
 
 
 /*
  * Process the acknowledgment of rsm (cumulative or SACK, as given by
  * ack_type) and update the RTT-related RACK state.
  *
  * cts is the current timestamp the samples are measured against; "to"
  * supplies the timestamp option (TOF_TS / to_tsecr) used to work out
  * which transmission of rsm is being acknowledged.
  *
  * Returns 1 when the acknowledgment was attributed to a specific
  * transmission and the RACK bookkeeping was updated.  Returns 0 when rsm
  * is already RACK_ACKED, when the retransmit is judged improper (handed
  * to rack_earlier_retran()), or when no minimum RTT has been recorded.
  */
 static int
 rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack,
     struct rack_sendmap *rsm, struct tcpopt *to, uint32_t cts, int32_t ack_type)
 {
 	int32_t i;
 	uint32_t t;
 
 	if (rsm->r_flags & RACK_ACKED)
 		/* Already done */
 		return (0);
 
 
 	if ((rsm->r_rtr_cnt == 1) ||
 	    ((ack_type == CUM_ACKED) &&
 	    (to->to_flags & TOF_TS) &&
 	    (to->to_tsecr) &&
 	    (rsm->r_tim_lastsent[rsm->r_rtr_cnt - 1] == to->to_tsecr))
 	    ) {
 		/*
 		 * We will only find a matching timestamp if it's cum-acked.
 		 * But if it was only transmitted once it's for-sure matching
 		 * :-)
 		 */
 		t = cts - rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)];
 		/* Clamp a zero or wrapped-negative sample to 1. */
 		if ((int)t <= 0)
 			t = 1;
 		if (!tp->t_rttlow || tp->t_rttlow > t)
 			tp->t_rttlow = t;
 		if (!rack->r_ctl.rc_rack_min_rtt ||
 		    SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) {
 			rack->r_ctl.rc_rack_min_rtt = t;
 			if (rack->r_ctl.rc_rack_min_rtt == 0) {
 				rack->r_ctl.rc_rack_min_rtt = 1;
 			}
 		}
 		/* This is the only path in here that feeds tcp_rack_xmit_timer(). */
 		tcp_rack_xmit_timer(rack, TCP_TS_TO_TICKS(t) + 1);
 		if ((rsm->r_flags & RACK_TLP) &&
 		    (!IN_RECOVERY(tp->t_flags))) {
 			/* Segment was a TLP and our retrans matched */
 			if (rack->r_ctl.rc_tlp_cwnd_reduce) {
 				rack->r_ctl.rc_rsm_start = tp->snd_max;
 				rack->r_ctl.rc_cwnd_at = tp->snd_cwnd;
 				rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh;
 				rack_cong_signal(tp, NULL, CC_NDUPACK);
 				/*
 				 * When we enter recovery we need to assure
 				 * we send one packet.
 				 */
 				rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg;
 			} else
 				rack->r_ctl.rc_tlp_rtx_out = 0;
 		}
 		if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)])) {
 			/* New more recent rack_tmit_time */
 			rack->r_ctl.rc_rack_tmit_time = rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)];
 			rack->rc_rack_rtt = t;
 		}
 		return (1);
 	}
-	/* 
-	 * We clear the soft/rxtshift since we got an ack. 
+	/*
+	 * We clear the soft/rxtshift since we got an ack.
 	 * There is no assurance we will call the commit() function
 	 * so we need to clear these to avoid incorrect handling.
 	 */
 	tp->t_rxtshift = 0;
 	tp->t_softerror = 0;
 	if ((to->to_flags & TOF_TS) &&
 	    (ack_type == CUM_ACKED) &&
 	    (to->to_tsecr) &&
 	    ((rsm->r_flags & (RACK_DEFERRED | RACK_OVERMAX)) == 0)) {
 		/*
 		 * Now which timestamp does it match? In this block the ACK
 		 * must be coming from a previous transmission.
 		 */
 		for (i = 0; i < rsm->r_rtr_cnt; i++) {
 			if (rsm->r_tim_lastsent[i] == to->to_tsecr) {
 				t = cts - rsm->r_tim_lastsent[i];
 				if ((int)t <= 0)
 					t = 1;
 				if ((i + 1) < rsm->r_rtr_cnt) {
 					/* Likely */
 					rack_earlier_retran(tp, rsm, t, cts);
 				}
 				if (!tp->t_rttlow || tp->t_rttlow > t)
 					tp->t_rttlow = t;
 				if (!rack->r_ctl.rc_rack_min_rtt || SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) {
 					rack->r_ctl.rc_rack_min_rtt = t;
 					if (rack->r_ctl.rc_rack_min_rtt == 0) {
 						rack->r_ctl.rc_rack_min_rtt = 1;
 					}
 				}
                                 /*
 				 * Note the following calls to
 				 * tcp_rack_xmit_timer() are being commented
 				 * out for now. They give us no more accuracy
 				 * and often lead to a wrong choice. We have
-				 * enough samples that have not been 
+				 * enough samples that have not been
 				 * retransmitted. I leave the commented out
 				 * code in here in case in the future we
 				 * decide to add it back (though I can't foresee
 				 * doing that). That way we will easily see
 				 * where they need to be placed.
 				 */
 				if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time,
 				    rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)])) {
 					/* New more recent rack_tmit_time */
 					rack->r_ctl.rc_rack_tmit_time = rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)];
 					rack->rc_rack_rtt = t;
 				}
 				return (1);
 			}
 		}
 		/* No stored send time equals the echoed timestamp. */
 		goto ts_not_found;
 	} else {
 		/*
 		 * Ok it's a SACK block that we retransmitted, or a windows
 		 * machine without timestamps. We can tell nothing from the
 		 * time-stamp since it's not there or is the time the peer last
 		 * received a segment that moved forward its cum-ack point.
 		 */
 ts_not_found:
 		i = rsm->r_rtr_cnt - 1;
 		t = cts - rsm->r_tim_lastsent[i];
 		if ((int)t <= 0)
 			t = 1;
 		if (rack->r_ctl.rc_rack_min_rtt && SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) {
 			/*
 			 * We retransmitted and the ack came back in less
 			 * than the smallest rtt we have observed. We most
 			 * likely did an improper retransmit as outlined in
 			 * 4.2 Step 3 point 2 in the rack-draft.
 			 */
 			/*
 			 * r_rtr_cnt == 1 returned in the first branch above, so
 			 * (presuming r_rtr_cnt is never 0) index r_rtr_cnt - 2
 			 * exists.  This path falls through to return (0).
 			 */
 			i = rsm->r_rtr_cnt - 2;
 			t = cts - rsm->r_tim_lastsent[i];
 			rack_earlier_retran(tp, rsm, t, cts);
 		} else if (rack->r_ctl.rc_rack_min_rtt) {
 			/*
 			 * We retransmitted it and the retransmit did the
 			 * job.
 			 */
 			/*
 			 * NOTE(review): the test below cannot be true here;
 			 * this branch is only reached with a non-zero
 			 * rc_rack_min_rtt and t not less than it.
 			 */
 			if (!rack->r_ctl.rc_rack_min_rtt ||
 			    SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) {
 				rack->r_ctl.rc_rack_min_rtt = t;
 				if (rack->r_ctl.rc_rack_min_rtt == 0) {
 					rack->r_ctl.rc_rack_min_rtt = 1;
 				}
 			}
 			if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, rsm->r_tim_lastsent[i])) {
 				/* New more recent rack_tmit_time */
 				rack->r_ctl.rc_rack_tmit_time = rsm->r_tim_lastsent[i];
 				rack->rc_rack_rtt = t;
 			}
 			return (1);
 		}
 	}
 	return (0);
 }
 
 /*
  * Mark the SACK_PASSED flag on all entries prior to rsm send-wise.
  *
  * Walks rc_tmap in reverse starting at rsm.  rsm itself and entries
  * already flagged RACK_ACKED are skipped, and the walk stops at the
  * first entry already flagged RACK_SACK_PASSED.  An entry whose last
  * send time equals rsm's is only marked if its sequence start is below
  * rsm's; any other entry visited is marked unconditionally.  Marking
  * sets RACK_SACK_PASSED and clears RACK_WAS_SACKPASS.
  *
  * The tp argument is not referenced in this function.
  */
 static void
 rack_log_sack_passed(struct tcpcb *tp,
     struct tcp_rack *rack, struct rack_sendmap *rsm)
 {
 	struct rack_sendmap *nrsm;
 	uint32_t ts;
 	int32_t idx;
 
 	/* ts is the time of rsm's most recent transmission. */
 	idx = rsm->r_rtr_cnt - 1;
 	ts = rsm->r_tim_lastsent[idx];
 	nrsm = rsm;
 	TAILQ_FOREACH_REVERSE_FROM(nrsm, &rack->r_ctl.rc_tmap,
 	    rack_head, r_tnext) {
 		if (nrsm == rsm) {
 			/* Skip the original segment, it is the one being acked */
 			continue;
 		}
 		if (nrsm->r_flags & RACK_ACKED) {
 			/* Skip ack'd segments */
 			continue;
 		}
+		if (nrsm->r_flags & RACK_SACK_PASSED) {
+			/*
+			 * We found one that is already marked
+			 * passed, we have been here before and
+			 * so all others below this are marked.
+			 */
+			break;
+		}
 		idx = nrsm->r_rtr_cnt - 1;
 		if (ts == nrsm->r_tim_lastsent[idx]) {
 			/*
 			 * For this case lets use seq no, if we sent in a
 			 * big block (TSO) we would have a bunch of segments
 			 * sent at the same time.
 			 *
 			 * We would only get a report if its SEQ is earlier.
 			 * If we have done multiple retransmits the times
 			 * would not be equal.
 			 */
 			if (SEQ_LT(nrsm->r_start, rsm->r_start)) {
 				nrsm->r_flags |= RACK_SACK_PASSED;
 				nrsm->r_flags &= ~RACK_WAS_SACKPASS;
 			}
 		} else {
 			/*
 			 * Here they were sent at different times, not a big
 			 * block. Since we transmitted this one later and
 			 * see it sack'd then this must also be missing (or
 			 * we would have gotten a sack block for it)
 			 */
 			nrsm->r_flags |= RACK_SACK_PASSED;
 			nrsm->r_flags &= ~RACK_WAS_SACKPASS;
 		}
 	}
 }
 
 /*
  * Apply one SACK block [sack->start, sack->end) to the send map.
  *
  * *prsm is a hint for where to begin the search in rc_map; on return it
  * is set to the last entry worked on (possibly NULL).  An entry that
  * straddles the start or the end of the block is split in two so the
  * sacked range is covered by whole entries.  Each newly sacked entry
  * has its RTT state updated, is added to rc_sacked, is flagged
  * RACK_ACKED and is taken off rc_tmap.  Before returning, the final
  * entry is merged with a RACK_ACKED neighbour on either side.
  *
  * If allocating a split entry fails, processing of this block stops and
  * the remaining SACK information is dropped.
  *
  * Returns the number of bytes newly marked as sacked by this call.
  */
 static uint32_t
 rack_proc_sack_blk(struct tcpcb *tp, struct tcp_rack *rack, struct sackblk *sack,
     struct tcpopt *to, struct rack_sendmap **prsm, uint32_t cts)
 {
 	int32_t idx;
 	int32_t times = 0;
 	uint32_t start, end, changed = 0;
 	struct rack_sendmap *rsm, *nrsm;
 	int32_t used_ref = 1;
 
 	start = sack->start;
 	end = sack->end;
 	rsm = *prsm;
 	/* The hint is past the block start: search backwards from it. */
 	if (rsm && SEQ_LT(start, rsm->r_start)) {
 		TAILQ_FOREACH_REVERSE_FROM(rsm, &rack->r_ctl.rc_map, rack_head, r_next) {
 			if (SEQ_GEQ(start, rsm->r_start) &&
 			    SEQ_LT(start, rsm->r_end)) {
 				goto do_rest_ofb;
 			}
 		}
 	}
 	if (rsm == NULL) {
 start_at_beginning:
 		/* No usable hint; used_ref == 0 selects the "proc_all" counter. */
 		rsm = NULL;
 		used_ref = 0;
 	}
 	/* First lets locate the block where this guy is */
 	TAILQ_FOREACH_FROM(rsm, &rack->r_ctl.rc_map, r_next) {
 		if (SEQ_GEQ(start, rsm->r_start) &&
 		    SEQ_LT(start, rsm->r_end)) {
 			break;
 		}
 	}
 do_rest_ofb:
 	if (rsm == NULL) {
 		/*
 		 * This happens when we get duplicate sack blocks with the
 		 * same end. For example SACK 4: 100 SACK 3: 100 The sort
 		 * will not change their location so we would just start at
 		 * the end of the first one and get lost.
 		 */
 		if (tp->t_flags & TF_SENTFIN) {
 			/*
 			 * Check to see if we have not logged the FIN that
 			 * went out.
 			 */
 			nrsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_map, rack_sendmap, r_next);
 			if (nrsm && (nrsm->r_end + 1) == tp->snd_max) {
 				/*
 				 * Ok we did not get the FIN logged.
 				 */
 				nrsm->r_end++;
 				rsm = nrsm;
 				goto do_rest_ofb;
 			}
 		}
 		/*
 		 * One restart from the head of the map is allowed; a second
 		 * miss panics under INVARIANTS and otherwise gives up.
 		 */
 		if (times == 1) {
 #ifdef INVARIANTS
 			panic("tp:%p rack:%p sack:%p to:%p prsm:%p",
 			    tp, rack, sack, to, prsm);
 #else
 			goto out;
 #endif
 		}
 		times++;
 		counter_u64_add(rack_sack_proc_restart, 1);
 		goto start_at_beginning;
 	}
 	/* Ok we have an ACK for some piece of rsm */
 	if (rsm->r_start != start) {
 		/*
 		 * Need to split this in two pieces the before and after.
 		 */
 		nrsm = rack_alloc_limit(rack, RACK_LIMIT_TYPE_SPLIT);
 		if (nrsm == NULL) {
 			/*
 			 * failed XXXrrs what can we do but lose the sack
 			 * info?
 			 */
 			goto out;
 		}
 		/* nrsm becomes the tail piece [start, old r_end). */
 		nrsm->r_start = start;
 		nrsm->r_rtr_bytes = 0;
 		nrsm->r_end = rsm->r_end;
 		nrsm->r_rtr_cnt = rsm->r_rtr_cnt;
 		nrsm->r_flags = rsm->r_flags;
 		nrsm->r_sndcnt = rsm->r_sndcnt;
 		for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) {
 			nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx];
 		}
 		rsm->r_end = nrsm->r_start;
 		TAILQ_INSERT_AFTER(&rack->r_ctl.rc_map, rsm, nrsm, r_next);
 		if (rsm->r_in_tmap) {
 			TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
 			nrsm->r_in_tmap = 1;
 		}
 		/* Only the tail piece keeps the RACK_HAS_FIN flag. */
 		rsm->r_flags &= (~RACK_HAS_FIN);
 		rsm = nrsm;
 	}
 	if (SEQ_GEQ(end, rsm->r_end)) {
 		/*
 		 * The end of this block is either beyond this guy or right
 		 * at this guy.
 		 */
 
 		if ((rsm->r_flags & RACK_ACKED) == 0) {
 			rack_update_rtt(tp, rack, rsm, to, cts, SACKED);
 			changed += (rsm->r_end - rsm->r_start);
 			rack->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start);
 			rack_log_sack_passed(tp, rack, rsm);
 			/* Is Reordering occurring? */
 			if (rsm->r_flags & RACK_SACK_PASSED) {
 				counter_u64_add(rack_reorder_seen, 1);
 				rack->r_ctl.rc_reorder_ts = cts;
 			}
 			rsm->r_flags |= RACK_ACKED;
 			rsm->r_flags &= ~RACK_TLP;
 			if (rsm->r_in_tmap) {
 				TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext);
 				rsm->r_in_tmap = 0;
 			}
 		}
 		if (end == rsm->r_end) {
 			/* This block only - done */
 			goto out;
 		}
 		/* There is more not covered by this rsm move on */
 		start = rsm->r_end;
 		nrsm = TAILQ_NEXT(rsm, r_next);
 		rsm = nrsm;
 		times = 0;
 		goto do_rest_ofb;
 	}
 	/* Ok we need to split off this one at the tail */
 	nrsm = rack_alloc_limit(rack, RACK_LIMIT_TYPE_SPLIT);
 	if (nrsm == NULL) {
 		/* failed rrs what can we do but lose the sack info? */
 		goto out;
 	}
 	/* Clone it */
 	nrsm->r_start = end;
 	nrsm->r_end = rsm->r_end;
 	nrsm->r_rtr_bytes = 0;
 	nrsm->r_rtr_cnt = rsm->r_rtr_cnt;
 	nrsm->r_flags = rsm->r_flags;
 	nrsm->r_sndcnt = rsm->r_sndcnt;
 	for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) {
 		nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx];
 	}
 	/* The sack block does not cover this guy fully */
 	rsm->r_flags &= (~RACK_HAS_FIN);
 	rsm->r_end = end;
 	TAILQ_INSERT_AFTER(&rack->r_ctl.rc_map, rsm, nrsm, r_next);
 	if (rsm->r_in_tmap) {
 		TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
 		nrsm->r_in_tmap = 1;
 	}
 	if (rsm->r_flags & RACK_ACKED) {
 		/* Been here done that */
 		goto out;
 	}
 	rack_update_rtt(tp, rack, rsm, to, cts, SACKED);
 	changed += (rsm->r_end - rsm->r_start);
 	rack->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start);
 	rack_log_sack_passed(tp, rack, rsm);
 	/* Is Reordering occurring? */
 	if (rsm->r_flags & RACK_SACK_PASSED) {
 		counter_u64_add(rack_reorder_seen, 1);
 		rack->r_ctl.rc_reorder_ts = cts;
 	}
 	rsm->r_flags |= RACK_ACKED;
 	rsm->r_flags &= ~RACK_TLP;
 	if (rsm->r_in_tmap) {
 		TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext);
 		rsm->r_in_tmap = 0;
 	}
 out:
+	if (rsm && (rsm->r_flags & RACK_ACKED)) {
+		/*
+		 * Now can we merge this newly acked
+		 * block with either the previous or
+		 * next block?
+		 */
+		nrsm = TAILQ_NEXT(rsm, r_next);
+		if (nrsm &&
+		    (nrsm->r_flags & RACK_ACKED)) {
+			/* yep this and next can be merged */
+			rsm = rack_merge_rsm(rack, rsm, nrsm);
+		}
+		/* Now what about the previous? */
+		nrsm = TAILQ_PREV(rsm, rack_head, r_next);
+		if (nrsm &&
+		    (nrsm->r_flags & RACK_ACKED)) {
+			/* yep the previous and this can be merged */
+			rsm = rack_merge_rsm(rack, nrsm, rsm);
+		}
+	}
 	if (used_ref == 0) {
 		counter_u64_add(rack_sack_proc_all, 1);
 	} else {
 		counter_u64_add(rack_sack_proc_short, 1);
 	}
 	/* Save off where we last were */
 	if (rsm)
 		rack->r_ctl.rc_sacklast = TAILQ_NEXT(rsm, r_next);
 	else
 		rack->r_ctl.rc_sacklast = NULL;
 	*prsm = rsm;
 	return (changed);
 }
 
 /*
  * Undo SACK markings after the peer reneged on data it had sacked.
  *
  * Starting at rsm, walk forward through rc_map for as long as entries
  * are flagged RACK_ACKED.  Each one is subtracted from rc_sacked, has
  * its ACKED / SACK_PASSED / WAS_SACKPASS flags cleared, and is put back
  * on rc_tmap: the first at the head, the rest after the previous one.
  * Afterwards, if rack_use_sack_filter is set, sack_filter_clear() is
  * called with th_ack.
  */
-static void inline 
+static void inline
 rack_peer_reneges(struct tcp_rack *rack, struct rack_sendmap *rsm, tcp_seq th_ack)
 {
 	struct rack_sendmap *tmap;
 
 	/* tmap tracks the entry most recently re-inserted into rc_tmap. */
 	tmap = NULL;
 	while (rsm && (rsm->r_flags & RACK_ACKED)) {
 		/* Its no longer sacked, mark it so */
 		rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start);
 #ifdef INVARIANTS
 		if (rsm->r_in_tmap) {
 			panic("rack:%p rsm:%p flags:0x%x in tmap?",
 			      rack, rsm, rsm->r_flags);
 		}
 #endif
 		rsm->r_flags &= ~(RACK_ACKED|RACK_SACK_PASSED|RACK_WAS_SACKPASS);
 		/* Rebuild it into our tmap */
 		if (tmap == NULL) {
 			TAILQ_INSERT_HEAD(&rack->r_ctl.rc_tmap, rsm, r_tnext);
 			tmap = rsm;
 		} else {
 			TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, tmap, rsm, r_tnext);
 			tmap = rsm;
 		}
 		tmap->r_in_tmap = 1;
 		rsm = TAILQ_NEXT(rsm, r_next);
 	}
-	/* 
-	 * Now lets possibly clear the sack filter so we start 
+	/*
+	 * Now lets possibly clear the sack filter so we start
 	 * recognizing sacks that cover this area.
 	 */
 	if (rack_use_sack_filter)
 		sack_filter_clear(&rack->r_ctl.rack_sf, th_ack);
 
 }
 
 /*
  * Update the RACK send map and related state for an incoming ACK.
  *
  * Segments carrying TH_RST are ignored.  If th_ack lies beyond the start
  * of the first map entry, entries fully covered by th_ack are removed
  * from the maps and freed, and a partially covered entry is trimmed to
  * start at th_ack.  A first entry that is still flagged RACK_ACKED and
  * starts exactly at th_ack is handled as a renege.  If the segment has
  * SACK blocks they are range-checked, optionally run through the SACK
  * filter, sorted by end, de-duplicated and applied one at a time with
  * rack_proc_sack_blk().
  *
  * On the way out the RACK timer is cancelled if anything changed,
  * recovery is entered if new SACK data arrived and tcp_rack_output()
  * returns a segment, and when already in recovery rc_prr_sndcnt is
  * recomputed.
  */
 static void
 rack_log_ack(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th)
 {
 	uint32_t changed, last_seq, entered_recovery = 0;
 	struct tcp_rack *rack;
 	struct rack_sendmap *rsm;
 	struct sackblk sack, sack_blocks[TCP_MAX_SACK + 1];
 	register uint32_t th_ack;
 	int32_t i, j, k, num_sack_blks = 0;
 	uint32_t cts, acked, ack_point, sack_changed = 0;
 
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 	if (th->th_flags & TH_RST) {
 		/* We don't log resets */
 		return;
 	}
 	rack = (struct tcp_rack *)tp->t_fb_ptr;
 	cts = tcp_ts_getticks();
 	rsm = TAILQ_FIRST(&rack->r_ctl.rc_map);
 	changed = 0;
 	th_ack = th->th_ack;
 
 	if (SEQ_GT(th_ack, tp->snd_una)) {
 		rack_log_progress_event(rack, tp, ticks, PROGRESS_UPDATE, __LINE__);
 		tp->t_acktime = ticks;
 	}
 	/* changed starts out as the bytes th_ack covers past the map head. */
 	if (rsm && SEQ_GT(th_ack, rsm->r_start))
 		changed = th_ack - rsm->r_start;
 	if (changed) {
 		/*
 		 * The ACK point is advancing to th_ack, we must drop off
 		 * the packets in the rack log and calculate any eligible
 		 * RTT's.
 		 */
 		rack->r_wanted_output++;
 more:
 		rsm = TAILQ_FIRST(&rack->r_ctl.rc_map);
 		if (rsm == NULL) {
 			if ((th_ack - 1) == tp->iss) {
 				/*
 				 * For the SYN incoming case we will not
 				 * have called tcp_output for the sending of
 				 * the SYN, so there will be no map. All
 				 * other cases should probably be a panic.
 				 */
 				goto proc_sack;
 			}
 			if (tp->t_flags & TF_SENTFIN) {
 				/* if we send a FIN we will not have a map */
 				goto proc_sack;
 			}
 #ifdef INVARIANTS
 			panic("No rack map tp:%p for th:%p state:%d rack:%p snd_una:%u snd_max:%u snd_nxt:%u chg:%d\n",
 			    tp,
 			    th, tp->t_state, rack,
 			    tp->snd_una, tp->snd_max, tp->snd_nxt, changed);
 #endif
 			goto proc_sack;
 		}
 		if (SEQ_LT(th_ack, rsm->r_start)) {
 			/* Huh map is missing this */
 #ifdef INVARIANTS
 			printf("Rack map starts at r_start:%u for th_ack:%u huh? ts:%d rs:%d\n",
 			    rsm->r_start,
 			    th_ack, tp->t_state, rack->r_state);
 #endif
 			goto proc_sack;
 		}
 		rack_update_rtt(tp, rack, rsm, to, cts, CUM_ACKED);
 		/* Now do we consume the whole thing? */
 		if (SEQ_GEQ(th_ack, rsm->r_end)) {
 			/* Its all consumed. */
 			uint32_t left;
 
 			rack->r_ctl.rc_holes_rxt -= rsm->r_rtr_bytes;
 			rsm->r_rtr_bytes = 0;
 			TAILQ_REMOVE(&rack->r_ctl.rc_map, rsm, r_next);
 			if (rsm->r_in_tmap) {
 				TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext);
 				rsm->r_in_tmap = 0;
 			}
 			if (rack->r_ctl.rc_next == rsm) {
 				/* scoot along the marker */
 				rack->r_ctl.rc_next = TAILQ_FIRST(&rack->r_ctl.rc_map);
 			}
 			if (rsm->r_flags & RACK_ACKED) {
 				/*
 				 * It was acked on the scoreboard -- remove
 				 * it from total
 				 */
 				rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start);
 			} else if (rsm->r_flags & RACK_SACK_PASSED) {
 				/*
 				 * There are acked segments ACKED on the
 				 * scoreboard further up. We are seeing
 				 * reordering.
 				 */
 				counter_u64_add(rack_reorder_seen, 1);
 				rsm->r_flags |= RACK_ACKED;
 				rack->r_ctl.rc_reorder_ts = cts;
 			}
 			/* Bytes of th_ack that extend past this entry. */
 			left = th_ack - rsm->r_end;
 			if (rsm->r_rtr_cnt > 1) {
 				/*
 				 * Technically we should make r_rtr_cnt be
 				 * monotonically increasing and just mod it to
 				 * the timestamp it is replacing.. that way
 				 * we would have the last 3 retransmits. Now
 				 * rc_loss_count will be wrong if we
 				 * retransmit something more than 2 times in
 				 * recovery :(
 				 */
 				rack->r_ctl.rc_loss_count += (rsm->r_rtr_cnt - 1);
 			}
 			/* Free back to zone */
 			rack_free(rack, rsm);
 			if (left) {
 				goto more;
 			}
 			goto proc_sack;
 		}
 		/* th_ack falls inside this entry: trim its front. */
 		if (rsm->r_flags & RACK_ACKED) {
 			/*
 			 * It was acked on the scoreboard -- remove it from
 			 * total for the part being cum-acked.
 			 */
 			rack->r_ctl.rc_sacked -= (th_ack - rsm->r_start);
 		}
 		rack->r_ctl.rc_holes_rxt -= rsm->r_rtr_bytes;
 		rsm->r_rtr_bytes = 0;
 		rsm->r_start = th_ack;
 	}
 proc_sack:
 	/* Check for reneging */
 	rsm = TAILQ_FIRST(&rack->r_ctl.rc_map);
 	if (rsm && (rsm->r_flags & RACK_ACKED) && (th_ack == rsm->r_start)) {
 		/*
 		 * The peer has moved snd_una up to
 		 * the edge of this send, i.e. one
 		 * that it had previously acked. The only
 		 * way that can be true if the peer threw
 		 * away data (space issues) that it had
-		 * previously sacked (else it would have 
+		 * previously sacked (else it would have
 		 * given us snd_una up to (rsm->r_end).
 		 * We need to undo the acked markings here.
 		 *
 		 * Note we have to look to make sure th_ack is
 		 * our rsm->r_start in case we get an old ack
 		 * where th_ack is behind snd_una.
 		 */
 		rack_peer_reneges(rack, rsm, th->th_ack);
 	}
 	if ((to->to_flags & TOF_SACK) == 0) {
 		/* We are done nothing left to log */
 		goto out;
 	}
 	/* last_seq: end of the last map entry, or snd_max with no map. */
 	rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_map, rack_sendmap, r_next);
 	if (rsm) {
 		last_seq = rsm->r_end;
 	} else {
 		last_seq = tp->snd_max;
 	}
 	/* Sack block processing */
 	if (SEQ_GT(th_ack, tp->snd_una))
 		ack_point = th_ack;
 	else
 		ack_point = tp->snd_una;
 	for (i = 0; i < to->to_nsacks; i++) {
 		bcopy((to->to_sacks + i * TCPOLEN_SACK),
 		    &sack, sizeof(sack));
 		sack.start = ntohl(sack.start);
 		sack.end = ntohl(sack.end);
 		/* Keep only non-empty blocks inside (ack_point, snd_max]. */
 		if (SEQ_GT(sack.end, sack.start) &&
 		    SEQ_GT(sack.start, ack_point) &&
 		    SEQ_LT(sack.start, tp->snd_max) &&
 		    SEQ_GT(sack.end, ack_point) &&
 		    SEQ_LEQ(sack.end, tp->snd_max)) {
 			if ((rack->r_ctl.rc_num_maps_alloced > rack_sack_block_limit) &&
 			    (SEQ_LT(sack.end, last_seq)) &&
 			    ((sack.end - sack.start) < (tp->t_maxseg / 8))) {
 				/*
 				 * Not the last piece and its smaller than
 				 * 1/8th of a MSS. We ignore this.
 				 */
 				counter_u64_add(rack_runt_sacks, 1);
 				continue;
 			}
 			sack_blocks[num_sack_blks] = sack;
 			num_sack_blks++;
-#ifdef NETFLIX_STATS
 		} else if (SEQ_LEQ(sack.start, th_ack) &&
 			   SEQ_LEQ(sack.end, th_ack)) {
 			/*
 			 * Its a D-SACK block.
 			 */
-			tcp_record_dsack(sack.start, sack.end);
-#endif
+/*			tcp_record_dsack(sack.start, sack.end); */
 		}
-
 	}
 	if (num_sack_blks == 0)
 		goto out;
 	/*
 	 * Sort the SACK blocks so we can update the rack scoreboard with
 	 * just one pass.
 	 */
 	if (rack_use_sack_filter) {
-		num_sack_blks = sack_filter_blks(&rack->r_ctl.rack_sf, sack_blocks, num_sack_blks, th->th_ack);
+		num_sack_blks = sack_filter_blks(&rack->r_ctl.rack_sf, sack_blocks,
+						 num_sack_blks, th->th_ack);
+		ctf_log_sack_filter(rack->rc_tp, num_sack_blks, sack_blocks);
 	}
 	if (num_sack_blks < 2) {
 		goto do_sack_work;
 	}
 	/* Sort the sacks (exchange sort, ascending by block end) */
 	for (i = 0; i < num_sack_blks; i++) {
 		for (j = i + 1; j < num_sack_blks; j++) {
 			if (SEQ_GT(sack_blocks[i].end, sack_blocks[j].end)) {
 				sack = sack_blocks[i];
 				sack_blocks[i] = sack_blocks[j];
 				sack_blocks[j] = sack;
 			}
 		}
 	}
 	/*
 	 * Now are any of the sack block ends the same (yes some
 	 * implementations send these)?
 	 */
 again:
 	if (num_sack_blks > 1) {
 		for (i = 0; i < num_sack_blks; i++) {
 			for (j = i + 1; j < num_sack_blks; j++) {
 				if (sack_blocks[i].end == sack_blocks[j].end) {
 					/*
 					 * Ok these two have the same end we
 					 * want the smallest end and then
 					 * throw away the larger and start
 					 * again.
 					 */
 					if (SEQ_LT(sack_blocks[j].start, sack_blocks[i].start)) {
 						/*
 						 * The second block covers
 						 * more area use that
 						 */
 						sack_blocks[i].start = sack_blocks[j].start;
 					}
 					/*
 					 * Now collapse out the dup-sack and
 					 * lower the count
 					 */
 					for (k = (j + 1); k < num_sack_blks; k++) {
 						sack_blocks[j].start = sack_blocks[k].start;
 						sack_blocks[j].end = sack_blocks[k].end;
 						j++;
 					}
 					num_sack_blks--;
 					goto again;
 				}
 			}
 		}
 	}
 do_sack_work:
 	/* Resume from the entry saved by the previous rack_proc_sack_blk(). */
 	rsm = rack->r_ctl.rc_sacklast;
 	for (i = 0; i < num_sack_blks; i++) {
 		acked = rack_proc_sack_blk(tp, rack, &sack_blocks[i], to, &rsm, cts);
 		if (acked) {
 			rack->r_wanted_output++;
 			changed += acked;
 			sack_changed += acked;
 		}
 	}
 out:
 	if (changed) {
 		/* Something changed cancel the rack timer */
 		rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
 	}
 	if ((sack_changed) && (!IN_RECOVERY(tp->t_flags))) {
 		/*
 		 * Ok we have a high probability that we need to go in to
 		 * recovery since we have data sack'd
 		 */
 		/* NOTE(review): this rsm shadows the function-level rsm. */
 		struct rack_sendmap *rsm;
 		uint32_t tsused;
 
 		tsused = tcp_ts_getticks();
 		rsm = tcp_rack_output(tp, rack, tsused);
 		if (rsm) {
 			/* Enter recovery */
 			rack->r_ctl.rc_rsm_start = rsm->r_start;
 			rack->r_ctl.rc_cwnd_at = tp->snd_cwnd;
 			rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh;
 			entered_recovery = 1;
 			rack_cong_signal(tp, NULL, CC_NDUPACK);
 			/*
 			 * When we enter recovery we need to assure we send
 			 * one packet.
 			 */
 			rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg;
 			rack->r_timer_override = 1;
 		}
 	}
 	if (IN_RECOVERY(tp->t_flags) && (entered_recovery == 0)) {
 		/* Deal with changed and PRR here (in recovery only) */
 		uint32_t pipe, snd_una;
 
 		rack->r_ctl.rc_prr_delivered += changed;
 		/* Compute prr_sndcnt */
 		if (SEQ_GT(tp->snd_una, th_ack)) {
 			snd_una = tp->snd_una;
 		} else {
 			snd_una = th_ack;
 		}
 		/* pipe = outstanding bytes - sacked bytes + rc_holes_rxt. */
 		pipe = ((tp->snd_max - snd_una) - rack->r_ctl.rc_sacked) + rack->r_ctl.rc_holes_rxt;
 		if (pipe > tp->snd_ssthresh) {
 			long sndcnt;
 
 			/* sndcnt = delivered * ssthresh / recovery_fs + 1 - prr_out */
 			sndcnt = rack->r_ctl.rc_prr_delivered * tp->snd_ssthresh;
 			if (rack->r_ctl.rc_prr_recovery_fs > 0)
 				sndcnt /= (long)rack->r_ctl.rc_prr_recovery_fs;
 			else {
 				rack->r_ctl.rc_prr_sndcnt = 0;
 				sndcnt = 0;
 			}
 			sndcnt++;
 			if (sndcnt > (long)rack->r_ctl.rc_prr_out)
 				sndcnt -= rack->r_ctl.rc_prr_out;
 			else
 				sndcnt = 0;
 			rack->r_ctl.rc_prr_sndcnt = sndcnt;
 		} else {
 			uint32_t limit;
 
 			if (rack->r_ctl.rc_prr_delivered > rack->r_ctl.rc_prr_out)
 				limit = (rack->r_ctl.rc_prr_delivered - rack->r_ctl.rc_prr_out);
 			else
 				limit = 0;
 			if (changed > limit)
 				limit = changed;
 			limit += tp->t_maxseg;
 			if (tp->snd_ssthresh > pipe) {
 				rack->r_ctl.rc_prr_sndcnt = min((tp->snd_ssthresh - pipe), limit);
 			} else {
 				/*
 				 * NOTE(review): limit is a uint32_t, so
 				 * min(0, limit) presumably always yields 0
 				 * here -- confirm against min()'s definition.
 				 */
 				rack->r_ctl.rc_prr_sndcnt = min(0, limit);
 			}
 		}
 		if (rack->r_ctl.rc_prr_sndcnt >= tp->t_maxseg) {
 			rack->r_timer_override = 1;
 		}
 	}
 }
 
 /*
  * Process the ACK field of an incoming segment.
  *
  * Return value of 1, we do not need to call rack_process_data().
  * Return value of 0, rack_process_data() can be called.
  * For ret_val if it's 0 the TCP is locked, if it's non-zero
  * it's unlocked and probably unsafe to touch the TCB.
  *
  * If ofia is non-NULL it receives 1 when this ACK was found to cover
  * our FIN and 0 otherwise; it is not written on the two paths that
  * return 1 or on the early return for an old/duplicate ACK.
  *
  * The tiwin argument is not referenced in this function.
  */
 static int
 rack_process_ack(struct mbuf *m, struct tcphdr *th, struct socket *so,
     struct tcpcb *tp, struct tcpopt *to,
     uint32_t tiwin, int32_t tlen,
     int32_t * ofia, int32_t thflags, int32_t * ret_val)
 {
 	int32_t ourfinisacked = 0;
 	int32_t nsegs, acked_amount;
 	int32_t acked;
 	struct mbuf *mfree;
 	struct tcp_rack *rack;
 	int32_t recovery = 0;
 
 	rack = (struct tcp_rack *)tp->t_fb_ptr;
 	/* An ACK for data beyond snd_max is handed to rack_do_dropafterack(). */
 	if (SEQ_GT(th->th_ack, tp->snd_max)) {
 		rack_do_dropafterack(m, tp, th, thflags, tlen, ret_val);
 		return (1);
 	}
 	if (SEQ_GEQ(th->th_ack, tp->snd_una) || to->to_nsacks) {
 		rack_log_ack(tp, to, th);
 	}
 	if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) {
 		/*
 		 * Old ack, behind (or duplicate to) the last one rcv'd
 		 * Note: Should mark reordering is occurring! We should also
 		 * look for sack blocks arriving e.g. ack 1, 4-4 then ack 1,
 		 * 3-3, 4-4 would be reordering. As well as ack 1, 3-3 <no
 		 * retran and> ack 3
 		 */
 		return (0);
 	}
 	/*
 	 * If we reach this point, ACK is not a duplicate, i.e., it ACKs
 	 * something we sent.
 	 */
 	if (tp->t_flags & TF_NEEDSYN) {
 		/*
 		 * T/TCP: Connection was half-synchronized, and our SYN has
 		 * been ACK'd (so connection is now fully synchronized).  Go
 		 * to non-starred state, increment snd_una for ACK of SYN,
 		 * and check if we can do window scaling.
 		 */
 		tp->t_flags &= ~TF_NEEDSYN;
 		tp->snd_una++;
 		/* Do window scaling? */
 		if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) ==
 		    (TF_RCVD_SCALE | TF_REQ_SCALE)) {
 			tp->rcv_scale = tp->request_r_scale;
 			/* Send window already scaled. */
 		}
 	}
 	nsegs = max(1, m->m_pkthdr.lro_nsegs);
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 
 	acked = BYTES_THIS_ACK(tp, th);
 	TCPSTAT_ADD(tcps_rcvackpack, nsegs);
 	TCPSTAT_ADD(tcps_rcvackbyte, acked);
 
 	/*
 	 * If we just performed our first retransmit, and the ACK arrives
 	 * within our recovery window, then it was a mistake to do the
 	 * retransmit in the first place.  Recover our original cwnd and
 	 * ssthresh, and proceed to transmit where we left off.
 	 */
 	if (tp->t_flags & TF_PREVVALID) {
 		tp->t_flags &= ~TF_PREVVALID;
 		if (tp->t_rxtshift == 1 &&
 		    (int)(ticks - tp->t_badrxtwin) < 0)
 			rack_cong_signal(tp, th, CC_RTO_ERR);
 	}
 	/*
 	 * If we have a timestamp reply, update smoothed round trip time. If
 	 * no timestamp is present but transmit timer is running and timed
 	 * sequence number was acked, update smoothed round trip time. Since
 	 * we now have an rtt measurement, cancel the timer backoff (cf.,
 	 * Phil Karn's retransmit alg.). Recompute the initial retransmit
 	 * timer.
 	 *
 	 * Some boxes send broken timestamp replies during the SYN+ACK
 	 * phase, ignore timestamps of 0 or we could calculate a huge RTT
 	 * and blow up the retransmit timer.
 	 */
 	/*
 	 * NOTE(review): no RTT update is performed at this point in this
 	 * function; the comment above appears to be a leftover.
 	 */
 	/*
 	 * If all outstanding data is acked, stop retransmit timer and
 	 * remember to restart (more output or persist). If there is more
 	 * data to be acked, restart retransmit timer, using current
 	 * (possibly backed-off) value.
 	 */
 	if (th->th_ack == tp->snd_max) {
 		rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
 		rack->r_wanted_output++;
 	}
 	/*
 	 * If no data (only SYN) was ACK'd, skip rest of ACK processing.
 	 */
 	if (acked == 0) {
 		if (ofia)
 			*ofia = ourfinisacked;
 		return (0);
 	}
 	/*
 	 * With rc_early_recovery set, the partial-ack / post-recovery step
 	 * runs before the congestion control update; otherwise it runs
 	 * after the send buffer has been trimmed (see below).
 	 */
 	if (rack->r_ctl.rc_early_recovery) {
-		if (IN_FASTRECOVERY(tp->t_flags)) {
-			if (SEQ_LT(th->th_ack, tp->snd_recover)) {
+		if (IN_RECOVERY(tp->t_flags)) {
+			if (SEQ_LT(th->th_ack, tp->snd_recover) &&
+			    (SEQ_LT(th->th_ack, tp->snd_max))) {
 				tcp_rack_partialack(tp, th);
 			} else {
 				rack_post_recovery(tp, th);
 				recovery = 1;
 			}
 		}
 	}
 	/*
 	 * Let the congestion control algorithm update congestion control
 	 * related information. This typically means increasing the
 	 * congestion window.
 	 */
 	rack_ack_received(tp, rack, th, nsegs, CC_ACK, recovery);
 	SOCKBUF_LOCK(&so->so_snd);
 	/* Never cut more from the send buffer than it currently holds. */
 	acked_amount = min(acked, (int)sbavail(&so->so_snd));
 	tp->snd_wnd -= acked_amount;
 	mfree = sbcut_locked(&so->so_snd, acked_amount);
 	/*
 	 * More was acked than the buffer held, the buffer is now empty and
 	 * we are in FIN_WAIT_1 or later: our FIN is taken as acked.
 	 */
 	if ((sbused(&so->so_snd) == 0) &&
 	    (acked > acked_amount) &&
 	    (tp->t_state >= TCPS_FIN_WAIT_1)) {
 		ourfinisacked = 1;
 	}
 	/* NB: sowwakeup_locked() does an implicit unlock. */
 	sowwakeup_locked(so);
 	m_freem(mfree);
 	if (rack->r_ctl.rc_early_recovery == 0) {
-		if (IN_FASTRECOVERY(tp->t_flags)) {
-			if (SEQ_LT(th->th_ack, tp->snd_recover)) {
+		if (IN_RECOVERY(tp->t_flags)) {
+			if (SEQ_LT(th->th_ack, tp->snd_recover) &&
+			    (SEQ_LT(th->th_ack, tp->snd_max))) {
 				tcp_rack_partialack(tp, th);
 			} else {
 				rack_post_recovery(tp, th);
 			}
 		}
 	}
 	tp->snd_una = th->th_ack;
 	if (SEQ_GT(tp->snd_una, tp->snd_recover))
 		tp->snd_recover = tp->snd_una;
 
 	if (SEQ_LT(tp->snd_nxt, tp->snd_una)) {
 		tp->snd_nxt = tp->snd_una;
 	}
 	if (tp->snd_una == tp->snd_max) {
 		/* Nothing left outstanding */
 		rack_log_progress_event(rack, tp, 0, PROGRESS_CLEAR, __LINE__);
 		tp->t_acktime = 0;
 		rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
 		/* Set need output so persist might get set */
 		rack->r_wanted_output++;
 		if (rack_use_sack_filter)
 			sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una);
 		if ((tp->t_state >= TCPS_FIN_WAIT_1) &&
 		    (sbavail(&so->so_snd) == 0) &&
 		    (tp->t_flags2 & TF2_DROP_AF_DATA)) {
-			/* 
+			/*
 			 * The socket was gone and the
 			 * peer sent data, time to
 			 * reset him.
 			 */
 			*ret_val = 1;
 			/*
 			 * NOTE(review): tp is replaced by tcp_close()'s
 			 * return value before being passed on -- confirm
 			 * rack_do_dropwithreset() accepts whatever that is.
 			 */
 			tp = tcp_close(tp);
 			rack_do_dropwithreset(m, tp, th, BANDLIM_UNLIMITED, tlen);
 			return (1);
 		}
 	}
 	if (ofia)
 		*ofia = ourfinisacked;
 	return (0);
 }
 
 
 /*
  * Return value of 1, the TCB is unlocked and most
  * likely gone, return value of 0, the TCP is still
  * locked.
  */
 static int
 rack_process_data(struct mbuf *m, struct tcphdr *th, struct socket *so,
     struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen,
     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt)
 {
 	/*
 	 * Update window information. Don't look at window if no ACK: TAC's
 	 * send garbage on first SYN.
 	 */
 	int32_t nsegs;
+#ifdef TCP_RFC7413
 	int32_t tfo_syn;
+#else
+#define	tfo_syn	(FALSE)
+#endif
 	struct tcp_rack *rack;
 
 	rack = (struct tcp_rack *)tp->t_fb_ptr;
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 	nsegs = max(1, m->m_pkthdr.lro_nsegs);
 	if ((thflags & TH_ACK) &&
 	    (SEQ_LT(tp->snd_wl1, th->th_seq) ||
 	    (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) ||
 	    (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) {
 		/* keep track of pure window updates */
 		if (tlen == 0 &&
 		    tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd)
 			TCPSTAT_INC(tcps_rcvwinupd);
 		tp->snd_wnd = tiwin;
 		tp->snd_wl1 = th->th_seq;
 		tp->snd_wl2 = th->th_ack;
 		if (tp->snd_wnd > tp->max_sndwnd)
 			tp->max_sndwnd = tp->snd_wnd;
 		rack->r_wanted_output++;
 	} else if (thflags & TH_ACK) {
 		if ((tp->snd_wl2 == th->th_ack) && (tiwin < tp->snd_wnd)) {
 			tp->snd_wnd = tiwin;
 			tp->snd_wl1 = th->th_seq;
 			tp->snd_wl2 = th->th_ack;
 		}
 	}
 	/* Was persist timer active and now we have window space? */
 	if ((rack->rc_in_persist != 0) && tp->snd_wnd) {
 		rack_exit_persist(tp, rack);
 		tp->snd_nxt = tp->snd_max;
 		/* Make sure we output to start the timer */
 		rack->r_wanted_output++;
 	}
 	if (tp->t_flags2 & TF2_DROP_AF_DATA) {
 		m_freem(m);
 		return (0);
 	}
 	/*
 	 * Process segments with URG.
 	 */
 	if ((thflags & TH_URG) && th->th_urp &&
 	    TCPS_HAVERCVDFIN(tp->t_state) == 0) {
 		/*
 		 * This is a kludge, but if we receive and accept random
 		 * urgent pointers, we'll crash in soreceive.  It's hard to
 		 * imagine someone actually wanting to send this much urgent
 		 * data.
 		 */
 		SOCKBUF_LOCK(&so->so_rcv);
 		if (th->th_urp + sbavail(&so->so_rcv) > sb_max) {
 			th->th_urp = 0;	/* XXX */
 			thflags &= ~TH_URG;	/* XXX */
 			SOCKBUF_UNLOCK(&so->so_rcv);	/* XXX */
 			goto dodata;	/* XXX */
 		}
 		/*
 		 * If this segment advances the known urgent pointer, then
 		 * mark the data stream.  This should not happen in
 		 * CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since a
 		 * FIN has been received from the remote side. In these
 		 * states we ignore the URG.
 		 *
 		 * According to RFC961 (Assigned Protocols), the urgent
 		 * pointer points to the last octet of urgent data.  We
 		 * continue, however, to consider it to indicate the first
 		 * octet of data past the urgent section as the original
 		 * spec states (in one of two places).
 		 */
 		if (SEQ_GT(th->th_seq + th->th_urp, tp->rcv_up)) {
 			tp->rcv_up = th->th_seq + th->th_urp;
 			so->so_oobmark = sbavail(&so->so_rcv) +
 			    (tp->rcv_up - tp->rcv_nxt) - 1;
 			if (so->so_oobmark == 0)
 				so->so_rcv.sb_state |= SBS_RCVATMARK;
 			sohasoutofband(so);
 			tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA);
 		}
 		SOCKBUF_UNLOCK(&so->so_rcv);
 		/*
 		 * Remove out of band data so doesn't get presented to user.
 		 * This can happen independent of advancing the URG pointer,
 		 * but if two URG's are pending at once, some out-of-band
 		 * data may creep in... ick.
 		 */
 		if (th->th_urp <= (uint32_t) tlen &&
 		    !(so->so_options & SO_OOBINLINE)) {
 			/* hdr drop is delayed */
 			tcp_pulloutofband(so, th, m, drop_hdrlen);
 		}
 	} else {
 		/*
 		 * If no out of band data is expected, pull receive urgent
 		 * pointer along with the receive window.
 		 */
 		if (SEQ_GT(tp->rcv_nxt, tp->rcv_up))
 			tp->rcv_up = tp->rcv_nxt;
 	}
 dodata:				/* XXX */
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 
 	/*
 	 * Process the segment text, merging it into the TCP sequencing
 	 * queue, and arranging for acknowledgment of receipt if necessary.
 	 * This process logically involves adjusting tp->rcv_wnd as data is
 	 * presented to the user (this happens in tcp_usrreq.c, case
 	 * PRU_RCVD).  If a FIN has already been received on this connection
 	 * then we just ignore the text.
 	 */
+#ifdef TCP_RFC7413
 	tfo_syn = ((tp->t_state == TCPS_SYN_RECEIVED) &&
-		   IS_FASTOPEN(tp->t_flags));
+	    (tp->t_flags & TF_FASTOPEN));
+#endif
 	if ((tlen || (thflags & TH_FIN) || tfo_syn) &&
 	    TCPS_HAVERCVDFIN(tp->t_state) == 0) {
 		tcp_seq save_start = th->th_seq;
 		tcp_seq save_rnxt  = tp->rcv_nxt;
 		int     save_tlen  = tlen;
 
 		m_adj(m, drop_hdrlen);	/* delayed header drop */
 		/*
 		 * Insert segment which includes th into TCP reassembly
 		 * queue with control block tp.  Set thflags to whether
 		 * reassembly now includes a segment with FIN.  This handles
 		 * the common case inline (segment is the next to be
 		 * received on an established connection, and the queue is
 		 * empty), avoiding linkage into and removal from the queue
 		 * and repetition of various conversions. Set DELACK for
 		 * segments received in order, but ack immediately when
 		 * segments are out of order (so fast retransmit can work).
 		 */
 		if (th->th_seq == tp->rcv_nxt &&
 		    SEGQ_EMPTY(tp) &&
 		    (TCPS_HAVEESTABLISHED(tp->t_state) ||
 		    tfo_syn)) {
 			if (DELAY_ACK(tp, tlen) || tfo_syn) {
 				rack_timer_cancel(tp, rack,
 				    rack->r_ctl.rc_rcvtime, __LINE__);
 				tp->t_flags |= TF_DELACK;
 			} else {
 				rack->r_wanted_output++;
 				tp->t_flags |= TF_ACKNOW;
 			}
 			tp->rcv_nxt += tlen;
 			thflags = th->th_flags & TH_FIN;
 			TCPSTAT_ADD(tcps_rcvpack, nsegs);
 			TCPSTAT_ADD(tcps_rcvbyte, tlen);
 			SOCKBUF_LOCK(&so->so_rcv);
 			if (so->so_rcv.sb_state & SBS_CANTRCVMORE)
 				m_freem(m);
 			else
 				sbappendstream_locked(&so->so_rcv, m, 0);
 			/* NB: sorwakeup_locked() does an implicit unlock. */
 			sorwakeup_locked(so);
 		} else {
 			/*
 			 * XXX: Due to the header drop above "th" is
 			 * theoretically invalid by now.  Fortunately
 			 * m_adj() doesn't actually frees any mbufs when
 			 * trimming from the head.
 			 */
 			tcp_seq temp = save_start;
 			thflags = tcp_reass(tp, th, &temp, &tlen, m);
 			tp->t_flags |= TF_ACKNOW;
 		}
 		if ((tp->t_flags & TF_SACK_PERMIT) && (save_tlen > 0)) {
 			if ((tlen == 0) && (SEQ_LT(save_start, save_rnxt))) {
 				/*
 				 * DSACK actually handled in the fastpath
 				 * above.
 				 */
 				tcp_update_sack_list(tp, save_start,
 				    save_start + save_tlen);
 			} else if ((tlen > 0) && SEQ_GT(tp->rcv_nxt, save_rnxt)) {
 				if ((tp->rcv_numsacks >= 1) &&
 				    (tp->sackblks[0].end == save_start)) {
 					/*
 					 * Partial overlap, recorded at todrop
 					 * above.
 					 */
 					tcp_update_sack_list(tp,
 					    tp->sackblks[0].start,
 					    tp->sackblks[0].end);
 				} else {
 					tcp_update_dsack_list(tp, save_start,
 					    save_start + save_tlen);
 				}
 			} else if (tlen >= save_tlen) {
 				/* Update of sackblks. */
 				tcp_update_dsack_list(tp, save_start,
 				    save_start + save_tlen);
 			} else if (tlen > 0) {
 				tcp_update_dsack_list(tp, save_start,
 				    save_start + tlen);
 			}
 		}
 	} else {
 		m_freem(m);
 		thflags &= ~TH_FIN;
 	}
 
 	/*
 	 * If FIN is received ACK the FIN and let the user know that the
 	 * connection is closing.
 	 */
 	if (thflags & TH_FIN) {
 		if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
 			socantrcvmore(so);
 			/*
 			 * If connection is half-synchronized (ie NEEDSYN
 			 * flag on) then delay ACK, so it may be piggybacked
 			 * when SYN is sent. Otherwise, since we received a
 			 * FIN then no more input can be expected, send ACK
 			 * now.
 			 */
 			if (tp->t_flags & TF_NEEDSYN) {
 				rack_timer_cancel(tp, rack,
 				    rack->r_ctl.rc_rcvtime, __LINE__);
 				tp->t_flags |= TF_DELACK;
 			} else {
 				tp->t_flags |= TF_ACKNOW;
 			}
 			tp->rcv_nxt++;
 		}
 		switch (tp->t_state) {
 
 			/*
 			 * In SYN_RECEIVED and ESTABLISHED STATES enter the
 			 * CLOSE_WAIT state.
 			 */
 		case TCPS_SYN_RECEIVED:
 			tp->t_starttime = ticks;
 			/* FALLTHROUGH */
 		case TCPS_ESTABLISHED:
 			rack_timer_cancel(tp, rack,
 			    rack->r_ctl.rc_rcvtime, __LINE__);
 			tcp_state_change(tp, TCPS_CLOSE_WAIT);
 			break;
 
 			/*
 			 * If still in FIN_WAIT_1 STATE FIN has not been
 			 * acked so enter the CLOSING state.
 			 */
 		case TCPS_FIN_WAIT_1:
 			rack_timer_cancel(tp, rack,
 			    rack->r_ctl.rc_rcvtime, __LINE__);
 			tcp_state_change(tp, TCPS_CLOSING);
 			break;
 
 			/*
 			 * In FIN_WAIT_2 state enter the TIME_WAIT state,
 			 * starting the time-wait timer, turning off the
 			 * other standard timers.
 			 */
 		case TCPS_FIN_WAIT_2:
 			rack_timer_cancel(tp, rack,
 			    rack->r_ctl.rc_rcvtime, __LINE__);
 			INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
 			tcp_twstart(tp);
 			return (1);
 		}
 	}
 	/*
 	 * Return any desired output.
 	 */
 	if ((tp->t_flags & TF_ACKNOW) ||
 	    (sbavail(&so->so_snd) > (tp->snd_max - tp->snd_una))) {
 		rack->r_wanted_output++;
 	}
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 	return (0);
 }
 
 /*
  * Here nothing is really faster, its just that we
  * have broken out the fast-data path also just like
  * the fast-ack.
  */
 static int
 rack_do_fastnewdata(struct mbuf *m, struct tcphdr *th, struct socket *so,
     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
     uint32_t tiwin, int32_t nxt_pkt)
 {
 	int32_t nsegs;
 	int32_t newsize = 0;	/* automatic sockbuf scaling */
 	struct tcp_rack *rack;
 #ifdef TCPDEBUG
 	/*
 	 * The size of tcp_saveipgen must be the size of the max ip header,
 	 * now IPv6.
 	 */
 	u_char tcp_saveipgen[IP6_HDR_LEN];
 	struct tcphdr tcp_savetcp;
 	short ostate = 0;
 
 #endif
 	/*
 	 * If last ACK falls within this segment's sequence numbers, record
 	 * the timestamp. NOTE that the test is modified according to the
 	 * latest proposal of the tcplw@cray.com list (Braden 1993/04/26).
 	 */
 	if (__predict_false(th->th_seq != tp->rcv_nxt)) {
 		return (0);
 	}
 	if (__predict_false(tp->snd_nxt != tp->snd_max)) {
 		return (0);
 	}
 	if (tiwin && tiwin != tp->snd_wnd) {
 		return (0);
 	}
 	if (__predict_false((tp->t_flags & (TF_NEEDSYN | TF_NEEDFIN)))) {
 		return (0);
 	}
 	if (__predict_false((to->to_flags & TOF_TS) &&
 	    (TSTMP_LT(to->to_tsval, tp->ts_recent)))) {
 		return (0);
 	}
 	if (__predict_false((th->th_ack != tp->snd_una))) {
 		return (0);
 	}
 	if (__predict_false(tlen > sbspace(&so->so_rcv))) {
 		return (0);
 	}
 	if ((to->to_flags & TOF_TS) != 0 &&
 	    SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
 		tp->ts_recent_age = tcp_ts_getticks();
 		tp->ts_recent = to->to_tsval;
 	}
 	rack = (struct tcp_rack *)tp->t_fb_ptr;
 	/*
 	 * This is a pure, in-sequence data packet with nothing on the
 	 * reassembly queue and we have enough buffer space to take it.
 	 */
 	nsegs = max(1, m->m_pkthdr.lro_nsegs);
 
 
 	/* Clean receiver SACK report if present */
-	if (tp->rcv_numsacks)
-		tcp_clean_sackreport(tp);
+	/*
+	 * XXX: disabled -- if (tp->rcv_numsacks) tcp_clean_sackreport(tp);
+	 */
 
 	TCPSTAT_INC(tcps_preddat);
 	tp->rcv_nxt += tlen;
 	/*
 	 * Pull snd_wl1 up to prevent seq wrap relative to th_seq.
 	 */
 	tp->snd_wl1 = th->th_seq;
 	/*
 	 * Pull rcv_up up to prevent seq wrap relative to rcv_nxt.
 	 */
 	tp->rcv_up = tp->rcv_nxt;
 	TCPSTAT_ADD(tcps_rcvpack, nsegs);
 	TCPSTAT_ADD(tcps_rcvbyte, tlen);
 #ifdef TCPDEBUG
 	if (so->so_options & SO_DEBUG)
 		tcp_trace(TA_INPUT, ostate, tp,
 		    (void *)tcp_saveipgen, &tcp_savetcp, 0);
 #endif
 	newsize = tcp_autorcvbuf(m, th, so, tp, tlen);
 
 	/* Add data to socket buffer. */
 	SOCKBUF_LOCK(&so->so_rcv);
 	if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
 		m_freem(m);
 	} else {
 		/*
 		 * Set new socket buffer size. Give up when limit is
 		 * reached.
 		 */
 		if (newsize)
 			if (!sbreserve_locked(&so->so_rcv,
 			    newsize, so, NULL))
 				so->so_rcv.sb_flags &= ~SB_AUTOSIZE;
 		m_adj(m, drop_hdrlen);	/* delayed header drop */
 		sbappendstream_locked(&so->so_rcv, m, 0);
 		rack_calc_rwin(so, tp);
 	}
 	/* NB: sorwakeup_locked() does an implicit unlock. */
 	sorwakeup_locked(so);
 	if (DELAY_ACK(tp, tlen)) {
 		rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
 		tp->t_flags |= TF_DELACK;
 	} else {
 		tp->t_flags |= TF_ACKNOW;
 		rack->r_wanted_output++;
 	}
 	if ((tp->snd_una == tp->snd_max) && rack_use_sack_filter)
 		sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una);
 	return (1);
 }
 
 /*
  * This subfunction is used to try to highly optimize the
  * fast path. We again allow window updates that are
  * in sequence to remain in the fast-path. We also add
  * in the __predict's to attempt to help the compiler.
  * Note that if we return a 0, then we can *not* process
  * it and the caller should push the packet into the
  * slow-path.
  */
 static int
 rack_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so,
     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
     uint32_t tiwin, int32_t nxt_pkt, uint32_t cts)
 {
 	int32_t acked;
 	int32_t nsegs;
 
 #ifdef TCPDEBUG
 	/*
 	 * The size of tcp_saveipgen must be the size of the max ip header,
 	 * now IPv6.
 	 */
 	u_char tcp_saveipgen[IP6_HDR_LEN];
 	struct tcphdr tcp_savetcp;
 	short ostate = 0;
 
 #endif
 	struct tcp_rack *rack;
 
 	if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) {
 		/* Old ack, behind (or duplicate to) the last one rcv'd */
 		return (0);
 	}
 	if (__predict_false(SEQ_GT(th->th_ack, tp->snd_max))) {
 		/* Above what we have sent? */
 		return (0);
 	}
 	if (__predict_false(tp->snd_nxt != tp->snd_max)) {
 		/* We are retransmitting */
 		return (0);
 	}
 	if (__predict_false(tiwin == 0)) {
 		/* zero window */
 		return (0);
 	}
 	if (__predict_false(tp->t_flags & (TF_NEEDSYN | TF_NEEDFIN))) {
 		/* We need a SYN or a FIN, unlikely.. */
 		return (0);
 	}
 	if ((to->to_flags & TOF_TS) && __predict_false(TSTMP_LT(to->to_tsval, tp->ts_recent))) {
 		/* Timestamp is behind .. old ack with seq wrap? */
 		return (0);
 	}
 	if (__predict_false(IN_RECOVERY(tp->t_flags))) {
 		/* Still recovering */
 		return (0);
 	}
 	rack = (struct tcp_rack *)tp->t_fb_ptr;
 	if (rack->r_ctl.rc_sacked) {
 		/* We have sack holes on our scoreboard */
 		return (0);
 	}
 	/* Ok if we reach here, we can process a fast-ack */
 	nsegs = max(1, m->m_pkthdr.lro_nsegs);
 	rack_log_ack(tp, to, th);
 	/* Did the window get updated? */
 	if (tiwin != tp->snd_wnd) {
 		tp->snd_wnd = tiwin;
 		tp->snd_wl1 = th->th_seq;
 		if (tp->snd_wnd > tp->max_sndwnd)
 			tp->max_sndwnd = tp->snd_wnd;
 	}
 	if ((rack->rc_in_persist != 0) && (tp->snd_wnd >= tp->t_maxseg)) {
 		rack_exit_persist(tp, rack);
 	}
 	/*
 	 * If last ACK falls within this segment's sequence numbers, record
 	 * the timestamp. NOTE that the test is modified according to the
 	 * latest proposal of the tcplw@cray.com list (Braden 1993/04/26).
 	 */
 	if ((to->to_flags & TOF_TS) != 0 &&
 	    SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
 		tp->ts_recent_age = tcp_ts_getticks();
 		tp->ts_recent = to->to_tsval;
 	}
 	/*
 	 * This is a pure ack for outstanding data.
 	 */
 	TCPSTAT_INC(tcps_predack);
 
 	/*
 	 * "bad retransmit" recovery.
 	 */
 	if (tp->t_flags & TF_PREVVALID) {
 		tp->t_flags &= ~TF_PREVVALID;
 		if (tp->t_rxtshift == 1 &&
 		    (int)(ticks - tp->t_badrxtwin) < 0)
 			rack_cong_signal(tp, th, CC_RTO_ERR);
 	}
 	/*
 	 * Recalculate the transmit timer / rtt.
 	 *
 	 * Some boxes send broken timestamp replies during the SYN+ACK
 	 * phase, ignore timestamps of 0 or we could calculate a huge RTT
 	 * and blow up the retransmit timer.
 	 */
 	acked = BYTES_THIS_ACK(tp, th);
 
 #ifdef TCP_HHOOK
 	/* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */
 	hhook_run_tcp_est_in(tp, th, to);
 #endif
 
 	TCPSTAT_ADD(tcps_rcvackpack, nsegs);
 	TCPSTAT_ADD(tcps_rcvackbyte, acked);
 	sbdrop(&so->so_snd, acked);
 	/*
 	 * Let the congestion control algorithm update congestion control
 	 * related information. This typically means increasing the
 	 * congestion window.
 	 */
 	rack_ack_received(tp, rack, th, nsegs, CC_ACK, 0);
 
 	tp->snd_una = th->th_ack;
 	/*
 	 * Pull snd_wl2 up to prevent seq wrap relative to th_ack.
 	 */
 	tp->snd_wl2 = th->th_ack;
 	tp->t_dupacks = 0;
 	m_freem(m);
 	/* ND6_HINT(tp);	 *//* Some progress has been made. */
 
 	/*
 	 * If all outstanding data are acked, stop retransmit timer,
 	 * otherwise restart timer using current (possibly backed-off)
 	 * value. If process is waiting for space, wakeup/selwakeup/signal.
 	 * If data are ready to send, let tcp_output decide between more
 	 * output or persist.
 	 */
 #ifdef TCPDEBUG
 	if (so->so_options & SO_DEBUG)
 		tcp_trace(TA_INPUT, ostate, tp,
 		    (void *)tcp_saveipgen,
 		    &tcp_savetcp, 0);
 #endif
 	if (tp->snd_una == tp->snd_max) {
 		rack_log_progress_event(rack, tp, 0, PROGRESS_CLEAR, __LINE__);
 		tp->t_acktime = 0;
 		rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
 	}
 	/* Wake up the socket if we have room to write more */
 	sowwakeup(so);
 	if (sbavail(&so->so_snd)) {
 		rack->r_wanted_output++;
 	}
 	return (1);
 }
 
 /*
  * Return value of 1, the TCB is unlocked and most
  * likely gone, return value of 0, the TCP is still
  * locked.
  */
 static int
 rack_do_syn_sent(struct mbuf *m, struct tcphdr *th, struct socket *so,
     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt)
 {
 	int32_t ret_val = 0;
 	int32_t todrop;
 	int32_t ourfinisacked = 0;
 
 	rack_calc_rwin(so, tp);
 	/*
 	 * If the state is SYN_SENT: if seg contains an ACK, but not for our
 	 * SYN, drop the input. if seg contains a RST, then drop the
 	 * connection. if seg does not contain SYN, then drop it. Otherwise
 	 * this is an acceptable SYN segment initialize tp->rcv_nxt and
 	 * tp->irs if seg contains ack then advance tp->snd_una if seg
 	 * contains an ECE and ECN support is enabled, the stream is ECN
 	 * capable. if SYN has been acked change to ESTABLISHED else
 	 * SYN_RCVD state arrange for segment to be acked (eventually)
 	 * continue processing rest of data/controls, beginning with URG
 	 */
 	if ((thflags & TH_ACK) &&
 	    (SEQ_LEQ(th->th_ack, tp->iss) ||
 	    SEQ_GT(th->th_ack, tp->snd_max))) {
 		rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
 		return (1);
 	}
 	if ((thflags & (TH_ACK | TH_RST)) == (TH_ACK | TH_RST)) {
 		TCP_PROBE5(connect__refused, NULL, tp,
 		    mtod(m, const char *), tp, th);
 		tp = tcp_drop(tp, ECONNREFUSED);
 		rack_do_drop(m, tp);
 		return (1);
 	}
 	if (thflags & TH_RST) {
 		rack_do_drop(m, tp);
 		return (1);
 	}
 	if (!(thflags & TH_SYN)) {
 		rack_do_drop(m, tp);
 		return (1);
 	}
 	tp->irs = th->th_seq;
 	tcp_rcvseqinit(tp);
 	if (thflags & TH_ACK) {
-		int tfo_partial = 0;
-		
 		TCPSTAT_INC(tcps_connects);
 		soisconnected(so);
 #ifdef MAC
 		mac_socketpeer_set_from_mbuf(m, so);
 #endif
 		/* Do window scaling on this connection? */
 		if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) ==
 		    (TF_RCVD_SCALE | TF_REQ_SCALE)) {
 			tp->rcv_scale = tp->request_r_scale;
 		}
 		tp->rcv_adv += min(tp->rcv_wnd,
 		    TCP_MAXWIN << tp->rcv_scale);
 		/*
-		 * If not all the data that was sent in the TFO SYN
-		 * has been acked, resend the remainder right away.
-		 */
-		if (IS_FASTOPEN(tp->t_flags) &&
-		    (tp->snd_una != tp->snd_max)) {
-			tp->snd_nxt = th->th_ack;
-			tfo_partial = 1;
-		}
-		/*
 		 * If there's data, delay ACK; if there's also a FIN ACKNOW
 		 * will be turned on later.
 		 */
-		if (DELAY_ACK(tp, tlen) && tlen != 0 && (tfo_partial == 0)) {
+		if (DELAY_ACK(tp, tlen) && tlen != 0) {
 			rack_timer_cancel(tp, (struct tcp_rack *)tp->t_fb_ptr,
 					  ((struct tcp_rack *)tp->t_fb_ptr)->r_ctl.rc_rcvtime, __LINE__);
 			tp->t_flags |= TF_DELACK;
 		} else {
 			((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output++;
 			tp->t_flags |= TF_ACKNOW;
 		}
 
-		if (((thflags & (TH_CWR | TH_ECE)) == TH_ECE) &&
-		    V_tcp_do_ecn) {
+		if ((thflags & TH_ECE) && V_tcp_do_ecn) {
 			tp->t_flags |= TF_ECN_PERMIT;
 			TCPSTAT_INC(tcps_ecn_shs);
 		}
-		if (SEQ_GT(th->th_ack, tp->snd_una)) {
-			/* 
-			 * We advance snd_una for the 
-			 * fast open case. If th_ack is
-			 * acknowledging data beyond 
-			 * snd_una we can't just call
-			 * ack-processing since the 
-			 * data stream in our send-map
-			 * will start at snd_una + 1 (one
-			 * beyond the SYN). If its just
-			 * equal we don't need to do that
-			 * and there is no send_map.
-			 */
-			tp->snd_una++;
-		}
 		/*
 		 * Received <SYN,ACK> in SYN_SENT[*] state. Transitions:
 		 * SYN_SENT  --> ESTABLISHED SYN_SENT* --> FIN_WAIT_1
 		 */
 		tp->t_starttime = ticks;
 		if (tp->t_flags & TF_NEEDFIN) {
 			tcp_state_change(tp, TCPS_FIN_WAIT_1);
 			tp->t_flags &= ~TF_NEEDFIN;
 			thflags &= ~TH_SYN;
 		} else {
 			tcp_state_change(tp, TCPS_ESTABLISHED);
 			TCP_PROBE5(connect__established, NULL, tp,
 			    mtod(m, const char *), tp, th);
 			cc_conn_init(tp);
 		}
 	} else {
 		/*
 		 * Received initial SYN in SYN-SENT[*] state => simultaneous
 		 * open.  If segment contains CC option and there is a
 		 * cached CC, apply TAO test. If it succeeds, connection is *
 		 * half-synchronized. Otherwise, do 3-way handshake:
 		 * SYN-SENT -> SYN-RECEIVED SYN-SENT* -> SYN-RECEIVED* If
 		 * there was no CC option, clear cached CC value.
 		 */
 		tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN);
 		tcp_state_change(tp, TCPS_SYN_RECEIVED);
 	}
 	INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 	/*
 	 * Advance th->th_seq to correspond to first data byte. If data,
 	 * trim to stay within window, dropping FIN if necessary.
 	 */
 	th->th_seq++;
 	if (tlen > tp->rcv_wnd) {
 		todrop = tlen - tp->rcv_wnd;
 		m_adj(m, -todrop);
 		tlen = tp->rcv_wnd;
 		thflags &= ~TH_FIN;
 		TCPSTAT_INC(tcps_rcvpackafterwin);
 		TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop);
 	}
 	tp->snd_wl1 = th->th_seq - 1;
 	tp->rcv_up = th->th_seq;
 	/*
 	 * Client side of transaction: already sent SYN and data. If the
 	 * remote host used T/TCP to validate the SYN, our data will be
 	 * ACK'd; if so, enter normal data segment processing in the middle
 	 * of step 5, ack processing. Otherwise, goto step 6.
 	 */
 	if (thflags & TH_ACK) {
 		if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val))
 			return (ret_val);
 		/* We may have changed to FIN_WAIT_1 above */
 		if (tp->t_state == TCPS_FIN_WAIT_1) {
 			/*
 			 * In FIN_WAIT_1 STATE in addition to the processing
 			 * for the ESTABLISHED state if our FIN is now
 			 * acknowledged then enter FIN_WAIT_2.
 			 */
 			if (ourfinisacked) {
 				/*
 				 * If we can't receive any more data, then
 				 * closing user can proceed. Starting the
 				 * timer is contrary to the specification,
 				 * but if we don't get a FIN we'll hang
 				 * forever.
 				 *
 				 * XXXjl: we should release the tp also, and
 				 * use a compressed state.
 				 */
 				if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
 					soisdisconnected(so);
 					tcp_timer_activate(tp, TT_2MSL,
 					    (tcp_fast_finwait2_recycle ?
 					    tcp_finwait2_timeout :
 					    TP_MAXIDLE(tp)));
 				}
 				tcp_state_change(tp, TCPS_FIN_WAIT_2);
 			}
 		}
 	}
 	return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
-	   tiwin, thflags, nxt_pkt));
+	    tiwin, thflags, nxt_pkt));
 }
 
 /*
  * Return value of 1, the TCB is unlocked and most
  * likely gone, return value of 0, the TCP is still
  * locked.
  */
 static int
 rack_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so,
     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt)
 {
 	int32_t ret_val = 0;
 	int32_t ourfinisacked = 0;
 
 	rack_calc_rwin(so, tp);
 
 	if ((thflags & TH_ACK) &&
 	    (SEQ_LEQ(th->th_ack, tp->snd_una) ||
 	    SEQ_GT(th->th_ack, tp->snd_max))) {
 		rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
 		return (1);
 	}
-	if (IS_FASTOPEN(tp->t_flags)) {
+#ifdef TCP_RFC7413
+	if (tp->t_flags & TF_FASTOPEN) {
 		/*
-		 * When a TFO connection is in SYN_RECEIVED, the
-		 * only valid packets are the initial SYN, a
-		 * retransmit/copy of the initial SYN (possibly with
-		 * a subset of the original data), a valid ACK, a
-		 * FIN, or a RST.
+		 * When a TFO connection is in SYN_RECEIVED, the only valid
+		 * packets are the initial SYN, a retransmit/copy of the
+		 * initial SYN (possibly with a subset of the original
+		 * data), a valid ACK, a FIN, or a RST.
 		 */
 		if ((thflags & (TH_SYN | TH_ACK)) == (TH_SYN | TH_ACK)) {
 			rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
 			return (1);
 		} else if (thflags & TH_SYN) {
 			/* non-initial SYN is ignored */
 			struct tcp_rack *rack;
 
 			rack = (struct tcp_rack *)tp->t_fb_ptr;
 			if ((rack->r_ctl.rc_hpts_flags & PACE_TMR_RXT) ||
 			    (rack->r_ctl.rc_hpts_flags & PACE_TMR_TLP) ||
 			    (rack->r_ctl.rc_hpts_flags & PACE_TMR_RACK)) {
 				rack_do_drop(m, NULL);
 				return (0);
 			}
 		} else if (!(thflags & (TH_ACK | TH_FIN | TH_RST))) {
 			rack_do_drop(m, NULL);
 			return (0);
 		}
 	}
+#endif
 	if (thflags & TH_RST)
 		return (rack_process_rst(m, th, so, tp));
 	/*
+	 * RFC5961 Section 4.2 Send challenge ACK for any SYN in
+	 * synchronized state.
+	 */
+	if (thflags & TH_SYN) {
+		rack_challenge_ack(m, th, tp, &ret_val);
+		return (ret_val);
+	}
+	/*
 	 * RFC 1323 PAWS: If we have a timestamp reply on this segment and
 	 * it's less than ts_recent, drop it.
 	 */
 	if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
 	    TSTMP_LT(to->to_tsval, tp->ts_recent)) {
 		if (rack_ts_check(m, th, tp, tlen, thflags, &ret_val))
 			return (ret_val);
 	}
 	/*
 	 * In the SYN-RECEIVED state, validate that the packet belongs to
 	 * this connection before trimming the data to fit the receive
 	 * window.  Check the sequence number versus IRS since we know the
 	 * sequence numbers haven't wrapped.  This is a partial fix for the
 	 * "LAND" DoS attack.
 	 */
 	if (SEQ_LT(th->th_seq, tp->irs)) {
 		rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
 		return (1);
 	}
 	if (rack_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
 		return (ret_val);
 	}
 	/*
 	 * If last ACK falls within this segment's sequence numbers, record
 	 * its timestamp. NOTE: 1) That the test incorporates suggestions
 	 * from the latest proposal of the tcplw@cray.com list (Braden
 	 * 1993/04/26). 2) That updating only on newer timestamps interferes
 	 * with our earlier PAWS tests, so this check should be solely
 	 * predicated on the sequence space of this segment. 3) That we
 	 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
 	 * + SEG.Len  instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
 	 * SEG.Len, This modified check allows us to overcome RFC1323's
 	 * limitations as described in Stevens TCP/IP Illustrated Vol. 2
 	 * p.869. In such cases, we can still calculate the RTT correctly
 	 * when RCV.NXT == Last.ACK.Sent.
 	 */
 	if ((to->to_flags & TOF_TS) != 0 &&
 	    SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
 	    SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
 	    ((thflags & (TH_SYN | TH_FIN)) != 0))) {
 		tp->ts_recent_age = tcp_ts_getticks();
 		tp->ts_recent = to->to_tsval;
 	}
-	tp->snd_wnd = tiwin;
 	/*
 	 * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN flag
 	 * is on (half-synchronized state), then queue data for later
 	 * processing; else drop segment and return.
 	 */
 	if ((thflags & TH_ACK) == 0) {
-		if (IS_FASTOPEN(tp->t_flags)) {
+#ifdef TCP_RFC7413
+		if (tp->t_flags & TF_FASTOPEN) {
+			tp->snd_wnd = tiwin;
 			cc_conn_init(tp);
 		}
+#endif
 		return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
 		    tiwin, thflags, nxt_pkt));
 	}
 	TCPSTAT_INC(tcps_connects);
 	soisconnected(so);
 	/* Do window scaling? */
 	if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) ==
 	    (TF_RCVD_SCALE | TF_REQ_SCALE)) {
 		tp->rcv_scale = tp->request_r_scale;
+		tp->snd_wnd = tiwin;
 	}
 	/*
 	 * Make transitions: SYN-RECEIVED  -> ESTABLISHED SYN-RECEIVED* ->
 	 * FIN-WAIT-1
 	 */
 	tp->t_starttime = ticks;
 	if (IS_FASTOPEN(tp->t_flags) && tp->t_tfo_pending) {
 		tcp_fastopen_decrement_counter(tp->t_tfo_pending);
 		tp->t_tfo_pending = NULL;
 	}
 	if (tp->t_flags & TF_NEEDFIN) {
 		tcp_state_change(tp, TCPS_FIN_WAIT_1);
 		tp->t_flags &= ~TF_NEEDFIN;
 	} else {
 		tcp_state_change(tp, TCPS_ESTABLISHED);
 		TCP_PROBE5(accept__established, NULL, tp,
 		    mtod(m, const char *), tp, th);
+#ifdef TCP_RFC7413
+		if (tp->t_tfo_pending) {
+			tcp_fastopen_decrement_counter(tp->t_tfo_pending);
+			tp->t_tfo_pending = NULL;
+
+			/*
+			 * Account for the ACK of our SYN prior to regular
+			 * ACK processing below.
+			 */
+			tp->snd_una++;
+		}
 		/*
 		 * TFO connections call cc_conn_init() during SYN
 		 * processing.  Calling it again here for such connections
 		 * is not harmless as it would undo the snd_cwnd reduction
 		 * that occurs when a TFO SYN|ACK is retransmitted.
 		 */
-		if (!IS_FASTOPEN(tp->t_flags))
+		if (!(tp->t_flags & TF_FASTOPEN))
+#endif
 			cc_conn_init(tp);
 	}
 	/*
 	 * Account for the ACK of our SYN prior to
 	 * regular ACK processing below, except for
 	 * simultaneous SYN, which is handled later.
 	 */
 	if (SEQ_GT(th->th_ack, tp->snd_una) && !(tp->t_flags & TF_NEEDSYN))
 		tp->snd_una++;
 	/*
 	 * If segment contains data or ACK, will call tcp_reass() later; if
 	 * not, do so now to pass queued data to user.
 	 */
 	if (tlen == 0 && (thflags & TH_FIN) == 0)
-		(void) tcp_reass(tp, (struct tcphdr *)0, NULL, 0,
+		(void)tcp_reass(tp, (struct tcphdr *)0, NULL, 0,
 		    (struct mbuf *)0);
 	tp->snd_wl1 = th->th_seq - 1;
 	if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) {
 		return (ret_val);
 	}
 	if (tp->t_state == TCPS_FIN_WAIT_1) {
 		/* We could have went to FIN_WAIT_1 (or EST) above */
 		/*
 		 * In FIN_WAIT_1 STATE in addition to the processing for the
 		 * ESTABLISHED state if our FIN is now acknowledged then
 		 * enter FIN_WAIT_2.
 		 */
 		if (ourfinisacked) {
 			/*
 			 * If we can't receive any more data, then closing
 			 * user can proceed. Starting the timer is contrary
 			 * to the specification, but if we don't get a FIN
 			 * we'll hang forever.
 			 *
 			 * XXXjl: we should release the tp also, and use a
 			 * compressed state.
 			 */
 			if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
 				soisdisconnected(so);
 				tcp_timer_activate(tp, TT_2MSL,
 				    (tcp_fast_finwait2_recycle ?
 				    tcp_finwait2_timeout :
 				    TP_MAXIDLE(tp)));
 			}
 			tcp_state_change(tp, TCPS_FIN_WAIT_2);
 		}
 	}
 	return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
 	    tiwin, thflags, nxt_pkt));
 }
 
 /*
  * Input handler for a connection in the ESTABLISHED state.
  *
  * Returns 1 when the TCB has been unlocked and is most likely gone,
  * and 0 when the TCB is still locked.
  */
 static int
 rack_do_established(struct mbuf *m, struct tcphdr *th, struct socket *so,
     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt)
 {
 	int32_t rv = 0;
 	uint32_t seg_end;
 
 	/*
 	 * Header prediction for the two common cases of a uni-directional
 	 * data transfer.  A segment qualifies when it carries no SACK
 	 * option, has ACK as the only flag set among SYN/FIN/RST/URG/ACK,
 	 * the reassembly queue is empty and it starts exactly at rcv_nxt.
 	 * With no payload we are the sender side and try rack_fastack();
 	 * with payload we are the receiver side and try to add the data to
 	 * the socket buffer via rack_do_fastnewdata().  If the chosen fast
 	 * path reports that it handled the segment we are done; otherwise
 	 * continue with the full processing below.
 	 */
 	if (__predict_true(((to->to_flags & TOF_SACK) == 0)) &&
 	    __predict_true((thflags & (TH_SYN | TH_FIN | TH_RST | TH_URG | TH_ACK)) == TH_ACK) &&
 	    __predict_true(SEGQ_EMPTY(tp)) &&
 	    __predict_true(th->th_seq == tp->rcv_nxt)) {
 		struct tcp_rack *rack = (struct tcp_rack *)tp->t_fb_ptr;
 		int handled;
 
 		if (tlen == 0)
 			handled = rack_fastack(m, th, so, tp, to, drop_hdrlen,
 			    tlen, tiwin, nxt_pkt, rack->r_ctl.rc_rcvtime);
 		else
 			handled = rack_do_fastnewdata(m, th, so, tp, to,
 			    drop_hdrlen, tlen, tiwin, nxt_pkt);
 		if (handled)
 			return (0);
 	}
 	rack_calc_rwin(so, tp);
 
 	/* A reset is handed off before anything else is looked at. */
 	if (thflags & TH_RST)
 		return (rack_process_rst(m, th, so, tp));
 
 	/*
 	 * RFC5961 Section 4.2: any SYN seen in a synchronized state is
 	 * answered with a challenge ACK.
 	 */
 	if (thflags & TH_SYN) {
 		rack_challenge_ack(m, th, tp, &rv);
 		return (rv);
 	}
 	/*
 	 * RFC 1323 PAWS: a segment whose timestamp is older than ts_recent
 	 * goes through rack_ts_check(), which tells us whether to stop.
 	 */
 	if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
 	    TSTMP_LT(to->to_tsval, tp->ts_recent) &&
 	    rack_ts_check(m, th, tp, tlen, thflags, &rv))
 		return (rv);
 	if (rack_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &rv))
 		return (rv);
 	/*
 	 * Record the peer's timestamp when the last ACK we sent falls
 	 * within this segment's sequence space.  The test follows the
 	 * tcplw@cray.com proposal (Braden 1993/04/26), is predicated only
 	 * on sequence space so it does not interfere with the PAWS test
 	 * above, and uses Last.ACK.Sent <= SEG.SEQ + SEG.Len rather than
 	 * RFC1323's strict "<" (see Stevens, TCP/IP Illustrated Vol. 2,
 	 * p.869) so the RTT can still be computed when
 	 * RCV.NXT == Last.ACK.Sent.
 	 */
 	seg_end = th->th_seq + tlen + ((thflags & (TH_SYN | TH_FIN)) != 0);
 	if ((to->to_flags & TOF_TS) != 0 &&
 	    SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
 	    SEQ_LEQ(tp->last_ack_sent, seg_end)) {
 		tp->ts_recent = to->to_tsval;
 		tp->ts_recent_age = tcp_ts_getticks();
 	}
 	/*
 	 * No ACK bit: with TF_NEEDSYN set the data is still processed,
 	 * with TF_ACKNOW set the segment is dropped after an ACK, and in
 	 * every other case it is silently dropped.
 	 */
 	if ((thflags & TH_ACK) == 0) {
 		if (tp->t_flags & TF_NEEDSYN)
 			return (rack_process_data(m, th, so, tp, drop_hdrlen,
 			    tlen, tiwin, thflags, nxt_pkt));
 		if (tp->t_flags & TF_ACKNOW) {
 			rack_do_dropafterack(m, tp, th, thflags, tlen, &rv);
 			return (rv);
 		}
 		rack_do_drop(m, NULL);
 		return (0);
 	}
 	/* ACK processing; no FIN-acked indication is requested here. */
 	if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, NULL, thflags, &rv))
 		return (rv);
 	/* With data still queued to send, enforce the progress timeout. */
 	if (sbavail(&so->so_snd) && rack_progress_timeout_check(tp)) {
 		tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
 		rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
 		return (1);
 	}
 	/* State changes only happen in rack_process_data() */
 	return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
 	    tiwin, thflags, nxt_pkt));
 }
 
 /*
  * Input handler for a connection in the CLOSE_WAIT state.
  *
  * Returns 1 when the TCB has been unlocked and is most likely gone,
  * and 0 when the TCB is still locked.
  */
 static int
 rack_do_close_wait(struct mbuf *m, struct tcphdr *th, struct socket *so,
     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt)
 {
 	int32_t rv = 0;
 	uint32_t seg_end;
 
 	rack_calc_rwin(so, tp);
 	/* A reset is handed off before anything else is looked at. */
 	if (thflags & TH_RST)
 		return (rack_process_rst(m, th, so, tp));
 	/*
 	 * RFC5961 Section 4.2: any SYN seen in a synchronized state is
 	 * answered with a challenge ACK.
 	 */
 	if (thflags & TH_SYN) {
 		rack_challenge_ack(m, th, tp, &rv);
 		return (rv);
 	}
 	/*
 	 * RFC 1323 PAWS: a segment whose timestamp is older than ts_recent
 	 * goes through rack_ts_check(), which tells us whether to stop.
 	 */
 	if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
 	    TSTMP_LT(to->to_tsval, tp->ts_recent) &&
 	    rack_ts_check(m, th, tp, tlen, thflags, &rv))
 		return (rv);
 	if (rack_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &rv))
 		return (rv);
 	/*
 	 * Record the peer's timestamp when the last ACK we sent falls
 	 * within this segment's sequence space.  The test follows the
 	 * tcplw@cray.com proposal (Braden 1993/04/26), is predicated only
 	 * on sequence space so it does not interfere with the PAWS test
 	 * above, and uses Last.ACK.Sent <= SEG.SEQ + SEG.Len rather than
 	 * RFC1323's strict "<" (see Stevens, TCP/IP Illustrated Vol. 2,
 	 * p.869) so the RTT can still be computed when
 	 * RCV.NXT == Last.ACK.Sent.
 	 */
 	seg_end = th->th_seq + tlen + ((thflags & (TH_SYN | TH_FIN)) != 0);
 	if ((to->to_flags & TOF_TS) != 0 &&
 	    SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
 	    SEQ_LEQ(tp->last_ack_sent, seg_end)) {
 		tp->ts_recent = to->to_tsval;
 		tp->ts_recent_age = tcp_ts_getticks();
 	}
 	/*
 	 * No ACK bit: with TF_NEEDSYN set the data is still processed,
 	 * with TF_ACKNOW set the segment is dropped after an ACK, and in
 	 * every other case it is silently dropped.
 	 */
 	if ((thflags & TH_ACK) == 0) {
 		if (tp->t_flags & TF_NEEDSYN)
 			return (rack_process_data(m, th, so, tp, drop_hdrlen,
 			    tlen, tiwin, thflags, nxt_pkt));
 		if (tp->t_flags & TF_ACKNOW) {
 			rack_do_dropafterack(m, tp, th, thflags, tlen, &rv);
 			return (rv);
 		}
 		rack_do_drop(m, NULL);
 		return (0);
 	}
 	/* ACK processing; no FIN-acked indication is requested here. */
 	if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, NULL, thflags, &rv))
 		return (rv);
 	/* With data still queued to send, enforce the progress timeout. */
 	if (sbavail(&so->so_snd) && rack_progress_timeout_check(tp)) {
 		tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
 		rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
 		return (1);
 	}
 	return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
 	    tiwin, thflags, nxt_pkt));
 }
 
 /*
  * Decide how to treat a data-bearing segment on a connection whose
  * socket the callers have found to be marked SS_NOFDREF.
  *
  * If the connection does not allow data after close
  * (rc_allow_data_af_clo == 0), or there is nothing left in the send
  * buffer, the connection is closed, tcps_rcvafterclose is counted, the
  * segment is dropped with a reset and 1 is returned.
  *
  * Otherwise the data is accepted but ignored: rcv_nxt is moved past the
  * segment, TF2_DROP_AF_DATA is set, output is requested, *tlen is set
  * to 0 and 0 is returned so the caller keeps processing the segment.
  */
 static int
-rack_check_data_after_close(struct mbuf *m, 
+rack_check_data_after_close(struct mbuf *m,
     struct tcpcb *tp, int32_t *tlen, struct tcphdr *th, struct socket *so)
 {
 	struct tcp_rack *rack;
 
 	INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
 	rack = (struct tcp_rack *)tp->t_fb_ptr;
 	if (rack->rc_allow_data_af_clo == 0) {
 	close_now:
 		/* The value returned by tcp_close() is what gets passed on. */
 		tp = tcp_close(tp);
 		TCPSTAT_INC(tcps_rcvafterclose);
 		rack_do_dropwithreset(m, tp, th, BANDLIM_UNLIMITED, (*tlen));
 		return (1);
 	}
 	/* Nothing left to send: take the same close path as above. */
 	if (sbavail(&so->so_snd) == 0)
 		goto close_now;
 	/* Ok we allow data that is ignored and a followup reset */
 	tp->rcv_nxt = th->th_seq + *tlen;
 	tp->t_flags2 |= TF2_DROP_AF_DATA;
 	rack->r_wanted_output = 1;
 	*tlen = 0;
 	return (0);
 }
 
 /*
  * Return value of 1, the TCB is unlocked and most
  * likely gone, return value of 0, the TCP is still
  * locked.
  */
 static int
 rack_do_fin_wait_1(struct mbuf *m, struct tcphdr *th, struct socket *so,
     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt)
 {
 	int32_t ret_val = 0;
 	int32_t ourfinisacked = 0;
 
 	rack_calc_rwin(so, tp);
 
 	if (thflags & TH_RST)
 		return (rack_process_rst(m, th, so, tp));
 	/*
 	 * RFC5961 Section 4.2 Send challenge ACK for any SYN in
 	 * synchronized state.
 	 */
 	if (thflags & TH_SYN) {
 		rack_challenge_ack(m, th, tp, &ret_val);
 		return (ret_val);
 	}
 	/*
 	 * RFC 1323 PAWS: If we have a timestamp reply on this segment and
 	 * it's less than ts_recent, drop it.
 	 */
 	if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
 	    TSTMP_LT(to->to_tsval, tp->ts_recent)) {
 		if (rack_ts_check(m, th, tp, tlen, thflags, &ret_val))
 			return (ret_val);
 	}
 	if (rack_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
 		return (ret_val);
 	}
 	/*
 	 * If new data are received on a connection after the user processes
 	 * are gone, then RST the other end.
 	 */
 	if ((so->so_state & SS_NOFDREF) && tlen) {
 		if (rack_check_data_after_close(m, tp, &tlen, th, so))
 			return (1);
 	}
 	/*
 	 * If last ACK falls within this segment's sequence numbers, record
 	 * its timestamp. NOTE: 1) That the test incorporates suggestions
 	 * from the latest proposal of the tcplw@cray.com list (Braden
 	 * 1993/04/26). 2) That updating only on newer timestamps interferes
 	 * with our earlier PAWS tests, so this check should be solely
 	 * predicated on the sequence space of this segment. 3) That we
 	 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
 	 * + SEG.Len  instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
 	 * SEG.Len, This modified check allows us to overcome RFC1323's
 	 * limitations as described in Stevens TCP/IP Illustrated Vol. 2
 	 * p.869. In such cases, we can still calculate the RTT correctly
 	 * when RCV.NXT == Last.ACK.Sent.
 	 */
 	if ((to->to_flags & TOF_TS) != 0 &&
 	    SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
 	    SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
 	    ((thflags & (TH_SYN | TH_FIN)) != 0))) {
 		tp->ts_recent_age = tcp_ts_getticks();
 		tp->ts_recent = to->to_tsval;
 	}
 	/*
 	 * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN flag
 	 * is on (half-synchronized state), then queue data for later
 	 * processing; else drop segment and return.
 	 */
 	if ((thflags & TH_ACK) == 0) {
 		if (tp->t_flags & TF_NEEDSYN) {
 			return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
 			    tiwin, thflags, nxt_pkt));
 		} else if (tp->t_flags & TF_ACKNOW) {
 			rack_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
 			return (ret_val);
 		} else {
 			rack_do_drop(m, NULL);
 			return (0);
 		}
 	}
 	/*
 	 * Ack processing.
 	 */
 	if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) {
 		return (ret_val);
 	}
 	if (ourfinisacked) {
 		/*
 		 * If we can't receive any more data, then closing user can
 		 * proceed. Starting the timer is contrary to the
 		 * specification, but if we don't get a FIN we'll hang
 		 * forever.
 		 *
 		 * XXXjl: we should release the tp also, and use a
 		 * compressed state.
 		 */
 		if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
 			soisdisconnected(so);
 			tcp_timer_activate(tp, TT_2MSL,
 			    (tcp_fast_finwait2_recycle ?
 			    tcp_finwait2_timeout :
 			    TP_MAXIDLE(tp)));
 		}
 		tcp_state_change(tp, TCPS_FIN_WAIT_2);
 	}
 	if (sbavail(&so->so_snd)) {
 		if (rack_progress_timeout_check(tp)) {
 			tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
 			rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
 			return (1);
 		}
 	}
 	return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
 	    tiwin, thflags, nxt_pkt));
 }
 
 /*
  * Return value of 1, the TCB is unlocked and most
  * likely gone, return value of 0, the TCP is still
  * locked.
  */
 static int
 rack_do_closing(struct mbuf *m, struct tcphdr *th, struct socket *so,
     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt)
 {
 	int32_t ret_val = 0;
 	int32_t ourfinisacked = 0;
 
 	rack_calc_rwin(so, tp);
 
 	if (thflags & TH_RST)
 		return (rack_process_rst(m, th, so, tp));
 	/*
 	 * RFC5961 Section 4.2 Send challenge ACK for any SYN in
 	 * synchronized state.
 	 */
 	if (thflags & TH_SYN) {
 		rack_challenge_ack(m, th, tp, &ret_val);
 		return (ret_val);
 	}
 	/*
 	 * RFC 1323 PAWS: If we have a timestamp reply on this segment and
 	 * it's less than ts_recent, drop it.
 	 */
 	if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
 	    TSTMP_LT(to->to_tsval, tp->ts_recent)) {
 		if (rack_ts_check(m, th, tp, tlen, thflags, &ret_val))
 			return (ret_val);
 	}
 	if (rack_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
 		return (ret_val);
 	}
 	/*
 	 * If new data are received on a connection after the user processes
 	 * are gone, then RST the other end.
 	 */
 	if ((so->so_state & SS_NOFDREF) && tlen) {
 		if (rack_check_data_after_close(m, tp, &tlen, th, so))
 			return (1);
 	}
 	/*
 	 * If last ACK falls within this segment's sequence numbers, record
 	 * its timestamp. NOTE: 1) That the test incorporates suggestions
 	 * from the latest proposal of the tcplw@cray.com list (Braden
 	 * 1993/04/26). 2) That updating only on newer timestamps interferes
 	 * with our earlier PAWS tests, so this check should be solely
 	 * predicated on the sequence space of this segment. 3) That we
 	 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
 	 * + SEG.Len  instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
 	 * SEG.Len, This modified check allows us to overcome RFC1323's
 	 * limitations as described in Stevens TCP/IP Illustrated Vol. 2
 	 * p.869. In such cases, we can still calculate the RTT correctly
 	 * when RCV.NXT == Last.ACK.Sent.
 	 */
 	if ((to->to_flags & TOF_TS) != 0 &&
 	    SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
 	    SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
 	    ((thflags & (TH_SYN | TH_FIN)) != 0))) {
 		tp->ts_recent_age = tcp_ts_getticks();
 		tp->ts_recent = to->to_tsval;
 	}
 	/*
 	 * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN flag
 	 * is on (half-synchronized state), then queue data for later
 	 * processing; else drop segment and return.
 	 */
 	if ((thflags & TH_ACK) == 0) {
 		if (tp->t_flags & TF_NEEDSYN) {
 			return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
 			    tiwin, thflags, nxt_pkt));
 		} else if (tp->t_flags & TF_ACKNOW) {
 			rack_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
 			return (ret_val);
 		} else {
 			rack_do_drop(m, NULL);
 			return (0);
 		}
 	}
 	/*
 	 * Ack processing.
 	 */
 	if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) {
 		return (ret_val);
 	}
 	if (ourfinisacked) {
 		INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
 		tcp_twstart(tp);
 		m_freem(m);
 		return (1);
 	}
 	if (sbavail(&so->so_snd)) {
 		if (rack_progress_timeout_check(tp)) {
 			tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
 			rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
 			return (1);
 		}
 	}
 	return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
 	    tiwin, thflags, nxt_pkt));
 }
 
 /*
  * Input handler for a connection in the LAST_ACK state.
  *
  * Returns 1 when the TCB has been unlocked and is most likely gone,
  * and 0 when the TCB is still locked.
  */
 static int
 rack_do_lastack(struct mbuf *m, struct tcphdr *th, struct socket *so,
     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt)
 {
 	int32_t rv = 0;
 	int32_t fin_acked = 0;
 	uint32_t seg_end;
 
 	rack_calc_rwin(so, tp);
 
 	/* A reset is handed off before anything else is looked at. */
 	if (thflags & TH_RST)
 		return (rack_process_rst(m, th, so, tp));
 	/*
 	 * RFC5961 Section 4.2: any SYN seen in a synchronized state is
 	 * answered with a challenge ACK.
 	 */
 	if (thflags & TH_SYN) {
 		rack_challenge_ack(m, th, tp, &rv);
 		return (rv);
 	}
 	/*
 	 * RFC 1323 PAWS: a segment whose timestamp is older than ts_recent
 	 * goes through rack_ts_check(), which tells us whether to stop.
 	 */
 	if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
 	    TSTMP_LT(to->to_tsval, tp->ts_recent) &&
 	    rack_ts_check(m, th, tp, tlen, thflags, &rv))
 		return (rv);
 	if (rack_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &rv))
 		return (rv);
 	/*
 	 * New data on a connection whose user processes are gone: let
 	 * rack_check_data_after_close() decide whether to RST the peer.
 	 */
 	if ((so->so_state & SS_NOFDREF) && tlen &&
 	    rack_check_data_after_close(m, tp, &tlen, th, so))
 		return (1);
 	/*
 	 * Record the peer's timestamp when the last ACK we sent falls
 	 * within this segment's sequence space.  The test follows the
 	 * tcplw@cray.com proposal (Braden 1993/04/26), is predicated only
 	 * on sequence space so it does not interfere with the PAWS test
 	 * above, and uses Last.ACK.Sent <= SEG.SEQ + SEG.Len rather than
 	 * RFC1323's strict "<" (see Stevens, TCP/IP Illustrated Vol. 2,
 	 * p.869) so the RTT can still be computed when
 	 * RCV.NXT == Last.ACK.Sent.
 	 */
 	seg_end = th->th_seq + tlen + ((thflags & (TH_SYN | TH_FIN)) != 0);
 	if ((to->to_flags & TOF_TS) != 0 &&
 	    SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
 	    SEQ_LEQ(tp->last_ack_sent, seg_end)) {
 		tp->ts_recent = to->to_tsval;
 		tp->ts_recent_age = tcp_ts_getticks();
 	}
 	/*
 	 * No ACK bit: with TF_NEEDSYN set the data is still processed,
 	 * with TF_ACKNOW set the segment is dropped after an ACK, and in
 	 * every other case it is silently dropped.
 	 */
 	if ((thflags & TH_ACK) == 0) {
 		if (tp->t_flags & TF_NEEDSYN)
 			return (rack_process_data(m, th, so, tp, drop_hdrlen,
 			    tlen, tiwin, thflags, nxt_pkt));
 		if (tp->t_flags & TF_ACKNOW) {
 			rack_do_dropafterack(m, tp, th, thflags, tlen, &rv);
 			return (rv);
 		}
 		rack_do_drop(m, NULL);
 		return (0);
 	}
 	/*
 	 * TCPS_LAST_ACK ACK processing; also learn whether our FIN has
 	 * been acked.
 	 */
 	if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &fin_acked, thflags, &rv))
 		return (rv);
 	if (fin_acked) {
 		/* Our FIN is acked: close the connection, drop the segment. */
 		INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
 		tp = tcp_close(tp);
 		rack_do_drop(m, tp);
 		return (1);
 	}
 	/* With data still queued to send, enforce the progress timeout. */
 	if (sbavail(&so->so_snd) && rack_progress_timeout_check(tp)) {
 		tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
 		rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
 		return (1);
 	}
 	return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
 	    tiwin, thflags, nxt_pkt));
 }
 
 
 /*
  * Return value of 1, the TCB is unlocked and most
  * likely gone, return value of 0, the TCP is still
  * locked.
  */
 static int
 rack_do_fin_wait_2(struct mbuf *m, struct tcphdr *th, struct socket *so,
     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt)
 {
 	int32_t ret_val = 0;
 	int32_t ourfinisacked = 0;
 
 	rack_calc_rwin(so, tp);
 
 	/* Reset receive buffer auto scaling when not in bulk receive mode. */
 	if (thflags & TH_RST)
 		return (rack_process_rst(m, th, so, tp));
 	/*
 	 * RFC5961 Section 4.2 Send challenge ACK for any SYN in
 	 * synchronized state.
 	 */
 	if (thflags & TH_SYN) {
 		rack_challenge_ack(m, th, tp, &ret_val);
 		return (ret_val);
 	}
 	/*
 	 * RFC 1323 PAWS: If we have a timestamp reply on this segment and
 	 * it's less than ts_recent, drop it.
 	 */
 	if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
 	    TSTMP_LT(to->to_tsval, tp->ts_recent)) {
 		if (rack_ts_check(m, th, tp, tlen, thflags, &ret_val))
 			return (ret_val);
 	}
 	if (rack_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
 		return (ret_val);
 	}
 	/*
 	 * If new data are received on a connection after the user processes
 	 * are gone, then RST the other end.
 	 */
 	if ((so->so_state & SS_NOFDREF) &&
 	    tlen) {
 		if (rack_check_data_after_close(m, tp, &tlen, th, so))
 			return (1);
 	}
 	/*
 	 * If last ACK falls within this segment's sequence numbers, record
 	 * its timestamp. NOTE: 1) That the test incorporates suggestions
 	 * from the latest proposal of the tcplw@cray.com list (Braden
 	 * 1993/04/26). 2) That updating only on newer timestamps interferes
 	 * with our earlier PAWS tests, so this check should be solely
 	 * predicated on the sequence space of this segment. 3) That we
 	 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
 	 * + SEG.Len  instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
 	 * SEG.Len, This modified check allows us to overcome RFC1323's
 	 * limitations as described in Stevens TCP/IP Illustrated Vol. 2
 	 * p.869. In such cases, we can still calculate the RTT correctly
 	 * when RCV.NXT == Last.ACK.Sent.
 	 */
 	if ((to->to_flags & TOF_TS) != 0 &&
 	    SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
 	    SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
 	    ((thflags & (TH_SYN | TH_FIN)) != 0))) {
 		tp->ts_recent_age = tcp_ts_getticks();
 		tp->ts_recent = to->to_tsval;
 	}
 	/*
 	 * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN flag
 	 * is on (half-synchronized state), then queue data for later
 	 * processing; else drop segment and return.
 	 */
 	if ((thflags & TH_ACK) == 0) {
 		if (tp->t_flags & TF_NEEDSYN) {
 			return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
 			    tiwin, thflags, nxt_pkt));
 		} else if (tp->t_flags & TF_ACKNOW) {
 			rack_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
 			return (ret_val);
 		} else {
 			rack_do_drop(m, NULL);
 			return (0);
 		}
 	}
 	/*
 	 * Ack processing.
 	 */
 	if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) {
 		return (ret_val);
 	}
 	if (sbavail(&so->so_snd)) {
 		if (rack_progress_timeout_check(tp)) {
 			tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
 			rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
 			return (1);
 		}
 	}
 	return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
 	    tiwin, thflags, nxt_pkt));
 }
 
 
 /*
  * Reset the connection's RTT rate-sample accumulator: zero the running
  * total and the sample count and mark the sample as RACK_RTT_EMPTY.
  */
 static inline void
 rack_clear_rate_sample(struct tcp_rack *rack)
 {
 	rack->r_ctl.rack_rs.rs_rtt_tot = 0;
 	rack->r_ctl.rack_rs.rs_rtt_cnt = 0;
 	rack->r_ctl.rack_rs.rs_flags = RACK_RTT_EMPTY;
 }
 
 /*
  * Set up the RACK per-connection state for tp.
  *
  * A struct tcp_rack is allocated from rack_pcb_zone (M_NOWAIT), zeroed
  * and hung off tp->t_fb_ptr; its send-map lists are initialised and its
  * control fields are seeded from the module-level rack_* settings.  If
  * the connection already has unacknowledged data (snd_una != snd_max) a
  * single send-map entry covering [snd_una, snd_max) is created so that
  * data is tracked.  Finally the pacer timer is started.
  *
  * Returns 0 on success, or ENOMEM if either allocation fails (in which
  * case tp->t_fb_ptr is left NULL).
  */
 static int
 rack_init(struct tcpcb *tp)
 {
 	struct tcp_rack *rack = NULL;
 
 	tp->t_fb_ptr = uma_zalloc(rack_pcb_zone, M_NOWAIT);
 	if (tp->t_fb_ptr == NULL) {
 		/*
 		 * We need to allocate memory but can't. The INP and
 		 * INP_INFO locks are held and they are recursive (this
 		 * happens during setup), so a scheme to drop the locks
 		 * fails :(
 		 *
 		 */
 		return (ENOMEM);
 	}
 	memset(tp->t_fb_ptr, 0, sizeof(struct tcp_rack));
 
 	rack = (struct tcp_rack *)tp->t_fb_ptr;
 	TAILQ_INIT(&rack->r_ctl.rc_map);
 	TAILQ_INIT(&rack->r_ctl.rc_free);
 	TAILQ_INIT(&rack->r_ctl.rc_tmap);
 	rack->rc_tp = tp;
 	if (tp->t_inpcb) {
 		rack->rc_inp = tp->t_inpcb;
 	}
 	/* Probably not needed but lets be sure */
 	rack_clear_rate_sample(rack);
 	rack->r_cpu = 0;
 	/* Copy the current module-level settings into this connection. */
 	rack->r_ctl.rc_reorder_fade = rack_reorder_fade;
 	rack->rc_allow_data_af_clo = rack_ignore_data_after_close;
 	rack->r_ctl.rc_tlp_threshold = rack_tlp_thresh;
 	rack->rc_pace_reduce = rack_slot_reduction;
 	if (V_tcp_delack_enabled)
 		tp->t_delayed_ack = 1;
 	else
 		tp->t_delayed_ack = 0;
 	rack->rc_pace_max_segs = rack_hptsi_segments;
 	rack->r_ctl.rc_early_recovery_segs = rack_early_recovery_max_seg;
 	rack->r_ctl.rc_reorder_shift = rack_reorder_thresh;
 	rack->r_ctl.rc_pkt_delay = rack_pkt_delay;
 	rack->r_ctl.rc_prop_reduce = rack_use_proportional_reduce;
 	rack->r_idle_reduce_largest  = rack_reduce_largest_on_idle;
 	rack->r_enforce_min_pace = rack_min_pace_time;
 	rack->r_min_pace_seg_thresh = rack_min_pace_time_seg_req;
 	rack->r_ctl.rc_prop_rate = rack_proportional_rate;
 	rack->r_ctl.rc_tlp_cwnd_reduce = rack_lower_cwnd_at_tlp;
 	rack->r_ctl.rc_early_recovery = rack_early_recovery;
 	rack->rc_always_pace = rack_pace_every_seg;
 	rack->r_ctl.rc_rate_sample_method = rack_rate_sample_method;
 	rack->rack_tlp_threshold_use = rack_tlp_threshold_use;
 	rack->r_ctl.rc_prr_sendalot = rack_send_a_lot_in_prr;
 	rack->r_ctl.rc_min_to = rack_min_to;
 	rack->r_ctl.rc_prr_inc_var = rack_inc_var;
-	rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), __LINE__, 0, 0, 0);
 	if (tp->snd_una != tp->snd_max) {
 		/* Create a send map for the current outstanding data */
 		struct rack_sendmap *rsm;
 
 		rsm = rack_alloc(rack);
 		if (rsm == NULL) {
 			/* Undo the pcb allocation before reporting failure. */
 			uma_zfree(rack_pcb_zone, tp->t_fb_ptr);
 			tp->t_fb_ptr = NULL;
 			return (ENOMEM);
 		}
 		rsm->r_flags = RACK_OVERMAX;
 		rsm->r_tim_lastsent[0] = tcp_ts_getticks();
 		rsm->r_rtr_cnt = 1;
 		rsm->r_rtr_bytes = 0;
 		rsm->r_start = tp->snd_una;
 		rsm->r_end = tp->snd_max;
 		rsm->r_sndcnt = 0;
 		/* The entry goes on both the sequence map and the time map. */
 		TAILQ_INSERT_TAIL(&rack->r_ctl.rc_map, rsm, r_next);
 		TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext);
 		rsm->r_in_tmap = 1;
 	}
+	rack_stop_all_timers(tp);
+	rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), __LINE__, 0, 0, 0);
 	return (0);
 }
 
 /*
  * Report whether the connection tp may be handed to the RACK stack.
  *
  * Returns 0 when the handoff is acceptable, EAGAIN when the connection
  * has not yet reached a state where that can be decided, and EINVAL
  * when the connection does not use SACK and so can never run RACK.
  */
 static int
 rack_handoff_ok(struct tcpcb *tp)
 {
 	switch (tp->t_state) {
 	case TCPS_CLOSED:
 	case TCPS_LISTEN:
 		/* Sure no problem though it may not stick */
 		return (0);
 	case TCPS_SYN_SENT:
 	case TCPS_SYN_RECEIVED:
 		/*
 		 * We really don't know; you have to get to ESTAB or
 		 * beyond to tell.
 		 */
 		return (EAGAIN);
 	default:
 		break;
 	}
 	/*
 	 * From here on the answer depends only on SACK: without it on
 	 * this connection we can never do rack.
 	 */
 	return ((tp->t_flags & TF_SACK_PERMIT) ? 0 : EINVAL);
 }
 
 /*
  * Tear down the RACK per-connection state attached to tp.
  *
  * If tp->t_fb_ptr is set, every send-map entry on the rc_map and
  * rc_free lists is returned to rack_zone, the tcp_rack structure is
  * returned to rack_pcb_zone and tp->t_fb_ptr is cleared.  In all cases
  * snd_nxt is set to snd_max on the way out.  The tcb_is_purged
  * argument is not used by this function.
  */
 static void
 rack_fini(struct tcpcb *tp, int32_t tcb_is_purged)
 {
 	if (tp->t_fb_ptr) {
 		struct tcp_rack *rack;
 		struct rack_sendmap *rsm;
 
 		rack = (struct tcp_rack *)tp->t_fb_ptr;
 #ifdef TCP_BLACKBOX
 		tcp_log_flowend(tp);
 #endif
 		/* Drain the in-use send map. */
 		rsm = TAILQ_FIRST(&rack->r_ctl.rc_map);
 		while (rsm) {
 			TAILQ_REMOVE(&rack->r_ctl.rc_map, rsm, r_next);
 			uma_zfree(rack_zone, rsm);
 			rsm = TAILQ_FIRST(&rack->r_ctl.rc_map);
 		}
 		/* Drain the list of cached free entries. */
 		rsm = TAILQ_FIRST(&rack->r_ctl.rc_free);
 		while (rsm) {
 			TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_next);
 			uma_zfree(rack_zone, rsm);
 			rsm = TAILQ_FIRST(&rack->r_ctl.rc_free);
 		}
 		rack->rc_free_cnt = 0;
 		uma_zfree(rack_pcb_zone, tp->t_fb_ptr);
 		tp->t_fb_ptr = NULL;
 	}
+	/* Make sure snd_nxt is correctly set */
+	tp->snd_nxt = tp->snd_max;
 }
 
 /*
  * Synchronise the RACK state machine with tp->t_state.
  *
  * For each connection state that has a dedicated input handler, the
  * state is recorded in rack->r_state and the matching rack_do_*()
  * function is installed in rack->r_substate.  For LISTEN, CLOSED,
  * TIME_WAIT and any unrecognised state, r_state and r_substate are
  * left untouched.
  */
 static void
 rack_set_state(struct tcpcb *tp, struct tcp_rack *rack)
 {
 	switch (tp->t_state) {
 	case TCPS_SYN_SENT:
 		rack->r_state = TCPS_SYN_SENT;
 		rack->r_substate = rack_do_syn_sent;
 		break;
 	case TCPS_SYN_RECEIVED:
 		rack->r_state = TCPS_SYN_RECEIVED;
 		rack->r_substate = rack_do_syn_recv;
 		break;
 	case TCPS_ESTABLISHED:
 		rack->r_state = TCPS_ESTABLISHED;
 		rack->r_substate = rack_do_established;
 		break;
 	case TCPS_CLOSE_WAIT:
 		rack->r_state = TCPS_CLOSE_WAIT;
 		rack->r_substate = rack_do_close_wait;
 		break;
 	case TCPS_FIN_WAIT_1:
 		rack->r_state = TCPS_FIN_WAIT_1;
 		rack->r_substate = rack_do_fin_wait_1;
 		break;
 	case TCPS_CLOSING:
 		rack->r_state = TCPS_CLOSING;
 		rack->r_substate = rack_do_closing;
 		break;
 	case TCPS_LAST_ACK:
 		rack->r_state = TCPS_LAST_ACK;
 		rack->r_substate = rack_do_lastack;
 		break;
 	case TCPS_FIN_WAIT_2:
 		rack->r_state = TCPS_FIN_WAIT_2;
 		rack->r_substate = rack_do_fin_wait_2;
 		break;
 	/* States without a handler of their own. */
 	case TCPS_LISTEN:
 	case TCPS_CLOSED:
 	case TCPS_TIME_WAIT:
 	default:
-#ifdef INVARIANTS
-		panic("tcp tp:%p state:%d sees impossible state?", tp, tp->t_state);
-#endif
 		break;
 	};
 }
 
 
 /*
  * Check that the pacer timer currently armed for this connection is of
  * a type that fits the connection's present condition.
  *
  * The armed type is taken from rc_hpts_flags & PACE_TMR_MASK.  Each of
  * the tests below returns early when that type is acceptable; if none
  * of them matches, the timer is cancelled and restarted.  The sb
  * argument is not used by this function.
  */
 static void
 rack_timer_audit(struct tcpcb *tp, struct tcp_rack *rack, struct sockbuf *sb)
 {
 	/*
 	 * We received an ack, and then did not
 	 * call send or were bounced out due to the
 	 * hpts was running. Now a timer is up as well, is
 	 * it the right timer?
 	 */
 	struct rack_sendmap *rsm;
 	int tmr_up;
-	
+
 	tmr_up = rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK;
 	/* In persist mode with the persist timer armed: nothing to fix. */
 	if (rack->rc_in_persist && (tmr_up == PACE_TMR_PERSIT))
 		return;
 	/* First entry on the time-ordered send map, if any. */
 	rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
 	if (((rsm == NULL) || (tp->t_state < TCPS_ESTABLISHED)) &&
 	    (tmr_up == PACE_TMR_RXT)) {
 		/* Should be an RXT */
 		return;
 	}
 	if (rsm == NULL) {
 		/* Nothing outstanding? */
 		if (tp->t_flags & TF_DELACK) {
 			if (tmr_up == PACE_TMR_DELACK)
 				/* We are supposed to have delayed ack up and we do */
 				return;
 		} else if (sbavail(&tp->t_inpcb->inp_socket->so_snd) && (tmr_up == PACE_TMR_RXT)) {
-			/* 
+			/*
 			 * if we hit enobufs then we would expect the possibility
 			 * of nothing outstanding and the RXT up (and the hptsi timer).
 			 */
 			return;
 		} else if (((V_tcp_always_keepalive ||
 			     rack->rc_inp->inp_socket->so_options & SO_KEEPALIVE) &&
 			    (tp->t_state <= TCPS_CLOSING)) &&
 			   (tmr_up == PACE_TMR_KEEP) &&
 			   (tp->snd_max == tp->snd_una)) {
 			/* We should have keep alive up and we do */
 			return;
 		}
 	}
 	if (rsm && (rsm->r_flags & RACK_SACK_PASSED)) {
 		/* The oldest entry has been passed by a SACK. */
 		if ((tp->t_flags & TF_SENTFIN) &&
 		    ((tp->snd_max - tp->snd_una) == 1) &&
 		    (rsm->r_flags & RACK_HAS_FIN)) {
 			/* needs to be a RXT */
 			if (tmr_up == PACE_TMR_RXT)
 				return;
 		} else if (tmr_up == PACE_TMR_RACK)
 			return;
 	} else if (SEQ_GT(tp->snd_max,tp->snd_una) &&
 		   ((tmr_up == PACE_TMR_TLP) ||
 		    (tmr_up == PACE_TMR_RXT))) {
-		/* 
-		 * Either a TLP or RXT is fine if no sack-passed 
+		/*
+		 * Either a TLP or RXT is fine if no sack-passed
 		 * is in place and data is outstanding.
 		 */
 		return;
 	} else if (tmr_up == PACE_TMR_DELACK) {
 		/*
 		 * If the delayed ack was going to go off
 		 * before the rtx/tlp/rack timer were going to
 		 * expire, then that would be the timer in control.
 		 * Note we don't check the time here trusting the
 		 * code is correct.
 		 */
 		return;
 	}
-	/* 
+	/*
 	 * Ok the timer originally started is not what we want now.
 	 * We will force the hpts to be stopped if any, and restart
 	 * with the slot set to what was in the saved slot.
 	 */
 	rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
 	rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), __LINE__, 0, 0, 0);
 }
 
 static void
 rack_hpts_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
     struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, uint8_t iptos,
     int32_t nxt_pkt, struct timeval *tv)
 {
 	int32_t thflags, retval, did_out = 0;
 	int32_t way_out = 0;
 	uint32_t cts;
 	uint32_t tiwin;
 	struct tcpopt to;
 	struct tcp_rack *rack;
 	struct rack_sendmap *rsm;
 	int32_t prev_state = 0;
 
 	cts = tcp_tv_to_mssectick(tv);
 	rack = (struct tcp_rack *)tp->t_fb_ptr;
 
 	kern_prefetch(rack, &prev_state);
 	prev_state = 0;
 	thflags = th->th_flags;
 	/*
 	 * If this is either a state-changing packet or current state isn't
 	 * established, we require a read lock on tcbinfo.  Otherwise, we
 	 * allow the tcbinfo to be in either locked or unlocked, as the
 	 * caller may have unnecessarily acquired a lock due to a race.
 	 */
-	if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0 ||
-	    tp->t_state != TCPS_ESTABLISHED) {
-		INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
-	}
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 	KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN",
 	    __func__));
 	KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT",
 	    __func__));
 	{
 		union tcp_log_stackspecific log;
 
 		memset(&log.u_bbr, 0, sizeof(log.u_bbr));
 		log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
 		log.u_bbr.ininput = rack->rc_inp->inp_in_input;
+		log.u_bbr.flex2 = rack->r_ctl.rc_num_maps_alloced;
 		TCP_LOG_EVENT(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_IN, 0,
 		    tlen, &log, true);
 	}
-	if ((thflags & TH_SYN) && (thflags & TH_FIN) && V_drop_synfin) {
-		way_out = 4;
-		goto done_with_input;
-	}
 	/*
-	 * If a segment with the ACK-bit set arrives in the SYN-SENT state
-	 * check SEQ.ACK first as described on page 66 of RFC 793, section 3.9.
-	 */
-	if ((tp->t_state == TCPS_SYN_SENT) && (thflags & TH_ACK) &&
-	    (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) {
-		rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
-		return;
-	}
-	/*
 	 * Segment received on connection. Reset idle time and keep-alive
 	 * timer. XXX: This should be done after segment validation to
 	 * ignore broken/spoofed segs.
 	 */
 	if  (tp->t_idle_reduce && (tp->snd_max == tp->snd_una)) {
-#ifdef NETFLIX_CWV
-		if ((tp->cwv_enabled) &&
-		    ((tp->cwv_cwnd_valid == 0) &&
-		     TCPS_HAVEESTABLISHED(tp->t_state) &&
-		     (tp->snd_cwnd > tp->snd_cwv.init_cwnd))) {
-			tcp_newcwv_nvp_closedown(tp);
-		} else 
-#endif
-		       if ((ticks - tp->t_rcvtime) >= tp->t_rxtcur) {
+		if ((ticks - tp->t_rcvtime) >= tp->t_rxtcur) {
 			counter_u64_add(rack_input_idle_reduces, 1);
 			rack_cc_after_idle(tp,
 			    (rack->r_idle_reduce_largest ? 1 :0));
 		}
 	}
 	rack->r_ctl.rc_rcvtime = cts;
 	tp->t_rcvtime = ticks;
 
-#ifdef NETFLIX_CWV
-	if (tp->cwv_enabled) {
-		if ((tp->cwv_cwnd_valid == 0) &&
-		    TCPS_HAVEESTABLISHED(tp->t_state) &&
-		    (tp->snd_cwnd > tp->snd_cwv.init_cwnd))
-			tcp_newcwv_nvp_closedown(tp);
-	}
-#endif
 	/*
 	 * Unscale the window into a 32-bit value. For the SYN_SENT state
 	 * the scale is zero.
 	 */
 	tiwin = th->th_win << tp->snd_scale;
 #ifdef NETFLIX_STATS
 	stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_FRWIN, tiwin);
 #endif
 	/*
 	 * TCP ECN processing. XXXJTL: If we ever use ECN, we need to move
 	 * this to occur after we've validated the segment.
 	 */
 	if (tp->t_flags & TF_ECN_PERMIT) {
 		if (thflags & TH_CWR)
 			tp->t_flags &= ~TF_ECN_SND_ECE;
 		switch (iptos & IPTOS_ECN_MASK) {
 		case IPTOS_ECN_CE:
 			tp->t_flags |= TF_ECN_SND_ECE;
 			TCPSTAT_INC(tcps_ecn_ce);
 			break;
 		case IPTOS_ECN_ECT0:
 			TCPSTAT_INC(tcps_ecn_ect0);
 			break;
 		case IPTOS_ECN_ECT1:
 			TCPSTAT_INC(tcps_ecn_ect1);
 			break;
 		}
 		/* Congestion experienced. */
 		if (thflags & TH_ECE) {
 			rack_cong_signal(tp, th, CC_ECN);
 		}
 	}
 	/*
 	 * Parse options on any incoming segment.
 	 */
 	tcp_dooptions(&to, (u_char *)(th + 1),
 	    (th->th_off << 2) - sizeof(struct tcphdr),
 	    (thflags & TH_SYN) ? TO_SYN : 0);
 
 	/*
 	 * If echoed timestamp is later than the current time, fall back to
 	 * non RFC1323 RTT calculation.  Normalize timestamp if syncookies
 	 * were used when this connection was established.
 	 */
 	if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) {
 		to.to_tsecr -= tp->ts_offset;
 		if (TSTMP_GT(to.to_tsecr, cts))
 			to.to_tsecr = 0;
 	}
 	/*
 	 * If its the first time in we need to take care of options and
 	 * verify we can do SACK for rack!
 	 */
 	if (rack->r_state == 0) {
 		/* Should be init'd by rack_init() */
 		KASSERT(rack->rc_inp != NULL,
 		    ("%s: rack->rc_inp unexpectedly NULL", __func__));
 		if (rack->rc_inp == NULL) {
 			rack->rc_inp = tp->t_inpcb;
 		}
 
 		/*
 		 * Process options only when we get SYN/ACK back. The SYN
 		 * case for incoming connections is handled in tcp_syncache.
 		 * According to RFC1323 the window field in a SYN (i.e., a
 		 * <SYN> or <SYN,ACK>) segment itself is never scaled. XXX
 		 * this is traditional behavior, may need to be cleaned up.
 		 */
 		rack->r_cpu = inp_to_cpuid(tp->t_inpcb);
 		if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) {
 			if ((to.to_flags & TOF_SCALE) &&
 			    (tp->t_flags & TF_REQ_SCALE)) {
 				tp->t_flags |= TF_RCVD_SCALE;
 				tp->snd_scale = to.to_wscale;
 			} else
 				tp->t_flags &= ~TF_REQ_SCALE;
 			/*
 			 * Initial send window.  It will be updated with the
 			 * next incoming segment to the scaled value.
 			 */
 			tp->snd_wnd = th->th_win;
 			if ((to.to_flags & TOF_TS) &&
 			    (tp->t_flags & TF_REQ_TSTMP)) {
 				tp->t_flags |= TF_RCVD_TSTMP;
 				tp->ts_recent = to.to_tsval;
 				tp->ts_recent_age = cts;
 			} else
 				tp->t_flags &= ~TF_REQ_TSTMP;
 			if (to.to_flags & TOF_MSS)
 				tcp_mss(tp, to.to_mss);
 			if ((tp->t_flags & TF_SACK_PERMIT) &&
 			    (to.to_flags & TOF_SACKPERM) == 0)
 				tp->t_flags &= ~TF_SACK_PERMIT;
-			if (IS_FASTOPEN(tp->t_flags)) {
-				if (to.to_flags & TOF_FASTOPEN) {
-					uint16_t mss;
-
-					if (to.to_flags & TOF_MSS)
-						mss = to.to_mss;
-					else
-						if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0)
-							mss = TCP6_MSS;
-						else
-							mss = TCP_MSS;
-					tcp_fastopen_update_cache(tp, mss,
-					    to.to_tfo_len, to.to_tfo_cookie);
-				} else
-					tcp_fastopen_disable_path(tp);
-			}
 		}
 		/*
 		 * At this point we are at the initial call. Here we decide
 		 * if we are doing RACK or not. We do this by seeing if
 		 * TF_SACK_PERMIT is set, if not rack is *not* possible and
 		 * we switch to the default code.
 		 */
 		if ((tp->t_flags & TF_SACK_PERMIT) == 0) {
 			tcp_switch_back_to_default(tp);
 			(*tp->t_fb->tfb_tcp_do_segment) (m, th, so, tp, drop_hdrlen,
 			    tlen, iptos);
 			return;
 		}
 		/* Set the flag */
 		rack->r_is_v6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0;
 		tcp_set_hpts(tp->t_inpcb);
-		rack_stop_all_timers(tp);
 		sack_filter_clear(&rack->r_ctl.rack_sf, th->th_ack);
 	}
 	/*
 	 * This is the one exception case where we set the rack state
 	 * always. All other times (timers etc) we must have a rack-state
 	 * set (so we assure we have done the checks above for SACK).
 	 */
 	if (rack->r_state != tp->t_state)
 		rack_set_state(tp, rack);
 	if (SEQ_GT(th->th_ack, tp->snd_una) && (rsm = TAILQ_FIRST(&rack->r_ctl.rc_map)) != NULL)
 		kern_prefetch(rsm, &prev_state);
 	prev_state = rack->r_state;
 	rack->r_ctl.rc_tlp_send_cnt = 0;
 	rack_clear_rate_sample(rack);
 	retval = (*rack->r_substate) (m, th, so,
 	    tp, &to, drop_hdrlen,
 	    tlen, tiwin, thflags, nxt_pkt);
 #ifdef INVARIANTS
 	if ((retval == 0) &&
 	    (tp->t_inpcb == NULL)) {
 		panic("retval:%d tp:%p t_inpcb:NULL state:%d",
 		    retval, tp, prev_state);
 	}
 #endif
 	if (retval == 0) {
 		/*
 		 * If retval is 1 the tcb is unlocked and most likely the tp
 		 * is gone.
 		 */
 		INP_WLOCK_ASSERT(tp->t_inpcb);
 		tcp_rack_xmit_timer_commit(rack, tp);
-		if (((tp->snd_max - tp->snd_una) > tp->snd_wnd) &&
-		    (rack->rc_in_persist == 0)){
-			/* 
-			 * The peer shrunk its window on us to the point
-			 * where we have sent too much. The only thing
-			 * we can do here is stop any timers and
-			 * enter persist. We most likely lost the last
-			 * bytes we sent but oh well, we will have to
-			 * retransmit them after the peer is caught up.
-			 */
-			if (rack->rc_inp->inp_in_hpts)
-				tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT);
-			rack_timer_cancel(tp, rack, cts, __LINE__);
-			rack_enter_persist(tp, rack, cts);
-			rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), __LINE__, 0, 0, 0);
-			way_out = 3;
-			goto done_with_input;
-		}
 		if (nxt_pkt == 0) {
 			if (rack->r_wanted_output != 0) {
 				did_out = 1;
 				(void)tp->t_fb->tfb_tcp_output(tp);
 			}
 			rack_start_hpts_timer(rack, tp, cts, __LINE__, 0, 0, 0);
 		}
 		if (((rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) == 0) &&
 		    (SEQ_GT(tp->snd_max, tp->snd_una) ||
 		     (tp->t_flags & TF_DELACK) ||
 		     ((V_tcp_always_keepalive || rack->rc_inp->inp_socket->so_options & SO_KEEPALIVE) &&
 		      (tp->t_state <= TCPS_CLOSING)))) {
 			/* We could not send (probably in the hpts but stopped the timer earlier)? */
 			if ((tp->snd_max == tp->snd_una) &&
 			    ((tp->t_flags & TF_DELACK) == 0) &&
 			    (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) {
 				/* keep alive not needed if we are hptsi output yet */
 				;
 			} else {
 				if (rack->rc_inp->inp_in_hpts)
 					tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT);
 				rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), __LINE__, 0, 0, 0);
 			}
 			way_out = 1;
 		} else {
 			/* Do we have the correct timer running? */
 			rack_timer_audit(tp, rack, &so->so_snd);
 			way_out = 2;
 		}
-	done_with_input:
 		rack_log_doseg_done(rack, cts, nxt_pkt, did_out, way_out);
 		if (did_out)
 			rack->r_wanted_output = 0;
 #ifdef INVARIANTS
 		if (tp->t_inpcb == NULL) {
 			panic("OP:%d retval:%d tp:%p t_inpcb:NULL state:%d",
 			      did_out,
 			      retval, tp, prev_state);
 		}
 #endif
 		INP_WUNLOCK(tp->t_inpcb);
 	}
 }
 
 void
 rack_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
     struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, uint8_t iptos)
 {
 	struct timeval tv;
 #ifdef RSS
 	struct tcp_function_block *tfb;
 	struct tcp_rack *rack;
-	struct epoch_tracker et;
+	struct inpcb *inp;
 
 	rack = (struct tcp_rack *)tp->t_fb_ptr;
 	if (rack->r_state == 0) {
 		/*
 		 * Initial input (ACK to SYN-ACK etc)lets go ahead and get
 		 * it processed
 		 */
-		INP_INFO_RLOCK_ET(&V_tcbinfo, et);
 		tcp_get_usecs(&tv);
 		rack_hpts_do_segment(m, th, so, tp, drop_hdrlen,
 		    tlen, iptos, 0, &tv);
-		INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
 		return;
 	}
 	tcp_queue_to_input(tp, m, th, tlen, drop_hdrlen, iptos);
 	INP_WUNLOCK(tp->t_inpcb);
 #else
 	tcp_get_usecs(&tv);
 	rack_hpts_do_segment(m, th, so, tp, drop_hdrlen,
 	    tlen, iptos, 0, &tv);
 #endif
 }
 
 struct rack_sendmap *
 tcp_rack_output(struct tcpcb *tp, struct tcp_rack *rack, uint32_t tsused)
 {
 	struct rack_sendmap *rsm = NULL;
 	int32_t idx;
 	uint32_t srtt_cur, srtt = 0, thresh = 0, ts_low = 0;
 
 	/* Return the next guy to be re-transmitted */
 	if (TAILQ_EMPTY(&rack->r_ctl.rc_map)) {
 		return (NULL);
 	}
 	if (tp->t_flags & TF_SENTFIN) {
 		/* retran the end FIN? */
 		return (NULL);
 	}
 	/* ok lets look at this one */
 	rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
 	if (rsm && ((rsm->r_flags & RACK_ACKED) == 0)) {
 		goto check_it;
 	}
 	rsm = rack_find_lowest_rsm(rack);
 	if (rsm == NULL) {
 		return (NULL);
 	}
 check_it:
 	srtt_cur = tp->t_srtt >> TCP_RTT_SHIFT;
 	srtt = TICKS_2_MSEC(srtt_cur);
 	if (rack->rc_rack_rtt && (srtt > rack->rc_rack_rtt))
 		srtt = rack->rc_rack_rtt;
 	if (rsm->r_flags & RACK_ACKED) {
 		return (NULL);
 	}
 	if ((rsm->r_flags & RACK_SACK_PASSED) == 0) {
 		/* Its not yet ready */
 		return (NULL);
 	}
 	idx = rsm->r_rtr_cnt - 1;
 	ts_low = rsm->r_tim_lastsent[idx];
 	thresh = rack_calc_thresh_rack(rack, srtt, tsused);
 	if (tsused <= ts_low) {
 		return (NULL);
 	}
 	if ((tsused - ts_low) >= thresh) {
 		return (rsm);
 	}
 	return (NULL);
 }
 
 static int
 rack_output(struct tcpcb *tp)
 {
 	struct socket *so;
 	uint32_t recwin, sendwin;
 	uint32_t sb_offset;
 	int32_t len, flags, error = 0;
 	struct mbuf *m;
 	struct mbuf *mb;
 	uint32_t if_hw_tsomaxsegcount = 0;
 	uint32_t if_hw_tsomaxsegsize;
 	long tot_len_this_send = 0;
 	struct ip *ip = NULL;
 #ifdef TCPDEBUG
 	struct ipovly *ipov = NULL;
 #endif
+#ifdef NETFLIX_TCP_O_UDP
 	struct udphdr *udp = NULL;
+#endif
 	struct tcp_rack *rack;
 	struct tcphdr *th;
 	uint8_t pass = 0;
-	uint8_t wanted_cookie = 0;
 	u_char opt[TCP_MAXOLEN];
-	unsigned ipoptlen, optlen, hdrlen, ulen=0;
+	unsigned ipoptlen, optlen, hdrlen;
+#ifdef NETFLIX_TCP_O_UDP
+	unsigned ulen;
+#endif
 	uint32_t rack_seq;
 
 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
 	unsigned ipsec_optlen = 0;
 
 #endif
 	int32_t idle, sendalot;
 	int32_t sub_from_prr = 0;
 	volatile int32_t sack_rxmit;
 	struct rack_sendmap *rsm = NULL;
 	int32_t tso, mtu, would_have_fin = 0;
 	struct tcpopt to;
 	int32_t slot = 0;
 	uint32_t cts;
 	uint8_t hpts_calling, doing_tlp = 0;
 	int32_t do_a_prefetch;
 	int32_t prefetch_rsm = 0;
 	int32_t prefetch_so_done = 0;
 	struct tcp_log_buffer *lgb = NULL;
 	struct inpcb *inp;
 	struct sockbuf *sb;
 #ifdef INET6
 	struct ip6_hdr *ip6 = NULL;
 	int32_t isipv6;
 #endif
 	/* setup and take the cache hits here */
 	rack = (struct tcp_rack *)tp->t_fb_ptr;
 	inp = rack->rc_inp;
 	so = inp->inp_socket;
 	sb = &so->so_snd;
 	kern_prefetch(sb, &do_a_prefetch);
 	do_a_prefetch = 1;
-	
+
 	INP_WLOCK_ASSERT(inp);
 #ifdef TCP_OFFLOAD
 	if (tp->t_flags & TF_TOE)
 		return (tcp_offload_output(tp));
 #endif
+
+#ifdef TCP_RFC7413
+	/*
+	 * For TFO connections in SYN_RECEIVED, only allow the initial
+	 * SYN|ACK and those sent by the retransmit timer.
+	 */
+	if ((tp->t_flags & TF_FASTOPEN) &&
+	    (tp->t_state == TCPS_SYN_RECEIVED) &&
+	    SEQ_GT(tp->snd_max, tp->snd_una) &&	/* initial SYN|ACK sent */
+	    (tp->snd_nxt != tp->snd_una))	/* not a retransmit */
+		return (0);
+#endif
 #ifdef INET6
 	if (rack->r_state) {
 		/* Use the cache line loaded if possible */
 		isipv6 = rack->r_is_v6;
 	} else {
 		isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
 	}
 #endif
 	cts = tcp_ts_getticks();
 	if (((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0) &&
 	    inp->inp_in_hpts) {
 		/*
 		 * We are on the hpts for some timer but not hptsi output.
 		 * Remove from the hpts unconditionally.
 		 */
 		rack_timer_cancel(tp, rack, cts, __LINE__);
 	}
 	/* Mark that we have called rack_output(). */
 	if ((rack->r_timer_override) ||
 	    (tp->t_flags & TF_FORCEDATA) ||
 	    (tp->t_state < TCPS_ESTABLISHED)) {
 		if (tp->t_inpcb->inp_in_hpts)
 			tcp_hpts_remove(tp->t_inpcb, HPTS_REMOVE_OUTPUT);
 	} else if (tp->t_inpcb->inp_in_hpts) {
 		/*
 		 * On the hpts you can't pass even if ACKNOW is on, we will
 		 * when the hpts fires.
 		 */
 		counter_u64_add(rack_out_size[TCP_MSS_ACCT_INPACE], 1);
 		return (0);
 	}
 	hpts_calling = inp->inp_hpts_calls;
 	inp->inp_hpts_calls = 0;
 	if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) {
 		if (rack_process_timers(tp, rack, cts, hpts_calling)) {
 			counter_u64_add(rack_out_size[TCP_MSS_ACCT_ATIMER], 1);
 			return (0);
 		}
 	}
 	rack->r_wanted_output = 0;
 	rack->r_timer_override = 0;
 	/*
-	 * For TFO connections in SYN_SENT or SYN_RECEIVED,
-	 * only allow the initial SYN or SYN|ACK and those sent
-	 * by the retransmit timer.
-	 */
-	if (IS_FASTOPEN(tp->t_flags) &&
-	    ((tp->t_state == TCPS_SYN_RECEIVED) ||
-	     (tp->t_state == TCPS_SYN_SENT)) &&
-	    SEQ_GT(tp->snd_max, tp->snd_una) && /* initial SYN or SYN|ACK sent */
-	    (tp->t_rxtshift == 0))              /* not a retransmit */
-		return (0);
-	/*
 	 * Determine length of data that should be transmitted, and flags
 	 * that will be used. If there is some data or critical controls
 	 * (SYN, RST) to send, then transmit; otherwise, investigate
 	 * further.
 	 */
 	idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una);
-#ifdef NETFLIX_CWV
-	if (tp->cwv_enabled) {
-		if ((tp->cwv_cwnd_valid == 0) &&
-		    TCPS_HAVEESTABLISHED(tp->t_state) &&
-		    (tp->snd_cwnd > tp->snd_cwv.init_cwnd))
-			tcp_newcwv_nvp_closedown(tp);
-	} else
-#endif
 	if (tp->t_idle_reduce) {
 		if (idle && ((ticks - tp->t_rcvtime) >= tp->t_rxtcur))
 			rack_cc_after_idle(tp,
 		            (rack->r_idle_reduce_largest ? 1 :0));
 	}
 	tp->t_flags &= ~TF_LASTIDLE;
 	if (idle) {
 		if (tp->t_flags & TF_MORETOCOME) {
 			tp->t_flags |= TF_LASTIDLE;
 			idle = 0;
 		}
 	}
 again:
 	/*
 	 * If we've recently taken a timeout, snd_max will be greater than
 	 * snd_nxt.  There may be SACK information that allows us to avoid
 	 * resending already delivered data.  Adjust snd_nxt accordingly.
 	 */
 	sendalot = 0;
 	cts = tcp_ts_getticks();
 	tso = 0;
 	mtu = 0;
 	sb_offset = tp->snd_max - tp->snd_una;
 	sendwin = min(tp->snd_wnd, tp->snd_cwnd);
 
 	flags = tcp_outflags[tp->t_state];
 	/*
 	 * Send any SACK-generated retransmissions.  If we're explicitly
 	 * trying to send out new data (when sendalot is 1), bypass this
 	 * function. If we retransmit in fast recovery mode, decrement
 	 * snd_cwnd, since we're replacing a (future) new transmission with
 	 * a retransmission now, and we previously incremented snd_cwnd in
 	 * tcp_input().
 	 */
 	/*
 	 * Still in sack recovery , reset rxmit flag to zero.
 	 */
 	while (rack->rc_free_cnt < rack_free_cache) {
 		rsm = rack_alloc(rack);
 		if (rsm == NULL) {
 			if (inp->inp_hpts_calls)
 				/* Retry in a ms */
 				slot = 1;
 			goto just_return_nolock;
 		}
 		TAILQ_INSERT_TAIL(&rack->r_ctl.rc_free, rsm, r_next);
 		rack->rc_free_cnt++;
 		rsm = NULL;
 	}
 	if (inp->inp_hpts_calls)
 		inp->inp_hpts_calls = 0;
 	sack_rxmit = 0;
 	len = 0;
 	rsm = NULL;
 	if (flags & TH_RST) {
 		SOCKBUF_LOCK(sb);
 		goto send;
 	}
 	if (rack->r_ctl.rc_tlpsend) {
 		/* Tail loss probe */
 		long cwin;
 		long tlen;
 
 		doing_tlp = 1;
 		rsm = rack->r_ctl.rc_tlpsend;
 		rack->r_ctl.rc_tlpsend = NULL;
 		sack_rxmit = 1;
 		tlen = rsm->r_end - rsm->r_start;
 		if (tlen > tp->t_maxseg)
 			tlen = tp->t_maxseg;
-		KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start),
-		    ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p",
-		    __func__, __LINE__,
-		    rsm->r_start, tp->snd_una, tp, rack, rsm));
+#ifdef INVARIANTS
+		if (SEQ_GT(tp->snd_una, rsm->r_start)) {
+			panic("tp:%p rack:%p snd_una:%u rsm:%p r_start:%u",
+			    tp, rack, tp->snd_una, rsm, rsm->r_start);
+		}
+#endif
 		sb_offset = rsm->r_start - tp->snd_una;
 		cwin = min(tp->snd_wnd, tlen);
 		len = cwin;
 	} else if (rack->r_ctl.rc_resend) {
 		/* Retransmit timer */
 		rsm = rack->r_ctl.rc_resend;
 		rack->r_ctl.rc_resend = NULL;
 		len = rsm->r_end - rsm->r_start;
 		sack_rxmit = 1;
 		sendalot = 0;
-		KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start),
-		    ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p",
-		    __func__, __LINE__,
-		    rsm->r_start, tp->snd_una, tp, rack, rsm));
 		sb_offset = rsm->r_start - tp->snd_una;
 		if (len >= tp->t_maxseg) {
 			len = tp->t_maxseg;
 		}
+		KASSERT(sb_offset >= 0, ("%s: sack block to the left of una : %d",
+		    __func__, sb_offset));
 	} else if ((rack->rc_in_persist == 0) &&
 	    ((rsm = tcp_rack_output(tp, rack, cts)) != NULL)) {
 		long tlen;
 
 		if ((!IN_RECOVERY(tp->t_flags)) &&
 		    ((tp->t_flags & (TF_WASFRECOVERY | TF_WASCRECOVERY)) == 0)) {
 			/* Enter recovery if not induced by a time-out */
 			rack->r_ctl.rc_rsm_start = rsm->r_start;
 			rack->r_ctl.rc_cwnd_at = tp->snd_cwnd;
 			rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh;
 			rack_cong_signal(tp, NULL, CC_NDUPACK);
 			/*
 			 * When we enter recovery we need to assure we send
 			 * one packet.
 			 */
 			rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg;
 		}
 #ifdef INVARIANTS
 		if (SEQ_LT(rsm->r_start, tp->snd_una)) {
 			panic("Huh, tp:%p rack:%p rsm:%p start:%u < snd_una:%u\n",
 			    tp, rack, rsm, rsm->r_start, tp->snd_una);
 		}
 #endif
 		tlen = rsm->r_end - rsm->r_start;
-		KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start),
-		    ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p",
-		    __func__, __LINE__,
-		    rsm->r_start, tp->snd_una, tp, rack, rsm));
 		sb_offset = rsm->r_start - tp->snd_una;
 		if (tlen > rack->r_ctl.rc_prr_sndcnt) {
 			len = rack->r_ctl.rc_prr_sndcnt;
 		} else {
 			len = tlen;
 		}
 		if (len >= tp->t_maxseg) {
 			sendalot = 1;
 			len = tp->t_maxseg;
 		} else {
 			sendalot = 0;
 			if ((rack->rc_timer_up == 0) &&
 			    (len < tlen)) {
 				/*
 				 * If its not a timer don't send a partial
 				 * segment.
 				 */
 				len = 0;
 				goto just_return_nolock;
 			}
 		}
+		KASSERT(sb_offset >= 0, ("%s: sack block to the left of una : %d",
+		    __func__, sb_offset));
 		if (len > 0) {
 			sub_from_prr = 1;
 			sack_rxmit = 1;
 			TCPSTAT_INC(tcps_sack_rexmits);
 			TCPSTAT_ADD(tcps_sack_rexmit_bytes,
 			    min(len, tp->t_maxseg));
 			counter_u64_add(rack_rtm_prr_retran, 1);
 		}
 	}
 	if (rsm && (rsm->r_flags & RACK_HAS_FIN)) {
 		/* we are retransmitting the fin */
 		len--;
 		if (len) {
 			/*
 			 * When retransmitting data do *not* include the
 			 * FIN. This could happen from a TLP probe.
 			 */
 			flags &= ~TH_FIN;
 		}
 	}
 #ifdef INVARIANTS
 	/* For debugging */
 	rack->r_ctl.rc_rsm_at_retran = rsm;
 #endif
 	/*
+	 * Enforce a connection sendmap count limit if set
+	 * as long as we are not retransmitting.
+	 */
+	if ((rsm == NULL) &&
+	    (rack_map_entries_limit > 0) &&
+	    (rack->r_ctl.rc_num_maps_alloced >= rack_map_entries_limit)) {
+		counter_u64_add(rack_to_alloc_limited, 1);
+		if (!rack->alloc_limit_reported) {
+			rack->alloc_limit_reported = 1;
+			counter_u64_add(rack_alloc_limited_conns, 1);
+		}
+		goto just_return_nolock;
+	}
+	/*
 	 * Get standard flags, and add SYN or FIN if requested by 'hidden'
 	 * state flags.
 	 */
 	if (tp->t_flags & TF_NEEDFIN)
 		flags |= TH_FIN;
 	if (tp->t_flags & TF_NEEDSYN)
 		flags |= TH_SYN;
 	if ((sack_rxmit == 0) && (prefetch_rsm == 0)) {
 		void *end_rsm;
 		end_rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_tmap, rack_sendmap, r_tnext);
 		if (end_rsm)
 			kern_prefetch(end_rsm, &prefetch_rsm);
 		prefetch_rsm = 1;
 	}
 	SOCKBUF_LOCK(sb);
 	/*
 	 * If in persist timeout with window of 0, send 1 byte. Otherwise,
 	 * if window is small but nonzero and time TF_SENTFIN expired, we
 	 * will send what we can and go to transmit state.
 	 */
 	if (tp->t_flags & TF_FORCEDATA) {
 		if (sendwin == 0) {
 			/*
 			 * If we still have some data to send, then clear
 			 * the FIN bit.  Usually this would happen below
 			 * when it realizes that we aren't sending all the
 			 * data.  However, if we have exactly 1 byte of
 			 * unsent data, then it won't clear the FIN bit
 			 * below, and if we are in persist state, we wind up
 			 * sending the packet without recording that we sent
 			 * the FIN bit.
 			 *
 			 * We can't just blindly clear the FIN bit, because
 			 * if we don't have any more data to send then the
 			 * probe will be the FIN itself.
 			 */
 			if (sb_offset < sbused(sb))
 				flags &= ~TH_FIN;
 			sendwin = 1;
 		} else {
 			if (rack->rc_in_persist)
 				rack_exit_persist(tp, rack);
 			/*
 			 * If we are dropping persist mode then we need to
 			 * correct snd_nxt/snd_max and off.
 			 */
 			tp->snd_nxt = tp->snd_max;
 			sb_offset = tp->snd_nxt - tp->snd_una;
 		}
 	}
 	/*
 	 * If snd_nxt == snd_max and we have transmitted a FIN, the
 	 * sb_offset will be > 0 even if so_snd.sb_cc is 0, resulting in a
 	 * negative length.  This can also occur when TCP opens up its
 	 * congestion window while receiving additional duplicate acks after
 	 * fast-retransmit because TCP will reset snd_nxt to snd_max after
 	 * the fast-retransmit.
 	 *
 	 * In the normal retransmit-FIN-only case, however, snd_nxt will be
 	 * set to snd_una, the sb_offset will be 0, and the length may wind
 	 * up 0.
 	 *
 	 * If sack_rxmit is true we are retransmitting from the scoreboard
 	 * in which case len is already set.
 	 */
 	if (sack_rxmit == 0) {
 		uint32_t avail;
 
 		avail = sbavail(sb);
-		if (SEQ_GT(tp->snd_nxt, tp->snd_una) && avail)
+		if (SEQ_GT(tp->snd_nxt, tp->snd_una))
 			sb_offset = tp->snd_nxt - tp->snd_una;
 		else
 			sb_offset = 0;
 		if (IN_RECOVERY(tp->t_flags) == 0) {
 			if (rack->r_ctl.rc_tlp_new_data) {
 				/* TLP is forcing out new data */
 				if (rack->r_ctl.rc_tlp_new_data > (uint32_t) (avail - sb_offset)) {
 					rack->r_ctl.rc_tlp_new_data = (uint32_t) (avail - sb_offset);
 				}
 				if (rack->r_ctl.rc_tlp_new_data > tp->snd_wnd)
 					len = tp->snd_wnd;
 				else
 					len = rack->r_ctl.rc_tlp_new_data;
 				rack->r_ctl.rc_tlp_new_data = 0;
 				doing_tlp = 1;
 			} else {
 				if (sendwin > avail) {
 					/* use the available */
 					if (avail > sb_offset) {
 						len = (int32_t)(avail - sb_offset);
 					} else {
 						len = 0;
 					}
 				} else {
 					if (sendwin > sb_offset) {
 						len = (int32_t)(sendwin - sb_offset);
 					} else {
 						len = 0;
 					}
 				}
 			}
 		} else {
 			uint32_t outstanding;
 
 			/*
 			 * We are inside of a SACK recovery episode and are
 			 * sending new data, having retransmitted all the
 			 * data possible so far in the scoreboard.
 			 */
 			outstanding = tp->snd_max - tp->snd_una;
-			if ((rack->r_ctl.rc_prr_sndcnt + outstanding) > tp->snd_wnd)
-				len = 0;
-			else if (avail > sb_offset)
+			if ((rack->r_ctl.rc_prr_sndcnt + outstanding) > tp->snd_wnd) {
+				if (tp->snd_wnd > outstanding) {
+					len = tp->snd_wnd - outstanding;
+					/* Check to see if we have the data */
+					if (((sb_offset + len) > avail) &&
+					    (avail > sb_offset))
+						len = avail - sb_offset;
+					else
+						len = 0;
+				} else
+					len = 0;
+			} else if (avail > sb_offset)
 				len = avail - sb_offset;
 			else
 				len = 0;
 			if (len > 0) {
 				if (len > rack->r_ctl.rc_prr_sndcnt)
 					len = rack->r_ctl.rc_prr_sndcnt;
 
 				if (len > 0) {
 					sub_from_prr = 1;
 					counter_u64_add(rack_rtm_prr_newdata, 1);
 				}
 			}
 			if (len > tp->t_maxseg) {
 				/*
 				 * We should never send more than a MSS when
 				 * retransmitting or sending new data in prr
 				 * mode unless the override flag is on. Most
 				 * likely the PRR algorithm is not going to
 				 * let us send a lot as well :-)
 				 */
 				if (rack->r_ctl.rc_prr_sendalot == 0)
 					len = tp->t_maxseg;
 			} else if (len < tp->t_maxseg) {
 				/*
 				 * Do we send any? The idea here is if the
 				 * send empty's the socket buffer we want to
 				 * do it. However if not then lets just wait
 				 * for our prr_sndcnt to get bigger.
 				 */
 				long leftinsb;
 
 				leftinsb = sbavail(sb) - sb_offset;
 				if (leftinsb > len) {
 					/* This send does not empty the sb */
 					len = 0;
 				}
 			}
 		}
 	}
 	if (prefetch_so_done == 0) {
 		kern_prefetch(so, &prefetch_so_done);
 		prefetch_so_done = 1;
 	}
 	/*
 	 * Lop off SYN bit if it has already been sent.  However, if this is
 	 * SYN-SENT state and if segment contains data and if we don't know
 	 * that foreign host supports TAO, suppress sending segment.
 	 */
-	if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una) &&
-	    ((sack_rxmit == 0) && (tp->t_rxtshift == 0))) {
-		if (tp->t_state != TCPS_SYN_RECEIVED)
+	if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una)) {
+		if ((tp->t_state != TCPS_SYN_RECEIVED) &&
+		    (tp->t_state != TCPS_SYN_SENT))
 			flags &= ~TH_SYN;
+#ifdef TCP_RFC7413
 		/*
 		 * When sending additional segments following a TFO SYN|ACK,
 		 * do not include the SYN bit.
 		 */
-		if (IS_FASTOPEN(tp->t_flags) &&
+		if ((tp->t_flags & TF_FASTOPEN) &&
 		    (tp->t_state == TCPS_SYN_RECEIVED))
 			flags &= ~TH_SYN;
+#endif
 		sb_offset--, len++;
+		if (sbavail(sb) == 0)
+			len = 0;
 	}
 	/*
 	 * Be careful not to send data and/or FIN on SYN segments. This
 	 * measure is needed to prevent interoperability problems with not
 	 * fully conformant TCP implementations.
 	 */
 	if ((flags & TH_SYN) && (tp->t_flags & TF_NOOPT)) {
 		len = 0;
 		flags &= ~TH_FIN;
 	}
+#ifdef TCP_RFC7413
 	/*
-	 * On TFO sockets, ensure no data is sent in the following cases:
-	 *
-	 *  - When retransmitting SYN|ACK on a passively-created socket
-	 *
-	 *  - When retransmitting SYN on an actively created socket
-	 *
-	 *  - When sending a zero-length cookie (cookie request) on an
-	 *    actively created socket
-	 *
-	 *  - When the socket is in the CLOSED state (RST is being sent)
+	 * When retransmitting SYN|ACK on a passively-created TFO socket,
+	 * don't include data, as the presence of data may have caused the
+	 * original SYN|ACK to have been dropped by a middlebox.
 	 */
-	if (IS_FASTOPEN(tp->t_flags) &&
-	    (((flags & TH_SYN) && (tp->t_rxtshift > 0)) ||
-	     ((tp->t_state == TCPS_SYN_SENT) &&
-	      (tp->t_tfo_client_cookie_len == 0)) ||
-	     (flags & TH_RST))) {
-		sack_rxmit = 0;
+	if ((tp->t_flags & TF_FASTOPEN) &&
+	    ((tp->t_state == TCPS_SYN_RECEIVED) && (tp->t_rxtshift > 0)))
 		len = 0;
-	}
-	/* Without fast-open there should never be data sent on a SYN */
-	if ((flags & TH_SYN) && (!IS_FASTOPEN(tp->t_flags)))
-		len = 0;
+#endif
 	if (len <= 0) {
 		/*
 		 * If FIN has been sent but not acked, but we haven't been
 		 * called to retransmit, len will be < 0.  Otherwise, window
 		 * shrank after we sent into it.  If window shrank to 0,
 		 * cancel pending retransmit, pull snd_nxt back to (closed)
 		 * window, and set the persist timer if it isn't already
 		 * going.  If the window didn't close completely, just wait
 		 * for an ACK.
 		 *
 		 * We also do a general check here to ensure that we will
 		 * set the persist timer when we have data to send, but a
 		 * 0-byte window. This makes sure the persist timer is set
 		 * even if the packet hits one of the "goto send" lines
 		 * below.
 		 */
 		len = 0;
 		if ((tp->snd_wnd == 0) &&
 		    (TCPS_HAVEESTABLISHED(tp->t_state)) &&
 		    (sb_offset < (int)sbavail(sb))) {
 			tp->snd_nxt = tp->snd_una;
 			rack_enter_persist(tp, rack, cts);
 		}
 	}
 	/* len will be >= 0 after this point. */
 	KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__));
 	tcp_sndbuf_autoscale(tp, so, sendwin);
 	/*
 	 * Decide if we can use TCP Segmentation Offloading (if supported by
 	 * hardware).
 	 *
 	 * TSO may only be used if we are in a pure bulk sending state.  The
 	 * presence of TCP-MD5, SACK retransmits, SACK advertizements and IP
 	 * options prevent using TSO.  With TSO the TCP header is the same
 	 * (except for the sequence number) for all generated packets.  This
 	 * makes it impossible to transmit any options which vary per
 	 * generated segment or packet.
 	 *
 	 * IPv4 handling has a clear separation of ip options and ip header
 	 * flags while IPv6 combines both in in6p_outputopts. ip6_optlen() does
 	 * the right thing below to provide length of just ip options and thus
 	 * checking for ipoptlen is enough to decide if ip options are present.
 	 */
 
 #ifdef INET6
 	if (isipv6)
 		ipoptlen = ip6_optlen(tp->t_inpcb);
 	else
 #endif
 		if (tp->t_inpcb->inp_options)
 			ipoptlen = tp->t_inpcb->inp_options->m_len -
 			    offsetof(struct ipoption, ipopt_list);
 		else
 			ipoptlen = 0;
 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
 	/*
 	 * Pre-calculate here as we save another lookup into the darknesses
 	 * of IPsec that way and can actually decide if TSO is ok.
 	 */
 #ifdef INET6
 	if (isipv6 && IPSEC_ENABLED(ipv6))
 		ipsec_optlen = IPSEC_HDRSIZE(ipv6, tp->t_inpcb);
 #ifdef INET
 	else
 #endif
 #endif				/* INET6 */
 #ifdef INET
 	if (IPSEC_ENABLED(ipv4))
 		ipsec_optlen = IPSEC_HDRSIZE(ipv4, tp->t_inpcb);
 #endif				/* INET */
 #endif
 
 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
 	ipoptlen += ipsec_optlen;
 #endif
 	if ((tp->t_flags & TF_TSO) && V_tcp_do_tso && len > tp->t_maxseg &&
+#ifdef NETFLIX_TCP_O_UDP
 	    (tp->t_port == 0) &&
+#endif
 	    ((tp->t_flags & TF_SIGNATURE) == 0) &&
 	    tp->rcv_numsacks == 0 && sack_rxmit == 0 &&
 	    ipoptlen == 0)
 		tso = 1;
 	{
 		uint32_t outstanding;
 
 		outstanding = tp->snd_max - tp->snd_una;
 		if (tp->t_flags & TF_SENTFIN) {
 			/*
 			 * If we sent a fin, snd_max is 1 higher than
 			 * snd_una
 			 */
 			outstanding--;
 		}
 		if (outstanding > 0) {
 			/*
 			 * This is sub-optimal. We only send a stand alone
 			 * FIN on its own segment.
 			 */
 			if (flags & TH_FIN) {
 				flags &= ~TH_FIN;
 				would_have_fin = 1;
 			}
 		} else if (sack_rxmit) {
 			if ((rsm->r_flags & RACK_HAS_FIN) == 0)
 				flags &= ~TH_FIN;
 		} else {
 			if (SEQ_LT(tp->snd_nxt + len, tp->snd_una +
 			    sbused(sb)))
 				flags &= ~TH_FIN;
 		}
 	}
 	recwin = sbspace(&so->so_rcv);
 
 	/*
 	 * Sender silly window avoidance.   We transmit under the following
 	 * conditions when len is non-zero:
 	 *
 	 * - We have a full segment (or more with TSO) - This is the last
 	 * buffer in a write()/send() and we are either idle or running
 	 * NODELAY - we've timed out (e.g. persist timer) - we have more
 	 * then 1/2 the maximum send window's worth of data (receiver may be
 	 * limited the window size) - we need to retransmit
 	 */
 	if (len) {
 		if (len >= tp->t_maxseg) {
 			pass = 1;
 			goto send;
 		}
 		/*
 		 * NOTE! on localhost connections an 'ack' from the remote
 		 * end may occur synchronously with the output and cause us
 		 * to flush a buffer queued with moretocome.  XXX
 		 *
 		 */
 		if (!(tp->t_flags & TF_MORETOCOME) &&	/* normal case */
 		    (idle || (tp->t_flags & TF_NODELAY)) &&
-		    ((uint32_t)len + (uint32_t)sb_offset >= sbavail(&so->so_snd)) && 
+		    ((uint32_t)len + (uint32_t)sb_offset >= sbavail(&so->so_snd)) &&
 		    (tp->t_flags & TF_NOPUSH) == 0) {
 			pass = 2;
 			goto send;
 		}
 		if (tp->t_flags & TF_FORCEDATA) {	/* typ. timeout case */
 			pass = 3;
 			goto send;
 		}
 		if ((tp->snd_una == tp->snd_max) && len) {	/* Nothing outstanding */
 			goto send;
 		}
 		if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0) {
 			pass = 4;
 			goto send;
 		}
 		if (SEQ_LT(tp->snd_nxt, tp->snd_max)) {	/* retransmit case */
 			pass = 5;
 			goto send;
 		}
 		if (sack_rxmit) {
 			pass = 6;
 			goto send;
 		}
 	}
 	/*
 	 * Sending of standalone window updates.
 	 *
 	 * Window updates are important when we close our window due to a
 	 * full socket buffer and are opening it again after the application
 	 * reads data from it.  Once the window has opened again and the
 	 * remote end starts to send again the ACK clock takes over and
 	 * provides the most current window information.
 	 *
 	 * We must avoid the silly window syndrome whereas every read from
 	 * the receive buffer, no matter how small, causes a window update
 	 * to be sent.  We also should avoid sending a flurry of window
 	 * updates when the socket buffer had queued a lot of data and the
 	 * application is doing small reads.
 	 *
 	 * Prevent a flurry of pointless window updates by only sending an
 	 * update when we can increase the advertized window by more than
 	 * 1/4th of the socket buffer capacity.  When the buffer is getting
 	 * full or is very small be more aggressive and send an update
 	 * whenever we can increase by two mss sized segments. In all other
 	 * situations the ACK's to new incoming data will carry further
 	 * window increases.
 	 *
 	 * Don't send an independent window update if a delayed ACK is
 	 * pending (it will get piggy-backed on it) or the remote side
 	 * already has done a half-close and won't send more data.  Skip
 	 * this if the connection is in T/TCP half-open state.
 	 */
 	if (recwin > 0 && !(tp->t_flags & TF_NEEDSYN) &&
 	    !(tp->t_flags & TF_DELACK) &&
 	    !TCPS_HAVERCVDFIN(tp->t_state)) {
 		/*
 		 * "adv" is the amount we could increase the window, taking
 		 * into account that we are limited by TCP_MAXWIN <<
 		 * tp->rcv_scale.
 		 */
 		int32_t adv;
 		int oldwin;
 
 		adv = min(recwin, (long)TCP_MAXWIN << tp->rcv_scale);
 		if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt)) {
 			oldwin = (tp->rcv_adv - tp->rcv_nxt);
 			adv -= oldwin;
 		} else
 			oldwin = 0;
 
 		/*
 		 * If the new window size ends up being the same as the old
 		 * size when it is scaled, then don't force a window update.
 		 */
 		if (oldwin >> tp->rcv_scale == (adv + oldwin) >> tp->rcv_scale)
 			goto dontupdate;
 
 		if (adv >= (int32_t)(2 * tp->t_maxseg) &&
 		    (adv >= (int32_t)(so->so_rcv.sb_hiwat / 4) ||
 		    recwin <= (int32_t)(so->so_rcv.sb_hiwat / 8) ||
 		    so->so_rcv.sb_hiwat <= 8 * tp->t_maxseg)) {
 			pass = 7;
 			goto send;
 		}
 		if (2 * adv >= (int32_t) so->so_rcv.sb_hiwat)
 			goto send;
 	}
 dontupdate:
 
 	/*
 	 * Send if we owe the peer an ACK, RST, SYN, or urgent data.  ACKNOW
 	 * is also a catch-all for the retransmit timer timeout case.
 	 */
 	if (tp->t_flags & TF_ACKNOW) {
 		pass = 8;
 		goto send;
 	}
 	if (((flags & TH_SYN) && (tp->t_flags & TF_NEEDSYN) == 0)) {
 		pass = 9;
 		goto send;
 	}
 	if (SEQ_GT(tp->snd_up, tp->snd_una)) {
 		pass = 10;
 		goto send;
 	}
 	/*
 	 * If our state indicates that FIN should be sent and we have not
 	 * yet done so, then we need to send.
 	 */
-	if ((flags & TH_FIN) &&
-	    (tp->snd_nxt == tp->snd_una)) {
-		pass = 11;
-		goto send;
+	if (flags & TH_FIN) {
+		if ((tp->t_flags & TF_SENTFIN) ||
+		    (((tp->t_flags & TF_SENTFIN) == 0) &&
+		     (tp->snd_nxt == tp->snd_una))) {
+			pass = 11;
+			goto send;
+		}
 	}
 	/*
 	 * No reason to send a segment, just return.
 	 */
 just_return:
 	SOCKBUF_UNLOCK(sb);
 just_return_nolock:
 	if (tot_len_this_send == 0)
 		counter_u64_add(rack_out_size[TCP_MSS_ACCT_JUSTRET], 1);
 	rack_start_hpts_timer(rack, tp, cts, __LINE__, slot, tot_len_this_send, 1);
 	rack_log_type_just_return(rack, cts, tot_len_this_send, slot, hpts_calling);
 	tp->t_flags &= ~TF_FORCEDATA;
 	return (0);
 
 send:
 	if (doing_tlp == 0) {
 		/*
 		 * Data not a TLP, and its not the rxt firing. If it is the
 		 * rxt firing, we want to leave the tlp_in_progress flag on
 		 * so we don't send another TLP. It has to be a rack timer
 		 * or normal send (response to acked data) to clear the tlp
 		 * in progress flag.
 		 */
 		rack->rc_tlp_in_progress = 0;
 	}
 	SOCKBUF_LOCK_ASSERT(sb);
 	if (len > 0) {
 		if (len >= tp->t_maxseg)
 			tp->t_flags2 |= TF2_PLPMTU_MAXSEGSNT;
 		else
 			tp->t_flags2 &= ~TF2_PLPMTU_MAXSEGSNT;
 	}
 	/*
 	 * Before ESTABLISHED, force sending of initial options unless TCP
 	 * set not to do any options. NOTE: we assume that the IP/TCP header
 	 * plus TCP options always fit in a single mbuf, leaving room for a
 	 * maximum link header, i.e. max_linkhdr + sizeof (struct tcpiphdr)
 	 * + optlen <= MCLBYTES
 	 */
 	optlen = 0;
 #ifdef INET6
 	if (isipv6)
 		hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
 	else
 #endif
 		hdrlen = sizeof(struct tcpiphdr);
 
 	/*
 	 * Compute options for segment. We only have to care about SYN and
 	 * established connection segments.  Options for SYN-ACK segments
 	 * are handled in TCP syncache.
 	 */
 	to.to_flags = 0;
 	if ((tp->t_flags & TF_NOOPT) == 0) {
 		/* Maximum segment size. */
 		if (flags & TH_SYN) {
 			tp->snd_nxt = tp->iss;
 			to.to_mss = tcp_mssopt(&inp->inp_inc);
-#ifdef NETFLIX_TCPOUDP
+#ifdef NETFLIX_TCP_O_UDP
 			if (tp->t_port)
 				to.to_mss -= V_tcp_udp_tunneling_overhead;
 #endif
 			to.to_flags |= TOF_MSS;
-
+#ifdef TCP_RFC7413
 			/*
-			 * On SYN or SYN|ACK transmits on TFO connections,
-			 * only include the TFO option if it is not a
-			 * retransmit, as the presence of the TFO option may
-			 * have caused the original SYN or SYN|ACK to have
-			 * been dropped by a middlebox.
+			 * Only include the TFO option on the first
+			 * transmission of the SYN|ACK on a
+			 * passively-created TFO socket, as the presence of
+			 * the TFO option may have caused the original
+			 * SYN|ACK to have been dropped by a middlebox.
 			 */
-			if (IS_FASTOPEN(tp->t_flags) &&
+			if ((tp->t_flags & TF_FASTOPEN) &&
+			    (tp->t_state == TCPS_SYN_RECEIVED) &&
 			    (tp->t_rxtshift == 0)) {
-				if (tp->t_state == TCPS_SYN_RECEIVED) {
-					to.to_tfo_len = TCP_FASTOPEN_COOKIE_LEN;
-					to.to_tfo_cookie =
-					    (u_int8_t *)&tp->t_tfo_cookie.server;
-					to.to_flags |= TOF_FASTOPEN;
-					wanted_cookie = 1;
-				} else if (tp->t_state == TCPS_SYN_SENT) {
-					to.to_tfo_len =
-					    tp->t_tfo_client_cookie_len;
-					to.to_tfo_cookie =
-					    tp->t_tfo_cookie.client;
-					to.to_flags |= TOF_FASTOPEN;
-					wanted_cookie = 1;
-					/*
-					 * If we wind up having more data to
-					 * send with the SYN than can fit in
-					 * one segment, don't send any more
-					 * until the SYN|ACK comes back from
-					 * the other end.
-					 */
-					sendalot = 0;
-				}
+				to.to_tfo_len = TCP_FASTOPEN_MAX_COOKIE_LEN;
+				to.to_tfo_cookie = (u_char *)&tp->t_tfo_cookie;
+				to.to_flags |= TOF_FASTOPEN;
 			}
+#endif
 		}
 		/* Window scaling. */
 		if ((flags & TH_SYN) && (tp->t_flags & TF_REQ_SCALE)) {
 			to.to_wscale = tp->request_r_scale;
 			to.to_flags |= TOF_SCALE;
 		}
 		/* Timestamps. */
 		if ((tp->t_flags & TF_RCVD_TSTMP) ||
 		    ((flags & TH_SYN) && (tp->t_flags & TF_REQ_TSTMP))) {
 			to.to_tsval = cts + tp->ts_offset;
 			to.to_tsecr = tp->ts_recent;
 			to.to_flags |= TOF_TS;
 		}
 		/* Set receive buffer autosizing timestamp. */
 		if (tp->rfbuf_ts == 0 &&
 		    (so->so_rcv.sb_flags & SB_AUTOSIZE))
 			tp->rfbuf_ts = tcp_ts_getticks();
 		/* Selective ACK's. */
 		if (flags & TH_SYN)
 			to.to_flags |= TOF_SACKPERM;
 		else if (TCPS_HAVEESTABLISHED(tp->t_state) &&
 		    tp->rcv_numsacks > 0) {
 			to.to_flags |= TOF_SACK;
 			to.to_nsacks = tp->rcv_numsacks;
 			to.to_sacks = (u_char *)tp->sackblks;
 		}
 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
 		/* TCP-MD5 (RFC2385). */
 		if (tp->t_flags & TF_SIGNATURE)
 			to.to_flags |= TOF_SIGNATURE;
 #endif				/* TCP_SIGNATURE */
 
 		/* Processing the options. */
 		hdrlen += optlen = tcp_addoptions(&to, opt);
-		/*
-		 * If we wanted a TFO option to be added, but it was unable
-		 * to fit, ensure no data is sent.
-		 */
-		if (IS_FASTOPEN(tp->t_flags) && wanted_cookie &&
-		    !(to.to_flags & TOF_FASTOPEN))
-			len = 0;
 	}
-#ifdef NETFLIX_TCPOUDP
+#ifdef NETFLIX_TCP_O_UDP
 	if (tp->t_port) {
 		if (V_tcp_udp_tunneling_port == 0) {
 			/* The port was removed?? */
 			SOCKBUF_UNLOCK(&so->so_snd);
 			return (EHOSTUNREACH);
 		}
 		hdrlen += sizeof(struct udphdr);
 	}
 #endif
 #ifdef INET6
 	if (isipv6)
 		ipoptlen = ip6_optlen(tp->t_inpcb);
 	else
 #endif
 	if (tp->t_inpcb->inp_options)
 		ipoptlen = tp->t_inpcb->inp_options->m_len -
 		    offsetof(struct ipoption, ipopt_list);
 	else
 		ipoptlen = 0;
 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
 	ipoptlen += ipsec_optlen;
 #endif
 
 	/*
 	 * Adjust data length if insertion of options will bump the packet
 	 * length beyond the t_maxseg length. Clear the FIN bit because we
 	 * cut off the tail of the segment.
 	 */
 	if (len + optlen + ipoptlen > tp->t_maxseg) {
 		if (flags & TH_FIN) {
 			would_have_fin = 1;
 			flags &= ~TH_FIN;
 		}
 		if (tso) {
 			uint32_t if_hw_tsomax;
 			uint32_t moff;
 			int32_t max_len;
 
 			/* extract TSO information */
 			if_hw_tsomax = tp->t_tsomax;
 			if_hw_tsomaxsegcount = tp->t_tsomaxsegcount;
 			if_hw_tsomaxsegsize = tp->t_tsomaxsegsize;
 			KASSERT(ipoptlen == 0,
 			    ("%s: TSO can't do IP options", __func__));
 
 			/*
 			 * Check if we should limit by maximum payload
 			 * length:
 			 */
 			if (if_hw_tsomax != 0) {
 				/* compute maximum TSO length */
 				max_len = (if_hw_tsomax - hdrlen -
 				    max_linkhdr);
 				if (max_len <= 0) {
 					len = 0;
 				} else if (len > max_len) {
 					sendalot = 1;
 					len = max_len;
 				}
 			}
 			/*
 			 * Prevent the last segment from being fractional
 			 * unless the send sockbuf can be emptied:
 			 */
 			max_len = (tp->t_maxseg - optlen);
 			if ((sb_offset + len) < sbavail(sb)) {
 				moff = len % (u_int)max_len;
 				if (moff != 0) {
 					len -= moff;
 					sendalot = 1;
 				}
 			}
 			/*
 			 * In case there are too many small fragments don't
 			 * use TSO:
 			 */
 			if (len <= max_len) {
 				len = max_len;
 				sendalot = 1;
 				tso = 0;
 			}
 			/*
 			 * Send the FIN in a separate segment after the bulk
 			 * sending is done. We don't trust the TSO
 			 * implementations to clear the FIN flag on all but
 			 * the last segment.
 			 */
 			if (tp->t_flags & TF_NEEDFIN)
 				sendalot = 1;
 
 		} else {
 			if (optlen + ipoptlen >= tp->t_maxseg) {
 				/*
 				 * Since we don't have enough space to put
 				 * the IP header chain and the TCP header in
 				 * one packet as required by RFC 7112, don't
 				 * send it. Also ensure that at least one
 				 * byte of the payload can be put into the
 				 * TCP segment.
 				 */
 				SOCKBUF_UNLOCK(&so->so_snd);
 				error = EMSGSIZE;
 				sack_rxmit = 0;
 				goto out;
 			}
 			len = tp->t_maxseg - optlen - ipoptlen;
 			sendalot = 1;
 		}
 	} else
 		tso = 0;
 	KASSERT(len + hdrlen + ipoptlen <= IP_MAXPACKET,
 	    ("%s: len > IP_MAXPACKET", __func__));
 #ifdef DIAGNOSTIC
 #ifdef INET6
 	if (max_linkhdr + hdrlen > MCLBYTES)
 #else
 	if (max_linkhdr + hdrlen > MHLEN)
 #endif
 		panic("tcphdr too big");
 #endif
 
 	/*
 	 * This KASSERT is here to catch edge cases at a well defined place.
 	 * Before, those had triggered (random) panic conditions further
 	 * down.
 	 */
 	KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__));
 	if ((len == 0) &&
 	    (flags & TH_FIN) &&
 	    (sbused(sb))) {
 		/*
 		 * We have outstanding data, don't send a fin by itself!.
 		 */
 		goto just_return;
 	}
 	/*
 	 * Grab a header mbuf, attaching a copy of data to be transmitted,
 	 * and initialize the header from the template for sends on this
 	 * connection.
 	 */
 	if (len) {
 		uint32_t max_val;
 		uint32_t moff;
 
 		if (rack->rc_pace_max_segs)
 			max_val = rack->rc_pace_max_segs * tp->t_maxseg;
 		else
 			max_val = len;
 		/*
 		 * We allow a limit on sending with hptsi.
 		 */
 		if (len > max_val) {
 			len = max_val;
 		}
 #ifdef INET6
 		if (MHLEN < hdrlen + max_linkhdr)
 			m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
 		else
 #endif
 			m = m_gethdr(M_NOWAIT, MT_DATA);
 
 		if (m == NULL) {
 			SOCKBUF_UNLOCK(sb);
 			error = ENOBUFS;
 			sack_rxmit = 0;
 			goto out;
 		}
 		m->m_data += max_linkhdr;
 		m->m_len = hdrlen;
 
 		/*
 		 * Start the m_copy functions from the closest mbuf to the
 		 * sb_offset in the socket buffer chain.
 		 */
 		mb = sbsndptr_noadv(sb, sb_offset, &moff);
 		if (len <= MHLEN - hdrlen - max_linkhdr) {
 			m_copydata(mb, moff, (int)len,
 			    mtod(m, caddr_t)+hdrlen);
 			if (SEQ_LT(tp->snd_nxt, tp->snd_max))
 				sbsndptr_adv(sb, mb, len);
 			m->m_len += len;
 		} else {
 			struct sockbuf *msb;
 
 			if (SEQ_LT(tp->snd_nxt, tp->snd_max))
 				msb = NULL;
 			else
 				msb = sb;
-			m->m_next = tcp_m_copym(mb, moff, &len,
-			    if_hw_tsomaxsegcount, if_hw_tsomaxsegsize, msb);
+			m->m_next = tcp_m_copym(/*tp, */ mb, moff, &len,
+			    if_hw_tsomaxsegcount, if_hw_tsomaxsegsize, msb /*, 0, NULL*/);
 			if (len <= (tp->t_maxseg - optlen)) {
-				/* 
+				/*
 				 * Must have ran out of mbufs for the copy
 				 * shorten it to no longer need tso. Lets
 				 * not put on sendalot since we are low on
 				 * mbufs.
 				 */
 				tso = 0;
 			}
 			if (m->m_next == NULL) {
 				SOCKBUF_UNLOCK(sb);
 				(void)m_free(m);
 				error = ENOBUFS;
 				sack_rxmit = 0;
 				goto out;
 			}
 		}
 		if ((tp->t_flags & TF_FORCEDATA) && len == 1) {
 			TCPSTAT_INC(tcps_sndprobe);
 #ifdef NETFLIX_STATS
 			if (SEQ_LT(tp->snd_nxt, tp->snd_max))
 				stats_voi_update_abs_u32(tp->t_stats,
 				    VOI_TCP_RETXPB, len);
 			else
 				stats_voi_update_abs_u64(tp->t_stats,
 				    VOI_TCP_TXPB, len);
 #endif
 		} else if (SEQ_LT(tp->snd_nxt, tp->snd_max) || sack_rxmit) {
 			if (rsm && (rsm->r_flags & RACK_TLP)) {
 				/*
 				 * TLP should not count in retran count, but
 				 * in its own bin
 				 */
+/*				tp->t_sndtlppack++;*/
+/*				tp->t_sndtlpbyte += len;*/
 				counter_u64_add(rack_tlp_retran, 1);
 				counter_u64_add(rack_tlp_retran_bytes, len);
 			} else {
 				tp->t_sndrexmitpack++;
 				TCPSTAT_INC(tcps_sndrexmitpack);
 				TCPSTAT_ADD(tcps_sndrexmitbyte, len);
 			}
 #ifdef NETFLIX_STATS
 			stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RETXPB,
 			    len);
 #endif
 		} else {
 			TCPSTAT_INC(tcps_sndpack);
 			TCPSTAT_ADD(tcps_sndbyte, len);
 #ifdef NETFLIX_STATS
 			stats_voi_update_abs_u64(tp->t_stats, VOI_TCP_TXPB,
 			    len);
 #endif
 		}
 		/*
 		 * If we're sending everything we've got, set PUSH. (This
 		 * will keep happy those implementations which only give
 		 * data to the user when a buffer fills or a PUSH comes in.)
 		 */
 		if (sb_offset + len == sbused(sb) &&
 		    sbused(sb) &&
 		    !(flags & TH_SYN))
 			flags |= TH_PUSH;
 
 		/*
 		 * Are we doing hptsi, if so we must calculate the slot. We
 		 * only do hptsi in ESTABLISHED and with no RESET being
 		 * sent where we have data to send.
 		 */
 		if (((tp->t_state == TCPS_ESTABLISHED) ||
 		    (tp->t_state == TCPS_CLOSE_WAIT) ||
 		    ((tp->t_state == TCPS_FIN_WAIT_1) &&
 		    ((tp->t_flags & TF_SENTFIN) == 0) &&
 		    ((flags & TH_FIN) == 0))) &&
 		    ((flags & TH_RST) == 0) &&
 		    (rack->rc_always_pace)) {
 			/*
 			 * We use the most optimistic possible cwnd/srtt for
 			 * sending calculations. This will make our
 			 * calculation anticipate getting more through
 			 * quicker then possible. But thats ok we don't want
 			 * the peer to have a gap in data sending.
 			 */
 			uint32_t srtt, cwnd, tr_perms = 0;
-	
+
 			if (rack->r_ctl.rc_rack_min_rtt)
 				srtt = rack->r_ctl.rc_rack_min_rtt;
 			else
 				srtt = TICKS_2_MSEC((tp->t_srtt >> TCP_RTT_SHIFT));
 			if (rack->r_ctl.rc_rack_largest_cwnd)
 				cwnd = rack->r_ctl.rc_rack_largest_cwnd;
 			else
 				cwnd = tp->snd_cwnd;
 			tr_perms = cwnd / srtt;
 			if (tr_perms == 0) {
 				tr_perms = tp->t_maxseg;
 			}
 			tot_len_this_send += len;
 			/*
 			 * Calculate how long this will take to drain, if
 			 * the calculation comes out to zero, thats ok we
 			 * will use send_a_lot to possibly spin around for
 			 * more increasing tot_len_this_send to the point
 			 * that its going to require a pace, or we hit the
 			 * cwnd. Which in that case we are just waiting for
 			 * a ACK.
 			 */
 			slot = tot_len_this_send / tr_perms;
 			/* Now do we reduce the time so we don't run dry? */
 			if (slot && rack->rc_pace_reduce) {
 				int32_t reduce;
 
 				reduce = (slot / rack->rc_pace_reduce);
 				if (reduce < slot) {
 					slot -= reduce;
 				} else
 					slot = 0;
 			}
 			if (rack->r_enforce_min_pace &&
 			    (slot == 0) &&
 			    (tot_len_this_send >= (rack->r_min_pace_seg_thresh * tp->t_maxseg))) {
 				/* We are enforcing a minimum pace time of 1ms */
 				slot = rack->r_enforce_min_pace;
 			}
 		}
 		SOCKBUF_UNLOCK(sb);
 	} else {
 		SOCKBUF_UNLOCK(sb);
 		if (tp->t_flags & TF_ACKNOW)
 			TCPSTAT_INC(tcps_sndacks);
 		else if (flags & (TH_SYN | TH_FIN | TH_RST))
 			TCPSTAT_INC(tcps_sndctrl);
 		else if (SEQ_GT(tp->snd_up, tp->snd_una))
 			TCPSTAT_INC(tcps_sndurg);
 		else
 			TCPSTAT_INC(tcps_sndwinup);
 
 		m = m_gethdr(M_NOWAIT, MT_DATA);
 		if (m == NULL) {
 			error = ENOBUFS;
 			sack_rxmit = 0;
 			goto out;
 		}
 #ifdef INET6
 		if (isipv6 && (MHLEN < hdrlen + max_linkhdr) &&
 		    MHLEN >= hdrlen) {
 			M_ALIGN(m, hdrlen);
 		} else
 #endif
 			m->m_data += max_linkhdr;
 		m->m_len = hdrlen;
 	}
 	SOCKBUF_UNLOCK_ASSERT(sb);
 	m->m_pkthdr.rcvif = (struct ifnet *)0;
 #ifdef MAC
 	mac_inpcb_create_mbuf(inp, m);
 #endif
 #ifdef INET6
 	if (isipv6) {
 		ip6 = mtod(m, struct ip6_hdr *);
-#ifdef NETFLIX_TCPOUDP
+#ifdef NETFLIX_TCP_O_UDP
 		if (tp->t_port) {
 			udp = (struct udphdr *)((caddr_t)ip6 + ipoptlen + sizeof(struct ip6_hdr));
 			udp->uh_sport = htons(V_tcp_udp_tunneling_port);
 			udp->uh_dport = tp->t_port;
 			ulen = hdrlen + len - sizeof(struct ip6_hdr);
 			udp->uh_ulen = htons(ulen);
 			th = (struct tcphdr *)(udp + 1);
 		} else
 #endif
 			th = (struct tcphdr *)(ip6 + 1);
-		tcpip_fillheaders(inp, ip6, th);
+		tcpip_fillheaders(inp, /*tp->t_port, */ ip6, th);
 	} else
 #endif				/* INET6 */
 	{
 		ip = mtod(m, struct ip *);
 #ifdef TCPDEBUG
 		ipov = (struct ipovly *)ip;
 #endif
-#ifdef NETFLIX_TCPOUDP
+#ifdef NETFLIX_TCP_O_UDP
 		if (tp->t_port) {
 			udp = (struct udphdr *)((caddr_t)ip + ipoptlen + sizeof(struct ip));
 			udp->uh_sport = htons(V_tcp_udp_tunneling_port);
 			udp->uh_dport = tp->t_port;
 			ulen = hdrlen + len - sizeof(struct ip);
 			udp->uh_ulen = htons(ulen);
 			th = (struct tcphdr *)(udp + 1);
 		} else
 #endif
 			th = (struct tcphdr *)(ip + 1);
-		tcpip_fillheaders(inp, ip, th);
+		tcpip_fillheaders(inp,/*tp->t_port, */ ip, th);
 	}
 	/*
 	 * Fill in fields, remembering maximum advertised window for use in
 	 * delaying messages about window sizes. If resending a FIN, be sure
 	 * not to use a new sequence number.
 	 */
 	if (flags & TH_FIN && tp->t_flags & TF_SENTFIN &&
 	    tp->snd_nxt == tp->snd_max)
 		tp->snd_nxt--;
 	/*
 	 * If we are starting a connection, send ECN setup SYN packet. If we
 	 * are on a retransmit, we may resend those bits a number of times
 	 * as per RFC 3168.
 	 */
 	if (tp->t_state == TCPS_SYN_SENT && V_tcp_do_ecn == 1) {
 		if (tp->t_rxtshift >= 1) {
 			if (tp->t_rxtshift <= V_tcp_ecn_maxretries)
 				flags |= TH_ECE | TH_CWR;
 		} else
 			flags |= TH_ECE | TH_CWR;
 	}
 	if (tp->t_state == TCPS_ESTABLISHED &&
 	    (tp->t_flags & TF_ECN_PERMIT)) {
 		/*
 		 * If the peer has ECN, mark data packets with ECN capable
 		 * transmission (ECT). Ignore pure ack packets,
 		 * retransmissions and window probes.
 		 */
 		if (len > 0 && SEQ_GEQ(tp->snd_nxt, tp->snd_max) &&
 		    !((tp->t_flags & TF_FORCEDATA) && len == 1)) {
 #ifdef INET6
 			if (isipv6)
 				ip6->ip6_flow |= htonl(IPTOS_ECN_ECT0 << 20);
 			else
 #endif
 				ip->ip_tos |= IPTOS_ECN_ECT0;
 			TCPSTAT_INC(tcps_ecn_ect0);
 			/*
 			 * Reply with proper ECN notifications.
 			 * Only set CWR on new data segments.
 			 */
 			if (tp->t_flags & TF_ECN_SND_CWR) {
 				flags |= TH_CWR;
 				tp->t_flags &= ~TF_ECN_SND_CWR;
 			}
 		}
 		if (tp->t_flags & TF_ECN_SND_ECE)
 			flags |= TH_ECE;
 	}
 	/*
 	 * If we are doing retransmissions, then snd_nxt will not reflect
 	 * the first unsent octet.  For ACK only packets, we do not want the
 	 * sequence number of the retransmitted packet, we want the sequence
 	 * number of the next unsent octet.  So, if there is no data (and no
 	 * SYN or FIN), use snd_max instead of snd_nxt when filling in
 	 * ti_seq.  But if we are in persist state, snd_max might reflect
 	 * one byte beyond the right edge of the window, so use snd_nxt in
 	 * that case, since we know we aren't doing a retransmission.
 	 * (retransmit and persist are mutually exclusive...)
 	 */
 	if (sack_rxmit == 0) {
 		if (len || (flags & (TH_SYN | TH_FIN)) ||
 		    rack->rc_in_persist) {
 			th->th_seq = htonl(tp->snd_nxt);
 			rack_seq = tp->snd_nxt;
 		} else if (flags & TH_RST) {
 			/*
 			 * For a Reset send the last cum ack in sequence
 			 * (this like any other choice may still generate a
 			 * challenge ack, if a ack-update packet is in
 			 * flight).
 			 */
 			th->th_seq = htonl(tp->snd_una);
 			rack_seq = tp->snd_una;
 		} else {
 			th->th_seq = htonl(tp->snd_max);
 			rack_seq = tp->snd_max;
 		}
 	} else {
 		th->th_seq = htonl(rsm->r_start);
 		rack_seq = rsm->r_start;
 	}
 	th->th_ack = htonl(tp->rcv_nxt);
 	if (optlen) {
 		bcopy(opt, th + 1, optlen);
 		th->th_off = (sizeof(struct tcphdr) + optlen) >> 2;
 	}
 	th->th_flags = flags;
 	/*
 	 * Calculate receive window.  Don't shrink window, but avoid silly
 	 * window syndrome.
-	 * If a RST segment is sent, advertise a window of zero.
 	 */
-	if (flags & TH_RST) {
+	if (recwin < (long)(so->so_rcv.sb_hiwat / 4) &&
+	    recwin < (long)tp->t_maxseg)
 		recwin = 0;
-	} else {
-		if (recwin < (long)(so->so_rcv.sb_hiwat / 4) &&
-		    recwin < (long)tp->t_maxseg)
-			recwin = 0;
-		if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt) &&
-		    recwin < (long)(tp->rcv_adv - tp->rcv_nxt))
-			recwin = (long)(tp->rcv_adv - tp->rcv_nxt);
-		if (recwin > (long)TCP_MAXWIN << tp->rcv_scale)
-			recwin = (long)TCP_MAXWIN << tp->rcv_scale;
-	}
+	if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt) &&
+	    recwin < (long)(tp->rcv_adv - tp->rcv_nxt))
+		recwin = (long)(tp->rcv_adv - tp->rcv_nxt);
+	if (recwin > (long)TCP_MAXWIN << tp->rcv_scale)
+		recwin = (long)TCP_MAXWIN << tp->rcv_scale;
 
 	/*
 	 * According to RFC1323 the window field in a SYN (i.e., a <SYN> or
 	 * <SYN,ACK>) segment itself is never scaled.  The <SYN,ACK> case is
 	 * handled in syncache.
 	 */
 	if (flags & TH_SYN)
 		th->th_win = htons((u_short)
 		    (min(sbspace(&so->so_rcv), TCP_MAXWIN)));
 	else {
 		/* Avoid shrinking window with window scaling. */
 		recwin = roundup2(recwin, 1 << tp->rcv_scale);
 		th->th_win = htons((u_short)(recwin >> tp->rcv_scale));
 	}
 	/*
 	 * Adjust the RXWIN0SENT flag - indicate that we have advertised a 0
 	 * window.  This may cause the remote transmitter to stall.  This
 	 * flag tells soreceive() to disable delayed acknowledgements when
 	 * draining the buffer.  This can occur if the receiver is
 	 * attempting to read more data than can be buffered prior to
 	 * transmitting on the connection.
 	 */
 	if (th->th_win == 0) {
 		tp->t_sndzerowin++;
 		tp->t_flags |= TF_RXWIN0SENT;
 	} else
 		tp->t_flags &= ~TF_RXWIN0SENT;
 	if (SEQ_GT(tp->snd_up, tp->snd_nxt)) {
 		th->th_urp = htons((u_short)(tp->snd_up - tp->snd_nxt));
 		th->th_flags |= TH_URG;
 	} else
 		/*
 		 * If no urgent pointer to send, then we pull the urgent
 		 * pointer to the left edge of the send window so that it
 		 * doesn't drift into the send window on sequence number
 		 * wraparound.
 		 */
 		tp->snd_up = tp->snd_una;	/* drag it along */
 
 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
 	if (to.to_flags & TOF_SIGNATURE) {
 		/*
 		 * Calculate MD5 signature and put it into the place
 		 * determined before.
 		 * NOTE: since TCP options buffer doesn't point into
 		 * mbuf's data, calculate offset and use it.
 		 */
 		if (!TCPMD5_ENABLED() || TCPMD5_OUTPUT(m, th,
 		    (u_char *)(th + 1) + (to.to_signature - opt)) != 0) {
 			/*
 			 * Do not send segment if the calculation of MD5
 			 * digest has failed.
 			 */
 			goto out;
 		}
 	}
 #endif
 
 	/*
 	 * Put TCP length in extended header, and then checksum extended
 	 * header and data.
 	 */
 	m->m_pkthdr.len = hdrlen + len;	/* in6_cksum() need this */
 #ifdef INET6
 	if (isipv6) {
 		/*
 		 * ip6_plen is not need to be filled now, and will be filled
 		 * in ip6_output.
 		 */
+#ifdef NETFLIX_TCP_O_UDP
 		if (tp->t_port) {
 			m->m_pkthdr.csum_flags = CSUM_UDP_IPV6;
 			m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
 			udp->uh_sum = in6_cksum_pseudo(ip6, ulen, IPPROTO_UDP, 0);
 			th->th_sum = htons(0);
+			UDPSTAT_INC(udps_opackets);
 		} else {
+#endif
 			m->m_pkthdr.csum_flags = CSUM_TCP_IPV6;
 			m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
 			th->th_sum = in6_cksum_pseudo(ip6,
 			    sizeof(struct tcphdr) + optlen + len, IPPROTO_TCP,
 			    0);
+#ifdef NETFLIX_TCP_O_UDP
 		}
+#endif
 	}
 #endif
 #if defined(INET6) && defined(INET)
 	else
 #endif
 #ifdef INET
 	{
+#ifdef NETFLIX_TCP_O_UDP
 		if (tp->t_port) {
 			m->m_pkthdr.csum_flags = CSUM_UDP;
 			m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
 			udp->uh_sum = in_pseudo(ip->ip_src.s_addr,
 			   ip->ip_dst.s_addr, htons(ulen + IPPROTO_UDP));
 			th->th_sum = htons(0);
+			UDPSTAT_INC(udps_opackets);
 		} else {
+#endif
 			m->m_pkthdr.csum_flags = CSUM_TCP;
 			m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
 			th->th_sum = in_pseudo(ip->ip_src.s_addr,
 			    ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) +
 			    IPPROTO_TCP + len + optlen));
+#ifdef NETFLIX_TCP_O_UDP
 		}
+#endif
 		/* IP version must be set here for ipv4/ipv6 checking later */
 		KASSERT(ip->ip_v == IPVERSION,
 		    ("%s: IP version incorrect: %d", __func__, ip->ip_v));
 	}
 #endif
 
 	/*
 	 * Enable TSO and specify the size of the segments. The TCP pseudo
 	 * header checksum is always provided. XXX: Fixme: This is currently
 	 * not the case for IPv6.
 	 */
 	if (tso) {
 		KASSERT(len > tp->t_maxseg - optlen,
 		    ("%s: len <= tso_segsz", __func__));
 		m->m_pkthdr.csum_flags |= CSUM_TSO;
 		m->m_pkthdr.tso_segsz = tp->t_maxseg - optlen;
 	}
 	KASSERT(len + hdrlen == m_length(m, NULL),
 	    ("%s: mbuf chain different than expected: %d + %u != %u",
 	    __func__, len, hdrlen, m_length(m, NULL)));
 
 #ifdef TCP_HHOOK
 	/* Run HHOOK_TCP_ESTABLISHED_OUT helper hooks. */
 	hhook_run_tcp_est_out(tp, th, &to, len, tso);
 #endif
 
 #ifdef TCPDEBUG
 	/*
 	 * Trace.
 	 */
 	if (so->so_options & SO_DEBUG) {
 		u_short save = 0;
 
 #ifdef INET6
 		if (!isipv6)
 #endif
 		{
 			save = ipov->ih_len;
 			ipov->ih_len = htons(m->m_pkthdr.len	/* - hdrlen +
 			      * (th->th_off << 2) */ );
 		}
 		tcp_trace(TA_OUTPUT, tp->t_state, tp, mtod(m, void *), th, 0);
 #ifdef INET6
 		if (!isipv6)
 #endif
 			ipov->ih_len = save;
 	}
 #endif				/* TCPDEBUG */
 
 	/* We're getting ready to send; log now. */
 	if (tp->t_logstate != TCP_LOG_STATE_OFF) {
 		union tcp_log_stackspecific log;
 
 		memset(&log.u_bbr, 0, sizeof(log.u_bbr));
 		log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
 		log.u_bbr.ininput = rack->rc_inp->inp_in_input;
 		log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt;
 		if (rsm || sack_rxmit) {
 			log.u_bbr.flex8 = 1;
 		} else {
 			log.u_bbr.flex8 = 0;
 		}
 		lgb = tcp_log_event_(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_OUT, ERRNO_UNK,
 		    len, &log, false, NULL, NULL, 0, NULL);
 	} else
 		lgb = NULL;
 
 	/*
 	 * Fill in IP length and desired time to live and send to IP level.
 	 * There should be a better way to handle ttl and tos; we could keep
 	 * them in the template, but need a way to checksum without them.
 	 */
 	/*
 	 * m->m_pkthdr.len should have been set before cksum calcuration,
 	 * because in6_cksum() need it.
 	 */
 #ifdef INET6
 	if (isipv6) {
 		/*
 		 * we separately set hoplimit for every segment, since the
 		 * user might want to change the value via setsockopt. Also,
 		 * desired default hop limit might be changed via Neighbor
 		 * Discovery.
 		 */
 		ip6->ip6_hlim = in6_selecthlim(inp, NULL);
 
 		/*
 		 * Set the packet size here for the benefit of DTrace
 		 * probes. ip6_output() will set it properly; it's supposed
 		 * to include the option header lengths as well.
 		 */
 		ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(*ip6));
 
 		if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss)
 			tp->t_flags2 |= TF2_PLPMTU_PMTUD;
 		else
 			tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
 
 		if (tp->t_state == TCPS_SYN_SENT)
 			TCP_PROBE5(connect__request, NULL, tp, ip6, tp, th);
 
 		TCP_PROBE5(send, NULL, tp, ip6, tp, th);
 		/* TODO: IPv6 IP6TOS_ECT bit on */
 		error = ip6_output(m, tp->t_inpcb->in6p_outputopts,
 		    &inp->inp_route6,
 		    ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0),
 		    NULL, NULL, inp);
 
 		if (error == EMSGSIZE && inp->inp_route6.ro_rt != NULL)
 			mtu = inp->inp_route6.ro_rt->rt_mtu;
 	}
 #endif				/* INET6 */
 #if defined(INET) && defined(INET6)
 	else
 #endif
 #ifdef INET
 	{
 		ip->ip_len = htons(m->m_pkthdr.len);
 #ifdef INET6
 		if (inp->inp_vflag & INP_IPV6PROTO)
 			ip->ip_ttl = in6_selecthlim(inp, NULL);
 #endif				/* INET6 */
 		/*
 		 * If we do path MTU discovery, then we set DF on every
 		 * packet. This might not be the best thing to do according
 		 * to RFC3390 Section 2. However the tcp hostcache migitates
 		 * the problem so it affects only the first tcp connection
 		 * with a host.
 		 *
 		 * NB: Don't set DF on small MTU/MSS to have a safe
 		 * fallback.
 		 */
 		if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) {
 			tp->t_flags2 |= TF2_PLPMTU_PMTUD;
 			if (tp->t_port == 0 || len < V_tcp_minmss) {
 				ip->ip_off |= htons(IP_DF);
 			}
 		} else {
 			tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
 		}
 
 		if (tp->t_state == TCPS_SYN_SENT)
 			TCP_PROBE5(connect__request, NULL, tp, ip, tp, th);
 
 		TCP_PROBE5(send, NULL, tp, ip, tp, th);
 
 		error = ip_output(m, tp->t_inpcb->inp_options, &inp->inp_route,
 		    ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0), 0,
 		    inp);
 		if (error == EMSGSIZE && inp->inp_route.ro_rt != NULL)
 			mtu = inp->inp_route.ro_rt->rt_mtu;
 	}
 #endif				/* INET */
 
 out:
 	if (lgb) {
 		lgb->tlb_errno = error;
 		lgb = NULL;
 	}
 	/*
 	 * In transmit state, time the transmission and arrange for the
 	 * retransmit.  In persist state, just set snd_max.
 	 */
 	if (error == 0) {
 		if (TCPS_HAVEESTABLISHED(tp->t_state) &&
 		    (tp->t_flags & TF_SACK_PERMIT) &&
 		    tp->rcv_numsacks > 0)
 		    tcp_clean_dsack_blocks(tp);
 		if (len == 0)
 			counter_u64_add(rack_out_size[TCP_MSS_ACCT_SNDACK], 1);
 		else if (len == 1) {
 			counter_u64_add(rack_out_size[TCP_MSS_ACCT_PERSIST], 1);
 		} else if (len > 1) {
 			int idx;
 
 			idx = (len / tp->t_maxseg) + 3;
 			if (idx >= TCP_MSS_ACCT_ATIMER)
 				counter_u64_add(rack_out_size[(TCP_MSS_ACCT_ATIMER-1)], 1);
 			else
 				counter_u64_add(rack_out_size[idx], 1);
 		}
 	}
 	if (sub_from_prr && (error == 0)) {
-		rack->r_ctl.rc_prr_sndcnt -= len;
+		if (rack->r_ctl.rc_prr_sndcnt >= len)
+			rack->r_ctl.rc_prr_sndcnt -= len;
+		else
+			rack->r_ctl.rc_prr_sndcnt = 0;
 	}
 	sub_from_prr = 0;
 	rack_log_output(tp, &to, len, rack_seq, (uint8_t) flags, error, cts,
 	    pass, rsm);
 	if ((tp->t_flags & TF_FORCEDATA) == 0 ||
 	    (rack->rc_in_persist == 0)) {
+#ifdef NETFLIX_STATS
 		tcp_seq startseq = tp->snd_nxt;
-
+#endif
 		/*
 		 * Advance snd_nxt over sequence space of this segment.
 		 */
 		if (error)
 			/* We don't log or do anything with errors */
 			goto timer;
 
 		if (flags & (TH_SYN | TH_FIN)) {
 			if (flags & TH_SYN)
 				tp->snd_nxt++;
 			if (flags & TH_FIN) {
 				tp->snd_nxt++;
 				tp->t_flags |= TF_SENTFIN;
 			}
 		}
 		/* In the ENOBUFS case we do *not* update snd_max */
 		if (sack_rxmit)
 			goto timer;
 
 		tp->snd_nxt += len;
 		if (SEQ_GT(tp->snd_nxt, tp->snd_max)) {
 			if (tp->snd_una == tp->snd_max) {
 				/*
 				 * Update the time we just added data since
 				 * none was outstanding.
 				 */
 				rack_log_progress_event(rack, tp, ticks, PROGRESS_START, __LINE__);
 				tp->t_acktime = ticks;
 			}
 			tp->snd_max = tp->snd_nxt;
-			/*
-			 * Time this transmission if not a retransmission and
-			 * not currently timing anything.
-			 * This is only relevant in case of switching back to
-			 * the base stack.
-			 */
-			if (tp->t_rtttime == 0) {
-				tp->t_rtttime = ticks;
-				tp->t_rtseq = startseq;
-				TCPSTAT_INC(tcps_segstimed);
-			}
 #ifdef NETFLIX_STATS
 			if (!(tp->t_flags & TF_GPUTINPROG) && len) {
 				tp->t_flags |= TF_GPUTINPROG;
 				tp->gput_seq = startseq;
 				tp->gput_ack = startseq +
 				    ulmin(sbavail(sb) - sb_offset, sendwin);
 				tp->gput_ts = tcp_ts_getticks();
 			}
 #endif
 		}
 		/*
 		 * Set retransmit timer if not currently set, and not doing
 		 * a pure ack or a keep-alive probe. Initial value for
 		 * retransmit timer is smoothed round-trip time + 2 *
 		 * round-trip time variance. Initialize shift counter which
 		 * is used for backoff of retransmit time.
 		 */
 timer:
 		if ((tp->snd_wnd == 0) &&
 		    TCPS_HAVEESTABLISHED(tp->t_state)) {
 			/*
 			 * If the persists timer was set above (right before
 			 * the goto send), and still needs to be on. Lets
 			 * make sure all is canceled. If the persist timer
 			 * is not running, we want to get it up.
 			 */
 			if (rack->rc_in_persist == 0) {
 				rack_enter_persist(tp, rack, cts);
 			}
 		}
 	} else {
 		/*
 		 * Persist case, update snd_max but since we are in persist
 		 * mode (no window) we do not update snd_nxt.
 		 */
 		int32_t xlen = len;
 
 		if (error)
 			goto nomore;
 
 		if (flags & TH_SYN)
 			++xlen;
 		if (flags & TH_FIN) {
 			++xlen;
 			tp->t_flags |= TF_SENTFIN;
 		}
 		/* In the ENOBUFS case we do *not* update snd_max */
 		if (SEQ_GT(tp->snd_nxt + xlen, tp->snd_max)) {
 			if (tp->snd_una == tp->snd_max) {
 				/*
 				 * Update the time we just added data since
 				 * none was outstanding.
 				 */
 				rack_log_progress_event(rack, tp, ticks, PROGRESS_START, __LINE__);
 				tp->t_acktime = ticks;
 			}
 			tp->snd_max = tp->snd_nxt + len;
 		}
 	}
 nomore:
 	if (error) {
 		SOCKBUF_UNLOCK_ASSERT(sb);	/* Check gotos. */
 		/*
 		 * Failures do not advance the seq counter above. For the
 		 * case of ENOBUFS we will fall out and retry in 1ms with
 		 * the hpts. Everything else will just have to retransmit
 		 * with the timer.
 		 *
 		 * In any case, we do not want to loop around for another
 		 * send without a good reason.
 		 */
 		sendalot = 0;
 		switch (error) {
 		case EPERM:
 			tp->t_flags &= ~TF_FORCEDATA;
 			tp->t_softerror = error;
 			return (error);
 		case ENOBUFS:
 			if (slot == 0) {
 				/*
 				 * Pace us right away to retry in a some
 				 * time
 				 */
 				slot = 1 + rack->rc_enobuf;
 				if (rack->rc_enobuf < 255)
 					rack->rc_enobuf++;
 				if (slot > (rack->rc_rack_rtt / 2)) {
 					slot = rack->rc_rack_rtt / 2;
 				}
 				if (slot < 10)
 					slot = 10;
 			}
 			counter_u64_add(rack_saw_enobuf, 1);
 			error = 0;
 			goto enobufs;
 		case EMSGSIZE:
 			/*
 			 * For some reason the interface we used initially
 			 * to send segments changed to another or lowered
 			 * its MTU. If TSO was active we either got an
 			 * interface without TSO capabilits or TSO was
 			 * turned off. If we obtained mtu from ip_output()
 			 * then update it and try again.
 			 */
 			if (tso)
 				tp->t_flags &= ~TF_TSO;
 			if (mtu != 0) {
 				tcp_mss_update(tp, -1, mtu, NULL, NULL);
 				goto again;
 			}
 			slot = 10;
 			rack_start_hpts_timer(rack, tp, cts, __LINE__, slot, 0, 1);
 			tp->t_flags &= ~TF_FORCEDATA;
 			return (error);
 		case ENETUNREACH:
 			counter_u64_add(rack_saw_enetunreach, 1);
 		case EHOSTDOWN:
 		case EHOSTUNREACH:
 		case ENETDOWN:
 			if (TCPS_HAVERCVDSYN(tp->t_state)) {
 				tp->t_softerror = error;
 			}
 			/* FALLTHROUGH */
 		default:
 			slot = 10;
 			rack_start_hpts_timer(rack, tp, cts, __LINE__, slot, 0, 1);
 			tp->t_flags &= ~TF_FORCEDATA;
 			return (error);
 		}
 	} else {
 		rack->rc_enobuf = 0;
 	}
 	TCPSTAT_INC(tcps_sndtotal);
 
 	/*
 	 * Data sent (as far as we can tell). If this advertises a larger
 	 * window than any other segment, then remember the size of the
 	 * advertised window. Any pending ACK has now been sent.
 	 */
 	if (recwin > 0 && SEQ_GT(tp->rcv_nxt + recwin, tp->rcv_adv))
 		tp->rcv_adv = tp->rcv_nxt + recwin;
 	tp->last_ack_sent = tp->rcv_nxt;
 	tp->t_flags &= ~(TF_ACKNOW | TF_DELACK);
 enobufs:
 	rack->r_tlp_running = 0;
 	if ((flags & TH_RST) || (would_have_fin == 1)) {
 		/*
 		 * We don't send again after a RST. We also do *not* send
 		 * again if we would have had a find, but now have
 		 * outstanding data.
 		 */
 		slot = 0;
 		sendalot = 0;
 	}
 	if (slot) {
 		/* set the rack tcb into the slot N */
 		counter_u64_add(rack_paced_segments, 1);
 	} else if (sendalot) {
 		if (len)
 			counter_u64_add(rack_unpaced_segments, 1);
 		sack_rxmit = 0;
 		tp->t_flags &= ~TF_FORCEDATA;
 		goto again;
 	} else if (len) {
 		counter_u64_add(rack_unpaced_segments, 1);
 	}
 	tp->t_flags &= ~TF_FORCEDATA;
 	rack_start_hpts_timer(rack, tp, cts, __LINE__, slot, tot_len_this_send, 1);
 	return (error);
 }
 
 /*
  * rack_ctloutput() must drop the inpcb lock before performing copyin on
  * socket option arguments.  When it re-acquires the lock after the copy, it
  * has to revalidate that the connection is still valid for the socket
  * option.
  */
 static int
 rack_set_sockopt(struct socket *so, struct sockopt *sopt,
     struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack)
 {
 	int32_t error = 0, optval;
 
 	switch (sopt->sopt_name) {
 	case TCP_RACK_PROP_RATE:
 	case TCP_RACK_PROP:
 	case TCP_RACK_TLP_REDUCE:
 	case TCP_RACK_EARLY_RECOV:
 	case TCP_RACK_PACE_ALWAYS:
 	case TCP_DELACK:
 	case TCP_RACK_PACE_REDUCE:
 	case TCP_RACK_PACE_MAX_SEG:
 	case TCP_RACK_PRR_SENDALOT:
 	case TCP_RACK_MIN_TO:
 	case TCP_RACK_EARLY_SEG:
 	case TCP_RACK_REORD_THRESH:
 	case TCP_RACK_REORD_FADE:
 	case TCP_RACK_TLP_THRESH:
 	case TCP_RACK_PKT_DELAY:
 	case TCP_RACK_TLP_USE:
 	case TCP_RACK_TLP_INC_VAR:
 	case TCP_RACK_IDLE_REDUCE_HIGH:
 	case TCP_RACK_MIN_PACE:
 	case TCP_RACK_MIN_PACE_SEG:
 	case TCP_BBR_RACK_RTT_USE:
 	case TCP_DATA_AFTER_CLOSE:
 		break;
 	default:
 		return (tcp_default_ctloutput(so, sopt, inp, tp));
 		break;
 	}
 	INP_WUNLOCK(inp);
 	error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval));
 	if (error)
 		return (error);
 	INP_WLOCK(inp);
 	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
 		INP_WUNLOCK(inp);
 		return (ECONNRESET);
 	}
 	tp = intotcpcb(inp);
 	rack = (struct tcp_rack *)tp->t_fb_ptr;
 	switch (sopt->sopt_name) {
 	case TCP_RACK_PROP_RATE:
 		if ((optval <= 0) || (optval >= 100)) {
 			error = EINVAL;
 			break;
 		}
 		RACK_OPTS_INC(tcp_rack_prop_rate);
 		rack->r_ctl.rc_prop_rate = optval;
 		break;
 	case TCP_RACK_TLP_USE:
 		if ((optval < TLP_USE_ID) || (optval > TLP_USE_TWO_TWO)) {
 			error = EINVAL;
 			break;
 		}
 		RACK_OPTS_INC(tcp_tlp_use);
 		rack->rack_tlp_threshold_use = optval;
 		break;
 	case TCP_RACK_PROP:
 		/* RACK proportional rate reduction (bool) */
 		RACK_OPTS_INC(tcp_rack_prop);
 		rack->r_ctl.rc_prop_reduce = optval;
 		break;
 	case TCP_RACK_TLP_REDUCE:
 		/* RACK TLP cwnd reduction (bool) */
 		RACK_OPTS_INC(tcp_rack_tlp_reduce);
 		rack->r_ctl.rc_tlp_cwnd_reduce = optval;
 		break;
 	case TCP_RACK_EARLY_RECOV:
 		/* Should recovery happen early (bool) */
 		RACK_OPTS_INC(tcp_rack_early_recov);
 		rack->r_ctl.rc_early_recovery = optval;
 		break;
 	case TCP_RACK_PACE_ALWAYS:
 		/* Use the always pace method (bool)  */
 		RACK_OPTS_INC(tcp_rack_pace_always);
 		if (optval > 0)
 			rack->rc_always_pace = 1;
 		else
 			rack->rc_always_pace = 0;
 		break;
 	case TCP_RACK_PACE_REDUCE:
 		/* RACK Hptsi reduction factor (divisor) */
 		RACK_OPTS_INC(tcp_rack_pace_reduce);
 		if (optval)
 			/* Must be non-zero */
 			rack->rc_pace_reduce = optval;
 		else
 			error = EINVAL;
 		break;
 	case TCP_RACK_PACE_MAX_SEG:
 		/* Max segments in a pace */
 		RACK_OPTS_INC(tcp_rack_max_seg);
 		rack->rc_pace_max_segs = optval;
 		break;
 	case TCP_RACK_PRR_SENDALOT:
 		/* Allow PRR to send more than one seg */
 		RACK_OPTS_INC(tcp_rack_prr_sendalot);
 		rack->r_ctl.rc_prr_sendalot = optval;
 		break;
 	case TCP_RACK_MIN_TO:
 		/* Minimum time between rack t-o's in ms */
 		RACK_OPTS_INC(tcp_rack_min_to);
 		rack->r_ctl.rc_min_to = optval;
 		break;
 	case TCP_RACK_EARLY_SEG:
 		/* If early recovery max segments */
 		RACK_OPTS_INC(tcp_rack_early_seg);
 		rack->r_ctl.rc_early_recovery_segs = optval;
 		break;
 	case TCP_RACK_REORD_THRESH:
 		/* RACK reorder threshold (shift amount) */
 		RACK_OPTS_INC(tcp_rack_reord_thresh);
 		if ((optval > 0) && (optval < 31))
 			rack->r_ctl.rc_reorder_shift = optval;
 		else
 			error = EINVAL;
 		break;
 	case TCP_RACK_REORD_FADE:
 		/* Does reordering fade after ms time */
 		RACK_OPTS_INC(tcp_rack_reord_fade);
 		rack->r_ctl.rc_reorder_fade = optval;
 		break;
 	case TCP_RACK_TLP_THRESH:
 		/* RACK TLP theshold i.e. srtt+(srtt/N) */
 		RACK_OPTS_INC(tcp_rack_tlp_thresh);
 		if (optval)
 			rack->r_ctl.rc_tlp_threshold = optval;
 		else
 			error = EINVAL;
 		break;
 	case TCP_RACK_PKT_DELAY:
 		/* RACK added ms i.e. rack-rtt + reord + N */
 		RACK_OPTS_INC(tcp_rack_pkt_delay);
 		rack->r_ctl.rc_pkt_delay = optval;
 		break;
 	case TCP_RACK_TLP_INC_VAR:
 		/* Does TLP include rtt variance in t-o */
 		RACK_OPTS_INC(tcp_rack_tlp_inc_var);
 		rack->r_ctl.rc_prr_inc_var = optval;
 		break;
 	case TCP_RACK_IDLE_REDUCE_HIGH:
 		RACK_OPTS_INC(tcp_rack_idle_reduce_high);
 		if (optval)
 			rack->r_idle_reduce_largest = 1;
 		else
 			rack->r_idle_reduce_largest = 0;
 		break;
 	case TCP_DELACK:
 		if (optval == 0)
 			tp->t_delayed_ack = 0;
 		else
 			tp->t_delayed_ack = 1;
 		if (tp->t_flags & TF_DELACK) {
 			tp->t_flags &= ~TF_DELACK;
 			tp->t_flags |= TF_ACKNOW;
 			rack_output(tp);
 		}
 		break;
 	case TCP_RACK_MIN_PACE:
 		RACK_OPTS_INC(tcp_rack_min_pace);
 		if (optval > 3)
 			rack->r_enforce_min_pace = 3;
 		else
 			rack->r_enforce_min_pace = optval;
 		break;
 	case TCP_RACK_MIN_PACE_SEG:
 		RACK_OPTS_INC(tcp_rack_min_pace_seg);
 		if (optval >= 16)
 			rack->r_min_pace_seg_thresh = 15;
 		else
 			rack->r_min_pace_seg_thresh = optval;
 		break;
 	case TCP_BBR_RACK_RTT_USE:
 		if ((optval != USE_RTT_HIGH) &&
 		    (optval != USE_RTT_LOW) &&
 		    (optval != USE_RTT_AVG))
 			error = EINVAL;
 		else
 			rack->r_ctl.rc_rate_sample_method = optval;
 		break;
 	case TCP_DATA_AFTER_CLOSE:
 		if (optval)
 			rack->rc_allow_data_af_clo = 1;
 		else
 			rack->rc_allow_data_af_clo = 0;
 		break;
 	default:
 		return (tcp_default_ctloutput(so, sopt, inp, tp));
 		break;
 	}
-#ifdef NETFLIX_STATS
-	tcp_log_socket_option(tp, sopt->sopt_name, optval, error);
-#endif
+/*	tcp_log_socket_option(tp, sopt->sopt_name, optval, error);*/
 	INP_WUNLOCK(inp);
 	return (error);
 }
 
 static int
 rack_get_sockopt(struct socket *so, struct sockopt *sopt,
     struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack)
 {
 	int32_t error, optval;
 
 	/*
 	 * Because all our options are either boolean or an int, we can just
 	 * pull everything into optval and then unlock and copy. If we ever
 	 * add a option that is not a int, then this will have quite an
 	 * impact to this routine.
 	 */
 	switch (sopt->sopt_name) {
 	case TCP_RACK_PROP_RATE:
 		optval = rack->r_ctl.rc_prop_rate;
 		break;
 	case TCP_RACK_PROP:
 		/* RACK proportional rate reduction (bool) */
 		optval = rack->r_ctl.rc_prop_reduce;
 		break;
 	case TCP_RACK_TLP_REDUCE:
 		/* RACK TLP cwnd reduction (bool) */
 		optval = rack->r_ctl.rc_tlp_cwnd_reduce;
 		break;
 	case TCP_RACK_EARLY_RECOV:
 		/* Should recovery happen early (bool) */
 		optval = rack->r_ctl.rc_early_recovery;
 		break;
 	case TCP_RACK_PACE_REDUCE:
 		/* RACK Hptsi reduction factor (divisor) */
 		optval = rack->rc_pace_reduce;
 		break;
 	case TCP_RACK_PACE_MAX_SEG:
 		/* Max segments in a pace */
 		optval = rack->rc_pace_max_segs;
 		break;
 	case TCP_RACK_PACE_ALWAYS:
 		/* Use the always pace method */
 		optval = rack->rc_always_pace;
 		break;
 	case TCP_RACK_PRR_SENDALOT:
 		/* Allow PRR to send more than one seg */
 		optval = rack->r_ctl.rc_prr_sendalot;
 		break;
 	case TCP_RACK_MIN_TO:
 		/* Minimum time between rack t-o's in ms */
 		optval = rack->r_ctl.rc_min_to;
 		break;
 	case TCP_RACK_EARLY_SEG:
 		/* If early recovery max segments */
 		optval = rack->r_ctl.rc_early_recovery_segs;
 		break;
 	case TCP_RACK_REORD_THRESH:
 		/* RACK reorder threshold (shift amount) */
 		optval = rack->r_ctl.rc_reorder_shift;
 		break;
 	case TCP_RACK_REORD_FADE:
 		/* Does reordering fade after ms time */
 		optval = rack->r_ctl.rc_reorder_fade;
 		break;
 	case TCP_RACK_TLP_THRESH:
 		/* RACK TLP theshold i.e. srtt+(srtt/N) */
 		optval = rack->r_ctl.rc_tlp_threshold;
 		break;
 	case TCP_RACK_PKT_DELAY:
 		/* RACK added ms i.e. rack-rtt + reord + N */
 		optval = rack->r_ctl.rc_pkt_delay;
 		break;
 	case TCP_RACK_TLP_USE:
 		optval = rack->rack_tlp_threshold_use;
 		break;
 	case TCP_RACK_TLP_INC_VAR:
 		/* Does TLP include rtt variance in t-o */
 		optval = rack->r_ctl.rc_prr_inc_var;
 		break;
 	case TCP_RACK_IDLE_REDUCE_HIGH:
 		optval = rack->r_idle_reduce_largest;
 		break;
 	case TCP_RACK_MIN_PACE:
 		optval = rack->r_enforce_min_pace;
 		break;
 	case TCP_RACK_MIN_PACE_SEG:
 		optval = rack->r_min_pace_seg_thresh;
 		break;
 	case TCP_BBR_RACK_RTT_USE:
 		optval = rack->r_ctl.rc_rate_sample_method;
 		break;
 	case TCP_DELACK:
 		optval = tp->t_delayed_ack;
 		break;
 	case TCP_DATA_AFTER_CLOSE:
 		optval = rack->rc_allow_data_af_clo;
 		break;
 	default:
 		return (tcp_default_ctloutput(so, sopt, inp, tp));
 		break;
 	}
 	INP_WUNLOCK(inp);
 	error = sooptcopyout(sopt, &optval, sizeof optval);
 	return (error);
 }
 
 static int
 rack_ctloutput(struct socket *so, struct sockopt *sopt, struct inpcb *inp, struct tcpcb *tp)
 {
 	int32_t error = EINVAL;
 	struct tcp_rack *rack;
 
 	rack = (struct tcp_rack *)tp->t_fb_ptr;
 	if (rack == NULL) {
 		/* Huh? */
 		goto out;
 	}
 	if (sopt->sopt_dir == SOPT_SET) {
 		return (rack_set_sockopt(so, sopt, inp, tp, rack));
 	} else if (sopt->sopt_dir == SOPT_GET) {
 		return (rack_get_sockopt(so, sopt, inp, tp, rack));
 	}
 out:
 	INP_WUNLOCK(inp);
 	return (error);
 }
 
 
 struct tcp_function_block __tcp_rack = {
 	.tfb_tcp_block_name = __XSTRING(STACKNAME),
 	.tfb_tcp_output = rack_output,
 	.tfb_tcp_do_segment = rack_do_segment,
-	.tfb_tcp_hpts_do_segment = rack_hpts_do_segment,
 	.tfb_tcp_ctloutput = rack_ctloutput,
 	.tfb_tcp_fb_init = rack_init,
 	.tfb_tcp_fb_fini = rack_fini,
 	.tfb_tcp_timer_stop_all = rack_stopall,
 	.tfb_tcp_timer_activate = rack_timer_activate,
 	.tfb_tcp_timer_active = rack_timer_active,
 	.tfb_tcp_timer_stop = rack_timer_stop,
 	.tfb_tcp_rexmit_tmr = rack_remxt_tmr,
 	.tfb_tcp_handoff_ok = rack_handoff_ok
 };
 
 static const char *rack_stack_names[] = {
 	__XSTRING(STACKNAME),
 #ifdef STACKALIAS
 	__XSTRING(STACKALIAS),
 #endif
 };
 
 static int
 rack_ctor(void *mem, int32_t size, void *arg, int32_t how)
 {
 	memset(mem, 0, size);
 	return (0);
 }
 
 static void
 rack_dtor(void *mem, int32_t size, void *arg)
 {
 
 }
 
 static bool rack_mod_inited = false;
 
 static int
 tcp_addrack(module_t mod, int32_t type, void *data)
 {
 	int32_t err = 0;
 	int num_stacks;
 
 	switch (type) {
 	case MOD_LOAD:
 		rack_zone = uma_zcreate(__XSTRING(MODNAME) "_map",
 		    sizeof(struct rack_sendmap),
 		    rack_ctor, rack_dtor, NULL, NULL, UMA_ALIGN_PTR, 0);
 
 		rack_pcb_zone = uma_zcreate(__XSTRING(MODNAME) "_pcb",
 		    sizeof(struct tcp_rack),
 		    rack_ctor, NULL, NULL, NULL, UMA_ALIGN_CACHE, 0);
 
 		sysctl_ctx_init(&rack_sysctl_ctx);
 		rack_sysctl_root = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
 		    SYSCTL_STATIC_CHILDREN(_net_inet_tcp),
 		    OID_AUTO,
 		    __XSTRING(STACKNAME),
 		    CTLFLAG_RW, 0,
 		    "");
 		if (rack_sysctl_root == NULL) {
 			printf("Failed to add sysctl node\n");
 			err = EFAULT;
 			goto free_uma;
 		}
 		rack_init_sysctls();
 		num_stacks = nitems(rack_stack_names);
 		err = register_tcp_functions_as_names(&__tcp_rack, M_WAITOK,
 		    rack_stack_names, &num_stacks);
 		if (err) {
 			printf("Failed to register %s stack name for "
 			    "%s module\n", rack_stack_names[num_stacks],
 			    __XSTRING(MODNAME));
 			sysctl_ctx_free(&rack_sysctl_ctx);
 free_uma:
 			uma_zdestroy(rack_zone);
 			uma_zdestroy(rack_pcb_zone);
 			rack_counter_destroy();
 			printf("Failed to register rack module -- err:%d\n", err);
 			return (err);
 		}
 		rack_mod_inited = true;
 		break;
 	case MOD_QUIESCE:
 		err = deregister_tcp_functions(&__tcp_rack, true, false);
 		break;
 	case MOD_UNLOAD:
 		err = deregister_tcp_functions(&__tcp_rack, false, true);
 		if (err == EBUSY)
 			break;
 		if (rack_mod_inited) {
 			uma_zdestroy(rack_zone);
 			uma_zdestroy(rack_pcb_zone);
 			sysctl_ctx_free(&rack_sysctl_ctx);
 			rack_counter_destroy();
 			rack_mod_inited = false;
 		}
 		err = 0;
 		break;
 	default:
 		return (EOPNOTSUPP);
 	}
 	return (err);
 }
 
 static moduledata_t tcp_rack = {
 	.name = __XSTRING(MODNAME),
 	.evhand = tcp_addrack,
 	.priv = 0
 };
 
 MODULE_VERSION(MODNAME, 1);
 DECLARE_MODULE(MODNAME, tcp_rack, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY);
-MODULE_DEPEND(MODNAME, tcphpts, 1, 1, 1);
Index: stable/12/sys/netinet/tcp_stacks/rack_bbr_common.c
===================================================================
--- stable/12/sys/netinet/tcp_stacks/rack_bbr_common.c	(nonexistent)
+++ stable/12/sys/netinet/tcp_stacks/rack_bbr_common.c	(revision 362880)
@@ -0,0 +1,859 @@
+/*-
+ * Copyright (c) 2016-2018
+ *	Netflix Inc.
+ *      All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+/*
+ * Author: Randall Stewart <rrs@netflix.com>
+ * This work is based on the ACM Queue paper
+ * BBR - Congestion Based Congestion Control
+ * and also numerous discussions with Neal, Yuchung and Van.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_inet.h"
+#include "opt_inet6.h"
+#include "opt_ipsec.h"
+#include "opt_tcpdebug.h"
+#include "opt_ratelimit.h"
+/*#include "opt_kern_tls.h"*/
+#include <sys/param.h>
+#include <sys/module.h>
+#include <sys/kernel.h>
+#ifdef TCP_HHOOK
+#include <sys/hhook.h>
+#endif
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/proc.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#ifdef KERN_TLS
+#include <sys/sockbuf_tls.h>
+#endif
+#include <sys/sysctl.h>
+#include <sys/systm.h>
+#include <sys/tree.h>
+#include <sys/refcount.h>
+#include <sys/queue.h>
+#include <sys/smp.h>
+#include <sys/kthread.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/time.h>
+#include <vm/uma.h>
+#include <sys/kern_prefetch.h>
+
+#include <net/route.h>
+#include <net/vnet.h>
+#include <net/ethernet.h>
+#include <net/bpf.h>
+
+#define TCPSTATES		/* for logging */
+
+#include <netinet/in.h>
+#include <netinet/in_kdtrace.h>
+#include <netinet/in_pcb.h>
+#include <netinet/ip.h>
+#include <netinet/ip_icmp.h>	/* required for icmp_var.h */
+#include <netinet/icmp_var.h>	/* for ICMP_BANDLIM */
+#include <netinet/ip_var.h>
+#include <netinet/ip6.h>
+#include <netinet6/in6_pcb.h>
+#include <netinet6/ip6_var.h>
+#define	TCPOUTFLAGS
+#include <netinet/tcp.h>
+#include <netinet/tcp_fsm.h>
+#include <netinet/tcp_seq.h>
+#include <netinet/tcp_timer.h>
+#include <netinet/tcp_var.h>
+#include <netinet/tcpip.h>
+#include <netinet/tcp_hpts.h>
+#include <netinet/cc/cc.h>
+#include <netinet/tcp_log_buf.h>
+#ifdef TCPDEBUG
+#include <netinet/tcp_debug.h>
+#endif				/* TCPDEBUG */
+#ifdef TCP_OFFLOAD
+#include <netinet/tcp_offload.h>
+#endif
+#ifdef INET6
+#include <netinet6/tcp6_var.h>
+#endif
+#include <netinet/tcp_fastopen.h>
+
+#include <netipsec/ipsec_support.h>
+#include <net/if.h>
+#include <net/if_var.h>
+
+#if defined(IPSEC) || defined(IPSEC_SUPPORT)
+#include <netipsec/ipsec.h>
+#include <netipsec/ipsec6.h>
+#endif				/* IPSEC */
+
+#include <netinet/udp.h>
+#include <netinet/udp_var.h>
+#include <machine/in_cksum.h>
+
+#ifdef MAC
+#include <security/mac/mac_framework.h>
+#endif
+#include "rack_bbr_common.h"
+
+/*
+ * Common TCP Functions - These are shared by both
+ * rack and BBR.
+ */
+
+
+#ifdef KERN_TLS
+/*
+ * Return the size of one full TLS record on the send buffer: the max
+ * payload plus the TLS header and trailer.  While four such records
+ * exceed rwnd and the max payload is above 4096, the max payload is
+ * lowered by 4096 (but never below 4096) and the size is recomputed.
+ * Note this writes the lowered sb_maxlen back into the tls params.
+ */
+uint32_t
+ctf_get_opt_tls_size(struct socket *so, uint32_t rwnd)
+{
+	struct sbtls_info *tls;
+	uint32_t len;
+
+	for (;;) {
+		tls = so->so_snd.sb_tls_info;
+		len = tls->sb_params.sb_maxlen;         /* max tls payload */
+		len += tls->sb_params.sb_tls_hlen;      /* tls header len  */
+		len += tls->sb_params.sb_tls_tlen;      /* tls trailer len */
+		if ((len * 4) <= rwnd || tls->sb_params.sb_maxlen <= 4096)
+			break;
+		/* XXX unclear what else TCP should do here; shrink and retry. */
+		tls->sb_params.sb_maxlen -= 4096;
+		if (tls->sb_params.sb_maxlen < 4096)
+			tls->sb_params.sb_maxlen = 4096;
+	}
+	return (len);
+}
+#endif
+
+int
+ctf_process_inbound_raw(struct tcpcb *tp, struct socket *so, struct mbuf *m, int has_pkt)
+{
+	/*
+	 * We are passed a raw chain of mbuf packets
+	 * that arrived in LRO. They are linked via
+	 * the m_nextpkt link in the pkt-headers.
+	 *
+	 * We process each one by:
+	 * a) saving off the next
+	 * b) stripping off the ether-header
+	 * c) formulating the arguments for tfb_do_segment_nounlock
+	 * d) calling each mbuf to tfb_do_segment_nounlock
+	 *    after adjusting the time to match the arrival time.
+	 * Note that the LRO code assures no IP options are present.
+	 *
+	 * The semantics for calling tfb_do_segment_nounlock are:
+	 * 1) It returns 0 if all went well and you (the caller) need
+	 *    to release the lock.
+	 * 2) If nxt_pkt is set, then the function will suppress calls
+	 *    to tfb_tcp_output() since you are promising to call again
+	 *    with another packet.
+	 * 3) If it returns 1, then you must free all the packets being
+	 *    shipped in, the tcb has been destroyed (or about to be destroyed).
+	 *
+	 * Returns 0 once the chain is walked, else the non-zero nounlock result.
+	 */
+	struct mbuf *m_save;
+	struct ether_header *eh;
+	struct epoch_tracker et;
+	struct tcphdr *th;
+#ifdef INET6
+	struct ip6_hdr *ip6 = NULL;	/* Keep compiler happy. */
+#endif
+#ifdef INET
+	struct ip *ip = NULL;		/* Keep compiler happy. */
+#endif
+	struct ifnet *ifp;
+	struct timeval tv;
+	int32_t retval = 0, nxt_pkt, tlen, off;	/* retval stays 0 if all skipped */
+	uint16_t etype;
+	uint16_t drop_hdrlen;
+	uint8_t iptos, no_vn=0, bpf_req=0;
+
+	/*
+	 * This is a bit deceptive, we get the
+	 * "info epoch" which is really the network
+	 * epoch. This covers us on both any INP
+	 * type change but also if the ifp goes
+	 * away it covers us as well.
+	 */
+	INP_INFO_RLOCK_ET(&V_tcbinfo, et);
+	if (m && m->m_pkthdr.rcvif)
+		ifp = m->m_pkthdr.rcvif;
+	else
+		ifp = NULL;
+	if (ifp) {
+		bpf_req = bpf_peers_present(ifp->if_bpf);
+	} else  {
+		/*
+		 * We probably should not work around
+		 * but kassert, since lro always sets rcvif.
+		 */
+		no_vn = 1;
+		goto skip_vnet;
+	}
+	CURVNET_SET(ifp->if_vnet);
+skip_vnet:
+	while (m) {
+		m_save = m->m_nextpkt;
+		m->m_nextpkt = NULL;
+		/* Now lets get the ether header */
+		eh = mtod(m, struct ether_header *);
+		etype = ntohs(eh->ether_type);
+		/* Let the BPF see the packet */
+		if (bpf_req && ifp)
+			ETHER_BPF_MTAP(ifp, m);
+		/* Trim off the ethernet header */
+		m_adj(m,  sizeof(*eh));
+		switch (etype) {
+#ifdef INET6
+		case ETHERTYPE_IPV6:
+		{
+			if (m->m_len < (sizeof(*ip6) + sizeof(*th))) {
+				m = m_pullup(m, sizeof(*ip6) + sizeof(*th));
+				if (m == NULL) {
+					TCPSTAT_INC(tcps_rcvshort);
+					goto skipped_pkt;
+				}
+			}
+			/* m_pullup() may hand back another mbuf; do not use eh. */
+			ip6 = mtod(m, struct ip6_hdr *);
+			th = (struct tcphdr *)(ip6 + 1);
+			tlen = ntohs(ip6->ip6_plen);
+			drop_hdrlen = sizeof(*ip6);
+			if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID_IPV6) {
+				if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR)
+					th->th_sum = m->m_pkthdr.csum_data;
+				else
+					th->th_sum = in6_cksum_pseudo(ip6, tlen,
+								      IPPROTO_TCP, m->m_pkthdr.csum_data);
+				th->th_sum ^= 0xffff;
+			} else
+				th->th_sum = in6_cksum(m, IPPROTO_TCP, drop_hdrlen, tlen);
+			if (th->th_sum) {
+				TCPSTAT_INC(tcps_rcvbadsum);
+				m_freem(m);
+				goto skipped_pkt;
+			}
+			/*
+			 * Be proactive about unspecified IPv6 address in source.
+			 * As we use all-zero to indicate unbounded/unconnected pcb,
+			 * unspecified IPv6 address can be used to confuse us.
+			 *
+			 * Note that packets with unspecified IPv6 destination is
+			 * already dropped in ip6_input.
+			 */
+			if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) {
+				/* XXX stat */
+				m_freem(m);
+				goto skipped_pkt;
+			}
+			iptos = (ntohl(ip6->ip6_flow) >> 20) & 0xff;
+			break;
+		}
+#endif
+#ifdef INET
+		case ETHERTYPE_IP:
+		{
+			if (m->m_len < sizeof (struct tcpiphdr)) {
+				if ((m = m_pullup(m, sizeof (struct tcpiphdr)))
+				    == NULL) {
+					TCPSTAT_INC(tcps_rcvshort);
+					goto skipped_pkt;
+				}
+			}
+			/* m_pullup() may hand back another mbuf; do not use eh. */
+			ip = mtod(m, struct ip *);
+			th = (struct tcphdr *)(ip + 1);
+			drop_hdrlen = sizeof(*ip);
+			iptos = ip->ip_tos;
+			tlen = ntohs(ip->ip_len) - sizeof(struct ip);
+			if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) {
+				if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR)
+					th->th_sum = m->m_pkthdr.csum_data;
+				else
+					th->th_sum = in_pseudo(ip->ip_src.s_addr,
+							       ip->ip_dst.s_addr,
+							       htonl(m->m_pkthdr.csum_data + tlen +
+								     IPPROTO_TCP));
+				th->th_sum ^= 0xffff;
+			} else {
+				int len;
+				struct ipovly *ipov = (struct ipovly *)ip;
+				/*
+				 * Checksum extended TCP header and data.
+				 */
+				len = drop_hdrlen + tlen;
+				bzero(ipov->ih_x1, sizeof(ipov->ih_x1));
+				ipov->ih_len = htons(tlen);
+				th->th_sum = in_cksum(m, len);
+				/* Reset length for SDT probes. */
+				ip->ip_len = htons(len);
+				/* Reset TOS bits */
+				ip->ip_tos = iptos;
+				/* Re-initialization for later version check */
+				ip->ip_v = IPVERSION;
+				ip->ip_hl = sizeof(*ip) >> 2;
+			}
+			if (th->th_sum) {
+				TCPSTAT_INC(tcps_rcvbadsum);
+				m_freem(m);
+				goto skipped_pkt;
+			}
+			break;
+		}
+#endif
+		default:	/* not IP: th, tlen etc. were never set up */
+			m_freem(m);
+			goto skipped_pkt;
+		}
+		/*
+		 * Convert TCP protocol specific fields to host format.
+		 */
+		tcp_fields_to_host(th);
+
+		off = th->th_off << 2;
+		if (off < sizeof (struct tcphdr) || off > tlen) {
+			TCPSTAT_INC(tcps_rcvbadoff);
+			m_freem(m);
+			goto skipped_pkt;
+		}
+		tlen -= off;
+		drop_hdrlen += off;
+		/*
+		 * Now lets setup the timeval to be when we should
+		 * have been called (if we can).
+		 */
+		m->m_pkthdr.lro_nsegs = 1;
+		if (m->m_flags & M_TSTMP_LRO) {
+			tv.tv_sec = m->m_pkthdr.rcv_tstmp / 1000000000;
+			tv.tv_usec = (m->m_pkthdr.rcv_tstmp % 1000000000) / 1000;
+		} else {
+			/* Should not be should we kassert instead? */
+			tcp_get_usecs(&tv);
+		}
+		/* Another packet follows if the chain or the caller has more. */
+		nxt_pkt = (m_save || has_pkt) ? 1 : 0;
+		retval = (*tp->t_fb->tfb_do_segment_nounlock)(m, th, so, tp, drop_hdrlen, tlen,
+							      iptos, nxt_pkt, &tv);
+		if (retval) {
+			/* We lost the lock and tcb probably */
+			m = m_save;
+			while (m) {
+				m_save = m->m_nextpkt;
+				m->m_nextpkt = NULL;
+				m_freem(m);
+				m = m_save;
+			}
+			if (no_vn == 0)
+				CURVNET_RESTORE();
+			INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
+			return (retval);
+		}
+skipped_pkt:
+		m = m_save;
+	}
+	if (no_vn == 0)
+		CURVNET_RESTORE();
+	INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
+	return (retval);
+}
+
+/*
+ * Run any packets queued on tp->t_in_pkt through
+ * ctf_process_inbound_raw().  Returns 1 when that call reports a
+ * non-zero result (we may have lost the tcpcb), 0 otherwise.
+ */
+int
+ctf_do_queued_segments(struct socket *so, struct tcpcb *tp, int have_pkt)
+{
+	struct mbuf *queued = tp->t_in_pkt;
+
+	if (queued == NULL)
+		return (0);
+	/* Detach the whole chain from the tcpcb before handing it off. */
+	tp->t_in_pkt = NULL;
+	tp->t_tail_pkt = NULL;
+	return (ctf_process_inbound_raw(tp, so, queued, have_pkt) ? 1 : 0);
+}
+
+/* Bytes sent but not yet cumulatively acked (snd_max - snd_una). */
+uint32_t
+ctf_outstanding(struct tcpcb *tp)
+{
+	return (tp->snd_max - tp->snd_una);
+}
+
+/* Outstanding bytes less rc_sacked; 0 if rc_sacked exceeds outstanding. */
+uint32_t
+ctf_flight_size(struct tcpcb *tp, uint32_t rc_sacked)
+{
+	if (rc_sacked <= ctf_outstanding(tp))
+		return (ctf_outstanding(tp) - rc_sacked);
+	/* TSNH: panic on INVARIANTS kernels, otherwise report zero. */
+#ifdef INVARIANTS
+	panic("tp:%p rc_sacked:%u > out:%u",
+	      tp, rc_sacked, ctf_outstanding(tp));
+#endif
+	return (0);
+}
+
+/* Hand the segment to tcp_dropwithreset(); unlock the inpcb if tp is set. */
+void
+ctf_do_dropwithreset(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th,
+    int32_t rstreason, int32_t tlen)
+{
+
+	tcp_dropwithreset(m, th, tp, tlen, rstreason);
+	if (tp != NULL)
+		INP_WUNLOCK(tp->t_inpcb);
+}
+
+/*
+ * ctf_drop_checks drops duplicate leading data and data beyond the receive
+ * window.  It returns 1 if you should not proceed, placing in ret_val what
+ * the caller should return: 1 means the TCB is unlocked and probably
+ * dropped, 0 means it is still valid and locked.
+ */
+int
+ctf_drop_checks(struct tcpopt *to, struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int32_t * tlenp,  int32_t * thf, int32_t * drop_hdrlen, int32_t * ret_val)
+{
+	int32_t todrop;
+	int32_t thflags;
+	int32_t tlen;
+
+	thflags = *thf;
+	tlen = *tlenp;
+	todrop = tp->rcv_nxt - th->th_seq;
+	if (todrop > 0) {
+		if (thflags & TH_SYN) {
+			thflags &= ~TH_SYN;
+			th->th_seq++;
+			if (th->th_urp > 1)
+				th->th_urp--;
+			else
+				thflags &= ~TH_URG;
+			todrop--;
+		}
+		/*
+		 * Following if statement from Stevens, vol. 2, p. 960.
+		 */
+		if (todrop > tlen
+		    || (todrop == tlen && (thflags & TH_FIN) == 0)) {
+			/*
+			 * Any valid FIN must be to the left of the window.
+			 * At this point the FIN must be a duplicate or out
+			 * of sequence; drop it.
+			 */
+			thflags &= ~TH_FIN;
+			/*
+			 * Send an ACK to resynchronize and drop any data.
+			 * But keep on processing for RST or ACK.
+			 */
+			tp->t_flags |= TF_ACKNOW;
+			todrop = tlen;
+			TCPSTAT_INC(tcps_rcvduppack);
+			TCPSTAT_ADD(tcps_rcvdupbyte, todrop);
+		} else {
+			TCPSTAT_INC(tcps_rcvpartduppack);
+			TCPSTAT_ADD(tcps_rcvpartdupbyte, todrop);
+		}
+		/*
+		 * DSACK - add SACK block for the dropped (duplicate) bytes only
+		 */
+		if ((todrop > 0) && (tp->t_flags & TF_SACK_PERMIT)) {
+			tcp_update_sack_list(tp, th->th_seq, th->th_seq + todrop);
+			/*
+			 * ACK now, as the next in-sequence segment
+			 * will clear the DSACK block again
+			 */
+			tp->t_flags |= TF_ACKNOW;
+		}
+		*drop_hdrlen += todrop;	/* drop from the top afterwards */
+		th->th_seq += todrop;
+		tlen -= todrop;
+		if (th->th_urp > todrop)
+			th->th_urp -= todrop;
+		else {
+			thflags &= ~TH_URG;
+			th->th_urp = 0;
+		}
+	}
+	/*
+	 * If segment ends after window, drop trailing data (and PUSH and
+	 * FIN); if nothing left, just ACK.
+	 */
+	todrop = (th->th_seq + tlen) - (tp->rcv_nxt + tp->rcv_wnd);
+	if (todrop > 0) {
+		TCPSTAT_INC(tcps_rcvpackafterwin);
+		if (todrop >= tlen) {
+			TCPSTAT_ADD(tcps_rcvbyteafterwin, tlen);
+			/*
+			 * If window is closed can only take segments at
+			 * window edge, and have to drop data and PUSH from
+			 * incoming segments.  Continue processing, but
+			 * remember to ack.  Otherwise, drop segment and
+			 * ack.
+			 */
+			if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) {
+				tp->t_flags |= TF_ACKNOW;
+				TCPSTAT_INC(tcps_rcvwinprobe);
+			} else {
+				ctf_do_dropafterack(m, tp, th, thflags, tlen, ret_val);
+				return (1);
+			}
+		} else
+			TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop);
+		m_adj(m, -todrop);
+		tlen -= todrop;
+		thflags &= ~(TH_PUSH | TH_FIN);
+	}
+	*thf = thflags;
+	*tlenp = tlen;
+	return (0);
+}
+
+/*
+ * Drop an incoming segment and arrange for an ACK, unless a reset
+ * is called for.  On return *ret_val tells the caller whether the tcb
+ * (and lock) was dropped: 1 = it was dropped, 0 = the TCB is still
+ * locked and valid.
+ */
+void
+ctf_do_dropafterack(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th, int32_t thflags, int32_t tlen, int32_t * ret_val)
+{
+	/*
+	 * The usual outcome: the segment is discarded and TF_ACKNOW is set
+	 * so that an ACK reflecting our state goes out.  The RST flag is not
+	 * tested here, since all paths to this code happen after packets
+	 * containing RST have been dropped.
+	 *
+	 * The exception is the SYN-RECEIVED state, where no ACK is sent
+	 * unless the received segment passes the SYN-RECEIVED ACK test;
+	 * a segment failing that test is answered with a RST instead.
+	 * This breaks the loop in the "LAND" DoS attack, and also prevents
+	 * an ACK storm between two listening ports that have been sent
+	 * forged SYN segments, each with the source address of the
+	 * other.
+	 */
+	if (tp->t_state != TCPS_SYN_RECEIVED || (thflags & TH_ACK) == 0 ||
+	    (!SEQ_GT(tp->snd_una, th->th_ack) &&
+	    !SEQ_GT(th->th_ack, tp->snd_max))) {
+		*ret_val = 0;
+		tp->t_flags |= TF_ACKNOW;
+		if (m != NULL)
+			m_freem(m);
+		return;
+	}
+	*ret_val = 1;
+	ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+}
+
+void
+ctf_do_drop(struct mbuf *m, struct tcpcb *tp)
+{
+	/*
+	 * INP_WUNLOCK() tp's inpcb, if any, and free the incoming segment.
+	 */
+	if (tp != NULL)
+		INP_WUNLOCK(tp->t_inpcb);
+	if (m == NULL)
+		return;
+	m_freem(m);
+}
+
+int
+ctf_process_rst(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp)
+{
+	/*
+	 * RFC5961 Section 3.2.  Returns 1 if the connection was dropped.
+	 *
+	 * - RST drops connection only if SEG.SEQ == RCV.NXT.
+	 * - If RST is in window, we send challenge ACK.
+	 *
+	 * Note: to take into account delayed ACKs, we should test against
+	 * last_ack_sent instead of rcv_nxt.
+	 * Note 2: a closed window, not covered by the RFC, is handled too.
+	 */
+	int dropped = 0;
+
+	if ((SEQ_GEQ(th->th_seq, (tp->last_ack_sent - 1)) &&
+	    SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) ||
+	    (tp->rcv_wnd == 0 && tp->last_ack_sent == th->th_seq)) {
+
+		INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
+		KASSERT(tp->t_state != TCPS_SYN_SENT,
+		    ("%s: TH_RST for TCPS_SYN_SENT th %p tp %p",
+		    __func__, th, tp));
+
+		if (V_tcp_insecure_rst ||
+		    (tp->last_ack_sent == th->th_seq) ||
+		    (tp->rcv_nxt == th->th_seq) ||
+		    ((tp->last_ack_sent - 1) == th->th_seq)) {
+			TCPSTAT_INC(tcps_drops);
+			/* Drop the connection; so_error depends on the state. */
+			switch (tp->t_state) {
+			case TCPS_SYN_RECEIVED:
+				so->so_error = ECONNREFUSED;
+				goto close;
+			case TCPS_ESTABLISHED:
+			case TCPS_FIN_WAIT_1:
+			case TCPS_FIN_WAIT_2:
+			case TCPS_CLOSE_WAIT:
+			case TCPS_CLOSING:
+			case TCPS_LAST_ACK:
+				so->so_error = ECONNRESET;
+		close:
+				tcp_state_change(tp, TCPS_CLOSED);
+				/* FALLTHROUGH */
+			default:
+				tp = tcp_close(tp);
+			}
+			dropped = 1;
+			ctf_do_drop(m, tp);
+		} else {
+			TCPSTAT_INC(tcps_badrst);
+			/* Send challenge ACK. */
+			tcp_respond(tp, mtod(m, void *), th, m,
+			    tp->rcv_nxt, tp->snd_nxt, TH_ACK);
+			tp->last_ack_sent = tp->rcv_nxt;
+		}
+	} else {
+		m_freem(m);	/* not in the accepted range: discard */
+	}
+	return (dropped);
+}
+
+/*
+ * Bump tcps_badsyn, then either drop the connection (V_tcp_insecure_syn
+ * set and th_seq within rcv_wnd of last_ack_sent) or send a challenge
+ * ACK.  *ret_val informs the caller: 1 = we dropped the tcb (and lock),
+ * 0 = the TCB is still locked and valid.
+ */
+void
+ctf_challenge_ack(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int32_t * ret_val)
+{
+	INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
+
+	TCPSTAT_INC(tcps_badsyn);
+	if (V_tcp_insecure_syn &&
+	    SEQ_GEQ(th->th_seq, tp->last_ack_sent) &&
+	    SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) {
+		tp = tcp_drop(tp, ECONNRESET);
+		*ret_val = 1;
+		ctf_do_drop(m, tp);
+	} else {
+		/* Send challenge ACK. */
+		tcp_respond(tp, mtod(m, void *), th, m, tp->rcv_nxt,
+		    tp->snd_nxt, TH_ACK);
+		tp->last_ack_sent = tp->rcv_nxt;
+		m = NULL;		/* m was handed to tcp_respond() */
+		*ret_val = 0;
+		ctf_do_drop(m, NULL);	/* no-op: m and tp are both NULL */
+	}
+}
+
+/*
+ * ctf_ts_check returns 1 when the segment was dropped and the caller
+ * should not proceed (the state machine should return), or 0 when
+ * ts_recent was merely invalidated.  It places in ret_val what the
+ * caller should itself return: 1 indicates that the TCB is unlocked and
+ * probably dropped, 0 that the TCB is still valid and locked.
+ */
+int
+ctf_ts_check(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp,
+    int32_t tlen, int32_t thflags, int32_t * ret_val)
+{
+	/* NOTE(review): presumably the caller compared timestamps; confirm. */
+	if (tcp_ts_getticks() - tp->ts_recent_age > TCP_PAWS_IDLE) {
+		/*
+		 * Invalidate ts_recent.  If this segment updates ts_recent,
+		 * the age will be reset later and ts_recent will get a
+		 * valid value.  If it does not, setting ts_recent to zero
+		 * will at least satisfy the requirement that zero be placed
+		 * in the timestamp echo reply when ts_recent isn't valid.
+		 * The age isn't reset until we get a valid ts_recent
+		 * because we don't want out-of-order segments to be dropped
+		 * when ts_recent is old.
+		 */
+		tp->ts_recent = 0;
+	} else {
+		TCPSTAT_INC(tcps_rcvduppack);
+		TCPSTAT_ADD(tcps_rcvdupbyte, tlen);
+		TCPSTAT_INC(tcps_pawsdrop);
+		*ret_val = 0;
+		if (tlen) {
+			ctf_do_dropafterack(m, tp, th, thflags, tlen, ret_val);
+		} else {
+			ctf_do_drop(m, NULL);
+		}
+		return (1);
+	}
+	return (0);
+}
+
+void
+ctf_calc_rwin(struct socket *so, struct tcpcb *tp)
+{
+	int32_t space, advertised;
+
+	/*
+	 * Set the receive window to the amount of space in the receive
+	 * socket buffer (treating a negative amount as zero), but not to
+	 * less than the window already advertised, i.e. rcv_adv minus
+	 * rcv_nxt.
+	 */
+	space = sbspace(&so->so_rcv);
+	advertised = (int)(tp->rcv_adv - tp->rcv_nxt);
+	tp->rcv_wnd = imax(space < 0 ? 0 : space, advertised);
+}
+
+void
+ctf_do_dropwithreset_conn(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th,
+    int32_t rstreason, int32_t tlen)
+{
+	/* Set the inp to drop with ETIMEDOUT, drop with reset, then unlock. */
+	if (tp->t_inpcb) {
+		tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
+	}
+	tcp_dropwithreset(m, th, tp, tlen, rstreason);
+	INP_WUNLOCK(tp->t_inpcb);	/* NOTE(review): no NULL check here */
+}
+
+uint32_t
+ctf_fixed_maxseg(struct tcpcb *tp)
+{
+	int optlen;
+
+	if (tp->t_flags & TF_NOOPT)
+		return (tp->t_maxseg);
+
+	/*
+	 * Return t_maxseg less the length of the fixed TCP options.
+	 * Here we have simplified code from tcp_addoptions(), without a
+	 * proper loop, and having most of the padding hardcoded (PAD()
+	 * rounds a length up to a multiple of 4).  We only consider fixed
+	 * options that we would send every time, i.e. SACK is not.
+	 */
+#define	PAD(len)	((((len) / 4) + !!((len) % 4)) * 4)
+	if (TCPS_HAVEESTABLISHED(tp->t_state)) {
+		if (tp->t_flags & TF_RCVD_TSTMP)
+			optlen = TCPOLEN_TSTAMP_APPA;
+		else
+			optlen = 0;
+#if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
+		if (tp->t_flags & TF_SIGNATURE)
+			optlen += PAD(TCPOLEN_SIGNATURE);
+#endif
+	} else {
+		if (tp->t_flags & TF_REQ_TSTMP)
+			optlen = TCPOLEN_TSTAMP_APPA;
+		else
+			optlen = PAD(TCPOLEN_MAXSEG);
+		if (tp->t_flags & TF_REQ_SCALE)
+			optlen += PAD(TCPOLEN_WINDOW);
+#if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
+		if (tp->t_flags & TF_SIGNATURE)
+			optlen += PAD(TCPOLEN_SIGNATURE);
+#endif
+		if (tp->t_flags & TF_SACK_PERMIT)
+			optlen += PAD(TCPOLEN_SACK_PERMITTED);
+	}
+#undef PAD
+	optlen = min(optlen, TCP_MAXOLEN);
+	return (tp->t_maxseg - optlen);
+}
+
+void
+ctf_log_sack_filter(struct tcpcb *tp, int num_sack_blks, struct sackblk *sack_blocks)
+{
+	union tcp_log_stackspecific log;
+	struct timeval tv;
+
+	/* Nothing is recorded while logging is off for this tcpcb. */
+	if (tp->t_logstate == TCP_LOG_STATE_OFF)
+		return;
+	memset(&log, 0, sizeof(log));
+	log.u_bbr.timeStamp = tcp_get_usecs(&tv);
+	log.u_bbr.flex8 = num_sack_blks;
+	/* Only the first four SACK blocks are copied into the record. */
+	if (num_sack_blks > 0) {
+		log.u_bbr.flex1 = sack_blocks[0].start;
+		log.u_bbr.flex2 = sack_blocks[0].end;
+	}
+	if (num_sack_blks > 1) {
+		log.u_bbr.flex3 = sack_blocks[1].start;
+		log.u_bbr.flex4 = sack_blocks[1].end;
+	}
+	if (num_sack_blks > 2) {
+		log.u_bbr.flex5 = sack_blocks[2].start;
+		log.u_bbr.flex6 = sack_blocks[2].end;
+	}
+	if (num_sack_blks > 3) {
+		log.u_bbr.applimited = sack_blocks[3].start;
+		log.u_bbr.pkts_out = sack_blocks[3].end;
+	}
+	TCP_LOG_EVENTP(tp, NULL, &tp->t_inpcb->inp_socket->so_rcv,
+	    &tp->t_inpcb->inp_socket->so_snd, TCP_SACK_FILTER_RES, 0, 0,
+	    &log, false, &tv);
+}
+
+uint32_t
+ctf_decay_count(uint32_t count, uint32_t decay)
+{
+	/*
+	 * Shrink count by a fraction given in tenths of a percent, so
+	 * 1000 means 100% and 193 means 19.3%.  The value returned is
+	 * count minus (count * decay / 1000), with the division
+	 * truncating toward zero.
+	 */
+	uint64_t reduction;
+
+	if (decay > 1000) {
+		/*
+		 * More than 100% could subtract more than count holds;
+		 * hand count back untouched instead.
+		 */
+		return (count);
+	}
+	/*
+	 * Form the product in 64 bits: count * decay can exceed
+	 * what fits in 32 bits.
+	 */
+	reduction = ((uint64_t)count * (uint64_t)decay) / 1000;
+	return (count - (uint32_t)reduction);
+}

Property changes on: stable/12/sys/netinet/tcp_stacks/rack_bbr_common.c
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+FreeBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Index: stable/12/sys/netinet/tcp_stacks/rack_bbr_common.h
===================================================================
--- stable/12/sys/netinet/tcp_stacks/rack_bbr_common.h	(revision 362879)
+++ stable/12/sys/netinet/tcp_stacks/rack_bbr_common.h	(revision 362880)
@@ -1,68 +1,117 @@
 #ifndef __pacer_timer_h__
 #define __pacer_timer_h__
 /*-
  * Copyright (c) 2017 Netflix, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * __FBSDID("$FreeBSD$");
  */
 /* Common defines and such used by both RACK and BBR */
 /* Special values for mss accounting array */
 #define TCP_MSS_ACCT_JUSTRET 0
 #define TCP_MSS_ACCT_SNDACK  1
 #define TCP_MSS_ACCT_PERSIST 2
 #define TCP_MSS_ACCT_ATIMER  60
 #define TCP_MSS_ACCT_INPACE  61
 #define TCP_MSS_ACCT_LATE    62
 #define TCP_MSS_SMALL_SIZE_OFF 63	/* Point where small sizes enter */
 #define TCP_MSS_ACCT_SIZE    70
 #define TCP_MSS_SMALL_MAX_SIZE_DIV (TCP_MSS_ACCT_SIZE - TCP_MSS_SMALL_SIZE_OFF)
 
+#define DUP_ACK_THRESHOLD 3
 
-/* Magic flags to tell whats cooking on the pacing wheel */
-#define PACE_PKT_OUTPUT 0x01	/* Output Packets being paced */
-#define PACE_TMR_RACK   0x02	/* RACK timer running */
-#define PACE_TMR_TLP    0x04	/* TLP timer running */
-#define PACE_TMR_RXT    0x08	/* Retransmit timer running */
-#define PACE_TMR_PERSIT 0x10	/* Persists timer running */
-#define PACE_TMR_KEEP   0x20	/* Keep alive timer running */
-#define PACE_TMR_DELACK 0x40	/* Delayed ack timer running */
-#define PACE_TMR_MASK   (PACE_TMR_KEEP|PACE_TMR_PERSIT|PACE_TMR_RXT|PACE_TMR_TLP|PACE_TMR_RACK|PACE_TMR_DELACK)
-
 /* Magic flags for tracing progress events */
 #define PROGRESS_DROP   1
 #define PROGRESS_UPDATE 2
 #define PROGRESS_CLEAR  3
 #define PROGRESS_START  4
 
 
 /* RTT sample methods */
 #define USE_RTT_HIGH 0
 #define USE_RTT_LOW  1
 #define USE_RTT_AVG  2
 
+#define PACE_MAX_IP_BYTES 65536
+#define USECS_IN_SECOND 1000000
+#define MSEC_IN_SECOND 1000
+#define MS_IN_USEC 1000		/* Divisor used by USEC_TO_MSEC() */
+#define USEC_TO_MSEC(x) ((x) / MS_IN_USEC)	/* (x) so a + b divides whole */
+#define TCP_TS_OVERHEAD 12		/* Overhead of having Timestamps on */
+
 #ifdef _KERNEL
 /* We have only 7 bits in rack so assert its true */
 CTASSERT((PACE_TMR_MASK & 0x80) == 0);
+#ifdef KERN_TLS
+uint32_t ctf_get_opt_tls_size(struct socket *so, uint32_t rwnd);
+#endif
+int
+ctf_process_inbound_raw(struct tcpcb *tp, struct socket *so,
+    struct mbuf *m, int has_pkt);
+int
+ctf_do_queued_segments(struct socket *so, struct tcpcb *tp, int have_pkt);
+uint32_t ctf_outstanding(struct tcpcb *tp);
+uint32_t ctf_flight_size(struct tcpcb *tp, uint32_t rc_sacked);
+int
+ctf_drop_checks(struct tcpopt *to, struct mbuf *m,
+    struct tcphdr *th, struct tcpcb *tp, int32_t * tlenp, int32_t * thf,
+    int32_t * drop_hdrlen, int32_t * ret_val);
+void
+ctf_do_dropafterack(struct mbuf *m, struct tcpcb *tp,
+    struct tcphdr *th, int32_t thflags, int32_t tlen, int32_t * ret_val);
+void
+ctf_do_dropwithreset(struct mbuf *m, struct tcpcb *tp,
+	struct tcphdr *th, int32_t rstreason, int32_t tlen);
+void
+ctf_do_drop(struct mbuf *m, struct tcpcb *tp);
+
+int
+ctf_process_rst(struct mbuf *m, struct tcphdr *th,
+    struct socket *so, struct tcpcb *tp);
+
+void
+ctf_challenge_ack(struct mbuf *m, struct tcphdr *th,
+    struct tcpcb *tp, int32_t * ret_val);
+
+int
+ctf_ts_check(struct mbuf *m, struct tcphdr *th,
+    struct tcpcb *tp, int32_t tlen, int32_t thflags, int32_t * ret_val);
+
+void
+ctf_calc_rwin(struct socket *so, struct tcpcb *tp);
+
+void
+ctf_do_dropwithreset_conn(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th,
+    int32_t rstreason, int32_t tlen);
+
+uint32_t 
+ctf_fixed_maxseg(struct tcpcb *tp);
+
+void
+ctf_log_sack_filter(struct tcpcb *tp, int num_sack_blks, struct sackblk *sack_blocks);
+
+uint32_t 
+ctf_decay_count(uint32_t count, uint32_t decay_percentage);
+
 #endif
 #endif
Index: stable/12/sys/netinet/tcp_var.h
===================================================================
--- stable/12/sys/netinet/tcp_var.h	(revision 362879)
+++ stable/12/sys/netinet/tcp_var.h	(revision 362880)
@@ -1,976 +1,982 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1982, 1986, 1993, 1994, 1995
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)tcp_var.h	8.4 (Berkeley) 5/24/95
  * $FreeBSD$
  */
 
 #ifndef _NETINET_TCP_VAR_H_
 #define _NETINET_TCP_VAR_H_
 
 #include <netinet/tcp.h>
 #include <netinet/tcp_fsm.h>
 
 #ifdef _KERNEL
 #include <net/vnet.h>
 #include <sys/mbuf.h>
 #endif
 
 #if defined(_KERNEL) || defined(_WANT_TCPCB)
 /* TCP segment queue entry */
 struct tseg_qent {
 	TAILQ_ENTRY(tseg_qent) tqe_q;
 	struct	mbuf   *tqe_m;		/* mbuf contains packet */
 	struct  mbuf   *tqe_last;	/* last mbuf in chain */
 	tcp_seq tqe_start;		/* TCP Sequence number start */
 	int	tqe_len;		/* TCP segment data length */
 	uint32_t tqe_flags;		/* The flags from the th->th_flags */
 	uint32_t tqe_mbuf_cnt;		/* Count of mbuf overhead */
 };
 TAILQ_HEAD(tsegqe_head, tseg_qent);
 
 struct sackblk {
 	tcp_seq start;		/* start seq no. of sack block */
 	tcp_seq end;		/* end seq no. */
 };
 
 struct sackhole {
 	tcp_seq start;		/* start seq no. of hole */
 	tcp_seq end;		/* end seq no. */
 	tcp_seq rxmit;		/* next seq. no in hole to be retransmitted */
 	TAILQ_ENTRY(sackhole) scblink;	/* scoreboard linkage */
 };
 
 struct sackhint {
 	struct sackhole	*nexthole;
 	int		sack_bytes_rexmit;
 	tcp_seq		last_sack_ack;	/* Most recent/largest sacked ack */
 
 	int		ispare;		/* explicit pad for 64bit alignment */
 	int             sacked_bytes;	/*
 					 * Total sacked bytes reported by the
 					 * receiver via sack option
 					 */
 	uint32_t	_pad1[1];	/* TBD */
 	uint64_t	_pad[1];	/* TBD */
 };
 
 #define SEGQ_EMPTY(tp) TAILQ_EMPTY(&(tp)->t_segq)
 
 STAILQ_HEAD(tcp_log_stailq, tcp_log_mem);
 
 /*
  * Tcp control block, one per tcp; fields:
  * Organized for 64 byte cacheline efficiency based
  * on common tcp_input/tcp_output processing.
  */
 struct tcpcb {
 	/* Cache line 1 */
 	struct	inpcb *t_inpcb;		/* back pointer to internet pcb */
 	struct tcp_function_block *t_fb;/* TCP function call block */
 	void	*t_fb_ptr;		/* Pointer to t_fb specific data */
 	uint32_t t_maxseg:24,		/* maximum segment size */
 		t_logstate:8;		/* State of "black box" logging */
 	uint32_t t_port:16,		/* Tunneling (over udp) port */
 		t_state:4,		/* state of this connection */
 		t_idle_reduce : 1,
 		t_delayed_ack: 7,	/* Delayed ack variable */
-		bits_spare : 4;
+		t_fin_is_rst: 1,	/* Are fin's treated as resets */
+		bits_spare : 3;
 	u_int	t_flags;
 	tcp_seq	snd_una;		/* sent but unacknowledged */
 	tcp_seq	snd_max;		/* highest sequence number sent;
 					 * used to recognize retransmits
 					 */
 	tcp_seq	snd_nxt;		/* send next */
 	tcp_seq	snd_up;			/* send urgent pointer */
 	uint32_t  snd_wnd;		/* send window */
 	uint32_t  snd_cwnd;		/* congestion-controlled window */
 	uint32_t t_peakrate_thr; 	/* pre-calculated peak rate threshold */
 	/* Cache line 2 */
 	u_int32_t  ts_offset;		/* our timestamp offset */
 	u_int32_t	rfbuf_ts;	/* recv buffer autoscaling timestamp */
 	int	rcv_numsacks;		/* # distinct sack blks present */
 	u_int	t_tsomax;		/* TSO total burst length limit in bytes */
 	u_int	t_tsomaxsegcount;	/* TSO maximum segment count */
 	u_int	t_tsomaxsegsize;	/* TSO maximum segment size in bytes */
 	tcp_seq	rcv_nxt;		/* receive next */
 	tcp_seq	rcv_adv;		/* advertised window */
 	uint32_t  rcv_wnd;		/* receive window */
 	u_int	t_flags2;		/* More tcpcb flags storage */
 	int	t_srtt;			/* smoothed round-trip time */
 	int	t_rttvar;		/* variance in round-trip time */
 	u_int32_t  ts_recent;		/* timestamp echo data */
 	u_char	snd_scale;		/* window scaling for send window */
 	u_char	rcv_scale;		/* window scaling for recv window */
 	u_char	snd_limited;		/* segments limited transmitted */
 	u_char	request_r_scale;	/* pending window scaling */
 	tcp_seq	last_ack_sent;
 	u_int	t_rcvtime;		/* inactivity time */
 	/* Cache line 3 */
 	tcp_seq	rcv_up;			/* receive urgent pointer */
 	int	t_segqlen;		/* segment reassembly queue length */
 	uint32_t t_segqmbuflen;		/* Count of bytes mbufs on all entries */
 	struct	tsegqe_head t_segq;	/* segment reassembly queue */
 	struct mbuf      *t_in_pkt;
 	struct mbuf	 *t_tail_pkt;
 	struct tcp_timer *t_timers;	/* All the TCP timers in one struct */
 	struct	vnet *t_vnet;		/* back pointer to parent vnet */
 	uint32_t  snd_ssthresh;		/* snd_cwnd size threshold for
 					 * for slow start exponential to
 					 * linear switch
 					 */
 	tcp_seq	snd_wl1;		/* window update seg seq number */
 	/* Cache line 4 */
 	tcp_seq	snd_wl2;		/* window update seg ack number */
 
 	tcp_seq	irs;			/* initial receive sequence number */
 	tcp_seq	iss;		        /* initial send sequence number */
 	u_int   t_acktime;
 	u_int	ts_recent_age;		/* when last updated */
 	tcp_seq	snd_recover;		/* for use in NewReno Fast Recovery */
 	uint16_t cl4_spare;		/* Spare to adjust CL 4 */
 	char	t_oobflags;		/* have some */
 	char	t_iobc;			/* input character */
 	int	t_rxtcur;		/* current retransmit value (ticks) */
 
 	int	t_rxtshift;		/* log(2) of rexmt exp. backoff */
 	u_int	t_rtttime;		/* RTT measurement start time */
 
 	tcp_seq	t_rtseq;		/* sequence number being timed */
 	u_int	t_starttime;		/* time connection was established */
 
 	u_int	t_pmtud_saved_maxseg;	/* pre-blackhole MSS */
 	u_int	t_rttmin;		/* minimum rtt allowed */
 
 	u_int	t_rttbest;		/* best rtt we've seen */
 
 	int	t_softerror;		/* possible error not yet reported */
 	uint32_t  max_sndwnd;		/* largest window peer has offered */
 	/* Cache line 5 */
 	uint32_t  snd_cwnd_prev;	/* cwnd prior to retransmit */
 	uint32_t  snd_ssthresh_prev;	/* ssthresh prior to retransmit */
 	tcp_seq	snd_recover_prev;	/* snd_recover prior to retransmit */
 	int	t_sndzerowin;		/* zero-window updates sent */
 	u_long	t_rttupdated;		/* number of times rtt sampled */
 	int	snd_numholes;		/* number of holes seen by sender */
 	u_int	t_badrxtwin;		/* window for retransmit recovery */
 	TAILQ_HEAD(sackhole_head, sackhole) snd_holes;
 					/* SACK scoreboard (sorted) */
 	tcp_seq	snd_fack;		/* last seq number(+1) sack'd by rcv'r*/
 	tcp_seq sack_newdata;		/* New data xmitted in this recovery
 					   episode starts at this seq number */
 	struct sackblk sackblks[MAX_SACK_BLKS]; /* seq nos. of sack blocks */
 	struct sackhint	sackhint;	/* SACK scoreboard hint */
 	int	t_rttlow;		/* smallest observerved RTT */
 	int	rfbuf_cnt;		/* recv buffer autoscaling byte count */
 	struct toedev	*tod;		/* toedev handling this connection */
 	int	t_sndrexmitpack;	/* retransmit packets sent */
 	int	t_rcvoopack;		/* out-of-order packets received */
 	void	*t_toe;			/* TOE pcb pointer */
 	struct cc_algo	*cc_algo;	/* congestion control algorithm */
 	struct cc_var	*ccv;		/* congestion control specific vars */
 	struct osd	*osd;		/* storage for Khelp module data */
 	int	t_bytes_acked;		/* # bytes acked during current RTT */
 	u_int   t_maxunacktime;
 	u_int	t_keepinit;		/* time to establish connection */
 	u_int	t_keepidle;		/* time before keepalive probes begin */
 	u_int	t_keepintvl;		/* interval between keepalives */
 	u_int	t_keepcnt;		/* number of keepalives before close */
 	int	t_dupacks;		/* consecutive dup acks recd */
 	int	t_lognum;		/* Number of log entries */
 	struct tcp_log_stailq t_logs;	/* Log buffer */
 	struct tcp_log_id_node *t_lin;
 	struct tcp_log_id_bucket *t_lib;
 	const char *t_output_caller;	/* Function that called tcp_output */
 	uint32_t t_logsn;		/* Log "serial number" */
 	uint8_t t_tfo_client_cookie_len; /* TCP Fast Open client cookie length */
 	unsigned int *t_tfo_pending;	/* TCP Fast Open server pending counter */
 	union {
 		uint8_t client[TCP_FASTOPEN_MAX_COOKIE_LEN];
 		uint64_t server;
 	} t_tfo_cookie;			/* TCP Fast Open cookie to send */
 #ifdef TCPPCAP
 	struct mbufq t_inpkts;		/* List of saved input packets. */
 	struct mbufq t_outpkts;		/* List of saved output packets. */
 #endif
 };
 #endif	/* _KERNEL || _WANT_TCPCB */
 
 #ifdef _KERNEL
 struct tcptemp {
 	u_char	tt_ipgen[40]; /* the size must be of max ip header, now IPv6 */
 	struct	tcphdr tt_t;
 };
 
 /* 
  * TODO: We yet need to brave plowing in
  * to tcp_input() and the pru_usrreq() block.
  * Right now these go to the old standards which
  * are somewhat ok, but in the long term may
  * need to be changed. If we do tackle tcp_input()
  * then we need to get rid of the tcp_do_segment()
  * function below.
  */
 /* Flags for tcp functions */
 #define TCP_FUNC_BEING_REMOVED 0x01   	/* Can no longer be referenced */
 
 /*
  * If defining the optional tcp_timers, in the
  * tfb_tcp_timer_stop call you must use the
  * callout_async_drain() function with the
  * tcp_timer_discard callback. You should check
  * the return of callout_async_drain() and if 0
  * increment tt_draincnt. Since the timer sub-system
  * does not know your callbacks you must provide a
  * stop_all function that loops through and calls
  * tcp_timer_stop() with each of your defined timers.
  * Adding a tfb_tcp_handoff_ok function allows the socket
  * option to change stacks to query you even if the
  * connection is in a later stage. You return 0 to
  * say you can take over and run your stack, you return
  * non-zero (an error number) to say no you can't.
  * If the function is undefined you can only change
  * in the early states (before connect or listen).
  * tfb_tcp_fb_fini is changed to add a flag to tell
  * the old stack if the tcb is being destroyed or
  * not. A one in the flag means the TCB is being
  * destroyed, a zero indicates its transitioning to
  * another stack (via socket option).
  */
 struct tcp_function_block {
 	char tfb_tcp_block_name[TCP_FUNCTION_NAME_LEN_MAX];
 	int	(*tfb_tcp_output)(struct tcpcb *);
 	int	(*tfb_tcp_output_wtime)(struct tcpcb *, const struct timeval *);
 	void	(*tfb_tcp_do_segment)(struct mbuf *, struct tcphdr *,
 			    struct socket *, struct tcpcb *,
 		        int, int, uint8_t);
+	int     (*tfb_do_queued_segments)(struct socket *, struct tcpcb *, int);
+	int      (*tfb_do_segment_nounlock)(struct mbuf *, struct tcphdr *,
+			    struct socket *, struct tcpcb *,
+			    int, int, uint8_t,
+			    int, struct timeval *);
 	void	(*tfb_tcp_hpts_do_segment)(struct mbuf *, struct tcphdr *,
 			    struct socket *, struct tcpcb *,
 			    int, int, uint8_t,
 			    int, struct timeval *);
 	int     (*tfb_tcp_ctloutput)(struct socket *so, struct sockopt *sopt,
 			    struct inpcb *inp, struct tcpcb *tp);
 	/* Optional memory allocation/free routine */
 	int	(*tfb_tcp_fb_init)(struct tcpcb *);
 	void	(*tfb_tcp_fb_fini)(struct tcpcb *, int);
 	/* Optional timers, must define all if you define one */
 	int	(*tfb_tcp_timer_stop_all)(struct tcpcb *);
 	void	(*tfb_tcp_timer_activate)(struct tcpcb *,
 			    uint32_t, u_int);
 	int	(*tfb_tcp_timer_active)(struct tcpcb *, uint32_t);
 	void	(*tfb_tcp_timer_stop)(struct tcpcb *, uint32_t);
 	void	(*tfb_tcp_rexmit_tmr)(struct tcpcb *);
 	int	(*tfb_tcp_handoff_ok)(struct tcpcb *);
 	void	(*tfb_tcp_mtu_chg)(struct tcpcb *);
 	volatile uint32_t tfb_refcnt;
 	uint32_t  tfb_flags;
 	uint8_t	tfb_id;
 };
 
 struct tcp_function {
 	TAILQ_ENTRY(tcp_function)	tf_next;
 	char				tf_name[TCP_FUNCTION_NAME_LEN_MAX];
 	struct tcp_function_block	*tf_fb;
 };
 
 TAILQ_HEAD(tcp_funchead, tcp_function);
 #endif	/* _KERNEL */
 
 /*
  * Flags and utility macros for the t_flags field.
  */
 #define	TF_ACKNOW	0x00000001	/* ack peer immediately */
 #define	TF_DELACK	0x00000002	/* ack, but try to delay it */
 #define	TF_NODELAY	0x00000004	/* don't delay packets to coalesce */
 #define	TF_NOOPT	0x00000008	/* don't use tcp options */
 #define	TF_SENTFIN	0x00000010	/* have sent FIN */
 #define	TF_REQ_SCALE	0x00000020	/* have/will request window scaling */
 #define	TF_RCVD_SCALE	0x00000040	/* other side has requested scaling */
 #define	TF_REQ_TSTMP	0x00000080	/* have/will request timestamps */
 #define	TF_RCVD_TSTMP	0x00000100	/* a timestamp was received in SYN */
 #define	TF_SACK_PERMIT	0x00000200	/* other side said I could SACK */
 #define	TF_NEEDSYN	0x00000400	/* send SYN (implicit state) */
 #define	TF_NEEDFIN	0x00000800	/* send FIN (implicit state) */
 #define	TF_NOPUSH	0x00001000	/* don't push */
 #define	TF_PREVVALID	0x00002000	/* saved values for bad rxmit valid */
 #define	TF_MORETOCOME	0x00010000	/* More data to be appended to sock */
 #define	TF_LQ_OVERFLOW	0x00020000	/* listen queue overflow */
 #define	TF_LASTIDLE	0x00040000	/* connection was previously idle */
 #define	TF_RXWIN0SENT	0x00080000	/* sent a receiver win 0 in response */
 #define	TF_FASTRECOVERY	0x00100000	/* in NewReno Fast Recovery */
 #define	TF_WASFRECOVERY	0x00200000	/* was in NewReno Fast Recovery */
 #define	TF_SIGNATURE	0x00400000	/* require MD5 digests (RFC2385) */
 #define	TF_FORCEDATA	0x00800000	/* force out a byte */
 #define	TF_TSO		0x01000000	/* TSO enabled on this connection */
 #define	TF_TOE		0x02000000	/* this connection is offloaded */
 #define	TF_ECN_PERMIT	0x04000000	/* connection ECN-ready */
 #define	TF_ECN_SND_CWR	0x08000000	/* ECN CWR in queue */
 #define	TF_ECN_SND_ECE	0x10000000	/* ECN ECE in queue */
 #define	TF_CONGRECOVERY	0x20000000	/* congestion recovery mode */
 #define	TF_WASCRECOVERY	0x40000000	/* was in congestion recovery */
 #define	TF_FASTOPEN	0x80000000	/* TCP Fast Open indication */
 
 #define	IN_FASTRECOVERY(t_flags)	(t_flags & TF_FASTRECOVERY)
 #define	ENTER_FASTRECOVERY(t_flags)	t_flags |= TF_FASTRECOVERY
 #define	EXIT_FASTRECOVERY(t_flags)	t_flags &= ~TF_FASTRECOVERY
 
 #define	IN_CONGRECOVERY(t_flags)	(t_flags & TF_CONGRECOVERY)
 #define	ENTER_CONGRECOVERY(t_flags)	t_flags |= TF_CONGRECOVERY
 #define	EXIT_CONGRECOVERY(t_flags)	t_flags &= ~TF_CONGRECOVERY
 
 #define	IN_RECOVERY(t_flags) (t_flags & (TF_CONGRECOVERY | TF_FASTRECOVERY))
 #define	ENTER_RECOVERY(t_flags) t_flags |= (TF_CONGRECOVERY | TF_FASTRECOVERY)
 #define	EXIT_RECOVERY(t_flags) t_flags &= ~(TF_CONGRECOVERY | TF_FASTRECOVERY)
 
 #if defined(_KERNEL) && !defined(TCP_RFC7413)
 #define	IS_FASTOPEN(t_flags)		(false)
 #else
 #define	IS_FASTOPEN(t_flags)		(t_flags & TF_FASTOPEN)
 #endif
 
 #define	BYTES_THIS_ACK(tp, th)	(th->th_ack - tp->snd_una)
 
 /*
  * Flags for the t_oobflags field.
  */
 #define	TCPOOB_HAVEDATA	0x01
 #define	TCPOOB_HADDATA	0x02
 
 /*
  * Flags for the extended TCP flags field, t_flags2
  */
 #define	TF2_PLPMTU_BLACKHOLE	0x00000001 /* Possible PLPMTUD Black Hole. */
 #define	TF2_PLPMTU_PMTUD	0x00000002 /* Allowed to attempt PLPMTUD. */
 #define	TF2_PLPMTU_MAXSEGSNT	0x00000004 /* Last seg sent was full seg. */
 #define	TF2_LOG_AUTO		0x00000008 /* Session is auto-logging. */
 #define TF2_DROP_AF_DATA 	0x00000010 /* Drop after all data ack'd */
 
 /*
  * Structure to hold TCP options that are only used during segment
  * processing (in tcp_input), but not held in the tcpcb.
  * It's basically used to reduce the number of parameters
  * to tcp_dooptions and tcp_addoptions.
  * The binary order of the to_flags is relevant for packing of the
  * options in tcp_addoptions.
  *
  * The TOF_* bits are not contiguous: 0x0008 and 0x0020 are not assigned
  * in this list.  TOF_MAXOPT is the first bit above the highest assigned
  * option bit.
  * NOTE(review): to_sacks, to_signature and to_tfo_cookie are plain
  * pointers; presumably they reference option bytes inside the segment
  * being parsed rather than storage owned by this structure -- confirm in
  * tcp_dooptions.
  */
 struct tcpopt {
 	u_int32_t	to_flags;	/* which options are present */
 #define	TOF_MSS		0x0001		/* maximum segment size */
 #define	TOF_SCALE	0x0002		/* window scaling */
 #define	TOF_SACKPERM	0x0004		/* SACK permitted */
 #define	TOF_TS		0x0010		/* timestamp */
 #define	TOF_SIGNATURE	0x0040		/* TCP-MD5 signature option (RFC2385) */
 #define	TOF_SACK	0x0080		/* Peer sent SACK option */
 #define	TOF_FASTOPEN	0x0100		/* TCP Fast Open (TFO) cookie */
 #define	TOF_MAXOPT	0x0200
 	u_int32_t	to_tsval;	/* new timestamp */
 	u_int32_t	to_tsecr;	/* reflected timestamp */
 	u_char		*to_sacks;	/* pointer to the first SACK blocks */
 	u_char		*to_signature;	/* pointer to the TCP-MD5 signature */
 	u_int8_t	*to_tfo_cookie; /* pointer to the TFO cookie */
 	u_int16_t	to_mss;		/* maximum segment size */
 	u_int8_t	to_wscale;	/* window scaling */
 	u_int8_t	to_nsacks;	/* number of SACK blocks */
 	u_int8_t	to_tfo_len;	/* TFO cookie length */
 	u_int32_t	to_spare;	/* UTO */
 };
 
 /*
  * Flags for tcp_dooptions.
  */
 #define	TO_SYN		0x01		/* parse SYN-only options */
 
 /*
  * Reduced set of per-path metrics; every member is a 32-bit unsigned
  * value.  The layout is coupled to struct hc_metrics (see the note on the
  * opening line), so members must not be reordered or resized here alone.
  * NOTE(review): the units of rmx_rtt / rmx_rttvar are not visible in this
  * header; confirm before interpreting them.
  */
 struct hc_metrics_lite {	/* must stay in sync with hc_metrics */
 	uint32_t	rmx_mtu;	/* MTU for this path */
 	uint32_t	rmx_ssthresh;	/* outbound gateway buffer limit */
 	uint32_t	rmx_rtt;	/* estimated round trip time */
 	uint32_t	rmx_rttvar;	/* estimated rtt variance */
 	uint32_t	rmx_cwnd;	/* congestion window */
 	uint32_t	rmx_sendpipe;   /* outbound delay-bandwidth product */
 	uint32_t	rmx_recvpipe;   /* inbound delay-bandwidth product */
 };
 
 /*
  * Used by tcp_maxmtu() to communicate interface specific features
  * and limits at the time of connection setup.
  *
  * NOTE(review): the member meanings below are inferred from their names
  * only; confirm against tcp_maxmtu() / tcp_maxmtu6().
  */
 struct tcp_ifcap {
 	int	ifcap;		/* presumably interface capability bits */
 	u_int	tsomax;		/* presumably max bytes per TSO request */
 	u_int	tsomaxsegcount;	/* presumably max segments per TSO request */
 	u_int	tsomaxsegsize;	/* presumably max size of one TSO segment */
 };
 
 #ifndef _NETINET_IN_PCB_H_
 struct in_conninfo;
 #endif /* _NETINET_IN_PCB_H_ */
 
 /*
  * Reduced per-connection record.  intotw() casts an inpcb's inp_ppcb
  * pointer to this type, and tw_inpcb points back at that inpcb.
  * NOTE(review): by name this is the state kept for connections in
  * TIME_WAIT (tw_2msl links entries on a queue, tw_time is presumably the
  * expiry time); confirm in the timewait code.
  */
 struct tcptw {
 	struct inpcb	*tw_inpcb;	/* XXX back pointer to internet pcb */
 	tcp_seq		snd_nxt;
 	tcp_seq		rcv_nxt;
 	tcp_seq		iss;
 	tcp_seq		irs;
 	u_short		last_win;	/* cached window value */
 	short		tw_so_options;	/* copy of so_options */
 	struct ucred	*tw_cred;	/* user credentials */
 	u_int32_t	t_recent;
 	u_int32_t	ts_offset;	/* our timestamp offset */
 	u_int		t_starttime;
 	int		tw_time;
 	TAILQ_ENTRY(tcptw) tw_2msl;	/* queue linkage */
 	void		*tw_pspare;	/* TCP_SIGNATURE */
 	u_int		*tw_spare;	/* TCP_SIGNATURE */
 };
 
 /*
  * PCB conversion helpers.  intotcpcb() and intotw() both read the same
  * inp_ppcb pointer and differ only in the type they cast it to, so the
  * caller must know which kind of object the inpcb currently carries.
  * sototcpcb() goes from a socket to its tcpcb via sotoinpcb().
  */
 #define	intotcpcb(ip)	((struct tcpcb *)(ip)->inp_ppcb)
 #define	intotw(ip)	((struct tcptw *)(ip)->inp_ppcb)
 #define	sototcpcb(so)	(intotcpcb(sotoinpcb(so)))
 
 /*
  * The smoothed round-trip time and estimated variance
  * are stored as fixed point numbers scaled by the values below.
  * For convenience, these scales are also used in smoothing the average
  * (smoothed = (1/scale)sample + ((scale-1)/scale)smoothed).
  * With these scales, srtt has 5 bits to the right of the binary point
  * (scale 32 == 1 << 5) and rttvar has 4 bits to the right of the binary
  * point (scale 16 == 1 << 4).
  * NOTE(review): this comment historically quoted 3 and 2 fractional bits
  * with an "ALPHA" of 0.875 for srtt and 0.75 for rttvar; those gains are
  * applied in tcp_input.c together with TCP_DELTA_SHIFT -- confirm there.
  */
 #define	TCP_RTT_SCALE		32	/* multiplier for srtt; 5 bits frac. */
 #define	TCP_RTT_SHIFT		5	/* shift for srtt; 5 bits frac. */
 #define	TCP_RTTVAR_SCALE	16	/* multiplier for rttvar; 4 bits */
 #define	TCP_RTTVAR_SHIFT	4	/* shift for rttvar; 4 bits */
 #define	TCP_DELTA_SHIFT		2	/* see tcp_input.c */
 
 /*
  * The initial retransmission should happen at rtt + 4 * rttvar.
  * Because of the way we do the smoothing, srtt and rttvar
  * will each average +1/2 tick of bias.  When we compute
  * the retransmit timer, we want 1/2 tick of rounding and
  * 1 extra tick because of +-1/2 tick uncertainty in the
  * firing of the timer.  The bias will give us exactly the
  * 1.5 tick we need.  But, because the bias is
  * statistical, we have to test that we don't drop below
  * the minimum feasible timer (which is 2 ticks).
  * This version of the macro adapted from a paper by Lawrence
  * Brakmo and Larry Peterson which outlines a problem caused
  * by insufficient precision in the original implementation,
  * which results in inappropriately large RTO values for very
  * fast networks.
  *
  * The expansion is max(t_rttmin, ((t_srtt >> (TCP_RTT_SHIFT -
  * TCP_DELTA_SHIFT)) + t_rttvar) >> TCP_DELTA_SHIFT).  The tp argument is
  * evaluated three times, so it must not have side effects.
  */
 #define	TCP_REXMTVAL(tp) \
 	max((tp)->t_rttmin, (((tp)->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT))  \
 	  + (tp)->t_rttvar) >> TCP_DELTA_SHIFT)
 
 /*
  * TCP statistics.
  * Many of these should be kept per connection,
  * but that's inconvenient at the moment.
  *
  * Every member is a uint64_t.  KMOD_TCPSTAT_INC() in this header turns a
  * member name into a counter index with offsetof() / sizeof(uint64_t), so
  * any new member must keep that type.
  */
 struct	tcpstat {
 	uint64_t tcps_connattempt;	/* connections initiated */
 	uint64_t tcps_accepts;		/* connections accepted */
 	uint64_t tcps_connects;		/* connections established */
 	uint64_t tcps_drops;		/* connections dropped */
 	uint64_t tcps_conndrops;	/* embryonic connections dropped */
 	uint64_t tcps_minmssdrops;	/* average minmss too low drops */
 	uint64_t tcps_closed;		/* conn. closed (includes drops) */
 	uint64_t tcps_segstimed;	/* segs where we tried to get rtt */
 	uint64_t tcps_rttupdated;	/* times we succeeded */
 	uint64_t tcps_delack;		/* delayed acks sent */
 	uint64_t tcps_timeoutdrop;	/* conn. dropped in rxmt timeout */
 	uint64_t tcps_rexmttimeo;	/* retransmit timeouts */
 	uint64_t tcps_persisttimeo;	/* persist timeouts */
 	uint64_t tcps_keeptimeo;	/* keepalive timeouts */
 	uint64_t tcps_keepprobe;	/* keepalive probes sent */
 	uint64_t tcps_keepdrops;	/* connections dropped in keepalive */
 
 	uint64_t tcps_sndtotal;		/* total packets sent */
 	uint64_t tcps_sndpack;		/* data packets sent */
 	uint64_t tcps_sndbyte;		/* data bytes sent */
 	uint64_t tcps_sndrexmitpack;	/* data packets retransmitted */
 	uint64_t tcps_sndrexmitbyte;	/* data bytes retransmitted */
 	uint64_t tcps_sndrexmitbad;	/* unnecessary packet retransmissions */
 	uint64_t tcps_sndacks;		/* ack-only packets sent */
 	uint64_t tcps_sndprobe;		/* window probes sent */
 	uint64_t tcps_sndurg;		/* packets sent with URG only */
 	uint64_t tcps_sndwinup;		/* window update-only packets sent */
 	uint64_t tcps_sndctrl;		/* control (SYN|FIN|RST) packets sent */
 
 	uint64_t tcps_rcvtotal;		/* total packets received */
 	uint64_t tcps_rcvpack;		/* packets received in sequence */
 	uint64_t tcps_rcvbyte;		/* bytes received in sequence */
 	uint64_t tcps_rcvbadsum;	/* packets received with checksum errs */
 	uint64_t tcps_rcvbadoff;	/* packets received with bad offset */
 	uint64_t tcps_rcvreassfull;	/* packets dropped for no reass space */
 	uint64_t tcps_rcvshort;		/* packets received too short */
 	uint64_t tcps_rcvduppack;	/* duplicate-only packets received */
 	uint64_t tcps_rcvdupbyte;	/* duplicate-only bytes received */
 	uint64_t tcps_rcvpartduppack;	/* packets with some duplicate data */
 	uint64_t tcps_rcvpartdupbyte;	/* dup. bytes in part-dup. packets */
 	uint64_t tcps_rcvoopack;	/* out-of-order packets received */
 	uint64_t tcps_rcvoobyte;	/* out-of-order bytes received */
 	uint64_t tcps_rcvpackafterwin;	/* packets with data after window */
 	uint64_t tcps_rcvbyteafterwin;	/* bytes rcvd after window */
 	uint64_t tcps_rcvafterclose;	/* packets rcvd after "close" */
 	uint64_t tcps_rcvwinprobe;	/* rcvd window probe packets */
 	uint64_t tcps_rcvdupack;	/* rcvd duplicate acks */
 	uint64_t tcps_rcvacktoomuch;	/* rcvd acks for unsent data */
 	uint64_t tcps_rcvackpack;	/* rcvd ack packets */
 	uint64_t tcps_rcvackbyte;	/* bytes acked by rcvd acks */
 	uint64_t tcps_rcvwinupd;	/* rcvd window update packets */
 	uint64_t tcps_pawsdrop;		/* segments dropped due to PAWS */
 	uint64_t tcps_predack;		/* times hdr predict ok for acks */
 	uint64_t tcps_preddat;		/* times hdr predict ok for data pkts */
 	uint64_t tcps_pcbcachemiss;	/* NOTE(review): by name, PCB cache misses */
 	uint64_t tcps_cachedrtt;	/* times cached RTT in route updated */
 	uint64_t tcps_cachedrttvar;	/* times cached rttvar updated */
 	uint64_t tcps_cachedssthresh;	/* times cached ssthresh updated */
 	uint64_t tcps_usedrtt;		/* times RTT initialized from route */
 	uint64_t tcps_usedrttvar;	/* times RTTVAR initialized from rt */
 	uint64_t tcps_usedssthresh;	/* times ssthresh initialized from rt*/
 	uint64_t tcps_persistdrop;	/* timeout in persist state */
 	uint64_t tcps_badsyn;		/* bogus SYN, e.g. premature ACK */
 	uint64_t tcps_mturesent;	/* resends due to MTU discovery */
 	uint64_t tcps_listendrop;	/* listen queue overflows */
 	uint64_t tcps_badrst;		/* ignored RSTs in the window */
 
 	uint64_t tcps_sc_added;		/* entry added to syncache */
 	uint64_t tcps_sc_retransmitted;	/* syncache entry was retransmitted */
 	uint64_t tcps_sc_dupsyn;	/* duplicate SYN packet */
 	uint64_t tcps_sc_dropped;	/* could not reply to packet */
 	uint64_t tcps_sc_completed;	/* successful extraction of entry */
 	uint64_t tcps_sc_bucketoverflow;/* syncache per-bucket limit hit */
 	uint64_t tcps_sc_cacheoverflow;	/* syncache cache limit hit */
 	uint64_t tcps_sc_reset;		/* RST removed entry from syncache */
 	uint64_t tcps_sc_stale;		/* timed out or listen socket gone */
 	uint64_t tcps_sc_aborted;	/* syncache entry aborted */
 	uint64_t tcps_sc_badack;	/* removed due to bad ACK */
 	uint64_t tcps_sc_unreach;	/* ICMP unreachable received */
 	uint64_t tcps_sc_zonefail;	/* zalloc() failed */
 	uint64_t tcps_sc_sendcookie;	/* SYN cookie sent */
 	uint64_t tcps_sc_recvcookie;	/* SYN cookie received */
 
 	uint64_t tcps_hc_added;		/* entry added to hostcache */
 	uint64_t tcps_hc_bucketoverflow;/* hostcache per bucket limit hit */
 
 	uint64_t tcps_finwait2_drops;    /* Drop FIN_WAIT_2 connection after time limit */
 
 	/* SACK related stats */
 	uint64_t tcps_sack_recovery_episode; /* SACK recovery episodes */
 	uint64_t tcps_sack_rexmits;	    /* SACK rexmit segments   */
 	uint64_t tcps_sack_rexmit_bytes;    /* SACK rexmit bytes      */
 	uint64_t tcps_sack_rcv_blocks;	    /* SACK blocks (options) received */
 	uint64_t tcps_sack_send_blocks;	    /* SACK blocks (options) sent     */
 	uint64_t tcps_sack_sboverflow;	    /* times scoreboard overflowed */
 	
 	/* ECN related stats */
 	uint64_t tcps_ecn_ce;		/* ECN Congestion Experienced */
 	uint64_t tcps_ecn_ect0;		/* ECN Capable Transport, ECT(0) */
 	uint64_t tcps_ecn_ect1;		/* ECN Capable Transport, ECT(1) */
 	uint64_t tcps_ecn_shs;		/* ECN successful handshakes */
 	uint64_t tcps_ecn_rcwnd;	/* # times ECN reduced the cwnd */
 
 	/* TCP_SIGNATURE related stats */
 	uint64_t tcps_sig_rcvgoodsig;	/* Total matching signature received */
 	uint64_t tcps_sig_rcvbadsig;	/* Total bad signature received */
 	uint64_t tcps_sig_err_buildsig;	/* Failed to make signature */
 	uint64_t tcps_sig_err_sigopt;	/* No signature expected by socket */
 	uint64_t tcps_sig_err_nosigopt;	/* No signature provided by segment */
 
 	/* Path MTU Discovery Black Hole Detection related stats */
 	uint64_t tcps_pmtud_blackhole_activated;	 /* Black Hole Count */
 	uint64_t tcps_pmtud_blackhole_activated_min_mss; /* BH at min MSS Count */
 	uint64_t tcps_pmtud_blackhole_failed;		 /* Black Hole Failure Count */
 
 	uint64_t _pad[12];		/* 6 UTO, 6 TBD */
 };
 
 /* Old member name kept as an alias for tcps_rcvreassfull. */
 #define	tcps_rcvmemdrop	tcps_rcvreassfull	/* compat */
 
 #ifdef _KERNEL
 #define	TI_UNLOCKED	1
 #define	TI_RLOCKED	2
 #include <sys/counter.h>
 
 VNET_PCPUSTAT_DECLARE(struct tcpstat, tcpstat);	/* tcp statistics */
 /*
  * In-kernel consumers can use these accessor macros directly to update
  * stats.  "name" is a member name of struct tcpstat; TCPSTAT_INC(name) is
  * TCPSTAT_ADD(name, 1).
  */
 #define	TCPSTAT_ADD(name, val)	\
     VNET_PCPUSTAT_ADD(struct tcpstat, tcpstat, name, (val))
 #define	TCPSTAT_INC(name)	TCPSTAT_ADD(name, 1)
 
 /*
  * Kernel module consumers must use this accessor macro.
  * It passes kmod_tcpstat_inc() the member's index, computed as the
  * member's byte offset in struct tcpstat divided by sizeof(uint64_t); this
  * is only meaningful while every member of struct tcpstat is a uint64_t.
  */
 void	kmod_tcpstat_inc(int statnum);
 #define	KMOD_TCPSTAT_INC(name)						\
     kmod_tcpstat_inc(offsetof(struct tcpstat, name) / sizeof(uint64_t))
 
 /*
  * Running TCP connection count by state.
  * One counter per state, indexed by the state number (TCP_NSTATES
  * entries).  TCPSTATES_INC() adds 1 and TCPSTATES_DEC() adds -1 to the
  * counter for the given state.
  */
 VNET_DECLARE(counter_u64_t, tcps_states[TCP_NSTATES]);
 #define	V_tcps_states	VNET(tcps_states)
 #define	TCPSTATES_INC(state)	counter_u64_add(V_tcps_states[state], 1)
 #define	TCPSTATES_DEC(state)	counter_u64_add(V_tcps_states[state], -1)
 
 /*
  * TCP specific helper hook point identifiers.
  * HHOOK_TCP_LAST aliases the highest identifier and is used to size
  * per-hook arrays (HHOOK_TCP_LAST + 1 entries).
  */
 #define	HHOOK_TCP_EST_IN		0
 #define	HHOOK_TCP_EST_OUT		1
 #define	HHOOK_TCP_LAST			HHOOK_TCP_EST_OUT
 
 /*
  * Argument bundle for the TCP helper hooks.
  * NOTE(review): which members are filled in for the "in" versus the "out"
  * hook is not visible in this header; confirm in hhook_run_tcp_est_in()
  * and hhook_run_tcp_est_out().
  */
 struct tcp_hhook_data {
 	struct tcpcb	*tp;
 	struct tcphdr	*th;
 	struct tcpopt	*to;
 	uint32_t	len;
 	int		tso;
 	tcp_seq		curack;
 };
 #ifdef TCP_HHOOK
 void hhook_run_tcp_est_out(struct tcpcb *tp,
 	struct tcphdr *th, struct tcpopt *to,
 	uint32_t len, int tso);
 #endif
 #endif
 
 /*
  * TCB structure exported to user-land via sysctl(3).
  *
  * Fields prefixed with "xt_" are unique to the export structure, and fields
  * with "t_" or other prefixes match corresponding fields of 'struct tcpcb'.
  *
  * Legend:
  * (s) - used by userland utilities in src
  * (p) - used by utilities in ports
  * (3) - is known to be used by third party software not in ports
  * (n) - no known usage
  *
  * Evil hack: declare only if in_pcb.h and sys/socketvar.h have been
  * included.  Not all of our clients do.
  */
 #if defined(_NETINET_IN_PCB_H_) && defined(_SYS_SOCKETVAR_H_)
 /*
  * Export record; the parenthesized letters on each member refer to the
  * usage legend in the comment above.  The structure is 8-byte aligned.
  * NOTE(review): spare64 / spare32 are presumably reserved so members can
  * be added without changing the structure size; confirm before reusing.
  */
 struct xtcpcb {
 	ksize_t	xt_len;		/* length of this structure */
 	struct xinpcb	xt_inp;
 	char		xt_stack[TCP_FUNCTION_NAME_LEN_MAX];	/* (s) */
 	char		xt_logid[TCP_LOG_ID_LEN];	/* (s) */
 	int64_t		spare64[8];
 	int32_t		t_state;		/* (s,p) */
 	uint32_t	t_flags;		/* (s,p) */
 	int32_t		t_sndzerowin;		/* (s) */
 	int32_t		t_sndrexmitpack;	/* (s) */
 	int32_t		t_rcvoopack;		/* (s) */
 	int32_t		t_rcvtime;		/* (s) */
 	int32_t		tt_rexmt;		/* (s) */
 	int32_t		tt_persist;		/* (s) */
 	int32_t		tt_keep;		/* (s) */
 	int32_t		tt_2msl;		/* (s) */
 	int32_t		tt_delack;		/* (s) */
 	int32_t		t_logstate;		/* (3) */
 	int32_t		spare32[32];
 } __aligned(8);
 
 #ifdef _KERNEL
 void	tcp_inptoxtp(const struct inpcb *, struct xtcpcb *);
 #endif
 #endif
 
 /*
  * TCP function information (name-to-id mapping, aliases, and refcnt)
  * exported to user-land via sysctl(3).
  * Both strings are fixed-size buffers of TCP_FUNCTION_NAME_LEN_MAX bytes.
  */
 struct tcp_function_info {
 	uint32_t	tfi_refcnt;	/* reference count */
 	uint8_t		tfi_id;		/* numeric id the name maps to */
 	char		tfi_name[TCP_FUNCTION_NAME_LEN_MAX];	/* name */
 	char		tfi_alias[TCP_FUNCTION_NAME_LEN_MAX];	/* alias */
 };
 
 /*
  * Identifiers for TCP sysctl nodes
  * The numbering runs from 1 to 16; identifier 2 is not assigned in this
  * list.
  */
 #define	TCPCTL_DO_RFC1323	1	/* use RFC-1323 extensions */
 #define	TCPCTL_MSSDFLT		3	/* MSS default */
 #define TCPCTL_STATS		4	/* statistics */
 #define	TCPCTL_RTTDFLT		5	/* default RTT estimate */
 #define	TCPCTL_KEEPIDLE		6	/* keepalive idle timer */
 #define	TCPCTL_KEEPINTVL	7	/* interval to send keepalives */
 #define	TCPCTL_SENDSPACE	8	/* send buffer space */
 #define	TCPCTL_RECVSPACE	9	/* receive buffer space */
 #define	TCPCTL_KEEPINIT		10	/* timeout for establishing syn */
 #define	TCPCTL_PCBLIST		11	/* list of all outstanding PCBs */
 #define	TCPCTL_DELACKTIME	12	/* time before sending delayed ACK */
 #define	TCPCTL_V6MSSDFLT	13	/* MSS default for IPv6 */
 #define	TCPCTL_SACK		14	/* Selective Acknowledgement,rfc 2018 */
 #define	TCPCTL_DROP		15	/* drop tcp connection */
 #define	TCPCTL_STATES		16	/* connection counts by TCP state */
 
 #ifdef _KERNEL
 #ifdef SYSCTL_DECL
 SYSCTL_DECL(_net_inet_tcp);
 SYSCTL_DECL(_net_inet_tcp_sack);
 MALLOC_DECLARE(M_TCPLOG);
 #endif
 
 VNET_DECLARE(int, tcp_log_in_vain);
 #define	V_tcp_log_in_vain		VNET(tcp_log_in_vain)
 
 /*
  * Global TCP tunables shared between different stacks.
  * Please keep the list sorted.
  */
 VNET_DECLARE(int, drop_synfin);
 VNET_DECLARE(int, path_mtu_discovery);
 VNET_DECLARE(int, tcp_abc_l_var);
 VNET_DECLARE(int, tcp_autorcvbuf_inc);
 VNET_DECLARE(int, tcp_autorcvbuf_max);
 VNET_DECLARE(int, tcp_autosndbuf_inc);
 VNET_DECLARE(int, tcp_autosndbuf_max);
 VNET_DECLARE(int, tcp_delack_enabled);
 VNET_DECLARE(int, tcp_do_autorcvbuf);
 VNET_DECLARE(int, tcp_do_autosndbuf);
 VNET_DECLARE(int, tcp_do_ecn);
 VNET_DECLARE(int, tcp_do_rfc1323);
 VNET_DECLARE(int, tcp_do_rfc3042);
 VNET_DECLARE(int, tcp_do_rfc3390);
 VNET_DECLARE(int, tcp_do_rfc3465);
 VNET_DECLARE(int, tcp_do_rfc6675_pipe);
 VNET_DECLARE(int, tcp_do_sack);
 VNET_DECLARE(int, tcp_do_tso);
 VNET_DECLARE(int, tcp_ecn_maxretries);
 VNET_DECLARE(int, tcp_initcwnd_segments);
 VNET_DECLARE(int, tcp_insecure_rst);
 VNET_DECLARE(int, tcp_insecure_syn);
 VNET_DECLARE(int, tcp_minmss);
 VNET_DECLARE(int, tcp_mssdflt);
 VNET_DECLARE(int, tcp_recvspace);
 VNET_DECLARE(int, tcp_sack_globalholes);
 VNET_DECLARE(int, tcp_sack_globalmaxholes);
 VNET_DECLARE(int, tcp_sack_maxholes);
 VNET_DECLARE(int, tcp_sc_rst_sock_fail);
 VNET_DECLARE(int, tcp_sendspace);
 VNET_DECLARE(struct inpcbhead, tcb);
 VNET_DECLARE(struct inpcbinfo, tcbinfo);
 
 /*
  * Per-VNET accessor shorthands: each V_<name> expands to VNET(<name>).
  * NOTE(review): V_tcp_ts_offset_per_conn, V_tcp_udp_tunneling_overhead and
  * V_tcp_udp_tunneling_port have no matching VNET_DECLARE() in the tunables
  * list of this header section; confirm the variables are declared wherever
  * these accessors are used.  V_tcp_ts_offset_per_conn also sits out of
  * alphabetical order.
  */
 #define	V_drop_synfin			VNET(drop_synfin)
 #define	V_path_mtu_discovery		VNET(path_mtu_discovery)
 #define	V_tcb				VNET(tcb)
 #define	V_tcbinfo			VNET(tcbinfo)
 #define	V_tcp_abc_l_var			VNET(tcp_abc_l_var)
 #define	V_tcp_autorcvbuf_inc		VNET(tcp_autorcvbuf_inc)
 #define	V_tcp_autorcvbuf_max		VNET(tcp_autorcvbuf_max)
 #define	V_tcp_autosndbuf_inc		VNET(tcp_autosndbuf_inc)
 #define	V_tcp_autosndbuf_max		VNET(tcp_autosndbuf_max)
 #define	V_tcp_delack_enabled		VNET(tcp_delack_enabled)
 #define	V_tcp_do_autorcvbuf		VNET(tcp_do_autorcvbuf)
 #define	V_tcp_do_autosndbuf		VNET(tcp_do_autosndbuf)
 #define	V_tcp_do_ecn			VNET(tcp_do_ecn)
 #define	V_tcp_do_rfc1323		VNET(tcp_do_rfc1323)
 #define V_tcp_ts_offset_per_conn	VNET(tcp_ts_offset_per_conn)
 #define	V_tcp_do_rfc3042		VNET(tcp_do_rfc3042)
 #define	V_tcp_do_rfc3390		VNET(tcp_do_rfc3390)
 #define	V_tcp_do_rfc3465		VNET(tcp_do_rfc3465)
 #define	V_tcp_do_rfc6675_pipe		VNET(tcp_do_rfc6675_pipe)
 #define	V_tcp_do_sack			VNET(tcp_do_sack)
 #define	V_tcp_do_tso			VNET(tcp_do_tso)
 #define	V_tcp_ecn_maxretries		VNET(tcp_ecn_maxretries)
 #define	V_tcp_initcwnd_segments		VNET(tcp_initcwnd_segments)
 #define	V_tcp_insecure_rst		VNET(tcp_insecure_rst)
 #define	V_tcp_insecure_syn		VNET(tcp_insecure_syn)
 #define	V_tcp_minmss			VNET(tcp_minmss)
 #define	V_tcp_mssdflt			VNET(tcp_mssdflt)
 #define	V_tcp_recvspace			VNET(tcp_recvspace)
 #define	V_tcp_sack_globalholes		VNET(tcp_sack_globalholes)
 #define	V_tcp_sack_globalmaxholes	VNET(tcp_sack_globalmaxholes)
 #define	V_tcp_sack_maxholes		VNET(tcp_sack_maxholes)
 #define	V_tcp_sc_rst_sock_fail		VNET(tcp_sc_rst_sock_fail)
 #define	V_tcp_sendspace			VNET(tcp_sendspace)
 #define	V_tcp_udp_tunneling_overhead	VNET(tcp_udp_tunneling_overhead)
 #define	V_tcp_udp_tunneling_port	VNET(tcp_udp_tunneling_port)
 
 
 #ifdef TCP_HHOOK
 VNET_DECLARE(struct hhook_head *, tcp_hhh[HHOOK_TCP_LAST + 1]);
 #define	V_tcp_hhh		VNET(tcp_hhh)
 #endif
 
 int	 tcp_addoptions(struct tcpopt *, u_char *);
 int	 tcp_ccalgounload(struct cc_algo *unload_algo);
 struct tcpcb *
 	 tcp_close(struct tcpcb *);
 void	 tcp_discardcb(struct tcpcb *);
 void	 tcp_twstart(struct tcpcb *);
 void	 tcp_twclose(struct tcptw *, int);
 void	 tcp_ctlinput(int, struct sockaddr *, void *);
 int	 tcp_ctloutput(struct socket *, struct sockopt *);
 struct tcpcb *
 	 tcp_drop(struct tcpcb *, int);
 void	 tcp_drain(void);
 void	 tcp_init(void);
 void	 tcp_fini(void *);
 char	*tcp_log_addrs(struct in_conninfo *, struct tcphdr *, void *,
 	    const void *);
 char	*tcp_log_vain(struct in_conninfo *, struct tcphdr *, void *,
 	    const void *);
 int	 tcp_reass(struct tcpcb *, struct tcphdr *, tcp_seq *, int *, struct mbuf *);
 void	 tcp_reass_global_init(void);
 void	 tcp_reass_flush(struct tcpcb *);
 void	 tcp_dooptions(struct tcpopt *, u_char *, int, int);
 void	tcp_dropwithreset(struct mbuf *, struct tcphdr *,
 		     struct tcpcb *, int, int);
 void	tcp_pulloutofband(struct socket *,
 		     struct tcphdr *, struct mbuf *, int);
 void	tcp_xmit_timer(struct tcpcb *, int);
 void	tcp_newreno_partial_ack(struct tcpcb *, struct tcphdr *);
 void	cc_ack_received(struct tcpcb *tp, struct tcphdr *th,
 			    uint16_t nsegs, uint16_t type);
 void 	cc_conn_init(struct tcpcb *tp);
 void 	cc_post_recovery(struct tcpcb *tp, struct tcphdr *th);
 void	cc_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type);
 #ifdef TCP_HHOOK
 void	hhook_run_tcp_est_in(struct tcpcb *tp,
 			    struct tcphdr *th, struct tcpopt *to);
 #endif
 
 int	 tcp_input(struct mbuf **, int *, int);
 int	 tcp_autorcvbuf(struct mbuf *, struct tcphdr *, struct socket *,
 	    struct tcpcb *, int);
 void	 tcp_do_segment(struct mbuf *, struct tcphdr *,
 			struct socket *, struct tcpcb *, int, int, uint8_t);
 
 int register_tcp_functions(struct tcp_function_block *blk, int wait);
 int register_tcp_functions_as_names(struct tcp_function_block *blk,
     int wait, const char *names[], int *num_names);
 int register_tcp_functions_as_name(struct tcp_function_block *blk,
     const char *name, int wait);
 int deregister_tcp_functions(struct tcp_function_block *blk, bool quiesce,
     bool force);
 struct tcp_function_block *find_and_ref_tcp_functions(struct tcp_function_set *fs);
 void tcp_switch_back_to_default(struct tcpcb *tp);
 struct tcp_function_block *
 find_and_ref_tcp_fb(struct tcp_function_block *fs);
 int tcp_default_ctloutput(struct socket *so, struct sockopt *sopt, struct inpcb *inp, struct tcpcb *tp);
 
 uint32_t tcp_maxmtu(struct in_conninfo *, struct tcp_ifcap *);
 uint32_t tcp_maxmtu6(struct in_conninfo *, struct tcp_ifcap *);
 u_int	 tcp_maxseg(const struct tcpcb *);
 void	 tcp_mss_update(struct tcpcb *, int, int, struct hc_metrics_lite *,
 	    struct tcp_ifcap *);
 void	 tcp_mss(struct tcpcb *, int);
 int	 tcp_mssopt(struct in_conninfo *);
 struct inpcb *
 	 tcp_drop_syn_sent(struct inpcb *, int);
 struct tcpcb *
 	 tcp_newtcpcb(struct inpcb *);
 int	 tcp_output(struct tcpcb *);
 void	 tcp_state_change(struct tcpcb *, int);
 void	 tcp_respond(struct tcpcb *, void *,
 	    struct tcphdr *, struct mbuf *, tcp_seq, tcp_seq, int);
 void	 tcp_tw_init(void);
 #ifdef VIMAGE
 void	 tcp_tw_destroy(void);
 #endif
 void	 tcp_tw_zone_change(void);
 int	 tcp_twcheck(struct inpcb *, struct tcpopt *, struct tcphdr *,
 	    struct mbuf *, int);
 void	 tcp_setpersist(struct tcpcb *);
 void	 tcp_slowtimo(void);
 struct tcptemp *
 	 tcpip_maketemplate(struct inpcb *);
 void	 tcpip_fillheaders(struct inpcb *, void *, void *);
 void	 tcp_timer_activate(struct tcpcb *, uint32_t, u_int);
 int	 tcp_timer_suspend(struct tcpcb *, uint32_t);
 void	 tcp_timers_unsuspend(struct tcpcb *, uint32_t);
 int	 tcp_timer_active(struct tcpcb *, uint32_t);
 void	 tcp_timer_stop(struct tcpcb *, uint32_t);
 void	 tcp_trace(short, short, struct tcpcb *, void *, struct tcphdr *, int);
 int	 inp_to_cpuid(struct inpcb *inp);
 /*
  * All tcp_hc_* functions are IPv4 and IPv6 (via in_conninfo)
  */
 void	 tcp_hc_init(void);
 #ifdef VIMAGE
 void	 tcp_hc_destroy(void);
 #endif
 void	 tcp_hc_get(struct in_conninfo *, struct hc_metrics_lite *);
 uint32_t tcp_hc_getmtu(struct in_conninfo *);
 void	 tcp_hc_updatemtu(struct in_conninfo *, uint32_t);
 void	 tcp_hc_update(struct in_conninfo *, struct hc_metrics_lite *);
 
 extern	struct pr_usrreqs tcp_usrreqs;
 
 uint32_t tcp_new_ts_offset(struct in_conninfo *);
 tcp_seq	 tcp_new_isn(struct in_conninfo *);
 
 int	 tcp_sack_doack(struct tcpcb *, struct tcpopt *, tcp_seq);
 void	 tcp_update_dsack_list(struct tcpcb *, tcp_seq, tcp_seq);
 void	 tcp_update_sack_list(struct tcpcb *tp, tcp_seq rcv_laststart, tcp_seq rcv_lastend);
 void	 tcp_clean_dsack_blocks(struct tcpcb *tp);
 void	 tcp_clean_sackreport(struct tcpcb *tp);
 void	 tcp_sack_adjust(struct tcpcb *tp);
 struct sackhole *tcp_sack_output(struct tcpcb *tp, int *sack_bytes_rexmt);
 void	 tcp_sack_partialack(struct tcpcb *, struct tcphdr *);
 void	 tcp_free_sackholes(struct tcpcb *tp);
 int	 tcp_newreno(struct tcpcb *, struct tcphdr *);
 int	 tcp_compute_pipe(struct tcpcb *);
 void	 tcp_sndbuf_autoscale(struct tcpcb *, struct socket *, uint32_t);
 struct mbuf *
 	 tcp_m_copym(struct mbuf *m, int32_t off0, int32_t *plen,
 	   int32_t seglimit, int32_t segsize, struct sockbuf *sb);
 
 
 /*
  * Convert the sequence number, acknowledgment number, window and urgent
  * pointer of *th from network to host byte order, in place.
  */
 static inline void
 tcp_fields_to_host(struct tcphdr *th)
 {
 	/* Convert all four fields first, then store the results back. */
 	const uint32_t seq_h = ntohl(th->th_seq);
 	const uint32_t ack_h = ntohl(th->th_ack);
 	const uint16_t win_h = ntohs(th->th_win);
 	const uint16_t urp_h = ntohs(th->th_urp);
 
 	th->th_seq = seq_h;
 	th->th_ack = ack_h;
 	th->th_win = win_h;
 	th->th_urp = urp_h;
 }
 
 /*
  * Convert the sequence number, acknowledgment number, window and urgent
  * pointer of *th from host to network byte order, in place.
  */
 static inline void
 tcp_fields_to_net(struct tcphdr *th)
 {
 	/* Convert all four fields first, then store the results back. */
 	const uint32_t seq_n = htonl(th->th_seq);
 	const uint32_t ack_n = htonl(th->th_ack);
 	const uint16_t win_n = htons(th->th_win);
 	const uint16_t urp_n = htons(th->th_urp);
 
 	th->th_seq = seq_n;
 	th->th_ack = ack_n;
 	th->th_win = win_n;
 	th->th_urp = urp_n;
 }
 #endif /* _KERNEL */
 
 #endif /* _NETINET_TCP_VAR_H_ */
Index: stable/12/sys/sys/mbuf.h
===================================================================
--- stable/12/sys/sys/mbuf.h	(revision 362879)
+++ stable/12/sys/sys/mbuf.h	(revision 362880)
@@ -1,1391 +1,1392 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1982, 1986, 1988, 1993
  *	The Regents of the University of California.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)mbuf.h	8.5 (Berkeley) 2/19/95
  * $FreeBSD$
  */
 
 #ifndef _SYS_MBUF_H_
 #define	_SYS_MBUF_H_
 
 /* XXX: These includes suck. Sorry! */
 #include <sys/queue.h>
 #ifdef _KERNEL
 #include <sys/systm.h>
 #include <vm/uma.h>
 #ifdef WITNESS
 #include <sys/lock.h>
 #endif
 #endif
 
 #ifdef _KERNEL
 #include <sys/sdt.h>
 
 /*
  * MBUF_PROBEn(probe, ...) forwards to SDT_PROBEn() with the provider fixed
  * to "sdt" and the module and function fields left empty; n is the number
  * of probe arguments (1 through 5).
  */
 #define	MBUF_PROBE1(probe, arg0)					\
 	SDT_PROBE1(sdt, , , probe, arg0)
 #define	MBUF_PROBE2(probe, arg0, arg1)					\
 	SDT_PROBE2(sdt, , , probe, arg0, arg1)
 #define	MBUF_PROBE3(probe, arg0, arg1, arg2)				\
 	SDT_PROBE3(sdt, , , probe, arg0, arg1, arg2)
 #define	MBUF_PROBE4(probe, arg0, arg1, arg2, arg3)			\
 	SDT_PROBE4(sdt, , , probe, arg0, arg1, arg2, arg3)
 #define	MBUF_PROBE5(probe, arg0, arg1, arg2, arg3, arg4)		\
 	SDT_PROBE5(sdt, , , probe, arg0, arg1, arg2, arg3, arg4)
 
 SDT_PROBE_DECLARE(sdt, , , m__init);
 SDT_PROBE_DECLARE(sdt, , , m__gethdr);
 SDT_PROBE_DECLARE(sdt, , , m__get);
 SDT_PROBE_DECLARE(sdt, , , m__getcl);
 SDT_PROBE_DECLARE(sdt, , , m__clget);
 SDT_PROBE_DECLARE(sdt, , , m__cljget);
 SDT_PROBE_DECLARE(sdt, , , m__cljset);
 SDT_PROBE_DECLARE(sdt, , , m__free);
 SDT_PROBE_DECLARE(sdt, , , m__freem);
 
 #endif /* _KERNEL */
 
 /*
  * Mbufs are of a single size, MSIZE (sys/param.h), which includes overhead.
  * An mbuf may add a single "mbuf cluster" of size MCLBYTES (also in
  * sys/param.h), which has no additional overhead and is used instead of the
  * internal data area; this is done when at least MINCLSIZE of data must be
  * stored.  Additionally, it is possible to allocate a separate buffer
  * externally and attach it to the mbuf in a way similar to that of mbuf
  * clusters.
  *
  * NB: These calculations do not take actual compiler-induced alignment and
  * padding inside the complete struct mbuf into account.  Appropriate
  * attention is required when changing members of struct mbuf.
  *
  * MHSIZE is the offset of m_dat, i.e. the size of the plain mbuf header.
  * MPKTHSIZE is the offset of m_pktdat, i.e. the header plus packet header.
  * MLEN is data length in a normal mbuf.
  * MHLEN is data length in an mbuf with pktheader.
  * MINCLSIZE is the smallest amount of data that should be put into a
  * cluster; it is defined as one byte more than MHLEN.
  *
  * Compile-time assertions in uipc_mbuf.c test these values to ensure that
  * they are sensible.
  */
 struct mbuf;
 #define	MHSIZE		offsetof(struct mbuf, m_dat)
 #define	MPKTHSIZE	offsetof(struct mbuf, m_pktdat)
 #define	MLEN		((int)(MSIZE - MHSIZE))
 #define	MHLEN		((int)(MSIZE - MPKTHSIZE))
 #define	MINCLSIZE	(MHLEN + 1)
 
 #ifdef _KERNEL
 /*-
  * Macro for type conversion: convert mbuf pointer to data pointer of correct
  * type:
  *
  * mtod(m, t)	-- Convert mbuf pointer to data pointer of correct type.
  * mtodo(m, o) -- Same as above but with offset 'o' into data.
  *
  * mtod() casts m_data to the type 't' given by the caller; mtodo() adds
  * 'o' to m_data and yields a void pointer, so the offset is counted in
  * units of whatever m_data points to.
  * NOTE(review): m_data is a caddr_t, presumably a char pointer, which would
  * make 'o' a byte offset; confirm the caddr_t typedef.
  */
 #define	mtod(m, t)	((t)((m)->m_data))
 #define	mtodo(m, o)	((void *)(((m)->m_data) + (o)))
 
 /*
  * Argument structure passed to UMA routines during mbuf and packet
  * allocations.
  */
 struct mb_args {
 	int	flags;	/* Flags for mbuf being allocated */
 	short	type;	/* Type of mbuf being allocated */
 };
 #endif /* _KERNEL */
 
 /*
  * Packet tag structure (see below for details).
  * Tags are chained through m_tag_link; struct pkthdr holds the list head
  * (its "tags" member).
  */
 struct m_tag {
 	SLIST_ENTRY(m_tag)	m_tag_link;	/* List of packet tags */
 	u_int16_t		m_tag_id;	/* Tag ID */
 	u_int16_t		m_tag_len;	/* Length of data */
 	u_int32_t		m_tag_cookie;	/* ABI/Module ID */
 	void			(*m_tag_free)(struct m_tag *);	/* destructor callback */
 };
 
 /*
  * Static network interface owned tag.
  * Allocated through ifp->if_snd_tag_alloc().
  * struct pkthdr stores a pointer to one of these in the same union slot
  * as its rcvif pointer.
  */
 struct m_snd_tag {
 	struct ifnet *ifp;		/* network interface tag belongs to */
 };
 
 /*
  * Record/packet header in first mbuf of chain; valid only if M_PKTHDR is set.
  * Size ILP32: 48
  *	 LP64: 56
  * Compile-time assertions in uipc_mbuf.c test these values to ensure that
  * they are correct.
  *
  * Several members are anonymous unions, so only one view of each is
  * meaningful at a time: snd_tag shares storage with rcvif, and rcv_tstmp
  * shares storage with the l2hlen..l5hlen/spare group.  PH_per and PH_loc
  * are two 8-byte scratch unions that can be viewed as bytes, 16-, 32- or
  * 64-bit words, or a pointer; the macros after the structure give names to
  * individual slots of those unions.
  */
 struct pkthdr {
 	union {
 		struct m_snd_tag *snd_tag;	/* send tag, if any */
 		struct ifnet	*rcvif;		/* rcv interface */
 	};
 	SLIST_HEAD(packet_tags, m_tag) tags; /* list of packet tags */
 	int32_t		 len;		/* total packet length */
 
 	/* Layer crossing persistent information. */
 	uint32_t	 flowid;	/* packet's 4-tuple system */
 	uint32_t	 csum_flags;	/* checksum and offload features */
 	uint16_t	 fibnum;	/* this packet should use this fib */
 	uint8_t		 cosqos;	/* class/quality of service */
 	uint8_t		 rsstype;	/* hash type */
 	union {
 		uint64_t	rcv_tstmp;	/* timestamp in ns */
 		struct {
 			uint8_t		 l2hlen;	/* layer 2 hdr len */
 			uint8_t		 l3hlen;	/* layer 3 hdr len */
 			uint8_t		 l4hlen;	/* layer 4 hdr len */
 			uint8_t		 l5hlen;	/* layer 5 hdr len */
 			uint32_t	 spare;
 		};
 	};
 	union {
 		uint8_t  eight[8];
 		uint16_t sixteen[4];
 		uint32_t thirtytwo[2];
 		uint64_t sixtyfour[1];
 		uintptr_t unintptr[1];
 		void	*ptr;
 	} PH_per;
 
 	/* Layer specific non-persistent local storage for reassembly, etc. */
 	union {
 		uint8_t  eight[8];
 		uint16_t sixteen[4];
 		uint32_t thirtytwo[2];
 		uint64_t sixtyfour[1];
 		uintptr_t unintptr[1];
 		void 	*ptr;
 	} PH_loc;
 };
 /*
  * Named slots in PH_per.  csum_phsum (sixteen[2], bytes 4-5) lies inside
  * csum_data (thirtytwo[1], bytes 4-7), and lro_nsegs is the same slot as
  * tso_segsz, so those pairs cannot hold independent values at once.
  * vt_nrecs is a member path relative to PH_vt (itself an alias of PH_per).
  */
 #define	ether_vtag	PH_per.sixteen[0]
 #define	PH_vt		PH_per
 #define	vt_nrecs	sixteen[0]
 #define	tso_segsz	PH_per.sixteen[1]
 #define	lro_nsegs	tso_segsz
 #define	csum_phsum	PH_per.sixteen[2]
 #define	csum_data	PH_per.thirtytwo[1]
 /*
  * Named slots in PH_loc: three 16-bit values in bytes 0-5 followed by two
  * single bytes at offsets 6 and 7; none of them overlap.
  */
 #define pace_thoff	PH_loc.sixteen[0]
 #define pace_tlen	PH_loc.sixteen[1]
 #define pace_drphdrlen	PH_loc.sixteen[2]
 #define pace_tos	PH_loc.eight[6]
 #define pace_lock	PH_loc.eight[7]
 
 /*
  * Description of external storage mapped into mbuf; valid only if M_EXT is
  * set.
  * Size ILP32: 28
  *	 LP64: 48
  * Compile-time assertions in uipc_mbuf.c test these values to ensure that
  * they are correct.
  *
  * ext_type (8 bits) and ext_flags (24 bits) are bit-fields packed into a
  * single uint32_t.
  */
 typedef	void m_ext_free_t(struct mbuf *);
 struct m_ext {
 	union {
 		/*
 		 * If EXT_FLAG_EMBREF is set, then we use refcount in the
 		 * mbuf, the 'ext_count' member.  Otherwise, we have a
 		 * shadow copy and we use pointer 'ext_cnt'.  The original
 		 * mbuf is responsible to carry the pointer to free routine
 		 * and its arguments.  They aren't copied into shadows in
 		 * mb_dupcl() to avoid dereferencing next cachelines.
 		 */
 		volatile u_int	 ext_count;
 		volatile u_int	*ext_cnt;
 	};
 	char		*ext_buf;	/* start of buffer */
 	uint32_t	 ext_size;	/* size of buffer, for ext_free */
 	uint32_t	 ext_type:8,	/* type of external storage */
 			 ext_flags:24;	/* external storage mbuf flags */
 	/*
 	 * Fields below store the free context for the external storage.
 	 * They are valid only in the refcount carrying mbuf, the one with
 	 * EXT_FLAG_EMBREF flag, with exclusion for EXT_EXTREF type, where
 	 * the free context is copied into all mbufs that use same external
 	 * storage.
 	 *
 	 * m_ext_copylen is the offset of ext_free, i.e. the size of the
 	 * members that precede the free context.
 	 * NOTE(review): presumably the byte count copied when creating a
 	 * shadow; confirm in mb_dupcl().
 	 */
 #define	m_ext_copylen	offsetof(struct m_ext, ext_free)
 	m_ext_free_t	*ext_free;	/* free routine if not the usual */
 	void		*ext_arg1;	/* optional argument pointer */
 	void		*ext_arg2;	/* optional argument pointer */
 };
 
 /*
  * The core of the mbuf object along with some shortcut defines for practical
  * purposes.
  */
 struct mbuf {
 	/*
 	 * Header present at the beginning of every mbuf.
 	 * Size ILP32: 24
 	 *      LP64: 32
 	 * Compile-time assertions in uipc_mbuf.c test these values to ensure
 	 * that they are correct.
 	 */
 	union {	/* next buffer in chain */
 		struct mbuf		*m_next;
 		SLIST_ENTRY(mbuf)	m_slist;
 		STAILQ_ENTRY(mbuf)	m_stailq;
 	};
 	union {	/* next chain in queue/record */
 		struct mbuf		*m_nextpkt;
 		SLIST_ENTRY(mbuf)	m_slistpkt;
 		STAILQ_ENTRY(mbuf)	m_stailqpkt;
 	};
 	caddr_t		 m_data;	/* location of data */
 	int32_t		 m_len;		/* amount of data in this mbuf */
 	/* m_type and m_flags are bit-fields packed into one 32-bit word. */
 	uint32_t	 m_type:8,	/* type of data in this mbuf */
 			 m_flags:24;	/* flags; see below */
 #if !defined(__LP64__)
 	uint32_t	 m_pad;		/* pad for 64bit alignment */
 #endif
 
 	/*
 	 * A set of optional headers (packet header, external storage header)
 	 * and internal data storage.  Historically, these arrays were sized
 	 * to MHLEN (space left after a packet header) and MLEN (space left
 	 * after only a regular mbuf header); they are now variable size in
 	 * order to support future work on variable-size mbufs.
 	 *
 	 * The members below overlay each other: m_dat starts where m_pkthdr
 	 * starts, and m_pktdat starts where m_ext starts.  The zero-length
 	 * arrays only mark the start of the corresponding data area.
 	 */
 	union {
 		struct {
 			struct pkthdr	m_pkthdr;	/* M_PKTHDR set */
 			union {
 				struct m_ext	m_ext;	/* M_EXT set */
 				char		m_pktdat[0];
 			};
 		};
 		char	m_dat[0];			/* !M_PKTHDR, !M_EXT */
 	};
 };
 
 /*
  * mbuf flags of global significance and layer crossing.
  * Those of only protocol/layer specific significance are to be mapped
  * to M_PROTO[1-12] and cleared at layer handoff boundaries.
  * NB: Limited to the lower 24 bits.
  */
 /*
  * NOTE(review): M_TSTMP_LRO and M_PROTO1 below are both defined as
  * 0x00001000, so the two flags cannot be told apart in m_flags.  The
  * twelve global flags plus M_PROTO1..M_PROTO12 already use all 24 bits of
  * m_flags, so resolving this means renumbering or retiring a flag --
  * confirm the intended layout.
  */
 #define	M_EXT		0x00000001 /* has associated external storage */
 #define	M_PKTHDR	0x00000002 /* start of record */
 #define	M_EOR		0x00000004 /* end of record */
 #define	M_RDONLY	0x00000008 /* associated data is marked read-only */
 #define	M_BCAST		0x00000010 /* send/received as link-level broadcast */
 #define	M_MCAST		0x00000020 /* send/received as link-level multicast */
 #define	M_PROMISC	0x00000040 /* packet was not for us */
 #define	M_VLANTAG	0x00000080 /* ether_vtag is valid */
 #define	M_NOMAP		0x00000100 /* mbuf data is unmapped (soon from Drew) */
 #define	M_NOFREE	0x00000200 /* do not free mbuf, embedded in cluster */
 #define	M_TSTMP		0x00000400 /* rcv_tstmp field is valid */
 #define	M_TSTMP_HPREC	0x00000800 /* rcv_tstmp is high-prec, typically
 				      hw-stamped on port (useful for IEEE 1588
 				      and 802.1AS) */
+#define M_TSTMP_LRO	0x00001000 /* Time LRO pushed in pkt is valid in (PH_loc) */
 
 #define	M_PROTO1	0x00001000 /* protocol-specific */
 #define	M_PROTO2	0x00002000 /* protocol-specific */
 #define	M_PROTO3	0x00004000 /* protocol-specific */
 #define	M_PROTO4	0x00008000 /* protocol-specific */
 #define	M_PROTO5	0x00010000 /* protocol-specific */
 #define	M_PROTO6	0x00020000 /* protocol-specific */
 #define	M_PROTO7	0x00040000 /* protocol-specific */
 #define	M_PROTO8	0x00080000 /* protocol-specific */
 #define	M_PROTO9	0x00100000 /* protocol-specific */
 #define	M_PROTO10	0x00200000 /* protocol-specific */
 #define	M_PROTO11	0x00400000 /* protocol-specific */
 #define	M_PROTO12	0x00800000 /* protocol-specific */
 
 #define MB_DTOR_SKIP	0x1	/* don't pollute the cache by touching a freed mbuf */
 
 /*
  * Flags to purge when crossing layers.
  * This is the union of all twelve M_PROTO* bits.
  */
 #define	M_PROTOFLAGS \
     (M_PROTO1|M_PROTO2|M_PROTO3|M_PROTO4|M_PROTO5|M_PROTO6|M_PROTO7|M_PROTO8|\
      M_PROTO9|M_PROTO10|M_PROTO11|M_PROTO12)
 
 /*
  * Flags preserved when copying m_pkthdr.
  * M_EXT, M_NOMAP and M_NOFREE are not part of this mask.
  */
 #define M_COPYFLAGS \
     (M_PKTHDR|M_EOR|M_RDONLY|M_BCAST|M_MCAST|M_PROMISC|M_VLANTAG|M_TSTMP| \
      M_TSTMP_HPREC|M_PROTOFLAGS)
 
 /*
  * Mbuf flag description for use with printf(9) %b identifier.
  *
  * The leading \20 selects hexadecimal output of the raw value.  Each
  * following entry is a 1-based bit number, written as an octal escape,
  * followed by the name printed when that bit is set.  M_FLAG_BITS names the
  * flags of global significance (including M_NOMAP, bit 9, and M_NOFREE,
  * bit 10); M_FLAG_PROTOBITS names M_PROTO1..M_PROTO12; M_FLAG_PRINTF is the
  * concatenation of both.
  *
  * M_TSTMP_LRO has no entry of its own: its value coincides with M_PROTO1,
  * which is already named as bit \15 in M_FLAG_PROTOBITS.
  */
 #define	M_FLAG_BITS \
     "\20\1M_EXT\2M_PKTHDR\3M_EOR\4M_RDONLY\5M_BCAST\6M_MCAST" \
     "\7M_PROMISC\10M_VLANTAG\11M_NOMAP\12M_NOFREE\13M_TSTMP" \
     "\14M_TSTMP_HPREC"
 #define	M_FLAG_PROTOBITS \
     "\15M_PROTO1\16M_PROTO2\17M_PROTO3\20M_PROTO4\21M_PROTO5" \
     "\22M_PROTO6\23M_PROTO7\24M_PROTO8\25M_PROTO9\26M_PROTO10" \
     "\27M_PROTO11\30M_PROTO12"
 #define	M_FLAG_PRINTF (M_FLAG_BITS M_FLAG_PROTOBITS)
 
 /*
  * Network interface cards are able to hash protocol fields (such as IPv4
  * addresses and TCP port numbers) to classify packets into flows.  These flows
  * can then be used to maintain ordering while delivering packets to the OS
  * via parallel input queues, as well as to provide a stateless affinity
  * model.  NIC drivers can pass up the hash via m->m_pkthdr.flowid, and set
  * m_flag fields to indicate how the hash should be interpreted by the
  * network stack.
  *
  * Most NICs support RSS, which provides ordering and explicit affinity, and
  * use the hash m_flag bits to indicate what header fields were covered by
  * the hash.  M_HASHTYPE_OPAQUE and M_HASHTYPE_OPAQUE_HASH can be set by non-
  * RSS cards or configurations that provide an opaque flow identifier, allowing
  * for ordering and distribution without explicit affinity.  Additionally,
  * M_HASHTYPE_OPAQUE_HASH indicates that the flow identifier has hash
  * properties.
  *
  * The meaning of the IPV6_EX suffix:
  * "o  Home address from the home address option in the IPv6 destination
  *     options header.  If the extension header is not present, use the Source
  *     IPv6 Address.
  *  o  IPv6 address that is contained in the Routing-Header-Type-2 from the
  *     associated extension header.  If the extension header is not present,
  *     use the Destination IPv6 Address."
  * Quoted from:
  * https://docs.microsoft.com/en-us/windows-hardware/drivers/network/rss-hashing-types#ndishashipv6ex
  */
 #define	M_HASHTYPE_HASHPROP		0x80	/* has hash properties */
 #define	M_HASHTYPE_HASH(t)		(M_HASHTYPE_HASHPROP | (t))
 /* Microsoft RSS standard hash types */
 #define	M_HASHTYPE_NONE			0
 #define	M_HASHTYPE_RSS_IPV4		M_HASHTYPE_HASH(1) /* IPv4 2-tuple */
 #define	M_HASHTYPE_RSS_TCP_IPV4		M_HASHTYPE_HASH(2) /* TCPv4 4-tuple */
 #define	M_HASHTYPE_RSS_IPV6		M_HASHTYPE_HASH(3) /* IPv6 2-tuple */
 #define	M_HASHTYPE_RSS_TCP_IPV6		M_HASHTYPE_HASH(4) /* TCPv6 4-tuple */
 #define	M_HASHTYPE_RSS_IPV6_EX		M_HASHTYPE_HASH(5) /* IPv6 2-tuple +
 							    * ext hdrs */
 #define	M_HASHTYPE_RSS_TCP_IPV6_EX	M_HASHTYPE_HASH(6) /* TCPv6 4-tuple +
 							    * ext hdrs */
 #define	M_HASHTYPE_RSS_UDP_IPV4		M_HASHTYPE_HASH(7) /* IPv4 UDP 4-tuple*/
 #define	M_HASHTYPE_RSS_UDP_IPV6		M_HASHTYPE_HASH(9) /* IPv6 UDP 4-tuple*/
 #define	M_HASHTYPE_RSS_UDP_IPV6_EX	M_HASHTYPE_HASH(10)/* IPv6 UDP 4-tuple +
 							    * ext hdrs */
 
 #define	M_HASHTYPE_OPAQUE		63	/* ordering, not affinity */
 #define	M_HASHTYPE_OPAQUE_HASH		M_HASHTYPE_HASH(M_HASHTYPE_OPAQUE)
 						/* ordering+hash, not affinity*/
 
 /*
  * Accessors for the hash type kept in m_pkthdr.rsstype of mbuf m:
  * clear it to 0, read it, store v, compare it against v for equality, and
  * test whether the M_HASHTYPE_HASHPROP bit is set.
  */
 #define	M_HASHTYPE_CLEAR(m)	((m)->m_pkthdr.rsstype = 0)
 #define	M_HASHTYPE_GET(m)	((m)->m_pkthdr.rsstype)
 #define	M_HASHTYPE_SET(m, v)	((m)->m_pkthdr.rsstype = (v))
 #define	M_HASHTYPE_TEST(m, v)	(M_HASHTYPE_GET(m) == (v))
 #define	M_HASHTYPE_ISHASH(m)	(M_HASHTYPE_GET(m) & M_HASHTYPE_HASHPROP)
 
 /*
  * COS/QOS class and quality of service tags.
  * It uses DSCP code points as base.
  */
 #define	QOS_DSCP_CS0		0x00
 #define	QOS_DSCP_DEF		QOS_DSCP_CS0
 #define	QOS_DSCP_CS1		0x20
 #define	QOS_DSCP_AF11		0x28
 #define	QOS_DSCP_AF12		0x30
 #define	QOS_DSCP_AF13		0x38
 #define	QOS_DSCP_CS2		0x40
 #define	QOS_DSCP_AF21		0x48
 #define	QOS_DSCP_AF22		0x50
 #define	QOS_DSCP_AF23		0x58
 #define	QOS_DSCP_CS3		0x60
 #define	QOS_DSCP_AF31		0x68
 #define	QOS_DSCP_AF32		0x70
 #define	QOS_DSCP_AF33		0x78
 #define	QOS_DSCP_CS4		0x80
 #define	QOS_DSCP_AF41		0x88
 #define	QOS_DSCP_AF42		0x90
 #define	QOS_DSCP_AF43		0x98
 #define	QOS_DSCP_CS5		0xa0
 #define	QOS_DSCP_EF		0xb8
 #define	QOS_DSCP_CS6		0xc0
 #define	QOS_DSCP_CS7		0xe0
 
 /*
  * External mbuf storage buffer types.
  */
 #define	EXT_CLUSTER	1	/* mbuf cluster */
 #define	EXT_SFBUF	2	/* sendfile(2)'s sf_buf */
 #define	EXT_JUMBOP	3	/* jumbo cluster page sized */
 #define	EXT_JUMBO9	4	/* jumbo cluster 9216 bytes */
 #define	EXT_JUMBO16	5	/* jumbo cluster 16184 bytes */
 #define	EXT_PACKET	6	/* mbuf+cluster from packet zone */
 #define	EXT_MBUF	7	/* external mbuf reference */
 
 #define	EXT_VENDOR1	224	/* for vendor-internal use */
 #define	EXT_VENDOR2	225	/* for vendor-internal use */
 #define	EXT_VENDOR3	226	/* for vendor-internal use */
 #define	EXT_VENDOR4	227	/* for vendor-internal use */
 
 #define	EXT_EXP1	244	/* for experimental use */
 #define	EXT_EXP2	245	/* for experimental use */
 #define	EXT_EXP3	246	/* for experimental use */
 #define	EXT_EXP4	247	/* for experimental use */
 
 #define	EXT_NET_DRV	252	/* custom ext_buf provided by net driver(s) */
 #define	EXT_MOD_TYPE	253	/* custom module's ext_buf type */
 #define	EXT_DISPOSABLE	254	/* can throw this buffer away w/page flipping */
 #define	EXT_EXTREF	255	/* has externally maintained ext_cnt ptr */
 
 /*
  * Flags for external mbuf buffer types.
  * NB: limited to the lower 24 bits.
  */
 #define	EXT_FLAG_EMBREF		0x000001	/* embedded ext_count */
 #define	EXT_FLAG_EXTREF		0x000002	/* external ext_cnt, notyet */
 
 #define	EXT_FLAG_NOFREE		0x000010	/* don't free mbuf to pool, notyet */
 
 #define	EXT_FLAG_VENDOR1	0x010000	/* These flags are vendor */
 #define	EXT_FLAG_VENDOR2	0x020000	/* or submodule specific, */
 #define	EXT_FLAG_VENDOR3	0x040000	/* not used by mbuf code. */
 #define	EXT_FLAG_VENDOR4	0x080000	/* Set/read by submodule. */
 
 #define	EXT_FLAG_EXP1		0x100000	/* for experimental use */
 #define	EXT_FLAG_EXP2		0x200000	/* for experimental use */
 #define	EXT_FLAG_EXP3		0x400000	/* for experimental use */
 #define	EXT_FLAG_EXP4		0x800000	/* for experimental use */
 
 /*
  * EXT flag description for use with printf(9) %b identifier.
  */
 #define	EXT_FLAG_BITS \
     "\20\1EXT_FLAG_EMBREF\2EXT_FLAG_EXTREF\5EXT_FLAG_NOFREE" \
     "\21EXT_FLAG_VENDOR1\22EXT_FLAG_VENDOR2\23EXT_FLAG_VENDOR3" \
     "\24EXT_FLAG_VENDOR4\25EXT_FLAG_EXP1\26EXT_FLAG_EXP2\27EXT_FLAG_EXP3" \
     "\30EXT_FLAG_EXP4"
 
 /*
  * Flags indicating checksum, segmentation and other offload work to be
  * done, or already done, by hardware or lower layers.  It is split into
  * separate inbound and outbound flags.
  *
  * Outbound flags that are set by upper protocol layers requesting lower
  * layers, or ideally the hardware, to perform these offloading tasks.
  * For outbound packets this field and its flags can be directly tested
  * against ifnet if_hwassist.
  */
 #define	CSUM_IP			0x00000001	/* IP header checksum offload */
 #define	CSUM_IP_UDP		0x00000002	/* UDP checksum offload */
 #define	CSUM_IP_TCP		0x00000004	/* TCP checksum offload */
 #define	CSUM_IP_SCTP		0x00000008	/* SCTP checksum offload */
 #define	CSUM_IP_TSO		0x00000010	/* TCP segmentation offload */
 #define	CSUM_IP_ISCSI		0x00000020	/* iSCSI checksum offload */
 
 #define	CSUM_IP6_UDP		0x00000200	/* UDP checksum offload */
 #define	CSUM_IP6_TCP		0x00000400	/* TCP checksum offload */
 #define	CSUM_IP6_SCTP		0x00000800	/* SCTP checksum offload */
 #define	CSUM_IP6_TSO		0x00001000	/* TCP segmentation offload */
 #define	CSUM_IP6_ISCSI		0x00002000	/* iSCSI checksum offload */
 
 /* Inbound checksum support where the checksum was verified by hardware. */
 #define	CSUM_L3_CALC		0x01000000	/* calculated layer 3 csum */
 #define	CSUM_L3_VALID		0x02000000	/* checksum is correct */
 #define	CSUM_L4_CALC		0x04000000	/* calculated layer 4 csum */
 #define	CSUM_L4_VALID		0x08000000	/* checksum is correct */
 #define	CSUM_L5_CALC		0x10000000	/* calculated layer 5 csum */
 #define	CSUM_L5_VALID		0x20000000	/* checksum is correct */
 #define	CSUM_COALESCED		0x40000000	/* contains merged segments */
 
 #define	CSUM_SND_TAG		0x80000000	/* Packet header has send tag */
 
 /*
  * CSUM flag description for use with printf(9) %b identifier.
  */
 #define	CSUM_BITS \
     "\20\1CSUM_IP\2CSUM_IP_UDP\3CSUM_IP_TCP\4CSUM_IP_SCTP\5CSUM_IP_TSO" \
     "\6CSUM_IP_ISCSI" \
     "\12CSUM_IP6_UDP\13CSUM_IP6_TCP\14CSUM_IP6_SCTP\15CSUM_IP6_TSO" \
     "\16CSUM_IP6_ISCSI" \
     "\31CSUM_L3_CALC\32CSUM_L3_VALID\33CSUM_L4_CALC\34CSUM_L4_VALID" \
     "\35CSUM_L5_CALC\36CSUM_L5_VALID\37CSUM_COALESCED\40CSUM_SND_TAG"
 
 /* CSUM flags compatibility mappings. */
 #define	CSUM_IP_CHECKED		CSUM_L3_CALC
 #define	CSUM_IP_VALID		CSUM_L3_VALID
 #define	CSUM_DATA_VALID		CSUM_L4_VALID
 #define	CSUM_PSEUDO_HDR		CSUM_L4_CALC
 #define	CSUM_SCTP_VALID		CSUM_L4_VALID
 #define	CSUM_DELAY_DATA		(CSUM_TCP|CSUM_UDP)
 #define	CSUM_DELAY_IP		CSUM_IP		/* Only v4, no v6 IP hdr csum */
 #define	CSUM_DELAY_DATA_IPV6	(CSUM_TCP_IPV6|CSUM_UDP_IPV6)
 #define	CSUM_DATA_VALID_IPV6	CSUM_DATA_VALID
 #define	CSUM_TCP		CSUM_IP_TCP
 #define	CSUM_UDP		CSUM_IP_UDP
 #define	CSUM_SCTP		CSUM_IP_SCTP
 #define	CSUM_TSO		(CSUM_IP_TSO|CSUM_IP6_TSO)
 #define	CSUM_UDP_IPV6		CSUM_IP6_UDP
 #define	CSUM_TCP_IPV6		CSUM_IP6_TCP
 #define	CSUM_SCTP_IPV6		CSUM_IP6_SCTP
 
 /*
  * mbuf types describing the content of the mbuf (including external storage).
  */
 #define	MT_NOTMBUF	0	/* USED INTERNALLY ONLY! Object is not mbuf */
 #define	MT_DATA		1	/* dynamic (data) allocation */
 #define	MT_HEADER	MT_DATA	/* packet header, use M_PKTHDR instead */
 
 #define	MT_VENDOR1	4	/* for vendor-internal use */
 #define	MT_VENDOR2	5	/* for vendor-internal use */
 #define	MT_VENDOR3	6	/* for vendor-internal use */
 #define	MT_VENDOR4	7	/* for vendor-internal use */
 
 #define	MT_SONAME	8	/* socket name */
 
 #define	MT_EXP1		9	/* for experimental use */
 #define	MT_EXP2		10	/* for experimental use */
 #define	MT_EXP3		11	/* for experimental use */
 #define	MT_EXP4		12	/* for experimental use */
 
 #define	MT_CONTROL	14	/* extra-data protocol message */
 #define	MT_EXTCONTROL	15	/* control message with externalized contents */
 #define	MT_OOBDATA	16	/* expedited data  */
 
 #define	MT_NOINIT	255	/* Not a type but a flag to allocate
 				   a non-initialized mbuf */
 
 /*
  * String names of mbuf-related UMA(9) and malloc(9) types.  Exposed to
  * !_KERNEL so that monitoring tools can look up the zones with
  * libmemstat(3).
  */
 #define	MBUF_MEM_NAME		"mbuf"
 #define	MBUF_CLUSTER_MEM_NAME	"mbuf_cluster"
 #define	MBUF_PACKET_MEM_NAME	"mbuf_packet"
 #define	MBUF_JUMBOP_MEM_NAME	"mbuf_jumbo_page"
 #define	MBUF_JUMBO9_MEM_NAME	"mbuf_jumbo_9k"
 #define	MBUF_JUMBO16_MEM_NAME	"mbuf_jumbo_16k"
 #define	MBUF_TAG_MEM_NAME	"mbuf_tag"
 #define	MBUF_EXTREFCNT_MEM_NAME	"mbuf_ext_refcnt"
 
 #ifdef _KERNEL
 
 /*
  * MBUF_CHECKSLEEP(how): when WITNESS is compiled in and `how' equals
  * M_WAITOK, call WITNESS_WARN() with WARN_GIANTOK | WARN_SLEEPOK and the
  * name of the calling function.  Without WITNESS it expands to nothing.
  * The argument is parenthesized so that any expression may be passed.
  */
 #ifdef WITNESS
 #define	MBUF_CHECKSLEEP(how) do {					\
 	if ((how) == M_WAITOK)						\
 		WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,		\
 		    "Sleeping in \"%s\"", __func__);			\
 } while (0)
 #else
 #define	MBUF_CHECKSLEEP(how)
 #endif
 
 /*
  * Network buffer allocation API
  *
  * The rest of it is defined in kern/kern_mbuf.c
  */
 extern uma_zone_t	zone_mbuf;
 extern uma_zone_t	zone_clust;
 extern uma_zone_t	zone_pack;
 extern uma_zone_t	zone_jumbop;
 extern uma_zone_t	zone_jumbo9;
 extern uma_zone_t	zone_jumbo16;
 
 void		 mb_dupcl(struct mbuf *, struct mbuf *);
 void		 mb_free_ext(struct mbuf *);
 void		 m_adj(struct mbuf *, int);
 int		 m_apply(struct mbuf *, int, int,
 		    int (*)(void *, void *, u_int), void *);
 int		 m_append(struct mbuf *, int, c_caddr_t);
 void		 m_cat(struct mbuf *, struct mbuf *);
 void		 m_catpkt(struct mbuf *, struct mbuf *);
 int		 m_clget(struct mbuf *m, int how);
 void 		*m_cljget(struct mbuf *m, int how, int size);
 struct mbuf	*m_collapse(struct mbuf *, int, int);
 void		 m_copyback(struct mbuf *, int, int, c_caddr_t);
 void		 m_copydata(const struct mbuf *, int, int, caddr_t);
 struct mbuf	*m_copym(struct mbuf *, int, int, int);
 struct mbuf	*m_copypacket(struct mbuf *, int);
 void		 m_copy_pkthdr(struct mbuf *, struct mbuf *);
 struct mbuf	*m_copyup(struct mbuf *, int, int);
 struct mbuf	*m_defrag(struct mbuf *, int);
 void		 m_demote_pkthdr(struct mbuf *);
 void		 m_demote(struct mbuf *, int, int);
 struct mbuf	*m_devget(char *, int, int, struct ifnet *,
 		    void (*)(char *, caddr_t, u_int));
 void		 m_dispose_extcontrolm(struct mbuf *m);
 struct mbuf	*m_dup(const struct mbuf *, int);
 int		 m_dup_pkthdr(struct mbuf *, const struct mbuf *, int);
 void		 m_extadd(struct mbuf *, char *, u_int, m_ext_free_t,
 		    void *, void *, int, int);
 u_int		 m_fixhdr(struct mbuf *);
 struct mbuf	*m_fragment(struct mbuf *, int, int);
 void		 m_freem(struct mbuf *);
 struct mbuf	*m_get2(int, int, short, int);
 struct mbuf	*m_getjcl(int, short, int, int);
 struct mbuf	*m_getm2(struct mbuf *, int, int, short, int);
 struct mbuf	*m_getptr(struct mbuf *, int, int *);
 u_int		 m_length(struct mbuf *, struct mbuf **);
 int		 m_mbuftouio(struct uio *, const struct mbuf *, int);
 void		 m_move_pkthdr(struct mbuf *, struct mbuf *);
 int		 m_pkthdr_init(struct mbuf *, int);
 struct mbuf	*m_prepend(struct mbuf *, int, int);
 void		 m_print(const struct mbuf *, int);
 struct mbuf	*m_pulldown(struct mbuf *, int, int, int *);
 struct mbuf	*m_pullup(struct mbuf *, int);
 int		 m_sanity(struct mbuf *, int);
 struct mbuf	*m_split(struct mbuf *, int, int);
 struct mbuf	*m_uiotombuf(struct uio *, int, int, int, int);
 struct mbuf	*m_unshare(struct mbuf *, int);
 
 /*
  * Map a buffer size in bytes to the matching EXT_* storage type.
  * Any size other than the ones listed below panics.
  */
 static __inline int
 m_gettype(int size)
 {
 
 	switch (size) {
 	case MSIZE:
 		return (EXT_MBUF);
 	case MCLBYTES:
 		return (EXT_CLUSTER);
 #if MJUMPAGESIZE != MCLBYTES
 	case MJUMPAGESIZE:
 		return (EXT_JUMBOP);
 #endif
 	case MJUM9BYTES:
 		return (EXT_JUMBO9);
 	case MJUM16BYTES:
 		return (EXT_JUMBO16);
 	default:
 		panic("%s: invalid cluster size %d", __func__, size);
 	}
 }
 
 /*
  * Associate an externally reference-counted buffer with an mbuf.
  *
  * The counter behind ref_cnt is incremented by one, the mbuf is marked
  * M_EXT with storage type EXT_EXTREF and no ext flags, m_data is pointed at
  * the start of buf, and the free routine plus its two arguments are
  * recorded in the mbuf.  ref_cnt must not be NULL.
  */
 static __inline void
 m_extaddref(struct mbuf *m, char *buf, u_int size, u_int *ref_cnt,
     m_ext_free_t freef, void *arg1, void *arg2)
 {
 
 	KASSERT(ref_cnt != NULL, ("%s: ref_cnt not provided", __func__));
 
 	/* Take our reference before publishing the buffer in the mbuf. */
 	atomic_add_int(ref_cnt, 1);
 
 	/* Describe the external storage. */
 	m->m_ext.ext_type = EXT_EXTREF;
 	m->m_ext.ext_flags = 0;
 	m->m_ext.ext_cnt = ref_cnt;
 	m->m_ext.ext_buf = buf;
 	m->m_ext.ext_size = size;
 
 	/* Record the free context. */
 	m->m_ext.ext_free = freef;
 	m->m_ext.ext_arg1 = arg1;
 	m->m_ext.ext_arg2 = arg2;
 
 	/* Finally switch the mbuf over to the external buffer. */
 	m->m_data = buf;
 	m->m_flags |= M_EXT;
 }
 
 /*
  * Map a cluster size in bytes to the UMA zone that serves it.
  * Any size other than the ones listed below panics.
  */
 static __inline uma_zone_t
 m_getzone(int size)
 {
 
 	switch (size) {
 	case MCLBYTES:
 		return (zone_clust);
 #if MJUMPAGESIZE != MCLBYTES
 	case MJUMPAGESIZE:
 		return (zone_jumbop);
 #endif
 	case MJUM9BYTES:
 		return (zone_jumbo9);
 	case MJUM16BYTES:
 		return (zone_jumbo16);
 	default:
 		panic("%s: invalid cluster size %d", __func__, size);
 	}
 }
 
 /*
  * Initialize an mbuf with linear storage.
  *
  * Resets the chain links, points m_data at the embedded m_dat area, sets
  * the length to zero and stores the given type and flags.  When flags
  * contains M_PKTHDR the packet header is set up via m_pkthdr_init() and its
  * result is returned; otherwise 0 is returned.
  *
  * Kept inline: the consumer text overhead is roughly the same whether the
  * fields are initialized in place or a function with this many parameters
  * is called, and the M_PKTHDR branch should go away with constant
  * propagation for !MGETHDR.
  */
 static __inline int
 m_init(struct mbuf *m, int how, short type, int flags)
 {
 	int error = 0;
 
 	m->m_type = type;
 	m->m_flags = flags;
 	m->m_len = 0;
 	m->m_data = m->m_dat;
 	m->m_nextpkt = NULL;
 	m->m_next = NULL;
 
 	if ((flags & M_PKTHDR) != 0)
 		error = m_pkthdr_init(m, how);
 
 	MBUF_PROBE5(m__init, m, how, type, flags, error);
 	return (error);
 }
 
 static __inline struct mbuf *
 m_get(int how, short type)
 {
 	struct mbuf *m;
 	struct mb_args args;
 
 	args.flags = 0;
 	args.type = type;
 	m = uma_zalloc_arg(zone_mbuf, &args, how);
 	MBUF_PROBE3(m__get, how, type, m);
 	return (m);
 }
 
 static __inline struct mbuf *
 m_gethdr(int how, short type)
 {
 	struct mbuf *m;
 	struct mb_args args;
 
 	args.flags = M_PKTHDR;
 	args.type = type;
 	m = uma_zalloc_arg(zone_mbuf, &args, how);
 	MBUF_PROBE3(m__gethdr, how, type, m);
 	return (m);
 }
 
 static __inline struct mbuf *
 m_getcl(int how, short type, int flags)
 {
 	struct mbuf *m;
 	struct mb_args args;
 
 	args.flags = flags;
 	args.type = type;
 	m = uma_zalloc_arg(zone_pack, &args, how);
 	MBUF_PROBE4(m__getcl, how, type, flags, m);
 	return (m);
 }
 
 /*
  * Attach cluster `cl' of storage type `type' to mbuf `m'.
  *
  * The buffer size is derived from the type; an unknown type panics.  The
  * mbuf is set up with an embedded reference count of 1
  * (EXT_FLAG_EMBREF), no custom free routine or arguments, m_data pointing
  * at the start of the cluster, and M_EXT set.
  *
  * XXX: m_cljset() is a dangerous API.  One must attach only a new,
  * unreferenced cluster to an mbuf(9).  It is not possible to assert
  * that, so care can be taken only by users of the API.
  */
 static __inline void
 m_cljset(struct mbuf *m, void *cl, int type)
 {
 	int size;
 
 	switch (type) {
 	case EXT_CLUSTER:
 		size = MCLBYTES;
 		break;
 #if MJUMPAGESIZE != MCLBYTES
 	case EXT_JUMBOP:
 		size = MJUMPAGESIZE;
 		break;
 #endif
 	case EXT_JUMBO9:
 		size = MJUM9BYTES;
 		break;
 	case EXT_JUMBO16:
 		size = MJUM16BYTES;
 		break;
 	default:
 		panic("%s: unknown cluster type %d", __func__, type);
 	}
 
 	m->m_ext.ext_buf = cl;
 	m->m_data = m->m_ext.ext_buf;
 	/*
 	 * Clear the free context one member at a time: ext_free is a
 	 * function pointer, so it must be assigned from the null pointer
 	 * constant directly and not from the void * value of a chained
 	 * assignment.
 	 */
 	m->m_ext.ext_free = NULL;
 	m->m_ext.ext_arg1 = NULL;
 	m->m_ext.ext_arg2 = NULL;
 	m->m_ext.ext_size = size;
 	m->m_ext.ext_type = type;
 	m->m_ext.ext_flags = EXT_FLAG_EMBREF;
 	m->m_ext.ext_count = 1;
 	m->m_flags |= M_EXT;
 	MBUF_PROBE3(m__cljset, m, cl, type);
 }
 
 /* Store new_type in the m_type field of mbuf m. */
 static __inline void
 m_chtype(struct mbuf *m, short new_type)
 {
 	m->m_type = new_type;
 }
 
 /*
  * Clear every M_PROTOFLAGS bit in each mbuf of the chain starting at m,
  * following m_next.  A NULL chain is accepted and does nothing.
  */
 static __inline void
 m_clrprotoflags(struct mbuf *m)
 {
 	struct mbuf *cur;
 
 	for (cur = m; cur != NULL; cur = cur->m_next)
 		cur->m_flags &= ~M_PROTOFLAGS;
 }
 
 /*
  * Return the last mbuf of the chain starting at m, following m_next.
  * m is dereferenced unconditionally, so it must not be NULL.
  */
 static __inline struct mbuf *
 m_last(struct mbuf *m)
 {
 	struct mbuf *tail;
 
 	for (tail = m; tail->m_next != NULL; tail = tail->m_next)
 		continue;
 	return (tail);
 }
 
 /*
  * Return the current reference count of the external storage of m.
  * With EXT_FLAG_EMBREF the count embedded in this mbuf is read; otherwise
  * the count is read through the ext_cnt pointer.  M_EXT must be set.
  */
 static inline u_int
 m_extrefcnt(struct mbuf *m)
 {
 
 	KASSERT(m->m_flags & M_EXT, ("%s: M_EXT missing", __func__));
 
 	if ((m->m_ext.ext_flags & EXT_FLAG_EMBREF) != 0)
 		return (m->m_ext.ext_count);
 	return (*m->m_ext.ext_cnt);
 }
 
 /*
  * mbuf, cluster, and external object allocation macros (for compatibility
  * purposes).
  *
  * Each macro forwards to the function of the corresponding lower-case name.
  * MGET() and MGETHDR() additionally assign the result to their first
  * argument, MEXTADD() casts buf to char *, and m_getm() calls m_getm2()
  * with M_PKTHDR as the final argument.
  */
 #define	M_MOVE_PKTHDR(to, from)	m_move_pkthdr((to), (from))
 #define	MGET(m, how, type)	((m) = m_get((how), (type)))
 #define	MGETHDR(m, how, type)	((m) = m_gethdr((how), (type)))
 #define	MCLGET(m, how)		m_clget((m), (how))
 #define	MEXTADD(m, buf, size, free, arg1, arg2, flags, type)		\
     m_extadd((m), (char *)(buf), (size), (free), (arg1), (arg2),	\
     (flags), (type))
 #define	m_getm(m, len, how, type)					\
     m_getm2((m), (len), (how), (type), M_PKTHDR)
 
 /*
  * Evaluate TRUE if it is safe to write to the data region of mbuf m: the
  * mbuf must not be marked M_RDONLY and, when it uses external storage
  * (M_EXT), that storage must have exactly one reference.  The data region
  * may be the local payload or an external buffer, depending on M_EXT.
  * The argument is evaluated more than once.
  */
 #define	M_WRITABLE(m)							\
 	((((m)->m_flags & M_RDONLY) == 0) &&				\
 	 ((((m)->m_flags & M_EXT) == 0) || (m_extrefcnt(m) == 1)))
 
 /* Check if the supplied mbuf has a packet header, or else panic. */
 #define	M_ASSERTPKTHDR(m)						\
 	KASSERT((m) != NULL && (m)->m_flags & M_PKTHDR,			\
 	    ("%s: no mbuf packet header!", __func__))
 
 /*
  * Ensure that the supplied mbuf is a valid, non-free mbuf.
  *
  * XXX: Broken at the moment.  Need some UMA magic to make it work again.
  * The tested expression masks m_flags with 0, so the assertion as written
  * can never fire.  The argument is parenthesized before the cast so that
  * any pointer expression may be passed.
  */
 #define	M_ASSERTVALID(m)						\
 	KASSERT((((struct mbuf *)(m))->m_flags & 0) == 0,		\
 	    ("%s: attempted use of a free mbuf!", __func__))
 
 /*
  * Return the address of the start of the buffer associated with an mbuf,
  * handling external storage, packet-header mbufs, and regular data mbufs.
  * M_EXT is tested first, then M_PKTHDR; the argument is evaluated more
  * than once.
  */
 #define	M_START(m)							\
 	(((m)->m_flags & M_EXT) ? (m)->m_ext.ext_buf :			\
 	 ((m)->m_flags & M_PKTHDR) ? &(m)->m_pktdat[0] :		\
 	 &(m)->m_dat[0])
 
 /*
  * Return the size of the buffer associated with an mbuf, handling external
  * storage, packet-header mbufs, and regular data mbufs.
  * M_EXT is tested first (ext_size), then M_PKTHDR (MHLEN), otherwise MLEN;
  * the argument is evaluated more than once.
  */
 #define	M_SIZE(m)							\
 	(((m)->m_flags & M_EXT) ? (m)->m_ext.ext_size :			\
 	 ((m)->m_flags & M_PKTHDR) ? MHLEN :				\
 	 MLEN)
 
 /*
  * Set the m_data pointer of a newly allocated mbuf to place an object of the
  * specified size at the end of the mbuf, longword aligned.
  *
  * NB: Historically, we had M_ALIGN(), MH_ALIGN(), and MEXT_ALIGN() as
  * separate macros, each asserting that it was called at the proper moment.
  * This required callers to themselves test the storage type and call the
  * right one.  Rather than require callers to be aware of those layout
  * decisions, we centralize here.
  */
 /*
  * Advance m_data so that an object of `len' bytes ends at the end of the
  * buffer, with the offset rounded down to a multiple of sizeof(long).
  * Asserts that m_data still points at the start of the buffer.
  */
 static __inline void
 m_align(struct mbuf *m, int len)
 {
 	int slack;
 
 	KASSERT(m->m_data == M_START(m),
 	    ("%s: not a virgin mbuf", __func__));
 
 	/* Unused room in front of the object, before rounding. */
 	slack = M_SIZE(m) - len;
 	m->m_data += slack & ~(sizeof(long) - 1);
 }
 
 #define	M_ALIGN(m, len)		m_align(m, len)
 #define	MH_ALIGN(m, len)	m_align(m, len)
 #define	MEXT_ALIGN(m, len)	m_align(m, len)
 
 /*
  * Compute the amount of space available before the current start of data in
  * an mbuf.
  *
  * The M_WRITABLE() is a temporary, conservative safety measure: the burden
  * of checking writability of the mbuf data area rests solely with the caller.
  *
  * NB: In previous versions, M_LEADINGSPACE() would only check M_WRITABLE()
  * for mbufs with external storage.  We now allow mbuf-embedded data to be
  * read-only as well.
  */
 #define	M_LEADINGSPACE(m)						\
 	(M_WRITABLE(m) ? ((m)->m_data - M_START(m)) : 0)
 
 /*
  * Compute the amount of space available after the end of data in an mbuf.
  *
  * The M_WRITABLE() is a temporary, conservative safety measure: the burden
  * of checking writability of the mbuf data area rests solely with the caller.
  *
  * NB: In previous versions, M_TRAILINGSPACE() would only check M_WRITABLE()
  * for mbufs with external storage.  We now allow mbuf-embedded data to be
  * read-only as well.
  */
 #define	M_TRAILINGSPACE(m)						\
 	(M_WRITABLE(m) ?						\
 	    ((M_START(m) + M_SIZE(m)) - ((m)->m_data + (m)->m_len)) : 0)
 
 /*
  * Arrange to prepend space of size plen to mbuf m.  If a new mbuf must be
  * allocated, how specifies whether to wait.  If the allocation fails, the
  * original mbuf chain is freed and m is set to NULL.
  *
  * Each argument is evaluated exactly once: plen and how are captured in
  * locals, and m is accessed through its address.  When the leading space
  * of the first mbuf suffices, its data pointer is moved back in place;
  * otherwise m_prepend() is called.  If the resulting mbuf carries a packet
  * header, its total length is increased by plen.
  */
 #define	M_PREPEND(m, plen, how) do {					\
 	struct mbuf **_mmp = &(m);					\
 	struct mbuf *_mm = *_mmp;					\
 	int _mplen = (plen);						\
 	int __mhow = (how);						\
 									\
 	/* Use the captured value so `how' is not evaluated twice. */	\
 	MBUF_CHECKSLEEP(__mhow);					\
 	if (M_LEADINGSPACE(_mm) >= _mplen) {				\
 		_mm->m_data -= _mplen;					\
 		_mm->m_len += _mplen;					\
 	} else								\
 		_mm = m_prepend(_mm, _mplen, __mhow);			\
 	if (_mm != NULL && _mm->m_flags & M_PKTHDR)			\
 		_mm->m_pkthdr.len += _mplen;				\
 	*_mmp = _mm;							\
 } while (0)
 
 /*
  * Change mbuf to new type.  This is a relatively expensive operation and
  * should be avoided.
  */
 #define	MCHTYPE(m, t)	m_chtype((m), (t))
 
 /* Length to m_copy to copy all. */
 #define	M_COPYALL	1000000000
 
 extern int		max_datalen;	/* MHLEN - max_hdr */
 extern int		max_hdr;	/* Largest link + protocol header */
 extern int		max_linkhdr;	/* Largest link-level header */
 extern int		max_protohdr;	/* Largest protocol header */
 extern int		nmbclusters;	/* Maximum number of clusters */
 
 /*-
  * Network packets may have annotations attached by affixing a list of
  * "packet tags" to the pkthdr structure.  Packet tags are dynamically
  * allocated semi-opaque data structures that have a fixed header
  * (struct m_tag) that specifies the size of the memory block and a
  * <cookie,type> pair that identifies it.  The cookie is a 32-bit unique
  * unsigned value used to identify a module or ABI.  By convention this value
  * is chosen as the date+time that the module is created, expressed as the
  * number of seconds since the epoch (e.g., using date -u +'%s').  The type
  * value is an ABI/module-specific value that identifies a particular
  * annotation and is private to the module.  For compatibility with systems
  * like OpenBSD that define packet tags w/o an ABI/module cookie, the value
  * PACKET_ABI_COMPAT is used to implement m_tag_get and m_tag_find
  * compatibility shim functions and several tag types are defined below.
  * Users that do not require compatibility should use a private cookie value
  * so that packet tag-related definitions can be maintained privately.
  *
  * Note that the packet tag returned by m_tag_alloc has the default memory
  * alignment implemented by malloc.  To reference private data one can use a
  * construct like:
  *
  *	struct m_tag *mtag = m_tag_alloc(...);
  *	struct foo *p = (struct foo *)(mtag+1);
  *
  * if the alignment of struct m_tag is sufficient for referencing members of
  * struct foo.  Otherwise it is necessary to embed struct m_tag within the
  * private data structure to ensure proper alignment; e.g.,
  *
  *	struct foo {
  *		struct m_tag	tag;
  *		...
  *	};
  *	struct foo *p = (struct foo *) m_tag_alloc(...);
  *	struct m_tag *mtag = &p->tag;
  */
 
 /*
  * Persistent tags stay with an mbuf until the mbuf is reclaimed.  Otherwise
  * tags are expected to ``vanish'' when they pass through a network
  * interface.  For most interfaces this happens normally as the tags are
  * reclaimed when the mbuf is free'd.  However in some special cases
  * reclaiming must be done manually.  An example is packets that pass through
  * the loopback interface.  Also, one must be careful to do this when
  * ``turning around'' packets (e.g., icmp_reflect).
  *
  * To mark a tag persistent bit-or this flag in when defining the tag id.
  * The tag will then be treated as described above.
  */
 #define	MTAG_PERSISTENT				0x800
 
 #define	PACKET_TAG_NONE				0  /* Nadda */
 
 /* Packet tags for use with PACKET_ABI_COMPAT. */
 #define	PACKET_TAG_IPSEC_IN_DONE		1  /* IPsec applied, in */
 #define	PACKET_TAG_IPSEC_OUT_DONE		2  /* IPsec applied, out */
 #define	PACKET_TAG_IPSEC_IN_CRYPTO_DONE		3  /* NIC IPsec crypto done */
 #define	PACKET_TAG_IPSEC_OUT_CRYPTO_NEEDED	4  /* NIC IPsec crypto req'ed */
 #define	PACKET_TAG_IPSEC_IN_COULD_DO_CRYPTO	5  /* NIC notifies IPsec */
 #define	PACKET_TAG_IPSEC_PENDING_TDB		6  /* Reminder to do IPsec */
 #define	PACKET_TAG_BRIDGE			7  /* Bridge processing done */
 #define	PACKET_TAG_GIF				8  /* GIF processing done */
 #define	PACKET_TAG_GRE				9  /* GRE processing done */
 #define	PACKET_TAG_IN_PACKET_CHECKSUM		10 /* NIC checksumming done */
 #define	PACKET_TAG_ENCAP			11 /* Encap.  processing */
 #define	PACKET_TAG_IPSEC_SOCKET			12 /* IPSEC socket ref */
 #define	PACKET_TAG_IPSEC_HISTORY		13 /* IPSEC history */
 #define	PACKET_TAG_IPV6_INPUT			14 /* IPV6 input processing */
 #define	PACKET_TAG_DUMMYNET			15 /* dummynet info */
 #define	PACKET_TAG_DIVERT			17 /* divert info */
 #define	PACKET_TAG_IPFORWARD			18 /* ipforward info */
 #define	PACKET_TAG_MACLABEL	(19 | MTAG_PERSISTENT) /* MAC label */
 #define	PACKET_TAG_PF		(21 | MTAG_PERSISTENT) /* PF/ALTQ information */
 #define	PACKET_TAG_RTSOCKFAM			25 /* rtsock sa family */
 #define	PACKET_TAG_IPOPTIONS			27 /* Saved IP options */
 #define	PACKET_TAG_CARP				28 /* CARP info */
 #define	PACKET_TAG_IPSEC_NAT_T_PORTS		29 /* two uint16_t */
 #define	PACKET_TAG_ND_OUTGOING			30 /* ND outgoing */
 
 /* Specific cookies and tags. */
 
 /* Packet tag routines. */
 struct m_tag	*m_tag_alloc(u_int32_t, int, int, int);
 void		 m_tag_delete(struct mbuf *, struct m_tag *);
 void		 m_tag_delete_chain(struct mbuf *, struct m_tag *);
 void		 m_tag_free_default(struct m_tag *);
 struct m_tag	*m_tag_locate(struct mbuf *, u_int32_t, int, struct m_tag *);
 struct m_tag	*m_tag_copy(struct m_tag *, int);
 int		 m_tag_copy_chain(struct mbuf *, const struct mbuf *, int);
 void		 m_tag_delete_nonpersistent(struct mbuf *);
 
 /*
  * Reset the tag list of mbuf m to the empty list.
  */
 static __inline void
 m_tag_init(struct mbuf *m)
 {
 	SLIST_INIT(&m->m_pkthdr.tags);
 }
 
 /*
  * Fill in the cookie, type (stored as the tag id) and length of tag t.
  * The free method is not touched; the caller is expected to set it.
  *
  * XXX probably should be called m_tag_init, but that was already taken.
  */
 static __inline void
 m_tag_setup(struct m_tag *t, u_int32_t cookie, int type, int len)
 {
 	t->m_tag_cookie = cookie;
 	t->m_tag_id = type;
 	t->m_tag_len = len;
 }
 
 /*
  * Release tag t by invoking the free method stored in the tag itself.
  */
 static __inline void
 m_tag_free(struct m_tag *t)
 {
 	t->m_tag_free(t);
 }
 
 /*
  * Return the first tag associated with an mbuf, i.e. SLIST_FIRST() of
  * m->m_pkthdr.tags.  No emptiness check is made here; for an empty list
  * the result is whatever SLIST_FIRST() yields (NULL per queue(3)).
  */
 static __inline struct m_tag *
 m_tag_first(struct mbuf *m)
 {
 
 	return (SLIST_FIRST(&m->m_pkthdr.tags));
 }
 
 /*
  * Return the next tag in the list of tags associated with an mbuf, found
  * by following t's m_tag_link.  The mbuf argument is not consulted (it is
  * marked __unused).
  */
 static __inline struct m_tag *
 m_tag_next(struct mbuf *m __unused, struct m_tag *t)
 {
 
 	return (SLIST_NEXT(t, m_tag_link));
 }
 
 /*
  * Prepend a tag to the list of tags associated with an mbuf: t is inserted
  * at the head of m->m_pkthdr.tags, linked through m_tag_link.
  */
 static __inline void
 m_tag_prepend(struct mbuf *m, struct m_tag *t)
 {
 
 	SLIST_INSERT_HEAD(&m->m_pkthdr.tags, t, m_tag_link);
 }
 
 /*
  * Unlink a tag from the list of tags associated with an mbuf.  Only the
  * list linkage is changed; the tag itself is not freed here.
  */
 static __inline void
 m_tag_unlink(struct mbuf *m, struct m_tag *t)
 {
 
 	SLIST_REMOVE(&m->m_pkthdr.tags, t, m_tag, m_tag_link);
 }
 
 /* These are for OpenBSD compatibility. */
 #define	MTAG_ABI_COMPAT		0		/* compatibility ABI */
 
 /*
  * OpenBSD-compatible form of m_tag_alloc(): forwards type, length and wait
  * unchanged and supplies MTAG_ABI_COMPAT as the cookie.
  */
 static __inline struct m_tag *
 m_tag_get(int type, int length, int wait)
 {
 	return (m_tag_alloc(MTAG_ABI_COMPAT, type, length, wait));
 }
 
 /*
  * OpenBSD-compatible tag lookup.  Returns NULL straight away when the mbuf
  * has no tags at all; otherwise defers to m_tag_locate() with the
  * MTAG_ABI_COMPAT cookie, passing type and start through unchanged.
  */
 static __inline struct m_tag *
 m_tag_find(struct mbuf *m, int type, struct m_tag *start)
 {
 
 	if (SLIST_EMPTY(&m->m_pkthdr.tags))
 		return (NULL);
 	return (m_tag_locate(m, MTAG_ABI_COMPAT, type, start));
 }
 
 /*
  * Free a single mbuf and return the mbuf that followed it (its m_next),
  * which is sampled before anything is released.
  *
  * If both M_PKTHDR and M_NOFREE are set, the packet tags are deleted here.
  * An mbuf with M_EXT set is handed to mb_free_ext(); otherwise it is
  * returned to zone_mbuf unless M_NOFREE is set.
  */
 static __inline struct mbuf *
 m_free(struct mbuf *m)
 {
 	struct mbuf *next;
 
 	next = m->m_next;
 	MBUF_PROBE1(m__free, m);
 	if ((m->m_flags & (M_PKTHDR|M_NOFREE)) == (M_PKTHDR|M_NOFREE))
 		m_tag_delete_chain(m, NULL);
 	if ((m->m_flags & M_EXT) != 0) {
 		mb_free_ext(m);
 	} else if ((m->m_flags & M_NOFREE) == 0) {
 		uma_zfree(zone_mbuf, m);
 	}
 	return (next);
 }
 
 /*
  * Return the FIB number recorded in the mbuf's packet header.  The mbuf
  * must have M_PKTHDR set; this is asserted via KASSERT.
  */
 static __inline int
 rt_m_getfib(struct mbuf *m)
 {
 
 	KASSERT((m->m_flags & M_PKTHDR) != 0,
 	    ("Attempt to get FIB from non header mbuf."));
 	return (m->m_pkthdr.fibnum);
 }
 
 /*
  * FIB accessors for packet-header mbufs.  M_GETFIB() is rt_m_getfib();
  * M_SETFIB() stores _fib into m_pkthdr.fibnum after asserting M_PKTHDR.
  */
 #define	M_GETFIB(_m)	rt_m_getfib(_m)
 
 #define	M_SETFIB(_m, _fib) do {						\
 	KASSERT((_m)->m_flags & M_PKTHDR,				\
 	    ("Attempt to set FIB on non header mbuf."));		\
 	((_m)->m_pkthdr.fibnum) = (_fib);				\
 } while (0)
 
 /*
  * Flags passed as the first argument to m_ether_tcpip_hash(); each is a
  * distinct single bit, so they can be OR'ed together.
  * NOTE(review): presumably they select which header layers (L2/L3/L4)
  * feed the hash -- confirm against the implementation.
  */
 #define	MBUF_HASHFLAG_L2	(1 << 2)
 #define	MBUF_HASHFLAG_L3	(1 << 3)
 #define	MBUF_HASHFLAG_L4	(1 << 4)
 
 /* mbuf hashing helper routines (declarations only) */
 uint32_t	m_ether_tcpip_hash_init(void);
 uint32_t	m_ether_tcpip_hash(const uint32_t, const struct mbuf *, const uint32_t);
 
 /*
  * M_PROFILE(m) calls m_profile(m) when MBUF_PROFILING is defined and
  * expands to nothing otherwise.
  */
 #ifdef MBUF_PROFILING
  void m_profile(struct mbuf *m);
  #define M_PROFILE(m) m_profile(m)
 #else
  #define M_PROFILE(m)
 #endif
 
 /*
  * A length-limited tail queue of mbufs, linked through m_stailqpkt and
  * manipulated with the mbufq_*() inlines that follow.
  */
 struct mbufq {
 	STAILQ_HEAD(, mbuf)	mq_head;	/* the queued mbufs */
 	int			mq_len;		/* current number of entries */
 	int			mq_maxlen;	/* limit tested by mbufq_full() */
 };
 
 /*
  * Prepare a queue for use: empty list, zero length, and the given maximum
  * length.
  */
 static inline void
 mbufq_init(struct mbufq *mq, int maxlen)
 {
 
 	mq->mq_len = 0;
 	mq->mq_maxlen = maxlen;
 	STAILQ_INIT(&mq->mq_head);
 }
 
 /*
  * Detach everything from the queue.  The queue is left empty with a length
  * of zero and the mbuf that used to be first is returned (NULL if there
  * was none); nothing is freed here.
  */
 static inline struct mbuf *
 mbufq_flush(struct mbufq *mq)
 {
 	struct mbuf *head;
 
 	head = STAILQ_FIRST(&mq->mq_head);
 	mq->mq_len = 0;
 	STAILQ_INIT(&mq->mq_head);
 	return (head);
 }
 
 /*
  * Empty the queue and free every mbuf that was on it with m_freem().  The
  * successor is fetched before each mbuf is freed.
  */
 static inline void
 mbufq_drain(struct mbufq *mq)
 {
 	struct mbuf *cur, *next;
 
 	for (cur = mbufq_flush(mq); cur != NULL; cur = next) {
 		next = STAILQ_NEXT(cur, m_stailqpkt);
 		m_freem(cur);
 	}
 }
 
 /*
  * Return the mbuf at the head of the queue without removing it.
  */
 static inline struct mbuf *
 mbufq_first(const struct mbufq *mq)
 {
 
 	return (STAILQ_FIRST(&mq->mq_head));
 }
 
 /*
  * Return the mbuf at the tail of the queue without removing it.
  */
 static inline struct mbuf *
 mbufq_last(const struct mbufq *mq)
 {
 
 	return (STAILQ_LAST(&mq->mq_head, mbuf, m_stailqpkt));
 }
 
 /*
  * Return non-zero when the queue length has reached (or exceeded) its
  * configured maximum, zero otherwise.
  */
 static inline int
 mbufq_full(const struct mbufq *mq)
 {
 
 	return (mq->mq_len >= mq->mq_maxlen);
 }
 
 /*
  * Return the current number of entries recorded for the queue (mq_len).
  */
 static inline int
 mbufq_len(const struct mbufq *mq)
 {
 
 	return (mq->mq_len);
 }
 
 /*
  * Append an mbuf to the tail of the queue.  Returns 0 on success, or
  * ENOBUFS (leaving the queue untouched) when mbufq_full() reports that the
  * queue is already at its maximum length.
  */
 static inline int
 mbufq_enqueue(struct mbufq *mq, struct mbuf *m)
 {
 
 	if (!mbufq_full(mq)) {
 		STAILQ_INSERT_TAIL(&mq->mq_head, m, m_stailqpkt);
 		mq->mq_len++;
 		return (0);
 	}
 	return (ENOBUFS);
 }
 
 /*
  * Remove and return the mbuf at the head of the queue, or return NULL if
  * the queue is empty.  The returned mbuf has its m_nextpkt cleared.
  */
 static inline struct mbuf *
 mbufq_dequeue(struct mbufq *mq)
 {
 	struct mbuf *m;
 
 	m = STAILQ_FIRST(&mq->mq_head);
 	if (m == NULL)
 		return (NULL);
 	STAILQ_REMOVE_HEAD(&mq->mq_head, m_stailqpkt);
 	m->m_nextpkt = NULL;
 	mq->mq_len--;
 	return (m);
 }
 
 /*
  * Insert an mbuf at the head of the queue.  Unlike mbufq_enqueue(), no
  * check against mq_maxlen is made.
  */
 static inline void
 mbufq_prepend(struct mbufq *mq, struct mbuf *m)
 {
 
 	mq->mq_len++;
 	STAILQ_INSERT_HEAD(&mq->mq_head, m, m_stailqpkt);
 }
 
 /*
  * Move every mbuf from mq_src onto the end of mq_dst and fold the source
  * length into the destination length; mq_src ends up with length zero.
  *
  * Note: this doesn't enforce the maximum list size for dst.
  */
 static inline void
 mbufq_concat(struct mbufq *mq_dst, struct mbufq *mq_src)
 {
 
 	STAILQ_CONCAT(&mq_dst->mq_head, &mq_src->mq_head);
 	mq_dst->mq_len += mq_src->mq_len;
 	mq_src->mq_len = 0;
 }
 
 #ifdef _SYS_TIMESPEC_H_
 /*
  * Split the mbuf's receive timestamp (m_pkthdr.rcv_tstmp) into a timespec:
  * the quotient by 10^9 goes to tv_sec and the remainder to tv_nsec.  The
  * mbuf must carry both M_PKTHDR and M_TSTMP; this is asserted via KASSERT.
  */
 static inline void
 mbuf_tstmp2timespec(struct mbuf *m, struct timespec *ts)
 {
 	const int ns_per_sec = 1000000000;
 
 	KASSERT((m->m_flags & M_PKTHDR) != 0, ("mbuf %p no M_PKTHDR", m));
 	KASSERT((m->m_flags & M_TSTMP) != 0, ("mbuf %p no M_TSTMP", m));
 	ts->tv_sec = m->m_pkthdr.rcv_tstmp / ns_per_sec;
 	ts->tv_nsec = m->m_pkthdr.rcv_tstmp % ns_per_sec;
 }
 #endif
 
 #ifdef NETDUMP
 /* Invoked from the netdump client code. */
 void	netdump_mbuf_drain(void);
 void	netdump_mbuf_dump(void);
 void	netdump_mbuf_reinit(int nmbuf, int nclust, int clsize);
 #endif
 
 #endif /* _KERNEL */
 #endif /* !_SYS_MBUF_H_ */
Index: stable/12
===================================================================
--- stable/12	(revision 362879)
+++ stable/12	(revision 362880)

Property changes on: stable/12
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,1 ##
   Merged /head:r349893