diff --git a/sys/dev/cxgbe/crypto/t6_kern_tls.c b/sys/dev/cxgbe/crypto/t6_kern_tls.c
index 1374530f941f..4340c5484763 100644
--- a/sys/dev/cxgbe/crypto/t6_kern_tls.c
+++ b/sys/dev/cxgbe/crypto/t6_kern_tls.c
@@ -1,2151 +1,2151 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2018-2019 Chelsio Communications, Inc.
  * All rights reserved.
  * Written by: John Baldwin <jhb@FreeBSD.org>
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_kern_tls.h"
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/ktr.h>
 #include <sys/ktls.h>
 #include <sys/sglist.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sockbuf.h>
 #include <netinet/in.h>
 #include <netinet/in_pcb.h>
 #include <netinet/ip.h>
 #include <netinet/ip6.h>
 #include <netinet/tcp_var.h>
 #include <opencrypto/cryptodev.h>
 #include <opencrypto/xform.h>
 
 #include "common/common.h"
 #include "common/t4_regs.h"
 #include "common/t4_regs_values.h"
 #include "common/t4_tcb.h"
 #include "t4_l2t.h"
 #include "t4_clip.h"
 #include "t4_mp_ring.h"
 #include "crypto/t4_crypto.h"
 
 #if defined(INET) || defined(INET6)
 
 /* Size in bytes of the TLS record header that precedes the record payload. */
 #define TLS_HEADER_LENGTH		5
 
 /*
  * The two 32-bit SCMD words that t6_tls_tag_alloc() precomputes for a
  * connection out of V_SCMD_* fields.
  */
 struct tls_scmd {
 	__be32 seqno_numivs;
 	__be32 ivgen_hdrlen;
 };
 
 /*
  * Per-connection state backing a TLS send tag.  The send tag 'com' is
  * embedded in this structure and mst_to_tls() maps a tag pointer back
  * to its enclosing tlspcb.
  */
 struct tlspcb {
 	struct m_snd_tag com;
 	struct vi_info *vi;	/* virtual interface */
 	struct adapter *sc;
 	struct l2t_entry *l2te;	/* L2 table entry used by this connection */
 	int tid;		/* Connection identifier */
 
 	int tx_key_addr;	/* key id from t4_alloc_tls_keyid(), else -1 */
 	bool inline_key;	/* no key id was allocated for this tag */
 	bool using_timestamps;	/* TF_REQ_TSTMP was set when the tag was made */
 	unsigned char enc_mode;	/* value returned by t4_tls_cipher_mode() */
 
 	struct tls_scmd scmd0;		/* SCMD for a full TLS record */
 	struct tls_scmd scmd0_short;	/* SCMD for a partial TLS record */
 
 	unsigned int tx_key_info_size;	/* from t4_tls_key_info_size() */
 
 	uint32_t prev_seq;
 	uint32_t prev_ack;
 	uint32_t prev_tsecr;
 	uint16_t prev_win;
 	uint16_t prev_mss;
 
 	/* Only used outside of setup and teardown when using inline keys. */
 	struct tls_keyctx keyctx;
 
 	/* Fields only used during setup and teardown. */
 	struct inpcb *inp;	/* backpointer to host stack's PCB */
 	struct sge_txq *txq;
 	struct sge_wrq *ctrlq;
 	struct clip_entry *ce;	/* CLIP table entry used by this tid */
 
 	/* Set when the active open is sent, cleared by ktls_act_open_rpl(). */
 	bool open_pending;
 };
 
 /* Forward declarations: both functions are referenced before they are defined. */
 static void t6_tls_tag_free(struct m_snd_tag *mst);
 static int ktls_setup_keys(struct tlspcb *tlsp,
     const struct ktls_session *tls, struct sge_txq *txq);
 
 /* Send tag method table that alloc_tlspcb() installs on every tlspcb. */
 static const struct if_snd_tag_sw t6_tls_tag_sw = {
 	.snd_tag_free = t6_tls_tag_free,
 	.type = IF_SND_TAG_TYPE_TLS
 };
 
 /*
  * Map a send tag pointer back to the tlspcb that embeds it as its
  * 'com' member.
  */
 static inline struct tlspcb *
 mst_to_tls(struct m_snd_tag *t)
 {
 	struct tlspcb *owner;
 
 	owner = __containerof(t, struct tlspcb, com);
 	return (owner);
 }
 
 /*
  * Allocate a zeroed tlspcb and initialize its embedded send tag.
  *
  * 'flags' is passed through to malloc() together with M_ZERO.  The
  * new PCB starts with no connection id (tid == -1) and no key address
  * (tx_key_addr == -1), and uses the control queue of the port that
  * 'vi' belongs to.  Returns NULL if the allocation fails.
  */
 static struct tlspcb *
 alloc_tlspcb(struct ifnet *ifp, struct vi_info *vi, int flags)
 {
 	struct tlspcb *new_pcb;
 	struct port_info *port;
 	struct adapter *adap;
 
 	new_pcb = malloc(sizeof(*new_pcb), M_CXGBE, M_ZERO | flags);
 	if (new_pcb != NULL) {
 		port = vi->pi;
 		adap = port->adapter;
 
 		m_snd_tag_init(&new_pcb->com, ifp, &t6_tls_tag_sw);
 		new_pcb->vi = vi;
 		new_pcb->sc = adap;
 		new_pcb->ctrlq = &adap->sge.ctrlq[port->port_id];
 		new_pcb->tid = -1;
 		new_pcb->tx_key_addr = -1;
 	}
 	return (new_pcb);
 }
 
 /*
  * Size in bytes of the active open request CPL: the IPv6 variant when
  * 'isipv6' is true, otherwise the IPv4 variant.
  */
 static int
 ktls_act_open_cpl_size(bool isipv6)
 {
 
 	return (isipv6 ? sizeof(struct cpl_t6_act_open_req6) :
 	    sizeof(struct cpl_t6_act_open_req));
 }
 
 /*
  * Build a CPL_ACT_OPEN_REQ for an IPv4 connection in the buffer 'dst'.
  *
  * The 4-tuple is taken from 'inp'.  Replies are steered to the
  * firmware queue and tagged with CPL_COOKIE_KERN_TLS.  Timestamps are
  * enabled in opt2 when the connection has TF_REQ_TSTMP set.  'tlsp'
  * is not used here.
  */
 static void
 mk_ktls_act_open_req(struct adapter *sc, struct vi_info *vi, struct inpcb *inp,
     struct tlspcb *tlsp, int atid, void *dst)
 {
 	struct cpl_t6_act_open_req *req6 = dst;
 	struct cpl_act_open_req *req = (struct cpl_act_open_req *)req6;
 	struct tcpcb *tp = intotcpcb(inp);
 	uint64_t opt0, opt2;
 	int qid_atid;
 
 	INIT_TP_WR(req6, 0);
 
 	/* Opcode plus reply queue, atid and cookie. */
 	qid_atid = V_TID_QID(sc->sge.fwq.abs_id) | V_TID_TID(atid) |
 	    V_TID_COOKIE(CPL_COOKIE_KERN_TLS);
 	OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_ACT_OPEN_REQ,
 		qid_atid));
 
 	/* Connection addresses and ports. */
 	inp_4tuple_get(inp, &req->local_ip, &req->local_port,
 	    &req->peer_ip, &req->peer_port);
 
 	/* opt0 */
 	opt0 = F_TCAM_BYPASS | V_ULP_MODE(ULP_MODE_NONE);
 	opt0 |= V_SMAC_SEL(vi->smt_idx) | V_TX_CHAN(vi->pi->tx_chan);
 	opt0 |= F_NON_OFFLOAD;
 	req->opt0 = htobe64(opt0);
 
 	/* opt2 */
 	opt2 = V_TX_QUEUE(sc->params.tp.tx_modq[vi->pi->tx_chan]);
 	if (tp->t_flags & TF_REQ_TSTMP)
 		opt2 |= F_TSTAMPS_EN;
 	req->opt2 = htobe32(opt2);
 }
 
 /*
  * Build a CPL_ACT_OPEN_REQ6 for an IPv6 connection in the buffer 'dst'.
  *
  * Ports and the two 128-bit addresses are copied from 'inp' as-is
  * (each address as two 64-bit halves).  Replies are steered to the
  * firmware queue and tagged with CPL_COOKIE_KERN_TLS.  Timestamps are
  * enabled in opt2 when the connection has TF_REQ_TSTMP set.  'tlsp'
  * is not used here.
  */
 static void
 mk_ktls_act_open_req6(struct adapter *sc, struct vi_info *vi,
     struct inpcb *inp, struct tlspcb *tlsp, int atid, void *dst)
 {
 	struct cpl_t6_act_open_req6 *req6 = dst;
 	struct cpl_act_open_req6 *req = (struct cpl_act_open_req6 *)req6;
 	struct tcpcb *tp = intotcpcb(inp);
 	uint64_t opt0, opt2;
 	int qid_atid;
 
 	INIT_TP_WR(req6, 0);
 
 	/* Opcode plus reply queue, atid and cookie. */
 	qid_atid = V_TID_QID(sc->sge.fwq.abs_id) | V_TID_TID(atid) |
 	    V_TID_COOKIE(CPL_COOKIE_KERN_TLS);
 	OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_ACT_OPEN_REQ6,
 		qid_atid));
 
 	/* Local endpoint. */
 	req->local_port = inp->inp_lport;
 	req->local_ip_hi = *(uint64_t *)&inp->in6p_laddr.s6_addr[0];
 	req->local_ip_lo = *(uint64_t *)&inp->in6p_laddr.s6_addr[8];
 
 	/* Remote endpoint. */
 	req->peer_port = inp->inp_fport;
 	req->peer_ip_hi = *(uint64_t *)&inp->in6p_faddr.s6_addr[0];
 	req->peer_ip_lo = *(uint64_t *)&inp->in6p_faddr.s6_addr[8];
 
 	/* opt0 */
 	opt0 = F_TCAM_BYPASS | V_ULP_MODE(ULP_MODE_NONE);
 	opt0 |= V_SMAC_SEL(vi->smt_idx) | V_TX_CHAN(vi->pi->tx_chan);
 	opt0 |= F_NON_OFFLOAD;
 	req->opt0 = htobe64(opt0);
 
 	/* opt2 */
 	opt2 = V_TX_QUEUE(sc->params.tp.tx_modq[vi->pi->tx_chan]);
 	if (tp->t_flags & TF_REQ_TSTMP)
 		opt2 |= F_TSTAMPS_EN;
 	req->opt2 = htobe32(opt2);
 }
 
 /*
  * Queue an active open request for 'inp' on the connection's control
  * queue and mark the open as pending.
  *
  * For IPv6 a CLIP entry for the local address is acquired first and
  * stored in tlsp->ce.  Returns 0 on success, ENOENT if no CLIP entry
  * could be obtained, or ENOMEM if the work request could not be
  * allocated.
  */
 static int
 send_ktls_act_open_req(struct adapter *sc, struct vi_info *vi,
     struct inpcb *inp, struct tlspcb *tlsp, int atid)
 {
 	const bool isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
 	struct wrqe *wr;
 	void *cpl;
 
 	if (isipv6) {
 		tlsp->ce = t4_get_clip_entry(sc, &inp->in6p_laddr, true);
 		if (tlsp->ce == NULL)
 			return (ENOENT);
 	}
 
 	wr = alloc_wrqe(ktls_act_open_cpl_size(isipv6), tlsp->ctrlq);
 	if (wr == NULL) {
 		CTR2(KTR_CXGBE, "%s: atid %d failed to alloc WR", __func__,
 		    atid);
 		return (ENOMEM);
 	}
 
 	cpl = wrtod(wr);
 	if (isipv6)
 		mk_ktls_act_open_req6(sc, vi, inp, tlsp, atid, cpl);
 	else
 		mk_ktls_act_open_req(sc, vi, inp, tlsp, atid, cpl);
 
 	/* The flag is cleared again by ktls_act_open_rpl(). */
 	tlsp->open_pending = true;
 	t4_wrq_tx(sc, wr);
 	return (0);
 }
 
 /*
  * Handle the reply to an active open request.
  *
  * The atid in the reply is used to find the tlspcb and is then freed.
  * On success (status 0) the tid from the reply is recorded.  In every
  * case open_pending is cleared under the inpcb write lock and any
  * thread sleeping on the tlspcb is woken.  Always returns 0.
  */
 static int
 ktls_act_open_rpl(struct sge_iq *iq, const struct rss_header *rss,
     struct mbuf *m)
 {
 	const struct cpl_act_open_rpl *rpl = (const void *)(rss + 1);
 	struct adapter *sc = iq->adapter;
 	struct tlspcb *tlsp;
 	struct inpcb *inp;
 	u_int atid, status;
 
 	atid = G_TID_TID(G_AOPEN_ATID(be32toh(rpl->atid_status)));
 	status = G_AOPEN_STATUS(be32toh(rpl->atid_status));
 
 	/* Look the PCB up before giving the atid back. */
 	tlsp = lookup_atid(sc, atid);
 	inp = tlsp->inp;
 
 	CTR3(KTR_CXGBE, "%s: atid %d status %d", __func__, atid, status);
 	free_atid(sc, atid);
 	if (status == 0)
 		tlsp->tid = GET_TID(rpl);
 
 	INP_WLOCK(inp);
 	tlsp->open_pending = false;
 	wakeup(tlsp);
 	INP_WUNLOCK(inp);
 	return (0);
 }
 
 /*
  * SET_TCB_FIELD sent as a ULP command looks like this: a ULP_TXPKT
  * header, a ULPTX_IDATA sub-command header and the CPL_SET_TCB_FIELD
  * core.  write_set_tcb_field_ulp() follows these with one more
  * ULPTX_IDATA (a no-op), which is what the assertion below accounts for.
  */
 #define LEN__SET_TCB_FIELD_ULP (sizeof(struct ulp_txpkt) + \
     sizeof(struct ulptx_idata) + sizeof(struct cpl_set_tcb_field_core))
 
 /* With the trailing no-op the command must be a multiple of 16 bytes. */
 _Static_assert((LEN__SET_TCB_FIELD_ULP + sizeof(struct ulptx_idata)) % 16 == 0,
     "CPL_SET_TCB_FIELD ULP command not 16-byte aligned");
 
 /*
  * Write one CPL_SET_TCB_FIELD, wrapped as a ULP_TXPKT command, at 'dst'.
  *
  * The command sets the bits selected by 'mask' in TCB word 'word' of
  * the connection's tid to 'val' and requests no reply.  The layout
  * written is: ULP_TXPKT, ULPTX_IDATA (immediate), the CPL itself and a
  * trailing ULPTX no-op.
  */
 static void
 write_set_tcb_field_ulp(struct tlspcb *tlsp, void *dst, struct sge_txq *txq,
     uint16_t word, uint64_t mask, uint64_t val)
 {
 	struct ulp_txpkt *pkt = dst;
 	struct ulptx_idata *imm, *noop;
 	struct cpl_set_tcb_field_core *req;
 
 	/* ULP_TXPKT */
 	pkt->cmd_dest = htobe32(V_ULPTX_CMD(ULP_TX_PKT) |
 	    V_ULP_TXPKT_DATAMODIFY(0) |
 	    V_ULP_TXPKT_CHANNELID(tlsp->vi->pi->port_id) | V_ULP_TXPKT_DEST(0) |
 	    V_ULP_TXPKT_FID(txq->eq.cntxt_id) | V_ULP_TXPKT_RO(1));
 	pkt->len = htobe32(howmany(LEN__SET_TCB_FIELD_ULP, 16));
 
 	/* ULPTX_IDATA sub-command announcing the CPL as immediate data */
 	imm = (struct ulptx_idata *)(pkt + 1);
 	imm->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM));
 	imm->len = htobe32(sizeof(*req));
 
 	/* CPL_SET_TCB_FIELD */
 	req = (struct cpl_set_tcb_field_core *)(imm + 1);
 	OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_SET_TCB_FIELD, tlsp->tid));
 	req->reply_ctrl = htobe16(F_NO_REPLY);
 	req->word_cookie = htobe16(V_WORD(word));
 	req->mask = htobe64(mask);
 	req->val = htobe64(val);
 
 	/* ULPTX_NOOP placed directly after the CPL */
 	noop = (struct ulptx_idata *)(req + 1);
 	noop->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_NOOP));
 	noop->len = htobe32(0);
 }
 
 static int
 ktls_set_tcb_fields(struct tlspcb *tlsp, struct tcpcb *tp, struct sge_txq *txq)
 {
 	struct fw_ulptx_wr *wr;
 	struct mbuf *m;
 	char *dst;
 	void *items[1];
 	int error, len;
 
 	len = sizeof(*wr) + 3 * roundup2(LEN__SET_TCB_FIELD_ULP, 16);
 	if (tp->t_flags & TF_REQ_TSTMP)
 		len += roundup2(LEN__SET_TCB_FIELD_ULP, 16);
 	m = alloc_wr_mbuf(len, M_NOWAIT);
 	if (m == NULL) {
 		CTR2(KTR_CXGBE, "%s: tid %d failed to alloc WR mbuf", __func__,
 		    tlsp->tid);
 		return (ENOMEM);
 	}
 	m->m_pkthdr.snd_tag = m_snd_tag_ref(&tlsp->com);
 	m->m_pkthdr.csum_flags |= CSUM_SND_TAG;
 
 	/* FW_ULPTX_WR */
 	wr = mtod(m, void *);
 	wr->op_to_compl = htobe32(V_FW_WR_OP(FW_ULPTX_WR));
 	wr->flowid_len16 = htobe32(F_FW_ULPTX_WR_DATA |
 	    V_FW_WR_LEN16(len / 16));
 	wr->cookie = 0;
 	dst = (char *)(wr + 1);
 
         /* Clear TF_NON_OFFLOAD and set TF_CORE_BYPASS */
 	write_set_tcb_field_ulp(tlsp, dst, txq, W_TCB_T_FLAGS,
 	    V_TCB_T_FLAGS(V_TF_CORE_BYPASS(1) | V_TF_NON_OFFLOAD(1)),
 	    V_TCB_T_FLAGS(V_TF_CORE_BYPASS(1)));
 	dst += roundup2(LEN__SET_TCB_FIELD_ULP, 16);
 
 	/* Clear the SND_UNA_RAW, SND_NXT_RAW, and SND_MAX_RAW offsets. */
 	write_set_tcb_field_ulp(tlsp, dst, txq, W_TCB_SND_UNA_RAW,
 	    V_TCB_SND_NXT_RAW(M_TCB_SND_NXT_RAW) |
 	    V_TCB_SND_UNA_RAW(M_TCB_SND_UNA_RAW),
 	    V_TCB_SND_NXT_RAW(0) | V_TCB_SND_UNA_RAW(0));
 	dst += roundup2(LEN__SET_TCB_FIELD_ULP, 16);
 
 	write_set_tcb_field_ulp(tlsp, dst, txq, W_TCB_SND_MAX_RAW,
 	    V_TCB_SND_MAX_RAW(M_TCB_SND_MAX_RAW), V_TCB_SND_MAX_RAW(0));
 	dst += roundup2(LEN__SET_TCB_FIELD_ULP, 16);
 
 	if (tp->t_flags & TF_REQ_TSTMP) {
 		write_set_tcb_field_ulp(tlsp, dst, txq, W_TCB_TIMESTAMP_OFFSET,
 		    V_TCB_TIMESTAMP_OFFSET(M_TCB_TIMESTAMP_OFFSET),
 		    V_TCB_TIMESTAMP_OFFSET(tp->ts_offset >> 28));
 		dst += roundup2(LEN__SET_TCB_FIELD_ULP, 16);
 	}
 
 	KASSERT(dst - (char *)wr == len, ("%s: length mismatch", __func__));
 
 	items[0] = m;
 	error = mp_ring_enqueue(txq->r, items, 1, 1);
 	if (error)
 		m_free(m);
 	return (error);
 }
 
 /*
  * Allocate a TLS send tag for the session described by params->tls.
  *
  * Steps, in order:
  *  1. Validate the TLS version (1.1 or 1.2) and the cipher/auth
  *     parameters (AES-CBC with SHA1/SHA2-256/SHA2-384 HMAC, or AES-GCM).
  *  2. Allocate the tlspcb, an atid and, unless inline keys are
  *     configured or no key id is available, a key id.
  *  3. With the inpcb read-locked, send an active open request and sleep
  *     until ktls_act_open_rpl() reports the result.
  *  4. Pick a transmit queue, queue the initial TCB updates and the key
  *     work request, and precompute the SCMD words for full and short
  *     records.
  *
  * On success *pt points at the new tag and 0 is returned.  On failure
  * an errno value is returned (EPROTONOSUPPORT, EINVAL, ENOMEM,
  * ECONNRESET, or an error passed up from a helper) and *pt is left
  * untouched.
  */
 int
 t6_tls_tag_alloc(struct ifnet *ifp, union if_snd_tag_alloc_params *params,
     struct m_snd_tag **pt)
 {
 	const struct ktls_session *tls;
 	struct tlspcb *tlsp;
 	struct adapter *sc;
 	struct vi_info *vi;
 	struct inpcb *inp;
 	struct tcpcb *tp;
 	struct sge_txq *txq;
 	int atid, error, explicit_iv_size, keyid, mac_first;
 
 	tls = params->tls.tls;
 
 	/* Only TLS 1.1 and TLS 1.2 are currently supported. */
 	if (tls->params.tls_vmajor != TLS_MAJOR_VER_ONE ||
 	    tls->params.tls_vminor < TLS_MINOR_VER_ONE ||
 	    tls->params.tls_vminor > TLS_MINOR_VER_TWO)
 		return (EPROTONOSUPPORT);
 
 	/* Sanity check values in *tls. */
 	switch (tls->params.cipher_algorithm) {
 	case CRYPTO_AES_CBC:
 		/* XXX: Explicitly ignore any provided IV. */
 		switch (tls->params.cipher_key_len) {
 		case 128 / 8:
 		case 192 / 8:
 		case 256 / 8:
 			break;
 		default:
 			return (EINVAL);
 		}
 		switch (tls->params.auth_algorithm) {
 		case CRYPTO_SHA1_HMAC:
 		case CRYPTO_SHA2_256_HMAC:
 		case CRYPTO_SHA2_384_HMAC:
 			break;
 		default:
 			return (EPROTONOSUPPORT);
 		}
 		explicit_iv_size = AES_BLOCK_LEN;
 		mac_first = 1;
 		break;
 	case CRYPTO_AES_NIST_GCM_16:
 		if (tls->params.iv_len != SALT_SIZE)
 			return (EINVAL);
 		switch (tls->params.cipher_key_len) {
 		case 128 / 8:
 		case 192 / 8:
 		case 256 / 8:
 			break;
 		default:
 			return (EINVAL);
 		}
 		explicit_iv_size = 8;
 		mac_first = 0;
 		break;
 	default:
 		return (EPROTONOSUPPORT);
 	}
 
 	vi = ifp->if_softc;
 	sc = vi->adapter;
 
 	/* Allocated with M_WAITOK, so the result is used without a check. */
 	tlsp = alloc_tlspcb(ifp, vi, M_WAITOK);
 
 	atid = alloc_atid(sc, tlsp);
 	if (atid < 0) {
 		error = ENOMEM;
 		goto failed;
 	}
 
 	/* Fall back to an inline key context when no key id is obtained. */
 	if (sc->tlst.inline_keys)
 		keyid = -1;
 	else
 		keyid = t4_alloc_tls_keyid(sc);
 	if (keyid < 0) {
 		CTR2(KTR_CXGBE, "%s: atid %d using immediate key ctx", __func__,
 		    atid);
 		tlsp->inline_key = true;
 	} else {
 		tlsp->tx_key_addr = keyid;
 		CTR3(KTR_CXGBE, "%s: atid %d allocated TX key addr %#x",
 		    __func__,
 		    atid, tlsp->tx_key_addr);
 	}
 
 	inp = params->tls.inp;
 	INP_RLOCK(inp);
-	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
+	if (inp->inp_flags & INP_DROPPED) {
 		INP_RUNLOCK(inp);
 		error = ECONNRESET;
 		goto failed;
 	}
 	tlsp->inp = inp;
 
 	tp = inp->inp_ppcb;
 	if (tp->t_flags & TF_REQ_TSTMP) {
 		tlsp->using_timestamps = true;
 		/*
 		 * ktls_set_tcb_fields() programs only ts_offset >> 28, so
 		 * an offset with any of the low 28 bits set is rejected.
 		 */
 		if ((tp->ts_offset & 0xfffffff) != 0) {
 			INP_RUNLOCK(inp);
 			error = EINVAL;
 			goto failed;
 		}
 	} else
 		tlsp->using_timestamps = false;
 
 	error = send_ktls_act_open_req(sc, vi, inp, tlsp, atid);
 	if (error) {
 		INP_RUNLOCK(inp);
 		goto failed;
 	}
 
 	/* Wait for reply to active open. */
 	CTR2(KTR_CXGBE, "%s: atid %d sent CPL_ACT_OPEN_REQ", __func__,
 	    atid);
 	while (tlsp->open_pending) {
 		/*
 		 * XXX: PCATCH?  We would then have to discard the PCB
 		 * when the completion CPL arrived.
 		 */
 		error = rw_sleep(tlsp, &inp->inp_lock, 0, "t6tlsop", 0);
 	}
 
 	/*
 	 * ktls_act_open_rpl() has already freed the atid, so make sure
 	 * the 'failed' path below does not free it a second time.
 	 */
 	atid = -1;
 	if (tlsp->tid < 0) {
 		INP_RUNLOCK(inp);
 		error = ENOMEM;
 		goto failed;
 	}
 
 	/* The lock was dropped while sleeping, so check the flags again. */
-	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
+	if (inp->inp_flags & INP_DROPPED) {
 		INP_RUNLOCK(inp);
 		error = ECONNRESET;
 		goto failed;
 	}
 
 	/*
 	 * Pick the transmit queue: the VI's first queue, offset by a
 	 * flowid-derived index when the connection has a flow type.
 	 */
 	txq = &sc->sge.txq[vi->first_txq];
 	if (inp->inp_flowtype != M_HASHTYPE_NONE)
 		txq += ((inp->inp_flowid % (vi->ntxq - vi->rsrv_noflowq)) +
 		    vi->rsrv_noflowq);
 	tlsp->txq = txq;
 
 	error = ktls_set_tcb_fields(tlsp, tp, txq);
 	INP_RUNLOCK(inp);
 	if (error)
 		goto failed;
 
 	error = ktls_setup_keys(tlsp, tls, txq);
 	if (error)
 		goto failed;
 
 	tlsp->enc_mode = t4_tls_cipher_mode(tls);
 	tlsp->tx_key_info_size = t4_tls_key_info_size(tls);
 
 	/* The SCMD fields used when encrypting a full TLS record. */
 	tlsp->scmd0.seqno_numivs = htobe32(V_SCMD_SEQ_NO_CTRL(3) |
 	    V_SCMD_PROTO_VERSION(t4_tls_proto_ver(tls)) |
 	    V_SCMD_ENC_DEC_CTRL(SCMD_ENCDECCTRL_ENCRYPT) |
 	    V_SCMD_CIPH_AUTH_SEQ_CTRL((mac_first == 0)) |
 	    V_SCMD_CIPH_MODE(tlsp->enc_mode) |
 	    V_SCMD_AUTH_MODE(t4_tls_auth_mode(tls)) |
 	    V_SCMD_HMAC_CTRL(t4_tls_hmac_ctrl(tls)) |
 	    V_SCMD_IV_SIZE(explicit_iv_size / 2) | V_SCMD_NUM_IVS(1));
 
 	tlsp->scmd0.ivgen_hdrlen = V_SCMD_IV_GEN_CTRL(0) |
 	    V_SCMD_TLS_FRAG_ENABLE(0);
 	if (tlsp->inline_key)
 		tlsp->scmd0.ivgen_hdrlen |= V_SCMD_KEY_CTX_INLINE(1);
 	tlsp->scmd0.ivgen_hdrlen = htobe32(tlsp->scmd0.ivgen_hdrlen);
 
 	/*
 	 * The SCMD fields used when encrypting a partial TLS record
 	 * (no trailer and possibly a truncated payload).
 	 */
 	tlsp->scmd0_short.seqno_numivs = V_SCMD_SEQ_NO_CTRL(0) |
 	    V_SCMD_PROTO_VERSION(SCMD_PROTO_VERSION_GENERIC) |
 	    V_SCMD_ENC_DEC_CTRL(SCMD_ENCDECCTRL_ENCRYPT) |
 	    V_SCMD_CIPH_AUTH_SEQ_CTRL((mac_first == 0)) |
 	    V_SCMD_AUTH_MODE(SCMD_AUTH_MODE_NOP) |
 	    V_SCMD_HMAC_CTRL(SCMD_HMAC_CTRL_NOP) |
 	    V_SCMD_IV_SIZE(AES_BLOCK_LEN / 2) | V_SCMD_NUM_IVS(0);
 	if (tlsp->enc_mode == SCMD_CIPH_MODE_AES_GCM)
 		tlsp->scmd0_short.seqno_numivs |=
 		    V_SCMD_CIPH_MODE(SCMD_CIPH_MODE_AES_CTR);
 	else
 		tlsp->scmd0_short.seqno_numivs |=
 		    V_SCMD_CIPH_MODE(tlsp->enc_mode);
 	tlsp->scmd0_short.seqno_numivs =
 	    htobe32(tlsp->scmd0_short.seqno_numivs);
 
 	/*
 	 * Unlike the other three SCMD words, this one is stored here
 	 * without a htobe32() conversion.
 	 */
 	tlsp->scmd0_short.ivgen_hdrlen = V_SCMD_IV_GEN_CTRL(0) |
 	    V_SCMD_TLS_FRAG_ENABLE(0) |
 	    V_SCMD_AADIVDROP(1);
 	if (tlsp->inline_key)
 		tlsp->scmd0_short.ivgen_hdrlen |= V_SCMD_KEY_CTX_INLINE(1);
 
 	/* Count the new session in the queue's per-cipher statistics. */
 	TXQ_LOCK(txq);
 	if (tlsp->enc_mode == SCMD_CIPH_MODE_AES_GCM)
 		txq->kern_tls_gcm++;
 	else
 		txq->kern_tls_cbc++;
 	TXQ_UNLOCK(txq);
 	*pt = &tlsp->com;
 	return (0);
 
 failed:
 	/*
 	 * Free the atid if it is still ours and drop the tag reference.
 	 * NOTE(review): any further teardown is presumably done by
 	 * t6_tls_tag_free() once the last reference goes away; it is not
 	 * visible in this part of the file.
 	 */
 	if (atid >= 0)
 		free_atid(sc, atid);
 	m_snd_tag_rele(&tlsp->com);
 	return (error);
 }
 
 /*
  * Prepare the key context for a connection and, for connections that
  * do not use an inline key, queue a work request that stores it on
  * the NIC.
  *
  * Returns 0 on success (immediately for inline-key connections),
  * ENOMEM if the work request mbuf cannot be allocated, or the error
  * from mp_ring_enqueue() (the mbuf is freed in that case).
  */
 static int
 ktls_setup_keys(struct tlspcb *tlsp, const struct ktls_session *tls,
     struct sge_txq *txq)
 {
 	struct tls_key_req *key_wr;
 	struct tls_keyctx *key_ctx;
 	struct mbuf *m;
 	void *items[1];
 	int error;
 
 	/*
 	 * The salt and keys always go into tlsp->keyctx.  Inline-key
 	 * connections pass that context as immediate data with every
 	 * work request, so nothing further is needed for them.  All
 	 * other connections store a copy in DDR via a work request.
 	 */
 	t4_tls_key_ctx(tls, KTLS_TX, &tlsp->keyctx);
 	if (tlsp->inline_key)
 		return (0);
 
 	/* Build the key work request in a fresh, zeroed mbuf. */
 	m = alloc_wr_mbuf(TLS_KEY_WR_SZ, M_NOWAIT);
 	if (m == NULL) {
 		CTR2(KTR_CXGBE, "%s: tid %d failed to alloc WR mbuf", __func__,
 		    tlsp->tid);
 		return (ENOMEM);
 	}
 	m->m_pkthdr.snd_tag = m_snd_tag_ref(&tlsp->com);
 	m->m_pkthdr.csum_flags |= CSUM_SND_TAG;
 
 	key_wr = mtod(m, void *);
 	memset(key_wr, 0, TLS_KEY_WR_SZ);
 	t4_write_tlskey_wr(tls, KTLS_TX, tlsp->tid, 0, tlsp->tx_key_addr,
 	    key_wr);
 
 	/* The key context follows the request header. */
 	key_ctx = (struct tls_keyctx *)(key_wr + 1);
 	memcpy(key_ctx, &tlsp->keyctx, sizeof(*key_ctx));
 
 	/*
 	 * Queue the request on the transmit ring so that it reaches
 	 * the NIC ahead of any TLS packets for this session.
 	 */
 	items[0] = m;
 	error = mp_ring_enqueue(txq->r, items, 1, 1);
 	if (error == 0)
 		CTR2(KTR_CXGBE, "%s: tid %d sent key WR", __func__, tlsp->tid);
 	else
 		m_free(m);
 	return (error);
 }
 
 /*
  * Size in bytes of the fixed part of a TLS work request: the
  * FW_ULPTX_WR, ULP_TXPKT, ULPTX_IDATA and CPL_TX_SEC_PDU headers, the
  * key material (either the inline key context or a memory-read
  * sub-command plus another ULPTX_IDATA), and the CPL_TX_DATA header.
  */
 static u_int
 ktls_base_wr_size(struct tlspcb *tlsp)
 {
 	u_int key_len, total;
 
 	if (tlsp->inline_key)
 		key_len = tlsp->tx_key_info_size;
 	else
 		key_len = sizeof(struct ulptx_sc_memrd) +
 		    sizeof(struct ulptx_idata);
 
 	total = sizeof(struct fw_ulptx_wr);
 	total += sizeof(struct ulp_txpkt);
 	total += sizeof(struct ulptx_idata);
 	total += sizeof(struct cpl_tx_sec_pdu);
 	total += key_len;
 	total += sizeof(struct cpl_tx_data);
 	return (total);
 }
 
 /*
  * How many bytes of TCP payload to send for a given TLS record.
  *
  * The value is measured from the start of the TLS header.  A request
  * that ends exactly at the end of the record is returned unchanged.
  * Any other request is trimmed so that it contains no trailer bytes
  * and, for AES-CBC, so that the part after the header is a whole
  * number of cipher blocks.
  */
 static u_int
 ktls_tcp_payload_length(struct tlspcb *tlsp, struct mbuf *m_tls)
 {
 	struct tls_record_layer *rec;
 	u_int body_len, req_end, rec_end, trailer_start;
 
 	M_ASSERTEXTPG(m_tls);
 	rec = (void *)m_tls->m_epg_hdr;
 	body_len = ntohs(rec->tls_length);
 
 	/* End of the range of the record that the mbuf asks to send. */
 	req_end = mtod(m_tls, vm_offset_t) + m_tls->m_len;
 
 	/* Complete records are always sent in full. */
 	rec_end = TLS_HEADER_LENGTH + body_len;
 	if (req_end == rec_end)
 		return (req_end);
 
 	/*
 	 * A partial trailer cannot be sent, so a request that reaches
 	 * into the trailer is cut back to where the trailer begins.
 	 */
 	trailer_start = rec_end - m_tls->m_epg_trllen;
 	if (req_end > trailer_start)
 		req_end = trailer_start;
 
 	/* AES-CBC can only send whole cipher blocks after the header. */
 	if (tlsp->enc_mode == SCMD_CIPH_MODE_AES_CBC &&
 	    req_end > TLS_HEADER_LENGTH)
 		req_end = TLS_HEADER_LENGTH +
 		    rounddown(req_end - TLS_HEADER_LENGTH, AES_BLOCK_LEN);
 
 #ifdef VERBOSE_TRACES
 	CTR4(KTR_CXGBE, "%s: tid %d short TLS record (%u vs %u)",
 	    __func__, tlsp->tid, req_end, TLS_HEADER_LENGTH + body_len);
 #endif
 	return (req_end);
 }
 
 /*
  * Offset into the payload of a "short" TLS record at which sending
  * starts.  The TLS header is not counted, and a non-zero result means
  * that no header is sent.
  *
  * The result is 0 unless the cipher is AES-GCM and the request starts
  * beyond the header; in that case it is the request's payload offset,
  * capped and rounded down to a multiple of AES_BLOCK_LEN.
  */
 static u_int
 ktls_payload_offset(struct tlspcb *tlsp, struct mbuf *m_tls)
 {
 	struct tls_record_layer *rec;
 	u_int body_len, skip;
 #ifdef INVARIANTS
 	u_int req_end;
 #endif
 
 	M_ASSERTEXTPG(m_tls);
 	rec = (void *)m_tls->m_epg_hdr;
 	body_len = ntohs(rec->tls_length);
 #ifdef INVARIANTS
 	/* Callers only ask about records that are not sent in full. */
 	req_end = mtod(m_tls, vm_offset_t) + m_tls->m_len;
 	MPASS(req_end < TLS_HEADER_LENGTH + body_len);
 #endif
 
 	if (tlsp->enc_mode != SCMD_CIPH_MODE_AES_GCM ||
 	    mtod(m_tls, vm_offset_t) <= m_tls->m_epg_hdrlen)
 		return (0);
 
 	/*
 	 * Something must always be sent.  This path never sends the
 	 * tag, so a request that starts inside the tag would otherwise
 	 * send nothing; capping the offset at the last payload byte
 	 * makes such a request send the last cipher block.
 	 */
 	skip = min(mtod(m_tls, vm_offset_t) - m_tls->m_epg_hdrlen,
 	    (body_len - TLS_HEADER_LENGTH - m_tls->m_epg_trllen) - 1);
 	return (rounddown(skip, AES_BLOCK_LEN));
 }
 
 /*
  * Size in bytes of a ULPTX SGL describing 'nsegs' segments.  The first
  * segment lives in struct ulptx_sgl itself; the remaining ones take
  * three 8-byte units per pair, plus two units for an unpaired last
  * segment.
  */
 static u_int
 ktls_sgl_size(u_int nsegs)
 {
 	u_int extra, total;
 
 	/* Segments beyond the one embedded in ulptx_sgl. */
 	extra = nsegs - 1;
 
 	total = sizeof(struct ulptx_sgl);
 	total += 8 * ((3 * extra) / 2 + (extra & 1));
 	return (total);
 }
 
 /*
  * Compute the length in bytes of the work request needed to send the
  * TLS record referenced by 'm_tls', and store the number of
  * scatter/gather segments for its payload in *nsegsp.
  *
  * 'm' is the header mbuf of the packet; only its length is used here,
  * and only for a request that covers nothing beyond the TLS header.
  * Such a request is sized as a plain packet sent as immediate data and
  * *nsegsp is set to a placeholder value of 1.
  */
 static int
 ktls_wr_len(struct tlspcb *tlsp, struct mbuf *m, struct mbuf *m_tls,
     int *nsegsp)
 {
 	struct tls_record_layer *hdr;
 	u_int imm_len, offset, plen, wr_len, tlen;
 
 	M_ASSERTEXTPG(m_tls);
 
 	/*
 	 * Determine how many bytes of this TLS record, counted from the
 	 * start of the TLS header, are to be sent.
 	 */
 	tlen = ktls_tcp_payload_length(tlsp, m_tls);
 	if (tlen <= m_tls->m_epg_hdrlen) {
 		/*
 		 * For requests that only want to send the TLS header,
 		 * send a tunnelled packet as immediate data.
 		 */
 		wr_len = sizeof(struct fw_eth_tx_pkt_wr) +
 		    sizeof(struct cpl_tx_pkt_core) +
 		    roundup2(m->m_len + m_tls->m_len, 16);
 		/* Only traced here; the oversized length is still returned. */
 		if (wr_len > SGE_MAX_WR_LEN) {
 			CTR3(KTR_CXGBE,
 		    "%s: tid %d TLS header-only packet too long (len %d)",
 			    __func__, tlsp->tid, m->m_len + m_tls->m_len);
 		}
 
 		/* This should always be the last TLS record in a chain. */
 		MPASS(m_tls->m_next == NULL);
 
 		/*
 		 * XXX: Set a bogus 'nsegs' value to avoid tripping an
 		 * assertion in mbuf_nsegs() in t4_sge.c.
 		 */
 		*nsegsp = 1;
 		return (wr_len);
 	}
 
 	/*
 	 * 'plen' starts as the record length without the trailer.  For a
 	 * short record (tlen smaller than that) it is reduced to 'tlen'.
 	 */
 	hdr = (void *)m_tls->m_epg_hdr;
 	plen = TLS_HEADER_LENGTH + ntohs(hdr->tls_length) - m_tls->m_epg_trllen;
 	if (tlen < plen) {
 		plen = tlen;
 		offset = ktls_payload_offset(tlsp, m_tls);
 	} else
 		offset = 0;
 
 	/* Calculate the size of the work request. */
 	wr_len = ktls_base_wr_size(tlsp);
 
 	/*
 	 * Full records and short records with an offset of 0 include
 	 * the TLS header as immediate data.  Short records include a
 	 * raw AES IV as immediate data.
 	 */
 	imm_len = 0;
 	if (offset == 0)
 		imm_len += m_tls->m_epg_hdrlen;
 	/* plen == tlen only when plen was reduced above (short record). */
 	if (plen == tlen)
 		imm_len += AES_BLOCK_LEN;
 	wr_len += roundup2(imm_len, 16);
 
 	/* TLS record payload via DSGL. */
 	*nsegsp = sglist_count_mbuf_epg(m_tls, m_tls->m_epg_hdrlen + offset,
 	    plen - (m_tls->m_epg_hdrlen + offset));
 	wr_len += ktls_sgl_size(*nsegsp);
 
 	wr_len = roundup2(wr_len, 16);
 	return (wr_len);
 }
 
 /*
  * See if we have any TCP options requiring a dedicated options-only
  * packet.
  *
  * Returns 1 as soon as a well-formed option other than NOP or
  * TIMESTAMP is found.  Returns 0 when the walk reaches the end of the
  * option space, an EOL option, or a malformed option without finding
  * one.
  */
 static int
 ktls_has_tcp_options(struct tcphdr *tcp)
 {
 	u_char *optp;
 	int kind, remaining, size;
 
 	optp = (u_char *)(tcp + 1);
 	remaining = tcp->th_off * 4 - sizeof(struct tcphdr);
 	while (remaining > 0) {
 		kind = optp[0];
 		if (kind == TCPOPT_EOL)
 			break;
 		if (kind == TCPOPT_NOP) {
 			/* Single-byte option with no length field. */
 			size = 1;
 		} else {
 			/* Stop on a truncated or inconsistent length. */
 			if (remaining < 2)
 				break;
 			size = optp[1];
 			if (size < 2 || size > remaining)
 				break;
 			if (kind != TCPOPT_TIMESTAMP)
 				return (1);
 		}
 		remaining -= size;
 		optp += size;
 	}
 	return (0);
 }
 
 /*
  * Find the TCP timestamp option.
  *
  * Returns a pointer to the data of the first timestamp option whose
  * length is TCPOLEN_TIMESTAMP (just past its kind and length bytes),
  * or NULL if the walk ends first: at the end of the option space, at
  * an EOL option, or at a malformed option.
  */
 static void *
 ktls_find_tcp_timestamps(struct tcphdr *tcp)
 {
 	u_char *optp;
 	int kind, remaining, size;
 
 	optp = (u_char *)(tcp + 1);
 	remaining = tcp->th_off * 4 - sizeof(struct tcphdr);
 	while (remaining > 0) {
 		kind = optp[0];
 		if (kind == TCPOPT_EOL)
 			break;
 		if (kind == TCPOPT_NOP) {
 			/* Single-byte option with no length field. */
 			size = 1;
 		} else {
 			/* Stop on a truncated or inconsistent length. */
 			if (remaining < 2)
 				break;
 			size = optp[1];
 			if (size < 2 || size > remaining)
 				break;
 			if (kind == TCPOPT_TIMESTAMP &&
 			    size == TCPOLEN_TIMESTAMP)
 				return (optp + 2);
 		}
 		remaining -= size;
 		optp += size;
 	}
 	return (NULL);
 }
 
 /*
  * Validate a TLS packet chain and compute the transmit resources it
  * needs.
  *
  * 'm' is the header mbuf, which must hold exactly the Ethernet, IPv4
  * or IPv6, and TCP headers; every following mbuf is expected to
  * reference a TLS record.  On success the l2hlen/l3hlen/l4hlen fields
  * of m's packet header are filled in, *nsegsp receives the segment
  * count of the first TLS record, and *len16p receives the total space
  * to reserve in 16-byte units.  That total covers one work request per
  * TLS record, an optional options-only packet, one descriptor for an
  * L2T update, and a work request of TCB field updates.
  *
  * Returns 0 on success, EINVAL if the headers are not as expected or
  * the options-only packet would be too long, or EFBIG if a TLS record
  * needs a work request or segment count beyond the supported limits.
  */
 int
 t6_ktls_parse_pkt(struct mbuf *m, int *nsegsp, int *len16p)
 {
 	struct tlspcb *tlsp;
 	struct ether_header *eh;
 	struct ip *ip;
 	struct ip6_hdr *ip6;
 	struct tcphdr *tcp;
 	struct mbuf *m_tls;
 	int nsegs;
 	u_int wr_len, tot_len;
 
 	/*
 	 * Locate headers in initial mbuf.
 	 *
 	 * XXX: This assumes all of the headers are in the initial mbuf.
 	 * Could perhaps use m_advance() like parse_pkt() if that turns
 	 * out to not be true.
 	 */
 	M_ASSERTPKTHDR(m);
 	MPASS(m->m_pkthdr.snd_tag != NULL);
 	tlsp = mst_to_tls(m->m_pkthdr.snd_tag);
 
 	if (m->m_len <= sizeof(*eh) + sizeof(*ip)) {
 		CTR2(KTR_CXGBE, "%s: tid %d header mbuf too short", __func__,
 		    tlsp->tid);
 		return (EINVAL);
 	}
 	eh = mtod(m, struct ether_header *);
 	if (ntohs(eh->ether_type) != ETHERTYPE_IP &&
 	    ntohs(eh->ether_type) != ETHERTYPE_IPV6) {
 		CTR2(KTR_CXGBE, "%s: tid %d mbuf not ETHERTYPE_IP{,V6}",
 		    __func__, tlsp->tid);
 		return (EINVAL);
 	}
 	m->m_pkthdr.l2hlen = sizeof(*eh);
 
 	/* XXX: Reject unsupported IP options? */
 	if (ntohs(eh->ether_type) == ETHERTYPE_IP) {
 		ip = (struct ip *)(eh + 1);
 		if (ip->ip_p != IPPROTO_TCP) {
 			CTR2(KTR_CXGBE, "%s: tid %d mbuf not IPPROTO_TCP",
 			    __func__, tlsp->tid);
 			return (EINVAL);
 		}
 		m->m_pkthdr.l3hlen = ip->ip_hl * 4;
 	} else {
 		ip6 = (struct ip6_hdr *)(eh + 1);
 		if (ip6->ip6_nxt != IPPROTO_TCP) {
 			CTR3(KTR_CXGBE, "%s: tid %d mbuf not IPPROTO_TCP (%u)",
 			    __func__, tlsp->tid, ip6->ip6_nxt);
 			return (EINVAL);
 		}
 		m->m_pkthdr.l3hlen = sizeof(struct ip6_hdr);
 	}
 	/* The fixed part of the TCP header must also be in this mbuf. */
 	if (m->m_len < m->m_pkthdr.l2hlen + m->m_pkthdr.l3hlen +
 	    sizeof(*tcp)) {
 		CTR2(KTR_CXGBE, "%s: tid %d header mbuf too short (2)",
 		    __func__, tlsp->tid);
 		return (EINVAL);
 	}
 	tcp = (struct tcphdr *)((char *)(eh + 1) + m->m_pkthdr.l3hlen);
 	m->m_pkthdr.l4hlen = tcp->th_off * 4;
 
 	/* Bail if there is TCP payload before the TLS record. */
 	if (m->m_len != m->m_pkthdr.l2hlen + m->m_pkthdr.l3hlen +
 	    m->m_pkthdr.l4hlen) {
 		CTR6(KTR_CXGBE,
 		    "%s: tid %d header mbuf bad length (%d + %d + %d != %d)",
 		    __func__, tlsp->tid, m->m_pkthdr.l2hlen,
 		    m->m_pkthdr.l3hlen, m->m_pkthdr.l4hlen, m->m_len);
 		return (EINVAL);
 	}
 
 	/* Assume all headers are in 'm' for now. */
 	MPASS(m->m_next != NULL);
 	MPASS(m->m_next->m_flags & M_EXTPG);
 
 	tot_len = 0;
 
 	/*
 	 * Each of the remaining mbufs in the chain should reference a
 	 * TLS record.
 	 */
 	*nsegsp = 0;
 	for (m_tls = m->m_next; m_tls != NULL; m_tls = m_tls->m_next) {
 		MPASS(m_tls->m_flags & M_EXTPG);
 
 		wr_len = ktls_wr_len(tlsp, m, m_tls, &nsegs);
 #ifdef VERBOSE_TRACES
 		CTR4(KTR_CXGBE, "%s: tid %d wr_len %d nsegs %d", __func__,
 		    tlsp->tid, wr_len, nsegs);
 #endif
 		if (wr_len > SGE_MAX_WR_LEN || nsegs > TX_SGL_SEGS)
 			return (EFBIG);
 		/* Each work request is rounded up to whole descriptors. */
 		tot_len += roundup2(wr_len, EQ_ESIZE);
 
 		/*
 		 * Store 'nsegs' for the first TLS record in the
 		 * header mbuf's metadata.
 		 */
 		if (*nsegsp == 0)
 			*nsegsp = nsegs;
 	}
 
 	MPASS(tot_len != 0);
 
 	/*
 	 * See if we have any TCP options or a FIN requiring a
 	 * dedicated packet.
 	 */
 	if ((tcp->th_flags & TH_FIN) != 0 || ktls_has_tcp_options(tcp)) {
 		wr_len = sizeof(struct fw_eth_tx_pkt_wr) +
 		    sizeof(struct cpl_tx_pkt_core) + roundup2(m->m_len, 16);
 		if (wr_len > SGE_MAX_WR_LEN) {
 			CTR3(KTR_CXGBE,
 			    "%s: tid %d options-only packet too long (len %d)",
 			    __func__, tlsp->tid, m->m_len);
 			return (EINVAL);
 		}
 		tot_len += roundup2(wr_len, EQ_ESIZE);
 	}
 
 	/* Include room for a TP work request to program an L2T entry. */
 	tot_len += EQ_ESIZE;
 
 	/*
 	 * Include room for a ULPTX work request including up to 5
 	 * CPL_SET_TCB_FIELD commands before the first TLS work
 	 * request.
 	 */
 	wr_len = sizeof(struct fw_ulptx_wr) +
 	    5 * roundup2(LEN__SET_TCB_FIELD_ULP, 16);
 
 	/*
 	 * If timestamps are present, reserve 1 more command for
 	 * setting the echoed timestamp.
 	 */
 	if (tlsp->using_timestamps)
 		wr_len += roundup2(LEN__SET_TCB_FIELD_ULP, 16);
 
 	tot_len += roundup2(wr_len, EQ_ESIZE);
 
 	/* Report the reservation in 16-byte units. */
 	*len16p = tot_len / 16;
 #ifdef VERBOSE_TRACES
 	CTR4(KTR_CXGBE, "%s: tid %d len16 %d nsegs %d", __func__,
 	    tlsp->tid, *len16p, *nsegsp);
 #endif
 	return (0);
 }
 
 /*
  * Write the scatter/gather list 'gl' at 'to' as a ULP_TX_SC_DSGL
  * command (struct ulptx_sgl).
  *
  * 'to' must be 16-byte aligned and 'gl' must hold at least one
  * segment.  The first segment is stored in len0/addr0 and the rest in
  * pairs in sge[]; when the last pair has only one segment, its unused
  * length is set to 0.
  *
  * If the SGL ends on an address that is not 16 byte aligned, this
  * function will add a 0 filled flit (8 bytes) at the end.
  */
 static void
 write_gl_to_buf(struct sglist *gl, caddr_t to)
 {
 	struct sglist_seg *seg;
 	__be64 *flitp;
 	struct ulptx_sgl *usgl;
 	int i, nflits, nsegs;
 
 	KASSERT(((uintptr_t)to & 0xf) == 0,
 	    ("%s: SGL must start at a 16 byte boundary: %p", __func__, to));
 
 	nsegs = gl->sg_nseg;
 	MPASS(nsegs > 0);
 
 	/*
 	 * Two flits for the command word, len0 and addr0; every further
 	 * pair of segments takes three flits and an unpaired one takes two.
 	 */
 	nflits = (3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1) + 2;
 	flitp = (__be64 *)to;
 	seg = &gl->sg_segs[0];
 	usgl = (void *)flitp;
 
 	usgl->cmd_nsge = htobe32(V_ULPTX_CMD(ULP_TX_SC_DSGL) |
 	    V_ULPTX_NSGE(nsegs));
 	usgl->len0 = htobe32(seg->ss_len);
 	usgl->addr0 = htobe64(seg->ss_paddr);
 	seg++;
 
 	/* Remaining segments, two per sge[] entry. */
 	for (i = 0; i < nsegs - 1; i++, seg++) {
 		usgl->sge[i / 2].len[i & 1] = htobe32(seg->ss_len);
 		usgl->sge[i / 2].addr[i & 1] = htobe64(seg->ss_paddr);
 	}
 	/* Zero the unused second length of a half-filled last entry. */
 	if (i & 1)
 		usgl->sge[i / 2].len[1] = htobe32(0);
 	flitp += nflits;
 
 	/* An odd flit count ends mid-way through a 16-byte unit; pad it. */
 	if (nflits & 1) {
 		MPASS(((uintptr_t)flitp) & 0xf);
 		*flitp++ = 0;
 	}
 
 	MPASS((((uintptr_t)flitp) & 0xf) == 0);
 }
 
 /*
  * Copy 'len' bytes from 'from' into the descriptor ring of 'eq' at
  * *to, wrapping around to the start of the ring when the end is
  * reached.  *to is advanced past the copied bytes; if it lands exactly
  * on the end of the ring it is reset to the start.
  */
 static inline void
 copy_to_txd(struct sge_eq *eq, caddr_t from, caddr_t *to, int len)
 {
 	const uintptr_t ring_end = (uintptr_t)&eq->desc[eq->sidx];
 	int head, tail;
 
 	MPASS((uintptr_t)(*to) >= (uintptr_t)&eq->desc[0]);
 	MPASS((uintptr_t)(*to) < ring_end);
 
 	if (__predict_false((uintptr_t)(*to) + len > ring_end)) {
 		/* Split the copy: up to the ring end, then from the start. */
 		head = ring_end - (uintptr_t)(*to);
 		tail = len - head;
 
 		bcopy(from, *to, head);
 		bcopy(from + head, (void *)eq->desc, tail);
 		(*to) = (caddr_t)eq->desc + tail;
 		return;
 	}
 
 	bcopy(from, *to, len);
 	(*to) += len;
 	if ((uintptr_t)(*to) == ring_end)
 		(*to) = (caddr_t)eq->desc;
 }
 
 /*
  * Write a FW_ETH_TX_PKT_WR at 'dst' that carries only the headers held in
  * 'm' (m->m_len bytes) as immediate data.
  *
  * The IP total length / IPv6 payload length is rewritten to describe the
  * headers-only packet, and PUSH and FIN are cleared in the copied TCP
  * header.  The software descriptor at 'pidx' records no mbuf.
  *
  * 'available' is only checked by assertion.  Returns the number of TX
  * descriptors used by the work request.
  */
 static int
 ktls_write_tcp_options(struct sge_txq *txq, void *dst, struct mbuf *m,
     u_int available, u_int pidx)
 {
 	struct tx_sdesc *txsd;
 	struct fw_eth_tx_pkt_wr *wr;
 	struct cpl_tx_pkt_core *cpl;
 	uint32_t ctrl;
 	uint64_t ctrl1;
 	int len16, ndesc, pktlen;
 	struct ether_header *eh;
 	struct ip *ip, newip;
 	struct ip6_hdr *ip6, newip6;
 	struct tcphdr *tcp, newtcp;
 	caddr_t out;
 
 	TXQ_LOCK_ASSERT_OWNED(txq);
 	M_ASSERTPKTHDR(m);
 
 	wr = dst;
 	pktlen = m->m_len;
 	ctrl = sizeof(struct cpl_tx_pkt_core) + pktlen;
 	len16 = howmany(sizeof(struct fw_eth_tx_pkt_wr) + ctrl, 16);
 	ndesc = tx_len16_to_desc(len16);
 	MPASS(ndesc <= available);
 
 	/* Firmware work request header */
 	wr->op_immdlen = htobe32(V_FW_WR_OP(FW_ETH_TX_PKT_WR) |
 	    V_FW_ETH_TX_PKT_WR_IMMDLEN(ctrl));
 
 	ctrl = V_FW_WR_LEN16(len16);
 	wr->equiq_to_len16 = htobe32(ctrl);
 	wr->r3 = 0;
 
 	cpl = (void *)(wr + 1);
 
 	/* CPL header */
 	cpl->ctrl0 = txq->cpl_ctrl0;
 	cpl->pack = 0;
 	cpl->len = htobe16(pktlen);
 
 	out = (void *)(cpl + 1);
 
 	/* Copy over Ethernet header. */
 	eh = mtod(m, struct ether_header *);
 	copy_to_txd(&txq->eq, (caddr_t)eh, &out, m->m_pkthdr.l2hlen);
 
 	/* Fixup length in IP header and copy out. */
 	if (ntohs(eh->ether_type) == ETHERTYPE_IP) {
 		ip = (void *)((char *)eh + m->m_pkthdr.l2hlen);
 		newip = *ip;
 		newip.ip_len = htons(pktlen - m->m_pkthdr.l2hlen);
 		copy_to_txd(&txq->eq, (caddr_t)&newip, &out, sizeof(newip));
 		if (m->m_pkthdr.l3hlen > sizeof(*ip))
 			copy_to_txd(&txq->eq, (caddr_t)(ip + 1), &out,
 			    m->m_pkthdr.l3hlen - sizeof(*ip));
 		ctrl1 = V_TXPKT_CSUM_TYPE(TX_CSUM_TCPIP) |
 		    V_T6_TXPKT_ETHHDR_LEN(m->m_pkthdr.l2hlen - ETHER_HDR_LEN) |
 		    V_TXPKT_IPHDR_LEN(m->m_pkthdr.l3hlen);
 	} else {
 		ip6 = (void *)((char *)eh + m->m_pkthdr.l2hlen);
 		newip6 = *ip6;
 
 		/*
 		 * Unlike the IPv4 total length, the IPv6 payload length
 		 * does not count the fixed IPv6 header itself.
 		 */
 		newip6.ip6_plen = htons(pktlen - m->m_pkthdr.l2hlen -
 		    sizeof(*ip6));
 		copy_to_txd(&txq->eq, (caddr_t)&newip6, &out, sizeof(newip6));
 		MPASS(m->m_pkthdr.l3hlen == sizeof(*ip6));
 		ctrl1 = V_TXPKT_CSUM_TYPE(TX_CSUM_TCPIP6) |
 		    V_T6_TXPKT_ETHHDR_LEN(m->m_pkthdr.l2hlen - ETHER_HDR_LEN) |
 		    V_TXPKT_IPHDR_LEN(m->m_pkthdr.l3hlen);
 	}
 	cpl->ctrl1 = htobe64(ctrl1);
 	txq->txcsum++;
 
 	/* Clear PUSH and FIN in the TCP header if present. */
 	tcp = (void *)((char *)eh + m->m_pkthdr.l2hlen + m->m_pkthdr.l3hlen);
 	newtcp = *tcp;
 	newtcp.th_flags &= ~(TH_PUSH | TH_FIN);
 	copy_to_txd(&txq->eq, (caddr_t)&newtcp, &out, sizeof(newtcp));
 
 	/* Copy rest of packet. */
 	copy_to_txd(&txq->eq, (caddr_t)(tcp + 1), &out, pktlen -
 	    (m->m_pkthdr.l2hlen + m->m_pkthdr.l3hlen + sizeof(*tcp)));
 	txq->imm_wrs++;
 
 	txq->txpkt_wrs++;
 
 	txq->kern_tls_options++;
 
 	/* This work request does not record an mbuf to free. */
 	txsd = &txq->sdesc[pidx];
 	txsd->m = NULL;
 	txsd->desc_used = ndesc;
 
 	return (ndesc);
 }
 
 /*
  * Write a FW_ETH_TX_PKT_WR at 'dst' that sends the headers in 'm'
  * followed by m_tls->m_len bytes of the TLS header template of 'm_tls'
  * (starting at the offset stored in m_tls's m_data) as immediate data.
  *
  * The IP total length / IPv6 payload length is rewritten for the new
  * packet size and the TCP sequence number is set to 'tcp_seqno' plus
  * that offset.  'm_tls' must be the last mbuf in its chain.  The
  * software descriptor at 'pidx' records 'm'.
  *
  * 'available' is only checked by assertion.  Returns the number of TX
  * descriptors used by the work request.
  */
 static int
 ktls_write_tunnel_packet(struct sge_txq *txq, void *dst, struct mbuf *m,
     struct mbuf *m_tls, u_int available, tcp_seq tcp_seqno, u_int pidx)
 {
 	struct tx_sdesc *txsd;
 	struct fw_eth_tx_pkt_wr *wr;
 	struct cpl_tx_pkt_core *cpl;
 	uint32_t ctrl;
 	uint64_t ctrl1;
 	int len16, ndesc, pktlen;
 	struct ether_header *eh;
 	struct ip *ip, newip;
 	struct ip6_hdr *ip6, newip6;
 	struct tcphdr *tcp, newtcp;
 	caddr_t out;
 
 	TXQ_LOCK_ASSERT_OWNED(txq);
 	M_ASSERTPKTHDR(m);
 
 	/* Locate the template TLS header. */
 	M_ASSERTEXTPG(m_tls);
 
 	/* This should always be the last TLS record in a chain. */
 	MPASS(m_tls->m_next == NULL);
 
 	wr = dst;
 	pktlen = m->m_len + m_tls->m_len;
 	ctrl = sizeof(struct cpl_tx_pkt_core) + pktlen;
 	len16 = howmany(sizeof(struct fw_eth_tx_pkt_wr) + ctrl, 16);
 	ndesc = tx_len16_to_desc(len16);
 	MPASS(ndesc <= available);
 
 	/* Firmware work request header */
 	wr->op_immdlen = htobe32(V_FW_WR_OP(FW_ETH_TX_PKT_WR) |
 	    V_FW_ETH_TX_PKT_WR_IMMDLEN(ctrl));
 
 	ctrl = V_FW_WR_LEN16(len16);
 	wr->equiq_to_len16 = htobe32(ctrl);
 	wr->r3 = 0;
 
 	cpl = (void *)(wr + 1);
 
 	/* CPL header */
 	cpl->ctrl0 = txq->cpl_ctrl0;
 	cpl->pack = 0;
 	cpl->len = htobe16(pktlen);
 
 	out = (void *)(cpl + 1);
 
 	/* Copy over Ethernet header. */
 	eh = mtod(m, struct ether_header *);
 	copy_to_txd(&txq->eq, (caddr_t)eh, &out, m->m_pkthdr.l2hlen);
 
 	/* Fixup length in IP header and copy out. */
 	if (ntohs(eh->ether_type) == ETHERTYPE_IP) {
 		ip = (void *)((char *)eh + m->m_pkthdr.l2hlen);
 		newip = *ip;
 		newip.ip_len = htons(pktlen - m->m_pkthdr.l2hlen);
 		copy_to_txd(&txq->eq, (caddr_t)&newip, &out, sizeof(newip));
 		if (m->m_pkthdr.l3hlen > sizeof(*ip))
 			copy_to_txd(&txq->eq, (caddr_t)(ip + 1), &out,
 			    m->m_pkthdr.l3hlen - sizeof(*ip));
 		ctrl1 = V_TXPKT_CSUM_TYPE(TX_CSUM_TCPIP) |
 		    V_T6_TXPKT_ETHHDR_LEN(m->m_pkthdr.l2hlen - ETHER_HDR_LEN) |
 		    V_TXPKT_IPHDR_LEN(m->m_pkthdr.l3hlen);
 	} else {
 		ip6 = (void *)((char *)eh + m->m_pkthdr.l2hlen);
 		newip6 = *ip6;
 
 		/*
 		 * Unlike the IPv4 total length, the IPv6 payload length
 		 * does not count the fixed IPv6 header itself.
 		 */
 		newip6.ip6_plen = htons(pktlen - m->m_pkthdr.l2hlen -
 		    sizeof(*ip6));
 		copy_to_txd(&txq->eq, (caddr_t)&newip6, &out, sizeof(newip6));
 		MPASS(m->m_pkthdr.l3hlen == sizeof(*ip6));
 		ctrl1 = V_TXPKT_CSUM_TYPE(TX_CSUM_TCPIP6) |
 		    V_T6_TXPKT_ETHHDR_LEN(m->m_pkthdr.l2hlen - ETHER_HDR_LEN) |
 		    V_TXPKT_IPHDR_LEN(m->m_pkthdr.l3hlen);
 	}
 	cpl->ctrl1 = htobe64(ctrl1);
 	txq->txcsum++;
 
 	/* Set sequence number in TCP header. */
 	tcp = (void *)((char *)eh + m->m_pkthdr.l2hlen + m->m_pkthdr.l3hlen);
 	newtcp = *tcp;
 	newtcp.th_seq = htonl(tcp_seqno + mtod(m_tls, vm_offset_t));
 	copy_to_txd(&txq->eq, (caddr_t)&newtcp, &out, sizeof(newtcp));
 
 	/* Copy rest of TCP header. */
 	copy_to_txd(&txq->eq, (caddr_t)(tcp + 1), &out, m->m_len -
 	    (m->m_pkthdr.l2hlen + m->m_pkthdr.l3hlen + sizeof(*tcp)));
 
 	/* Copy the subset of the TLS header requested. */
 	copy_to_txd(&txq->eq, (char *)m_tls->m_epg_hdr +
 	    mtod(m_tls, vm_offset_t), &out, m_tls->m_len);
 	txq->imm_wrs++;
 
 	txq->txpkt_wrs++;
 
 	txq->kern_tls_header++;
 
 	/* Record 'm' in the software descriptor for this work request. */
 	txsd = &txq->sdesc[pidx];
 	txsd->m = m;
 	txsd->desc_used = ndesc;
 
 	return (ndesc);
 }
 
 /*
  * Compile-time checks: a struct cpl_set_tcb_field must not be larger
  * than EQ_ESIZE bytes, and the SND_UNA_RAW and SND_NXT_RAW TCB fields
  * must share the same word index.
  */
 _Static_assert(sizeof(struct cpl_set_tcb_field) <= EQ_ESIZE,
     "CPL_SET_TCB_FIELD must be smaller than a single TX descriptor");
 _Static_assert(W_TCB_SND_UNA_RAW == W_TCB_SND_NXT_RAW,
     "SND_NXT_RAW and SND_UNA_RAW are in different words");
 
 /*
  * Write the work request(s) needed to transmit one TLS record mbuf
  * ('m_tls') of the chain headed by 'm'.
  *
  * If only (part of) the TLS header is requested, the work is delegated
  * to ktls_write_tunnel_packet().  Otherwise a FW_ULPTX_WR is built that
  * contains a CPL_TX_SEC_PDU, the key context (inline or read via
  * ULPTX_SC_MEMRD), a CPL_TX_DATA, optional immediate data (TLS header
  * and/or IV) and an SGL for the record payload.  Any required TCB field
  * updates are written first, either in the same work request or as a
  * separate FW_ULPTX_WR placed in front of it.
  *
  * 'nsegs' is the cached SGL segment count; 0 means it is recomputed
  * here.  'tsopt', when not NULL, points at the TCP timestamp option
  * values (tsopt[1] is compared against tlsp->prev_tsecr).  'pidx' is
  * the ring index corresponding to 'dst'.
  *
  * Side effects: updates tlsp->prev_seq, prev_ack, prev_win, prev_tsecr
  * and prev_mss as well as several txq statistics counters.
  *
  * Returns the total number of TX descriptors used, including those of a
  * separate field-update work request.
  */
 static int
 ktls_write_tls_wr(struct tlspcb *tlsp, struct sge_txq *txq,
     void *dst, struct mbuf *m, struct tcphdr *tcp, struct mbuf *m_tls,
     u_int nsegs, u_int available, tcp_seq tcp_seqno, uint32_t *tsopt,
     u_int pidx, bool set_l2t_idx)
 {
 	struct sge_eq *eq = &txq->eq;
 	struct tx_sdesc *txsd;
 	struct fw_ulptx_wr *wr;
 	struct ulp_txpkt *txpkt;
 	struct ulptx_sc_memrd *memrd;
 	struct ulptx_idata *idata;
 	struct cpl_tx_sec_pdu *sec_pdu;
 	struct cpl_tx_data *tx_data;
 	struct tls_record_layer *hdr;
 	char *iv, *out;
 	u_int aad_start, aad_stop;
 	u_int auth_start, auth_stop, auth_insert;
 	u_int cipher_start, cipher_stop, iv_offset;
 	u_int imm_len, mss, ndesc, offset, plen, tlen, twr_len, wr_len;
 	u_int fields, tx_max_offset, tx_max;
 	bool first_wr, last_wr, using_scratch;
 
 	ndesc = 0;
 	MPASS(tlsp->txq == txq);
 
 	/*
 	 * All-zero prev_seq/prev_ack/prev_win is treated as "no work
 	 * request has been written for this connection yet".
 	 */
 	first_wr = (tlsp->prev_seq == 0 && tlsp->prev_ack == 0 &&
 	    tlsp->prev_win == 0);
 
 	/*
 	 * Use the per-txq scratch pad if near the end of the ring to
 	 * simplify handling of wrap-around.  This uses a simple but
 	 * not quite perfect test of using the scratch buffer if we
 	 * can't fit a maximal work request in without wrapping.
 	 */
 	using_scratch = (eq->sidx - pidx < SGE_MAX_WR_LEN / EQ_ESIZE);
 
 	/* Locate the TLS header. */
 	M_ASSERTEXTPG(m_tls);
 	hdr = (void *)m_tls->m_epg_hdr;
 	/* Length of the full record minus its trailer. */
 	plen = TLS_HEADER_LENGTH + ntohs(hdr->tls_length) - m_tls->m_epg_trllen;
 
 	/* Determine how much of the TLS record to send. */
 	tlen = ktls_tcp_payload_length(tlsp, m_tls);
 	if (tlen <= m_tls->m_epg_hdrlen) {
 		/*
 		 * For requests that only want to send the TLS header,
 		 * send a tunnelled packet as immediate data.
 		 */
 #ifdef VERBOSE_TRACES
 		CTR3(KTR_CXGBE, "%s: tid %d header-only TLS record %u",
 		    __func__, tlsp->tid, (u_int)m_tls->m_epg_seqno);
 #endif
 		return (ktls_write_tunnel_packet(txq, dst, m, m_tls, available,
 		    tcp_seqno, pidx));
 	}
 	if (tlen < plen) {
 		/* Short record: plen == tlen from here on. */
 		plen = tlen;
 		offset = ktls_payload_offset(tlsp, m_tls);
 #ifdef VERBOSE_TRACES
 		CTR4(KTR_CXGBE, "%s: tid %d short TLS record %u with offset %u",
 		    __func__, tlsp->tid, (u_int)m_tls->m_epg_seqno, offset);
 #endif
 		if (m_tls->m_next == NULL && (tcp->th_flags & TH_FIN) != 0) {
 			txq->kern_tls_fin_short++;
 #ifdef INVARIANTS
 			panic("%s: FIN on short TLS record", __func__);
 #endif
 		}
 	} else
 		offset = 0;
 
 	/*
 	 * This is the last work request for a given TLS mbuf chain if
 	 * it is the last mbuf in the chain and FIN is not set.  If
 	 * FIN is set, then ktls_write_tcp_fin() will write out the
 	 * last work request.
 	 */
 	last_wr = m_tls->m_next == NULL && (tcp->th_flags & TH_FIN) == 0;
 
 	/*
 	 * The host stack may ask us to not send part of the start of
 	 * a TLS record.  (For example, the stack might have
 	 * previously sent a "short" TLS record and might later send
 	 * down an mbuf that requests to send the remainder of the TLS
 	 * record.)  The crypto engine must process a TLS record from
 	 * the beginning if computing a GCM tag or HMAC, so we always
 	 * send the TLS record from the beginning as input to the
 	 * crypto engine and via CPL_TX_DATA to TP.  However, TP will
 	 * drop individual packets after they have been chopped up
 	 * into MSS-sized chunks if the entire sequence range of those
 	 * packets is less than SND_UNA.  SND_UNA is computed as
 	 * TX_MAX - SND_UNA_RAW.  Thus, use the offset stored in
 	 * m_data to set TX_MAX to the first byte in the TCP sequence
 	 * space the host actually wants us to send and set
 	 * SND_UNA_RAW to 0.
 	 *
 	 * If the host sends us back to back requests that span the
 	 * trailer of a single TLS record (first request ends "in" the
 	 * trailer and second request starts at the next byte but
 	 * still "in" the trailer), the initial bytes of the trailer
 	 * that the first request drops will not be retransmitted.  If
 	 * the host uses the same requests when retransmitting the
 	 * connection will hang.  To handle this, always transmit the
 	 * full trailer for a request that begins "in" the trailer
 	 * (the second request in the example above).  This should
 	 * also help to avoid retransmits for the common case.
 	 *
 	 * A similar condition exists when using CBC for back to back
 	 * requests that span a single AES block.  The first request
 	 * will be truncated to end at the end of the previous AES
 	 * block.  To handle this, always begin transmission at the
 	 * start of the current AES block.
 	 */
 	tx_max_offset = mtod(m_tls, vm_offset_t);
 	if (tx_max_offset > TLS_HEADER_LENGTH + ntohs(hdr->tls_length) -
 	    m_tls->m_epg_trllen) {
 		/* Always send the full trailer. */
 		tx_max_offset = TLS_HEADER_LENGTH + ntohs(hdr->tls_length) -
 		    m_tls->m_epg_trllen;
 	}
 	if (tlsp->enc_mode == SCMD_CIPH_MODE_AES_CBC &&
 	    tx_max_offset > TLS_HEADER_LENGTH) {
 		/* Always send all of the first AES block. */
 		tx_max_offset = TLS_HEADER_LENGTH +
 		    rounddown(tx_max_offset - TLS_HEADER_LENGTH,
 		    AES_BLOCK_LEN);
 	}
 	tx_max = tcp_seqno + tx_max_offset;
 
 	/*
 	 * Update TCB fields.  Reserve space for the FW_ULPTX_WR header
 	 * but don't populate it until we know how many field updates
 	 * are required.
 	 */
 	if (using_scratch)
 		wr = (void *)txq->ss;
 	else
 		wr = dst;
 	out = (void *)(wr + 1);
 	fields = 0;
 	if (set_l2t_idx) {
 		KASSERT(nsegs != 0,
 		    ("trying to set L2T_IX for subsequent TLS WR"));
 #ifdef VERBOSE_TRACES
 		CTR3(KTR_CXGBE, "%s: tid %d set L2T_IX to %d", __func__,
 		    tlsp->tid, tlsp->l2te->idx);
 #endif
 		write_set_tcb_field_ulp(tlsp, out, txq, W_TCB_L2T_IX,
 		    V_TCB_L2T_IX(M_TCB_L2T_IX), V_TCB_L2T_IX(tlsp->l2te->idx));
 		out += roundup2(LEN__SET_TCB_FIELD_ULP, 16);
 		fields++;
 	}
 	if (tsopt != NULL && tlsp->prev_tsecr != ntohl(tsopt[1])) {
 		KASSERT(nsegs != 0,
 		    ("trying to set T_RTSEQ_RECENT for subsequent TLS WR"));
 #ifdef VERBOSE_TRACES
 		CTR2(KTR_CXGBE, "%s: tid %d wrote updated T_RTSEQ_RECENT",
 		    __func__, tlsp->tid);
 #endif
 		write_set_tcb_field_ulp(tlsp, out, txq, W_TCB_T_RTSEQ_RECENT,
 		    V_TCB_T_RTSEQ_RECENT(M_TCB_T_RTSEQ_RECENT),
 		    V_TCB_T_RTSEQ_RECENT(ntohl(tsopt[1])));
 		out += roundup2(LEN__SET_TCB_FIELD_ULP, 16);
 		fields++;
 
 		tlsp->prev_tsecr = ntohl(tsopt[1]);
 	}
 
 	if (first_wr || tlsp->prev_seq != tx_max) {
 		KASSERT(nsegs != 0,
 		    ("trying to set TX_MAX for subsequent TLS WR"));
 #ifdef VERBOSE_TRACES
 		CTR4(KTR_CXGBE,
 		    "%s: tid %d setting TX_MAX to %u (tcp_seqno %u)",
 		    __func__, tlsp->tid, tx_max, tcp_seqno);
 #endif
 		write_set_tcb_field_ulp(tlsp, out, txq, W_TCB_TX_MAX,
 		    V_TCB_TX_MAX(M_TCB_TX_MAX), V_TCB_TX_MAX(tx_max));
 		out += roundup2(LEN__SET_TCB_FIELD_ULP, 16);
 		fields++;
 	}
 
 	/*
 	 * If there is data to drop at the beginning of this TLS
 	 * record or if this is a retransmit,
 	 * reset SND_UNA_RAW to 0 so that SND_UNA == TX_MAX.
 	 */
 	if (tlsp->prev_seq != tx_max || mtod(m_tls, vm_offset_t) != 0) {
 		KASSERT(nsegs != 0,
 		    ("trying to clear SND_UNA_RAW for subsequent TLS WR"));
 #ifdef VERBOSE_TRACES
 		CTR2(KTR_CXGBE, "%s: tid %d clearing SND_UNA_RAW", __func__,
 		    tlsp->tid);
 #endif
 		write_set_tcb_field_ulp(tlsp, out, txq, W_TCB_SND_UNA_RAW,
 		    V_TCB_SND_UNA_RAW(M_TCB_SND_UNA_RAW),
 		    V_TCB_SND_UNA_RAW(0));
 		out += roundup2(LEN__SET_TCB_FIELD_ULP, 16);
 		fields++;
 	}
 
 	/*
 	 * Store the expected sequence number of the next byte after
 	 * this record.
 	 */
 	tlsp->prev_seq = tcp_seqno + tlen;
 
 	if (first_wr || tlsp->prev_ack != ntohl(tcp->th_ack)) {
 		KASSERT(nsegs != 0,
 		    ("trying to set RCV_NXT for subsequent TLS WR"));
 		write_set_tcb_field_ulp(tlsp, out, txq, W_TCB_RCV_NXT,
 		    V_TCB_RCV_NXT(M_TCB_RCV_NXT),
 		    V_TCB_RCV_NXT(ntohl(tcp->th_ack)));
 		out += roundup2(LEN__SET_TCB_FIELD_ULP, 16);
 		fields++;
 
 		tlsp->prev_ack = ntohl(tcp->th_ack);
 	}
 
 	if (first_wr || tlsp->prev_win != ntohs(tcp->th_win)) {
 		KASSERT(nsegs != 0,
 		    ("trying to set RCV_WND for subsequent TLS WR"));
 		write_set_tcb_field_ulp(tlsp, out, txq, W_TCB_RCV_WND,
 		    V_TCB_RCV_WND(M_TCB_RCV_WND),
 		    V_TCB_RCV_WND(ntohs(tcp->th_win)));
 		out += roundup2(LEN__SET_TCB_FIELD_ULP, 16);
 		fields++;
 
 		tlsp->prev_win = ntohs(tcp->th_win);
 	}
 
 	/* Recalculate 'nsegs' if cached value is not available. */
 	if (nsegs == 0)
 		nsegs = sglist_count_mbuf_epg(m_tls, m_tls->m_epg_hdrlen +
 		    offset, plen - (m_tls->m_epg_hdrlen + offset));
 
 	/* Calculate the size of the TLS work request. */
 	twr_len = ktls_base_wr_size(tlsp);
 
 	/*
 	 * Immediate data: the TLS header when sending from the start of
 	 * the record, plus an AES block for the IV of a short record.
 	 */
 	imm_len = 0;
 	if (offset == 0)
 		imm_len += m_tls->m_epg_hdrlen;
 	if (plen == tlen)
 		imm_len += AES_BLOCK_LEN;
 	twr_len += roundup2(imm_len, 16);
 	twr_len += ktls_sgl_size(nsegs);
 
 	/*
 	 * If any field updates were required, determine if they can
 	 * be included in the TLS work request.  If not, use the
 	 * FW_ULPTX_WR work request header at 'wr' as a dedicated work
 	 * request for the field updates and start a new work request
 	 * for the TLS work request afterward.
 	 */
 	if (fields != 0) {
 		wr_len = fields * roundup2(LEN__SET_TCB_FIELD_ULP, 16);
 		if (twr_len + wr_len <= SGE_MAX_WR_LEN &&
 		    tlsp->sc->tlst.combo_wrs) {
 			wr_len += twr_len;
 			txpkt = (void *)out;
 		} else {
 			wr_len += sizeof(*wr);
 			wr->op_to_compl = htobe32(V_FW_WR_OP(FW_ULPTX_WR));
 			wr->flowid_len16 = htobe32(F_FW_ULPTX_WR_DATA |
 			    V_FW_WR_LEN16(wr_len / 16));
 			wr->cookie = 0;
 
 			/*
 			 * If we were using scratch space, copy the
 			 * field updates work request to the ring.
 			 */
 			if (using_scratch) {
 				out = dst;
 				copy_to_txd(eq, txq->ss, &out, wr_len);
 			}
 
 			ndesc = howmany(wr_len, EQ_ESIZE);
 			MPASS(ndesc <= available);
 
 			txq->raw_wrs++;
 			txsd = &txq->sdesc[pidx];
 			txsd->m = NULL;
 			txsd->desc_used = ndesc;
 			IDXINCR(pidx, ndesc, eq->sidx);
 			dst = &eq->desc[pidx];
 
 			/*
 			 * Determine if we should use scratch space
 			 * for the TLS work request based on the
 			 * available space after advancing pidx for
 			 * the field updates work request.
 			 */
 			wr_len = twr_len;
 			using_scratch = (eq->sidx - pidx <
 			    howmany(wr_len, EQ_ESIZE));
 			if (using_scratch)
 				wr = (void *)txq->ss;
 			else
 				wr = dst;
 			txpkt = (void *)(wr + 1);
 		}
 	} else {
 		wr_len = twr_len;
 		txpkt = (void *)out;
 	}
 
 	wr_len = roundup2(wr_len, 16);
 	MPASS(ndesc + howmany(wr_len, EQ_ESIZE) <= available);
 
 	/* FW_ULPTX_WR */
 	wr->op_to_compl = htobe32(V_FW_WR_OP(FW_ULPTX_WR));
 	wr->flowid_len16 = htobe32(F_FW_ULPTX_WR_DATA |
 	    V_FW_WR_LEN16(wr_len / 16));
 	wr->cookie = 0;
 
 	/* ULP_TXPKT */
 	txpkt->cmd_dest = htobe32(V_ULPTX_CMD(ULP_TX_PKT) |
 	    V_ULP_TXPKT_DATAMODIFY(0) |
 	    V_ULP_TXPKT_CHANNELID(tlsp->vi->pi->port_id) | V_ULP_TXPKT_DEST(0) |
 	    V_ULP_TXPKT_FID(txq->eq.cntxt_id) | V_ULP_TXPKT_RO(1));
 	txpkt->len = htobe32(howmany(twr_len - sizeof(*wr), 16));
 
 	/* ULPTX_IDATA sub-command */
 	idata = (void *)(txpkt + 1);
 	idata->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM) |
 	    V_ULP_TX_SC_MORE(1));
 	idata->len = sizeof(struct cpl_tx_sec_pdu);
 
 	/*
 	 * The key context, CPL_TX_DATA, and immediate data are part
 	 * of this ULPTX_IDATA when using an inline key.  When reading
 	 * the key from memory, the CPL_TX_DATA and immediate data are
 	 * part of a separate ULPTX_IDATA.
 	 */
 	if (tlsp->inline_key)
 		idata->len += tlsp->tx_key_info_size +
 		    sizeof(struct cpl_tx_data) + imm_len;
 	idata->len = htobe32(idata->len);
 
 	/* CPL_TX_SEC_PDU */
 	sec_pdu = (void *)(idata + 1);
 
 	/*
 	 * For short records, AAD is counted as header data in SCMD0,
 	 * the IV is next followed by a cipher region for the payload.
 	 */
 	if (plen == tlen) {
 		aad_start = 0;
 		aad_stop = 0;
 		iv_offset = 1;
 		auth_start = 0;
 		auth_stop = 0;
 		auth_insert = 0;
 		cipher_start = AES_BLOCK_LEN + 1;
 		cipher_stop = 0;
 
 		sec_pdu->pldlen = htobe32(16 + plen -
 		    (m_tls->m_epg_hdrlen + offset));
 
 		/* These two flits are actually a CPL_TLS_TX_SCMD_FMT. */
 		sec_pdu->seqno_numivs = tlsp->scmd0_short.seqno_numivs;
 		sec_pdu->ivgen_hdrlen = htobe32(
 		    tlsp->scmd0_short.ivgen_hdrlen |
 		    V_SCMD_HDR_LEN(offset == 0 ? m_tls->m_epg_hdrlen : 0));
 
 		txq->kern_tls_short++;
 	} else {
 		/*
 		 * AAD is TLS header.  IV is after AAD.  The cipher region
 		 * starts after the IV.  See comments in ccr_authenc() and
 		 * ccr_gmac() in t4_crypto.c regarding cipher and auth
 		 * start/stop values.
 		 */
 		aad_start = 1;
 		aad_stop = TLS_HEADER_LENGTH;
 		iv_offset = TLS_HEADER_LENGTH + 1;
 		cipher_start = m_tls->m_epg_hdrlen + 1;
 		/*
 		 * NOTE(review): both branches below currently assign the
 		 * same values.
 		 */
 		if (tlsp->enc_mode == SCMD_CIPH_MODE_AES_GCM) {
 			cipher_stop = 0;
 			auth_start = cipher_start;
 			auth_stop = 0;
 			auth_insert = 0;
 		} else {
 			cipher_stop = 0;
 			auth_start = cipher_start;
 			auth_stop = 0;
 			auth_insert = 0;
 		}
 
 		sec_pdu->pldlen = htobe32(plen);
 
 		/* These two flits are actually a CPL_TLS_TX_SCMD_FMT. */
 		sec_pdu->seqno_numivs = tlsp->scmd0.seqno_numivs;
 		sec_pdu->ivgen_hdrlen = tlsp->scmd0.ivgen_hdrlen;
 
 		if (mtod(m_tls, vm_offset_t) == 0)
 			txq->kern_tls_full++;
 		else
 			txq->kern_tls_partial++;
 	}
 	sec_pdu->op_ivinsrtofst = htobe32(
 	    V_CPL_TX_SEC_PDU_OPCODE(CPL_TX_SEC_PDU) |
 	    V_CPL_TX_SEC_PDU_CPLLEN(2) | V_CPL_TX_SEC_PDU_PLACEHOLDER(0) |
 	    V_CPL_TX_SEC_PDU_IVINSRTOFST(iv_offset));
 	sec_pdu->aadstart_cipherstop_hi = htobe32(
 	    V_CPL_TX_SEC_PDU_AADSTART(aad_start) |
 	    V_CPL_TX_SEC_PDU_AADSTOP(aad_stop) |
 	    V_CPL_TX_SEC_PDU_CIPHERSTART(cipher_start) |
 	    V_CPL_TX_SEC_PDU_CIPHERSTOP_HI(cipher_stop >> 4));
 	sec_pdu->cipherstop_lo_authinsert = htobe32(
 	    V_CPL_TX_SEC_PDU_CIPHERSTOP_LO(cipher_stop & 0xf) |
 	    V_CPL_TX_SEC_PDU_AUTHSTART(auth_start) |
 	    V_CPL_TX_SEC_PDU_AUTHSTOP(auth_stop) |
 	    V_CPL_TX_SEC_PDU_AUTHINSERT(auth_insert));
 
 	sec_pdu->scmd1 = htobe64(m_tls->m_epg_seqno);
 
 	/* Key context */
 	out = (void *)(sec_pdu + 1);
 	if (tlsp->inline_key) {
 		memcpy(out, &tlsp->keyctx, tlsp->tx_key_info_size);
 		out += tlsp->tx_key_info_size;
 	} else {
 		/* ULPTX_SC_MEMRD to read key context. */
 		memrd = (void *)out;
 		memrd->cmd_to_len = htobe32(V_ULPTX_CMD(ULP_TX_SC_MEMRD) |
 		    V_ULP_TX_SC_MORE(1) |
 		    V_ULPTX_LEN16(tlsp->tx_key_info_size >> 4));
 		memrd->addr = htobe32(tlsp->tx_key_addr >> 5);
 
 		/* ULPTX_IDATA for CPL_TX_DATA and TLS header. */
 		idata = (void *)(memrd + 1);
 		idata->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM) |
 		    V_ULP_TX_SC_MORE(1));
 		idata->len = htobe32(sizeof(struct cpl_tx_data) + imm_len);
 
 		out = (void *)(idata + 1);
 	}
 
 	/* CPL_TX_DATA */
 	tx_data = (void *)out;
 	OPCODE_TID(tx_data) = htonl(MK_OPCODE_TID(CPL_TX_DATA, tlsp->tid));
 	/*
 	 * Pick the MSS: the TSO segment size if this packet has one,
 	 * else the last MSS remembered in tlsp->prev_mss, else a value
 	 * derived from the interface MTU.
 	 */
 	if (m->m_pkthdr.csum_flags & CSUM_TSO) {
 		mss = m->m_pkthdr.tso_segsz;
 		tlsp->prev_mss = mss;
 	} else if (tlsp->prev_mss != 0)
 		mss = tlsp->prev_mss;
 	else
 		mss = tlsp->vi->ifp->if_mtu -
 		    (m->m_pkthdr.l3hlen + m->m_pkthdr.l4hlen);
 	if (offset == 0) {
 		tx_data->len = htobe32(V_TX_DATA_MSS(mss) | V_TX_LENGTH(tlen));
 		tx_data->rsvd = htobe32(tcp_seqno);
 	} else {
 		tx_data->len = htobe32(V_TX_DATA_MSS(mss) |
 		    V_TX_LENGTH(tlen - (m_tls->m_epg_hdrlen + offset)));
 		tx_data->rsvd = htobe32(tcp_seqno + m_tls->m_epg_hdrlen + offset);
 	}
 	tx_data->flags = htobe32(F_TX_BYPASS);
 	if (last_wr && tcp->th_flags & TH_PUSH)
 		tx_data->flags |= htobe32(F_TX_PUSH | F_TX_SHOVE);
 
 	/* Populate the TLS header */
 	out = (void *)(tx_data + 1);
 	if (offset == 0) {
 		memcpy(out, m_tls->m_epg_hdr, m_tls->m_epg_hdrlen);
 		out += m_tls->m_epg_hdrlen;
 	}
 
 	/* AES IV for a short record. */
 	if (plen == tlen) {
 		iv = out;
 		if (tlsp->enc_mode == SCMD_CIPH_MODE_AES_GCM) {
 			/*
 			 * 4 bytes of salt, the 8 bytes following the TLS
 			 * record header, then a 32-bit big-endian counter.
 			 */
 			memcpy(iv, tlsp->keyctx.u.txhdr.txsalt, SALT_SIZE);
 			memcpy(iv + 4, hdr + 1, 8);
 			*(uint32_t *)(iv + 12) = htobe32(2 +
 			    offset / AES_BLOCK_LEN);
 		} else
 			memcpy(iv, hdr + 1, AES_BLOCK_LEN);
 		out += AES_BLOCK_LEN;
 	}
 
 	if (imm_len % 16 != 0) {
 		/* Zero pad to an 8-byte boundary. */
 		memset(out, 0, 8 - (imm_len % 8));
 		out += 8 - (imm_len % 8);
 
 		/*
 		 * Insert a ULP_TX_SC_NOOP if needed so the SGL is
 		 * 16-byte aligned.
 		 */
 		if (imm_len % 16 <= 8) {
 			idata = (void *)out;
 			idata->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_NOOP));
 			idata->len = htobe32(0);
 			out = (void *)(idata + 1);
 		}
 	}
 
 	/* SGL for record payload */
 	sglist_reset(txq->gl);
 	if (sglist_append_mbuf_epg(txq->gl, m_tls, m_tls->m_epg_hdrlen + offset,
 	    plen - (m_tls->m_epg_hdrlen + offset)) != 0) {
 #ifdef INVARIANTS
 		panic("%s: failed to append sglist", __func__);
 #endif
 	}
 	write_gl_to_buf(txq->gl, out);
 
 	/* Work request was built in scratch space: copy it to the ring. */
 	if (using_scratch) {
 		out = dst;
 		copy_to_txd(eq, txq->ss, &out, wr_len);
 	}
 
 	ndesc += howmany(wr_len, EQ_ESIZE);
 	MPASS(ndesc <= available);
 
 	txq->kern_tls_records++;
 	txq->kern_tls_octets += tlen - mtod(m_tls, vm_offset_t);
 	if (mtod(m_tls, vm_offset_t) != 0) {
 		if (offset == 0)
 			txq->kern_tls_waste += mtod(m_tls, vm_offset_t);
 		else
 			txq->kern_tls_waste += mtod(m_tls, vm_offset_t) -
 			    (m_tls->m_epg_hdrlen + offset);
 	}
 
 	/*
 	 * Only the last work request of the chain records 'm' in its
 	 * software descriptor.  desc_used covers just the TLS work
 	 * request; a separate field-update work request was accounted
 	 * for in its own software descriptor above.
 	 */
 	txsd = &txq->sdesc[pidx];
 	if (last_wr)
 		txsd->m = m;
 	else
 		txsd->m = NULL;
 	txsd->desc_used = howmany(wr_len, EQ_ESIZE);
 
 	return (ndesc);
 }
 
 /*
  * Write a FW_ETH_TX_PKT_WR at 'dst' that sends the headers held in 'm'
  * (m->m_len bytes) as immediate data with the TCP sequence number set
  * to 'tcp_seqno'.  The TCP flags are copied unchanged from 'm'.
  *
  * The IP total length / IPv6 payload length is rewritten to describe the
  * headers-only packet.  The software descriptor at 'pidx' records 'm'.
  *
  * 'available' is only checked by assertion.  Returns the number of TX
  * descriptors used by the work request.
  */
 static int
 ktls_write_tcp_fin(struct sge_txq *txq, void *dst, struct mbuf *m,
     u_int available, tcp_seq tcp_seqno, u_int pidx)
 {
 	struct tx_sdesc *txsd;
 	struct fw_eth_tx_pkt_wr *wr;
 	struct cpl_tx_pkt_core *cpl;
 	uint32_t ctrl;
 	uint64_t ctrl1;
 	int len16, ndesc, pktlen;
 	struct ether_header *eh;
 	struct ip *ip, newip;
 	struct ip6_hdr *ip6, newip6;
 	struct tcphdr *tcp, newtcp;
 	caddr_t out;
 
 	TXQ_LOCK_ASSERT_OWNED(txq);
 	M_ASSERTPKTHDR(m);
 
 	wr = dst;
 	pktlen = m->m_len;
 	ctrl = sizeof(struct cpl_tx_pkt_core) + pktlen;
 	len16 = howmany(sizeof(struct fw_eth_tx_pkt_wr) + ctrl, 16);
 	ndesc = tx_len16_to_desc(len16);
 	MPASS(ndesc <= available);
 
 	/* Firmware work request header */
 	wr->op_immdlen = htobe32(V_FW_WR_OP(FW_ETH_TX_PKT_WR) |
 	    V_FW_ETH_TX_PKT_WR_IMMDLEN(ctrl));
 
 	ctrl = V_FW_WR_LEN16(len16);
 	wr->equiq_to_len16 = htobe32(ctrl);
 	wr->r3 = 0;
 
 	cpl = (void *)(wr + 1);
 
 	/* CPL header */
 	cpl->ctrl0 = txq->cpl_ctrl0;
 	cpl->pack = 0;
 	cpl->len = htobe16(pktlen);
 
 	out = (void *)(cpl + 1);
 
 	/* Copy over Ethernet header. */
 	eh = mtod(m, struct ether_header *);
 	copy_to_txd(&txq->eq, (caddr_t)eh, &out, m->m_pkthdr.l2hlen);
 
 	/* Fixup length in IP header and copy out. */
 	if (ntohs(eh->ether_type) == ETHERTYPE_IP) {
 		ip = (void *)((char *)eh + m->m_pkthdr.l2hlen);
 		newip = *ip;
 		newip.ip_len = htons(pktlen - m->m_pkthdr.l2hlen);
 		copy_to_txd(&txq->eq, (caddr_t)&newip, &out, sizeof(newip));
 		if (m->m_pkthdr.l3hlen > sizeof(*ip))
 			copy_to_txd(&txq->eq, (caddr_t)(ip + 1), &out,
 			    m->m_pkthdr.l3hlen - sizeof(*ip));
 		ctrl1 = V_TXPKT_CSUM_TYPE(TX_CSUM_TCPIP) |
 		    V_T6_TXPKT_ETHHDR_LEN(m->m_pkthdr.l2hlen - ETHER_HDR_LEN) |
 		    V_TXPKT_IPHDR_LEN(m->m_pkthdr.l3hlen);
 	} else {
 		ip6 = (void *)((char *)eh + m->m_pkthdr.l2hlen);
 		newip6 = *ip6;
 
 		/*
 		 * Unlike the IPv4 total length, the IPv6 payload length
 		 * does not count the fixed IPv6 header itself.
 		 */
 		newip6.ip6_plen = htons(pktlen - m->m_pkthdr.l2hlen -
 		    sizeof(*ip6));
 		copy_to_txd(&txq->eq, (caddr_t)&newip6, &out, sizeof(newip6));
 		MPASS(m->m_pkthdr.l3hlen == sizeof(*ip6));
 		ctrl1 = V_TXPKT_CSUM_TYPE(TX_CSUM_TCPIP6) |
 		    V_T6_TXPKT_ETHHDR_LEN(m->m_pkthdr.l2hlen - ETHER_HDR_LEN) |
 		    V_TXPKT_IPHDR_LEN(m->m_pkthdr.l3hlen);
 	}
 	cpl->ctrl1 = htobe64(ctrl1);
 	txq->txcsum++;
 
 	/* Set sequence number in TCP header. */
 	tcp = (void *)((char *)eh + m->m_pkthdr.l2hlen + m->m_pkthdr.l3hlen);
 	newtcp = *tcp;
 	newtcp.th_seq = htonl(tcp_seqno);
 	copy_to_txd(&txq->eq, (caddr_t)&newtcp, &out, sizeof(newtcp));
 
 	/* Copy rest of packet. */
 	copy_to_txd(&txq->eq, (caddr_t)(tcp + 1), &out, m->m_len -
 	    (m->m_pkthdr.l2hlen + m->m_pkthdr.l3hlen + sizeof(*tcp)));
 	txq->imm_wrs++;
 
 	txq->txpkt_wrs++;
 
 	txq->kern_tls_fin++;
 
 	/* Record 'm' in the software descriptor for this work request. */
 	txsd = &txq->sdesc[pidx];
 	txsd->m = m;
 	txsd->desc_used = ndesc;
 
 	return (ndesc);
 }
 
 /*
  * Write all work requests needed to transmit the KTLS packet chain
  * headed by 'm' starting at descriptor 'dst' (ring index eq->pidx).
  *
  * 'm' is the header mbuf carrying the send tag; each following mbuf in
  * the chain is an M_EXTPG mbuf holding one TLS record.  In order, this
  * writes:
  *  - a headers-only packet carrying TCP options, if the TCP header has
  *    options to send and FIN is not set,
  *  - whatever work request t4_l2t_alloc_tls() emits when the L2T entry
  *    has to be (re)allocated because the VLAN tag or destination MAC
  *    differs from the cached entry,
  *  - the work request(s) for each TLS record,
  *  - a FIN packet, if FIN is set in the TCP header.
  *
  * 'nsegs' is the cached segment count for the first TLS record only.
  * 'available' is the number of free TX descriptors and is only checked
  * by assertion.  Returns the total number of descriptors used.
  */
 int
 t6_ktls_write_wr(struct sge_txq *txq, void *dst, struct mbuf *m, u_int nsegs,
     u_int available)
 {
 	struct sge_eq *eq = &txq->eq;
 	struct tx_sdesc *txsd;
 	struct tlspcb *tlsp;
 	struct tcphdr *tcp;
 	struct mbuf *m_tls;
 	struct ether_header *eh;
 	tcp_seq tcp_seqno;
 	u_int ndesc, pidx, totdesc;
 	uint16_t vlan_tag;
 	bool has_fin, set_l2t_idx;
 	void *tsopt;
 
 	M_ASSERTPKTHDR(m);
 	MPASS(m->m_pkthdr.snd_tag != NULL);
 	tlsp = mst_to_tls(m->m_pkthdr.snd_tag);
 
 	totdesc = 0;
 	eh = mtod(m, struct ether_header *);
 	tcp = (struct tcphdr *)((char *)eh + m->m_pkthdr.l2hlen +
 	    m->m_pkthdr.l3hlen);
 	pidx = eq->pidx;
 	has_fin = (tcp->th_flags & TH_FIN) != 0;
 
 	/*
 	 * If this TLS record has a FIN, then we will send any
 	 * requested options as part of the FIN packet.
 	 */
 	if (!has_fin && ktls_has_tcp_options(tcp)) {
 		ndesc = ktls_write_tcp_options(txq, dst, m, available, pidx);
 		totdesc += ndesc;
 		IDXINCR(pidx, ndesc, eq->sidx);
 		dst = &eq->desc[pidx];
 #ifdef VERBOSE_TRACES
 		CTR2(KTR_CXGBE, "%s: tid %d wrote TCP options packet", __func__,
 		    tlsp->tid);
 #endif
 	}
 
 	/*
 	 * Allocate a new L2T entry if necessary.  This may write out
 	 * a work request to the txq.
 	 */
 	if (m->m_flags & M_VLANTAG)
 		vlan_tag = m->m_pkthdr.ether_vtag;
 	else
 		vlan_tag = 0xfff;
 	set_l2t_idx = false;
 	if (tlsp->l2te == NULL || tlsp->l2te->vlan != vlan_tag ||
 	    memcmp(tlsp->l2te->dmac, eh->ether_dhost, ETHER_ADDR_LEN) != 0) {
 		set_l2t_idx = true;
 		if (tlsp->l2te)
 			t4_l2t_release(tlsp->l2te);
 		tlsp->l2te = t4_l2t_alloc_tls(tlsp->sc, txq, dst, &ndesc,
 		    vlan_tag, tlsp->vi->pi->lport, eh->ether_dhost);
 		if (tlsp->l2te == NULL)
 			CXGBE_UNIMPLEMENTED("failed to allocate TLS L2TE");
 		if (ndesc != 0) {
 			MPASS(ndesc <= available - totdesc);
 
 			txq->raw_wrs++;
 			txsd = &txq->sdesc[pidx];
 			txsd->m = NULL;
 			txsd->desc_used = ndesc;
 			totdesc += ndesc;
 			IDXINCR(pidx, ndesc, eq->sidx);
 			dst = &eq->desc[pidx];
 		}
 	}
 
 	/*
 	 * Iterate over each TLS record constructing a work request
 	 * for that record.
 	 */
 	for (m_tls = m->m_next; m_tls != NULL; m_tls = m_tls->m_next) {
 		MPASS(m_tls->m_flags & M_EXTPG);
 
 		/*
 		 * Determine the initial TCP sequence number for this
 		 * record.
 		 */
 		tsopt = NULL;
 		if (m_tls == m->m_next) {
 			tcp_seqno = ntohl(tcp->th_seq) -
 			    mtod(m_tls, vm_offset_t);
 			if (tlsp->using_timestamps)
 				tsopt = ktls_find_tcp_timestamps(tcp);
 		} else {
 			MPASS(mtod(m_tls, vm_offset_t) == 0);
 			tcp_seqno = tlsp->prev_seq;
 		}
 
 		ndesc = ktls_write_tls_wr(tlsp, txq, dst, m, tcp, m_tls,
 		    nsegs, available - totdesc, tcp_seqno, tsopt, pidx,
 		    set_l2t_idx);
 		totdesc += ndesc;
 		IDXINCR(pidx, ndesc, eq->sidx);
 		dst = &eq->desc[pidx];
 
 		/*
 		 * The value of nsegs from the header mbuf's metadata
 		 * is only valid for the first TLS record.
 		 */
 		nsegs = 0;
 
 		/* Only need to set the L2T index once. */
 		set_l2t_idx = false;
 	}
 
 	if (has_fin) {
 		/*
 		 * If the TCP header for this chain has FIN sent, then
 		 * explicitly send a packet that has FIN set.  This
 		 * will also have PUSH set if requested.  This assumes
 		 * we sent at least one TLS record work request and
 		 * uses the TCP sequence number after that request as
 		 * the sequence number for the FIN packet.
 		 *
 		 * Pass only the descriptors still unused so that the
 		 * callee's assertion checks the remaining budget.
 		 */
 		ndesc = ktls_write_tcp_fin(txq, dst, m, available - totdesc,
 		    tlsp->prev_seq, pidx);
 		totdesc += ndesc;
 	}
 
 	MPASS(totdesc <= available);
 	return (totdesc);
 }
 
 static void
 t6_tls_tag_free(struct m_snd_tag *mst)
 {
 	struct adapter *sc;
 	struct tlspcb *tlsp;
 
 	tlsp = mst_to_tls(mst);
 	sc = tlsp->sc;
 
 	CTR2(KTR_CXGBE, "%s: tid %d", __func__, tlsp->tid);
 
 	if (tlsp->l2te)
 		t4_l2t_release(tlsp->l2te);
 	if (tlsp->tid >= 0)
 		release_tid(sc, tlsp->tid, tlsp->ctrlq);
 	if (tlsp->ce)
 		t4_release_clip_entry(sc, tlsp->ce);
 	if (tlsp->tx_key_addr >= 0)
 		t4_free_tls_keyid(sc, tlsp->tx_key_addr);
 
 	zfree(tlsp, M_CXGBE);
 }
 
 /*
  * Module load hook: register ktls_act_open_rpl as the shared handler for
  * CPL_ACT_OPEN_RPL messages under the CPL_COOKIE_KERN_TLS cookie.
  */
 void
 t6_ktls_modload(void)
 {
 
 	t4_register_shared_cpl_handler(CPL_ACT_OPEN_RPL, ktls_act_open_rpl,
 	    CPL_COOKIE_KERN_TLS);
 }
 
 /*
  * Module unload hook: register a NULL handler for CPL_ACT_OPEN_RPL under
  * the CPL_COOKIE_KERN_TLS cookie (presumably clearing the handler
  * installed by t6_ktls_modload()).
  */
 void
 t6_ktls_modunload(void)
 {
 
 	t4_register_shared_cpl_handler(CPL_ACT_OPEN_RPL, NULL,
 	    CPL_COOKIE_KERN_TLS);
 }
 
 #else
 
 /*
  * Stub for the #else branch (the guarding condition is not visible in
  * this part of the file): send tag allocation always fails with ENXIO.
  */
 int
 t6_tls_tag_alloc(struct ifnet *ifp, union if_snd_tag_alloc_params *params,
     struct m_snd_tag **pt)
 {
 	return (ENXIO);
 }
 
 /* Stub for the #else branch: packet parsing always fails with EINVAL. */
 int
 t6_ktls_parse_pkt(struct mbuf *m, int *nsegsp, int *len16p)
 {
 	return (EINVAL);
 }
 
 /*
  * Stub for the #else branch: panics if called.  The panic message
  * indicates this path is not expected to be reached.
  */
 int
 t6_ktls_write_wr(struct sge_txq *txq, void *dst, struct mbuf *m, u_int nsegs,
     u_int available)
 {
 	panic("can't happen");
 }
 
 /* Stub for the #else branch: nothing to do on module load. */
 void
 t6_ktls_modload(void)
 {
 }
 
 /* Stub for the #else branch: nothing to do on module unload. */
 void
 t6_ktls_modunload(void)
 {
 }
 
 #endif
diff --git a/sys/dev/cxgbe/cxgbei/cxgbei.c b/sys/dev/cxgbe/cxgbei/cxgbei.c
index b6dc5d5c3577..d11465e4567d 100644
--- a/sys/dev/cxgbe/cxgbei/cxgbei.c
+++ b/sys/dev/cxgbe/cxgbei/cxgbei.c
@@ -1,988 +1,988 @@
 /*-
  * Copyright (c) 2012 Chelsio Communications, Inc.
  * All rights reserved.
  *
  * Chelsio T5xx iSCSI driver
  *
  * Written by: Sreenivasa Honnur <shonnur@chelsio.com>
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 
 #include <sys/types.h>
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/module.h>
 #include <sys/systm.h>
 
 #ifdef TCP_OFFLOAD
 #include <sys/errno.h>
 #include <sys/gsb_crc32.h>
 #include <sys/kthread.h>
 #include <sys/smp.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/mbuf.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/condvar.h>
 #include <sys/uio.h>
 
 #include <netinet/in.h>
 #include <netinet/in_pcb.h>
 #include <netinet/toecore.h>
 #include <netinet/tcp_var.h>
 #include <netinet/tcp_fsm.h>
 
 #include <cam/scsi/scsi_all.h>
 #include <cam/scsi/scsi_da.h>
 #include <cam/ctl/ctl_io.h>
 #include <cam/ctl/ctl.h>
 #include <cam/ctl/ctl_backend.h>
 #include <cam/ctl/ctl_error.h>
 #include <cam/ctl/ctl_frontend.h>
 #include <cam/ctl/ctl_debug.h>
 #include <cam/ctl/ctl_ha.h>
 #include <cam/ctl/ctl_ioctl.h>
 
 #include <dev/iscsi/icl.h>
 #include <dev/iscsi/iscsi_proto.h>
 #include <dev/iscsi/iscsi_ioctl.h>
 #include <dev/iscsi/iscsi.h>
 #include <cam/ctl/ctl_frontend_iscsi.h>
 
 #include <cam/cam.h>
 #include <cam/cam_ccb.h>
 #include <cam/cam_xpt.h>
 #include <cam/cam_debug.h>
 #include <cam/cam_sim.h>
 #include <cam/cam_xpt_sim.h>
 #include <cam/cam_xpt_periph.h>
 #include <cam/cam_periph.h>
 #include <cam/cam_compat.h>
 #include <cam/scsi/scsi_message.h>
 
 #include "common/common.h"
 #include "common/t4_msg.h"
 #include "common/t4_regs.h"	/* for PCIE_MEM_ACCESS */
 #include "tom/t4_tom.h"
 #include "cxgbei.h"
 
 /*
  * Derive the largest iSCSI data segment lengths usable for transmit
  * and receive from the adapter's TP registers and the page pod region,
  * and store them in *max_tx_data_len and *max_rx_data_len.
  */
 static void
 read_pdu_limits(struct adapter *sc, uint32_t *max_tx_data_len,
     uint32_t *max_rx_data_len, struct ppod_region *pr)
 {
 	uint32_t rx, tx, reg, xfer_cap, ddp_cap;
 
 	/* Start from the PMM page sizes. */
 	rx = t4_read_reg(sc, A_TP_PMM_RX_PAGE_SIZE);
 	tx = t4_read_reg(sc, A_TP_PMM_TX_PAGE_SIZE);
 
 	/*
 	 * Clamp both directions to MAXRXDATA.
 	 * NOTE(review): the TX limit is clamped with the RX field as
 	 * well; confirm that this is intentional.
 	 */
 	reg = t4_read_reg(sc, A_TP_PARA_REG2);
 	rx = min(rx, G_MAXRXDATA(reg));
 	tx = min(tx, G_MAXRXDATA(reg));
 
 	/* Clamp both directions to the smaller PM transfer length. */
 	reg = t4_read_reg(sc, A_TP_PARA_REG7);
 	xfer_cap = min(G_PMMAXXFERLEN0(reg), G_PMMAXXFERLEN1(reg));
 	rx = min(rx, xfer_cap);
 	tx = min(tx, xfer_cap);
 
 	/*
 	 * Convert PDU lengths to data segment lengths by removing the
 	 * BHS and both digests.  AHS is not supported by the kernel, so
 	 * it is not accounted for here either.
 	 */
 	rx -= ISCSI_BHS_SIZE + ISCSI_HEADER_DIGEST_SIZE +
 	    ISCSI_DATA_DIGEST_SIZE;
 	tx -= ISCSI_BHS_SIZE + ISCSI_HEADER_DIGEST_SIZE +
 	    ISCSI_DATA_DIGEST_SIZE;
 
 	/*
 	 * DDP can place at most 4 pages per PDU.  A request may use
 	 * larger pages, but only the smallest DDP page size can be
 	 * relied upon, so that is what bounds the receive length.
 	 */
 	ddp_cap = 4 * (1U << pr->pr_page_shift[0]);
 	rx = min(rx, ddp_cap);
 
 	/* T5 only: cap TX at 15360 and round both down to 512. */
 	if (chip_id(sc) == CHELSIO_T5) {
 		tx = min(tx, 15360);
 
 		rx = rounddown2(rx, 512);
 		tx = rounddown2(tx, 512);
 	}
 
 	*max_tx_data_len = tx;
 	*max_rx_data_len = rx;
 }
 
 /*
  * Initialize the software state of the iSCSI ULP driver.
  *
  * Sets up the page pod region in ci->pr, reads the PDU size limits
  * into ci, and publishes an "iscsi" sysctl node under the adapter's
  * device tree with the DDP threshold and both data segment limits.
  *
  * Returns 0 on success or the error from t4_init_ppod_region().
  * ENXIO means firmware didn't set up something that it was supposed to.
  */
 static int
 cxgbei_init(struct adapter *sc, struct cxgbei_data *ci)
 {
 	struct sysctl_oid *oid;
 	struct sysctl_oid_list *children;
 	struct ppod_region *pr;
 	uint32_t r;
 	int rc;
 
 	MPASS(sc->vres.iscsi.size > 0);
 	MPASS(ci != NULL);
 
 	pr = &ci->pr;
 	r = t4_read_reg(sc, A_ULP_RX_ISCSI_PSZ);
 	rc = t4_init_ppod_region(pr, &sc->vres.iscsi, r, "iSCSI page pods");
 	if (rc != 0) {
 		/* rc is a signed int, so it is printed with %d. */
 		device_printf(sc->dev,
 		    "%s: failed to initialize the iSCSI page pod region: %d.\n",
 		    __func__, rc);
 		return (rc);
 	}
 
 	read_pdu_limits(sc, &ci->max_tx_data_len, &ci->max_rx_data_len, pr);
 
 	/* All sysctls below live in ci->ctx so they can be freed together. */
 	sysctl_ctx_init(&ci->ctx);
 	oid = device_get_sysctl_tree(sc->dev);	/* dev.t5nex.X */
 	children = SYSCTL_CHILDREN(oid);
 
 	oid = SYSCTL_ADD_NODE(&ci->ctx, children, OID_AUTO, "iscsi",
 	    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "iSCSI ULP settings");
 	children = SYSCTL_CHILDREN(oid);
 
 	/* Default DDP threshold; adjustable at runtime via the sysctl. */
 	ci->ddp_threshold = 2048;
 	SYSCTL_ADD_UINT(&ci->ctx, children, OID_AUTO, "ddp_threshold",
 	    CTLFLAG_RW, &ci->ddp_threshold, 0, "Rx zero copy threshold");
 
 	SYSCTL_ADD_UINT(&ci->ctx, children, OID_AUTO, "max_rx_data_len",
 	    CTLFLAG_RW, &ci->max_rx_data_len, 0,
 	    "Maximum receive data segment length");
 	SYSCTL_ADD_UINT(&ci->ctx, children, OID_AUTO, "max_tx_data_len",
 	    CTLFLAG_RW, &ci->max_tx_data_len, 0,
 	    "Maximum transmit data segment length");
 
 	return (0);
 }
 
 static int
 do_rx_iscsi_hdr(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
 {
 	struct adapter *sc = iq->adapter;
 	struct cpl_iscsi_hdr *cpl = mtod(m, struct cpl_iscsi_hdr *);
 	u_int tid = GET_TID(cpl);
 	struct toepcb *toep = lookup_tid(sc, tid);
 	struct icl_pdu *ip;
 	struct icl_cxgbei_pdu *icp;
 	uint16_t len_ddp = be16toh(cpl->pdu_len_ddp);
 	uint16_t len = be16toh(cpl->len);
 
 	M_ASSERTPKTHDR(m);
 	MPASS(m->m_pkthdr.len == len + sizeof(*cpl));
 
 	ip = icl_cxgbei_new_pdu(M_NOWAIT);
 	if (ip == NULL)
 		CXGBE_UNIMPLEMENTED("PDU allocation failure");
 	m_copydata(m, sizeof(*cpl), ISCSI_BHS_SIZE, (caddr_t)ip->ip_bhs);
 	ip->ip_data_len = G_ISCSI_PDU_LEN(len_ddp) - len;
 	icp = ip_to_icp(ip);
 	icp->icp_seq = ntohl(cpl->seq);
 	icp->icp_flags = ICPF_RX_HDR;
 
 	/* This is the start of a new PDU.  There should be no old state. */
 	MPASS(toep->ulpcb2 == NULL);
 	toep->ulpcb2 = icp;
 
 #if 0
 	CTR5(KTR_CXGBE, "%s: tid %u, cpl->len %u, pdu_len_ddp 0x%04x, icp %p",
 	    __func__, tid, len, len_ddp, icp);
 #endif
 
 	m_freem(m);
 	return (0);
 }
 
 /*
  * Handler for CPL_ISCSI_DATA: attach a data segment delivered in a
  * freelist buffer to the PDU under assembly.  If no PDU is pending in
  * toep->ulpcb2, a new one is allocated and parked there so that the
  * completion handler can finish it.  The mbuf (minus the CPL header)
  * becomes the PDU's data and is not freed here.
  */
 static int
 do_rx_iscsi_data(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
 {
 	struct adapter *sc = iq->adapter;
 	struct cpl_iscsi_data *cpl =  mtod(m, struct cpl_iscsi_data *);
 	u_int tid = GET_TID(cpl);
 	struct toepcb *toep = lookup_tid(sc, tid);
 	struct icl_cxgbei_pdu *icp = toep->ulpcb2;
 	struct icl_pdu *pdu;
 
 	M_ASSERTPKTHDR(m);
 	MPASS(m->m_pkthdr.len == be16toh(cpl->len) + sizeof(*cpl));
 
 	if (icp != NULL) {
 		/* T5 mode, header is already received. */
 		MPASS(icp->icp_flags == ICPF_RX_HDR);
 		MPASS(icp->ip.ip_data_mbuf == NULL);
 		MPASS(icp->ip.ip_data_len == m->m_pkthdr.len - sizeof(*cpl));
 	} else {
 		/*
 		 * T6 completion enabled, start of a new pdu. Header
 		 * will come in completion CPL.
 		 */
 		pdu = icl_cxgbei_new_pdu(M_NOWAIT);
 		if (pdu == NULL)
 			CXGBE_UNIMPLEMENTED("PDU allocation failure");
 		icp = ip_to_icp(pdu);
 	}
 
 	/* Trim the cpl header from mbuf. */
 	m_adj(m, sizeof(*cpl));
 
 	icp->ip.ip_data_mbuf = m;
 	icp->icp_flags |= ICPF_RX_FLBUF;
 	toep->ofld_rxq->rx_iscsi_fl_pdus++;
 	toep->ofld_rxq->rx_iscsi_fl_octets += m->m_pkthdr.len;
 
 	/*
 	 * For T6, save the icp for further processing in the
 	 * completion handler.  Only a PDU created above has FLBUF as
 	 * its sole flag at this point.
 	 */
 	if (icp->icp_flags == ICPF_RX_FLBUF) {
 		MPASS(toep->ulpcb2 == NULL);
 		toep->ulpcb2 = icp;
 	}
 
 #if 0
 	CTR4(KTR_CXGBE, "%s: tid %u, cpl->len %u, icp %p", __func__, tid,
 	    be16toh(cpl->len), icp);
 #endif
 
 	return (0);
 }
 
 /*
  * m_apply() callback: fold one contiguous span of mbuf data into the
  * running CRC32C that arg points to.  Always returns 0.
  */
 static int
 mbuf_crc32c_helper(void *arg, void *data, u_int len)
 {
 	uint32_t *crc = arg;
 	uint32_t updated;
 
 	updated = calculate_crc32c(*crc, data, len);
 	*crc = updated;
 	return (0);
 }
 
 /*
  * Read one complete PDU out of the socket with soreceive() and return
  * it as a newly allocated icl_pdu bound to icc's connection.
  *
  * total_len is the number of bytes the caller observed in the socket
  * buffer; the PDU must fit entirely within it.  Header and data
  * digests are verified in software when the connection has them
  * enabled.
  *
  * Returns NULL (after logging a warning) if the PDU is truncated,
  * carries an AHS, fails a digest check, or cannot be read.
  */
 static struct icl_pdu *
 parse_pdu(struct socket *so, struct toepcb *toep, struct icl_cxgbei_conn *icc,
     struct sockbuf *sb, u_int total_len)
 {
 	struct uio uio;
 	struct iovec iov[2];
 	struct iscsi_bhs bhs;
 	struct mbuf *m;
 	struct icl_pdu *ip;
 	u_int ahs_len, data_len, header_len, pdu_len;
 	uint32_t calc_digest, wire_digest;
 	int error;
 
 	uio.uio_segflg = UIO_SYSSPACE;
 	uio.uio_rw = UIO_READ;
 	uio.uio_td = curthread;
 
 	header_len = sizeof(struct iscsi_bhs);
 	if (icc->ic.ic_header_crc32c)
 		header_len += ISCSI_HEADER_DIGEST_SIZE;
 
 	if (total_len < header_len) {
 		ICL_WARN("truncated pre-offload PDU with len %u", total_len);
 		return (NULL);
 	}
 
 	/*
 	 * Read the BHS into iov[0] and, when header digests are in use,
 	 * the digest into iov[1].  uio_iovcnt has to cover every iovec
 	 * that uio_resid can reach, so it is 2 in the digest case.
 	 */
 	iov[0].iov_base = &bhs;
 	iov[0].iov_len = sizeof(bhs);
 	iov[1].iov_base = &wire_digest;
 	iov[1].iov_len = sizeof(wire_digest);
 	uio.uio_iov = iov;
 	uio.uio_iovcnt = icc->ic.ic_header_crc32c ? 2 : 1;
 	uio.uio_offset = 0;
 	uio.uio_resid = header_len;
 	error = soreceive(so, NULL, &uio, NULL, NULL, NULL);
 	if (error != 0) {
 		ICL_WARN("failed to read BHS from pre-offload PDU: %d", error);
 		return (NULL);
 	}
 
 	/* The AHS length is in 4-byte words; the data length is 24 bits. */
 	ahs_len = bhs.bhs_total_ahs_len * 4;
 	data_len = bhs.bhs_data_segment_len[0] << 16 |
 	    bhs.bhs_data_segment_len[1] << 8 |
 	    bhs.bhs_data_segment_len[2];
 	pdu_len = header_len + ahs_len + roundup2(data_len, 4);
 	if (icc->ic.ic_data_crc32c && data_len != 0)
 		pdu_len += ISCSI_DATA_DIGEST_SIZE;
 
 	if (total_len < pdu_len) {
 		ICL_WARN("truncated pre-offload PDU len %u vs %u", total_len,
 		    pdu_len);
 		return (NULL);
 	}
 
 	if (ahs_len != 0) {
 		ICL_WARN("received pre-offload PDU with AHS");
 		return (NULL);
 	}
 
 	if (icc->ic.ic_header_crc32c) {
 		calc_digest = calculate_crc32c(0xffffffff, (caddr_t)&bhs,
 		    sizeof(bhs));
 		calc_digest ^= 0xffffffff;
 		if (calc_digest != wire_digest) {
 			ICL_WARN("received pre-offload PDU 0x%02x with "
 			    "invalid header digest (0x%x vs 0x%x)",
 			    bhs.bhs_opcode, wire_digest, calc_digest);
 			toep->ofld_rxq->rx_iscsi_header_digest_errors++;
 			return (NULL);
 		}
 	}
 
 	m = NULL;
 	if (data_len != 0) {
 		/*
 		 * Pull the padded data segment (and the data digest, if
 		 * any) as an mbuf chain rather than copying it out.
 		 */
 		uio.uio_iov = NULL;
 		uio.uio_resid = roundup2(data_len, 4);
 		if (icc->ic.ic_data_crc32c)
 			uio.uio_resid += ISCSI_DATA_DIGEST_SIZE;
 
 		error = soreceive(so, NULL, &uio, &m, NULL, NULL);
 		if (error != 0) {
 			ICL_WARN("failed to read data payload from "
 			    "pre-offload PDU: %d", error);
 			return (NULL);
 		}
 
 		if (icc->ic.ic_data_crc32c) {
 			/* The digest trails the padded data segment. */
 			m_copydata(m, roundup2(data_len, 4),
 			    sizeof(wire_digest), (caddr_t)&wire_digest);
 
 			calc_digest = 0xffffffff;
 			m_apply(m, 0, roundup2(data_len, 4), mbuf_crc32c_helper,
 			    &calc_digest);
 			calc_digest ^= 0xffffffff;
 			if (calc_digest != wire_digest) {
 				ICL_WARN("received pre-offload PDU 0x%02x "
 				    "with invalid data digest (0x%x vs 0x%x)",
 				    bhs.bhs_opcode, wire_digest, calc_digest);
 				toep->ofld_rxq->rx_iscsi_data_digest_errors++;
 				m_freem(m);
 				return (NULL);
 			}
 		}
 	}
 
 	ip = icl_cxgbei_new_pdu(M_WAITOK);
 	icl_cxgbei_new_pdu_set_conn(ip, &icc->ic);
 	*ip->ip_bhs = bhs;
 	ip->ip_data_len = data_len;
 	ip->ip_data_mbuf = m;
 	return (ip);
 }
 
 /*
  * Drain the PDUs currently sitting in the socket buffer, parsing each
  * one with parse_pdu() and placing them, in arrival order, at the
  * front of icc->rcvd_pdus.
  *
  * Called with the sockbuf lock held and returns with it held; the lock
  * is dropped around each parse_pdu() call.  On a parse failure the
  * connection's error callback is invoked and parsing stops.
  */
 void
 parse_pdus(struct icl_cxgbei_conn *icc, struct sockbuf *sb)
 {
 	struct icl_conn *ic = &icc->ic;
 	struct socket *so = ic->ic_socket;
 	struct toepcb *toep = icc->toep;
 	struct icl_pdu *pdu, *tail;
 	u_int avail;
 
 	SOCKBUF_LOCK_ASSERT(sb);
 
 	CTR3(KTR_CXGBE, "%s: tid %u, %u bytes in so_rcv", __func__, toep->tid,
 	    sbused(sb));
 
 	for (tail = NULL;
 	    sbused(sb) != 0 && (sb->sb_state & SBS_CANTRCVMORE) == 0;
 	    tail = pdu) {
 		avail = sbused(sb);
 		SOCKBUF_UNLOCK(sb);
 
 		pdu = parse_pdu(so, toep, icc, sb, avail);
 		if (pdu == NULL) {
 			ic->ic_error(ic);
 			SOCKBUF_LOCK(sb);
 			return;
 		}
 
 		/* The first PDU goes to the head; the rest chain after it. */
 		if (tail != NULL)
 			STAILQ_INSERT_AFTER(&icc->rcvd_pdus, tail, pdu,
 			    ip_next);
 		else
 			STAILQ_INSERT_HEAD(&icc->rcvd_pdus, pdu, ip_next);
 
 		SOCKBUF_LOCK(sb);
 	}
 }
 
 /*
  * Handler for CPL_RX_ISCSI_DDP: the status CPL that completes the PDU
  * being assembled in toep->ulpcb2.
  *
  * It records the status, counts any padding/digest errors reported in
  * ddpvld, advances tp->rcv_nxt past the PDU, and appends the PDU to
  * icc->rcvd_pdus, waking the receiver if it is not already active.
  * The PDU is freed instead if the inp has been dropped, if the
  * connection is gone or can no longer receive (the TCP connection is
  * then dropped with ECONNRESET), or if the hardware flagged an error
  * (ic_error is then called).  toep->ulpcb2 is cleared on every path.
  */
 static int
 do_rx_iscsi_ddp(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
 {
 	struct adapter *sc = iq->adapter;
 	const struct cpl_rx_data_ddp *cpl = (const void *)(rss + 1);
 	u_int tid = GET_TID(cpl);
 	struct toepcb *toep = lookup_tid(sc, tid);
 	struct inpcb *inp = toep->inp;
 	struct socket *so;
 	struct sockbuf *sb;
 	struct tcpcb *tp;
 	struct icl_cxgbei_conn *icc;
 	struct icl_conn *ic;
 	struct icl_cxgbei_pdu *icp = toep->ulpcb2;
 	struct icl_pdu *ip;
 	u_int pdu_len, val;
 	struct epoch_tracker et;
 
 	/* This CPL carries no payload; it is read from behind the RSS header. */
 	MPASS(m == NULL);
 
 	/* Must already be assembling a PDU. */
 	MPASS(icp != NULL);
 	MPASS(icp->icp_flags & ICPF_RX_HDR);	/* Data is optional. */
 	MPASS((icp->icp_flags & ICPF_RX_STATUS) == 0);
 
 	pdu_len = be16toh(cpl->len);	/* includes everything. */
 	val = be32toh(cpl->ddpvld);
 
 #if 0
 	CTR5(KTR_CXGBE,
 	    "%s: tid %u, cpl->len %u, ddpvld 0x%08x, icp_flags 0x%08x",
 	    __func__, tid, pdu_len, val, icp->icp_flags);
 #endif
 
 	icp->icp_flags |= ICPF_RX_STATUS;
 	ip = &icp->ip;
 	/* Errors are only logged and counted here; they are acted on below. */
 	if (val & F_DDP_PADDING_ERR) {
 		ICL_WARN("received PDU 0x%02x with invalid padding",
 		    ip->ip_bhs->bhs_opcode);
 		toep->ofld_rxq->rx_iscsi_padding_errors++;
 	}
 	if (val & F_DDP_HDRCRC_ERR) {
 		ICL_WARN("received PDU 0x%02x with invalid header digest",
 		    ip->ip_bhs->bhs_opcode);
 		toep->ofld_rxq->rx_iscsi_header_digest_errors++;
 	}
 	if (val & F_DDP_DATACRC_ERR) {
 		ICL_WARN("received PDU 0x%02x with invalid data digest",
 		    ip->ip_bhs->bhs_opcode);
 		toep->ofld_rxq->rx_iscsi_data_digest_errors++;
 	}
 	/* No freelist mbuf was attached, so the data was placed via DDP. */
 	if (val & F_DDP_PDU && ip->ip_data_mbuf == NULL) {
 		MPASS((icp->icp_flags & ICPF_RX_FLBUF) == 0);
 		MPASS(ip->ip_data_len > 0);
 		icp->icp_flags |= ICPF_RX_DDP;
 		toep->ofld_rxq->rx_iscsi_ddp_pdus++;
 		toep->ofld_rxq->rx_iscsi_ddp_octets += ip->ip_data_len;
 	}
 
 	INP_WLOCK(inp);
-	if (__predict_false(inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT))) {
+	if (__predict_false(inp->inp_flags & INP_DROPPED)) {
 		CTR4(KTR_CXGBE, "%s: tid %u, rx (%d bytes), inp_flags 0x%x",
 		    __func__, tid, pdu_len, inp->inp_flags);
 		INP_WUNLOCK(inp);
 		icl_cxgbei_conn_pdu_free(NULL, ip);
 		toep->ulpcb2 = NULL;
 		return (0);
 	}
 
 	/*
 	 * T6+ does not report data PDUs received via DDP without F
 	 * set.  This can result in gaps in the TCP sequence space.
 	 */
 	tp = intotcpcb(inp);
 	MPASS(chip_id(sc) >= CHELSIO_T6 || icp->icp_seq == tp->rcv_nxt);
 	tp->rcv_nxt = icp->icp_seq + pdu_len;
 	tp->t_rcvtime = ticks;
 
 	/*
 	 * Don't update the window size or return credits since RX
 	 * flow control is disabled.
 	 */
 
 	so = inp->inp_socket;
 	sb = &so->so_rcv;
 	SOCKBUF_LOCK(sb);
 
 	icc = toep->ulpcb;
 	if (__predict_false(icc == NULL || sb->sb_state & SBS_CANTRCVMORE)) {
 		CTR5(KTR_CXGBE,
 		    "%s: tid %u, excess rx (%d bytes), icc %p, sb_state 0x%x",
 		    __func__, tid, pdu_len, icc, sb->sb_state);
 		SOCKBUF_UNLOCK(sb);
 		INP_WUNLOCK(inp);
 
 		/*
 		 * Both locks are dropped above and the inp lock is retaken
 		 * inside the net epoch before calling tcp_drop().  The inp
 		 * is unlocked afterwards only if tcp_drop() returned a
 		 * non-NULL tcpcb.
 		 */
 		CURVNET_SET(so->so_vnet);
 		NET_EPOCH_ENTER(et);
 		INP_WLOCK(inp);
 		tp = tcp_drop(tp, ECONNRESET);
 		if (tp)
 			INP_WUNLOCK(inp);
 		NET_EPOCH_EXIT(et);
 		CURVNET_RESTORE();
 
 		icl_cxgbei_conn_pdu_free(NULL, ip);
 		toep->ulpcb2 = NULL;
 		return (0);
 	}
 	MPASS(icc->icc_signature == CXGBEI_CONN_SIGNATURE);
 	ic = &icc->ic;
 	/* Any hardware-reported error discards the PDU and fails the connection. */
 	if ((val & (F_DDP_PADDING_ERR | F_DDP_HDRCRC_ERR |
 	    F_DDP_DATACRC_ERR)) != 0) {
 		SOCKBUF_UNLOCK(sb);
 		INP_WUNLOCK(inp);
 
 		icl_cxgbei_conn_pdu_free(NULL, ip);
 		toep->ulpcb2 = NULL;
 		ic->ic_error(ic);
 		return (0);
 	}
 
 	icl_cxgbei_new_pdu_set_conn(ip, ic);
 
 	/* Queue the PDU and wake the receiver if it is idle. */
 	STAILQ_INSERT_TAIL(&icc->rcvd_pdus, ip, ip_next);
 	if (!icc->rx_active) {
 		icc->rx_active = true;
 		wakeup(&icc->rx_active);
 	}
 	SOCKBUF_UNLOCK(sb);
 	INP_WUNLOCK(inp);
 
 	toep->ulpcb2 = NULL;
 
 	return (0);
 }
 
 /*
  * Handler for CPL_RX_ISCSI_CMP: the completion CPL that carries the
  * BHS and status of a received PDU.
  *
  * The PDU is either the one already parked in toep->ulpcb2 by the data
  * handler or a new one allocated here.  The BHS is copied out of the
  * mbuf, errors reported in ddpvld are logged and counted, and, when
  * the data was placed via DDP, the PDU is rewritten to describe the
  * whole burst since the last completion for that transfer.  The PDU is
  * then appended to icc->rcvd_pdus and the receiver is woken if idle.
  *
  * On the early-exit paths (inp dropped, connection closing, hardware
  * error, socket can no longer receive) the PDU is freed instead.  The
  * mbuf is consumed and toep->ulpcb2 is cleared on every path.
  */
 static int
 do_rx_iscsi_cmp(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
 {
 	struct epoch_tracker et;
 	struct adapter *sc = iq->adapter;
 	struct cpl_rx_iscsi_cmp *cpl = mtod(m, struct cpl_rx_iscsi_cmp *);
 	u_int tid = GET_TID(cpl);
 	struct toepcb *toep = lookup_tid(sc, tid);
 	struct icl_cxgbei_pdu *icp = toep->ulpcb2;
 	struct icl_pdu *ip;
 	struct cxgbei_cmp *cmp;
 	struct inpcb *inp = toep->inp;
 #ifdef INVARIANTS
 	uint16_t len = be16toh(cpl->len);
 	u_int data_digest_len;
 #endif
 	struct socket *so;
 	struct sockbuf *sb;
 	struct tcpcb *tp;
 	struct icl_cxgbei_conn *icc;
 	struct icl_conn *ic;
 	struct iscsi_bhs_data_out *bhsdo;
 	u_int val = be32toh(cpl->ddpvld);
 	u_int npdus, pdu_len;
 	uint32_t prev_seg_len;
 
 	M_ASSERTPKTHDR(m);
 	MPASS(m->m_pkthdr.len == len + sizeof(*cpl));
 
 	/*
 	 * NOTE(review): ip is assigned only when F_DDP_PDU is clear or
 	 * when icp is NULL.  If F_DDP_PDU were set while a PDU is already
 	 * pending, ip would be used uninitialized below; presumably the
 	 * hardware never produces that combination -- confirm.
 	 */
 	if ((val & F_DDP_PDU) == 0) {
 		MPASS(icp != NULL);
 		MPASS((icp->icp_flags & ICPF_RX_STATUS) == 0);
 		ip = &icp->ip;
 	}
 
 	if (icp == NULL) {
 		/* T6 completion enabled, start of a new PDU. */
 		ip = icl_cxgbei_new_pdu(M_NOWAIT);
 		if (ip == NULL)
 			CXGBE_UNIMPLEMENTED("PDU allocation failure");
 		icp = ip_to_icp(ip);
 	}
 	pdu_len = G_ISCSI_PDU_LEN(be16toh(cpl->pdu_len_ddp));
 
 #if 0
 	CTR5(KTR_CXGBE,
 	    "%s: tid %u, cpl->len %u, ddpvld 0x%08x, icp %p",
 	    __func__, tid, pdu_len, val, icp);
 #endif
 
 	/* Copy header */
 	m_copydata(m, sizeof(*cpl), ISCSI_BHS_SIZE, (caddr_t)ip->ip_bhs);
 	bhsdo = (struct iscsi_bhs_data_out *)ip->ip_bhs;
 	/* The data segment length is a 24-bit big-endian field. */
 	ip->ip_data_len = bhsdo->bhsdo_data_segment_len[0] << 16 |
 	    bhsdo->bhsdo_data_segment_len[1] << 8 |
 	    bhsdo->bhsdo_data_segment_len[2];
 	icp->icp_seq = ntohl(cpl->seq);
 	icp->icp_flags |= ICPF_RX_HDR;
 	icp->icp_flags |= ICPF_RX_STATUS;
 
 	/* Errors are only logged and counted here; they are acted on below. */
 	if (val & F_DDP_PADDING_ERR) {
 		ICL_WARN("received PDU 0x%02x with invalid padding",
 		    ip->ip_bhs->bhs_opcode);
 		toep->ofld_rxq->rx_iscsi_padding_errors++;
 	}
 	if (val & F_DDP_HDRCRC_ERR) {
 		ICL_WARN("received PDU 0x%02x with invalid header digest",
 		    ip->ip_bhs->bhs_opcode);
 		toep->ofld_rxq->rx_iscsi_header_digest_errors++;
 	}
 	if (val & F_DDP_DATACRC_ERR) {
 		ICL_WARN("received PDU 0x%02x with invalid data digest",
 		    ip->ip_bhs->bhs_opcode);
 		toep->ofld_rxq->rx_iscsi_data_digest_errors++;
 	}
 
 	INP_WLOCK(inp);
-	if (__predict_false(inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT))) {
+	if (__predict_false(inp->inp_flags & INP_DROPPED)) {
 		CTR4(KTR_CXGBE, "%s: tid %u, rx (%d bytes), inp_flags 0x%x",
 		    __func__, tid, pdu_len, inp->inp_flags);
 		INP_WUNLOCK(inp);
 		icl_cxgbei_conn_pdu_free(NULL, ip);
 		toep->ulpcb2 = NULL;
 		m_freem(m);
 		return (0);
 	}
 
 	tp = intotcpcb(inp);
 
 	/*
 	 * If icc is NULL, the connection is being closed in
 	 * icl_cxgbei_conn_close(), just drop this data.
 	 */
 	icc = toep->ulpcb;
 	if (__predict_false(icc == NULL)) {
 		CTR4(KTR_CXGBE, "%s: tid %u, excess rx (%d bytes), icc %p",
 		    __func__, tid, pdu_len, icc);
 
 		/*
 		 * Update rcv_nxt so the sequence number of the FIN
 		 * doesn't appear wrong.
 		 */
 		tp->rcv_nxt = icp->icp_seq + pdu_len;
 		tp->t_rcvtime = ticks;
 		INP_WUNLOCK(inp);
 
 		icl_cxgbei_conn_pdu_free(NULL, ip);
 		toep->ulpcb2 = NULL;
 		m_freem(m);
 		return (0);
 	}
 
 	MPASS(icc->icc_signature == CXGBEI_CONN_SIGNATURE);
 	ic = &icc->ic;
 	/* Any hardware-reported error discards the PDU and fails the connection. */
 	if ((val & (F_DDP_PADDING_ERR | F_DDP_HDRCRC_ERR |
 	    F_DDP_DATACRC_ERR)) != 0) {
 		INP_WUNLOCK(inp);
 
 		icl_cxgbei_conn_pdu_free(NULL, ip);
 		toep->ulpcb2 = NULL;
 		m_freem(m);
 		ic->ic_error(ic);
 		return (0);
 	}
 
 #ifdef INVARIANTS
 	data_digest_len = (icc->ulp_submode & ULP_CRC_DATA) ?
 	    ISCSI_DATA_DIGEST_SIZE : 0;
 	MPASS(roundup2(ip->ip_data_len, 4) == pdu_len - len - data_digest_len);
 #endif
 
 	/* No freelist mbuf was attached, so the data was placed via DDP. */
 	if (val & F_DDP_PDU && ip->ip_data_mbuf == NULL) {
 		MPASS((icp->icp_flags & ICPF_RX_FLBUF) == 0);
 		MPASS(ip->ip_data_len > 0);
 		icp->icp_flags |= ICPF_RX_DDP;
 		bhsdo = (struct iscsi_bhs_data_out *)ip->ip_bhs;
 
 		/*
 		 * Look up the per-transfer completion state.  Data-In PDUs
 		 * are keyed by the initiator task tag and Data-Out PDUs by
 		 * the target transfer tag.
 		 */
 		switch (ip->ip_bhs->bhs_opcode & ~ISCSI_BHS_OPCODE_IMMEDIATE) {
 		case ISCSI_BHS_OPCODE_SCSI_DATA_IN:
 			cmp = cxgbei_find_cmp(icc,
 			    be32toh(bhsdo->bhsdo_initiator_task_tag));
 			break;
 		case ISCSI_BHS_OPCODE_SCSI_DATA_OUT:
 			cmp = cxgbei_find_cmp(icc,
 			    be32toh(bhsdo->bhsdo_target_transfer_tag));
 			break;
 		default:
 			__assert_unreachable();
 		}
 		MPASS(cmp != NULL);
 
 		/*
 		 * The difference between the end of the last burst
 		 * and the offset of the last PDU in this burst is
 		 * the additional data received via DDP.
 		 */
 		prev_seg_len = be32toh(bhsdo->bhsdo_buffer_offset) -
 		    cmp->next_buffer_offset;
 
 		if (prev_seg_len != 0) {
 			uint32_t orig_datasn;
 
 			/*
 			 * Return a "large" PDU representing the burst
 			 * of PDUs.  Adjust the offset and length of
 			 * this PDU to represent the entire burst.
 			 */
 			ip->ip_data_len += prev_seg_len;
 			bhsdo->bhsdo_data_segment_len[2] = ip->ip_data_len;
 			bhsdo->bhsdo_data_segment_len[1] = ip->ip_data_len >> 8;
 			bhsdo->bhsdo_data_segment_len[0] = ip->ip_data_len >> 16;
 			bhsdo->bhsdo_buffer_offset =
 			    htobe32(cmp->next_buffer_offset);
 
 			/*
 			 * The DataSN gap since the last completion gives
 			 * the number of PDUs folded into this one.
 			 */
 			orig_datasn = htobe32(bhsdo->bhsdo_datasn);
 			npdus = orig_datasn - cmp->last_datasn;
 			bhsdo->bhsdo_datasn = htobe32(cmp->last_datasn + 1);
 			cmp->last_datasn = orig_datasn;
 			ip->ip_additional_pdus = npdus - 1;
 		} else {
 			MPASS(htobe32(bhsdo->bhsdo_datasn) ==
 			    cmp->last_datasn + 1);
 			npdus = 1;
 			cmp->last_datasn = htobe32(bhsdo->bhsdo_datasn);
 		}
 
 		cmp->next_buffer_offset += ip->ip_data_len;
 		toep->ofld_rxq->rx_iscsi_ddp_pdus += npdus;
 		toep->ofld_rxq->rx_iscsi_ddp_octets += ip->ip_data_len;
 	} else {
 		MPASS(icp->icp_flags & (ICPF_RX_FLBUF));
 		MPASS(ip->ip_data_len == ip->ip_data_mbuf->m_pkthdr.len);
 	}
 
 	tp->rcv_nxt = icp->icp_seq + pdu_len;
 	tp->t_rcvtime = ticks;
 
 	/*
 	 * Don't update the window size or return credits since RX
 	 * flow control is disabled.
 	 */
 
 	so = inp->inp_socket;
 	sb = &so->so_rcv;
 	SOCKBUF_LOCK(sb);
 	if (__predict_false(sb->sb_state & SBS_CANTRCVMORE)) {
 		CTR5(KTR_CXGBE,
 		    "%s: tid %u, excess rx (%d bytes), icc %p, sb_state 0x%x",
 		    __func__, tid, pdu_len, icc, sb->sb_state);
 		SOCKBUF_UNLOCK(sb);
 		INP_WUNLOCK(inp);
 
 		/*
 		 * Both locks are dropped above and the inp lock is retaken
 		 * inside the net epoch before calling tcp_drop().  The inp
 		 * is unlocked afterwards only if tcp_drop() returned a
 		 * non-NULL tcpcb.
 		 */
 		CURVNET_SET(so->so_vnet);
 		NET_EPOCH_ENTER(et);
 		INP_WLOCK(inp);
 		tp = tcp_drop(tp, ECONNRESET);
 		if (tp != NULL)
 			INP_WUNLOCK(inp);
 		NET_EPOCH_EXIT(et);
 		CURVNET_RESTORE();
 
 		icl_cxgbei_conn_pdu_free(NULL, ip);
 		toep->ulpcb2 = NULL;
 		m_freem(m);
 		return (0);
 	}
 
 	icl_cxgbei_new_pdu_set_conn(ip, ic);
 
 	/* Enqueue the PDU to the received pdus queue. */
 	STAILQ_INSERT_TAIL(&icc->rcvd_pdus, ip, ip_next);
 	if (!icc->rx_active) {
 		icc->rx_active = true;
 		wakeup(&icc->rx_active);
 	}
 	SOCKBUF_UNLOCK(sb);
 	INP_WUNLOCK(inp);
 
 	toep->ulpcb2 = NULL;
 	m_freem(m);
 
 	return (0);
 }
 
 /*
  * ULD activate hook: allocate and initialize the per-adapter iSCSI
  * state and attach it to sc->iscsi_ulp_softc.
  *
  * Returns 0 on success (or if the ULD is already active), ENOSYS if
  * the adapter has no iSCSI capability or resources, or the error from
  * cxgbei_init().
  */
 static int
 cxgbei_activate(struct adapter *sc)
 {
 	struct cxgbei_data *ci;
 	int rc;
 
 	ASSERT_SYNCHRONIZED_OP(sc);
 
 	if (uld_active(sc, ULD_ISCSI)) {
 		KASSERT(0, ("%s: iSCSI offload already enabled on adapter %p",
 		    __func__, sc));
 		return (0);
 	}
 
 	if (sc->iscsicaps == 0 || sc->vres.iscsi.size == 0) {
 		device_printf(sc->dev,
 		    "not iSCSI offload capable, or capability disabled.\n");
 		return (ENOSYS);
 	}
 
 	/*
 	 * per-adapter softc for iSCSI.  An M_WAITOK allocation sleeps
 	 * until it succeeds, so there is no NULL return to handle.
 	 */
 	ci = malloc(sizeof(*ci), M_CXGBE, M_ZERO | M_WAITOK);
 
 	rc = cxgbei_init(sc, ci);
 	if (rc != 0) {
 		free(ci, M_CXGBE);
 		return (rc);
 	}
 
 	sc->iscsi_ulp_softc = ci;
 
 	return (0);
 }
 
 /*
  * ULD deactivate hook: release the per-adapter iSCSI state, if any,
  * and detach it from the adapter.  Always returns 0.
  */
 static int
 cxgbei_deactivate(struct adapter *sc)
 {
 	struct cxgbei_data *softc = sc->iscsi_ulp_softc;
 
 	ASSERT_SYNCHRONIZED_OP(sc);
 
 	/* Nothing was activated on this adapter. */
 	if (softc == NULL)
 		return (0);
 
 	sysctl_ctx_free(&softc->ctx);
 	t4_free_ppod_region(&softc->pr);
 	free(softc, M_CXGBE);
 	sc->iscsi_ulp_softc = NULL;
 
 	return (0);
 }
 
 /*
  * t4_iterate() callback: activate the iSCSI ULD on one adapter.  The
  * adapter is skipped if a synchronized operation cannot be started.
  */
 static void
 cxgbei_activate_all(struct adapter *sc, void *arg __unused)
 {
 
 	if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK,
 	    "t4isact") == 0) {
 		/*
 		 * Activate iSCSI if any port on this adapter has
 		 * IFCAP_TOE enabled.
 		 */
 		if (sc->offload_map && !uld_active(sc, ULD_ISCSI))
 			(void) t4_activate_uld(sc, ULD_ISCSI);
 
 		end_synchronized_op(sc, 0);
 	}
 }
 
 /*
  * t4_iterate() callback: deactivate the iSCSI ULD on one adapter if it
  * is active.  The adapter is skipped if a synchronized operation
  * cannot be started.
  */
 static void
 cxgbei_deactivate_all(struct adapter *sc, void *arg __unused)
 {
 
 	if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK,
 	    "t4isdea") == 0) {
 		if (uld_active(sc, ULD_ISCSI))
 			(void) t4_deactivate_uld(sc, ULD_ISCSI);
 
 		end_synchronized_op(sc, 0);
 	}
 }
 
 /*
  * ULD descriptor for iSCSI.  It is registered with t4_register_uld()
  * at module load and supplies the per-adapter activate/deactivate hooks.
  */
 static struct uld_info cxgbei_uld_info = {
 	.uld_id = ULD_ISCSI,
 	.activate = cxgbei_activate,
 	.deactivate = cxgbei_deactivate,
 };
 
 static int
 cxgbei_mod_load(void)
 {
 	int rc;
 
 	t4_register_cpl_handler(CPL_ISCSI_HDR, do_rx_iscsi_hdr);
 	t4_register_cpl_handler(CPL_ISCSI_DATA, do_rx_iscsi_data);
 	t4_register_cpl_handler(CPL_RX_ISCSI_DDP, do_rx_iscsi_ddp);
 	t4_register_cpl_handler(CPL_RX_ISCSI_CMP, do_rx_iscsi_cmp);
 
 	rc = t4_register_uld(&cxgbei_uld_info);
 	if (rc != 0)
 		return (rc);
 
 	t4_iterate(cxgbei_activate_all, NULL);
 
 	return (rc);
 }
 
 /*
  * Module unload: deactivate the ULD on every adapter, unregister it,
  * and remove the four iSCSI receive CPL handlers.
  *
  * Returns EBUSY, leaving the handlers installed, if the ULD cannot be
  * unregistered because it is busy.  Any other result from
  * t4_unregister_uld() is treated as success.
  */
 static int
 cxgbei_mod_unload(void)
 {
 
 	t4_iterate(cxgbei_deactivate_all, NULL);
 
 	if (t4_unregister_uld(&cxgbei_uld_info) == EBUSY)
 		return (EBUSY);
 
 	/* Registering NULL clears each handler. */
 	t4_register_cpl_handler(CPL_ISCSI_HDR, NULL);
 	t4_register_cpl_handler(CPL_ISCSI_DATA, NULL);
 	t4_register_cpl_handler(CPL_RX_ISCSI_DDP, NULL);
 	t4_register_cpl_handler(CPL_RX_ISCSI_CMP, NULL);
 
 	return (0);
 }
 #endif
 
 /*
  * Module event handler.
  *
  * With TCP_OFFLOAD, MOD_LOAD loads the ULD part and then the ICL part,
  * and MOD_UNLOAD unloads them in the reverse order; the second step
  * runs only if the first succeeded.  Any other event yields EINVAL.
  * Without TCP_OFFLOAD every event fails with EOPNOTSUPP.
  */
 static int
 cxgbei_modevent(module_t mod, int cmd, void *arg)
 {
 	int rc = 0;
 
 #ifdef TCP_OFFLOAD
 	switch (cmd) {
 	case MOD_LOAD:
 		rc = cxgbei_mod_load();
 		if (rc == 0)
 			rc = icl_cxgbei_mod_load();
 		break;
 
 	case MOD_UNLOAD:
 		rc = icl_cxgbei_mod_unload();
 		if (rc == 0)
 			rc = cxgbei_mod_unload();
 		break;
 
 	default:
 		rc = EINVAL;
 	}
 #else
 	printf("cxgbei: compiled without TCP_OFFLOAD support.\n");
 	rc = EOPNOTSUPP;
 #endif
 
 	return (rc);
 }
 
 /* Module descriptor: name, event handler, and no private data. */
 static moduledata_t cxgbei_mod = {
 	"cxgbei",
 	cxgbei_modevent,
 	NULL,
 };
 
 /* Declare the module and its dependencies on t4_tom, cxgbe, and icl. */
 MODULE_VERSION(cxgbei, 1);
 DECLARE_MODULE(cxgbei, cxgbei_mod, SI_SUB_EXEC, SI_ORDER_ANY);
 MODULE_DEPEND(cxgbei, t4_tom, 1, 1, 1);
 MODULE_DEPEND(cxgbei, cxgbe, 1, 1, 1);
 MODULE_DEPEND(cxgbei, icl, 1, 1, 1);
diff --git a/sys/dev/cxgbe/cxgbei/icl_cxgbei.c b/sys/dev/cxgbe/cxgbei/icl_cxgbei.c
index 1b896516d546..82201b358e91 100644
--- a/sys/dev/cxgbe/cxgbei/icl_cxgbei.c
+++ b/sys/dev/cxgbe/cxgbei/icl_cxgbei.c
@@ -1,1880 +1,1879 @@
 /*-
  * Copyright (c) 2012 The FreeBSD Foundation
  * Copyright (c) 2015 Chelsio Communications, Inc.
  * All rights reserved.
  *
  * This software was developed by Edward Tomasz Napierala under sponsorship
  * from the FreeBSD Foundation.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  */
 
 /*
  * cxgbei implementation of iSCSI Common Layer kobj(9) interface.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 
 #ifdef TCP_OFFLOAD
 #include <sys/param.h>
 #include <sys/bio.h>
 #include <sys/capsicum.h>
 #include <sys/condvar.h>
 #include <sys/conf.h>
 #include <sys/file.h>
 #include <sys/kernel.h>
 #include <sys/kthread.h>
 #include <sys/ktr.h>
 #include <sys/lock.h>
 #include <sys/mbuf.h>
 #include <sys/mutex.h>
 #include <sys/module.h>
 #include <sys/protosw.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sysctl.h>
 #include <sys/systm.h>
 #include <sys/sx.h>
 #include <sys/uio.h>
 #include <machine/bus.h>
 #include <vm/vm.h>
 #include <vm/vm_page.h>
 #include <vm/pmap.h>
 #include <netinet/in.h>
 #include <netinet/in_pcb.h>
 #include <netinet/tcp.h>
 #include <netinet/tcp_var.h>
 #include <netinet/toecore.h>
 
 #include <dev/iscsi/icl.h>
 #include <dev/iscsi/iscsi_proto.h>
 #include <icl_conn_if.h>
 
 #include <cam/scsi/scsi_all.h>
 #include <cam/scsi/scsi_da.h>
 #include <cam/ctl/ctl_io.h>
 #include <cam/ctl/ctl.h>
 #include <cam/ctl/ctl_backend.h>
 #include <cam/ctl/ctl_error.h>
 #include <cam/ctl/ctl_frontend.h>
 #include <cam/ctl/ctl_debug.h>
 #include <cam/ctl/ctl_ha.h>
 #include <cam/ctl/ctl_ioctl.h>
 
 #include <cam/cam.h>
 #include <cam/cam_ccb.h>
 #include <cam/cam_xpt.h>
 #include <cam/cam_debug.h>
 #include <cam/cam_sim.h>
 #include <cam/cam_xpt_sim.h>
 #include <cam/cam_xpt_periph.h>
 #include <cam/cam_periph.h>
 #include <cam/cam_compat.h>
 #include <cam/scsi/scsi_message.h>
 
 #include "common/common.h"
 #include "common/t4_regs.h"
 #include "common/t4_tcb.h"
 #include "tom/t4_tom.h"
 #include "cxgbei.h"
 
 /*
  * Use the page pod tag for the TT hash: the tag extracted from 'tt' by
  * G_PPOD_TAG() is masked with the connection's cmp_hash_mask.
  */
 #define	TT_HASH(icc, tt)	(G_PPOD_TAG(tt) & (icc)->cmp_hash_mask)
 
 /*
  * Bundles a page pod reservation with a cxgbei_cmp entry.
  * NOTE(review): presumably one of these is allocated per DDP transfer by
  * the task/transfer setup methods -- confirm against those functions.
  */
 struct cxgbei_ddp_state {
 	struct ppod_reservation prsv;
 	struct cxgbei_cmp cmp;
 };
 
 /* malloc(9) type used in this file for PDUs and the cmp hash table. */
 static MALLOC_DEFINE(M_CXGBEI, "cxgbei", "cxgbei(4)");
 
 /* Knobs below live under the kern.icl.cxgbei sysctl node. */
 SYSCTL_NODE(_kern_icl, OID_AUTO, cxgbei, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
     "Chelsio iSCSI offload");
 static int first_burst_length = 8192;
 SYSCTL_INT(_kern_icl_cxgbei, OID_AUTO, first_burst_length, CTLFLAG_RWTUN,
     &first_burst_length, 0, "First burst length");
 static int max_burst_length = 2 * 1024 * 1024;
 SYSCTL_INT(_kern_icl_cxgbei, OID_AUTO, max_burst_length, CTLFLAG_RWTUN,
     &max_burst_length, 0, "Maximum burst length");
 /* Lower bound for the send buffer reserved in icl_cxgbei_setsockopt(). */
 static int sendspace = 1048576;
 SYSCTL_INT(_kern_icl_cxgbei, OID_AUTO, sendspace, CTLFLAG_RWTUN,
     &sendspace, 0, "Default send socket buffer size");
 /* Lower bound for the receive buffer reserved in icl_cxgbei_setsockopt(). */
 static int recvspace = 1048576;
 SYSCTL_INT(_kern_icl_cxgbei, OID_AUTO, recvspace, CTLFLAG_RWTUN,
     &recvspace, 0, "Default receive socket buffer size");
 
 /*
  * Reference count acquired in icl_cxgbei_new_conn() and released in
  * icl_cxgbei_conn_free().
  */
 static volatile u_int icl_cxgbei_ncons;
 
 /*
  * Prototypes for the icl_conn kobj methods implemented in this file,
  * declared through the method typedefs from icl_conn_if.h.
  */
 static icl_conn_new_pdu_t	icl_cxgbei_conn_new_pdu;
 static icl_conn_pdu_data_segment_length_t
 				    icl_cxgbei_conn_pdu_data_segment_length;
 static icl_conn_pdu_append_bio_t	icl_cxgbei_conn_pdu_append_bio;
 static icl_conn_pdu_append_data_t	icl_cxgbei_conn_pdu_append_data;
 static icl_conn_pdu_get_bio_t	icl_cxgbei_conn_pdu_get_bio;
 static icl_conn_pdu_get_data_t	icl_cxgbei_conn_pdu_get_data;
 static icl_conn_pdu_queue_t	icl_cxgbei_conn_pdu_queue;
 static icl_conn_pdu_queue_cb_t	icl_cxgbei_conn_pdu_queue_cb;
 static icl_conn_handoff_t	icl_cxgbei_conn_handoff;
 static icl_conn_free_t		icl_cxgbei_conn_free;
 static icl_conn_close_t		icl_cxgbei_conn_close;
 static icl_conn_task_setup_t	icl_cxgbei_conn_task_setup;
 static icl_conn_task_done_t	icl_cxgbei_conn_task_done;
 static icl_conn_transfer_setup_t	icl_cxgbei_conn_transfer_setup;
 static icl_conn_transfer_done_t	icl_cxgbei_conn_transfer_done;
 
 /* kobj method table binding each icl_conn method to its cxgbei version. */
 static kobj_method_t icl_cxgbei_methods[] = {
 	KOBJMETHOD(icl_conn_new_pdu, icl_cxgbei_conn_new_pdu),
 	KOBJMETHOD(icl_conn_pdu_free, icl_cxgbei_conn_pdu_free),
 	KOBJMETHOD(icl_conn_pdu_data_segment_length,
 	    icl_cxgbei_conn_pdu_data_segment_length),
 	KOBJMETHOD(icl_conn_pdu_append_bio, icl_cxgbei_conn_pdu_append_bio),
 	KOBJMETHOD(icl_conn_pdu_append_data, icl_cxgbei_conn_pdu_append_data),
 	KOBJMETHOD(icl_conn_pdu_get_bio, icl_cxgbei_conn_pdu_get_bio),
 	KOBJMETHOD(icl_conn_pdu_get_data, icl_cxgbei_conn_pdu_get_data),
 	KOBJMETHOD(icl_conn_pdu_queue, icl_cxgbei_conn_pdu_queue),
 	KOBJMETHOD(icl_conn_pdu_queue_cb, icl_cxgbei_conn_pdu_queue_cb),
 	KOBJMETHOD(icl_conn_handoff, icl_cxgbei_conn_handoff),
 	KOBJMETHOD(icl_conn_free, icl_cxgbei_conn_free),
 	KOBJMETHOD(icl_conn_close, icl_cxgbei_conn_close),
 	KOBJMETHOD(icl_conn_task_setup, icl_cxgbei_conn_task_setup),
 	KOBJMETHOD(icl_conn_task_done, icl_cxgbei_conn_task_done),
 	KOBJMETHOD(icl_conn_transfer_setup, icl_cxgbei_conn_transfer_setup),
 	KOBJMETHOD(icl_conn_transfer_done, icl_cxgbei_conn_transfer_done),
 	{ 0, 0 }
 };
 
 /* Instances of this kobj class are sized as struct icl_cxgbei_conn. */
 DEFINE_CLASS(icl_cxgbei, icl_cxgbei_methods, sizeof(struct icl_cxgbei_conn));
 
 /*
  * Release a PDU: free its AHS, data and BHS mbufs and drop one reference
  * on the icl_cxgbei_pdu.  The structure itself is freed only by whoever
  * drops the last reference.
  */
 void
 icl_cxgbei_conn_pdu_free(struct icl_conn *ic, struct icl_pdu *ip)
 {
 	struct icl_cxgbei_pdu *icp = ip_to_icp(ip);
 
 	KASSERT(icp->ref_cnt != 0, ("freeing deleted PDU"));
 	MPASS(icp->icp_signature == CXGBEI_PDU_SIGNATURE);
 	MPASS(ic == ip->ip_conn);
 
 	m_freem(ip->ip_ahs_mbuf);
 	m_freem(ip->ip_data_mbuf);
 	m_freem(ip->ip_bhs_mbuf);
 
 	KASSERT(ic != NULL || icp->ref_cnt == 1,
 	    ("orphaned PDU has oustanding references"));
 
 	/* fetchadd returns the old value; 1 means this was the last ref. */
 	if (atomic_fetchadd_int(&icp->ref_cnt, -1) == 1) {
 		free(icp, M_CXGBEI);
 #ifdef DIAGNOSTIC
 		if (__predict_true(ic != NULL))
 			refcount_release(&ic->ic_outstanding_pdus);
 #endif
 	}
 }
 
 /*
  * Final teardown of a PDU whose reference count has dropped to zero:
  * invoke the completion callback (if one was set) with the recorded
  * error, then free the icl_cxgbei_pdu.  Under DIAGNOSTIC the owning
  * connection's outstanding-PDU count is released as well.
  */
 static void
 icl_cxgbei_pdu_call_cb(struct icl_pdu *ip)
 {
 	struct icl_cxgbei_pdu *icp = ip_to_icp(ip);
 
 	MPASS(icp->icp_signature == CXGBEI_PDU_SIGNATURE);
 
 	if (icp->cb != NULL)
 		icp->cb(ip, icp->error);
 #ifdef DIAGNOSTIC
 	if (__predict_true(ip->ip_conn != NULL))
 		refcount_release(&ip->ip_conn->ic_outstanding_pdus);
 #endif
 	free(icp, M_CXGBEI);
 }
 
 static void
 icl_cxgbei_pdu_done(struct icl_pdu *ip, int error)
 {
 	struct icl_cxgbei_pdu *icp = ip_to_icp(ip);
 
 	if (error != 0)
 		icp->error = error;
 
 	m_freem(ip->ip_ahs_mbuf);
 	ip->ip_ahs_mbuf = NULL;
 	m_freem(ip->ip_data_mbuf);
 	ip->ip_data_mbuf = NULL;
 	m_freem(ip->ip_bhs_mbuf);
 	ip->ip_bhs_mbuf = NULL;
 
 	/*
 	 * All other references to this PDU should have been dropped
 	 * by the m_freem() of ip_data_mbuf.
 	 */
 	if (atomic_fetchadd_int(&icp->ref_cnt, -1) == 1)
 		icl_cxgbei_pdu_call_cb(ip);
 	else
 		__assert_unreachable();
 }
 
 /*
  * External-storage free routine for mbufs set up with m_extaddref() in
  * icl_cxgbei_conn_pdu_append_data(); ext_arg1 carries the owning PDU.
  */
 static void
 icl_cxgbei_mbuf_done(struct mbuf *mb)
 {
 	struct icl_cxgbei_pdu *icp;
 
 	icp = mb->m_ext.ext_arg1;
 
 	/*
 	 * NB: ref_cnt is not decremented here.  mb_free_mext() may take
 	 * its fast path in the ref_cnt check and leave the count at 1.
 	 */
 	icl_cxgbei_pdu_call_cb(&icp->ip);
 }
 
 /*
  * Allocate a zeroed icl_cxgbei_pdu holding one reference, together with a
  * packet-header mbuf containing a zeroed BHS.  'flags' is handed to both
  * malloc() and m_gethdr().  Returns the embedded icl_pdu, or NULL if
  * either allocation fails.
  */
 struct icl_pdu *
 icl_cxgbei_new_pdu(int flags)
 {
 	struct icl_cxgbei_pdu *icp;
 	struct mbuf *bhs_mbuf;
 
 	icp = malloc(sizeof(*icp), M_CXGBEI, flags | M_ZERO);
 	if (__predict_false(icp == NULL))
 		return (NULL);
 	icp->icp_signature = CXGBEI_PDU_SIGNATURE;
 	icp->ref_cnt = 1;
 
 	bhs_mbuf = m_gethdr(flags, MT_DATA);
 	if (__predict_false(bhs_mbuf == NULL)) {
 		free(icp, M_CXGBEI);
 		return (NULL);
 	}
 
 	/* The header mbuf holds exactly one zeroed BHS. */
 	icp->ip.ip_bhs_mbuf = bhs_mbuf;
 	icp->ip.ip_bhs = mtod(bhs_mbuf, struct iscsi_bhs *);
 	memset(icp->ip.ip_bhs, 0, sizeof(*icp->ip.ip_bhs));
 	bhs_mbuf->m_len = sizeof(struct iscsi_bhs);
 	bhs_mbuf->m_pkthdr.len = bhs_mbuf->m_len;
 
 	return (&icp->ip);
 }
 
 /*
  * Attach a freshly allocated PDU to connection 'ic'.  Under DIAGNOSTIC
  * this also counts the PDU in ic_outstanding_pdus.
  */
 void
 icl_cxgbei_new_pdu_set_conn(struct icl_pdu *ip, struct icl_conn *ic)
 {
 
 	ip->ip_conn = ic;
 #ifdef DIAGNOSTIC
 	refcount_acquire(&ic->ic_outstanding_pdus);
 #endif
 }
 
 /*
  * icl_conn_new_pdu method: hand the caller a PDU with an empty BHS that is
  * already bound to 'ic'.  Returns NULL if the allocation fails.
  */
 static struct icl_pdu *
 icl_cxgbei_conn_new_pdu(struct icl_conn *ic, int flags)
 {
 	struct icl_pdu *pdu;
 
 	pdu = icl_cxgbei_new_pdu(flags);
 	if (__predict_true(pdu != NULL))
 		icl_cxgbei_new_pdu_set_conn(pdu, ic);
 
 	return (pdu);
 }
 
 static size_t
 icl_pdu_data_segment_length(const struct icl_pdu *request)
 {
 	uint32_t len = 0;
 
 	len += request->ip_bhs->bhs_data_segment_len[0];
 	len <<= 8;
 	len += request->ip_bhs->bhs_data_segment_len[1];
 	len <<= 8;
 	len += request->ip_bhs->bhs_data_segment_len[2];
 
 	return (len);
 }
 
 /*
  * icl_conn_pdu_data_segment_length method: returns the data segment length
  * encoded in the BHS of 'request'.  'ic' is not used.
  */
 size_t
 icl_cxgbei_conn_pdu_data_segment_length(struct icl_conn *ic,
     const struct icl_pdu *request)
 {
 
 	return (icl_pdu_data_segment_length(request));
 }
 
 /*
  * Turn a queued PDU into a single mbuf chain (BHS mbuf followed by the
  * data mbufs).  The data segment is zero-padded to a multiple of four
  * bytes, the data segment length is written into the BHS, and the header
  * mbuf is tagged with the ULP submode.  If the payload exceeds
  * ic_max_send_data_segment_length the header mbuf is also tagged with ISO
  * parameters.  The mbufs are detached from the PDU and the PDU's own
  * reference on icp is dropped before the chain is returned.
  */
 static struct mbuf *
 finalize_pdu(struct icl_cxgbei_conn *icc, struct icl_cxgbei_pdu *icp)
 {
 	struct icl_pdu *ip = &icp->ip;
 	uint8_t ulp_submode, padding;
 	struct mbuf *m, *last;
 	struct iscsi_bhs *bhs;
 	int data_len;
 
 	/*
 	 * Fix up the data segment mbuf first.
 	 */
 	m = ip->ip_data_mbuf;
 	ulp_submode = icc->ulp_submode;
 	if (m != NULL) {
 		last = m_last(m);
 
 		/*
 		 * Round up the data segment to a 4B boundary.	Pad with 0 if
 		 * necessary.  There will definitely be room in the mbuf.
 		 */
 		padding = roundup2(ip->ip_data_len, 4) - ip->ip_data_len;
 		if (padding != 0) {
 			MPASS(padding <= M_TRAILINGSPACE(last));
 			bzero(mtod(last, uint8_t *) + last->m_len, padding);
 			last->m_len += padding;
 		}
 	} else {
 		/* No payload: clear the data CRC bit for this PDU. */
 		MPASS(ip->ip_data_len == 0);
 		ulp_submode &= ~ULP_CRC_DATA;
 		padding = 0;
 	}
 
 	/*
 	 * Now the header mbuf that has the BHS.
 	 */
 	m = ip->ip_bhs_mbuf;
 	MPASS(m->m_pkthdr.len == sizeof(struct iscsi_bhs));
 	MPASS(m->m_len == sizeof(struct iscsi_bhs));
 
 	bhs = ip->ip_bhs;
 	data_len = ip->ip_data_len;
 	if (data_len > icc->ic.ic_max_send_data_segment_length) {
 		struct iscsi_bhs_data_in *bhsdi;
 		int flags;
 
 		/*
 		 * Payload is larger than one PDU may carry: mark the mbuf
 		 * for ISO and advertise the per-PDU size instead.
 		 */
 		KASSERT(padding == 0, ("%s: ISO with padding %d for icp %p",
 		    __func__, padding, icp));
 		switch (bhs->bhs_opcode) {
 		case ISCSI_BHS_OPCODE_SCSI_DATA_OUT:
 			flags = 1;
 			break;
 		case ISCSI_BHS_OPCODE_SCSI_DATA_IN:
 			flags = 2;
 			break;
 		default:
 			panic("invalid opcode %#x for ISO", bhs->bhs_opcode);
 		}
 		data_len = icc->ic.ic_max_send_data_segment_length;
 		bhsdi = (struct iscsi_bhs_data_in *)bhs;
 		if (bhsdi->bhsdi_flags & BHSDI_FLAGS_F) {
 			/*
 			 * Firmware will set F on the final PDU in the
 			 * burst.
 			 */
 			flags |= CXGBE_ISO_F;
 			bhsdi->bhsdi_flags &= ~BHSDI_FLAGS_F;
 		}
 		set_mbuf_iscsi_iso(m, true);
 		set_mbuf_iscsi_iso_flags(m, flags);
 		set_mbuf_iscsi_iso_mss(m, data_len);
 	}
 
 	/* Store data_len as three bytes, most significant byte first. */
 	bhs->bhs_data_segment_len[2] = data_len;
 	bhs->bhs_data_segment_len[1] = data_len >> 8;
 	bhs->bhs_data_segment_len[0] = data_len >> 16;
 
 	/*
 	 * Extract mbuf chain from PDU.
 	 */
 	m->m_pkthdr.len += ip->ip_data_len + padding;
 	m->m_next = ip->ip_data_mbuf;
 	set_mbuf_ulp_submode(m, ulp_submode);
 	ip->ip_bhs_mbuf = NULL;
 	ip->ip_data_mbuf = NULL;
 	ip->ip_bhs = NULL;
 
 	/*
 	 * Drop PDU reference on icp.  Additional references might
 	 * still be held by zero-copy PDU buffers (ICL_NOCOPY).
 	 */
 	if (atomic_fetchadd_int(&icp->ref_cnt, -1) == 1)
 		icl_cxgbei_pdu_call_cb(ip);
 
 	return (m);
 }
 
 /*
  * Body of the per-connection transmit kthread.  It sleeps on tx_active
  * until sent_pdus is non-empty, moves the queued PDUs to a private list,
  * finalizes each into an mbuf chain with the connection lock dropped, and
  * then hands the chains to t4_push_pdus() under the inpcb write lock.
  * The chains are discarded instead if the connection is disconnecting,
  * the socket is gone, the inpcb has been dropped or the toepcb is no
  * longer attached.  The thread exits once ic_disconnecting is set.
  */
 static void
 icl_cxgbei_tx_main(void *arg)
 {
 	struct epoch_tracker et;
 	struct icl_cxgbei_conn *icc = arg;
 	struct icl_conn *ic = &icc->ic;
 	struct toepcb *toep = icc->toep;
 	struct socket *so = ic->ic_socket;
 	struct inpcb *inp = sotoinpcb(so);
 	struct icl_pdu *ip;
 	struct mbuf *m;
 	struct mbufq mq;
 	STAILQ_HEAD(, icl_pdu) tx_pdus = STAILQ_HEAD_INITIALIZER(tx_pdus);
 
 	mbufq_init(&mq, INT_MAX);
 
 	ICL_CONN_LOCK(ic);
 	while (__predict_true(!ic->ic_disconnecting)) {
 		while (STAILQ_EMPTY(&icc->sent_pdus)) {
 			icc->tx_active = false;
 			mtx_sleep(&icc->tx_active, ic->ic_lock, 0, "-", 0);
 			if (__predict_false(ic->ic_disconnecting))
 				goto out;
 			MPASS(icc->tx_active);
 		}
 
 		/* Take the whole queue so it can be processed unlocked. */
 		STAILQ_SWAP(&icc->sent_pdus, &tx_pdus, icl_pdu);
 		ICL_CONN_UNLOCK(ic);
 
 		while ((ip = STAILQ_FIRST(&tx_pdus)) != NULL) {
 			STAILQ_REMOVE_HEAD(&tx_pdus, ip_next);
 
 			m = finalize_pdu(icc, ip_to_icp(ip));
 			M_ASSERTPKTHDR(m);
 			MPASS((m->m_pkthdr.len & 3) == 0);
 
 			mbufq_enqueue(&mq, m);
 		}
 
 		ICL_CONN_LOCK(ic);
 		if (__predict_false(ic->ic_disconnecting) ||
 		    __predict_false(ic->ic_socket == NULL)) {
 			mbufq_drain(&mq);
 			break;
 		}
 
 		CURVNET_SET(toep->vnet);
 		NET_EPOCH_ENTER(et);
 		INP_WLOCK(inp);
 
 		/* The inpcb lock is held; the connection lock can go. */
 		ICL_CONN_UNLOCK(ic);
-		if (__predict_false(inp->inp_flags & (INP_DROPPED |
-		    INP_TIMEWAIT)) ||
+		if (__predict_false(inp->inp_flags & INP_DROPPED) ||
 		    __predict_false((toep->flags & TPF_ATTACHED) == 0)) {
 			mbufq_drain(&mq);
 		} else {
 			mbufq_concat(&toep->ulp_pduq, &mq);
 			t4_push_pdus(icc->sc, toep, 0);
 		}
 		INP_WUNLOCK(inp);
 		NET_EPOCH_EXIT(et);
 		CURVNET_RESTORE();
 
 		ICL_CONN_LOCK(ic);
 	}
 out:
 	ICL_CONN_UNLOCK(ic);
 
 	kthread_exit();
 }
 
 /*
  * Body of the per-connection receive kthread.  It sleeps on rx_active
  * (using the receive sockbuf mutex) until rcvd_pdus is non-empty, moves
  * the queued PDUs to a private list and, with the sockbuf unlocked, hands
  * each to ic_receive().  PDUs are completed with ENOTCONN instead if the
  * socket could no longer receive when the list was taken.  After
  * ic_disconnecting is seen the thread waits for rx_exiting before it
  * exits.
  */
 static void
 icl_cxgbei_rx_main(void *arg)
 {
 	struct icl_cxgbei_conn *icc = arg;
 	struct icl_conn *ic = &icc->ic;
 	struct icl_pdu *ip;
 	struct sockbuf *sb;
 	STAILQ_HEAD(, icl_pdu) rx_pdus = STAILQ_HEAD_INITIALIZER(rx_pdus);
 	bool cantrcvmore;
 
 	sb = &ic->ic_socket->so_rcv;
 	SOCKBUF_LOCK(sb);
 	while (__predict_true(!ic->ic_disconnecting)) {
 		while (STAILQ_EMPTY(&icc->rcvd_pdus)) {
 			icc->rx_active = false;
 			mtx_sleep(&icc->rx_active, SOCKBUF_MTX(sb), 0, "-", 0);
 			if (__predict_false(ic->ic_disconnecting))
 				goto out;
 			MPASS(icc->rx_active);
 		}
 
 		/* The branch hint wraps the whole comparison. */
 		if (__predict_false(sbused(sb) != 0)) {
 			/*
 			 * PDUs were received before the tid
 			 * transitioned to ULP mode.  Convert
 			 * them to icl_cxgbei_pdus and insert
 			 * them into the head of rcvd_pdus.
 			 */
 			parse_pdus(icc, sb);
 		}
 		/* Sample the socket state while the sockbuf is locked. */
 		cantrcvmore = (sb->sb_state & SBS_CANTRCVMORE) != 0;
 		MPASS(STAILQ_EMPTY(&rx_pdus));
 		STAILQ_SWAP(&icc->rcvd_pdus, &rx_pdus, icl_pdu);
 		SOCKBUF_UNLOCK(sb);
 
 		/* Hand over PDUs to ICL. */
 		while ((ip = STAILQ_FIRST(&rx_pdus)) != NULL) {
 			STAILQ_REMOVE_HEAD(&rx_pdus, ip_next);
 			if (cantrcvmore)
 				icl_cxgbei_pdu_done(ip, ENOTCONN);
 			else
 				ic->ic_receive(ip);
 		}
 
 		SOCKBUF_LOCK(sb);
 	}
 out:
 	/*
 	 * Since ic_disconnecting is set before the SOCKBUF_MTX is
 	 * locked in icl_cxgbei_conn_close, the loop above can exit
 	 * before icl_cxgbei_conn_close can lock SOCKBUF_MTX and block
 	 * waiting for the thread exit.
 	 */
 	while (!icc->rx_exiting)
 		mtx_sleep(&icc->rx_active, SOCKBUF_MTX(sb), 0, "-", 0);
 	SOCKBUF_UNLOCK(sb);
 
 	kthread_exit();
 }
 
 /*
  * Free routine for the mbufs allocated with mb_alloc_ext_pgs() in
  * icl_cxgbei_conn_pdu_append_bio().  ext_arg1 holds the owning PDU; its
  * reference is dropped here and the PDU is completed if it was the last.
  */
 static void
 cxgbei_free_mext_pg(struct mbuf *m)
 {
 	struct icl_cxgbei_pdu *icp;
 
 	M_ASSERTEXTPG(m);
 
 	/*
 	 * Nothing to do for the pages; they are owned by the PDU /
 	 * I/O request.
 	 */
 
 	/* Drop reference on the PDU. */
 	icp = m->m_ext.ext_arg1;
 	if (atomic_fetchadd_int(&icp->ref_cnt, -1) == 1)
 		icl_cxgbei_pdu_call_cb(&icp->ip);
 }
 
 /*
  * Allocate an mbuf chain with room for 'len' bytes.  MJUM16BYTES jumbo
  * clusters are tried first (always with M_NOWAIT); whatever remains is
  * allocated with m_getm2() using the caller's 'flags'.  If a jumbo
  * allocation fails and the caller passed M_WAITOK, the rest of the
  * request falls back to m_getm2().  Returns the head of the chain, or
  * NULL on failure, in which case nothing already allocated is leaked.
  */
 static struct mbuf *
 cxgbei_getm(size_t len, int flags)
 {
 	struct mbuf *m, *m0, *m_tail;
 
 	m_tail = m0 = NULL;
 
 	/* Allocate as jumbo mbufs of size MJUM16BYTES. */
 	while (len >= MJUM16BYTES) {
 		m = m_getjcl(M_NOWAIT, MT_DATA, 0, MJUM16BYTES);
 		if (__predict_false(m == NULL)) {
 			if ((flags & M_WAITOK) != 0) {
 				/* Fall back to non-jumbo mbufs. */
 				break;
 			}
 			/* Release any jumbo mbufs chained so far. */
 			m_freem(m0);
 			return (NULL);
 		}
 		if (m0 == NULL) {
 			m0 = m_tail = m;
 		} else {
 			m_tail->m_next = m;
 			m_tail = m;
 		}
 		len -= MJUM16BYTES;
 	}
 
 	/* Allocate mbuf chain for the remaining data. */
 	if (len != 0) {
 		m = m_getm2(NULL, len, flags, MT_DATA, 0);
 		if (__predict_false(m == NULL)) {
 			m_freem(m0);
 			return (NULL);
 		}
 		if (m0 == NULL)
 			m0 = m;
 		else
 			m_tail->m_next = m;
 	}
 
 	return (m0);
 }
 
 /*
  * icl_conn_pdu_append_bio method: append 'len' bytes of the unmapped bio
  * 'bp', starting 'offset' bytes into its data, to the data segment of
  * 'ip'.
  *
  * With ICL_NOCOPY the bio's pages are referenced from mbufs obtained from
  * mb_alloc_ext_pgs(), each of which holds a reference on the PDU.
  * Otherwise the data is copied, one page at a time, into mbufs from
  * cxgbei_getm().  Returns 0 on success or ENOMEM if an mbuf allocation
  * fails.
  */
 int
 icl_cxgbei_conn_pdu_append_bio(struct icl_conn *ic, struct icl_pdu *ip,
     struct bio *bp, size_t offset, size_t len, int flags)
 {
 	struct icl_cxgbei_pdu *icp = ip_to_icp(ip);
 	struct mbuf *m, *m_tail;
 	vm_offset_t vaddr;
 	size_t page_offset, todo, mtodo;
 	boolean_t mapped;
 	int i;
 
 	MPASS(icp->icp_signature == CXGBEI_PDU_SIGNATURE);
 	MPASS(ic == ip->ip_conn);
 	KASSERT(len > 0, ("%s: len is %jd", __func__, (intmax_t)len));
 
 	/* Find the current tail of the PDU's data chain, if any. */
 	m_tail = ip->ip_data_mbuf;
 	if (m_tail != NULL)
 		for (; m_tail->m_next != NULL; m_tail = m_tail->m_next)
 			;
 
 	/*
 	 * Translate 'offset' into a page index (i) and an offset within
 	 * that page.  The first page starts at bio_ma_offset.
 	 */
 	MPASS(bp->bio_flags & BIO_UNMAPPED);
 	if (offset < PAGE_SIZE - bp->bio_ma_offset) {
 		page_offset = bp->bio_ma_offset + offset;
 		i = 0;
 	} else {
 		offset -= PAGE_SIZE - bp->bio_ma_offset;
 		for (i = 1; offset >= PAGE_SIZE; i++)
 			offset -= PAGE_SIZE;
 		page_offset = offset;
 	}
 
 	if (flags & ICL_NOCOPY) {
 		m = NULL;
 		while (len > 0) {
 			if (m == NULL) {
 				m = mb_alloc_ext_pgs(flags & ~ICL_NOCOPY,
 				    cxgbei_free_mext_pg);
 				if (__predict_false(m == NULL))
 					return (ENOMEM);
 				atomic_add_int(&icp->ref_cnt, 1);
 				m->m_ext.ext_arg1 = icp;
 				m->m_epg_1st_off = page_offset;
 			}
 
 			todo = MIN(len, PAGE_SIZE - page_offset);
 
 			m->m_epg_pa[m->m_epg_npgs] =
 			    VM_PAGE_TO_PHYS(bp->bio_ma[i]);
 			m->m_epg_npgs++;
 			m->m_epg_last_len = todo;
 			m->m_len += todo;
 			m->m_ext.ext_size += PAGE_SIZE;
 			MBUF_EXT_PGS_ASSERT_SANITY(m);
 
 			/* This mbuf is full; link it and start another. */
 			if (m->m_epg_npgs == MBUF_PEXT_MAX_PGS) {
 				if (m_tail != NULL)
 					m_tail->m_next = m;
 				else
 					ip->ip_data_mbuf = m;
 				m_tail = m;
 				ip->ip_data_len += m->m_len;
 				m = NULL;
 			}
 
 			page_offset = 0;
 			len -= todo;
 			i++;
 		}
 
 		/* Link the last, partially filled mbuf. */
 		if (m != NULL) {
 			if (m_tail != NULL)
 				m_tail->m_next = m;
 			else
 				ip->ip_data_mbuf = m;
 			ip->ip_data_len += m->m_len;
 		}
 		return (0);
 	}
 
 	m = cxgbei_getm(len, flags);
 	if (__predict_false(m == NULL))
 		return (ENOMEM);
 
 	if (ip->ip_data_mbuf == NULL) {
 		ip->ip_data_mbuf = m;
 		ip->ip_data_len = len;
 	} else {
 		m_tail->m_next = m;
 		ip->ip_data_len += len;
 	}
 
 	while (len > 0) {
 		todo = MIN(len, PAGE_SIZE - page_offset);
 
 		/*
 		 * Charge this page against len now: the copy loop below
 		 * counts todo down to zero.
 		 */
 		len -= todo;
 
 		mapped = pmap_map_io_transient(bp->bio_ma + i, &vaddr, 1,
 		    FALSE);
 
 		/* Copy this page's bytes, moving on as each mbuf fills. */
 		do {
 			mtodo = min(todo, M_SIZE(m) - m->m_len);
 			memcpy(mtod(m, char *) + m->m_len, (char *)vaddr +
 			    page_offset, mtodo);
 			m->m_len += mtodo;
 			if (m->m_len == M_SIZE(m))
 				m = m->m_next;
 			page_offset += mtodo;
 			todo -= mtodo;
 		} while (todo > 0);
 
 		/* Unmap the same page that was mapped above. */
 		if (__predict_false(mapped))
 			pmap_unmap_io_transient(bp->bio_ma + i, &vaddr, 1,
 			    FALSE);
 
 		page_offset = 0;
 		i++;
 	}
 
 	MPASS(ip->ip_data_len <= max(ic->ic_max_send_data_segment_length,
 	    ic->ic_hw_isomax));
 
 	return (0);
 }
 
 /*
  * icl_conn_pdu_append_data method: append 'len' bytes at 'addr' to the
  * data segment of 'ip'.  With ICL_NOCOPY a single read-only mbuf that
  * references the caller's buffer is added and tied to the PDU's reference
  * count; otherwise the bytes are copied into mbufs from cxgbei_getm().
  * Returns 0 on success or ENOMEM if an mbuf allocation fails.
  */
 int
 icl_cxgbei_conn_pdu_append_data(struct icl_conn *ic, struct icl_pdu *ip,
     const void *addr, size_t len, int flags)
 {
 	struct icl_cxgbei_pdu *icp = ip_to_icp(ip);
 	struct mbuf *m, *m_tail;
 	const char *src;
 	bool nocopy;
 
 	MPASS(icp->icp_signature == CXGBEI_PDU_SIGNATURE);
 	MPASS(ic == ip->ip_conn);
 	KASSERT(len > 0, ("%s: len is %jd", __func__, (intmax_t)len));
 
 	/* Locate the end of the existing data chain, if there is one. */
 	m_tail = ip->ip_data_mbuf;
 	while (m_tail != NULL && m_tail->m_next != NULL)
 		m_tail = m_tail->m_next;
 
 	nocopy = (flags & ICL_NOCOPY) != 0;
 	if (nocopy) {
 		m = m_get(flags & ~ICL_NOCOPY, MT_DATA);
 		if (m == NULL) {
 			ICL_WARN("failed to allocate mbuf");
 			return (ENOMEM);
 		}
 
 		m->m_flags |= M_RDONLY;
 		m_extaddref(m, __DECONST(char *, addr), len, &icp->ref_cnt,
 		    icl_cxgbei_mbuf_done, icp, NULL);
 		m->m_len = len;
 	} else {
 		m = cxgbei_getm(len, flags);
 		if (__predict_false(m == NULL))
 			return (ENOMEM);
 	}
 
 	/* Link the new mbuf(s) into the PDU and account for the bytes. */
 	if (m_tail == NULL) {
 		ip->ip_data_mbuf = m;
 		ip->ip_data_len = len;
 	} else {
 		m_tail->m_next = m;
 		ip->ip_data_len += len;
 	}
 	if (nocopy)
 		return (0);
 
 	/* Fill each new mbuf from the caller's buffer. */
 	src = (const char *)addr;
 	while (m != NULL) {
 		m->m_len = min(len, M_SIZE(m));
 		memcpy(mtod(m, void *), src, m->m_len);
 		src += m->m_len;
 		len -= m->m_len;
 		m = m->m_next;
 	}
 	MPASS(len == 0);
 
 	MPASS(ip->ip_data_len <= max(ic->ic_max_send_data_segment_length,
 	    ic->ic_hw_isomax));
 
 	return (0);
 }
 
 /*
  * icl_conn_pdu_get_bio method: copy 'len' bytes of PDU data, starting at
  * 'pdu_off', into the unmapped bio 'bp' starting 'bio_off' bytes into the
  * bio's data.  Nothing is copied when the PDU is flagged ICPF_RX_DDP.
  */
 void
 icl_cxgbei_conn_pdu_get_bio(struct icl_conn *ic, struct icl_pdu *ip,
     size_t pdu_off, struct bio *bp, size_t bio_off, size_t len)
 {
 	struct icl_cxgbei_pdu *icp = ip_to_icp(ip);
 	vm_offset_t vaddr;
 	size_t page_offset, todo;
 	boolean_t mapped;
 	int i;
 
 	if (icp->icp_flags & ICPF_RX_DDP)
 		return; /* data is DDP'ed, no need to copy */
 
 	/*
 	 * Translate 'bio_off' into a page index (i) and an offset within
 	 * that page.  The first page starts at bio_ma_offset.
 	 */
 	MPASS(bp->bio_flags & BIO_UNMAPPED);
 	if (bio_off < PAGE_SIZE - bp->bio_ma_offset) {
 		page_offset = bp->bio_ma_offset + bio_off;
 		i = 0;
 	} else {
 		bio_off -= PAGE_SIZE - bp->bio_ma_offset;
 		for (i = 1; bio_off >= PAGE_SIZE; i++)
 			bio_off -= PAGE_SIZE;
 		page_offset = bio_off;
 	}
 
 	/* Copy one page's worth at a time through a transient mapping. */
 	while (len > 0) {
 		todo = MIN(len, PAGE_SIZE - page_offset);
 
 		mapped = pmap_map_io_transient(bp->bio_ma + i, &vaddr, 1,
 		    FALSE);
 		m_copydata(ip->ip_data_mbuf, pdu_off, todo, (char *)vaddr +
 		    page_offset);
 		/* Unmap the same page that was mapped above. */
 		if (__predict_false(mapped))
 			pmap_unmap_io_transient(bp->bio_ma + i, &vaddr, 1,
 			    FALSE);
 
 		page_offset = 0;
 		pdu_off += todo;
 		len -= todo;
 		i++;
 	}
 }
 
 /*
  * icl_conn_pdu_get_data method: copy 'len' bytes of PDU data starting at
  * 'off' into 'addr'.  A PDU flagged ICPF_RX_DDP is skipped because its
  * data is DDP'ed and needs no copy.
  */
 void
 icl_cxgbei_conn_pdu_get_data(struct icl_conn *ic, struct icl_pdu *ip,
     size_t off, void *addr, size_t len)
 {
 	struct icl_cxgbei_pdu *icp = ip_to_icp(ip);
 
 	if ((icp->icp_flags & ICPF_RX_DDP) == 0)
 		m_copydata(ip->ip_data_mbuf, off, len, addr);
 }
 
 /* icl_conn_pdu_queue method: queue 'ip' with no completion callback. */
 void
 icl_cxgbei_conn_pdu_queue(struct icl_conn *ic, struct icl_pdu *ip)
 {
 	icl_cxgbei_conn_pdu_queue_cb(ic, ip, NULL);
 }
 
 /*
  * icl_conn_pdu_queue_cb method: record the completion callback and queue
  * 'ip' for the transmit thread, waking the thread if it is idle.  The PDU
  * is completed at once with ENOTCONN if the connection is disconnecting,
  * has no socket, or the socket is not writeable.  The connection lock
  * must be held by the caller.
  */
 void
 icl_cxgbei_conn_pdu_queue_cb(struct icl_conn *ic, struct icl_pdu *ip,
 			     icl_pdu_cb cb)
 {
 	struct icl_cxgbei_conn *icc = ic_to_icc(ic);
 	struct icl_cxgbei_pdu *icp = ip_to_icp(ip);
 	struct socket *so = ic->ic_socket;
 
 	MPASS(ic == ip->ip_conn);
 	MPASS(ip->ip_bhs_mbuf != NULL);
 	/* The kernel doesn't generate PDUs with AHS. */
 	MPASS(ip->ip_ahs_mbuf == NULL && ip->ip_ahs_len == 0);
 
 	ICL_CONN_LOCK_ASSERT(ic);
 
 	icp->cb = cb;
 
 	/* NOTE: sowriteable without so_snd lock is a mostly harmless race. */
 	if (ic->ic_disconnecting || so == NULL || !sowriteable(so)) {
 		icl_cxgbei_pdu_done(ip, ENOTCONN);
 		return;
 	}
 
 	STAILQ_INSERT_TAIL(&icc->sent_pdus, ip, ip_next);
 
 	/* An active transmit thread will pick the PDU up on its own. */
 	if (icc->tx_active)
 		return;
 	icc->tx_active = true;
 	wakeup(&icc->tx_active);
 }
 
 /*
  * Create a new cxgbei connection object that uses 'lock' as its
  * connection lock.  The PDU queues, the cmp hash table and its mutex are
  * initialised here, and the global connection count is bumped.
  */
 static struct icl_conn *
 icl_cxgbei_new_conn(const char *name, struct mtx *lock)
 {
 	struct icl_cxgbei_conn *icc;
 	struct icl_conn *ic;
 
 	refcount_acquire(&icl_cxgbei_ncons);
 
 	icc = (struct icl_cxgbei_conn *)kobj_create(&icl_cxgbei_class, M_CXGBE,
 	    M_WAITOK | M_ZERO);
 	ic = &icc->ic;
 
 	/* cxgbei-private state. */
 	icc->icc_signature = CXGBEI_CONN_SIGNATURE;
 	STAILQ_INIT(&icc->rcvd_pdus);
 	STAILQ_INIT(&icc->sent_pdus);
 	icc->cmp_table = hashinit(64, M_CXGBEI, &icc->cmp_hash_mask);
 	mtx_init(&icc->cmp_lock, "cxgbei_cmp", NULL, MTX_DEF);
 
 	/* Generic icl_conn state. */
 	ic->ic_lock = lock;
 #ifdef DIAGNOSTIC
 	refcount_init(&ic->ic_outstanding_pdus, 0);
 #endif
 	ic->ic_name = name;
 	ic->ic_offload = "cxgbei";
 	ic->ic_unmapped = true;
 
 	CTR2(KTR_CXGBE, "%s: icc %p", __func__, icc);
 
 	return (ic);
 }
 
 /*
  * icl_conn_free method: destroy the cmp mutex and hash table, delete the
  * kobj allocated in icl_cxgbei_new_conn(), and drop the global connection
  * count.
  */
 void
 icl_cxgbei_conn_free(struct icl_conn *ic)
 {
 	struct icl_cxgbei_conn *icc = ic_to_icc(ic);
 
 	MPASS(icc->icc_signature == CXGBEI_CONN_SIGNATURE);
 
 	CTR2(KTR_CXGBE, "%s: icc %p", __func__, icc);
 
 	mtx_destroy(&icc->cmp_lock);
 	hashdestroy(icc->cmp_table, M_CXGBEI, icc->cmp_hash_mask);
 	kobj_delete((struct kobj *)icc, M_CXGBE);
 	refcount_release(&icl_cxgbei_ncons);
 }
 
 /*
  * Size the socket buffers and disable Nagle on 'so'.  Each buffer is
  * reserved at the larger of the matching sysctl default and the caller's
  * request, and both buffers get SB_AUTOSIZE.  Returns 0 or the error from
  * soreserve() / sosetopt().
  */
 static int
 icl_cxgbei_setsockopt(struct icl_conn *ic, struct socket *so, int sspace,
     int rspace)
 {
 	struct sockopt opt;
 	int error, one = 1, rs, ss;
 
 	rs = max(recvspace, rspace);
 	ss = max(sendspace, sspace);
 
 	error = soreserve(so, ss, rs);
 	if (error != 0)
 		return (error);
 
 	SOCKBUF_LOCK(&so->so_snd);
 	so->so_snd.sb_flags |= SB_AUTOSIZE;
 	SOCKBUF_UNLOCK(&so->so_snd);
 
 	SOCKBUF_LOCK(&so->so_rcv);
 	so->so_rcv.sb_flags |= SB_AUTOSIZE;
 	SOCKBUF_UNLOCK(&so->so_rcv);
 
 	/* Disable Nagle: set TCP_NODELAY to 1. */
 	bzero(&opt, sizeof(opt));
 	opt.sopt_dir = SOPT_SET;
 	opt.sopt_level = IPPROTO_TCP;
 	opt.sopt_name = TCP_NODELAY;
 	opt.sopt_val = &one;
 	opt.sopt_valsize = sizeof(one);
 
 	return (sosetopt(so, &opt));
 }
 
 /*
  * Request/response structure used to find out the adapter offloading a socket.
  * The caller fills in 'so' and clears 'sc'; find_offload_adapter() sets
  * 'sc' when it finds the adapter whose TOE device owns the socket.
  */
 struct find_ofld_adapter_rr {
 	struct socket *so;
 	struct adapter *sc;	/* result */
 };
 
 /*
  * t4_iterate() callback: record 'sc' in the request (arg is a struct
  * find_ofld_adapter_rr) if the request's socket is a TOE connection whose
  * TOE device belongs to this adapter.  Does nothing once a match has been
  * found, or if the adapter has no tom_softc.
  */
 static void
 find_offload_adapter(struct adapter *sc, void *arg)
 {
 	struct find_ofld_adapter_rr *fa = arg;
 	struct socket *so = fa->so;
 	struct tom_data *td = sc->tom_softc;
 	struct tcpcb *tp;
 	struct inpcb *inp;
 
 	/* Non-TCP were filtered out earlier. */
 	MPASS(so->so_proto->pr_protocol == IPPROTO_TCP);
 
 	if (fa->sc != NULL)
 		return;	/* Found already. */
 
 	if (td == NULL)
 		return;	/* TOE not enabled on this adapter. */
 
 	inp = sotoinpcb(so);
 	INP_WLOCK(inp);
-	if ((inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) == 0) {
+	if ((inp->inp_flags & INP_DROPPED) == 0) {
 		tp = intotcpcb(inp);
 		if (tp->t_flags & TF_TOE && tp->tod == &td->tod)
 			fa->sc = sc;	/* Found. */
 	}
 	INP_WUNLOCK(inp);
 }
 
 /*
  * Report whether the adapter has no external memory enabled, judging by
  * the A_MA_TARGET_MEM_ENABLE register.  F_EXT_MEM_ENABLE is always
  * checked; F_EXT_MEM1_ENABLE is checked on T5 only.
  */
 static bool
 is_memfree(struct adapter *sc)
 {
 	const uint32_t enabled = t4_read_reg(sc, A_MA_TARGET_MEM_ENABLE);
 
 	if ((enabled & F_EXT_MEM_ENABLE) != 0 ||
 	    (is_t5(sc) && (enabled & F_EXT_MEM1_ENABLE) != 0))
 		return (false);
 
 	return (true);
 }
 
 /* XXXNP: move this to t4_tom. */
 /*
  * Send a FW_FLOWC_WR work request for the connection's tid carrying one
  * parameter, FW_FLOWC_MNEM_TXDATAPLEN_MAX, set to 'maxlen'.  The request
  * is charged against the toepcb's tx credits and consumes one of its tx
  * descriptors.  Panics if the work request cannot be allocated.
  */
 static void
 send_iscsi_flowc_wr(struct adapter *sc, struct toepcb *toep, int maxlen)
 {
 	struct wrqe *wr;
 	struct fw_flowc_wr *flowc;
 	const u_int nparams = 1;
 	u_int flowclen;
 	struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx];
 
 	flowclen = sizeof(*flowc) + nparams * sizeof(struct fw_flowc_mnemval);
 
 	wr = alloc_wrqe(roundup2(flowclen, 16), &toep->ofld_txq->wrq);
 	if (wr == NULL) {
 		/* XXX */
 		panic("%s: allocation failure.", __func__);
 	}
 	flowc = wrtod(wr);
 	memset(flowc, 0, wr->wr_len);
 
 	flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) |
 	    V_FW_FLOWC_WR_NPARAMS(nparams));
 	flowc->flowid_len16 = htonl(V_FW_WR_LEN16(howmany(flowclen, 16)) |
 	    V_FW_WR_FLOWID(toep->tid));
 
 	flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_TXDATAPLEN_MAX;
 	flowc->mnemval[0].val = htobe32(maxlen);
 
 	/* Account for the credits (16-byte units) this request uses. */
 	txsd->tx_credits = howmany(flowclen, 16);
 	txsd->plen = 0;
 	KASSERT(toep->tx_credits >= txsd->tx_credits && toep->txsd_avail > 0,
 	    ("%s: not enough credits (%d)", __func__, toep->tx_credits));
 	toep->tx_credits -= txsd->tx_credits;
 	/* Advance the descriptor producer index, wrapping at the end. */
 	if (__predict_false(++toep->txsd_pidx == toep->txsd_total))
 		toep->txsd_pidx = 0;
 	toep->txsd_avail--;
 
 	t4_wrq_tx(sc, wr);
 }
 
 /*
  * Issue two TCB field updates on the connection's control queue: set the
  * ULP type to ULP_MODE_ISCSI with 'ulp_submode' as the raw ULP value, and
  * set the RX_FLOW_CONTROL_DISABLE bit in the TCB flags.
  */
 static void
 set_ulp_mode_iscsi(struct adapter *sc, struct toepcb *toep, u_int ulp_submode)
 {
 	uint64_t val;
 
 	CTR3(KTR_CXGBE, "%s: tid %u, ULP_MODE_ISCSI, submode=%#x",
 	    __func__, toep->tid, ulp_submode);
 
 	val = V_TCB_ULP_TYPE(ULP_MODE_ISCSI) | V_TCB_ULP_RAW(ulp_submode);
 	t4_set_tcb_field(sc, toep->ctrlq, toep, W_TCB_ULP_TYPE,
 	    V_TCB_ULP_TYPE(M_TCB_ULP_TYPE) | V_TCB_ULP_RAW(M_TCB_ULP_RAW), val,
 	    0, 0);
 
 	/* 'val' is passed as both the mask and the value. */
 	val = V_TF_RX_FLOW_CONTROL_DISABLE(1ULL);
 	t4_set_tcb_field(sc, toep->ctrlq, toep, W_TCB_T_FLAGS, val, val, 0, 0);
 }
 
 /*
  * icl_conn_handoff method: take over the TCP socket behind descriptor
  * 'fd', find the adapter offloading it, switch the connection's tid to
  * iSCSI ULP mode and start the transmit and receive kthreads.  Returns 0
  * on success; on any failure after the socket has been taken the
  * connection is closed with icl_cxgbei_conn_close() and the error is
  * returned.
  *
  * XXXNP: Who is responsible for cleaning up the socket if this returns with an
  * error?  Review all error paths.
  *
  * XXXNP: What happens to the socket's fd reference if the operation is
  * successful, and how does that affect the socket's life cycle?
  */
 int
 icl_cxgbei_conn_handoff(struct icl_conn *ic, int fd)
 {
 	struct icl_cxgbei_conn *icc = ic_to_icc(ic);
 	struct find_ofld_adapter_rr fa;
 	struct file *fp;
 	struct socket *so;
 	struct inpcb *inp;
 	struct tcpcb *tp;
 	struct toepcb *toep;
 	cap_rights_t rights;
 	u_int max_iso_payload, max_rx_pdu_len, max_tx_pdu_len;
 	int error, max_iso_pdus;
 
 	MPASS(icc->icc_signature == CXGBEI_CONN_SIGNATURE);
 	ICL_CONN_LOCK_ASSERT_NOT(ic);
 
 	/*
 	 * Steal the socket from userland.
 	 */
 	error = fget(curthread, fd,
 	    cap_rights_init_one(&rights, CAP_SOCK_CLIENT), &fp);
 	if (error != 0)
 		return (error);
 	if (fp->f_type != DTYPE_SOCKET) {
 		fdrop(fp, curthread);
 		return (EINVAL);
 	}
 	so = fp->f_data;
 	if (so->so_type != SOCK_STREAM ||
 	    so->so_proto->pr_protocol != IPPROTO_TCP) {
 		fdrop(fp, curthread);
 		return (EINVAL);
 	}
 
 	ICL_CONN_LOCK(ic);
 	if (ic->ic_socket != NULL) {
 		ICL_CONN_UNLOCK(ic);
 		fdrop(fp, curthread);
 		return (EBUSY);
 	}
 	ic->ic_disconnecting = false;
 	ic->ic_socket = so;
 	/* Detach the socket from the file so userland can't reach it. */
 	fp->f_ops = &badfileops;
 	fp->f_data = NULL;
 	fdrop(fp, curthread);
 	ICL_CONN_UNLOCK(ic);
 
 	/* Find the adapter offloading this socket. */
 	fa.sc = NULL;
 	fa.so = so;
 	t4_iterate(find_offload_adapter, &fa);
 	if (fa.sc == NULL) {
 		error = EINVAL;
 		goto out;
 	}
 	icc->sc = fa.sc;
 
 	/* Largest PDU in each direction: BHS + data + enabled digests. */
 	max_rx_pdu_len = ISCSI_BHS_SIZE + ic->ic_max_recv_data_segment_length;
 	max_tx_pdu_len = ISCSI_BHS_SIZE + ic->ic_max_send_data_segment_length;
 	if (ic->ic_header_crc32c) {
 		max_rx_pdu_len += ISCSI_HEADER_DIGEST_SIZE;
 		max_tx_pdu_len += ISCSI_HEADER_DIGEST_SIZE;
 	}
 	if (ic->ic_data_crc32c) {
 		max_rx_pdu_len += ISCSI_DATA_DIGEST_SIZE;
 		max_tx_pdu_len += ISCSI_DATA_DIGEST_SIZE;
 	}
 
 	inp = sotoinpcb(so);
 	INP_WLOCK(inp);
 	tp = intotcpcb(inp);
-	if (inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) {
+	if (inp->inp_flags & INP_DROPPED) {
 		INP_WUNLOCK(inp);
 		error = ENOTCONN;
 		goto out;
 	}
 
 	/*
 	 * socket could not have been "unoffloaded" if here.
 	 */
 	MPASS(tp->t_flags & TF_TOE);
 	MPASS(tp->tod != NULL);
 	MPASS(tp->t_toe != NULL);
 	toep = tp->t_toe;
 	MPASS(toep->vi->adapter == icc->sc);
 
 	/* The tid must not already be in some other ULP mode. */
 	if (ulp_mode(toep) != ULP_MODE_NONE) {
 		INP_WUNLOCK(inp);
 		error = EINVAL;
 		goto out;
 	}
 
 	icc->toep = toep;
 
 	icc->ulp_submode = 0;
 	if (ic->ic_header_crc32c)
 		icc->ulp_submode |= ULP_CRC_HEADER;
 	if (ic->ic_data_crc32c)
 		icc->ulp_submode |= ULP_CRC_DATA;
 
 	/*
 	 * ISO is used only when the adapter advertises it, is T5 or
 	 * later, and is_memfree() is false.
 	 */
 	if (icc->sc->tt.iso && chip_id(icc->sc) >= CHELSIO_T5 &&
 	    !is_memfree(icc->sc)) {
 		max_iso_payload = rounddown(CXGBEI_MAX_ISO_PAYLOAD,
 		    tp->t_maxseg);
 		max_iso_pdus = max_iso_payload / max_tx_pdu_len;
 		ic->ic_hw_isomax = max_iso_pdus *
 		    ic->ic_max_send_data_segment_length;
 	} else
 		max_iso_pdus = 1;
 
 	toep->params.ulp_mode = ULP_MODE_ISCSI;
 	toep->ulpcb = icc;
 
 	send_iscsi_flowc_wr(icc->sc, toep,
 	    roundup(max_iso_pdus * max_tx_pdu_len, tp->t_maxseg));
 	set_ulp_mode_iscsi(icc->sc, toep, icc->ulp_submode);
 	INP_WUNLOCK(inp);
 
 	error = kthread_add(icl_cxgbei_tx_main, icc, NULL, &icc->tx_thread, 0,
 	    0, "%stx (cxgbei)", ic->ic_name);
 	if (error != 0)
 		goto out;
 
 	error = kthread_add(icl_cxgbei_rx_main, icc, NULL, &icc->rx_thread, 0,
 	    0, "%srx (cxgbei)", ic->ic_name);
 	if (error != 0)
 		goto out;
 
 	error = icl_cxgbei_setsockopt(ic, so, max_tx_pdu_len, max_rx_pdu_len);
 out:
 	if (error != 0)
 		icl_cxgbei_conn_close(ic);
 	return (error);
 }
 
 void
 icl_cxgbei_conn_close(struct icl_conn *ic)
 {
 	struct icl_cxgbei_conn *icc = ic_to_icc(ic);
 	struct icl_pdu *ip;
 	struct socket *so;
 	struct sockbuf *sb;
 	struct inpcb *inp;
 	struct toepcb *toep = icc->toep;
 
 	MPASS(icc->icc_signature == CXGBEI_CONN_SIGNATURE);
 	ICL_CONN_LOCK_ASSERT_NOT(ic);
 
 	ICL_CONN_LOCK(ic);
 	so = ic->ic_socket;
 	if (ic->ic_disconnecting || so == NULL) {
 		CTR4(KTR_CXGBE, "%s: icc %p (disconnecting = %d), so %p",
 		    __func__, icc, ic->ic_disconnecting, so);
 		ICL_CONN_UNLOCK(ic);
 		return;
 	}
 	ic->ic_disconnecting = true;
 
 #ifdef DIAGNOSTIC
 	KASSERT(ic->ic_outstanding_pdus == 0,
 	    ("destroying session with %d outstanding PDUs",
 	     ic->ic_outstanding_pdus));
 #endif
 
 	CTR3(KTR_CXGBE, "%s: tid %d, icc %p", __func__, toep ? toep->tid : -1,
 	    icc);
 
 	/*
 	 * Wait for the transmit thread to stop processing
 	 * this connection.
 	 */
 	if (icc->tx_thread != NULL) {
 		wakeup(&icc->tx_active);
 		mtx_sleep(icc->tx_thread, ic->ic_lock, 0, "conclo", 0);
 	}
 
 	/* Discard PDUs queued for TX. */
 	while (!STAILQ_EMPTY(&icc->sent_pdus)) {
 		ip = STAILQ_FIRST(&icc->sent_pdus);
 		STAILQ_REMOVE_HEAD(&icc->sent_pdus, ip_next);
 		icl_cxgbei_pdu_done(ip, ENOTCONN);
 	}
 	ICL_CONN_UNLOCK(ic);
 
 	inp = sotoinpcb(so);
 	sb = &so->so_rcv;
 
 	/*
 	 * Wait for the receive thread to stop processing this
 	 * connection.
 	 */
 	SOCKBUF_LOCK(sb);
 	if (icc->rx_thread != NULL) {
 		icc->rx_exiting = true;
 		wakeup(&icc->rx_active);
 		mtx_sleep(icc->rx_thread, SOCKBUF_MTX(sb), 0, "conclo", 0);
 	}
 
 	/*
 	 * Discard received PDUs not passed to the iSCSI layer.
 	 */
 	while (!STAILQ_EMPTY(&icc->rcvd_pdus)) {
 		ip = STAILQ_FIRST(&icc->rcvd_pdus);
 		STAILQ_REMOVE_HEAD(&icc->rcvd_pdus, ip_next);
 		icl_cxgbei_pdu_done(ip, ENOTCONN);
 	}
 	SOCKBUF_UNLOCK(sb);
 
 	INP_WLOCK(inp);
 	if (toep != NULL) {	/* NULL if connection was never offloaded. */
 		toep->ulpcb = NULL;
 
 		/* Discard mbufs queued for TX. */
 		mbufq_drain(&toep->ulp_pduq);
 
 		/*
 		 * Grab a reference to use when waiting for the final
 		 * CPL to be received.  If toep->inp is NULL, then
 		 * final_cpl_received() has already been called (e.g.
 		 * due to the peer sending a RST).
 		 */
 		if (toep->inp != NULL) {
 			toep = hold_toepcb(toep);
 			toep->flags |= TPF_WAITING_FOR_FINAL;
 		} else
 			toep = NULL;
 	}
 	INP_WUNLOCK(inp);
 
 	ICL_CONN_LOCK(ic);
 	ic->ic_socket = NULL;
 	ICL_CONN_UNLOCK(ic);
 
 	/*
 	 * XXXNP: we should send RST instead of FIN when PDUs held in various
 	 * queues were purged instead of delivered reliably but soabort isn't
 	 * really general purpose and wouldn't do the right thing here.
 	 */
 	soclose(so);
 
 	/*
 	 * Wait for the socket to fully close.  This ensures any
 	 * pending received data has been received (and in particular,
 	 * any data that would be received by DDP has been handled).
 	 * Callers assume that it is safe to free buffers for tasks
 	 * and transfers after this function returns.
 	 */
 	if (toep != NULL) {
 		struct mtx *lock = mtx_pool_find(mtxpool_sleep, toep);
 
 		mtx_lock(lock);
 		while ((toep->flags & TPF_WAITING_FOR_FINAL) != 0)
 			mtx_sleep(toep, lock, PSOCK, "conclo2", 0);
 		mtx_unlock(lock);
 		free_toepcb(toep);
 	}
 }
 
 static void
 cxgbei_insert_cmp(struct icl_cxgbei_conn *icc, struct cxgbei_cmp *cmp,
     uint32_t tt)
 {
 #ifdef INVARIANTS
 	struct cxgbei_cmp *cmp2;
 #endif
 
 	cmp->tt = tt;
 
 	mtx_lock(&icc->cmp_lock);
 #ifdef INVARIANTS
 	LIST_FOREACH(cmp2, &icc->cmp_table[TT_HASH(icc, tt)], link) {
 		KASSERT(cmp2->tt != tt, ("%s: duplicate cmp", __func__));
 	}
 #endif
 	LIST_INSERT_HEAD(&icc->cmp_table[TT_HASH(icc, tt)], cmp, link);
 	mtx_unlock(&icc->cmp_lock);
 }
 
 struct cxgbei_cmp *
 cxgbei_find_cmp(struct icl_cxgbei_conn *icc, uint32_t tt)
 {
 	struct cxgbei_cmp *cmp;
 
 	mtx_lock(&icc->cmp_lock);
 	LIST_FOREACH(cmp, &icc->cmp_table[TT_HASH(icc, tt)], link) {
 		if (cmp->tt == tt)
 			break;
 	}
 	mtx_unlock(&icc->cmp_lock);
 	return (cmp);
 }
 
 static void
 cxgbei_rm_cmp(struct icl_cxgbei_conn *icc, struct cxgbei_cmp *cmp)
 {
 #ifdef INVARIANTS
 	struct cxgbei_cmp *cmp2;
 #endif
 
 	mtx_lock(&icc->cmp_lock);
 
 #ifdef INVARIANTS
 	LIST_FOREACH(cmp2, &icc->cmp_table[TT_HASH(icc, cmp->tt)], link) {
 		if (cmp2 == cmp)
 			goto found;
 	}
 	panic("%s: could not find cmp", __func__);
 found:
 #endif
 	LIST_REMOVE(cmp, link);
 	mtx_unlock(&icc->cmp_lock);
 }
 
 int
 icl_cxgbei_conn_task_setup(struct icl_conn *ic, struct icl_pdu *ip,
     struct ccb_scsiio *csio, uint32_t *ittp, void **arg)
 {
 	struct icl_cxgbei_conn *icc = ic_to_icc(ic);
 	struct toepcb *toep = icc->toep;
 	struct adapter *sc = icc->sc;
 	struct cxgbei_data *ci = sc->iscsi_ulp_softc;
 	struct ppod_region *pr = &ci->pr;
 	struct cxgbei_ddp_state *ddp;
 	struct ppod_reservation *prsv;
 	struct inpcb *inp;
 	struct mbufq mq;
 	uint32_t itt;
 	int rc = 0;
 
 	ICL_CONN_LOCK_ASSERT(ic);
 
 	/* This is for the offload driver's state.  Must not be set already. */
 	MPASS(arg != NULL);
 	MPASS(*arg == NULL);
 
 	if ((csio->ccb_h.flags & CAM_DIR_MASK) != CAM_DIR_IN ||
 	    csio->dxfer_len < ci->ddp_threshold || ic->ic_disconnecting ||
 	    ic->ic_socket == NULL) {
 no_ddp:
 		/*
 		 * No DDP for this I/O.	 Allocate an ITT (based on the one
 		 * passed in) that cannot be a valid hardware DDP tag in the
 		 * iSCSI region.
 		 */
 		itt = *ittp & M_PPOD_TAG;
 		itt = V_PPOD_TAG(itt) | pr->pr_invalid_bit;
 		*ittp = htobe32(itt);
 		MPASS(*arg == NULL);	/* State is maintained for DDP only. */
 		if (rc != 0)
 			counter_u64_add(
 			    toep->ofld_rxq->rx_iscsi_ddp_setup_error, 1);
 		return (0);
 	}
 
 	/*
 	 * Reserve resources for DDP, update the itt that should be used in the
 	 * PDU, and save DDP specific state for this I/O in *arg.
 	 */
 	ddp = malloc(sizeof(*ddp), M_CXGBEI, M_NOWAIT | M_ZERO);
 	if (ddp == NULL) {
 		rc = ENOMEM;
 		goto no_ddp;
 	}
 	prsv = &ddp->prsv;
 
 	mbufq_init(&mq, INT_MAX);
 	switch (csio->ccb_h.flags & CAM_DATA_MASK) {
 	case CAM_DATA_BIO:
 		rc = t4_alloc_page_pods_for_bio(pr,
 		    (struct bio *)csio->data_ptr, prsv);
 		if (rc != 0) {
 			free(ddp, M_CXGBEI);
 			goto no_ddp;
 		}
 
 		rc = t4_write_page_pods_for_bio(sc, toep, prsv,
 		    (struct bio *)csio->data_ptr, &mq);
 		if (__predict_false(rc != 0)) {
 			mbufq_drain(&mq);
 			t4_free_page_pods(prsv);
 			free(ddp, M_CXGBEI);
 			goto no_ddp;
 		}
 		break;
 	case CAM_DATA_VADDR:
 		rc = t4_alloc_page_pods_for_buf(pr, (vm_offset_t)csio->data_ptr,
 		    csio->dxfer_len, prsv);
 		if (rc != 0) {
 			free(ddp, M_CXGBEI);
 			goto no_ddp;
 		}
 
 		rc = t4_write_page_pods_for_buf(sc, toep, prsv,
 		    (vm_offset_t)csio->data_ptr, csio->dxfer_len, &mq);
 		if (__predict_false(rc != 0)) {
 			mbufq_drain(&mq);
 			t4_free_page_pods(prsv);
 			free(ddp, M_CXGBEI);
 			goto no_ddp;
 		}
 		break;
 	default:
 		free(ddp, M_CXGBEI);
 		rc = EINVAL;
 		goto no_ddp;
 	}
 
 	/*
 	 * Do not get inp from toep->inp as the toepcb might have
 	 * detached already.
 	 */
 	inp = sotoinpcb(ic->ic_socket);
 	INP_WLOCK(inp);
-	if ((inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) != 0) {
+	if ((inp->inp_flags & INP_DROPPED) != 0) {
 		INP_WUNLOCK(inp);
 		mbufq_drain(&mq);
 		t4_free_page_pods(prsv);
 		free(ddp, M_CXGBEI);
 		goto no_ddp;
 	}
 	mbufq_concat(&toep->ulp_pduq, &mq);
 	INP_WUNLOCK(inp);
 
 	ddp->cmp.last_datasn = -1;
 	cxgbei_insert_cmp(icc, &ddp->cmp, prsv->prsv_tag);
 	*ittp = htobe32(prsv->prsv_tag);
 	*arg = prsv;
 	counter_u64_add(toep->ofld_rxq->rx_iscsi_ddp_setup_ok, 1);
 	return (0);
 }
 
 void
 icl_cxgbei_conn_task_done(struct icl_conn *ic, void *arg)
 {
 
 	if (arg != NULL) {
 		struct cxgbei_ddp_state *ddp = arg;
 
 		cxgbei_rm_cmp(ic_to_icc(ic), &ddp->cmp);
 		t4_free_page_pods(&ddp->prsv);
 		free(ddp, M_CXGBEI);
 	}
 }
 
 static inline bool
 ddp_sgl_check(struct ctl_sg_entry *sg, int entries, int xferlen)
 {
 #ifdef INVARIANTS
 	int total_len = 0;
 #endif
 
 	MPASS(entries > 0);
 	if (((vm_offset_t)sg[--entries].addr & 3U) != 0)
 		return (false);
 
 #ifdef INVARIANTS
 	total_len += sg[entries].len;
 #endif
 
 	while (--entries >= 0) {
 		if (((vm_offset_t)sg[entries].addr & PAGE_MASK) != 0 ||
 		    (sg[entries].len % PAGE_SIZE) != 0)
 			return (false);
 #ifdef INVARIANTS
 		total_len += sg[entries].len;
 #endif
 	}
 
 	MPASS(total_len == xferlen);
 	return (true);
 }
 
 #define io_to_ddp_state(io) ((io)->io_hdr.ctl_private[CTL_PRIV_FRONTEND2].ptr)
 
 int
 icl_cxgbei_conn_transfer_setup(struct icl_conn *ic, struct icl_pdu *ip,
     union ctl_io *io, uint32_t *tttp, void **arg)
 {
 	struct icl_cxgbei_conn *icc = ic_to_icc(ic);
 	struct toepcb *toep = icc->toep;
 	struct ctl_scsiio *ctsio = &io->scsiio;
 	struct adapter *sc = icc->sc;
 	struct cxgbei_data *ci = sc->iscsi_ulp_softc;
 	struct ppod_region *pr = &ci->pr;
 	struct cxgbei_ddp_state *ddp;
 	struct ppod_reservation *prsv;
 	struct ctl_sg_entry *sgl, sg_entry;
 	struct inpcb *inp;
 	struct mbufq mq;
 	int sg_entries = ctsio->kern_sg_entries;
 	uint32_t ttt;
 	int xferlen, rc = 0, alias;
 
 	/* This is for the offload driver's state.  Must not be set already. */
 	MPASS(arg != NULL);
 	MPASS(*arg == NULL);
 
 	if (ctsio->ext_data_filled == 0) {
 		int first_burst;
 #ifdef INVARIANTS
 		struct icl_cxgbei_pdu *icp = ip_to_icp(ip);
 
 		MPASS(icp->icp_signature == CXGBEI_PDU_SIGNATURE);
 		MPASS(ic == ip->ip_conn);
 		MPASS(ip->ip_bhs_mbuf != NULL);
 #endif
 		first_burst = icl_pdu_data_segment_length(ip);
 
 		/*
 		 * Note that ICL calls conn_transfer_setup even if the first
 		 * burst had everything and there's nothing left to transfer.
 		 *
 		 * NB: The CTL frontend might have provided a buffer
 		 * whose length (kern_data_len) is smaller than the
 		 * FirstBurstLength of unsolicited data.  Treat those
 		 * as an empty transfer.
 		 */
 		xferlen = ctsio->kern_data_len;
 		if (xferlen < first_burst ||
 		    xferlen - first_burst < ci->ddp_threshold) {
 no_ddp:
 			/*
 			 * No DDP for this transfer.  Allocate a TTT (based on
 			 * the one passed in) that cannot be a valid hardware
 			 * DDP tag in the iSCSI region.
 			 */
 			ttt = *tttp & M_PPOD_TAG;
 			ttt = V_PPOD_TAG(ttt) | pr->pr_invalid_bit;
 			*tttp = htobe32(ttt);
 			MPASS(io_to_ddp_state(io) == NULL);
 			if (rc != 0)
 				counter_u64_add(
 				    toep->ofld_rxq->rx_iscsi_ddp_setup_error, 1);
 			return (0);
 		}
 
 		if (sg_entries == 0) {
 			sgl = &sg_entry;
 			sgl->len = xferlen;
 			sgl->addr = (void *)ctsio->kern_data_ptr;
 			sg_entries = 1;
 		} else
 			sgl = (void *)ctsio->kern_data_ptr;
 
 		if (!ddp_sgl_check(sgl, sg_entries, xferlen))
 			goto no_ddp;
 
 		/*
 		 * Reserve resources for DDP, update the ttt that should be used
 		 * in the PDU, and save DDP specific state for this I/O.
 		 */
 		MPASS(io_to_ddp_state(io) == NULL);
 		ddp = malloc(sizeof(*ddp), M_CXGBEI, M_NOWAIT | M_ZERO);
 		if (ddp == NULL) {
 			rc = ENOMEM;
 			goto no_ddp;
 		}
 		prsv = &ddp->prsv;
 
 		rc = t4_alloc_page_pods_for_sgl(pr, sgl, sg_entries, prsv);
 		if (rc != 0) {
 			free(ddp, M_CXGBEI);
 			goto no_ddp;
 		}
 
 		mbufq_init(&mq, INT_MAX);
 		rc = t4_write_page_pods_for_sgl(sc, toep, prsv, sgl, sg_entries,
 		    xferlen, &mq);
 		if (__predict_false(rc != 0)) {
 			mbufq_drain(&mq);
 			t4_free_page_pods(prsv);
 			free(ddp, M_CXGBEI);
 			goto no_ddp;
 		}
 
 		/*
 		 * Do not get inp from toep->inp as the toepcb might
 		 * have detached already.
 		 */
 		ICL_CONN_LOCK(ic);
 		if (ic->ic_disconnecting || ic->ic_socket == NULL) {
 			ICL_CONN_UNLOCK(ic);
 			mbufq_drain(&mq);
 			t4_free_page_pods(prsv);
 			free(ddp, M_CXGBEI);
 			return (ECONNRESET);
 		}
 		inp = sotoinpcb(ic->ic_socket);
 		INP_WLOCK(inp);
 		ICL_CONN_UNLOCK(ic);
-		if ((inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) != 0) {
+		if ((inp->inp_flags & INP_DROPPED) != 0) {
 			INP_WUNLOCK(inp);
 			mbufq_drain(&mq);
 			t4_free_page_pods(prsv);
 			free(ddp, M_CXGBEI);
 			return (ECONNRESET);
 		}
 		mbufq_concat(&toep->ulp_pduq, &mq);
 		INP_WUNLOCK(inp);
 
 		ddp->cmp.next_buffer_offset = ctsio->kern_rel_offset +
 		    first_burst;
 		ddp->cmp.last_datasn = -1;
 		cxgbei_insert_cmp(icc, &ddp->cmp, prsv->prsv_tag);
 		*tttp = htobe32(prsv->prsv_tag);
 		io_to_ddp_state(io) = ddp;
 		*arg = ctsio;
 		counter_u64_add(toep->ofld_rxq->rx_iscsi_ddp_setup_ok, 1);
 		return (0);
 	}
 
 	/*
 	 * In the middle of an I/O.  A non-NULL page pod reservation indicates
 	 * that a DDP buffer is being used for the I/O.
 	 */
 	ddp = io_to_ddp_state(ctsio);
 	if (ddp == NULL)
 		goto no_ddp;
 	prsv = &ddp->prsv;
 
 	alias = (prsv->prsv_tag & pr->pr_alias_mask) >> pr->pr_alias_shift;
 	alias++;
 	prsv->prsv_tag &= ~pr->pr_alias_mask;
 	prsv->prsv_tag |= alias << pr->pr_alias_shift & pr->pr_alias_mask;
 
 	ddp->cmp.last_datasn = -1;
 	cxgbei_insert_cmp(icc, &ddp->cmp, prsv->prsv_tag);
 	*tttp = htobe32(prsv->prsv_tag);
 	*arg = ctsio;
 
 	return (0);
 }
 
 void
 icl_cxgbei_conn_transfer_done(struct icl_conn *ic, void *arg)
 {
 	struct ctl_scsiio *ctsio = arg;
 
 	if (ctsio != NULL) {
 		struct cxgbei_ddp_state *ddp;
 
 		ddp = io_to_ddp_state(ctsio);
 		MPASS(ddp != NULL);
 
 		cxgbei_rm_cmp(ic_to_icc(ic), &ddp->cmp);
 		if (ctsio->kern_data_len == ctsio->ext_data_filled ||
 		    ic->ic_disconnecting) {
 			t4_free_page_pods(&ddp->prsv);
 			free(ddp, M_CXGBEI);
 			io_to_ddp_state(ctsio) = NULL;
 		}
 	}
 }
 
 #ifdef COMPAT_FREEBSD13
 static void
 cxgbei_limits(struct adapter *sc, void *arg)
 {
 	struct icl_drv_limits *idl = arg;
 	struct cxgbei_data *ci;
 	int max_dsl;
 
 	if (begin_synchronized_op(sc, NULL, HOLD_LOCK, "t4lims") != 0)
 		return;
 
 	if (uld_active(sc, ULD_ISCSI)) {
 		ci = sc->iscsi_ulp_softc;
 		MPASS(ci != NULL);
 
 
 		max_dsl = ci->max_rx_data_len;
 		if (idl->idl_max_recv_data_segment_length > max_dsl)
 			idl->idl_max_recv_data_segment_length = max_dsl;
 
 		max_dsl = ci->max_tx_data_len;
 		if (idl->idl_max_send_data_segment_length > max_dsl)
 			idl->idl_max_send_data_segment_length = max_dsl;
 	}
 
 	end_synchronized_op(sc, LOCK_HELD);
 }
 #endif
 
 static int
 cxgbei_limits_fd(struct icl_drv_limits *idl, int fd)
 {
 	struct find_ofld_adapter_rr fa;
 	struct file *fp;
 	struct socket *so;
 	struct adapter *sc;
 	struct cxgbei_data *ci;
 	cap_rights_t rights;
 	int error;
 
 	error = fget(curthread, fd,
 	    cap_rights_init_one(&rights, CAP_SOCK_CLIENT), &fp);
 	if (error != 0)
 		return (error);
 	if (fp->f_type != DTYPE_SOCKET) {
 		fdrop(fp, curthread);
 		return (EINVAL);
 	}
 	so = fp->f_data;
 	if (so->so_type != SOCK_STREAM ||
 	    so->so_proto->pr_protocol != IPPROTO_TCP) {
 		fdrop(fp, curthread);
 		return (EINVAL);
 	}
 
 	/* Find the adapter offloading this socket. */
 	fa.sc = NULL;
 	fa.so = so;
 	t4_iterate(find_offload_adapter, &fa);
 	if (fa.sc == NULL) {
 		fdrop(fp, curthread);
 		return (ENXIO);
 	}
 	fdrop(fp, curthread);
 
 	sc = fa.sc;
 	error = begin_synchronized_op(sc, NULL, HOLD_LOCK, "t4lims");
 	if (error != 0)
 		return (error);
 
 	if (uld_active(sc, ULD_ISCSI)) {
 		ci = sc->iscsi_ulp_softc;
 		MPASS(ci != NULL);
 
 		idl->idl_max_recv_data_segment_length = ci->max_rx_data_len;
 		idl->idl_max_send_data_segment_length = ci->max_tx_data_len;
 	} else
 		error = ENXIO;
 
 	end_synchronized_op(sc, LOCK_HELD);
 
 	return (error);
 }
 
 static int
 icl_cxgbei_limits(struct icl_drv_limits *idl, int socket)
 {
 
 	/* Maximum allowed by the RFC.	cxgbei_limits will clip them. */
 	idl->idl_max_recv_data_segment_length = (1 << 24) - 1;
 	idl->idl_max_send_data_segment_length = (1 << 24) - 1;
 
 	/* These are somewhat arbitrary. */
 	idl->idl_max_burst_length = max_burst_length;
 	idl->idl_first_burst_length = first_burst_length;
 
 #ifdef COMPAT_FREEBSD13
 	if (socket == 0) {
 		t4_iterate(cxgbei_limits, idl);
 		return (0);
 	}
 #endif
 
 	return (cxgbei_limits_fd(idl, socket));
 }
 
 int
 icl_cxgbei_mod_load(void)
 {
 	int rc;
 
 	refcount_init(&icl_cxgbei_ncons, 0);
 
 	rc = icl_register("cxgbei", false, -100, icl_cxgbei_limits,
 	    icl_cxgbei_new_conn);
 
 	return (rc);
 }
 
 int
 icl_cxgbei_mod_unload(void)
 {
 
 	if (icl_cxgbei_ncons != 0)
 		return (EBUSY);
 
 	icl_unregister("cxgbei", false);
 
 	return (0);
 }
 #endif
diff --git a/sys/dev/cxgbe/iw_cxgbe/qp.c b/sys/dev/cxgbe/iw_cxgbe/qp.c
index b2901f93988e..d3d4c0573a6f 100644
--- a/sys/dev/cxgbe/iw_cxgbe/qp.c
+++ b/sys/dev/cxgbe/iw_cxgbe/qp.c
@@ -1,1971 +1,1971 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2009-2013 Chelsio, Inc. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
  * General Public License (GPL) Version 2, available from the file
  * COPYING in the main directory of this source tree, or the
  * OpenIB.org BSD license below:
  *
  *     Redistribution and use in source and binary forms, with or
  *     without modification, are permitted provided that the following
  *     conditions are met:
  *
  *      - Redistributions of source code must retain the above
  *        copyright notice, this list of conditions and the following
  *        disclaimer.
  *
  *      - Redistributions in binary form must reproduce the above
  *        copyright notice, this list of conditions and the following
  *        disclaimer in the documentation and/or other materials
  *        provided with the distribution.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 
 #ifdef TCP_OFFLOAD
 #include <sys/types.h>
 #include <sys/malloc.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sockio.h>
 #include <sys/taskqueue.h>
 #include <netinet/in.h>
 #include <net/route.h>
 
 #include <netinet/in_systm.h>
 #include <netinet/in_pcb.h>
 #include <netinet/ip.h>
 #include <netinet/ip_var.h>
 #include <netinet/tcp_var.h>
 #include <netinet/tcp.h>
 #include <netinet/tcpip.h>
 
 #include <netinet/toecore.h>
 
 struct sge_iq;
 struct rss_header;
 struct cpl_set_tcb_rpl;
 #include <linux/types.h>
 #include "offload.h"
 #include "tom/t4_tom.h"
 
 #include "iw_cxgbe.h"
 #include "user.h"
 
 static int creds(struct toepcb *toep, struct inpcb *inp, size_t wrsize);
 static int max_fr_immd = T4_MAX_FR_IMMD;//SYSCTL parameter later...
 
 static int alloc_ird(struct c4iw_dev *dev, u32 ird)
 {
 	int ret = 0;
 
 	spin_lock_irq(&dev->lock);
 	if (ird <= dev->avail_ird)
 		dev->avail_ird -= ird;
 	else
 		ret = -ENOMEM;
 	spin_unlock_irq(&dev->lock);
 
 	if (ret)
 		log(LOG_WARNING, "%s: device IRD resources exhausted\n",
 			device_get_nameunit(dev->rdev.adap->dev));
 
 	return ret;
 }
 
 static void free_ird(struct c4iw_dev *dev, int ird)
 {
 	spin_lock_irq(&dev->lock);
 	dev->avail_ird += ird;
 	spin_unlock_irq(&dev->lock);
 }
 
 static void set_state(struct c4iw_qp *qhp, enum c4iw_qp_state state)
 {
 	unsigned long flag;
 	spin_lock_irqsave(&qhp->lock, flag);
 	qhp->attr.state = state;
 	spin_unlock_irqrestore(&qhp->lock, flag);
 }
 
 static int destroy_qp(struct c4iw_rdev *rdev, struct t4_wq *wq,
 		      struct c4iw_dev_ucontext *uctx)
 {
 	struct c4iw_dev *rhp = rdev_to_c4iw_dev(rdev);
 	/*
 	 * uP clears EQ contexts when the connection exits rdma mode,
 	 * so no need to post a RESET WR for these EQs.
 	 */
 	dma_free_coherent(rhp->ibdev.dma_device,
 			wq->rq.memsize, wq->rq.queue,
 			dma_unmap_addr(&wq->rq, mapping));
 	dma_free_coherent(rhp->ibdev.dma_device,
 			wq->sq.memsize, wq->sq.queue,
 			dma_unmap_addr(&wq->sq, mapping));
 	c4iw_rqtpool_free(rdev, wq->rq.rqt_hwaddr, wq->rq.rqt_size);
 	kfree(wq->rq.sw_rq);
 	kfree(wq->sq.sw_sq);
 	c4iw_put_qpid(rdev, wq->rq.qid, uctx);
 	c4iw_put_qpid(rdev, wq->sq.qid, uctx);
 	return 0;
 }
 
 static int create_qp(struct c4iw_rdev *rdev, struct t4_wq *wq,
 		     struct t4_cq *rcq, struct t4_cq *scq,
 		     struct c4iw_dev_ucontext *uctx)
 {
 	struct adapter *sc = rdev->adap;
 	struct c4iw_dev *rhp = rdev_to_c4iw_dev(rdev);
 	int user = (uctx != &rdev->uctx);
 	struct fw_ri_res_wr *res_wr;
 	struct fw_ri_res *res;
 	int wr_len;
 	struct c4iw_wr_wait wr_wait;
 	int ret = 0;
 	int eqsize;
 	struct wrqe *wr;
 	u64 sq_bar2_qoffset = 0, rq_bar2_qoffset = 0;
 
 	wq->sq.qid = c4iw_get_qpid(rdev, uctx);
 	if (!wq->sq.qid)
 		return -ENOMEM;
 
 	wq->rq.qid = c4iw_get_qpid(rdev, uctx);
 	if (!wq->rq.qid) {
 		ret = -ENOMEM;
 		goto free_sq_qid;
 	}
 
 	if (!user) {
 		wq->sq.sw_sq = kzalloc(wq->sq.size * sizeof *wq->sq.sw_sq,
 				 GFP_KERNEL);
 		if (!wq->sq.sw_sq) {
 			ret = -ENOMEM;
 			goto free_rq_qid;
 		}
 
 		wq->rq.sw_rq = kzalloc(wq->rq.size * sizeof *wq->rq.sw_rq,
 				 GFP_KERNEL);
 		if (!wq->rq.sw_rq) {
 			ret = -ENOMEM;
 			goto free_sw_sq;
 		}
 	}
 
 	/*
 	 * RQT must be a power of 2 and at least 16 deep.
 	 */
 	wq->rq.rqt_size = roundup_pow_of_two(max_t(u16, wq->rq.size, 16));
 	wq->rq.rqt_hwaddr = c4iw_rqtpool_alloc(rdev, wq->rq.rqt_size);
 	if (!wq->rq.rqt_hwaddr) {
 		ret = -ENOMEM;
 		goto free_sw_rq;
 	}
 
 	/*QP memory, allocate DMAable memory for Send & Receive Queues */
 	wq->sq.queue = dma_alloc_coherent(rhp->ibdev.dma_device, wq->sq.memsize,
 				       &(wq->sq.dma_addr), GFP_KERNEL);
 	if (!wq->sq.queue) {
 		ret = -ENOMEM;
 		goto free_hwaddr;
 	}
 	wq->sq.phys_addr = vtophys(wq->sq.queue);
 	dma_unmap_addr_set(&wq->sq, mapping, wq->sq.dma_addr);
 	memset(wq->sq.queue, 0, wq->sq.memsize);
 
 	wq->rq.queue = dma_alloc_coherent(rhp->ibdev.dma_device,
 			wq->rq.memsize, &(wq->rq.dma_addr), GFP_KERNEL);
 	if (!wq->rq.queue) {
 		ret = -ENOMEM;
 		goto free_sq_dma;
 	}
 	wq->rq.phys_addr = vtophys(wq->rq.queue);
 	dma_unmap_addr_set(&wq->rq, mapping, wq->rq.dma_addr);
 	memset(wq->rq.queue, 0, wq->rq.memsize);
 
 	CTR5(KTR_IW_CXGBE,
 	    "%s QP sq base va 0x%p pa 0x%llx rq base va 0x%p pa 0x%llx",
 	    __func__,
 	    wq->sq.queue, (unsigned long long)wq->sq.phys_addr,
 	    wq->rq.queue, (unsigned long long)wq->rq.phys_addr);
 
 	/* Doorbell/WC regions, determine the BAR2 queue offset and qid. */
 	t4_bar2_sge_qregs(rdev->adap, wq->sq.qid, T4_BAR2_QTYPE_EGRESS, user,
 			&sq_bar2_qoffset, &wq->sq.bar2_qid);
 	t4_bar2_sge_qregs(rdev->adap, wq->rq.qid, T4_BAR2_QTYPE_EGRESS, user,
 			&rq_bar2_qoffset, &wq->rq.bar2_qid);
 
 	if (user) {
 		/* Compute BAR2 DB/WC physical address(page-aligned) for
 		 * Userspace mapping.
 		 */
 		wq->sq.bar2_pa = (rdev->bar2_pa + sq_bar2_qoffset) & PAGE_MASK;
 		wq->rq.bar2_pa = (rdev->bar2_pa + rq_bar2_qoffset) & PAGE_MASK;
 		CTR3(KTR_IW_CXGBE,
 			"%s BAR2 DB/WC sq base pa 0x%llx rq base pa 0x%llx",
 			__func__, (unsigned long long)wq->sq.bar2_pa,
 			(unsigned long long)wq->rq.bar2_pa);
 	} else {
 		/* Compute BAR2 DB/WC virtual address to access in kernel. */
 		wq->sq.bar2_va = (void __iomem *)((u64)rdev->bar2_kva +
 				sq_bar2_qoffset);
 		wq->rq.bar2_va = (void __iomem *)((u64)rdev->bar2_kva +
 				rq_bar2_qoffset);
 		CTR3(KTR_IW_CXGBE, "%s BAR2 DB/WC sq base va %p rq base va %p",
 			__func__, (unsigned long long)wq->sq.bar2_va,
 			(unsigned long long)wq->rq.bar2_va);
 	}
 
 	wq->rdev = rdev;
 	wq->rq.msn = 1;
 
 	/* build fw_ri_res_wr */
 	wr_len = sizeof *res_wr + 2 * sizeof *res;
 
 	wr = alloc_wrqe(wr_len, &sc->sge.ctrlq[0]);
 	if (wr == NULL) {
 		ret = -ENOMEM;
 		goto free_rq_dma;
 	}
         res_wr = wrtod(wr);
 
 	memset(res_wr, 0, wr_len);
 	res_wr->op_nres = cpu_to_be32(
 			V_FW_WR_OP(FW_RI_RES_WR) |
 			V_FW_RI_RES_WR_NRES(2) |
 			F_FW_WR_COMPL);
 	res_wr->len16_pkd = cpu_to_be32(DIV_ROUND_UP(wr_len, 16));
 	res_wr->cookie = (unsigned long) &wr_wait;
 	res = res_wr->res;
 	res->u.sqrq.restype = FW_RI_RES_TYPE_SQ;
 	res->u.sqrq.op = FW_RI_RES_OP_WRITE;
 
 	/* eqsize is the number of 64B entries plus the status page size. */
 	eqsize = wq->sq.size * T4_SQ_NUM_SLOTS +
 			rdev->hw_queue.t4_eq_status_entries;
 
 	res->u.sqrq.fetchszm_to_iqid = cpu_to_be32(
 		V_FW_RI_RES_WR_HOSTFCMODE(0) |	/* no host cidx updates */
 		V_FW_RI_RES_WR_CPRIO(0) |	/* don't keep in chip cache */
 		V_FW_RI_RES_WR_PCIECHN(0) |	/* set by uP at ri_init time */
 		V_FW_RI_RES_WR_IQID(scq->cqid));
 	res->u.sqrq.dcaen_to_eqsize = cpu_to_be32(
 		V_FW_RI_RES_WR_DCAEN(0) |
 		V_FW_RI_RES_WR_DCACPU(0) |
 		V_FW_RI_RES_WR_FBMIN(chip_id(sc) <= CHELSIO_T5 ?
 		    X_FETCHBURSTMIN_64B : X_FETCHBURSTMIN_64B_T6) |
 		V_FW_RI_RES_WR_FBMAX(3) |
 		V_FW_RI_RES_WR_CIDXFTHRESHO(0) |
 		V_FW_RI_RES_WR_CIDXFTHRESH(0) |
 		V_FW_RI_RES_WR_EQSIZE(eqsize));
 	res->u.sqrq.eqid = cpu_to_be32(wq->sq.qid);
 	res->u.sqrq.eqaddr = cpu_to_be64(wq->sq.dma_addr);
 	res++;
 	res->u.sqrq.restype = FW_RI_RES_TYPE_RQ;
 	res->u.sqrq.op = FW_RI_RES_OP_WRITE;
 
 	/* eqsize is the number of 64B entries plus the status page size. */
 	eqsize = wq->rq.size * T4_RQ_NUM_SLOTS +
 			rdev->hw_queue.t4_eq_status_entries;
 	res->u.sqrq.fetchszm_to_iqid = cpu_to_be32(
 		V_FW_RI_RES_WR_HOSTFCMODE(0) |	/* no host cidx updates */
 		V_FW_RI_RES_WR_CPRIO(0) |	/* don't keep in chip cache */
 		V_FW_RI_RES_WR_PCIECHN(0) |	/* set by uP at ri_init time */
 		V_FW_RI_RES_WR_IQID(rcq->cqid));
 	res->u.sqrq.dcaen_to_eqsize = cpu_to_be32(
 		V_FW_RI_RES_WR_DCAEN(0) |
 		V_FW_RI_RES_WR_DCACPU(0) |
 		V_FW_RI_RES_WR_FBMIN(chip_id(sc) <= CHELSIO_T5 ?
 		    X_FETCHBURSTMIN_64B : X_FETCHBURSTMIN_64B_T6) |
 		V_FW_RI_RES_WR_FBMAX(3) |
 		V_FW_RI_RES_WR_CIDXFTHRESHO(0) |
 		V_FW_RI_RES_WR_CIDXFTHRESH(0) |
 		V_FW_RI_RES_WR_EQSIZE(eqsize));
 	res->u.sqrq.eqid = cpu_to_be32(wq->rq.qid);
 	res->u.sqrq.eqaddr = cpu_to_be64(wq->rq.dma_addr);
 
 	c4iw_init_wr_wait(&wr_wait);
 
 	t4_wrq_tx(sc, wr);
 	ret = c4iw_wait_for_reply(rdev, &wr_wait, 0, wq->sq.qid,
 			NULL, __func__);
 	if (ret)
 		goto free_rq_dma;
 
 	CTR5(KTR_IW_CXGBE,
 	    "%s sqid 0x%x rqid 0x%x kdb 0x%p squdb 0x%llx rqudb 0x%llx",
 	    __func__, wq->sq.qid, wq->rq.qid,
 	    (unsigned long long)wq->sq.bar2_va,
 	    (unsigned long long)wq->rq.bar2_va);
 
 	return 0;
 free_rq_dma:
 	dma_free_coherent(rhp->ibdev.dma_device,
 			  wq->rq.memsize, wq->rq.queue,
 			  dma_unmap_addr(&wq->rq, mapping));
 free_sq_dma:
 	dma_free_coherent(rhp->ibdev.dma_device,
 			  wq->sq.memsize, wq->sq.queue,
 			  dma_unmap_addr(&wq->sq, mapping));
 free_hwaddr:
 	c4iw_rqtpool_free(rdev, wq->rq.rqt_hwaddr, wq->rq.rqt_size);
 free_sw_rq:
 	kfree(wq->rq.sw_rq);
 free_sw_sq:
 	kfree(wq->sq.sw_sq);
 free_rq_qid:
 	c4iw_put_qpid(rdev, wq->rq.qid, uctx);
 free_sq_qid:
 	c4iw_put_qpid(rdev, wq->sq.qid, uctx);
 	return ret;
 }
 
 static int build_immd(struct t4_sq *sq, struct fw_ri_immd *immdp,
 		      const struct ib_send_wr *wr, int max, u32 *plenp)
 {
 	u8 *dstp, *srcp;
 	u32 plen = 0;
 	int i;
 	int rem, len;
 
 	dstp = (u8 *)immdp->data;
 	for (i = 0; i < wr->num_sge; i++) {
 		if ((plen + wr->sg_list[i].length) > max)
 			return -EMSGSIZE;
 		srcp = (u8 *)(unsigned long)wr->sg_list[i].addr;
 		plen += wr->sg_list[i].length;
 		rem = wr->sg_list[i].length;
 		while (rem) {
 			if (dstp == (u8 *)&sq->queue[sq->size])
 				dstp = (u8 *)sq->queue;
 			if (rem <= (u8 *)&sq->queue[sq->size] - dstp)
 				len = rem;
 			else
 				len = (u8 *)&sq->queue[sq->size] - dstp;
 			memcpy(dstp, srcp, len);
 			dstp += len;
 			srcp += len;
 			rem -= len;
 		}
 	}
 	len = roundup(plen + sizeof *immdp, 16) - (plen + sizeof *immdp);
 	if (len)
 		memset(dstp, 0, len);
 	immdp->op = FW_RI_DATA_IMMD;
 	immdp->r1 = 0;
 	immdp->r2 = 0;
 	immdp->immdlen = cpu_to_be32(plen);
 	*plenp = plen;
 	return 0;
 }
 
 static int build_isgl(__be64 *queue_start, __be64 *queue_end,
 		      struct fw_ri_isgl *isglp, struct ib_sge *sg_list,
 		      int num_sge, u32 *plenp)
 
 {
 	int i;
 	u32 plen = 0;
 	__be64 *flitp = (__be64 *)isglp->sge;
 
 	for (i = 0; i < num_sge; i++) {
 		if ((plen + sg_list[i].length) < plen)
 			return -EMSGSIZE;
 		plen += sg_list[i].length;
 		*flitp = cpu_to_be64(((u64)sg_list[i].lkey << 32) |
 				     sg_list[i].length);
 		if (++flitp == queue_end)
 			flitp = queue_start;
 		*flitp = cpu_to_be64(sg_list[i].addr);
 		if (++flitp == queue_end)
 			flitp = queue_start;
 	}
 	*flitp = (__force __be64)0;
 	isglp->op = FW_RI_DATA_ISGL;
 	isglp->r1 = 0;
 	isglp->nsge = cpu_to_be16(num_sge);
 	isglp->r2 = 0;
 	if (plenp)
 		*plenp = plen;
 	return 0;
 }
 
 static int build_rdma_send(struct t4_sq *sq, union t4_wr *wqe,
 			   const struct ib_send_wr *wr, u8 *len16)
 {
 	u32 plen;
 	int size;
 	int ret;
 
 	if (wr->num_sge > T4_MAX_SEND_SGE)
 		return -EINVAL;
 	switch (wr->opcode) {
 	case IB_WR_SEND:
 		if (wr->send_flags & IB_SEND_SOLICITED)
 			wqe->send.sendop_pkd = cpu_to_be32(
 				V_FW_RI_SEND_WR_SENDOP(FW_RI_SEND_WITH_SE));
 		else
 			wqe->send.sendop_pkd = cpu_to_be32(
 				V_FW_RI_SEND_WR_SENDOP(FW_RI_SEND));
 		wqe->send.stag_inv = 0;
 		break;
 	case IB_WR_SEND_WITH_INV:
 		if (wr->send_flags & IB_SEND_SOLICITED)
 			wqe->send.sendop_pkd = cpu_to_be32(
 				V_FW_RI_SEND_WR_SENDOP(FW_RI_SEND_WITH_SE_INV));
 		else
 			wqe->send.sendop_pkd = cpu_to_be32(
 				V_FW_RI_SEND_WR_SENDOP(FW_RI_SEND_WITH_INV));
 		wqe->send.stag_inv = cpu_to_be32(wr->ex.invalidate_rkey);
 		break;
 
 	default:
 		return -EINVAL;
 	}
 	wqe->send.r3 = 0;
 	wqe->send.r4 = 0;
 
 	plen = 0;
 	if (wr->num_sge) {
 		if (wr->send_flags & IB_SEND_INLINE) {
 			ret = build_immd(sq, wqe->send.u.immd_src, wr,
 					 T4_MAX_SEND_INLINE, &plen);
 			if (ret)
 				return ret;
 			size = sizeof wqe->send + sizeof(struct fw_ri_immd) +
 			       plen;
 		} else {
 			ret = build_isgl((__be64 *)sq->queue,
 					 (__be64 *)&sq->queue[sq->size],
 					 wqe->send.u.isgl_src,
 					 wr->sg_list, wr->num_sge, &plen);
 			if (ret)
 				return ret;
 			size = sizeof wqe->send + sizeof(struct fw_ri_isgl) +
 			       wr->num_sge * sizeof(struct fw_ri_sge);
 		}
 	} else {
 		wqe->send.u.immd_src[0].op = FW_RI_DATA_IMMD;
 		wqe->send.u.immd_src[0].r1 = 0;
 		wqe->send.u.immd_src[0].r2 = 0;
 		wqe->send.u.immd_src[0].immdlen = 0;
 		size = sizeof wqe->send + sizeof(struct fw_ri_immd);
 		plen = 0;
 	}
 	*len16 = DIV_ROUND_UP(size, 16);
 	wqe->send.plen = cpu_to_be32(plen);
 	return 0;
 }
 
 /*
  * Build an RDMA WRITE work request in @wqe from the verbs request @wr.
  *
  * The payload is described either as immediate data (IB_SEND_INLINE, via
  * build_immd()) or as an SGL (via build_isgl()).  A request without any
  * SGE is encoded as an empty immediate.  On success *len16 receives the
  * work request size in 16-byte units and 0 is returned.  Returns -EINVAL
  * when wr->num_sge exceeds T4_MAX_SEND_SGE, otherwise the error reported
  * by build_immd()/build_isgl().
  */
 static int build_rdma_write(struct t4_sq *sq, union t4_wr *wqe,
 			    const struct ib_send_wr *wr, u8 *len16)
 {
 	u32 payload_len;
 	int wr_bytes;
 	int rc;
 
 	if (wr->num_sge > T4_MAX_SEND_SGE)
 		return -EINVAL;
 
 	wqe->write.immd_data = 0;
 	wqe->write.stag_sink = cpu_to_be32(rdma_wr(wr)->rkey);
 	wqe->write.to_sink = cpu_to_be64(rdma_wr(wr)->remote_addr);
 
 	if (!wr->num_sge) {
 		/* No SGEs: emit an empty immediate descriptor. */
 		wqe->write.u.immd_src[0].op = FW_RI_DATA_IMMD;
 		wqe->write.u.immd_src[0].r1 = 0;
 		wqe->write.u.immd_src[0].r2 = 0;
 		wqe->write.u.immd_src[0].immdlen = 0;
 		wr_bytes = sizeof wqe->write + sizeof(struct fw_ri_immd);
 		payload_len = 0;
 	} else if (wr->send_flags & IB_SEND_INLINE) {
 		rc = build_immd(sq, wqe->write.u.immd_src, wr,
 				T4_MAX_WRITE_INLINE, &payload_len);
 		if (rc)
 			return rc;
 		wr_bytes = sizeof wqe->write + sizeof(struct fw_ri_immd) +
 			   payload_len;
 	} else {
 		rc = build_isgl((__be64 *)sq->queue,
 				(__be64 *)&sq->queue[sq->size],
 				wqe->write.u.isgl_src,
 				wr->sg_list, wr->num_sge, &payload_len);
 		if (rc)
 			return rc;
 		wr_bytes = sizeof wqe->write + sizeof(struct fw_ri_isgl) +
 			   wr->num_sge * sizeof(struct fw_ri_sge);
 	}
 
 	*len16 = DIV_ROUND_UP(wr_bytes, 16);
 	wqe->write.plen = cpu_to_be32(payload_len);
 	return 0;
 }
 
 /*
  * Build an RDMA READ work request in @wqe from the verbs request @wr.
  *
  * At most one SGE is accepted (-EINVAL otherwise).  A request with no SGE,
  * or whose single SGE has zero length, is encoded as a zero-length read
  * using the fixed STag value 2 for both source and sink.  *len16 receives
  * the work request size in 16-byte units.
  */
 static int build_rdma_read(union t4_wr *wqe, const struct ib_send_wr *wr, u8 *len16)
 {
 	if (wr->num_sge > 1)
 		return -EINVAL;
 
 	if (!wr->num_sge || !wr->sg_list[0].length) {
 		/* Zero-length read. */
 		wqe->read.stag_src = cpu_to_be32(2);
 		wqe->read.to_src_hi = 0;
 		wqe->read.to_src_lo = 0;
 		wqe->read.stag_sink = cpu_to_be32(2);
 		wqe->read.plen = 0;
 		wqe->read.to_sink_hi = 0;
 		wqe->read.to_sink_lo = 0;
 	} else {
 		const struct ib_sge *sink = &wr->sg_list[0];
 
 		/* Remote (source) side of the read. */
 		wqe->read.stag_src = cpu_to_be32(rdma_wr(wr)->rkey);
 		wqe->read.to_src_hi =
 			cpu_to_be32((u32)(rdma_wr(wr)->remote_addr >> 32));
 		wqe->read.to_src_lo =
 			cpu_to_be32((u32)rdma_wr(wr)->remote_addr);
 
 		/* Local (sink) side of the read. */
 		wqe->read.stag_sink = cpu_to_be32(sink->lkey);
 		wqe->read.plen = cpu_to_be32(sink->length);
 		wqe->read.to_sink_hi = cpu_to_be32((u32)(sink->addr >> 32));
 		wqe->read.to_sink_lo = cpu_to_be32((u32)(sink->addr));
 	}
 
 	wqe->read.r2 = 0;
 	wqe->read.r5 = 0;
 	*len16 = DIV_ROUND_UP(sizeof wqe->read, 16);
 	return 0;
 }
 
 /*
  * Build the SGL of a receive work request in @wqe from @wr and report the
  * resulting size in 16-byte units through *len16.  Returns 0 on success or
  * the error reported by build_isgl().
  */
 static int build_rdma_recv(struct c4iw_qp *qhp, union t4_recv_wr *wqe,
 			   const struct ib_recv_wr *wr, u8 *len16)
 {
 	int rc = build_isgl((__be64 *)qhp->wq.rq.queue,
 			    (__be64 *)&qhp->wq.rq.queue[qhp->wq.rq.size],
 			    &wqe->recv.isgl, wr->sg_list, wr->num_sge, NULL);
 
 	if (rc != 0)
 		return rc;
 
 	*len16 = DIV_ROUND_UP(sizeof wqe->recv +
 			      wr->num_sge * sizeof(struct fw_ri_sge), 16);
 	return 0;
 }
 
 /*
  * Build a local-invalidate work request for the rkey carried in
  * wr->ex.invalidate_rkey.  Always succeeds; *len16 receives the work
  * request size in 16-byte units.
  */
 static int build_inv_stag(union t4_wr *wqe, const struct ib_send_wr *wr,
 			  u8 *len16)
 {
 	wqe->inv.r2 = 0;
 	wqe->inv.stag_inv = cpu_to_be32(wr->ex.invalidate_rkey);
 
 	*len16 = DIV_ROUND_UP(sizeof wqe->inv, 16);
 
 	return 0;
 }
 
 /*
  * Work handler that performs the final teardown of a QP: it destroys the
  * queues, using the owning user context when there is one and the rdev
  * context otherwise, and then frees the QP structure itself.
  */
 static void free_qp_work(struct work_struct *work)
 {
 	struct c4iw_qp *qhp = container_of(work, struct c4iw_qp, free_work);
 	struct c4iw_ucontext *ucontext = qhp->ucontext;
 	struct c4iw_dev *rhp = qhp->rhp;
 
 	CTR3(KTR_IW_CXGBE, "%s qhp %p ucontext %p", __func__,
 			qhp, ucontext);
 
 	if (ucontext)
 		destroy_qp(&rhp->rdev, &qhp->wq, &ucontext->uctx);
 	else
 		destroy_qp(&rhp->rdev, &qhp->wq, &rhp->rdev.uctx);
 
 	kfree(qhp);
 }
 
 /*
  * kref release callback for a QP: defers the actual teardown to the
  * rdev's free_workq by queueing the QP's free_work item.
  */
 static void queue_qp_free(struct kref *kref)
 {
 	struct c4iw_qp *qhp = container_of(kref, struct c4iw_qp, kref);
 
 	CTR2(KTR_IW_CXGBE, "%s qhp %p", __func__, qhp);
 
 	queue_work(qhp->rhp->rdev.free_workq, &qhp->free_work);
 }
 
 void c4iw_qp_add_ref(struct ib_qp *qp)
 {
 	CTR2(KTR_IW_CXGBE, "%s ib_qp %p", __func__, qp);
 	kref_get(&to_c4iw_qp(qp)->kref);
 }
 
 void c4iw_qp_rem_ref(struct ib_qp *qp)
 {
 	CTR2(KTR_IW_CXGBE, "%s ib_qp %p", __func__, qp);
 	kref_put(&to_c4iw_qp(qp)->kref, queue_qp_free);
 }
 
 static void complete_sq_drain_wr(struct c4iw_qp *qhp, const struct ib_send_wr *wr)
 {
 	struct t4_cqe cqe = {};
 	struct c4iw_cq *schp;
 	unsigned long flag;
 	struct t4_cq *cq;
 
 	schp = to_c4iw_cq(qhp->ibqp.send_cq);
 	cq = &schp->cq;
 
 	PDBG("%s drain sq id %u\n", __func__, qhp->wq.sq.qid);
 	cqe.u.drain_cookie = wr->wr_id;
 	cqe.header = cpu_to_be32(V_CQE_STATUS(T4_ERR_SWFLUSH) |
 				 V_CQE_OPCODE(C4IW_DRAIN_OPCODE) |
 				 V_CQE_TYPE(1) |
 				 V_CQE_SWCQE(1) |
 				 V_CQE_QPID(qhp->wq.sq.qid));
 
 	spin_lock_irqsave(&schp->lock, flag);
 	cqe.bits_type_ts = cpu_to_be64(V_CQE_GENBIT((u64)cq->gen));
 	cq->sw_queue[cq->sw_pidx] = cqe;
 	t4_swcq_produce(cq);
 	spin_unlock_irqrestore(&schp->lock, flag);
 
 	spin_lock_irqsave(&schp->comp_handler_lock, flag);
 	(*schp->ibcq.comp_handler)(&schp->ibcq,
 				   schp->ibcq.cq_context);
 	spin_unlock_irqrestore(&schp->comp_handler_lock, flag);
 }
 
 static void complete_rq_drain_wr(struct c4iw_qp *qhp, const struct ib_recv_wr *wr)
 {
 	struct t4_cqe cqe = {};
 	struct c4iw_cq *rchp;
 	unsigned long flag;
 	struct t4_cq *cq;
 
 	rchp = to_c4iw_cq(qhp->ibqp.recv_cq);
 	cq = &rchp->cq;
 
 	PDBG("%s drain rq id %u\n", __func__, qhp->wq.sq.qid);
 	cqe.u.drain_cookie = wr->wr_id;
 	cqe.header = cpu_to_be32(V_CQE_STATUS(T4_ERR_SWFLUSH) |
 				 V_CQE_OPCODE(C4IW_DRAIN_OPCODE) |
 				 V_CQE_TYPE(0) |
 				 V_CQE_SWCQE(1) |
 				 V_CQE_QPID(qhp->wq.sq.qid));
 
 	spin_lock_irqsave(&rchp->lock, flag);
 	cqe.bits_type_ts = cpu_to_be64(V_CQE_GENBIT((u64)cq->gen));
 	cq->sw_queue[cq->sw_pidx] = cqe;
 	t4_swcq_produce(cq);
 	spin_unlock_irqrestore(&rchp->lock, flag);
 
 	spin_lock_irqsave(&rchp->comp_handler_lock, flag);
 	(*rchp->ibcq.comp_handler)(&rchp->ibcq,
 				   rchp->ibcq.cq_context);
 	spin_unlock_irqrestore(&rchp->comp_handler_lock, flag);
 }
 
 /*
  * Build a fast-register work request that carries the TPT entry and the
  * first two page-list entries of @mhp inline.
  *
  * Returns -EINVAL when the MR page size exceeds C4IW_MAX_PAGE_SIZE,
  * otherwise 0 with *len16 set to the work request size in 16-byte units.
  */
 static int build_tpte_memreg(struct fw_ri_fr_nsmr_tpte_wr *fr,
 		const struct ib_reg_wr *wr, struct c4iw_mr *mhp, u8 *len16)
 {
 	__be64 *pbl;
 
 	if (wr->mr->page_size > C4IW_MAX_PAGE_SIZE)
 		return -EINVAL;
 
 	fr->stag = cpu_to_be32(mhp->ibmr.rkey);
 	fr->r2 = cpu_to_be32(0);
 
 	/* TPT entry: validity/key/type/PD word. */
 	fr->tpte.valid_to_pdid = cpu_to_be32(F_FW_RI_TPTE_VALID |
 			V_FW_RI_TPTE_STAGKEY((mhp->ibmr.rkey & M_FW_RI_TPTE_STAGKEY)) |
 			V_FW_RI_TPTE_STAGSTATE(1) |
 			V_FW_RI_TPTE_STAGTYPE(FW_RI_STAG_NSMR) |
 			V_FW_RI_TPTE_PDID(mhp->attr.pdid));
 
 	/* TPT entry: permissions, addressing type and page-size shift. */
 	fr->tpte.locread_to_qpid = cpu_to_be32(
 			V_FW_RI_TPTE_PERM(c4iw_ib_to_tpt_access(wr->access)) |
 			V_FW_RI_TPTE_ADDRTYPE(FW_RI_VA_BASED_TO) |
 			V_FW_RI_TPTE_PS(ilog2(wr->mr->page_size) - 12));
 
 	/* TPT entry: PBL address in units of 8 bytes. */
 	fr->tpte.nosnoop_pbladdr = cpu_to_be32(V_FW_RI_TPTE_PBLADDR(
 			      PBL_OFF(&mhp->rhp->rdev, mhp->attr.pbl_addr)>>3));
 	fr->tpte.dca_mwbcnt_pstag = cpu_to_be32(0);
 
 	/* TPT entry: 64-bit length and VA, split into 32-bit halves. */
 	fr->tpte.len_hi = cpu_to_be32(mhp->ibmr.length >> 32);
 	fr->tpte.len_lo = cpu_to_be32(mhp->ibmr.length & 0xffffffff);
 	fr->tpte.va_hi = cpu_to_be32(mhp->ibmr.iova >> 32);
 	fr->tpte.va_lo_fbo = cpu_to_be32(mhp->ibmr.iova & 0xffffffff);
 
 	/* Two page-list entries are always copied. */
 	pbl = (__be64 *)fr->pbl;
 	pbl[0] = cpu_to_be64((u64)mhp->mpl[0]);
 	pbl[1] = cpu_to_be64((u64)mhp->mpl[1]);
 
 	*len16 = DIV_ROUND_UP(sizeof(*fr), 16);
 	return 0;
 }
 
 /*
  * Build a fast-register (FR_NSMR) work request for @mhp.
  *
  * The page list is passed either by reference through a single-entry DSGL
  * (when @dsgl_supported and use_dsgl are set and the padded list is larger
  * than max_fr_immd) or copied into the work request as immediate data,
  * wrapping at the end of the send queue and zero-padded up to a multiple
  * of 32 bytes.
  *
  * Returns -EINVAL when the page list is deeper than t4_max_fr_depth()
  * allows or the MR page size exceeds C4IW_MAX_PAGE_SIZE; otherwise 0 with
  * *len16 set to the work request size in 16-byte units.
  */
 static int build_memreg(struct t4_sq *sq, union t4_wr *wqe,
 		const struct ib_reg_wr *wr, struct c4iw_mr *mhp, u8 *len16,
 		bool dsgl_supported)
 {
 	const int pbl_bytes = roundup(mhp->mpl_len * sizeof(u64), 32);
 	struct fw_ri_immd *immd;
 	__be64 *slot;
 	int pad_bytes;
 	int n;
 
 	if (mhp->mpl_len > t4_max_fr_depth(&mhp->rhp->rdev, use_dsgl))
 		return -EINVAL;
 	if (wr->mr->page_size > C4IW_MAX_PAGE_SIZE)
 		return -EINVAL;
 
 	/* Fixed part of the work request. */
 	wqe->fr.qpbinde_to_dcacpu = 0;
 	wqe->fr.pgsz_shift = ilog2(wr->mr->page_size) - 12;
 	wqe->fr.addr_type = FW_RI_VA_BASED_TO;
 	wqe->fr.mem_perms = c4iw_ib_to_tpt_access(wr->access);
 	wqe->fr.stag = cpu_to_be32(wr->key);
 	wqe->fr.len_hi = cpu_to_be32(mhp->ibmr.length >> 32);
 	wqe->fr.len_lo = cpu_to_be32(mhp->ibmr.length & 0xffffffff);
 	wqe->fr.va_hi = cpu_to_be32(mhp->ibmr.iova >> 32);
 	wqe->fr.va_lo_fbo = cpu_to_be32(mhp->ibmr.iova & 0xffffffff);
 
 	if (dsgl_supported && use_dsgl && (pbl_bytes > max_fr_immd)) {
 		struct fw_ri_dsgl *dsgl = (struct fw_ri_dsgl *)(&wqe->fr + 1);
 
 		/* The page list is byte-swapped in place to big-endian. */
 		for (n = 0; n < mhp->mpl_len; n++)
 			mhp->mpl[n] =
 				     (__force u64)cpu_to_be64((u64)mhp->mpl[n]);
 
 		dsgl->op = FW_RI_DATA_DSGL;
 		dsgl->r1 = 0;
 		dsgl->nsge = cpu_to_be16(1);
 		dsgl->addr0 = cpu_to_be64(mhp->mpl_addr);
 		dsgl->len0 = cpu_to_be32(pbl_bytes);
 
 		*len16 = DIV_ROUND_UP(sizeof(wqe->fr) + sizeof(*dsgl), 16);
 		return 0;
 	}
 
 	/* Immediate-data variant: copy the page list into the queue. */
 	immd = (struct fw_ri_immd *)(&wqe->fr + 1);
 	immd->op = FW_RI_DATA_IMMD;
 	immd->r1 = 0;
 	immd->r2 = 0;
 	immd->immdlen = cpu_to_be32(pbl_bytes);
 
 	slot = (__be64 *)(immd + 1);
 	pad_bytes = pbl_bytes;
 	for (n = 0; n < mhp->mpl_len; n++) {
 		*slot = cpu_to_be64((u64)mhp->mpl[n]);
 		pad_bytes -= sizeof(*slot);
 		slot++;
 		/* Wrap to the start of the queue when the end is reached. */
 		if (slot == (__be64 *)&sq->queue[sq->size])
 			slot = (__be64 *)sq->queue;
 	}
 	BUG_ON(pad_bytes < 0);
 
 	/* Zero-fill the remainder of the padded page list. */
 	while (pad_bytes) {
 		*slot = 0;
 		pad_bytes -= sizeof(*slot);
 		slot++;
 		if (slot == (__be64 *)&sq->queue[sq->size])
 			slot = (__be64 *)sq->queue;
 	}
 
 	*len16 = DIV_ROUND_UP(sizeof(wqe->fr) + sizeof(*immd)
 			+ pbl_bytes, 16);
 	return 0;
 }
 
 int c4iw_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr,
 		   const struct ib_send_wr **bad_wr)
 {
 	int err = 0;
 	u8 len16 = 0;
 	enum fw_wr_opcodes fw_opcode = 0;
 	enum fw_ri_wr_flags fw_flags;
 	struct c4iw_qp *qhp;
 	union t4_wr *wqe = NULL;
 	u32 num_wrs;
 	struct t4_swsqe *swsqe;
 	unsigned long flag;
 	u16 idx = 0;
 	struct c4iw_rdev *rdev;
 
 	qhp = to_c4iw_qp(ibqp);
 	rdev = &qhp->rhp->rdev;
 	spin_lock_irqsave(&qhp->lock, flag);
 	if (t4_wq_in_error(&qhp->wq)) {
 		spin_unlock_irqrestore(&qhp->lock, flag);
 		complete_sq_drain_wr(qhp, wr);
 		return err;
 	}
 	num_wrs = t4_sq_avail(&qhp->wq);
 	if (num_wrs == 0) {
 		spin_unlock_irqrestore(&qhp->lock, flag);
 		*bad_wr = wr;
 		return -ENOMEM;
 	}
 	while (wr) {
 		if (num_wrs == 0) {
 			err = -ENOMEM;
 			*bad_wr = wr;
 			break;
 		}
 		wqe = (union t4_wr *)((u8 *)qhp->wq.sq.queue +
 		      qhp->wq.sq.wq_pidx * T4_EQ_ENTRY_SIZE);
 
 		fw_flags = 0;
 		if (wr->send_flags & IB_SEND_SOLICITED)
 			fw_flags |= FW_RI_SOLICITED_EVENT_FLAG;
 		if (wr->send_flags & IB_SEND_SIGNALED || qhp->sq_sig_all)
 			fw_flags |= FW_RI_COMPLETION_FLAG;
 		swsqe = &qhp->wq.sq.sw_sq[qhp->wq.sq.pidx];
 		switch (wr->opcode) {
 		case IB_WR_SEND_WITH_INV:
 		case IB_WR_SEND:
 			if (wr->send_flags & IB_SEND_FENCE)
 				fw_flags |= FW_RI_READ_FENCE_FLAG;
 			fw_opcode = FW_RI_SEND_WR;
 			if (wr->opcode == IB_WR_SEND)
 				swsqe->opcode = FW_RI_SEND;
 			else
 				swsqe->opcode = FW_RI_SEND_WITH_INV;
 			err = build_rdma_send(&qhp->wq.sq, wqe, wr, &len16);
 			break;
 		case IB_WR_RDMA_WRITE:
 			fw_opcode = FW_RI_RDMA_WRITE_WR;
 			swsqe->opcode = FW_RI_RDMA_WRITE;
 			err = build_rdma_write(&qhp->wq.sq, wqe, wr, &len16);
 			break;
 		case IB_WR_RDMA_READ:
 		case IB_WR_RDMA_READ_WITH_INV:
 			fw_opcode = FW_RI_RDMA_READ_WR;
 			swsqe->opcode = FW_RI_READ_REQ;
 			if (wr->opcode == IB_WR_RDMA_READ_WITH_INV) {
 				c4iw_invalidate_mr(qhp->rhp,
 						   wr->sg_list[0].lkey);
 				fw_flags = FW_RI_RDMA_READ_INVALIDATE;
 			} else {
 				fw_flags = 0;
 			}
 			err = build_rdma_read(wqe, wr, &len16);
 			if (err)
 				break;
 			swsqe->read_len = wr->sg_list[0].length;
 			if (!qhp->wq.sq.oldest_read)
 				qhp->wq.sq.oldest_read = swsqe;
 			break;
 		case IB_WR_REG_MR: {
 			struct c4iw_mr *mhp = to_c4iw_mr(reg_wr(wr)->mr);
 
 			swsqe->opcode = FW_RI_FAST_REGISTER;
 			if (rdev->adap->params.fr_nsmr_tpte_wr_support &&
 					!mhp->attr.state && mhp->mpl_len <= 2) {
 				fw_opcode = FW_RI_FR_NSMR_TPTE_WR;
 				err = build_tpte_memreg(&wqe->fr_tpte, reg_wr(wr),
 						mhp, &len16);
 			} else {
 				fw_opcode = FW_RI_FR_NSMR_WR;
 				err = build_memreg(&qhp->wq.sq, wqe, reg_wr(wr),
 					mhp, &len16,
 					rdev->adap->params.ulptx_memwrite_dsgl);
 			}
 			if (err)
 				break;
 			mhp->attr.state = 1;
 			break;
 		}
 		case IB_WR_LOCAL_INV:
 			if (wr->send_flags & IB_SEND_FENCE)
 				fw_flags |= FW_RI_LOCAL_FENCE_FLAG;
 			fw_opcode = FW_RI_INV_LSTAG_WR;
 			swsqe->opcode = FW_RI_LOCAL_INV;
 			err = build_inv_stag(wqe, wr, &len16);
 			c4iw_invalidate_mr(qhp->rhp, wr->ex.invalidate_rkey);
 			break;
 		default:
 			CTR2(KTR_IW_CXGBE, "%s post of type =%d TBD!", __func__,
 			     wr->opcode);
 			err = -EINVAL;
 		}
 		if (err) {
 			*bad_wr = wr;
 			break;
 		}
 		swsqe->idx = qhp->wq.sq.pidx;
 		swsqe->complete = 0;
 		swsqe->signaled = (wr->send_flags & IB_SEND_SIGNALED) ||
 					qhp->sq_sig_all;
 		swsqe->flushed = 0;
 		swsqe->wr_id = wr->wr_id;
 
 		init_wr_hdr(wqe, qhp->wq.sq.pidx, fw_opcode, fw_flags, len16);
 
 		CTR5(KTR_IW_CXGBE,
 		    "%s cookie 0x%llx pidx 0x%x opcode 0x%x read_len %u",
 		    __func__, (unsigned long long)wr->wr_id, qhp->wq.sq.pidx,
 		    swsqe->opcode, swsqe->read_len);
 		wr = wr->next;
 		num_wrs--;
 		t4_sq_produce(&qhp->wq, len16);
 		idx += DIV_ROUND_UP(len16*16, T4_EQ_ENTRY_SIZE);
 	}
 
 	t4_ring_sq_db(&qhp->wq, idx, wqe, rdev->adap->iwt.wc_en);
 	spin_unlock_irqrestore(&qhp->lock, flag);
 	return err;
 }
 
 /*
  * Post a chain of receive work requests to the receive queue of @ibqp.
  *
  * The QP lock is held (with interrupts saved) while requests are queued;
  * the doorbell is rung once for everything that was queued.  If the work
  * queue is already in error, a flush completion is generated for @wr
  * through complete_rq_drain_wr() and 0 is returned.
  *
  * Returns 0 on success.  On failure a negative errno is returned and
  * *bad_wr points at the offending request: -ENOMEM when the receive queue
  * is full, -EINVAL when a request has more than T4_MAX_RECV_SGE SGEs, or
  * the error reported by build_rdma_recv().
  */
 int c4iw_post_receive(struct ib_qp *ibqp, const struct ib_recv_wr *wr,
 		      const struct ib_recv_wr **bad_wr)
 {
 	struct c4iw_qp *qhp = to_c4iw_qp(ibqp);
 	union t4_recv_wr *wqe = NULL;
 	unsigned long irqflags;
 	u32 free_slots;
 	u16 db_inc = 0;
 	u8 len16 = 0;
 	int rc = 0;
 
 	spin_lock_irqsave(&qhp->lock, irqflags);
 
 	if (t4_wq_in_error(&qhp->wq)) {
 		spin_unlock_irqrestore(&qhp->lock, irqflags);
 		complete_rq_drain_wr(qhp, wr);
 		return rc;
 	}
 
 	free_slots = t4_rq_avail(&qhp->wq);
 	if (free_slots == 0) {
 		spin_unlock_irqrestore(&qhp->lock, irqflags);
 		*bad_wr = wr;
 		return -ENOMEM;
 	}
 
 	for (; wr != NULL; wr = wr->next, free_slots--) {
 		if (wr->num_sge > T4_MAX_RECV_SGE) {
 			rc = -EINVAL;
 			*bad_wr = wr;
 			break;
 		}
 
 		/* Next free hardware slot in the receive queue. */
 		wqe = (union t4_recv_wr *)((u8 *)qhp->wq.rq.queue +
 					   qhp->wq.rq.wq_pidx *
 					   T4_EQ_ENTRY_SIZE);
 
 		rc = free_slots ? build_rdma_recv(qhp, wqe, wr, &len16) :
 				  -ENOMEM;
 		if (rc) {
 			*bad_wr = wr;
 			break;
 		}
 
 		qhp->wq.rq.sw_rq[qhp->wq.rq.pidx].wr_id = wr->wr_id;
 
 		/* Work request header. */
 		wqe->recv.opcode = FW_RI_RECV_WR;
 		wqe->recv.r1 = 0;
 		wqe->recv.wrid = qhp->wq.rq.pidx;
 		wqe->recv.r2[0] = 0;
 		wqe->recv.r2[1] = 0;
 		wqe->recv.r2[2] = 0;
 		wqe->recv.len16 = len16;
 
 		CTR3(KTR_IW_CXGBE, "%s cookie 0x%llx pidx %u", __func__,
 		     (unsigned long long) wr->wr_id, qhp->wq.rq.pidx);
 
 		t4_rq_produce(&qhp->wq, len16);
 		/* Doorbell increment is counted in queue entries. */
 		db_inc += DIV_ROUND_UP(len16*16, T4_EQ_ENTRY_SIZE);
 	}
 
 	t4_ring_rq_db(&qhp->wq, db_inc, wqe, qhp->rhp->rdev.adap->iwt.wc_en);
 	spin_unlock_irqrestore(&qhp->lock, irqflags);
 	return rc;
 }
 
 /*
  * Translate the status of an error CQE into the layer/error-type byte and
  * error code of a TERMINATE message.
  *
  * With no CQE (@err_cqe == NULL), or a status that is not listed below,
  * the result is LAYER_RDMAP|DDP_LOCAL_CATA with error code 0.
  */
 static inline void build_term_codes(struct t4_cqe *err_cqe, u8 *layer_type,
 				    u8 *ecode)
 {
 	u8 lt = LAYER_RDMAP|DDP_LOCAL_CATA;
 	u8 ec = 0;
 	int cqe_status;
 	int cqe_opcode;
 	int is_send_inv;
 	int is_tagged;
 
 	if (!err_cqe)
 		goto done;
 
 	cqe_status = CQE_STATUS(err_cqe);
 	cqe_opcode = CQE_OPCODE(err_cqe);
 
 	/* Send-with-invalidate variants get "can't invalidate" codes. */
 	is_send_inv = (cqe_opcode == FW_RI_SEND_WITH_INV) ||
 		      (cqe_opcode == FW_RI_SEND_WITH_SE_INV);
 	/* Writes, and read responses seen on the RQ side, are tagged. */
 	is_tagged = (cqe_opcode == FW_RI_RDMA_WRITE) ||
 		    (RQ_TYPE(err_cqe) && (cqe_opcode == FW_RI_READ_RESP));
 
 	switch (cqe_status) {
 	case T4_ERR_STAG:
 		lt = is_send_inv ? (LAYER_RDMAP|RDMAP_REMOTE_OP) :
 				   (LAYER_RDMAP|RDMAP_REMOTE_PROT);
 		ec = is_send_inv ? RDMAP_CANT_INV_STAG : RDMAP_INV_STAG;
 		break;
 	case T4_ERR_PDID:
 		lt = LAYER_RDMAP|RDMAP_REMOTE_PROT;
 		ec = is_send_inv ? RDMAP_CANT_INV_STAG : RDMAP_STAG_NOT_ASSOC;
 		break;
 	case T4_ERR_QPID:
 		lt = LAYER_RDMAP|RDMAP_REMOTE_PROT;
 		ec = RDMAP_STAG_NOT_ASSOC;
 		break;
 	case T4_ERR_ACCESS:
 		lt = LAYER_RDMAP|RDMAP_REMOTE_PROT;
 		ec = RDMAP_ACC_VIOL;
 		break;
 	case T4_ERR_WRAP:
 		lt = LAYER_RDMAP|RDMAP_REMOTE_PROT;
 		ec = RDMAP_TO_WRAP;
 		break;
 	case T4_ERR_BOUND:
 		if (is_tagged) {
 			lt = LAYER_DDP|DDP_TAGGED_ERR;
 			ec = DDPT_BASE_BOUNDS;
 		} else {
 			lt = LAYER_RDMAP|RDMAP_REMOTE_PROT;
 			ec = RDMAP_BASE_BOUNDS;
 		}
 		break;
 	case T4_ERR_INVALIDATE_SHARED_MR:
 	case T4_ERR_INVALIDATE_MR_WITH_MW_BOUND:
 		lt = LAYER_RDMAP|RDMAP_REMOTE_OP;
 		ec = RDMAP_CANT_INV_STAG;
 		break;
 	case T4_ERR_ECC:
 	case T4_ERR_ECC_PSTAG:
 	case T4_ERR_INTERNAL_ERR:
 		lt = LAYER_RDMAP|RDMAP_LOCAL_CATA;
 		ec = 0;
 		break;
 	case T4_ERR_OUT_OF_RQE:
 		lt = LAYER_DDP|DDP_UNTAGGED_ERR;
 		ec = DDPU_INV_MSN_NOBUF;
 		break;
 	case T4_ERR_PBL_ADDR_BOUND:
 		lt = LAYER_DDP|DDP_TAGGED_ERR;
 		ec = DDPT_BASE_BOUNDS;
 		break;
 	case T4_ERR_CRC:
 		lt = LAYER_MPA|DDP_LLP;
 		ec = MPA_CRC_ERR;
 		break;
 	case T4_ERR_MARKER:
 		lt = LAYER_MPA|DDP_LLP;
 		ec = MPA_MARKER_ERR;
 		break;
 	case T4_ERR_PDU_LEN_ERR:
 		lt = LAYER_DDP|DDP_UNTAGGED_ERR;
 		ec = DDPU_MSG_TOOBIG;
 		break;
 	case T4_ERR_DDP_VERSION:
 		if (is_tagged) {
 			lt = LAYER_DDP|DDP_TAGGED_ERR;
 			ec = DDPT_INV_VERS;
 		} else {
 			lt = LAYER_DDP|DDP_UNTAGGED_ERR;
 			ec = DDPU_INV_VERS;
 		}
 		break;
 	case T4_ERR_RDMA_VERSION:
 		lt = LAYER_RDMAP|RDMAP_REMOTE_OP;
 		ec = RDMAP_INV_VERS;
 		break;
 	case T4_ERR_OPCODE:
 		lt = LAYER_RDMAP|RDMAP_REMOTE_OP;
 		ec = RDMAP_INV_OPCODE;
 		break;
 	case T4_ERR_DDP_QUEUE_NUM:
 		lt = LAYER_DDP|DDP_UNTAGGED_ERR;
 		ec = DDPU_INV_QN;
 		break;
 	case T4_ERR_MSN:
 	case T4_ERR_MSN_GAP:
 	case T4_ERR_MSN_RANGE:
 	case T4_ERR_IRD_OVERFLOW:
 		lt = LAYER_DDP|DDP_UNTAGGED_ERR;
 		ec = DDPU_INV_MSN_RANGE;
 		break;
 	case T4_ERR_TBIT:
 		lt = LAYER_DDP|DDP_LOCAL_CATA;
 		ec = 0;
 		break;
 	case T4_ERR_MO:
 		lt = LAYER_DDP|DDP_UNTAGGED_ERR;
 		ec = DDPU_INV_MO;
 		break;
 	default:
 		/* Keep the initial LAYER_RDMAP|DDP_LOCAL_CATA / 0 pair. */
 		break;
 	}
 
 done:
 	*layer_type = lt;
 	*ecode = ec;
 }
 
 /*
  * Send a TERMINATE message for @qhp on its connection's offload TX queue.
  *
  * The layer/error-type and error code come from the QP attributes when
  * they describe an MPA-layer error, and from build_term_codes(@err_cqe)
  * otherwise.  The request is silently dropped when no work request can be
  * allocated or when creds() refuses it.  @gfp is not used here.
  */
 static void post_terminate(struct c4iw_qp *qhp, struct t4_cqe *err_cqe,
 			   gfp_t gfp)
 {
 	struct inpcb *inp = sotoinpcb(qhp->ep->com.so);
 	struct toepcb *toep = intotcpcb(inp)->t_toe;
 	struct terminate_message *term;
 	struct fw_ri_wr *wqe;
 	struct wrqe *wr;
 
 	CTR4(KTR_IW_CXGBE, "%s qhp %p qid 0x%x tid %u", __func__, qhp,
 	    qhp->wq.sq.qid, qhp->ep->hwtid);
 
 	wr = alloc_wrqe(sizeof(*wqe), &toep->ofld_txq->wrq);
 	if (wr == NULL)
 		return;
 
 	wqe = wrtod(wr);
 	memset(wqe, 0, sizeof *wqe);
 
 	/* Work request header. */
 	wqe->op_compl = cpu_to_be32(V_FW_WR_OP(FW_RI_WR));
 	wqe->flowid_len16 = cpu_to_be32(
 		V_FW_WR_FLOWID(qhp->ep->hwtid) |
 		V_FW_WR_LEN16(DIV_ROUND_UP(sizeof *wqe, 16)));
 
 	/* TERMINATE payload. */
 	wqe->u.terminate.type = FW_RI_TYPE_TERMINATE;
 	wqe->u.terminate.immdlen = cpu_to_be32(sizeof *term);
 	term = (struct terminate_message *)wqe->u.terminate.termmsg;
 	if (qhp->attr.layer_etype != (LAYER_MPA|DDP_LLP)) {
 		build_term_codes(err_cqe, &term->layer_etype, &term->ecode);
 	} else {
 		term->layer_etype = qhp->attr.layer_etype;
 		term->ecode = qhp->attr.ecode;
 	}
 
 	if (creds(toep, inp, sizeof(*wqe)) != 0) {
 		free_wrqe(wr);
 		return;
 	}
 	t4_wrq_tx(qhp->rhp->rdev.adap, wr);
 }
 
 /*
  * Flush the receive and send queues of @qhp towards their completion
  * queues (@rchp for receives, @schp for sends) and notify the consumers.
  *
  * This function acquires the CQ lock and then qhp->lock by itself, so the
  * caller must NOT already hold qhp->lock.  In this file it is reached via
  * flush_qp() from c4iw_modify_qp(), which holds qhp->mutex at that point.
  *
  * The flush happens at most once per QP: qhp->wq.flushed is tested and
  * set under the locks, and a second call returns without doing anything.
  */
 static void __flush_qp(struct c4iw_qp *qhp, struct c4iw_cq *rchp,
 		       struct c4iw_cq *schp)
 {
 	int count;
 	int rq_flushed, sq_flushed;
 	unsigned long flag;
 
 	CTR4(KTR_IW_CXGBE, "%s qhp %p rchp %p schp %p", __func__, qhp, rchp,
 	    schp);
 
 	/* locking hierarchy: cq lock first, then qp lock. */
 	spin_lock_irqsave(&rchp->lock, flag);
 	spin_lock(&qhp->lock);
 
 	/* Already flushed by an earlier call: nothing left to do. */
 	if (qhp->wq.flushed) {
 		spin_unlock(&qhp->lock);
 		spin_unlock_irqrestore(&rchp->lock, flag);
 		return;
 	}
 	qhp->wq.flushed = 1;
 
 	/* Receive side, done under the receive CQ lock. */
 	c4iw_flush_hw_cq(rchp);
 	c4iw_count_rcqes(&rchp->cq, &qhp->wq, &count);
 	rq_flushed = c4iw_flush_rq(&qhp->wq, &rchp->cq, count);
 	spin_unlock(&qhp->lock);
 	spin_unlock_irqrestore(&rchp->lock, flag);
 
 	/* locking hierarchy: cq lock first, then qp lock. */
 	spin_lock_irqsave(&schp->lock, flag);
 	spin_lock(&qhp->lock);
 	/* A shared CQ was already handled by the receive-side call above. */
 	if (schp != rchp)
 		c4iw_flush_hw_cq(schp);
 	sq_flushed = c4iw_flush_sq(qhp);
 	spin_unlock(&qhp->lock);
 	spin_unlock_irqrestore(&schp->lock, flag);
 
 	/*
 	 * Call the completion handler(s) only when t4_clear_cq_armed()
 	 * returns non-zero for the CQ and the matching flush result is
 	 * non-zero.  With a shared CQ one call covers both directions.
 	 */
 	if (schp == rchp) {
 		if (t4_clear_cq_armed(&rchp->cq) &&
 		    (rq_flushed || sq_flushed)) {
 			spin_lock_irqsave(&rchp->comp_handler_lock, flag);
 			(*rchp->ibcq.comp_handler)(&rchp->ibcq,
 						   rchp->ibcq.cq_context);
 			spin_unlock_irqrestore(&rchp->comp_handler_lock, flag);
 		}
 	} else {
 		if (t4_clear_cq_armed(&rchp->cq) && rq_flushed) {
 			spin_lock_irqsave(&rchp->comp_handler_lock, flag);
 			(*rchp->ibcq.comp_handler)(&rchp->ibcq,
 						   rchp->ibcq.cq_context);
 			spin_unlock_irqrestore(&rchp->comp_handler_lock, flag);
 		}
 		if (t4_clear_cq_armed(&schp->cq) && sq_flushed) {
 			spin_lock_irqsave(&schp->comp_handler_lock, flag);
 			(*schp->ibcq.comp_handler)(&schp->ibcq,
 						   schp->ibcq.cq_context);
 			spin_unlock_irqrestore(&schp->comp_handler_lock, flag);
 		}
 	}
 }
 
 /*
  * Put the work queue of @qhp into error and flush it.
  *
  * QPs without a user object are flushed by __flush_qp().  For QPs that
  * have one (qhp->ibqp.uobject set), the CQs are only marked in error and
  * their completion handlers are called; no flushing is done here.
  */
 static void flush_qp(struct c4iw_qp *qhp)
 {
 	struct c4iw_cq *recv_cq = to_c4iw_cq(qhp->ibqp.recv_cq);
 	struct c4iw_cq *send_cq = to_c4iw_cq(qhp->ibqp.send_cq);
 	unsigned long irqflags;
 
 	t4_set_wq_in_error(&qhp->wq);
 
 	if (!qhp->ibqp.uobject) {
 		__flush_qp(qhp, recv_cq, send_cq);
 		return;
 	}
 
 	t4_set_cq_in_error(&recv_cq->cq);
 	spin_lock_irqsave(&recv_cq->comp_handler_lock, irqflags);
 	(*recv_cq->ibcq.comp_handler)(&recv_cq->ibcq, recv_cq->ibcq.cq_context);
 	spin_unlock_irqrestore(&recv_cq->comp_handler_lock, irqflags);
 
 	/* A CQ shared by both directions has been handled already. */
 	if (send_cq == recv_cq)
 		return;
 
 	t4_set_cq_in_error(&send_cq->cq);
 	spin_lock_irqsave(&send_cq->comp_handler_lock, irqflags);
 	(*send_cq->ibcq.comp_handler)(&send_cq->ibcq,
 			send_cq->ibcq.cq_context);
 	spin_unlock_irqrestore(&send_cq->comp_handler_lock, irqflags);
 }
 
 static int
 rdma_fini(struct c4iw_dev *rhp, struct c4iw_qp *qhp, struct c4iw_ep *ep)
 {
 	struct c4iw_rdev *rdev = &rhp->rdev;
 	struct adapter *sc = rdev->adap;
 	struct fw_ri_wr *wqe;
 	int ret;
 	struct wrqe *wr;
 	struct socket *so = ep->com.so;
         struct inpcb *inp = sotoinpcb(so);
         struct tcpcb *tp = intotcpcb(inp);
         struct toepcb *toep = tp->t_toe;
 
 	KASSERT(rhp == qhp->rhp && ep == qhp->ep, ("%s: EDOOFUS", __func__));
 
 	CTR5(KTR_IW_CXGBE, "%s qhp %p qid 0x%x ep %p tid %u", __func__, qhp,
 	    qhp->wq.sq.qid, ep, ep->hwtid);
 
 	wr = alloc_wrqe(sizeof(*wqe), &toep->ofld_txq->wrq);
 	if (wr == NULL)
 		return (0);
 	wqe = wrtod(wr);
 
 	memset(wqe, 0, sizeof *wqe);
 
 	wqe->op_compl = cpu_to_be32(V_FW_WR_OP(FW_RI_WR) | F_FW_WR_COMPL);
 	wqe->flowid_len16 = cpu_to_be32(V_FW_WR_FLOWID(ep->hwtid) |
 	    V_FW_WR_LEN16(DIV_ROUND_UP(sizeof *wqe, 16)));
 	wqe->cookie = (unsigned long) &ep->com.wr_wait;
 	wqe->u.fini.type = FW_RI_TYPE_FINI;
 
 	c4iw_init_wr_wait(&ep->com.wr_wait);
 
 	ret = creds(toep, inp, sizeof(*wqe));
 	if (ret) {
 		free_wrqe(wr);
 		return ret;
 	}
 	t4_wrq_tx(sc, wr);
 
 	ret = c4iw_wait_for_reply(rdev, &ep->com.wr_wait, ep->hwtid,
 			qhp->wq.sq.qid, ep->com.so, __func__);
 	return ret;
 }
 
 /*
  * Fill init->u with the peer-to-peer "ready to receive" message selected
  * by @p2p_type: an RDMA WRITE or an RDMA READ request.  The union is
  * zeroed first and stays zeroed for any other @p2p_type.
  */
 static void build_rtr_msg(u8 p2p_type, struct fw_ri_init *init)
 {
 	CTR2(KTR_IW_CXGBE, "%s p2p_type = %d", __func__, p2p_type);
 
 	memset(&init->u, 0, sizeof init->u);
 
 	if (p2p_type == FW_RI_INIT_P2PTYPE_RDMA_WRITE) {
 		init->u.write.opcode = FW_RI_RDMA_WRITE_WR;
 		init->u.write.stag_sink = cpu_to_be32(1);
 		init->u.write.to_sink = cpu_to_be64(1);
 		init->u.write.u.immd_src[0].op = FW_RI_DATA_IMMD;
 		init->u.write.len16 = DIV_ROUND_UP(sizeof init->u.write +
 						   sizeof(struct fw_ri_immd),
 						   16);
 	} else if (p2p_type == FW_RI_INIT_P2PTYPE_READ_REQ) {
 		/* The opcode is stored through the write view of the union. */
 		init->u.write.opcode = FW_RI_RDMA_READ_WR;
 		init->u.read.stag_src = cpu_to_be32(1);
 		init->u.read.to_src_lo = cpu_to_be32(1);
 		init->u.read.stag_sink = cpu_to_be32(1);
 		init->u.read.to_sink_lo = cpu_to_be32(1);
 		init->u.read.len16 = DIV_ROUND_UP(sizeof init->u.read, 16);
 	}
 }
 
 /*
  * Charge the TX credits for a work request of @wrsize bytes against @toep
  * and record them in the next TX descriptor slot.  The callers in this
  * file invoke it right before handing the work request to t4_wrq_tx().
  *
  * The accounting is done with the inpcb write lock held.  Returns 0 on
  * success, or EINVAL (a positive value) without touching the credit state
  * when the inpcb flags check below fails.
  */
 static int
 creds(struct toepcb *toep, struct inpcb *inp, size_t wrsize)
 {
 	struct ofld_tx_sdesc *txsd;
 
 	CTR3(KTR_IW_CXGBE, "%s:creB  %p %u", __func__, toep , wrsize);
 	INP_WLOCK(inp);
-	if ((inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) != 0) {
+	if ((inp->inp_flags & INP_DROPPED) != 0) {
 		INP_WUNLOCK(inp);
 		return (EINVAL);
 	}
 	txsd = &toep->txsd[toep->txsd_pidx];
 	/* Credits are counted in 16-byte units. */
 	txsd->tx_credits = howmany(wrsize, 16);
 	txsd->plen = 0;
 	KASSERT(toep->tx_credits >= txsd->tx_credits && toep->txsd_avail > 0,
 			("%s: not enough credits (%d)", __func__, toep->tx_credits));
 	toep->tx_credits -= txsd->tx_credits;
 	/* Advance the descriptor producer index, wrapping at txsd_total. */
 	if (__predict_false(++toep->txsd_pidx == toep->txsd_total))
 		toep->txsd_pidx = 0;
 	toep->txsd_avail--;
 	INP_WUNLOCK(inp);
 	CTR5(KTR_IW_CXGBE, "%s:creE  %p %u %u %u", __func__, toep ,
 	    txsd->tx_credits, toep->tx_credits, toep->txsd_pidx);
 	return (0);
 }
 
 static int rdma_init(struct c4iw_dev *rhp, struct c4iw_qp *qhp)
 {
 	struct fw_ri_wr *wqe;
 	int ret;
 	struct wrqe *wr;
 	struct c4iw_ep *ep = qhp->ep;
 	struct c4iw_rdev *rdev = &qhp->rhp->rdev;
 	struct adapter *sc = rdev->adap;
 	struct socket *so = ep->com.so;
         struct inpcb *inp = sotoinpcb(so);
         struct tcpcb *tp = intotcpcb(inp);
         struct toepcb *toep = tp->t_toe;
 
 	CTR5(KTR_IW_CXGBE, "%s qhp %p qid 0x%x ep %p tid %u", __func__, qhp,
 	    qhp->wq.sq.qid, ep, ep->hwtid);
 
 	wr = alloc_wrqe(sizeof(*wqe), &toep->ofld_txq->wrq);
 	if (wr == NULL)
 		return (0);
 	wqe = wrtod(wr);
 	ret = alloc_ird(rhp, qhp->attr.max_ird);
 	if (ret) {
 		qhp->attr.max_ird = 0;
 		free_wrqe(wr);
 		return ret;
 	}
 
 	memset(wqe, 0, sizeof *wqe);
 
 	wqe->op_compl = cpu_to_be32(
 		V_FW_WR_OP(FW_RI_WR) |
 		F_FW_WR_COMPL);
 	wqe->flowid_len16 = cpu_to_be32(V_FW_WR_FLOWID(ep->hwtid) |
 	    V_FW_WR_LEN16(DIV_ROUND_UP(sizeof *wqe, 16)));
 
 	wqe->cookie = (unsigned long) &ep->com.wr_wait;
 
 	wqe->u.init.type = FW_RI_TYPE_INIT;
 	wqe->u.init.mpareqbit_p2ptype =
 		V_FW_RI_WR_MPAREQBIT(qhp->attr.mpa_attr.initiator) |
 		V_FW_RI_WR_P2PTYPE(qhp->attr.mpa_attr.p2p_type);
 	wqe->u.init.mpa_attrs = FW_RI_MPA_IETF_ENABLE;
 	if (qhp->attr.mpa_attr.recv_marker_enabled)
 		wqe->u.init.mpa_attrs |= FW_RI_MPA_RX_MARKER_ENABLE;
 	if (qhp->attr.mpa_attr.xmit_marker_enabled)
 		wqe->u.init.mpa_attrs |= FW_RI_MPA_TX_MARKER_ENABLE;
 	if (qhp->attr.mpa_attr.crc_enabled)
 		wqe->u.init.mpa_attrs |= FW_RI_MPA_CRC_ENABLE;
 
 	wqe->u.init.qp_caps = FW_RI_QP_RDMA_READ_ENABLE |
 			    FW_RI_QP_RDMA_WRITE_ENABLE |
 			    FW_RI_QP_BIND_ENABLE;
 	if (!qhp->ibqp.uobject)
 		wqe->u.init.qp_caps |= FW_RI_QP_FAST_REGISTER_ENABLE |
 				     FW_RI_QP_STAG0_ENABLE;
 	wqe->u.init.nrqe = cpu_to_be16(t4_rqes_posted(&qhp->wq));
 	wqe->u.init.pdid = cpu_to_be32(qhp->attr.pd);
 	wqe->u.init.qpid = cpu_to_be32(qhp->wq.sq.qid);
 	wqe->u.init.sq_eqid = cpu_to_be32(qhp->wq.sq.qid);
 	wqe->u.init.rq_eqid = cpu_to_be32(qhp->wq.rq.qid);
 	wqe->u.init.scqid = cpu_to_be32(qhp->attr.scq);
 	wqe->u.init.rcqid = cpu_to_be32(qhp->attr.rcq);
 	wqe->u.init.ord_max = cpu_to_be32(qhp->attr.max_ord);
 	wqe->u.init.ird_max = cpu_to_be32(qhp->attr.max_ird);
 	wqe->u.init.iss = cpu_to_be32(ep->snd_seq);
 	wqe->u.init.irs = cpu_to_be32(ep->rcv_seq);
 	wqe->u.init.hwrqsize = cpu_to_be32(qhp->wq.rq.rqt_size);
 	wqe->u.init.hwrqaddr = cpu_to_be32(qhp->wq.rq.rqt_hwaddr -
 	    sc->vres.rq.start);
 	if (qhp->attr.mpa_attr.initiator)
 		build_rtr_msg(qhp->attr.mpa_attr.p2p_type, &wqe->u.init);
 
 	c4iw_init_wr_wait(&ep->com.wr_wait);
 
 	ret = creds(toep, inp, sizeof(*wqe));
 	if (ret) {
 		free_wrqe(wr);
 		free_ird(rhp, qhp->attr.max_ird);
 		return ret;
 	}
 	t4_wrq_tx(sc, wr);
 
 	ret = c4iw_wait_for_reply(rdev, &ep->com.wr_wait, ep->hwtid,
 			qhp->wq.sq.qid, ep->com.so, __func__);
 
 	toep->params.ulp_mode = ULP_MODE_RDMA;
 	free_ird(rhp, qhp->attr.max_ird);
 
 	return ret;
 }
 
 /*
  * Change the attributes and/or the state of @qhp.
  *
  * @mask selects what is applied from @attrs:
  *  - any bit of C4IW_QP_ATTR_VALID_MODIFY updates the corresponding QP
  *    attribute; this is only allowed while the QP is IDLE (-EIO
  *    otherwise) and ORD/IRD values are range checked (-EINVAL);
  *  - C4IW_QP_ATTR_NEXT_STATE requests a transition to attrs->next_state.
  *
  * @internal selects between the two flavours of transitions out of RTS:
  * when it is zero the transition also initiates a disconnect on the
  * endpoint after the mutex is dropped; it also picks GFP_ATOMIC vs
  * GFP_KERNEL for the calls made at the end of the function.
  *
  * The state machine runs under qhp->mutex.  The 'err' label is the common
  * path that detaches the endpoint from the QP, forces the QP into ERROR
  * and flushes it; TERMINATE posting, disconnect and the endpoint
  * dereferences happen after the mutex has been released.
  *
  * Returns 0 on success, -EINPROGRESS when a TERMINATE is requested for a
  * QP whose state is already >= C4IW_QP_STATE_ERROR, -EINVAL/-EIO for
  * invalid requests, or the error from rdma_init()/rdma_fini().
  */
 int c4iw_modify_qp(struct c4iw_dev *rhp, struct c4iw_qp *qhp,
 		   enum c4iw_qp_attr_mask mask,
 		   struct c4iw_qp_attributes *attrs,
 		   int internal)
 {
 	int ret = 0;
 	struct c4iw_qp_attributes newattr = qhp->attr;
 	int disconnect = 0;
 	int terminate = 0;
 	int abort = 0;
 	int free = 0;
 	struct c4iw_ep *ep = NULL;
 
 	CTR5(KTR_IW_CXGBE, "%s qhp %p sqid 0x%x rqid 0x%x ep %p", __func__, qhp,
 	    qhp->wq.sq.qid, qhp->wq.rq.qid, qhp->ep);
 	CTR3(KTR_IW_CXGBE, "%s state %d -> %d", __func__, qhp->attr.state,
 	    (mask & C4IW_QP_ATTR_NEXT_STATE) ? attrs->next_state : -1);
 
 	mutex_lock(&qhp->mutex);
 
 	/* Process attr changes if in IDLE */
 	if (mask & C4IW_QP_ATTR_VALID_MODIFY) {
 		if (qhp->attr.state != C4IW_QP_STATE_IDLE) {
 			ret = -EIO;
 			goto out;
 		}
 		if (mask & C4IW_QP_ATTR_ENABLE_RDMA_READ)
 			newattr.enable_rdma_read = attrs->enable_rdma_read;
 		if (mask & C4IW_QP_ATTR_ENABLE_RDMA_WRITE)
 			newattr.enable_rdma_write = attrs->enable_rdma_write;
 		if (mask & C4IW_QP_ATTR_ENABLE_RDMA_BIND)
 			newattr.enable_bind = attrs->enable_bind;
 		if (mask & C4IW_QP_ATTR_MAX_ORD) {
 			if (attrs->max_ord > c4iw_max_read_depth) {
 				ret = -EINVAL;
 				goto out;
 			}
 			newattr.max_ord = attrs->max_ord;
 		}
 		if (mask & C4IW_QP_ATTR_MAX_IRD) {
 			if (attrs->max_ird > cur_max_read_depth(rhp)) {
 				ret = -EINVAL;
 				goto out;
 			}
 			newattr.max_ird = attrs->max_ird;
 		}
 		/* All requested values validated: commit them in one go. */
 		qhp->attr = newattr;
 	}
 
 	/* No state change requested, or already in the requested state. */
 	if (!(mask & C4IW_QP_ATTR_NEXT_STATE))
 		goto out;
 	if (qhp->attr.state == attrs->next_state)
 		goto out;
 
 	/* Return EINPROGRESS if QP is already in transition state.
 	 * Eg: CLOSING->IDLE transition or *->ERROR transition.
 	 * This can happen while connection is switching(due to rdma_fini)
 	 * from iWARP/RDDP to TOE mode and any inflight RDMA RX data will
 	 * reach TOE driver -> TCP stack -> iWARP driver. In this way
 	 * iWARP driver keep receiving inflight RDMA RX data until socket
 	 * is closed or aborted. And if iWARP CM is in FPDU state, then
 	 * it tries to put QP in TERM state and disconnects endpoint.
 	 * But as QP is already in transition state, this event is ignored.
 	 */
 	if ((qhp->attr.state >= C4IW_QP_STATE_ERROR) &&
 		(attrs->next_state == C4IW_QP_STATE_TERMINATE)) {
 		ret = -EINPROGRESS;
 		goto out;
 	}
 
 	switch (qhp->attr.state) {
 	case C4IW_QP_STATE_IDLE:
 		switch (attrs->next_state) {
 		case C4IW_QP_STATE_RTS:
 			/* Both the stream handle and MPA attrs are required. */
 			if (!(mask & C4IW_QP_ATTR_LLP_STREAM_HANDLE)) {
 				ret = -EINVAL;
 				goto out;
 			}
 			if (!(mask & C4IW_QP_ATTR_MPA_ATTR)) {
 				ret = -EINVAL;
 				goto out;
 			}
 			qhp->attr.mpa_attr = attrs->mpa_attr;
 			qhp->attr.llp_stream_handle = attrs->llp_stream_handle;
 			qhp->ep = qhp->attr.llp_stream_handle;
 			set_state(qhp, C4IW_QP_STATE_RTS);
 
 			/*
 			 * Ref the endpoint here and deref when we
 			 * disassociate the endpoint from the QP.  This
 			 * happens in CLOSING->IDLE transition or *->ERROR
 			 * transition.
 			 */
 			c4iw_get_ep(&qhp->ep->com);
 			ret = rdma_init(rhp, qhp);
 			if (ret)
 				goto err;
 			break;
 		case C4IW_QP_STATE_ERROR:
 			set_state(qhp, C4IW_QP_STATE_ERROR);
 			flush_qp(qhp);
 			break;
 		default:
 			ret = -EINVAL;
 			goto out;
 		}
 		break;
 	case C4IW_QP_STATE_RTS:
 		switch (attrs->next_state) {
 		case C4IW_QP_STATE_CLOSING:
 			BUG_ON(atomic_read(&qhp->ep->com.kref.refcount) < 2);
 			t4_set_wq_in_error(&qhp->wq);
 			set_state(qhp, C4IW_QP_STATE_CLOSING);
 			ep = qhp->ep;
 			if (!internal) {
 				abort = 0;
 				disconnect = 1;
 				/* Extra ref, dropped after the disconnect below. */
 				c4iw_get_ep(&qhp->ep->com);
 			}
 			ret = rdma_fini(rhp, qhp, ep);
 			if (ret)
 				goto err;
 			break;
 		case C4IW_QP_STATE_TERMINATE:
 			t4_set_wq_in_error(&qhp->wq);
 			set_state(qhp, C4IW_QP_STATE_TERMINATE);
 			/* Remembered for post_terminate(). */
 			qhp->attr.layer_etype = attrs->layer_etype;
 			qhp->attr.ecode = attrs->ecode;
 			ep = qhp->ep;
 			if (!internal) {
 				/* Extra ref, dropped after the disconnect below. */
 				c4iw_get_ep(&qhp->ep->com);
 				terminate = 1;
 				disconnect = 1;
 			} else {
 				terminate = qhp->attr.send_term;
 				ret = rdma_fini(rhp, qhp, ep);
 				if (ret)
 					goto err;
 			}
 			break;
 		case C4IW_QP_STATE_ERROR:
 			t4_set_wq_in_error(&qhp->wq);
 			set_state(qhp, C4IW_QP_STATE_ERROR);
 			if (!internal) {
 				abort = 1;
 				disconnect = 1;
 				ep = qhp->ep;
 				/* Extra ref, dropped after the disconnect below. */
 				c4iw_get_ep(&qhp->ep->com);
 			}
 			/* RTS->ERROR always detaches the endpoint via 'err'. */
 			goto err;
 			break;
 		default:
 			ret = -EINVAL;
 			goto out;
 		}
 		break;
 	case C4IW_QP_STATE_CLOSING:
 
 		/*
 		 * Allow kernel users to move to ERROR for qp draining.
 		 */
 		if (!internal && (qhp->ibqp.uobject || attrs->next_state !=
 				  C4IW_QP_STATE_ERROR)) {
 			ret = -EINVAL;
 			goto out;
 		}
 		switch (attrs->next_state) {
 		case C4IW_QP_STATE_IDLE:
 			flush_qp(qhp);
 			set_state(qhp, C4IW_QP_STATE_IDLE);
 			/* Detach the endpoint and drop the ref taken at RTS. */
 			qhp->attr.llp_stream_handle = NULL;
 			c4iw_put_ep(&qhp->ep->com);
 			qhp->ep = NULL;
 			/* c4iw_destroy_qp() waits on qhp->wait for !qhp->ep. */
 			wake_up(&qhp->wait);
 			break;
 		case C4IW_QP_STATE_ERROR:
 			goto err;
 		default:
 			ret = -EINVAL;
 			goto err;
 		}
 		break;
 	case C4IW_QP_STATE_ERROR:
 		/* From ERROR only IDLE is reachable, and only once drained. */
 		if (attrs->next_state != C4IW_QP_STATE_IDLE) {
 			ret = -EINVAL;
 			goto out;
 		}
 		if (!t4_sq_empty(&qhp->wq) || !t4_rq_empty(&qhp->wq)) {
 			ret = -EINVAL;
 			goto out;
 		}
 		set_state(qhp, C4IW_QP_STATE_IDLE);
 		break;
 	case C4IW_QP_STATE_TERMINATE:
 		/* Only internal callers may move a QP out of TERMINATE. */
 		if (!internal) {
 			ret = -EINVAL;
 			goto out;
 		}
 		goto err;
 		break;
 	default:
 		printf("%s in a bad state %d\n",
 		       __func__, qhp->attr.state);
 		ret = -EINVAL;
 		goto err;
 		break;
 	}
 	goto out;
 err:
 	CTR3(KTR_IW_CXGBE, "%s disassociating ep %p qpid 0x%x", __func__,
 	    qhp->ep, qhp->wq.sq.qid);
 
 	/* disassociate the LLP connection */
 	qhp->attr.llp_stream_handle = NULL;
 	/* Keep a local pointer: qhp->ep is cleared but still needed below. */
 	if (!ep)
 		ep = qhp->ep;
 	qhp->ep = NULL;
 	set_state(qhp, C4IW_QP_STATE_ERROR);
 	free = 1;
 	abort = 1;
 	BUG_ON(!ep);
 	flush_qp(qhp);
 	/* c4iw_destroy_qp() waits on qhp->wait for !qhp->ep. */
 	wake_up(&qhp->wait);
 out:
 	mutex_unlock(&qhp->mutex);
 
 	if (terminate)
 		post_terminate(qhp, NULL, internal ? GFP_ATOMIC : GFP_KERNEL);
 
 	/*
 	 * If disconnect is 1, then we need to initiate a disconnect
 	 * on the EP.  This can be a normal close (RTS->CLOSING) or
 	 * an abnormal close (RTS/CLOSING->ERROR).
 	 */
 	if (disconnect) {
 		__c4iw_ep_disconnect(ep, abort, internal ? GFP_ATOMIC :
 							 GFP_KERNEL);
 		c4iw_put_ep(&ep->com);
 	}
 
 	/*
 	 * If free is 1, then we've disassociated the EP from the QP
 	 * and we need to dereference the EP.
 	 */
 	if (free)
 		c4iw_put_ep(&ep->com);
 	CTR2(KTR_IW_CXGBE, "%s exit state %d", __func__, qhp->attr.state);
 	return ret;
 }
 
 /*
  * Destroy a queue pair.
  *
  * The QP is moved to the ERROR state (as an "internal" transition when it is
  * currently in TERMINATE, otherwise as an external one), then the caller
  * sleeps until the endpoint has been detached (qhp->ep == NULL).  Afterwards
  * the QP id is removed from the device's qpidr table, the IRD resources are
  * returned and the reference passed to c4iw_qp_rem_ref() is dropped.
  *
  * udata is unused.  Always returns 0.
  */
 int c4iw_destroy_qp(struct ib_qp *ib_qp, struct ib_udata *udata)
 {
 	struct c4iw_dev *rhp;
 	struct c4iw_qp *qhp;
 	struct c4iw_qp_attributes attrs;
 	unsigned int qid;
 
 	CTR2(KTR_IW_CXGBE, "%s ib_qp %p", __func__, ib_qp);
 	qhp = to_c4iw_qp(ib_qp);
 	rhp = qhp->rhp;
 
 	/*
 	 * Remember the qid now: it is needed for the trace record below,
 	 * which is emitted after our reference on the QP has been dropped.
 	 */
 	qid = qhp->wq.sq.qid;
 
 	attrs.next_state = C4IW_QP_STATE_ERROR;
 	if (qhp->attr.state == C4IW_QP_STATE_TERMINATE)
 		c4iw_modify_qp(rhp, qhp, C4IW_QP_ATTR_NEXT_STATE, &attrs, 1);
 	else
 		c4iw_modify_qp(rhp, qhp, C4IW_QP_ATTR_NEXT_STATE, &attrs, 0);
 	wait_event(qhp->wait, !qhp->ep);
 
 	remove_handle(rhp, &rhp->qpidr, qid);
 
 	free_ird(rhp, qhp->attr.max_ird);
 	c4iw_qp_rem_ref(ib_qp);
 
 	/*
 	 * Do not touch qhp past this point.  NOTE(review): presumably the
 	 * reference drop above can lead to the QP being freed (it is
 	 * kref-counted and has a free_work item); confirm against
 	 * c4iw_qp_rem_ref().
 	 */
 	CTR3(KTR_IW_CXGBE, "%s ib_qp %p qpid 0x%0x", __func__, ib_qp, qid);
 	return 0;
 }
 
 /*
  * Create a queue pair on protection domain pd.
  *
  * Only IB_QPT_RC is accepted.  The requested send/receive depths are
  * validated against the hardware limits, bumped by one entry and clamped to
  * a minimum of 8.  On success the capabilities actually granted are written
  * back into attrs->cap and a pointer to the embedded ib_qp is returned.
  *
  * When udata is supplied, four mmap keys (SQ, RQ, SQ doorbell, RQ doorbell)
  * are reserved from the user context, reported to userspace through a
  * c4iw_create_qp_resp and registered with insert_mmap().
  *
  * Returns an ERR_PTR() on failure: -EINVAL for unsupported/invalid
  * arguments, -E2BIG when a requested depth exceeds the hardware maximum,
  * -ENOMEM on allocation failure, or the error reported by create_qp(),
  * insert_handle() or ib_copy_to_udata().
  */
 struct ib_qp *
 c4iw_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *attrs,
     struct ib_udata *udata)
 {
 	struct c4iw_dev *rhp;
 	struct c4iw_qp *qhp;
 	struct c4iw_pd *php;
 	struct c4iw_cq *schp;
 	struct c4iw_cq *rchp;
 	struct c4iw_create_qp_resp uresp;
 	unsigned int sqsize, rqsize;
 	struct c4iw_ucontext *ucontext;
 	int ret;
 	struct c4iw_mm_entry *sq_key_mm = NULL, *rq_key_mm = NULL;
 	struct c4iw_mm_entry *sq_db_key_mm = NULL, *rq_db_key_mm = NULL;
 
 	CTR2(KTR_IW_CXGBE, "%s ib_pd %p", __func__, pd);
 
 	if (attrs->qp_type != IB_QPT_RC)
 		return ERR_PTR(-EINVAL);
 
 	php = to_c4iw_pd(pd);
 	rhp = php->rhp;
 	/* Both completion queues must be known to this device. */
 	schp = get_chp(rhp, ((struct c4iw_cq *)attrs->send_cq)->cq.cqid);
 	rchp = get_chp(rhp, ((struct c4iw_cq *)attrs->recv_cq)->cq.cqid);
 	if (!schp || !rchp)
 		return ERR_PTR(-EINVAL);
 
 	if (attrs->cap.max_inline_data > T4_MAX_SEND_INLINE)
 		return ERR_PTR(-EINVAL);
 
 	/* Receive queue depth: one more than requested, never below 8. */
 	if (attrs->cap.max_recv_wr > rhp->rdev.hw_queue.t4_max_rq_size)
 		return ERR_PTR(-E2BIG);
 	rqsize = attrs->cap.max_recv_wr + 1;
 	if (rqsize < 8)
 		rqsize = 8;
 
 	/* Send queue depth: same rule as the receive queue. */
 	if (attrs->cap.max_send_wr > rhp->rdev.hw_queue.t4_max_sq_size)
 		return ERR_PTR(-E2BIG);
 	sqsize = attrs->cap.max_send_wr + 1;
 	if (sqsize < 8)
 		sqsize = 8;
 
 	/* A PD with a uobject belongs to a userspace context. */
 	ucontext = pd->uobject ? to_c4iw_ucontext(pd->uobject->context) : NULL;
 
 	qhp = kzalloc(sizeof(*qhp), GFP_KERNEL);
 	if (!qhp)
 		return ERR_PTR(-ENOMEM);
 	/*
 	 * Queue memory covers the entries plus the status entries; the SQ
 	 * additionally reserves 16 64-bit words.
 	 */
 	qhp->wq.sq.size = sqsize;
 	qhp->wq.sq.memsize =
 		(sqsize + rhp->rdev.hw_queue.t4_eq_status_entries) *
 		sizeof(*qhp->wq.sq.queue) + 16 * sizeof(__be64);
 	qhp->wq.sq.flush_cidx = -1;
 	qhp->wq.rq.size = rqsize;
 	qhp->wq.rq.memsize =
 		(rqsize + rhp->rdev.hw_queue.t4_eq_status_entries) *
 		sizeof(*qhp->wq.rq.queue);
 
 	/* Queues of a user context are sized in whole pages. */
 	if (ucontext) {
 		qhp->wq.sq.memsize = roundup(qhp->wq.sq.memsize, PAGE_SIZE);
 		qhp->wq.rq.memsize = roundup(qhp->wq.rq.memsize, PAGE_SIZE);
 	}
 
 	CTR5(KTR_IW_CXGBE, "%s sqsize %u sqmemsize %zu rqsize %u rqmemsize %zu",
 	    __func__, sqsize, qhp->wq.sq.memsize, rqsize, qhp->wq.rq.memsize);
 
 	ret = create_qp(&rhp->rdev, &qhp->wq, &schp->cq, &rchp->cq,
 			ucontext ? &ucontext->uctx : &rhp->rdev.uctx);
 	if (ret)
 		goto err1;
 
 	/* Tell the caller what was actually granted. */
 	attrs->cap.max_recv_wr = rqsize - 1;
 	attrs->cap.max_send_wr = sqsize - 1;
 	attrs->cap.max_inline_data = T4_MAX_SEND_INLINE;
 
 	/* Initial software state of the QP: IDLE, all RDMA ops enabled. */
 	qhp->rhp = rhp;
 	qhp->attr.pd = php->pdid;
 	qhp->attr.scq = ((struct c4iw_cq *) attrs->send_cq)->cq.cqid;
 	qhp->attr.rcq = ((struct c4iw_cq *) attrs->recv_cq)->cq.cqid;
 	qhp->attr.sq_num_entries = attrs->cap.max_send_wr;
 	qhp->attr.rq_num_entries = attrs->cap.max_recv_wr;
 	qhp->attr.sq_max_sges = attrs->cap.max_send_sge;
 	qhp->attr.sq_max_sges_rdma_write = attrs->cap.max_send_sge;
 	qhp->attr.rq_max_sges = attrs->cap.max_recv_sge;
 	qhp->attr.state = C4IW_QP_STATE_IDLE;
 	qhp->attr.next_state = C4IW_QP_STATE_IDLE;
 	qhp->attr.enable_rdma_read = 1;
 	qhp->attr.enable_rdma_write = 1;
 	qhp->attr.enable_bind = 1;
 	qhp->attr.max_ord = 0;
 	qhp->attr.max_ird = 0;
 	qhp->sq_sig_all = attrs->sq_sig_type == IB_SIGNAL_ALL_WR;
 	spin_lock_init(&qhp->lock);
 	mutex_init(&qhp->mutex);
 	init_waitqueue_head(&qhp->wait);
 	kref_init(&qhp->kref);
 	INIT_WORK(&qhp->free_work, free_qp_work);
 
 	/* Make the QP findable by its SQ id. */
 	ret = insert_handle(rhp, &rhp->qpidr, qhp, qhp->wq.sq.qid);
 	if (ret)
 		goto err2;
 
 	/*
 	 * NOTE(review): this branch dereferences ucontext, which is NULL
 	 * when pd->uobject is NULL -- presumably udata is only ever non-NULL
 	 * for PDs created by userspace; confirm against the callers.
 	 */
 	if (udata) {
 		/* One mmap entry per region exported to userspace. */
 		sq_key_mm = kmalloc(sizeof(*sq_key_mm), GFP_KERNEL);
 		if (!sq_key_mm) {
 			ret = -ENOMEM;
 			goto err3;
 		}
 		rq_key_mm = kmalloc(sizeof(*rq_key_mm), GFP_KERNEL);
 		if (!rq_key_mm) {
 			ret = -ENOMEM;
 			goto err4;
 		}
 		sq_db_key_mm = kmalloc(sizeof(*sq_db_key_mm), GFP_KERNEL);
 		if (!sq_db_key_mm) {
 			ret = -ENOMEM;
 			goto err5;
 		}
 		rq_db_key_mm = kmalloc(sizeof(*rq_db_key_mm), GFP_KERNEL);
 		if (!rq_db_key_mm) {
 			ret = -ENOMEM;
 			goto err6;
 		}
 		uresp.flags = 0;
 		uresp.qid_mask = rhp->rdev.qpmask;
 		uresp.sqid = qhp->wq.sq.qid;
 		uresp.sq_size = qhp->wq.sq.size;
 		uresp.sq_memsize = qhp->wq.sq.memsize;
 		uresp.rqid = qhp->wq.rq.qid;
 		uresp.rq_size = qhp->wq.rq.size;
 		uresp.rq_memsize = qhp->wq.rq.memsize;
 		/*
 		 * Reserve four consecutive keys, PAGE_SIZE apart, from the
 		 * context's key counter while holding mmap_lock.
 		 */
 		spin_lock(&ucontext->mmap_lock);
 		uresp.ma_sync_key =  0;
 		uresp.sq_key = ucontext->key;
 		ucontext->key += PAGE_SIZE;
 		uresp.rq_key = ucontext->key;
 		ucontext->key += PAGE_SIZE;
 		uresp.sq_db_gts_key = ucontext->key;
 		ucontext->key += PAGE_SIZE;
 		uresp.rq_db_gts_key = ucontext->key;
 		ucontext->key += PAGE_SIZE;
 		spin_unlock(&ucontext->mmap_lock);
 		ret = ib_copy_to_udata(udata, &uresp, sizeof uresp);
 		if (ret)
 			goto err7;
 		/* Register each region under the key just handed out. */
 		sq_key_mm->key = uresp.sq_key;
 		sq_key_mm->addr = qhp->wq.sq.phys_addr;
 		sq_key_mm->len = PAGE_ALIGN(qhp->wq.sq.memsize);
 		CTR4(KTR_IW_CXGBE, "%s sq_key_mm %x, %x, %d", __func__,
 				sq_key_mm->key, sq_key_mm->addr,
 				sq_key_mm->len);
 		insert_mmap(ucontext, sq_key_mm);
 		rq_key_mm->key = uresp.rq_key;
 		rq_key_mm->addr = qhp->wq.rq.phys_addr;
 		rq_key_mm->len = PAGE_ALIGN(qhp->wq.rq.memsize);
 		CTR4(KTR_IW_CXGBE, "%s rq_key_mm %x, %x, %d", __func__,
 				rq_key_mm->key, rq_key_mm->addr,
 				rq_key_mm->len);
 		insert_mmap(ucontext, rq_key_mm);
 		sq_db_key_mm->key = uresp.sq_db_gts_key;
 		sq_db_key_mm->addr = (u64)qhp->wq.sq.bar2_pa;
 		sq_db_key_mm->len = PAGE_SIZE;
 		CTR4(KTR_IW_CXGBE, "%s sq_db_key_mm %x, %x, %d", __func__,
 				sq_db_key_mm->key, sq_db_key_mm->addr,
 				sq_db_key_mm->len);
 		insert_mmap(ucontext, sq_db_key_mm);
 		rq_db_key_mm->key = uresp.rq_db_gts_key;
 		rq_db_key_mm->addr = (u64)qhp->wq.rq.bar2_pa;
 		rq_db_key_mm->len = PAGE_SIZE;
 		CTR4(KTR_IW_CXGBE, "%s rq_db_key_mm %x, %x, %d", __func__,
 				rq_db_key_mm->key, rq_db_key_mm->addr,
 				rq_db_key_mm->len);
 		insert_mmap(ucontext, rq_db_key_mm);
 
 		qhp->ucontext = ucontext;
 	}
 	/* The QP number exposed to the verbs layer is the SQ id. */
 	qhp->ibqp.qp_num = qhp->wq.sq.qid;
 	init_timer(&(qhp->timer));
 
 	CTR5(KTR_IW_CXGBE, "%s sq id %u size %u memsize %zu num_entries %u",
 		 __func__, qhp->wq.sq.qid,
 		 qhp->wq.sq.size, qhp->wq.sq.memsize, attrs->cap.max_send_wr);
 	CTR5(KTR_IW_CXGBE, "%s rq id %u size %u memsize %zu num_entries %u",
 		 __func__, qhp->wq.rq.qid,
 		 qhp->wq.rq.size, qhp->wq.rq.memsize, attrs->cap.max_recv_wr);
 	return &qhp->ibqp;
 	/*
 	 * Error unwinding: each label undoes the step that succeeded just
 	 * before the corresponding failure point, in reverse order.
 	 */
 err7:
 	kfree(rq_db_key_mm);
 err6:
 	kfree(sq_db_key_mm);
 err5:
 	kfree(rq_key_mm);
 err4:
 	kfree(sq_key_mm);
 err3:
 	remove_handle(rhp, &rhp->qpidr, qhp->wq.sq.qid);
 err2:
 	destroy_qp(&rhp->rdev, &qhp->wq,
 		   ucontext ? &ucontext->uctx : &rhp->rdev.uctx);
 err1:
 	kfree(qhp);
 	return ERR_PTR(ret);
 }
 
 /*
  * Verbs entry point for modifying a QP.
  *
  * Translates the generic ib_qp_attr / attr_mask pair into the driver's
  * c4iw_qp_attributes and attribute mask and hands them to c4iw_modify_qp()
  * as an external (non-internal) request.  A request for the RTR state is
  * dropped from the mask; if nothing else remains, 0 is returned without
  * touching the QP.  udata is unused.
  */
 int c4iw_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
 		      int attr_mask, struct ib_udata *udata)
 {
 	struct c4iw_qp_attributes attrs;
 	enum c4iw_qp_attr_mask mask = 0;
 	struct c4iw_qp *qhp;
 
 	CTR2(KTR_IW_CXGBE, "%s ib_qp %p", __func__, ibqp);
 
 	/* There is no RTR state in iWARP, so ignore a request for it. */
 	if ((attr_mask & IB_QP_STATE) != 0 && attr->qp_state == IB_QPS_RTR)
 		attr_mask &= ~IB_QP_STATE;
 
 	/* Nothing (left) to change. */
 	if (attr_mask == 0)
 		return 0;
 
 	qhp = to_c4iw_qp(ibqp);
 
 	memset(&attrs, 0, sizeof attrs);
 	attrs.next_state = c4iw_convert_state(attr->qp_state);
 	attrs.enable_rdma_read =
 	    !!(attr->qp_access_flags & IB_ACCESS_REMOTE_READ);
 	attrs.enable_rdma_write =
 	    !!(attr->qp_access_flags & IB_ACCESS_REMOTE_WRITE);
 	attrs.enable_bind = !!(attr->qp_access_flags & IB_ACCESS_MW_BIND);
 
 	/* Select which of the translated attributes are to be applied. */
 	if (attr_mask & IB_QP_STATE)
 		mask |= C4IW_QP_ATTR_NEXT_STATE;
 	if (attr_mask & IB_QP_ACCESS_FLAGS)
 		mask |= C4IW_QP_ATTR_ENABLE_RDMA_READ |
 		    C4IW_QP_ATTR_ENABLE_RDMA_WRITE |
 		    C4IW_QP_ATTR_ENABLE_RDMA_BIND;
 
 	return c4iw_modify_qp(qhp->rhp, qhp, mask, &attrs, 0);
 }
 
 /*
  * Look up the QP registered under number qpn on device dev and return the
  * result of get_qhp() cast to an ib_qp pointer.
  */
 struct ib_qp *c4iw_get_qp(struct ib_device *dev, int qpn)
 {
 	struct ib_qp *ibqp;
 
 	CTR3(KTR_IW_CXGBE, "%s ib_dev %p qpn 0x%x", __func__, dev, qpn);
 
 	ibqp = (struct ib_qp *)get_qhp(to_c4iw_dev(dev), qpn);
 	return ibqp;
 }
 
 /*
  * Verbs entry point for querying a QP.
  *
  * Both output structures are zeroed first.  attr receives only the current
  * QP state (converted to the ib_qp_state namespace); init_attr receives the
  * queue depths and scatter/gather limits recorded in qhp->attr, the fixed
  * inline-data limit and the signalling type.  attr_mask is ignored.
  * Always returns 0.
  */
 int c4iw_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
 		     int attr_mask, struct ib_qp_init_attr *init_attr)
 {
 	struct c4iw_qp *qhp = to_c4iw_qp(ibqp);
 
 	memset(attr, 0, sizeof *attr);
 	memset(init_attr, 0, sizeof *init_attr);
 	attr->qp_state = to_ib_qp_state(qhp->attr.state);
 	init_attr->cap.max_send_wr = qhp->attr.sq_num_entries;
 	init_attr->cap.max_recv_wr = qhp->attr.rq_num_entries;
 	init_attr->cap.max_send_sge = qhp->attr.sq_max_sges;
 	/* The receive-side SGE limit lives in rq_max_sges, not sq_max_sges. */
 	init_attr->cap.max_recv_sge = qhp->attr.rq_max_sges;
 	init_attr->cap.max_inline_data = T4_MAX_SEND_INLINE;
 	init_attr->sq_sig_type = qhp->sq_sig_all ? IB_SIGNAL_ALL_WR : 0;
 	return 0;
 }
 #endif
diff --git a/sys/dev/cxgbe/tom/t4_cpl_io.c b/sys/dev/cxgbe/tom/t4_cpl_io.c
index 59d1c367f94c..bfc9eb3b76f7 100644
--- a/sys/dev/cxgbe/tom/t4_cpl_io.c
+++ b/sys/dev/cxgbe/tom/t4_cpl_io.c
@@ -1,2470 +1,2470 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2012, 2015 Chelsio Communications, Inc.
  * All rights reserved.
  * Written by: Navdeep Parhar <np@FreeBSD.org>
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_kern_tls.h"
 #include "opt_ratelimit.h"
 
 #ifdef TCP_OFFLOAD
 #include <sys/param.h>
 #include <sys/aio.h>
 #include <sys/file.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/module.h>
 #include <sys/proc.h>
 #include <sys/protosw.h>
 #include <sys/domain.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sglist.h>
 #include <sys/taskqueue.h>
 #include <netinet/in.h>
 #include <netinet/in_pcb.h>
 #include <netinet/ip.h>
 #include <netinet/ip6.h>
 #define TCPSTATES
 #include <netinet/tcp_fsm.h>
 #include <netinet/tcp_seq.h>
 #include <netinet/tcp_var.h>
 #include <netinet/toecore.h>
 
 #include <security/mac/mac_framework.h>
 
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_page.h>
 
 #include <dev/iscsi/iscsi_proto.h>
 
 #include "common/common.h"
 #include "common/t4_msg.h"
 #include "common/t4_regs.h"
 #include "common/t4_tcb.h"
 #include "tom/t4_tom_l2t.h"
 #include "tom/t4_tom.h"
 
 static void	t4_aiotx_cancel(struct kaiocb *job);
 static void	t4_aiotx_queue_toep(struct socket *so, struct toepcb *toep);
 
 /*
  * Build and transmit the FW_FLOWC_WR for the connection described by toep.
  *
  * tp may be NULL, in which case a fixed MSS of 512 is reported and the
  * SNDNXT/RCVNXT parameters are left out.  Optional parameters are added for
  * TLS ULP mode, a non-zero tls.fcplenmax and a bound scheduling class.
  *
  * The work request consumes one tx descriptor and the corresponding tx
  * credits of the toep and sets TPF_FLOWC_WR_SENT.  It is asserted that the
  * flowc has not been sent before.  Panics if the work request cannot be
  * allocated.
  */
 void
 send_flowc_wr(struct toepcb *toep, struct tcpcb *tp)
 {
 	struct wrqe *wr;
 	struct fw_flowc_wr *flowc;
 	unsigned int nparams, flowclen, paramidx;
 	struct vi_info *vi = toep->vi;
 	struct port_info *pi = vi->pi;
 	struct adapter *sc = pi->adapter;
 	unsigned int pfvf = sc->pf << S_FW_VIID_PFN;
 	struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx];
 
 	KASSERT(!(toep->flags & TPF_FLOWC_WR_SENT),
 	    ("%s: flowc for tid %u sent already", __func__, toep->tid));
 
 	/*
 	 * Count the parameters up front; this must match the FLOWC_PARAM()
 	 * invocations below exactly (checked by the KASSERT after them).
 	 */
 	if (tp != NULL)
 		nparams = 8;
 	else
 		nparams = 6;
 	if (ulp_mode(toep) == ULP_MODE_TLS)
 		nparams++;
 	if (toep->tls.fcplenmax != 0)
 		nparams++;
 	if (toep->params.tc_idx != -1) {
 		MPASS(toep->params.tc_idx >= 0 &&
 		    toep->params.tc_idx < sc->params.nsched_cls);
 		nparams++;
 	}
 
 	flowclen = sizeof(*flowc) + nparams * sizeof(struct fw_flowc_mnemval);
 
 	/* The work request is allocated in multiples of 16 bytes. */
 	wr = alloc_wrqe(roundup2(flowclen, 16), &toep->ofld_txq->wrq);
 	if (wr == NULL) {
 		/* XXX */
 		panic("%s: allocation failure.", __func__);
 	}
 	flowc = wrtod(wr);
 	memset(flowc, 0, wr->wr_len);
 
 	flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) |
 	    V_FW_FLOWC_WR_NPARAMS(nparams));
 	flowc->flowid_len16 = htonl(V_FW_WR_LEN16(howmany(flowclen, 16)) |
 	    V_FW_WR_FLOWID(toep->tid));
 
 /* Append one mnemonic/value pair (value stored big-endian). */
 #define FLOWC_PARAM(__m, __v) \
 	do { \
 		flowc->mnemval[paramidx].mnemonic = FW_FLOWC_MNEM_##__m; \
 		flowc->mnemval[paramidx].val = htobe32(__v); \
 		paramidx++; \
 	} while (0)
 
 	paramidx = 0;
 
 	/* Five parameters that are always present ... */
 	FLOWC_PARAM(PFNVFN, pfvf);
 	FLOWC_PARAM(CH, pi->tx_chan);
 	FLOWC_PARAM(PORT, pi->tx_chan);
 	FLOWC_PARAM(IQID, toep->ofld_rxq->iq.abs_id);
 	FLOWC_PARAM(SNDBUF, toep->params.sndbuf);
 	/* ... then either MSS + sequence numbers, or a fixed MSS only. */
 	if (tp) {
 		FLOWC_PARAM(MSS, toep->params.emss);
 		FLOWC_PARAM(SNDNXT, tp->snd_nxt);
 		FLOWC_PARAM(RCVNXT, tp->rcv_nxt);
 	} else
 		FLOWC_PARAM(MSS, 512);
 	CTR6(KTR_CXGBE,
 	    "%s: tid %u, mss %u, sndbuf %u, snd_nxt 0x%x, rcv_nxt 0x%x",
 	    __func__, toep->tid, toep->params.emss, toep->params.sndbuf,
 	    tp ? tp->snd_nxt : 0, tp ? tp->rcv_nxt : 0);
 
 	/* Optional parameters, in the same order they were counted above. */
 	if (ulp_mode(toep) == ULP_MODE_TLS)
 		FLOWC_PARAM(ULP_MODE, ulp_mode(toep));
 	if (toep->tls.fcplenmax != 0)
 		FLOWC_PARAM(TXDATAPLEN_MAX, toep->tls.fcplenmax);
 	if (toep->params.tc_idx != -1)
 		FLOWC_PARAM(SCHEDCLASS, toep->params.tc_idx);
 #undef FLOWC_PARAM
 
 	KASSERT(paramidx == nparams, ("nparams mismatch"));
 
 	/* Account for the WR: it carries no payload, only credits. */
 	txsd->tx_credits = howmany(flowclen, 16);
 	txsd->plen = 0;
 	KASSERT(toep->tx_credits >= txsd->tx_credits && toep->txsd_avail > 0,
 	    ("%s: not enough credits (%d)", __func__, toep->tx_credits));
 	toep->tx_credits -= txsd->tx_credits;
 	/* Advance the descriptor producer index, wrapping at txsd_total. */
 	if (__predict_false(++toep->txsd_pidx == toep->txsd_total))
 		toep->txsd_pidx = 0;
 	toep->txsd_avail--;
 
 	toep->flags |= TPF_FLOWC_WR_SENT;
         t4_wrq_tx(sc, wr);
 }
 
 #ifdef RATELIMIT
 /*
  * Bind the connection to the scheduling class that matches the requested
  * pacing rate, or unbind it when the rate is 0.
  *
  * Input is Bytes/second (so_max_pacing_rate), chip counts in Kilobits/second.
  *
  * Returns 0 on success, the error from t4_reserve_cl_rl_kbps() if no class
  * could be reserved, or ENOMEM if the flowc work request that announces the
  * new class cannot be sent right now (no credits, no tx descriptor, or no
  * memory).  In the ENOMEM case a freshly reserved class is released again.
  */
 static int
 update_tx_rate_limit(struct adapter *sc, struct toepcb *toep, u_int Bps)
 {
 	int tc_idx, rc;
 	/*
 	 * The (u_int) cast binds to (uint64_t)Bps only; the multiplication by
 	 * 8ULL is still carried out in 64 bits, so it cannot overflow before
 	 * the division.
 	 */
 	const u_int kbps = (u_int) (uint64_t)Bps * 8ULL / 1000;
 	const int port_id = toep->vi->pi->port_id;
 
 	CTR3(KTR_CXGBE, "%s: tid %u, rate %uKbps", __func__, toep->tid, kbps);
 
 	if (kbps == 0) {
 		/* unbind */
 		tc_idx = -1;
 	} else {
 		rc = t4_reserve_cl_rl_kbps(sc, port_id, kbps, &tc_idx);
 		if (rc != 0)
 			return (rc);
 		MPASS(tc_idx >= 0 && tc_idx < sc->params.nsched_cls);
 	}
 
 	/* The hardware only needs to be told if the class actually changes. */
 	if (toep->params.tc_idx != tc_idx) {
 		struct wrqe *wr;
 		struct fw_flowc_wr *flowc;
 		int nparams = 1, flowclen, flowclen16;
 		struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx];
 
 		flowclen = sizeof(*flowc) + nparams * sizeof(struct
 		    fw_flowc_mnemval);
 		flowclen16 = howmany(flowclen, 16);
 		/* Give up (and undo the reservation) if the WR can't go out. */
 		if (toep->tx_credits < flowclen16 || toep->txsd_avail == 0 ||
 		    (wr = alloc_wrqe(roundup2(flowclen, 16),
 		    &toep->ofld_txq->wrq)) == NULL) {
 			if (tc_idx >= 0)
 				t4_release_cl_rl(sc, port_id, tc_idx);
 			return (ENOMEM);
 		}
 
 		flowc = wrtod(wr);
 		memset(flowc, 0, wr->wr_len);
 
 		flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) |
 		    V_FW_FLOWC_WR_NPARAMS(nparams));
 		flowc->flowid_len16 = htonl(V_FW_WR_LEN16(flowclen16) |
 		    V_FW_WR_FLOWID(toep->tid));
 
 		/* A value of 0xff is sent when unbinding (tc_idx == -1). */
 		flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_SCHEDCLASS;
 		if (tc_idx == -1)
 			flowc->mnemval[0].val = htobe32(0xff);
 		else
 			flowc->mnemval[0].val = htobe32(tc_idx);
 
 		/* Same descriptor/credit accounting as any other tx WR. */
 		txsd->tx_credits = flowclen16;
 		txsd->plen = 0;
 		toep->tx_credits -= txsd->tx_credits;
 		if (__predict_false(++toep->txsd_pidx == toep->txsd_total))
 			toep->txsd_pidx = 0;
 		toep->txsd_avail--;
 		t4_wrq_tx(sc, wr);
 	}
 
 	/*
 	 * Drop the reference held on the previous class (if any) and record
 	 * the new one.
 	 */
 	if (toep->params.tc_idx >= 0)
 		t4_release_cl_rl(sc, port_id, toep->params.tc_idx);
 	toep->params.tc_idx = tc_idx;
 
 	return (0);
 }
 #endif
 
 /*
  * Abort the offloaded connection by sending a CPL_ABORT_REQ that asks the
  * hardware to transmit a RST.
  *
  * Must be called with the inp write-locked.  Does nothing (beyond tracing)
  * if an abort is already in progress (TPF_ABORT_SHUTDOWN); otherwise that
  * flag is set here.  snd_nxt is only used when the inp has been dropped and
  * the tcpcb can no longer be consulted.  Panics if the work request cannot
  * be allocated.
  */
 void
 send_reset(struct adapter *sc, struct toepcb *toep, uint32_t snd_nxt)
 {
 	struct wrqe *wr;
 	struct cpl_abort_req *req;
 	int tid = toep->tid;
 	struct inpcb *inp = toep->inp;
 	struct tcpcb *tp = intotcpcb(inp);	/* don't use if INP_DROPPED */
 
 	INP_WLOCK_ASSERT(inp);
 
 	CTR6(KTR_CXGBE, "%s: tid %d (%s), toep_flags 0x%x, inp_flags 0x%x%s",
 	    __func__, toep->tid,
 	    inp->inp_flags & INP_DROPPED ? "inp dropped" :
 	    tcpstates[tp->t_state],
 	    toep->flags, inp->inp_flags,
 	    toep->flags & TPF_ABORT_SHUTDOWN ?
 	    " (abort already in progress)" : "");
 
 	if (toep->flags & TPF_ABORT_SHUTDOWN)
 		return;	/* abort already in progress */
 
 	toep->flags |= TPF_ABORT_SHUTDOWN;
 
 	KASSERT(toep->flags & TPF_FLOWC_WR_SENT,
 	    ("%s: flowc_wr not sent for tid %d.", __func__, tid));
 
 	wr = alloc_wrqe(sizeof(*req), &toep->ofld_txq->wrq);
 	if (wr == NULL) {
 		/* XXX */
 		panic("%s: allocation failure.", __func__);
 	}
 	req = wrtod(wr);
 
 	INIT_TP_WR_MIT_CPL(req, CPL_ABORT_REQ, tid);
 	/* Sequence number: caller-supplied if the tcpcb is off limits. */
 	if (inp->inp_flags & INP_DROPPED)
 		req->rsvd0 = htobe32(snd_nxt);
 	else
 		req->rsvd0 = htobe32(tp->snd_nxt);
 	/* rsvd1 is set to 1 iff no tx data has been sent on this tid yet. */
 	req->rsvd1 = !(toep->flags & TPF_TX_DATA_SENT);
 	req->cmd = CPL_ABORT_SEND_RST;
 
 	/*
 	 * XXX: What's the correct way to tell that the inp hasn't been detached
 	 * from its socket?  Should I even be flushing the snd buffer here?
 	 */
-	if ((inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) == 0) {
+	if ((inp->inp_flags & INP_DROPPED) == 0) {
 		struct socket *so = inp->inp_socket;
 
 		if (so != NULL)	/* because I'm not sure.  See comment above */
 			sbflush(&so->so_snd);
 	}
 
 	t4_l2t_send(sc, wr, toep->l2te);
 }
 
 /*
  * Translate the TCP options word reported by the hardware for a newly
  * established connection into the corresponding tcpcb fields and toepcb
  * parameters (MSS, timestamps, SACK, window scaling).
  */
 static void
 assign_rxopt(struct tcpcb *tp, uint16_t opt)
 {
 	struct toepcb *toep = tp->t_toe;
 	struct inpcb *inp = tp->t_inpcb;
 	struct adapter *sc = td_adapter(toep->td);
 	const int has_tstamp = G_TCPOPT_TSTAMP(opt) != 0;
 	const int has_sack = G_TCPOPT_SACK(opt) != 0;
 	const int both_scale = TF_RCVD_SCALE | TF_REQ_SCALE;
 
 	INP_LOCK_ASSERT(inp);
 
 	/* MSS: the selected MTU table entry less the TCP and IP headers. */
 	toep->params.mtu_idx = G_TCPOPT_MSS(opt);
 	tp->t_maxseg = sc->params.mtus[toep->params.mtu_idx];
 	tp->t_maxseg -= sizeof(struct tcphdr);
 	if (inp->inp_inc.inc_flags & INC_ISIPV6)
 		tp->t_maxseg -= sizeof(struct ip6_hdr);
 	else
 		tp->t_maxseg -= sizeof(struct ip);
 
 	/* Timestamps eat into the effective MSS. */
 	toep->params.emss = tp->t_maxseg;
 	toep->params.tstamp = has_tstamp ? 1 : 0;
 	if (has_tstamp) {
 		toep->params.emss -= TCPOLEN_TSTAMP_APPA;
 		tp->t_flags |= TF_RCVD_TSTMP;	/* timestamps ok */
 		tp->ts_recent = 0;		/* hmmm */
 		tp->ts_recent_age = tcp_ts_getticks();
 	}
 
 	/* SACK: mirror what the peer agreed to. */
 	toep->params.sack = has_sack ? 1 : 0;
 	if (has_sack)
 		tp->t_flags |= TF_SACK_PERMIT;	/* should already be set */
 	else
 		tp->t_flags &= ~TF_SACK_PERMIT;	/* sack disallowed by peer */
 
 	/* Window scaling is in effect only if both sides asked for it. */
 	if (G_TCPOPT_WSCALE_OK(opt))
 		tp->t_flags |= TF_RCVD_SCALE;
 	if ((tp->t_flags & both_scale) != both_scale) {
 		toep->params.wscale = 0;
 	} else {
 		tp->rcv_scale = tp->request_r_scale;
 		tp->snd_scale = G_TCPOPT_SND_WSCALE(opt);
 	}
 
 	CTR6(KTR_CXGBE,
 	    "assign_rxopt: tid %d, mtu_idx %u, emss %u, ts %u, sack %u, wscale %u",
 	    toep->tid, toep->params.mtu_idx, toep->params.emss,
 	    toep->params.tstamp, toep->params.sack, toep->params.wscale);
 }
 
 /*
  * Finish setting up a connection that the hardware has just established:
  * move the tcpcb to TCPS_ESTABLISHED, seed the send/receive sequence space
  * from the ISNs exchanged in the SYNs (iss is ours, irs is the peer's),
  * apply the negotiated TCP options, send the flowc work request and mark
  * the socket connected.  TLS connections get their extra setup last.
  *
  * The inp must be write-locked and the tcpcb must be in SYN_SENT or
  * SYN_RECEIVED.  opt is the big-endian options word from the hardware.
  */
 void
 make_established(struct toepcb *toep, uint32_t iss, uint32_t irs, uint16_t opt)
 {
 	struct inpcb *inp = toep->inp;
 	struct socket *sock = inp->inp_socket;
 	struct tcpcb *tcb = intotcpcb(inp);
 	const uint16_t hw_opt = be16toh(opt);
 	const uint32_t first_seq = iss + 1;
 
 	INP_WLOCK_ASSERT(inp);
 	KASSERT(tcb->t_state == TCPS_SYN_SENT ||
 	    tcb->t_state == TCPS_SYN_RECEIVED,
 	    ("%s: TCP state %s", __func__, tcpstates[tcb->t_state]));
 
 	CTR6(KTR_CXGBE, "%s: tid %d, so %p, inp %p, tp %p, toep %p",
 	    __func__, toep->tid, sock, inp, tcb, toep);
 
 	tcp_state_change(tcb, TCPS_ESTABLISHED);
 	tcb->t_starttime = ticks;
 	TCPSTAT_INC(tcps_connects);
 
 	/* Receive side: window is opt0_bufsize in units of 1KB. */
 	tcb->irs = irs;
 	tcp_rcvseqinit(tcb);
 	tcb->rcv_wnd = (u_int)toep->params.opt0_bufsize << 10;
 	tcb->rcv_adv += tcb->rcv_wnd;
 	tcb->last_ack_sent = tcb->rcv_nxt;
 
 	/* Send side: everything starts right after our SYN. */
 	tcb->iss = iss;
 	tcp_sendseqinit(tcb);
 	tcb->snd_una = first_seq;
 	tcb->snd_nxt = first_seq;
 	tcb->snd_max = first_seq;
 
 	assign_rxopt(tcb, hw_opt);
 	send_flowc_wr(toep, tcb);
 
 	soisconnected(sock);
 
 	if (ulp_mode(toep) == ULP_MODE_TLS)
 		tls_establish(toep);
 }
 
 /*
  * Return rx credits to the hardware with a CPL_RX_DATA_ACK sent on the
  * toep's control queue.
  *
  * Returns the number of credits handed back, or 0 if no work request could
  * be allocated (in which case nothing is sent).
  */
 int
 send_rx_credits(struct adapter *sc, struct toepcb *toep, int credits)
 {
 	const uint32_t dack_bits = F_RX_DACK_CHANGE | V_RX_DACK_MODE(1);
 	struct cpl_rx_data_ack *ack;
 	struct wrqe *wr;
 
 	KASSERT(credits >= 0, ("%s: %d credits", __func__, credits));
 
 	wr = alloc_wrqe(sizeof(*ack), toep->ctrlq);
 	if (wr != NULL) {
 		ack = wrtod(wr);
 
 		INIT_TP_WR_MIT_CPL(ack, CPL_RX_DATA_ACK, toep->tid);
 		ack->credit_dack = htobe32(dack_bits | V_RX_CREDITS(credits));
 
 		t4_wrq_tx(sc, wr);
 		return (credits);
 	}
 
 	return (0);
 }
 
 /*
  * Send a CPL_RX_DATA_ACK carrying only F_RX_MODULATE_RX (no credits) on the
  * toep's control queue.  Silently does nothing if the work request cannot
  * be allocated.
  */
 void
 send_rx_modulate(struct adapter *sc, struct toepcb *toep)
 {
 	struct cpl_rx_data_ack *ack;
 	struct wrqe *wr;
 
 	wr = alloc_wrqe(sizeof(*ack), toep->ctrlq);
 	if (wr != NULL) {
 		ack = wrtod(wr);
 
 		INIT_TP_WR_MIT_CPL(ack, CPL_RX_DATA_ACK, toep->tid);
 		ack->credit_dack = htobe32(F_RX_MODULATE_RX);
 
 		t4_wrq_tx(sc, wr);
 	}
 }
 
 /*
  * Called after the application has consumed data from the receive buffer:
  * decide whether the window advertised to the hardware should be opened
  * further and, if so, return the corresponding rx credits.
  *
  * Requires the inp write lock and the receive sockbuf lock.
  */
 void
 t4_rcvd_locked(struct toedev *tod, struct tcpcb *tp)
 {
 	struct adapter *sc = tod->tod_softc;
 	struct toepcb *toep = tp->t_toe;
 	struct inpcb *inp = tp->t_inpcb;
 	struct sockbuf *sb = &inp->inp_socket->so_rcv;
 	int rx_credits, update;
 
 	INP_WLOCK_ASSERT(inp);
 	SOCKBUF_LOCK_ASSERT(sb);
 
 	/* Credits = buffer space not yet covered by the current window. */
 	if (sbspace(sb) > tp->rcv_wnd)
 		rx_credits = sbspace(sb) - tp->rcv_wnd;
 	else
 		rx_credits = 0;
 
 	/* Is the update big enough, or the window small enough, to bother? */
 	if (rx_credits <= 0)
 		update = 0;
 	else if (tp->rcv_wnd <= 32 * 1024 || rx_credits >= 64 * 1024)
 		update = 1;
 	else if (rx_credits >= 16 * 1024 && tp->rcv_wnd <= 128 * 1024)
 		update = 1;
 	else
 		update = sbused(sb) + tp->rcv_wnd < sb->sb_lowat;
 
 	if (update) {
 		/* Only count what was actually sent (0 on failure). */
 		rx_credits = send_rx_credits(sc, toep, rx_credits);
 		tp->rcv_wnd += rx_credits;
 		tp->rcv_adv += rx_credits;
 		return;
 	}
 
 	if (toep->flags & TPF_FORCE_CREDITS)
 		send_rx_modulate(sc, toep);
 }
 
 /*
  * Wrapper around t4_rcvd_locked() that takes and releases the lock of the
  * connection's receive sockbuf around the call.
  */
 void
 t4_rcvd(struct toedev *tod, struct tcpcb *tp)
 {
 	struct sockbuf *rcv = &tp->t_inpcb->inp_socket->so_rcv;
 
 	SOCKBUF_LOCK(rcv);
 	t4_rcvd_locked(tod, tp);
 	SOCKBUF_UNLOCK(rcv);
 }
 
 /*
  * Request an orderly close of an offloaded connection by queueing a
  * CPL_CLOSE_CON_REQ for its tid.  Nothing is sent if TPF_FIN_SENT is
  * already set.  Panics if the work request cannot be allocated.
  * Always returns 0.
  */
 int
 t4_close_conn(struct adapter *sc, struct toepcb *toep)
 {
 	const unsigned int tid = toep->tid;
 	const int already_sent = (toep->flags & TPF_FIN_SENT) != 0;
 	struct cpl_close_con_req *req;
 	struct wrqe *wr;
 
 	CTR3(KTR_CXGBE, "%s: tid %u%s", __func__, toep->tid,
 	    already_sent ? ", IGNORED" : "");
 
 	if (already_sent)
 		return (0);
 
 	KASSERT(toep->flags & TPF_FLOWC_WR_SENT,
 	    ("%s: flowc_wr not sent for tid %u.", __func__, tid));
 
 	wr = alloc_wrqe(sizeof(*req), &toep->ofld_txq->wrq);
 	if (wr == NULL) {
 		/* XXX */
 		panic("%s: allocation failure.", __func__);
 	}
 	req = wrtod(wr);
 
 	/* Work request header, then the CPL itself. */
 	req->wr.wr_hi = htonl(V_FW_WR_OP(FW_TP_WR) |
 	    V_FW_WR_IMMDLEN(sizeof(*req) - sizeof(req->wr)));
 	req->wr.wr_mid = htonl(V_FW_WR_LEN16(howmany(sizeof(*req), 16)) |
 	    V_FW_WR_FLOWID(tid));
 	req->wr.wr_lo = cpu_to_be64(0);
 	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, tid));
 	req->rsvd = 0;
 
 	/* The FIN is now on its way; any pending request for one is moot. */
 	toep->flags = (toep->flags | TPF_FIN_SENT) & ~TPF_SEND_FIN;
 	t4_l2t_send(sc, wr, toep->l2te);
 
 	return (0);
 }
 
 #define MAX_OFLD_TX_CREDITS (SGE_MAX_WR_LEN / 16)
 #define MIN_OFLD_TX_CREDITS (howmany(sizeof(struct fw_ofld_tx_data_wr) + 1, 16))
 #define MIN_ISO_TX_CREDITS  (howmany(sizeof(struct cpl_tx_data_iso), 16))
 #define MIN_TX_CREDITS(iso)						\
 	(MIN_OFLD_TX_CREDITS + ((iso) ? MIN_ISO_TX_CREDITS : 0))
 
 /*
  * Largest payload, in bytes, that can be carried as immediate data in a
  * work request given tx_credits available credits.  iso selects whether
  * room for a cpl_tx_data_iso must be set aside as well.  Returns 0 when the
  * credits do not even cover the minimum work request.
  */
 static inline int
 max_imm_payload(int tx_credits, int iso)
 {
 	const int ndesc = 1;	/* imm. data WR is limited to one descriptor */
 	const int iso_hdr_len = iso ? sizeof(struct cpl_tx_data_iso) : 0;
 	int wr_bytes;
 
 	KASSERT(tx_credits >= 0 &&
 		tx_credits <= MAX_OFLD_TX_CREDITS,
 		("%s: %d credits", __func__, tx_credits));
 
 	if (tx_credits < MIN_TX_CREDITS(iso))
 		return (0);
 
 	/* WR size is capped by the descriptor limit or by the credits. */
 	if (tx_credits >= (ndesc * EQ_ESIZE) / 16)
 		wr_bytes = ndesc * EQ_ESIZE;
 	else
 		wr_bytes = tx_credits * 16;
 
 	return (wr_bytes - sizeof(struct fw_ofld_tx_data_wr) - iso_hdr_len);
 }
 
 /*
  * Largest number of scatter/gather segments that fit in a work request
  * given tx_credits available credits (iso: also reserve room for a
  * cpl_tx_data_iso).  Returns 0 when the credits do not even cover the
  * minimum work request.
  */
 static inline int
 max_dsgl_nsegs(int tx_credits, int iso)
 {
 	int extra_credits, pair_bytes, nseg;
 
 	KASSERT(tx_credits >= 0 &&
 		tx_credits <= MAX_OFLD_TX_CREDITS,
 		("%s: %d credits", __func__, tx_credits));
 
 	if (tx_credits < MIN_TX_CREDITS(iso))
 		return (0);
 
 	/* Credits beyond the minimum WR are available for ulptx_sge_pairs. */
 	extra_credits = tx_credits - MIN_TX_CREDITS(iso);
 	pair_bytes = extra_credits * 16;
 
 	/*
 	 * The ulptx_sgl itself holds one segment; every full 24 bytes after
 	 * it hold two more, and a 16 byte remainder holds one more.
 	 */
 	nseg = 1 + 2 * (pair_bytes / 24);
 	if (pair_bytes % 24 == 16)
 		nseg++;
 
 	return (nseg);
 }
 
 /*
  * Fill in the fw_ofld_tx_data_wr header at dst.
  *
  * fw_wr_opcode, immdlen, plen and credits go into the header as given;
  * shove and ulp_submode are folded into the flags word together with the
  * toep's ULP mode.  If the toep has tx_align enabled, payloads shorter than
  * two effective MSS disable LSO; longer ones request payload alignment
  * (plus the align-shove flag when params.nagle is non-zero).
  */
 static inline void
 write_tx_wr(void *dst, struct toepcb *toep, int fw_wr_opcode,
     unsigned int immdlen, unsigned int plen, uint8_t credits, int shove,
     int ulp_submode)
 {
 	struct fw_ofld_tx_data_wr *txwr = dst;
 	uint32_t flags;
 
 	/* Assemble the flags word in host order; swap once when storing. */
 	flags = V_TX_ULP_MODE(ulp_mode(toep)) |
 	    V_TX_ULP_SUBMODE(ulp_submode) | V_TX_URG(0) | V_TX_SHOVE(shove);
 
 	if (toep->params.tx_align > 0) {
 		if (plen < 2 * toep->params.emss) {
 			flags |= F_FW_OFLD_TX_DATA_WR_LSODISABLE;
 		} else {
 			flags |= F_FW_OFLD_TX_DATA_WR_ALIGNPLD;
 			if (toep->params.nagle != 0)
 				flags |= F_FW_OFLD_TX_DATA_WR_ALIGNPLDSHOVE;
 		}
 	}
 
 	txwr->op_to_immdlen = htobe32(V_WR_OP(fw_wr_opcode) |
 	    V_FW_WR_IMMDLEN(immdlen));
 	txwr->flowid_len16 = htobe32(V_FW_WR_FLOWID(toep->tid) |
 	    V_FW_WR_LEN16(credits));
 	txwr->lsodisable_to_flags = htobe32(flags);
 	txwr->plen = htobe32(plen);
 }
 
 /*
  * Generate a DSGL from a starting mbuf.  The total number of segments and the
  * maximum segments in any one mbuf are provided.
  *
  * dst receives a ulptx_sgl describing the mbufs from start up to (but not
  * including) stop.  nsegs is the total segment count written into the SGL
  * header; n bounds the number of segments a single mbuf may produce and
  * sizes the on-stack scratch sglist.  Panics if an mbuf cannot be appended
  * to the scratch sglist.
  */
 static void
 write_tx_sgl(void *dst, struct mbuf *start, struct mbuf *stop, int nsegs, int n)
 {
 	struct mbuf *m;
 	struct ulptx_sgl *usgl = dst;
 	int i, j, rc;
 	struct sglist sg;
 	struct sglist_seg segs[n];
 
 	KASSERT(nsegs > 0, ("%s: nsegs 0", __func__));
 
 	sglist_init(&sg, n, segs);
 	usgl->cmd_nsge = htobe32(V_ULPTX_CMD(ULP_TX_SC_DSGL) |
 	    V_ULPTX_NSGE(nsegs));
 
 	/*
 	 * i indexes the flattened sge[] pair array; it starts at -1 because
 	 * the very first segment lives in len0/addr0 of the header instead.
 	 */
 	i = -1;
 	for (m = start; m != stop; m = m->m_next) {
 		/* Resolve this one mbuf into physical segments (in segs[]). */
 		if (m->m_flags & M_EXTPG)
 			rc = sglist_append_mbuf_epg(&sg, m,
 			    mtod(m, vm_offset_t), m->m_len);
 		else
 			rc = sglist_append(&sg, mtod(m, void *), m->m_len);
 		if (__predict_false(rc != 0))
 			panic("%s: sglist_append %d", __func__, rc);
 
 		for (j = 0; j < sg.sg_nseg; i++, j++) {
 			if (i < 0) {
 				usgl->len0 = htobe32(segs[j].ss_len);
 				usgl->addr0 = htobe64(segs[j].ss_paddr);
 			} else {
 				/* Entry i is slot (i & 1) of pair (i / 2). */
 				usgl->sge[i / 2].len[i & 1] =
 				    htobe32(segs[j].ss_len);
 				usgl->sge[i / 2].addr[i & 1] =
 				    htobe64(segs[j].ss_paddr);
 			}
 #ifdef INVARIANTS
 			nsegs--;
 #endif
 		}
 		/* Reuse the scratch sglist for the next mbuf. */
 		sglist_reset(&sg);
 	}
 	/*
 	 * i now equals the number of entries placed in sge[].  If that is
 	 * odd, the last pair is only half used: zero the unused length.
 	 */
 	if (i & 1)
 		usgl->sge[i / 2].len[1] = htobe32(0);
 	KASSERT(nsegs == 0, ("%s: nsegs %d, start %p, stop %p",
 	    __func__, nsegs, start, stop));
 }
 
 /*
  * Max number of SGL entries an offload tx work request can have.  This is 41
  * (1 + 40) for a full 512B work request.
  * fw_ofld_tx_data_wr(16B) + ulptx_sgl(16B, 1) + ulptx_sge_pair(480B, 40)
  */
 #define OFLD_SGL_LEN (41)
 
 /*
  * Send data and/or a FIN to the peer.
  *
  * The socket's so_snd buffer consists of a stream of data starting with sb_mb
  * and linked together with m_next.  sb_sndptr, if set, is the last mbuf that
  * was transmitted.
  *
  * drop indicates the number of bytes that should be dropped from the head of
  * the send buffer.  It is an optimization that lets do_fw4_ack avoid creating
  * contention on the send buffer lock (before this change it used to do
  * sowwakeup and then t4_push_frames right after that when recovering from tx
  * stalls).  When drop is set this function MUST drop the bytes and wake up any
  * writers.
  */
 void
 t4_push_frames(struct adapter *sc, struct toepcb *toep, int drop)
 {
 	struct mbuf *sndptr, *m, *sb_sndptr;
 	struct fw_ofld_tx_data_wr *txwr;
 	struct wrqe *wr;
 	u_int plen, nsegs, credits, max_imm, max_nsegs, max_nsegs_1mbuf;
 	struct inpcb *inp = toep->inp;
 	struct tcpcb *tp = intotcpcb(inp);
 	struct socket *so = inp->inp_socket;
 	struct sockbuf *sb = &so->so_snd;
 	int tx_credits, shove, compl, sowwakeup;
 	struct ofld_tx_sdesc *txsd;
 	bool nomap_mbuf_seen;
 
 	INP_WLOCK_ASSERT(inp);
 	KASSERT(toep->flags & TPF_FLOWC_WR_SENT,
 	    ("%s: flowc_wr not sent for tid %u.", __func__, toep->tid));
 
 	KASSERT(ulp_mode(toep) == ULP_MODE_NONE ||
 	    ulp_mode(toep) == ULP_MODE_TCPDDP ||
 	    ulp_mode(toep) == ULP_MODE_TLS ||
 	    ulp_mode(toep) == ULP_MODE_RDMA,
 	    ("%s: ulp_mode %u for toep %p", __func__, ulp_mode(toep), toep));
 
 #ifdef VERBOSE_TRACES
 	CTR5(KTR_CXGBE, "%s: tid %d toep flags %#x tp flags %#x drop %d",
 	    __func__, toep->tid, toep->flags, tp->t_flags, drop);
 #endif
 	/* Nothing is transmitted once an abort/shutdown is in progress. */
 	if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN))
 		return;
 
 #ifdef RATELIMIT
 	/* Clear the "changed" flag only if the new rate was applied. */
 	if (__predict_false(inp->inp_flags2 & INP_RATE_LIMIT_CHANGED) &&
 	    (update_tx_rate_limit(sc, toep, so->so_max_pacing_rate) == 0)) {
 		inp->inp_flags2 &= ~INP_RATE_LIMIT_CHANGED;
 	}
 #endif
 
 	/*
 	 * This function doesn't resume by itself.  Someone else must clear the
 	 * flag and call this function.
 	 */
 	if (__predict_false(toep->flags & TPF_TX_SUSPENDED)) {
 		KASSERT(drop == 0,
 		    ("%s: drop (%d) != 0 but tx is suspended", __func__, drop));
 		return;
 	}
 
 	/*
 	 * txsd is the next tx descriptor slot to fill in; one slot is consumed
 	 * for every work request sent by the loop below, which builds and
 	 * sends at most one work request per iteration.
 	 */
 	txsd = &toep->txsd[toep->txsd_pidx];
 	do {
 		tx_credits = min(toep->tx_credits, MAX_OFLD_TX_CREDITS);
 		max_imm = max_imm_payload(tx_credits, 0);
 		max_nsegs = max_dsgl_nsegs(tx_credits, 0);
 
 		/*
 		 * The requested drop is applied once, on the first pass.
 		 * sowwakeup remembers that writers must be woken up before the
 		 * send buffer lock is released.
 		 */
 		SOCKBUF_LOCK(sb);
 		sowwakeup = drop;
 		if (drop) {
 			sbdrop_locked(sb, drop);
 			drop = 0;
 		}
 		/*
 		 * Start after the last mbuf recorded in sb_sndptr, or at the
 		 * head of the send buffer if sb_sndptr is not set.
 		 */
 		sb_sndptr = sb->sb_sndptr;
 		sndptr = sb_sndptr ? sb_sndptr->m_next : sb->sb_mb;
 		plen = 0;
 		nsegs = 0;
 		max_nsegs_1mbuf = 0; /* max # of SGL segments in any one mbuf */
 		nomap_mbuf_seen = false;
 		for (m = sndptr; m != NULL; m = m->m_next) {
 			int n;
 
 			/* Stop at the first mbuf flagged M_NOTAVAIL. */
 			if ((m->m_flags & M_NOTAVAIL) != 0)
 				break;
 			if (m->m_flags & M_EXTPG) {
 #ifdef KERN_TLS
 				/*
 				 * A KTLS mbuf switches the connection to
 				 * t4_push_ktls.  If data has already been
 				 * gathered in this pass it is sent first.
 				 */
 				if (m->m_epg_tls != NULL) {
 					toep->flags |= TPF_KTLS;
 					if (plen == 0) {
 						SOCKBUF_UNLOCK(sb);
 						t4_push_ktls(sc, toep, 0);
 						return;
 					}
 					break;
 				}
 #endif
 				n = sglist_count_mbuf_epg(m,
 				    mtod(m, vm_offset_t), m->m_len);
 			} else
 				n = sglist_count(mtod(m, void *), m->m_len);
 
 			nsegs += n;
 			plen += m->m_len;
 
 			/* This mbuf sent us _over_ the nsegs limit, back out */
 			if (plen > max_imm && nsegs > max_nsegs) {
 				nsegs -= n;
 				plen -= m->m_len;
 				if (plen == 0) {
 					/* Too few credits */
 					toep->flags |= TPF_TX_SUSPENDED;
 					/*
 					 * sowwakeup_locked releases the
 					 * send buffer lock (asserted below).
 					 */
 					if (sowwakeup) {
 						if (!TAILQ_EMPTY(
 						    &toep->aiotx_jobq))
 							t4_aiotx_queue_toep(so,
 							    toep);
 						sowwakeup_locked(so);
 					} else
 						SOCKBUF_UNLOCK(sb);
 					SOCKBUF_UNLOCK_ASSERT(sb);
 					return;
 				}
 				break;
 			}
 
 			if (m->m_flags & M_EXTPG)
 				nomap_mbuf_seen = true;
 			if (max_nsegs_1mbuf < n)
 				max_nsegs_1mbuf = n;
 			sb_sndptr = m;	/* new sb->sb_sndptr if all goes well */
 
 			/* This mbuf put us right at the max_nsegs limit */
 			if (plen > max_imm && nsegs == max_nsegs) {
 				m = m->m_next;
 				break;
 			}
 		}
 
 		/*
 		 * Ask for a completion when the send buffer is more than 5/8
 		 * full and the payload sent without a completion request
 		 * (including this WR) reaches a quarter of the high-water mark.
 		 */
 		if (sbused(sb) > sb->sb_hiwat * 5 / 8 &&
 		    toep->plen_nocompl + plen >= sb->sb_hiwat / 4)
 			compl = 1;
 		else
 			compl = 0;
 
 		/*
 		 * Send buffer autosizing: grow the buffer when it is at least
 		 * 7/8 full.  Autosizing is turned off if the reservation fails.
 		 */
 		if (sb->sb_flags & SB_AUTOSIZE &&
 		    V_tcp_do_autosndbuf &&
 		    sb->sb_hiwat < V_tcp_autosndbuf_max &&
 		    sbused(sb) >= sb->sb_hiwat * 7 / 8) {
 			int newsize = min(sb->sb_hiwat + V_tcp_autosndbuf_inc,
 			    V_tcp_autosndbuf_max);
 
 			if (!sbreserve_locked(so, SO_SND, newsize, NULL))
 				sb->sb_flags &= ~SB_AUTOSIZE;
 			else
 				sowwakeup = 1;	/* room available */
 		}
 		/* Either branch leaves the send buffer unlocked. */
 		if (sowwakeup) {
 			if (!TAILQ_EMPTY(&toep->aiotx_jobq))
 				t4_aiotx_queue_toep(so, toep);
 			sowwakeup_locked(so);
 		} else
 			SOCKBUF_UNLOCK(sb);
 		SOCKBUF_UNLOCK_ASSERT(sb);
 
 		/* nothing to send */
 		if (plen == 0) {
 			KASSERT(m == NULL || (m->m_flags & M_NOTAVAIL) != 0,
 			    ("%s: nothing to send, but m != NULL is ready",
 			    __func__));
 			break;
 		}
 
 		if (__predict_false(toep->flags & TPF_FIN_SENT))
 			panic("%s: excess tx.", __func__);
 
 		/*
 		 * shove is set when this WR takes everything that was walked
 		 * (m == NULL) and TF_MORETOCOME is not set on the tcpcb.
 		 */
 		shove = m == NULL && !(tp->t_flags & TF_MORETOCOME);
 		if (plen <= max_imm && !nomap_mbuf_seen) {
 
 			/* Immediate data tx */
 
 			wr = alloc_wrqe(roundup2(sizeof(*txwr) + plen, 16),
 					&toep->ofld_txq->wrq);
 			if (wr == NULL) {
 				/* XXX: how will we recover from this? */
 				toep->flags |= TPF_TX_SUSPENDED;
 				return;
 			}
 			txwr = wrtod(wr);
 			credits = howmany(wr->wr_len, 16);
 			write_tx_wr(txwr, toep, FW_OFLD_TX_DATA_WR, plen, plen,
 			    credits, shove, 0);
 			/* The payload is copied into the WR right after the header. */
 			m_copydata(sndptr, 0, plen, (void *)(txwr + 1));
 			nsegs = 0;
 		} else {
 			int wr_len;
 
 			/* DSGL tx */
 
 			wr_len = sizeof(*txwr) + sizeof(struct ulptx_sgl) +
 			    ((3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1)) * 8;
 			wr = alloc_wrqe(roundup2(wr_len, 16),
 			    &toep->ofld_txq->wrq);
 			if (wr == NULL) {
 				/* XXX: how will we recover from this? */
 				toep->flags |= TPF_TX_SUSPENDED;
 				return;
 			}
 			txwr = wrtod(wr);
 			credits = howmany(wr_len, 16);
 			write_tx_wr(txwr, toep, FW_OFLD_TX_DATA_WR, 0, plen,
 			    credits, shove, 0);
 			write_tx_sgl(txwr + 1, sndptr, m, nsegs,
 			    max_nsegs_1mbuf);
 			/*
 			 * The WR was allocated rounded up to 16 bytes; zero
 			 * the 8 bytes at offset wr_len when wr_len itself is
 			 * not a multiple of 16.
 			 */
 			if (wr_len & 0xf) {
 				uint64_t *pad = (uint64_t *)
 				    ((uintptr_t)txwr + wr_len);
 				*pad = 0;
 			}
 		}
 
 		KASSERT(toep->tx_credits >= credits,
 			("%s: not enough credits", __func__));
 
 		/* Charge the WR against the connection's tx credits. */
 		toep->tx_credits -= credits;
 		toep->tx_nocompl += credits;
 		toep->plen_nocompl += plen;
 		if (toep->tx_credits <= toep->tx_total * 3 / 8 &&
 		    toep->tx_nocompl >= toep->tx_total / 4)
 			compl = 1;
 
 		/* RDMA connections request a completion on every WR. */
 		if (compl || ulp_mode(toep) == ULP_MODE_RDMA) {
 			txwr->op_to_immdlen |= htobe32(F_FW_WR_COMPL);
 			toep->tx_nocompl = 0;
 			toep->plen_nocompl = 0;
 		}
 
 		tp->snd_nxt += plen;
 		tp->snd_max += plen;
 
 		/* Record how far into the send buffer this WR reached. */
 		SOCKBUF_LOCK(sb);
 		KASSERT(sb_sndptr, ("%s: sb_sndptr is NULL", __func__));
 		sb->sb_sndptr = sb_sndptr;
 		SOCKBUF_UNLOCK(sb);
 
 		toep->flags |= TPF_TX_DATA_SENT;
 		if (toep->tx_credits < MIN_OFLD_TX_CREDITS)
 			toep->flags |= TPF_TX_SUSPENDED;
 
 		/*
 		 * Save the payload length and credits of this WR in the tx
 		 * descriptor ring, wrapping the producer index at the end.
 		 */
 		KASSERT(toep->txsd_avail > 0, ("%s: no txsd", __func__));
 		txsd->plen = plen;
 		txsd->tx_credits = credits;
 		txsd++;
 		if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) {
 			toep->txsd_pidx = 0;
 			txsd = &toep->txsd[0];
 		}
 		toep->txsd_avail--;
 
 		t4_l2t_send(sc, wr, toep->l2te);
 	} while (m != NULL && (m->m_flags & M_NOTAVAIL) == 0);
 
 	/* Send a FIN if requested, but only if there's no more data to send */
 	if (m == NULL && toep->flags & TPF_SEND_FIN)
 		t4_close_conn(sc, toep);
 }
 
 /*
  * Dequeue and free whole packets from q until plen bytes have been
  * accounted for.  plen must cover complete packets only; running out of
  * packets or landing in the middle of one trips an assertion.
  */
 static inline void
 rqdrop_locked(struct mbufq *q, int plen)
 {
 	struct mbuf *m;
 
 	if (plen <= 0)
 		return;
 
 	do {
 		m = mbufq_dequeue(q);
 
 		/* The queue must not run dry before plen is used up. */
 		MPASS(m != NULL);
 		M_ASSERTPKTHDR(m);
 
 		/* Only whole packets may be reclaimed. */
 		MPASS(plen >= m->m_pkthdr.len);
 
 		plen -= m->m_pkthdr.len;
 		m_freem(m);
 	} while (plen > 0);
 }
 
 /*
  * Not a bit in the TCB, but is a bit in the ulp_submode field of the
  * CPL_TX_DATA flags field in FW_ISCSI_TX_DATA_WR.
  */
 #define	ULP_ISO		G_TX_ULP_SUBMODE(F_FW_ISCSI_TX_DATA_WR_ULPSUBMODE_ISO)
 
 /*
  * Fill in the CPL_TX_DATA_ISO header at dst for a large PDU of len bytes
  * that is to be sliced using mss.  ulp_submode selects header/data CRC
  * insertion and flags carries the ISO type and the 'F' bit of the original
  * PDU.  npdu is not used here.
  */
 static void
 write_tx_data_iso(void *dst, u_int ulp_submode, uint8_t flags, uint16_t mss,
     int len, int npdu)
 {
 	struct cpl_tx_data_iso *iso_cpl = dst;
 	unsigned int burst, final_slice;
 	uint32_t ctrl;
 
 	/*
 	 * The firmware sets the 'F' bit on the last PDU if this large PDU
 	 * is marked as the "last" slice or if the amount of payload equals
 	 * the burst size.  The burst size is therefore set artificially
 	 * high (len includes the size of the template BHS) and "last" is
 	 * requested only if the original PDU had 'F' set.
 	 */
 	burst = len;
 	final_slice = !!(flags & CXGBE_ISO_F);
 
 	ctrl = V_CPL_TX_DATA_ISO_OP(CPL_TX_DATA_ISO);
 	ctrl |= V_CPL_TX_DATA_ISO_FIRST(1);
 	ctrl |= V_CPL_TX_DATA_ISO_LAST(final_slice);
 	ctrl |= V_CPL_TX_DATA_ISO_CPLHDRLEN(0);
 	ctrl |= V_CPL_TX_DATA_ISO_HDRCRC(!!(ulp_submode & ULP_CRC_HEADER));
 	ctrl |= V_CPL_TX_DATA_ISO_PLDCRC(!!(ulp_submode & ULP_CRC_DATA));
 	ctrl |= V_CPL_TX_DATA_ISO_IMMEDIATE(0);
 	ctrl |= V_CPL_TX_DATA_ISO_SCSI(CXGBE_ISO_TYPE(flags));
 	iso_cpl->op_to_scsi = htonl(ctrl);
 
 	iso_cpl->ahs_len = 0;
 	/* mpdu and burst_size are expressed in units of 4 bytes. */
 	iso_cpl->mpdu = htons(DIV_ROUND_UP(mss, 4));
 	iso_cpl->burst_size = htonl(DIV_ROUND_UP(burst, 4));
 	iso_cpl->len = htonl(len);
 	iso_cpl->reserved2_seglen_offset = htonl(0);
 	iso_cpl->datasn_offset = htonl(0);
 	iso_cpl->buffer_offset = htonl(0);
 	iso_cpl->reserved3 = 0;
 }
 
 /*
  * Build the work request for the packet 'sndptr', which is either a raw WR
  * or a single iSCSI PDU (optionally an ISO "large PDU").
  *
  * Returns the allocated WR, or NULL if the packet does not fit in the tx
  * credits currently available or the WR could not be allocated.  On success
  * for a PDU, tp->snd_nxt and tp->snd_max are advanced and the iSCSI tx
  * counters are updated; toep->tx_credits is not modified here.
  */
 static struct wrqe *
 write_iscsi_mbuf_wr(struct toepcb *toep, struct mbuf *sndptr)
 {
 	struct mbuf *m;
 	struct fw_ofld_tx_data_wr *txwr;
 	struct cpl_tx_data_iso *cpl_iso;
 	void *p;
 	struct wrqe *wr;
 	u_int plen, nsegs, credits, max_imm, max_nsegs, max_nsegs_1mbuf;
 	u_int adjusted_plen, imm_data, ulp_submode;
 	struct inpcb *inp = toep->inp;
 	struct tcpcb *tp = intotcpcb(inp);
 	int tx_credits, shove, npdu, wr_len;
 	uint16_t iso_mss;
 	/*
 	 * Extra bytes added per PDU, indexed by ulp_submode.
 	 * NOTE(review): presumably none / header digest / data digest /
 	 * both digests -- confirm against the ULP_CRC_* definitions.
 	 */
 	static const u_int ulp_extra_len[] = {0, 4, 4, 8};
 	bool iso, nomap_mbuf_seen;
 
 	M_ASSERTPKTHDR(sndptr);
 
 	tx_credits = min(toep->tx_credits, MAX_OFLD_TX_CREDITS);
 	/*
 	 * A raw WR is copied verbatim into a new WR of the same length
 	 * (rounded up to 16); no TCP sequence space accounting is done here.
 	 */
 	if (mbuf_raw_wr(sndptr)) {
 		plen = sndptr->m_pkthdr.len;
 		KASSERT(plen <= SGE_MAX_WR_LEN,
 		    ("raw WR len %u is greater than max WR len", plen));
 		if (plen > tx_credits * 16)
 			return (NULL);
 
 		wr = alloc_wrqe(roundup2(plen, 16), &toep->ofld_txq->wrq);
 		if (__predict_false(wr == NULL))
 			return (NULL);
 
 		m_copydata(sndptr, 0, plen, wrtod(wr));
 		return (wr);
 	}
 
 	iso = mbuf_iscsi_iso(sndptr);
 	max_imm = max_imm_payload(tx_credits, iso);
 	max_nsegs = max_dsgl_nsegs(tx_credits, iso);
 	iso_mss = mbuf_iscsi_iso_mss(sndptr);
 
 	/* Total up the payload length and SGL segments of the whole chain. */
 	plen = 0;
 	nsegs = 0;
 	max_nsegs_1mbuf = 0; /* max # of SGL segments in any one mbuf */
 	nomap_mbuf_seen = false;
 	for (m = sndptr; m != NULL; m = m->m_next) {
 		int n;
 
 		if (m->m_flags & M_EXTPG)
 			n = sglist_count_mbuf_epg(m, mtod(m, vm_offset_t),
 			    m->m_len);
 		else
 			n = sglist_count(mtod(m, void *), m->m_len);
 
 		nsegs += n;
 		plen += m->m_len;
 
 		/*
 		 * This mbuf would send us _over_ the nsegs limit.
 		 * Suspend tx because the PDU can't be sent out.
 		 */
 		if ((nomap_mbuf_seen || plen > max_imm) && nsegs > max_nsegs)
 			return (NULL);
 
 		if (m->m_flags & M_EXTPG)
 			nomap_mbuf_seen = true;
 		if (max_nsegs_1mbuf < n)
 			max_nsegs_1mbuf = n;
 	}
 
 	if (__predict_false(toep->flags & TPF_FIN_SENT))
 		panic("%s: excess tx.", __func__);
 
 	/*
 	 * We have a PDU to send.  All of it goes out in one WR so 'm'
 	 * is NULL.  A PDU's length is always a multiple of 4.
 	 */
 	MPASS(m == NULL);
 	MPASS((plen & 3) == 0);
 	MPASS(sndptr->m_pkthdr.len == plen);
 
 	shove = !(tp->t_flags & TF_MORETOCOME);
 
 	/*
 	 * plen doesn't include header and data digests, which are
 	 * generated and inserted in the right places by the TOE, but
 	 * they do occupy TCP sequence space and need to be accounted
 	 * for.
 	 */
 	ulp_submode = mbuf_ulp_submode(sndptr);
 	MPASS(ulp_submode < nitems(ulp_extra_len));
 	/*
 	 * For ISO the payload after the template BHS is split into iso_mss
 	 * sized PDUs; every PDU after the first adds another BHS to the
 	 * sequence space consumed.
 	 */
 	npdu = iso ? howmany(plen - ISCSI_BHS_SIZE, iso_mss) : 1;
 	adjusted_plen = plen + ulp_extra_len[ulp_submode] * npdu;
 	if (iso)
 		adjusted_plen += ISCSI_BHS_SIZE * (npdu - 1);
 	wr_len = sizeof(*txwr);
 	if (iso)
 		wr_len += sizeof(struct cpl_tx_data_iso);
 	if (plen <= max_imm && !nomap_mbuf_seen) {
 		/* Immediate data tx */
 		imm_data = plen;
 		wr_len += plen;
 		nsegs = 0;
 	} else {
 		/* DSGL tx */
 		imm_data = 0;
 		wr_len += sizeof(struct ulptx_sgl) +
 		    ((3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1)) * 8;
 	}
 
 	wr = alloc_wrqe(roundup2(wr_len, 16), &toep->ofld_txq->wrq);
 	if (wr == NULL) {
 		/* XXX: how will we recover from this? */
 		return (NULL);
 	}
 	txwr = wrtod(wr);
 	credits = howmany(wr->wr_len, 16);
 
 	/* 'p' ends up pointing at where the payload or SGL is written. */
 	if (iso) {
 		write_tx_wr(txwr, toep, FW_ISCSI_TX_DATA_WR,
 		    imm_data + sizeof(struct cpl_tx_data_iso),
 		    adjusted_plen, credits, shove, ulp_submode | ULP_ISO);
 		cpl_iso = (struct cpl_tx_data_iso *)(txwr + 1);
 		MPASS(plen == sndptr->m_pkthdr.len);
 		write_tx_data_iso(cpl_iso, ulp_submode,
 		    mbuf_iscsi_iso_flags(sndptr), iso_mss, plen, npdu);
 		p = cpl_iso + 1;
 	} else {
 		write_tx_wr(txwr, toep, FW_OFLD_TX_DATA_WR, imm_data,
 		    adjusted_plen, credits, shove, ulp_submode);
 		p = txwr + 1;
 	}
 
 	if (imm_data != 0) {
 		m_copydata(sndptr, 0, plen, p);
 	} else {
 		write_tx_sgl(p, sndptr, m, nsegs, max_nsegs_1mbuf);
 		/*
 		 * The WR was allocated rounded up to 16 bytes; zero the 8
 		 * bytes at offset wr_len when wr_len is not a multiple of 16.
 		 */
 		if (wr_len & 0xf) {
 			uint64_t *pad = (uint64_t *)((uintptr_t)txwr + wr_len);
 			*pad = 0;
 		}
 	}
 
 	KASSERT(toep->tx_credits >= credits,
 	    ("%s: not enough credits: credits %u "
 		"toep->tx_credits %u tx_credits %u nsegs %u "
 		"max_nsegs %u iso %d", __func__, credits,
 		toep->tx_credits, tx_credits, nsegs, max_nsegs, iso));
 
 	/* Sequence space includes the digests and any extra ISO headers. */
 	tp->snd_nxt += adjusted_plen;
 	tp->snd_max += adjusted_plen;
 
 	counter_u64_add(toep->ofld_txq->tx_iscsi_pdus, npdu);
 	counter_u64_add(toep->ofld_txq->tx_iscsi_octets, plen);
 	if (iso)
 		counter_u64_add(toep->ofld_txq->tx_iscsi_iso_wrs, 1);
 
 	return (wr);
 }
 
 /*
  * Transmit the PDUs queued on toep->ulp_pduq of an iSCSI connection.
  *
  * drop is the number of bytes that have been acknowledged and can be
  * reclaimed.  Any data still sitting in so_snd is dropped first and the
  * remainder is reclaimed from toep->ulp_pdu_reclaimq.  drop must be 0 if
  * tx is suspended.
  *
  * Each PDU that fits in the available tx credits is turned into a work
  * request, moved to the reclaim queue and sent.  Tx is suspended when a
  * work request cannot be built or the credits fall below
  * MIN_OFLD_TX_CREDITS.  The inp must be write-locked by the caller.
  */
 void
 t4_push_pdus(struct adapter *sc, struct toepcb *toep, int drop)
 {
 	struct mbuf *sndptr, *m;
 	struct fw_wr_hdr *wrhdr;
 	struct wrqe *wr;
 	u_int plen, credits;
 	struct inpcb *inp = toep->inp;
 	struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx];
 	struct mbufq *pduq = &toep->ulp_pduq;
 
 	INP_WLOCK_ASSERT(inp);
 	KASSERT(toep->flags & TPF_FLOWC_WR_SENT,
 	    ("%s: flowc_wr not sent for tid %u.", __func__, toep->tid));
 	KASSERT(ulp_mode(toep) == ULP_MODE_ISCSI,
 	    ("%s: ulp_mode %u for toep %p", __func__, ulp_mode(toep), toep));
 
 	if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN))
 		return;
 
 	/*
 	 * This function doesn't resume by itself.  Someone else must clear the
 	 * flag and call this function.
 	 */
 	if (__predict_false(toep->flags & TPF_TX_SUSPENDED)) {
 		KASSERT(drop == 0,
 		    ("%s: drop (%d) != 0 but tx is suspended", __func__, drop));
 		return;
 	}
 
 	if (drop) {
 		struct socket *so = inp->inp_socket;
 		struct sockbuf *sb = &so->so_snd;
 		int sbu;
 
 		/*
 		 * An unlocked read is ok here as the data should only
 		 * transition from a non-zero value to either another
 		 * non-zero value or zero.  Once it is zero it should
 		 * stay zero.
 		 *
 		 * The branch hint covers the whole comparison, not just
 		 * the byte count.
 		 */
 		if (__predict_false(sbused(sb) > 0)) {
 			SOCKBUF_LOCK(sb);
 			sbu = sbused(sb);
 			if (sbu > 0) {
 				/*
 				 * The data transmitted before the
 				 * tid's ULP mode changed to ISCSI is
 				 * still in so_snd.  Incoming credits
 				 * should account for so_snd first.
 				 */
 				sbdrop_locked(sb, min(sbu, drop));
 				drop -= min(sbu, drop);
 			}
 			sowwakeup_locked(so);	/* unlocks so_snd */
 		}
 		/* Whatever is left of drop is reclaimed from sent PDUs. */
 		rqdrop_locked(&toep->ulp_pdu_reclaimq, drop);
 	}
 
 	while ((sndptr = mbufq_first(pduq)) != NULL) {
 		/* The PDU stays on pduq unless a WR could be built for it. */
 		wr = write_iscsi_mbuf_wr(toep, sndptr);
 		if (wr == NULL) {
 			toep->flags |= TPF_TX_SUSPENDED;
 			return;
 		}
 
 		plen = sndptr->m_pkthdr.len;
 		credits = howmany(wr->wr_len, 16);
 		KASSERT(toep->tx_credits >= credits,
 			("%s: not enough credits", __func__));
 
 		/* Keep the PDU around until its credits are returned. */
 		m = mbufq_dequeue(pduq);
 		MPASS(m == sndptr);
 		mbufq_enqueue(&toep->ulp_pdu_reclaimq, m);
 
 		toep->tx_credits -= credits;
 		toep->tx_nocompl += credits;
 		toep->plen_nocompl += plen;
 
 		/*
 		 * Ensure there are enough credits for a full-sized WR
 		 * as page pod WRs can be full-sized.
 		 */
 		if (toep->tx_credits <= SGE_MAX_WR_LEN * 5 / 4 &&
 		    toep->tx_nocompl >= toep->tx_total / 4) {
 			wrhdr = wrtod(wr);
 			wrhdr->hi |= htobe32(F_FW_WR_COMPL);
 			toep->tx_nocompl = 0;
 			toep->plen_nocompl = 0;
 		}
 
 		toep->flags |= TPF_TX_DATA_SENT;
 		if (toep->tx_credits < MIN_OFLD_TX_CREDITS)
 			toep->flags |= TPF_TX_SUSPENDED;
 
 		/*
 		 * Record the WR in the tx descriptor ring, wrapping the
 		 * producer index at the end of the ring.
 		 */
 		KASSERT(toep->txsd_avail > 0, ("%s: no txsd", __func__));
 		txsd->plen = plen;
 		txsd->tx_credits = credits;
 		txsd++;
 		if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) {
 			toep->txsd_pidx = 0;
 			txsd = &toep->txsd[0];
 		}
 		toep->txsd_avail--;
 
 		t4_l2t_send(sc, wr, toep->l2te);
 	}
 
 	/* Send a FIN if requested, but only if there are no more PDUs to send */
 	if (mbufq_first(pduq) == NULL && toep->flags & TPF_SEND_FIN)
 		t4_close_conn(sc, toep);
 }
 
 /*
  * Hand 'drop' and any pending tx work to the transmit routine that matches
  * the connection: PDUs for iSCSI, KTLS records once TPF_KTLS is set, and
  * plain frames otherwise.
  */
 static inline void
 t4_push_data(struct adapter *sc, struct toepcb *toep, int drop)
 {
 
 	if (ulp_mode(toep) == ULP_MODE_ISCSI) {
 		t4_push_pdus(sc, toep, drop);
 		return;
 	}
 
 	if (toep->flags & TPF_KTLS) {
 		t4_push_ktls(sc, toep, drop);
 		return;
 	}
 
 	t4_push_frames(sc, toep, drop);
 }
 
 /*
  * Push out whatever is pending for the offloaded connection behind tp.
  * Nothing is dropped from the send buffer.  Always returns 0.
  */
 int
 t4_tod_output(struct toedev *tod, struct tcpcb *tp)
 {
 	struct toepcb *toep;
 	struct adapter *sc;
 #ifdef INVARIANTS
 	struct inpcb *inp;
 #endif
 
 	sc = tod->tod_softc;
 #ifdef INVARIANTS
 	inp = tp->t_inpcb;
 #endif
 	toep = tp->t_toe;
 
 	INP_WLOCK_ASSERT(inp);
 	KASSERT((inp->inp_flags & INP_DROPPED) == 0,
 	    ("%s: inp %p dropped.", __func__, inp));
 	KASSERT(toep != NULL, ("%s: toep is NULL", __func__));
 
 	t4_push_data(sc, toep, 0);
 
 	return (0);
 }
 
 /*
  * Note that a FIN has been requested for the connection behind tp.  If the
  * connection is at least ESTABLISHED the transmit path is run right away;
  * it sends the FIN once there is nothing left to transmit.  Always
  * returns 0.
  */
 int
 t4_send_fin(struct toedev *tod, struct tcpcb *tp)
 {
 	struct toepcb *toep;
 	struct adapter *sc;
 #ifdef INVARIANTS
 	struct inpcb *inp;
 #endif
 
 	sc = tod->tod_softc;
 #ifdef INVARIANTS
 	inp = tp->t_inpcb;
 #endif
 	toep = tp->t_toe;
 
 	INP_WLOCK_ASSERT(inp);
 	KASSERT((inp->inp_flags & INP_DROPPED) == 0,
 	    ("%s: inp %p dropped.", __func__, inp));
 	KASSERT(toep != NULL, ("%s: toep is NULL", __func__));
 
 	toep->flags |= TPF_SEND_FIN;
 	if (tp->t_state < TCPS_ESTABLISHED)
 		return (0);
 
 	t4_push_data(sc, toep, 0);
 	return (0);
 }
 
 /*
  * Reset the offloaded connection behind tp by calling send_reset.  The
  * flowc work request must already have been sent.  Always returns 0.
  */
 int
 t4_send_rst(struct toedev *tod, struct tcpcb *tp)
 {
 	struct toepcb *toep;
 	struct adapter *sc;
 #if defined(INVARIANTS)
 	struct inpcb *inp;
 #endif
 
 	sc = tod->tod_softc;
 #if defined(INVARIANTS)
 	inp = tp->t_inpcb;
 #endif
 	toep = tp->t_toe;
 
 	INP_WLOCK_ASSERT(inp);
 	KASSERT((inp->inp_flags & INP_DROPPED) == 0,
 	    ("%s: inp %p dropped.", __func__, inp));
 	KASSERT(toep != NULL, ("%s: toep is NULL", __func__));
 
 	/* hmmmm */
 	KASSERT(toep->flags & TPF_FLOWC_WR_SENT,
 	    ("%s: flowc for tid %u [%s] not sent already",
 	    __func__, toep->tid, tcpstates[tp->t_state]));
 
 	send_reset(sc, toep, 0);
 
 	return (0);
 }
 
 /*
  * Peer has sent us a FIN.
  *
  * Handler for CPL_PEER_CLOSE.  Marks the socket as unable to receive more
  * data, updates rcv_nxt from the CPL and advances the TCP state machine.
  * Always returns 0.
  */
 static int
 do_peer_close(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
 {
 	struct adapter *sc = iq->adapter;
 	const struct cpl_peer_close *cpl = (const void *)(rss + 1);
 	unsigned int tid = GET_TID(cpl);
 	struct toepcb *toep = lookup_tid(sc, tid);
 	struct inpcb *inp = toep->inp;
 	struct tcpcb *tp = NULL;
 	struct socket *so;
 	struct epoch_tracker et;
 #ifdef INVARIANTS
 	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
 #endif
 
 	KASSERT(opcode == CPL_PEER_CLOSE,
 	    ("%s: unexpected opcode 0x%x", __func__, opcode));
 	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
 
 	if (__predict_false(toep->flags & TPF_SYNQE)) {
 		/*
 		 * do_pass_establish must have run before do_peer_close and if
 		 * this is still a synqe instead of a toepcb then the connection
 		 * must be getting aborted.
 		 */
 		MPASS(toep->flags & TPF_ABORT_SHUTDOWN);
 		CTR4(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x)", __func__, tid,
 		    toep, toep->flags);
 		return (0);
 	}
 
 	KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));
 
 	CURVNET_SET(toep->vnet);
 	NET_EPOCH_ENTER(et);
 	INP_WLOCK(inp);
 	tp = intotcpcb(inp);
 
 	CTR6(KTR_CXGBE,
 	    "%s: tid %u (%s), toep_flags 0x%x, ddp_flags 0x%x, inp %p",
 	    __func__, tid, tp ? tcpstates[tp->t_state] : "no tp", toep->flags,
 	    toep->ddp.flags, inp);
 
 	/* The FIN is ignored if the connection is already being aborted. */
 	if (toep->flags & TPF_ABORT_SHUTDOWN)
 		goto done;
 
 	so = inp->inp_socket;
 	socantrcvmore(so);
 	if (ulp_mode(toep) == ULP_MODE_TCPDDP) {
 		/* Let DDP deal with the close if a DDP buffer is active. */
 		DDP_LOCK(toep);
 		if (__predict_false(toep->ddp.flags &
 		    (DDP_BUF0_ACTIVE | DDP_BUF1_ACTIVE)))
 			handle_ddp_close(toep, tp, cpl->rcv_nxt);
 		DDP_UNLOCK(toep);
 	}
 
 	if (ulp_mode(toep) == ULP_MODE_RDMA ||
 	    (ulp_mode(toep) == ULP_MODE_ISCSI && chip_id(sc) >= CHELSIO_T6)) {
 		/*
 		 * There might be data received via DDP before the FIN
 		 * not reported to the driver.  Just assume the
 		 * sequence number in the CPL is correct as the
 		 * sequence number of the FIN.
 		 */
 	} else {
 		KASSERT(tp->rcv_nxt + 1 == be32toh(cpl->rcv_nxt),
 		    ("%s: rcv_nxt mismatch: %u %u", __func__, tp->rcv_nxt,
 		    be32toh(cpl->rcv_nxt)));
 	}
 
 	tp->rcv_nxt = be32toh(cpl->rcv_nxt);
 
 	switch (tp->t_state) {
 	case TCPS_SYN_RECEIVED:
 		tp->t_starttime = ticks;
 		/* FALLTHROUGH */ 
 
 	case TCPS_ESTABLISHED:
 		tcp_state_change(tp, TCPS_CLOSE_WAIT);
 		break;
 
 	case TCPS_FIN_WAIT_1:
 		tcp_state_change(tp, TCPS_CLOSING);
 		break;
 
 	case TCPS_FIN_WAIT_2:
 		/*
 		 * Both sides have closed.  tcp_twstart returns with the inp
 		 * unlocked (asserted below), so the lock is taken again
 		 * before final_cpl_received.
 		 * NOTE(review): final_cpl_received presumably releases the
 		 * inp lock, as nothing here unlocks it afterwards -- confirm.
 		 */
 		restore_so_proto(so, inp->inp_vflag & INP_IPV6);
 		tcp_twstart(tp);
 		INP_UNLOCK_ASSERT(inp);	 /* safe, we have a ref on the inp */
 		NET_EPOCH_EXIT(et);
 		CURVNET_RESTORE();
 
 		INP_WLOCK(inp);
 		final_cpl_received(toep);
 		return (0);
 
 	default:
 		log(LOG_ERR, "%s: TID %u received CPL_PEER_CLOSE in state %d\n",
 		    __func__, tid, tp->t_state);
 	}
 done:
 	INP_WUNLOCK(inp);
 	NET_EPOCH_EXIT(et);
 	CURVNET_RESTORE();
 	return (0);
 }
 
 /*
  * Peer has ACK'd our FIN.
  *
  * Handler for CPL_CLOSE_CON_RPL.  Updates snd_una from the CPL and advances
  * the TCP state machine.  Always returns 0.
  */
 static int
 do_close_con_rpl(struct sge_iq *iq, const struct rss_header *rss,
     struct mbuf *m)
 {
 	struct adapter *sc = iq->adapter;
 	const struct cpl_close_con_rpl *cpl = (const void *)(rss + 1);
 	unsigned int tid = GET_TID(cpl);
 	struct toepcb *toep = lookup_tid(sc, tid);
 	struct inpcb *inp = toep->inp;
 	struct tcpcb *tp = NULL;
 	struct socket *so = NULL;
 	struct epoch_tracker et;
 #ifdef INVARIANTS
 	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
 #endif
 
 	KASSERT(opcode == CPL_CLOSE_CON_RPL,
 	    ("%s: unexpected opcode 0x%x", __func__, opcode));
 	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
 	KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));
 
 	CURVNET_SET(toep->vnet);
 	NET_EPOCH_ENTER(et);
 	INP_WLOCK(inp);
 	tp = intotcpcb(inp);
 
 	CTR4(KTR_CXGBE, "%s: tid %u (%s), toep_flags 0x%x",
 	    __func__, tid, tp ? tcpstates[tp->t_state] : "no tp", toep->flags);
 
 	/* The reply is ignored if the connection is already being aborted. */
 	if (toep->flags & TPF_ABORT_SHUTDOWN)
 		goto done;
 
 	so = inp->inp_socket;
 	tp->snd_una = be32toh(cpl->snd_nxt) - 1;	/* exclude FIN */
 
 	switch (tp->t_state) {
 	case TCPS_CLOSING:	/* see TCPS_FIN_WAIT_2 in do_peer_close too */
 		restore_so_proto(so, inp->inp_vflag & INP_IPV6);
 		tcp_twstart(tp);
 release:
 		/*
 		 * Reached with the inp unlocked, either from TCPS_CLOSING
 		 * above or by the goto in TCPS_LAST_ACK below.  The lock is
 		 * taken again before final_cpl_received.
 		 * NOTE(review): final_cpl_received presumably releases the
 		 * inp lock, as nothing here unlocks it afterwards -- confirm.
 		 */
 		INP_UNLOCK_ASSERT(inp);	/* safe, we have a ref on the inp */
 		NET_EPOCH_EXIT(et);
 		CURVNET_RESTORE();
 
 		INP_WLOCK(inp);
 		final_cpl_received(toep);	/* no more CPLs expected */
 
 		return (0);
 	case TCPS_LAST_ACK:
 		/*
 		 * The inp is unlocked here only if tcp_close returned a
 		 * tcpcb.
 		 * NOTE(review): presumably a NULL return means tcp_close
 		 * already dropped the lock -- confirm.
 		 */
 		if (tcp_close(tp))
 			INP_WUNLOCK(inp);
 		goto release;
 
 	case TCPS_FIN_WAIT_1:
 		/* The peer's FIN was already received if rcv is shut. */
 		if (so->so_rcv.sb_state & SBS_CANTRCVMORE)
 			soisdisconnected(so);
 		tcp_state_change(tp, TCPS_FIN_WAIT_2);
 		break;
 
 	default:
 		log(LOG_ERR,
 		    "%s: TID %u received CPL_CLOSE_CON_RPL in state %s\n",
 		    __func__, tid, tcpstates[tp->t_state]);
 	}
 done:
 	INP_WUNLOCK(inp);
 	NET_EPOCH_EXIT(et);
 	CURVNET_RESTORE();
 	return (0);
 }
 
 /*
  * Send a CPL_ABORT_RPL for tid on ofld_txq, with rst_status as the command.
  * Panics if the work request cannot be allocated.
  */
 void
 send_abort_rpl(struct adapter *sc, struct sge_ofld_txq *ofld_txq, int tid,
     int rst_status)
 {
 	struct cpl_abort_rpl *rpl;
 	struct wrqe *wr;
 
 	wr = alloc_wrqe(sizeof(struct cpl_abort_rpl), &ofld_txq->wrq);
 	if (wr == NULL)
 		panic("%s: allocation failure.", __func__);	/* XXX */
 
 	rpl = wrtod(wr);
 	INIT_TP_WR_MIT_CPL(rpl, CPL_ABORT_RPL, tid);
 	rpl->cmd = rst_status;
 
 	t4_wrq_tx(sc, wr);
 }
 
 static int
 abort_status_to_errno(struct tcpcb *tp, unsigned int abort_reason)
 {
 	switch (abort_reason) {
 	case CPL_ERR_BAD_SYN:
 	case CPL_ERR_CONN_RESET:
 		return (tp->t_state == TCPS_CLOSE_WAIT ? EPIPE : ECONNRESET);
 	case CPL_ERR_XMIT_TIMEDOUT:
 	case CPL_ERR_PERSIST_TIMEDOUT:
 	case CPL_ERR_FINWAIT2_TIMEDOUT:
 	case CPL_ERR_KEEPALIVE_TIMEDOUT:
 		return (ETIMEDOUT);
 	default:
 		return (EIO);
 	}
 }
 
 /*
  * TCP RST from the peer, timeout, or some other such critical error.
  *
  * Handler for CPL_ABORT_REQ_RSS.  Negative advice is ignored.  Otherwise
  * the connection is torn down here, unless an abort is already in progress,
  * and a CPL_ABORT_RPL is sent in reply.  Always returns 0.
  */
 static int
 do_abort_req(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
 {
 	struct adapter *sc = iq->adapter;
 	const struct cpl_abort_req_rss *cpl = (const void *)(rss + 1);
 	unsigned int tid = GET_TID(cpl);
 	struct toepcb *toep = lookup_tid(sc, tid);
 	/* Saved up front because it is used after final_cpl_received(toep). */
 	struct sge_ofld_txq *ofld_txq = toep->ofld_txq;
 	struct inpcb *inp;
 	struct tcpcb *tp;
 	struct epoch_tracker et;
 #ifdef INVARIANTS
 	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
 #endif
 
 	KASSERT(opcode == CPL_ABORT_REQ_RSS,
 	    ("%s: unexpected opcode 0x%x", __func__, opcode));
 	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
 
 	/* Embryonic connections have their own handler. */
 	if (toep->flags & TPF_SYNQE)
 		return (do_abort_req_synqe(iq, rss, m));
 
 	KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));
 
 	if (negative_advice(cpl->status)) {
 		CTR4(KTR_CXGBE, "%s: negative advice %d for tid %d (0x%x)",
 		    __func__, cpl->status, tid, toep->flags);
 		return (0);	/* Ignore negative advice */
 	}
 
 	inp = toep->inp;
 	CURVNET_SET(toep->vnet);
 	NET_EPOCH_ENTER(et);	/* for tcp_close */
 	INP_WLOCK(inp);
 
 	tp = intotcpcb(inp);
 
 	CTR6(KTR_CXGBE,
 	    "%s: tid %d (%s), toep_flags 0x%x, inp_flags 0x%x, status %d",
 	    __func__, tid, tp ? tcpstates[tp->t_state] : "no tp", toep->flags,
 	    inp->inp_flags, cpl->status);
 
 	/*
 	 * If we'd initiated an abort earlier the reply to it is responsible for
 	 * cleaning up resources.  Otherwise we tear everything down right here
 	 * right now.  We owe the T4 a CPL_ABORT_RPL no matter what.
 	 */
 	if (toep->flags & TPF_ABORT_SHUTDOWN) {
 		INP_WUNLOCK(inp);
 		goto done;
 	}
 	toep->flags |= TPF_ABORT_SHUTDOWN;
 
 	/*
 	 * Report the error on the socket (if there still is one) and close
 	 * the connection, unless the inp has already been dropped.
 	 */
-	if ((inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) == 0) {
+	if ((inp->inp_flags & INP_DROPPED) == 0) {
 		struct socket *so = inp->inp_socket;
 
 		if (so != NULL)
 			so_error_set(so, abort_status_to_errno(tp,
 			    cpl->status));
 		tp = tcp_close(tp);
 		if (tp == NULL)
 			INP_WLOCK(inp);	/* re-acquire */
 	}
 
 	/*
 	 * NOTE(review): final_cpl_received presumably releases the inp lock,
 	 * as nothing on this path unlocks it afterwards -- confirm.
 	 */
 	final_cpl_received(toep);
 done:
 	NET_EPOCH_EXIT(et);
 	CURVNET_RESTORE();
 	send_abort_rpl(sc, ofld_txq, tid, CPL_ABORT_NO_RST);
 	return (0);
 }
 
 /*
  * Handler for CPL_ABORT_RPL_RSS, the reply to a CPL_ABORT_REQ that this
  * driver sent earlier (send_reset).  Replies for synq entries are passed on
  * to do_abort_rpl_synqe; for everything else this is treated as the final
  * CPL for the tid.  Always returns 0 for the non-synqe case.
  */
 static int
 do_abort_rpl(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
 {
 	const struct cpl_abort_rpl_rss *cpl;
 	struct adapter *sc;
 	struct toepcb *toep;
 	struct inpcb *inp;
 	unsigned int tid;
 #ifdef INVARIANTS
 	unsigned int opcode;
 #endif
 
 	sc = iq->adapter;
 	cpl = (const void *)(rss + 1);
 	tid = GET_TID(cpl);
 	toep = lookup_tid(sc, tid);
 	inp = toep->inp;
 #ifdef INVARIANTS
 	opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
 #endif
 
 	KASSERT(opcode == CPL_ABORT_RPL_RSS,
 	    ("%s: unexpected opcode 0x%x", __func__, opcode));
 	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
 
 	if (toep->flags & TPF_SYNQE)
 		return (do_abort_rpl_synqe(iq, rss, m));
 
 	KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));
 
 	CTR5(KTR_CXGBE, "%s: tid %u, toep %p, inp %p, status %d",
 	    __func__, tid, toep, inp, cpl->status);
 
 	/* A reply only makes sense if an abort was started earlier. */
 	KASSERT(toep->flags & TPF_ABORT_SHUTDOWN,
 	    ("%s: wasn't expecting abort reply", __func__));
 
 	/* final_cpl_received is called with the inp write-locked. */
 	INP_WLOCK(inp);
 	final_cpl_received(toep);
 
 	return (0);
 }
 
 /*
  * Handler for CPL_RX_DATA: payload received on an offloaded connection.
  *
  * 'm' starts with the CPL header, which is stripped here.  The payload is
  * appended to the socket's receive buffer unless the inp has been dropped
  * (payload freed), the connection is in TLS receive mode (passed to
  * do_rx_data_tls), or the socket can't receive any more (connection
  * dropped with ECONNRESET).  Always returns 0.
  */
 static int
 do_rx_data(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
 {
 	struct adapter *sc = iq->adapter;
 	const struct cpl_rx_data *cpl = mtod(m, const void *);
 	unsigned int tid = GET_TID(cpl);
 	struct toepcb *toep = lookup_tid(sc, tid);
 	struct inpcb *inp = toep->inp;
 	struct tcpcb *tp;
 	struct socket *so;
 	struct sockbuf *sb;
 	struct epoch_tracker et;
 	int len, rx_credits;
 	uint32_t ddp_placed = 0;
 
 	if (__predict_false(toep->flags & TPF_SYNQE)) {
 		/*
 		 * do_pass_establish must have run before do_rx_data and if this
 		 * is still a synqe instead of a toepcb then the connection must
 		 * be getting aborted.
 		 */
 		MPASS(toep->flags & TPF_ABORT_SHUTDOWN);
 		CTR4(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x)", __func__, tid,
 		    toep, toep->flags);
 		m_freem(m);
 		return (0);
 	}
 
 	KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));
 
 	/* strip off CPL header */
 	m_adj(m, sizeof(*cpl));
 	len = m->m_pkthdr.len;
 
 	/* Data for a dropped inp is discarded. */
 	INP_WLOCK(inp);
-	if (inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) {
+	if (inp->inp_flags & INP_DROPPED) {
 		CTR4(KTR_CXGBE, "%s: tid %u, rx (%d bytes), inp_flags 0x%x",
 		    __func__, tid, len, inp->inp_flags);
 		INP_WUNLOCK(inp);
 		m_freem(m);
 		return (0);
 	}
 
 	tp = intotcpcb(inp);
 
 	/*
 	 * NOTE(review): do_rx_data_tls presumably consumes 'm' and releases
 	 * the inp lock, as neither is done here before returning -- confirm.
 	 */
 	if (__predict_false(ulp_mode(toep) == ULP_MODE_TLS &&
 	   toep->flags & TPF_TLS_RECEIVE)) {
 		/* Received "raw" data on a TLS socket. */
 		CTR3(KTR_CXGBE, "%s: tid %u, raw TLS data (%d bytes)",
 		    __func__, tid, len);
 		do_rx_data_tls(cpl, toep, m);
 		return (0);
 	}
 
 	/*
 	 * A gap between rcv_nxt and the sequence number in the CPL is
 	 * remembered as ddp_placed and passed to insert_ddp_data below if
 	 * the connection fell out of DDP mode.
 	 */
 	if (__predict_false(tp->rcv_nxt != be32toh(cpl->seq)))
 		ddp_placed = be32toh(cpl->seq) - tp->rcv_nxt;
 
 	tp->rcv_nxt += len;
 	if (tp->rcv_wnd < len) {
 		KASSERT(ulp_mode(toep) == ULP_MODE_RDMA,
 				("%s: negative window size", __func__));
 	}
 
 	tp->rcv_wnd -= len;
 	tp->t_rcvtime = ticks;
 
 	/* Lock order: inp, then DDP (if in use), then the receive buffer. */
 	if (ulp_mode(toep) == ULP_MODE_TCPDDP)
 		DDP_LOCK(toep);
 	so = inp_inpcbtosocket(inp);
 	sb = &so->so_rcv;
 	SOCKBUF_LOCK(sb);
 
 	if (__predict_false(sb->sb_state & SBS_CANTRCVMORE)) {
 		/*
 		 * Data arrived for a socket that can't receive any more.
 		 * Discard it and drop the connection.  All locks are
 		 * released first and the inp lock is taken again inside the
 		 * net epoch for tcp_drop.
 		 */
 		CTR3(KTR_CXGBE, "%s: tid %u, excess rx (%d bytes)",
 		    __func__, tid, len);
 		m_freem(m);
 		SOCKBUF_UNLOCK(sb);
 		if (ulp_mode(toep) == ULP_MODE_TCPDDP)
 			DDP_UNLOCK(toep);
 		INP_WUNLOCK(inp);
 
 		CURVNET_SET(toep->vnet);
 		NET_EPOCH_ENTER(et);
 		INP_WLOCK(inp);
 		tp = tcp_drop(tp, ECONNRESET);
 		/* The inp is unlocked here only if tcp_drop returned a tcpcb. */
 		if (tp)
 			INP_WUNLOCK(inp);
 		NET_EPOCH_EXIT(et);
 		CURVNET_RESTORE();
 
 		return (0);
 	}
 
 	/* receive buffer autosize */
 	MPASS(toep->vnet == so->so_vnet);
 	CURVNET_SET(toep->vnet);
 	if (sb->sb_flags & SB_AUTOSIZE &&
 	    V_tcp_do_autorcvbuf &&
 	    sb->sb_hiwat < V_tcp_autorcvbuf_max &&
 	    len > (sbspace(sb) / 8 * 7)) {
 		unsigned int hiwat = sb->sb_hiwat;
 		unsigned int newsize = min(hiwat + sc->tt.autorcvbuf_inc,
 		    V_tcp_autorcvbuf_max);
 
 		/* Autosizing is turned off if the reservation fails. */
 		if (!sbreserve_locked(so, SO_RCV, newsize, NULL))
 			sb->sb_flags &= ~SB_AUTOSIZE;
 	}
 
 	if (ulp_mode(toep) == ULP_MODE_TCPDDP) {
 		/* Non-zero if the CPL's ddp_off disagrees with DDP_ON. */
 		int changed = !(toep->ddp.flags & DDP_ON) ^ cpl->ddp_off;
 
 		if (toep->ddp.waiting_count != 0 || toep->ddp.active_count != 0)
 			CTR3(KTR_CXGBE, "%s: tid %u, non-ddp rx (%d bytes)",
 			    __func__, tid, len);
 
 		if (changed) {
 			/*
 			 * With DDP_SC_REQ set, DDP_ON is toggled and
 			 * DDP_SC_REQ is cleared.  Otherwise the only change
 			 * expected is DDP turning itself off.
 			 */
 			if (toep->ddp.flags & DDP_SC_REQ)
 				toep->ddp.flags ^= DDP_ON | DDP_SC_REQ;
 			else {
 				KASSERT(cpl->ddp_off == 1,
 				    ("%s: DDP switched on by itself.",
 				    __func__));
 
 				/* Fell out of DDP mode */
 				toep->ddp.flags &= ~DDP_ON;
 				CTR1(KTR_CXGBE, "%s: fell out of DDP mode",
 				    __func__);
 
 				insert_ddp_data(toep, ddp_placed);
 			}
 		}
 
 		if (toep->ddp.flags & DDP_ON) {
 			/*
 			 * CPL_RX_DATA with DDP on can only be an indicate.
 			 * Start posting queued AIO requests via DDP.  The
 			 * payload that arrived in this indicate is appended
 			 * to the socket buffer as usual.
 			 */
 			handle_ddp_indicate(toep);
 		}
 	}
 
 	sbappendstream_locked(sb, m, 0);
 	/*
 	 * Return receive window credits to the hardware when the buffer has
 	 * more space than the current window and the data in the buffer plus
 	 * the window is below the buffer's low-water mark.
 	 */
 	rx_credits = sbspace(sb) > tp->rcv_wnd ? sbspace(sb) - tp->rcv_wnd : 0;
 	if (rx_credits > 0 && sbused(sb) + tp->rcv_wnd < sb->sb_lowat) {
 		rx_credits = send_rx_credits(sc, toep, rx_credits);
 		tp->rcv_wnd += rx_credits;
 		tp->rcv_adv += rx_credits;
 	}
 
 	if (ulp_mode(toep) == ULP_MODE_TCPDDP && toep->ddp.waiting_count > 0 &&
 	    sbavail(sb) != 0) {
 		CTR2(KTR_CXGBE, "%s: tid %u queueing AIO task", __func__,
 		    tid);
 		ddp_queue_toep(toep);
 	}
 	/* sorwakeup_locked releases the receive buffer lock (asserted below). */
 	sorwakeup_locked(so);
 	SOCKBUF_UNLOCK_ASSERT(sb);
 	if (ulp_mode(toep) == ULP_MODE_TCPDDP)
 		DDP_UNLOCK(toep);
 
 	INP_WUNLOCK(inp);
 	CURVNET_RESTORE();
 	return (0);
 }
 
 /*
  * Handler for CPL_FW4_ACK: returns tx credits for an offloaded connection
  * (tid) and, when CPL_FW4_ACK_FLAGS_SEQVAL is set, reports a new snd_una.
  *
  * Credits are returned one tx descriptor (txsd) at a time while the
  * payload length covered by those descriptors is summed up.  If
  * transmission had been suspended and at least a quarter of the total
  * credits are available again, it is resumed via t4_push_data().
  * Otherwise the acknowledged payload is dropped from the send buffer (in
  * ISCSI mode: from so_snd first, the rest from the ULP PDU reclaim queue)
  * and writers are woken up.
  *
  * No payload mbuf is expected (m must be NULL).  Always returns 0.
  */
 static int
 do_fw4_ack(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
 {
 	struct adapter *sc = iq->adapter;
 	const struct cpl_fw4_ack *cpl = (const void *)(rss + 1);
 	unsigned int tid = G_CPL_FW4_ACK_FLOWID(be32toh(OPCODE_TID(cpl)));
 	struct toepcb *toep = lookup_tid(sc, tid);
 	struct inpcb *inp;
 	struct tcpcb *tp;
 	struct socket *so;
 	uint8_t credits = cpl->credits;
 	struct ofld_tx_sdesc *txsd;
 	int plen;
 #ifdef INVARIANTS
 	unsigned int opcode = G_CPL_FW4_ACK_OPCODE(be32toh(OPCODE_TID(cpl)));
 #endif
 
 	/*
 	 * Very unusual case: we'd sent a flowc + abort_req for a synq entry and
 	 * now this comes back carrying the credits for the flowc.
 	 */
 	if (__predict_false(toep->flags & TPF_SYNQE)) {
 		KASSERT(toep->flags & TPF_ABORT_SHUTDOWN,
 		    ("%s: credits for a synq entry %p", __func__, toep));
 		return (0);
 	}
 
 	inp = toep->inp;
 
 	KASSERT(opcode == CPL_FW4_ACK,
 	    ("%s: unexpected opcode 0x%x", __func__, opcode));
 	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
 	KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));
 
 	INP_WLOCK(inp);
 
 	/* Nothing to account for once the connection is being aborted. */
 	if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN)) {
 		INP_WUNLOCK(inp);
 		return (0);
 	}
 
-	KASSERT((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) == 0,
+	KASSERT((inp->inp_flags & INP_DROPPED) == 0,
 	    ("%s: inp_flags 0x%x", __func__, inp->inp_flags));
 
 	tp = intotcpcb(inp);
 
 	if (cpl->flags & CPL_FW4_ACK_FLAGS_SEQVAL) {
 		tcp_seq snd_una = be32toh(cpl->snd_una);
 
 #ifdef INVARIANTS
 		/* A snd_una that moves backwards is only logged, not rejected. */
 		if (__predict_false(SEQ_LT(snd_una, tp->snd_una))) {
 			log(LOG_ERR,
 			    "%s: unexpected seq# %x for TID %u, snd_una %x\n",
 			    __func__, snd_una, toep->tid, tp->snd_una);
 		}
 #endif
 
 		if (tp->snd_una != snd_una) {
 			tp->snd_una = snd_una;
 			tp->ts_recent_age = tcp_ts_getticks();
 		}
 	}
 
 #ifdef VERBOSE_TRACES
 	CTR3(KTR_CXGBE, "%s: tid %d credits %u", __func__, tid, credits);
 #endif
 	so = inp->inp_socket;
 	txsd = &toep->txsd[toep->txsd_cidx];
 	plen = 0;
 	/*
 	 * Walk the txsd ring from the consumer index.  Each descriptor is
 	 * returned whole; a credit count that stops in the middle of a
 	 * descriptor trips the KASSERT below.
 	 */
 	while (credits) {
 		KASSERT(credits >= txsd->tx_credits,
 		    ("%s: too many (or partial) credits", __func__));
 		credits -= txsd->tx_credits;
 		toep->tx_credits += txsd->tx_credits;
 		plen += txsd->plen;
 		txsd++;
 		toep->txsd_avail++;
 		KASSERT(toep->txsd_avail <= toep->txsd_total,
 		    ("%s: txsd avail > total", __func__));
 		/* Wrap around at the end of the ring. */
 		if (__predict_false(++toep->txsd_cidx == toep->txsd_total)) {
 			txsd = &toep->txsd[0];
 			toep->txsd_cidx = 0;
 		}
 	}
 
 	/* All credits are back: reset the "no completion requested" counters. */
 	if (toep->tx_credits == toep->tx_total) {
 		toep->tx_nocompl = 0;
 		toep->plen_nocompl = 0;
 	}
 
 	if (toep->flags & TPF_TX_SUSPENDED &&
 	    toep->tx_credits >= toep->tx_total / 4) {
 #ifdef VERBOSE_TRACES
 		CTR2(KTR_CXGBE, "%s: tid %d calling t4_push_frames", __func__,
 		    tid);
 #endif
 		toep->flags &= ~TPF_TX_SUSPENDED;
 		CURVNET_SET(toep->vnet);
 		t4_push_data(sc, toep, plen);
 		CURVNET_RESTORE();
 	} else if (plen > 0) {
 		struct sockbuf *sb = &so->so_snd;
 		int sbu;
 
 		SOCKBUF_LOCK(sb);
 		sbu = sbused(sb);
 		if (ulp_mode(toep) == ULP_MODE_ISCSI) {
 			if (__predict_false(sbu > 0)) {
 				/*
 				 * The data transmitted before the
 				 * tid's ULP mode changed to ISCSI is
 				 * still in so_snd.  Incoming credits
 				 * should account for so_snd first.
 				 */
 				sbdrop_locked(sb, min(sbu, plen));
 				plen -= min(sbu, plen);
 			}
 			sowwakeup_locked(so);	/* unlocks so_snd */
 			rqdrop_locked(&toep->ulp_pdu_reclaimq, plen);
 		} else {
 #ifdef VERBOSE_TRACES
 			CTR3(KTR_CXGBE, "%s: tid %d dropped %d bytes", __func__,
 			    tid, plen);
 #endif
 			sbdrop_locked(sb, plen);
 			/* Space was freed; let queued AIO writes make progress. */
 			if (!TAILQ_EMPTY(&toep->aiotx_jobq))
 				t4_aiotx_queue_toep(so, toep);
 			sowwakeup_locked(so);	/* unlocks so_snd */
 		}
 		SOCKBUF_UNLOCK_ASSERT(sb);
 	}
 
 	INP_WUNLOCK(inp);
 
 	return (0);
 }
 
 /*
  * Build a CPL_SET_TCB_FIELD work request for toep->tid and transmit it on
  * wrq.  The request updates TCB word 'word' using 'mask' and 'val'.
  *
  * If 'reply' is zero F_NO_REPLY is set in the request; 'cookie' is always
  * encoded alongside the word index and must fit in M_COOKIE.  When wrq is
  * an offload queue (EQ_OFLD) the request is also charged against the
  * connection's tx credits and consumes one tx descriptor.
  *
  * Panics if the work request cannot be allocated.
  */
 void
 t4_set_tcb_field(struct adapter *sc, struct sge_wrq *wrq, struct toepcb *toep,
     uint16_t word, uint64_t mask, uint64_t val, int reply, int cookie)
 {
 	struct wrqe *wr;
 	struct cpl_set_tcb_field *req;
 	struct ofld_tx_sdesc *txsd;
 
 	MPASS((cookie & ~M_COOKIE) == 0);
 	if (reply) {
 		MPASS(cookie != CPL_COOKIE_RESERVED);
 	}
 
 	wr = alloc_wrqe(sizeof(*req), wrq);
 	if (wr == NULL) {
 		/* XXX */
 		panic("%s: allocation failure.", __func__);
 	}
 	req = wrtod(wr);
 
 	INIT_TP_WR_MIT_CPL(req, CPL_SET_TCB_FIELD, toep->tid);
 	req->reply_ctrl = htobe16(V_QUEUENO(toep->ofld_rxq->iq.abs_id));
 	if (reply == 0)
 		req->reply_ctrl |= htobe16(F_NO_REPLY);
 	req->word_cookie = htobe16(V_WORD(word) | V_COOKIE(cookie));
 	req->mask = htobe64(mask);
 	req->val = htobe64(val);
 	if (wrq->eq.type == EQ_OFLD) {
 		/*
 		 * Record the credits used (in 16-byte units) in the next tx
 		 * descriptor; plen is 0 because no payload is carried.
 		 */
 		txsd = &toep->txsd[toep->txsd_pidx];
 		txsd->tx_credits = howmany(sizeof(*req), 16);
 		txsd->plen = 0;
 		KASSERT(toep->tx_credits >= txsd->tx_credits &&
 		    toep->txsd_avail > 0,
 		    ("%s: not enough credits (%d)", __func__,
 		    toep->tx_credits));
 		toep->tx_credits -= txsd->tx_credits;
 		if (__predict_false(++toep->txsd_pidx == toep->txsd_total))
 			toep->txsd_pidx = 0;
 		toep->txsd_avail--;
 	}
 
 	t4_wrq_tx(sc, wr);
 }
 
 /*
  * Register the CPL message handlers implemented in this file.
  * CPL_ABORT_RPL_RSS and CPL_FW4_ACK are registered as shared handlers
  * under CPL_COOKIE_TOM; the others are registered directly.
  */
 void
 t4_init_cpl_io_handlers(void)
 {
 
 	t4_register_cpl_handler(CPL_PEER_CLOSE, do_peer_close);
 	t4_register_cpl_handler(CPL_CLOSE_CON_RPL, do_close_con_rpl);
 	t4_register_cpl_handler(CPL_ABORT_REQ_RSS, do_abort_req);
 	t4_register_shared_cpl_handler(CPL_ABORT_RPL_RSS, do_abort_rpl,
 	    CPL_COOKIE_TOM);
 	t4_register_cpl_handler(CPL_RX_DATA, do_rx_data);
 	t4_register_shared_cpl_handler(CPL_FW4_ACK, do_fw4_ack, CPL_COOKIE_TOM);
 }
 
 /*
  * Undo t4_init_cpl_io_handlers(): register a NULL handler for every
  * opcode (and opcode/cookie pair) that was registered there.
  */
 void
 t4_uninit_cpl_io_handlers(void)
 {
 
 	t4_register_cpl_handler(CPL_PEER_CLOSE, NULL);
 	t4_register_cpl_handler(CPL_CLOSE_CON_RPL, NULL);
 	t4_register_cpl_handler(CPL_ABORT_REQ_RSS, NULL);
 	t4_register_shared_cpl_handler(CPL_ABORT_RPL_RSS, NULL, CPL_COOKIE_TOM);
 	t4_register_cpl_handler(CPL_RX_DATA, NULL);
 	t4_register_shared_cpl_handler(CPL_FW4_ACK, NULL, CPL_COOKIE_TOM);
 }
 
 /*
  * Use the 'backend1' field in AIO jobs to hold an error that should
  * be reported when the job is completed, the 'backend3' field to
  * store the amount of data sent by the AIO job so far, and the
  * 'backend4' field to hold a reference count on the job.
  *
  * Each unmapped mbuf holds a reference on the job as does the queue
  * so long as the job is queued.
  */
 #define	aio_error	backend1
 #define	aio_sent	backend3
 #define	aio_refs	backend4
 
 /*
  * Map an AIO job to the tid of its connection by following
  * file -> socket (f_data) -> tcpcb -> toepcb (t_toe).
  */
 #define	jobtotid(job)							\
 	(((struct toepcb *)(so_sototcpcb((job)->fd_file->f_data)->t_toe))->tid)
 
 /*
  * Drop one reference on an AIO tx job and, if refcount_release() reports
  * that this was the final reference, complete the job:
  *
  *  - an error recorded in aio_error is discarded if any data was sent, so
  *    the job then reports the number of bytes sent instead;
  *  - ECANCELED finishes the job via aio_cancel();
  *  - any other error completes the job with -1 and that error;
  *  - otherwise the job completes with the number of bytes sent.
  */
 static void
 aiotx_free_job(struct kaiocb *job)
 {
 	long status;
 	int error;
 
 	if (refcount_release(&job->aio_refs) == 0)
 		return;
 
 	error = (intptr_t)job->aio_error;
 	status = job->aio_sent;
 #ifdef VERBOSE_TRACES
 	CTR5(KTR_CXGBE, "%s: tid %d completed %p len %ld, error %d", __func__,
 	    jobtotid(job), job, status, error);
 #endif
 	/* A partial send wins over the error. */
 	if (error != 0 && status != 0)
 		error = 0;
 	if (error == ECANCELED)
 		aio_cancel(job);
 	else if (error)
 		aio_complete(job, -1, error);
 	else {
 		job->msgsnd = 1;
 		aio_complete(job, status, 0);
 	}
 }
 
 /*
  * Free routine for the unmapped mbufs created by alloc_aiotx_mbuf() (it is
  * the callback passed to mb_alloc_ext_pgs() there).  Unwires every page
  * the mbuf refers to and drops the mbuf's reference on the AIO job stored
  * in ext_arg1.
  */
 static void
 aiotx_free_pgs(struct mbuf *m)
 {
 	struct kaiocb *job;
 	vm_page_t pg;
 
 	M_ASSERTEXTPG(m);
 	job = m->m_ext.ext_arg1;
 #ifdef VERBOSE_TRACES
 	CTR3(KTR_CXGBE, "%s: completed %d bytes for tid %d", __func__,
 	    m->m_len, jobtotid(job));
 #endif
 
 	for (int i = 0; i < m->m_epg_npgs; i++) {
 		pg = PHYS_TO_VM_PAGE(m->m_epg_pa[i]);
 		vm_page_unwire(pg, PQ_ACTIVE);
 	}
 
 	aiotx_free_job(job);
 }
 
 /*
  * Allocate a chain of unmapped mbufs describing the next 'len' bytes
  * of an AIO job.
  *
  * The bytes start at aio_buf + aio_sent in the submitting process's
  * address space.  Each mbuf covers at most MBUF_PEXT_MAX_PGS pages and
  * takes its own reference on the job, released by aiotx_free_pgs().
  *
  * Returns the head of the chain, or NULL if not even the first mbuf could
  * be built.  If holding pages fails part-way through, the loop stops and
  * the chain built so far (shorter than 'len') is returned.
  */
 static struct mbuf *
 alloc_aiotx_mbuf(struct kaiocb *job, int len)
 {
 	struct vmspace *vm;
 	vm_page_t pgs[MBUF_PEXT_MAX_PGS];
 	struct mbuf *m, *top, *last;
 	vm_map_t map;
 	vm_offset_t start;
 	int i, mlen, npages, pgoff;
 
 	KASSERT(job->aio_sent + len <= job->uaiocb.aio_nbytes,
 	    ("%s(%p, %d): request to send beyond end of buffer", __func__,
 	    job, len));
 
 	/*
 	 * The AIO subsystem will cancel and drain all requests before
 	 * permitting a process to exit or exec, so p_vmspace should
 	 * be stable here.
 	 */
 	vm = job->userproc->p_vmspace;
 	map = &vm->vm_map;
 	start = (uintptr_t)job->uaiocb.aio_buf + job->aio_sent;
 	pgoff = start & PAGE_MASK;
 
 	top = NULL;
 	last = NULL;
 	while (len > 0) {
 		/*
 		 * Only the first mbuf can start at a non-zero page offset;
 		 * every later one starts page aligned (pgoff is reset to 0
 		 * at the bottom of the loop).
 		 */
 		mlen = imin(len, MBUF_PEXT_MAX_PGS * PAGE_SIZE - pgoff);
 		KASSERT(mlen == len || ((start + mlen) & PAGE_MASK) == 0,
 		    ("%s: next start (%#jx + %#x) is not page aligned",
 		    __func__, (uintmax_t)start, mlen));
 
 		npages = vm_fault_quick_hold_pages(map, start, mlen,
 		    VM_PROT_WRITE, pgs, nitems(pgs));
 		if (npages < 0)
 			break;
 
 		m = mb_alloc_ext_pgs(M_WAITOK, aiotx_free_pgs);
 		if (m == NULL) {
 			vm_page_unhold_pages(pgs, npages);
 			break;
 		}
 
 		m->m_epg_1st_off = pgoff;
 		m->m_epg_npgs = npages;
 		if (npages == 1) {
 			KASSERT(mlen + pgoff <= PAGE_SIZE,
 			    ("%s: single page is too large (off %d len %d)",
 			    __func__, pgoff, mlen));
 			m->m_epg_last_len = mlen;
 		} else {
 			/*
 			 * Bytes in the last page: total minus the first
 			 * (possibly partial) page minus the full middle pages.
 			 */
 			m->m_epg_last_len = mlen - (PAGE_SIZE - pgoff) -
 			    (npages - 2) * PAGE_SIZE;
 		}
 		for (i = 0; i < npages; i++)
 			m->m_epg_pa[i] = VM_PAGE_TO_PHYS(pgs[i]);
 
 		m->m_len = mlen;
 		m->m_ext.ext_size = npages * PAGE_SIZE;
 		m->m_ext.ext_arg1 = job;
 		refcount_acquire(&job->aio_refs);
 
 #ifdef VERBOSE_TRACES
 		CTR5(KTR_CXGBE, "%s: tid %d, new mbuf %p for job %p, npages %d",
 		    __func__, jobtotid(job), m, job, npages);
 #endif
 
 		if (top == NULL)
 			top = m;
 		else
 			last->m_next = m;
 		last = m;
 
 		len -= mlen;
 		start += mlen;
 		pgoff = 0;
 	}
 
 	return (top);
 }
 
 /*
  * Try to send the remaining data of one AIO write job on an offloaded
  * socket.
  *
  * Entered with so_snd locked; the lock is dropped right away and taken
  * again before every return.  Depending on the outcome the job is
  *
  *  - requeued at the head of toep->aiotx_jobq (too little socket buffer
  *    space, or a blocking socket with data still left to send),
  *  - failed: the error is stored in aio_error and the queue's reference
  *    is dropped, or
  *  - done as far as the queue is concerned: the queue's reference is
  *    dropped and any mbufs still in flight keep the job alive.
  */
 static void
 t4_aiotx_process_job(struct toepcb *toep, struct socket *so, struct kaiocb *job)
 {
 	struct sockbuf *sb;
 	struct inpcb *inp;
 	struct tcpcb *tp;
 	struct mbuf *m;
 	int error, len;
 	bool moretocome, sendmore;
 
 	sb = &so->so_snd;
 	SOCKBUF_UNLOCK(sb);
 	m = NULL;
 
 #ifdef MAC
 	error = mac_socket_check_send(job->fd_file->f_cred, so);
 	if (error != 0)
 		goto out;
 #endif
 
 	/* Inline sosend_generic(). */
 
 	error = SOCK_IO_SEND_LOCK(so, SBL_WAIT);
 	MPASS(error == 0);
 
 sendanother:
 	SOCKBUF_LOCK(sb);
 	if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
 		SOCKBUF_UNLOCK(sb);
 		SOCK_IO_SEND_UNLOCK(so);
 		if ((so->so_options & SO_NOSIGPIPE) == 0) {
 			PROC_LOCK(job->userproc);
 			kern_psignal(job->userproc, SIGPIPE);
 			PROC_UNLOCK(job->userproc);
 		}
 		error = EPIPE;
 		goto out;
 	}
 	if (so->so_error) {
 		error = so->so_error;
 		so->so_error = 0;
 		SOCKBUF_UNLOCK(sb);
 		SOCK_IO_SEND_UNLOCK(so);
 		goto out;
 	}
 	if ((so->so_state & SS_ISCONNECTED) == 0) {
 		SOCKBUF_UNLOCK(sb);
 		SOCK_IO_SEND_UNLOCK(so);
 		error = ENOTCONN;
 		goto out;
 	}
 	if (sbspace(sb) < sb->sb_lowat) {
 		MPASS(job->aio_sent == 0 || !(so->so_state & SS_NBIO));
 
 		/*
 		 * Don't block if there is too little room in the socket
 		 * buffer.  Instead, requeue the request.
 		 */
 		if (!aio_set_cancel_function(job, t4_aiotx_cancel)) {
 			SOCKBUF_UNLOCK(sb);
 			SOCK_IO_SEND_UNLOCK(so);
 			error = ECANCELED;
 			goto out;
 		}
 		TAILQ_INSERT_HEAD(&toep->aiotx_jobq, job, list);
 		SOCKBUF_UNLOCK(sb);
 		SOCK_IO_SEND_UNLOCK(so);
 		goto out;
 	}
 
 	/*
 	 * Write as much data as the socket permits, but no more than
 	 * a single sndbuf at a time.
 	 */
 	len = sbspace(sb);
 	if (len > job->uaiocb.aio_nbytes - job->aio_sent) {
 		len = job->uaiocb.aio_nbytes - job->aio_sent;
 		moretocome = false;
 	} else
 		moretocome = true;
 	if (len > toep->params.sndbuf) {
 		len = toep->params.sndbuf;
 		sendmore = true;
 	} else
 		sendmore = false;
 
 	/* Other queued jobs also count as more data to come. */
 	if (!TAILQ_EMPTY(&toep->aiotx_jobq))
 		moretocome = true;
 	SOCKBUF_UNLOCK(sb);
 	MPASS(len != 0);
 
 	m = alloc_aiotx_mbuf(job, len);
 	if (m == NULL) {
 		SOCK_IO_SEND_UNLOCK(so);
 		error = EFAULT;
 		goto out;
 	}
 
 	/* Inlined tcp_usr_send(). */
 
 	inp = toep->inp;
 	INP_WLOCK(inp);
-	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
+	if (inp->inp_flags & INP_DROPPED) {
 		INP_WUNLOCK(inp);
 		SOCK_IO_SEND_UNLOCK(so);
 		error = ECONNRESET;
 		goto out;
 	}
 
 	/*
 	 * Account for what the chain actually holds; it may be shorter
 	 * than the requested len.
 	 */
 	job->aio_sent += m_length(m, NULL);
 
 	sbappendstream(sb, m, 0);
 	m = NULL;
 
 	if (!(inp->inp_flags & INP_DROPPED)) {
 		tp = intotcpcb(inp);
 		if (moretocome)
 			tp->t_flags |= TF_MORETOCOME;
 		error = tcp_output(tp);
 		/*
 		 * A negative return is treated as "the inp lock is already
 		 * released" (asserted below); the errno is its negation.
 		 */
 		if (error < 0) {
 			INP_UNLOCK_ASSERT(inp);
 			SOCK_IO_SEND_UNLOCK(so);
 			error = -error;
 			goto out;
 		}
 		if (moretocome)
 			tp->t_flags &= ~TF_MORETOCOME;
 	}
 
 	INP_WUNLOCK(inp);
 	if (sendmore)
 		goto sendanother;
 	SOCK_IO_SEND_UNLOCK(so);
 
 	if (error)
 		goto out;
 
 	/*
 	 * If this is a blocking socket and the request has not been
 	 * fully completed, requeue it until the socket is ready
 	 * again.
 	 */
 	if (job->aio_sent < job->uaiocb.aio_nbytes &&
 	    !(so->so_state & SS_NBIO)) {
 		SOCKBUF_LOCK(sb);
 		if (!aio_set_cancel_function(job, t4_aiotx_cancel)) {
 			SOCKBUF_UNLOCK(sb);
 			error = ECANCELED;
 			goto out;
 		}
 		TAILQ_INSERT_HEAD(&toep->aiotx_jobq, job, list);
 		/* Return with so_snd locked, as the caller expects. */
 		return;
 	}
 
 	/*
 	 * If the request will not be requeued, drop the queue's
 	 * reference to the job.  Any mbufs in flight should still
 	 * hold a reference, but this drops the reference that the
 	 * queue owns while it is waiting to queue mbufs to the
 	 * socket.
 	 */
 	aiotx_free_job(job);
 
 out:
 	if (error) {
 		job->aio_error = (void *)(intptr_t)error;
 		aiotx_free_job(job);
 	}
 	m_freem(m);
 	SOCKBUF_LOCK(sb);
 }
 
 /*
  * Task handler for toep->aiotx_task (see aiotx_init_toep()).  'context' is
  * the toepcb.
  *
  * With so_snd locked, takes jobs off toep->aiotx_jobq for as long as the
  * socket is writeable and hands each one to t4_aiotx_process_job().  Jobs
  * whose cancel function cannot be cleared are left to their cancel
  * routine.  On the way out, aiotx_so is cleared and the toepcb and socket
  * references taken in t4_aiotx_queue_toep() are dropped.
  */
 static void
 t4_aiotx_task(void *context, int pending)
 {
 	struct toepcb *toep = context;
 	struct socket *so;
 	struct kaiocb *job;
 	struct epoch_tracker et;
 
 	so = toep->aiotx_so;
 	CURVNET_SET(toep->vnet);
 	NET_EPOCH_ENTER(et);
 	SOCKBUF_LOCK(&so->so_snd);
 	while (!TAILQ_EMPTY(&toep->aiotx_jobq) && sowriteable(so)) {
 		job = TAILQ_FIRST(&toep->aiotx_jobq);
 		TAILQ_REMOVE(&toep->aiotx_jobq, job, list);
 		if (!aio_clear_cancel_function(job))
 			continue;
 
 		t4_aiotx_process_job(toep, so, job);
 	}
 	toep->aiotx_so = NULL;
 	SOCKBUF_UNLOCK(&so->so_snd);
 	NET_EPOCH_EXIT(et);
 
 	free_toepcb(toep);
 	sorele(so);
 	CURVNET_RESTORE();
 }
 
 /*
  * Schedule the AIO tx task for this connection unless it is already
  * scheduled (aiotx_so != NULL).  Takes a reference on both the socket and
  * the toepcb, and records the socket in toep->aiotx_so.  The so_snd lock
  * must be held.
  */
 static void
 t4_aiotx_queue_toep(struct socket *so, struct toepcb *toep)
 {
 
 	SOCKBUF_LOCK_ASSERT(&toep->inp->inp_socket->so_snd);
 #ifdef VERBOSE_TRACES
 	CTR3(KTR_CXGBE, "%s: queueing aiotx task for tid %d, active = %s",
 	    __func__, toep->tid, toep->aiotx_so != NULL ? "true" : "false");
 #endif
 	if (toep->aiotx_so != NULL)
 		return;
 	soref(so);
 	toep->aiotx_so = so;
 	hold_toepcb(toep);
 	soaio_enqueue(&toep->aiotx_task);
 }
 
 /*
  * Cancel routine installed on AIO tx jobs while they sit on
  * toep->aiotx_jobq.  Takes the job off the queue (under the so_snd lock)
  * unless aio_cancel_cleared() reports true, then records ECANCELED and
  * drops one reference on the job.
  */
 static void
 t4_aiotx_cancel(struct kaiocb *job)
 {
 	struct socket *so;
 	struct sockbuf *sb;
 	struct tcpcb *tp;
 	struct toepcb *toep;
 
 	so = job->fd_file->f_data;
 	tp = so_sototcpcb(so);
 	toep = tp->t_toe;
 	MPASS(job->uaiocb.aio_lio_opcode == LIO_WRITE);
 	sb = &so->so_snd;
 
 	SOCKBUF_LOCK(sb);
 	if (!aio_cancel_cleared(job))
 		TAILQ_REMOVE(&toep->aiotx_jobq, job, list);
 	SOCKBUF_UNLOCK(sb);
 
 	job->aio_error = (void *)(intptr_t)ECANCELED;
 	aiotx_free_job(job);
 }
 
 /*
  * Queue an AIO write job for zero-copy transmission on an offloaded
  * socket.
  *
  * Returns EOPNOTSUPP if the job is not a write, if tx_zcopy is disabled
  * for the adapter, or if tls_tx_key() is true for the connection.
  * Otherwise the job is appended to toep->aiotx_jobq with an initial
  * reference count of 1, the tx task is scheduled if the socket is
  * currently writeable, and 0 is returned.
  */
 int
 t4_aio_queue_aiotx(struct socket *so, struct kaiocb *job)
 {
 	struct tcpcb *tp = so_sototcpcb(so);
 	struct toepcb *toep = tp->t_toe;
 	struct adapter *sc = td_adapter(toep->td);
 
 	/* This only handles writes. */
 	if (job->uaiocb.aio_lio_opcode != LIO_WRITE)
 		return (EOPNOTSUPP);
 
 	if (!sc->tt.tx_zcopy)
 		return (EOPNOTSUPP);
 
 	if (tls_tx_key(toep))
 		return (EOPNOTSUPP);
 
 	SOCKBUF_LOCK(&so->so_snd);
 #ifdef VERBOSE_TRACES
 	CTR3(KTR_CXGBE, "%s: queueing %p for tid %u", __func__, job, toep->tid);
 #endif
 	if (!aio_set_cancel_function(job, t4_aiotx_cancel))
 		panic("new job was cancelled");
 	/* This initial reference belongs to the queue. */
 	refcount_init(&job->aio_refs, 1);
 	TAILQ_INSERT_TAIL(&toep->aiotx_jobq, job, list);
 	if (sowriteable(so))
 		t4_aiotx_queue_toep(so, toep);
 	SOCKBUF_UNLOCK(&so->so_snd);
 	return (0);
 }
 
 /*
  * Initialize the AIO tx state of a toepcb: an empty job queue and the task
  * that runs t4_aiotx_task() with the toepcb as its context.
  */
 void
 aiotx_init_toep(struct toepcb *toep)
 {
 
 	TAILQ_INIT(&toep->aiotx_jobq);
 	TASK_INIT(&toep->aiotx_task, 0, t4_aiotx_task, toep);
 }
 #endif
diff --git a/sys/dev/cxgbe/tom/t4_ddp.c b/sys/dev/cxgbe/tom/t4_ddp.c
index 11fea91b060e..605157286bb2 100644
--- a/sys/dev/cxgbe/tom/t4_ddp.c
+++ b/sys/dev/cxgbe/tom/t4_ddp.c
@@ -1,2262 +1,2262 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2012 Chelsio Communications, Inc.
  * All rights reserved.
  * Written by: Navdeep Parhar <np@FreeBSD.org>
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 
 #include <sys/param.h>
 #include <sys/aio.h>
 #include <sys/bio.h>
 #include <sys/file.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/module.h>
 #include <sys/protosw.h>
 #include <sys/proc.h>
 #include <sys/domain.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/taskqueue.h>
 #include <sys/uio.h>
 #include <netinet/in.h>
 #include <netinet/in_pcb.h>
 #include <netinet/ip.h>
 #include <netinet/tcp_var.h>
 #define TCPSTATES
 #include <netinet/tcp_fsm.h>
 #include <netinet/toecore.h>
 
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_param.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_page.h>
 #include <vm/vm_object.h>
 
 #include <cam/scsi/scsi_all.h>
 #include <cam/ctl/ctl_io.h>
 
 #ifdef TCP_OFFLOAD
 #include "common/common.h"
 #include "common/t4_msg.h"
 #include "common/t4_regs.h"
 #include "common/t4_tcb.h"
 #include "tom/t4_tom.h"
 
 /*
  * Use the 'backend3' field in AIO jobs to store the amount of data
  * received by the AIO job so far.
  */
 #define	aio_received	backend3
 
 static void aio_ddp_requeue_task(void *context, int pending);
 static void ddp_complete_all(struct toepcb *toep, int error);
 static void t4_aio_cancel_active(struct kaiocb *job);
 static void t4_aio_cancel_queued(struct kaiocb *job);
 
 /*
  * free_pageset() appends page sets to this list (under
  * ddp_orphan_pagesets_lock) and enqueues ddp_orphan_task;
  * ddp_free_orphan_pagesets() drains the list and frees the entries.
  */
 static TAILQ_HEAD(, pageset) ddp_orphan_pagesets;
 static struct mtx ddp_orphan_pagesets_lock;
 static struct task ddp_orphan_task;
 
 #define MAX_DDP_BUFFER_SIZE		(M_TCB_RX_DDP_BUF0_LEN)
 
 /*
  * A page set holds information about a buffer used for DDP.  The page
  * set holds resources such as the VM pages backing the buffer (either
  * held or wired) and the page pods associated with the buffer.
  * Recently used page sets are cached to allow for efficient reuse of
  * buffers (avoiding the need to re-fault in pages, hold them, etc.).
  * Note that cached page sets keep the backing pages wired.  The
  * number of wired pages is capped by only allowing for two wired
  * pagesets per connection.  This is not a perfect cap, but is a
  * trade-off for performance.
  *
  * If an application ping-pongs two buffers for a connection via
  * aio_read(2) then those buffers should remain wired and expensive VM
  * fault lookups should be avoided after each buffer has been used
  * once.  If an application uses more than two buffers then this will
  * fall back to doing expensive VM fault lookups for each operation.
  */
 /*
  * Release the resources held by a page set: free its page pods (if any
  * were reserved) and unwire its pages.  The structure itself is not freed
  * here; it is put on the orphan list and ddp_orphan_task is enqueued to
  * finish the job.
  */
 static void
 free_pageset(struct tom_data *td, struct pageset *ps)
 {
 	vm_page_t p;
 	int i;
 
 	if (ps->prsv.prsv_nppods > 0)
 		t4_free_page_pods(&ps->prsv);
 
 	for (i = 0; i < ps->npages; i++) {
 		p = ps->pages[i];
 		vm_page_unwire(p, PQ_INACTIVE);
 	}
 	mtx_lock(&ddp_orphan_pagesets_lock);
 	TAILQ_INSERT_TAIL(&ddp_orphan_pagesets, ps, link);
 	taskqueue_enqueue(taskqueue_thread, &ddp_orphan_task);
 	mtx_unlock(&ddp_orphan_pagesets_lock);
 }
 
 /*
  * Drain the orphan page set list.  For each entry, drop its vmspace
  * reference (if it has one) and free the structure.  The list lock is
  * released around vmspace_free()/free() and re-taken before looking at
  * the list again.
  */
 static void
 ddp_free_orphan_pagesets(void *context, int pending)
 {
 	struct pageset *ps;
 
 	mtx_lock(&ddp_orphan_pagesets_lock);
 	while (!TAILQ_EMPTY(&ddp_orphan_pagesets)) {
 		ps = TAILQ_FIRST(&ddp_orphan_pagesets);
 		TAILQ_REMOVE(&ddp_orphan_pagesets, ps, link);
 		mtx_unlock(&ddp_orphan_pagesets_lock);
 		if (ps->vm)
 			vmspace_free(ps->vm);
 		free(ps, M_CXGBE);
 		mtx_lock(&ddp_orphan_pagesets_lock);
 	}
 	mtx_unlock(&ddp_orphan_pagesets_lock);
 }
 
 /*
  * Return a page set that is no longer in use.  While the connection's DDP
  * state is alive the page set goes to the head of the per-connection
  * cache; once DDP_DEAD is set it is freed instead.  The DDP lock must be
  * held.
  */
 static void
 recycle_pageset(struct toepcb *toep, struct pageset *ps)
 {
 
 	DDP_ASSERT_LOCKED(toep);
 	if (!(toep->ddp.flags & DDP_DEAD)) {
 		KASSERT(toep->ddp.cached_count + toep->ddp.active_count <
 		    nitems(toep->ddp.db), ("too many wired pagesets"));
 		TAILQ_INSERT_HEAD(&toep->ddp.cached_pagesets, ps, link);
 		toep->ddp.cached_count++;
 	} else
 		free_pageset(toep->td, ps);
 }
 
 /*
  * Complete a DDP AIO read job.  'error' is reported only if the job has
  * not received any data; otherwise the job completes successfully with
  * the number of bytes received so far.
  */
 static void
 ddp_complete_one(struct kaiocb *job, int error)
 {
 	long copied;
 
 	/*
 	 * If this job had copied data out of the socket buffer before
 	 * it was cancelled, report it as a short read rather than an
 	 * error.
 	 */
 	copied = job->aio_received;
 	if (copied != 0 || error == 0)
 		aio_complete(job, copied, 0);
 	else
 		aio_complete(job, -1, error);
 }
 
 /*
  * Tear down one DDP buffer slot: deal with the AIO job attached to it (if
  * any) and free its page set (if any).  The db->job and db->ps pointers
  * themselves are left untouched.
  */
 static void
 free_ddp_buffer(struct tom_data *td, struct ddp_buffer *db)
 {
 
 	if (db->job) {
 		/*
 		 * XXX: If we are un-offloading the socket then we
 		 * should requeue these on the socket somehow.  If we
 		 * got a FIN from the remote end, then this completes
 		 * any remaining requests with an EOF read.
 		 */
 		/*
 		 * NOTE(review): handle_ddp_data() completes the job when
 		 * aio_clear_cancel_function() returns true.  The negated
 		 * test here is the opposite polarity and looks inverted --
 		 * confirm against the cancel path before relying on it.
 		 */
 		if (!aio_clear_cancel_function(db->job))
 			ddp_complete_one(db->job, 0);
 	}
 
 	if (db->ps)
 		free_pageset(td, db->ps);
 }
 
 /*
  * Initialize the DDP state of a toepcb: empty AIO job queue, the requeue
  * task, flags set to DDP_OK, no active buffer (active_id == -1), and the
  * DDP mutex.
  */
 void
 ddp_init_toep(struct toepcb *toep)
 {
 
 	TAILQ_INIT(&toep->ddp.aiojobq);
 	TASK_INIT(&toep->ddp.requeue_task, 0, aio_ddp_requeue_task, toep);
 	toep->ddp.flags = DDP_OK;
 	toep->ddp.active_id = -1;
 	mtx_init(&toep->ddp.lock, "t4 ddp", NULL, MTX_DEF);
 }
 
 /* Counterpart of ddp_init_toep(): destroys the DDP mutex. */
 void
 ddp_uninit_toep(struct toepcb *toep)
 {
 
 	mtx_destroy(&toep->ddp.lock);
 }
 
 /*
  * Mark the connection's DDP state dead and release everything it holds:
  * both DDP buffer slots, all cached page sets, and (via
  * ddp_complete_all()) the jobs that are still pending.  Setting DDP_DEAD
  * first makes recycle_pageset() free page sets rather than cache them.
  */
 void
 release_ddp_resources(struct toepcb *toep)
 {
 	struct pageset *ps;
 	int i;
 
 	DDP_LOCK(toep);
 	toep->ddp.flags |= DDP_DEAD;
 	for (i = 0; i < nitems(toep->ddp.db); i++) {
 		free_ddp_buffer(toep->td, &toep->ddp.db[i]);
 	}
 	while ((ps = TAILQ_FIRST(&toep->ddp.cached_pagesets)) != NULL) {
 		TAILQ_REMOVE(&toep->ddp.cached_pagesets, ps, link);
 		free_pageset(toep->td, ps);
 	}
 	ddp_complete_all(toep, 0);
 	DDP_UNLOCK(toep);
 }
 
 #ifdef INVARIANTS
 /*
  * Debug-only check that no DDP state is left on the toepcb: the requeue
  * task is not active, neither buffer slot has a job or page set, and both
  * the page set cache and the AIO job queue are empty.
  */
 void
 ddp_assert_empty(struct toepcb *toep)
 {
 	int i;
 
 	MPASS(!(toep->ddp.flags & DDP_TASK_ACTIVE));
 	for (i = 0; i < nitems(toep->ddp.db); i++) {
 		MPASS(toep->ddp.db[i].job == NULL);
 		MPASS(toep->ddp.db[i].ps == NULL);
 	}
 	MPASS(TAILQ_EMPTY(&toep->ddp.cached_pagesets));
 	MPASS(TAILQ_EMPTY(&toep->ddp.aiojobq));
 }
 #endif
 
 /*
  * Bookkeeping for a DDP buffer that is done: decrement the active count,
  * move active_id on to the other buffer (or to -1 if none is left
  * active), detach the job, recycle the page set and clear the buffer's
  * DDP_BUFx_ACTIVE flag.  The job itself is not completed here.
  */
 static void
 complete_ddp_buffer(struct toepcb *toep, struct ddp_buffer *db,
     unsigned int db_idx)
 {
 	unsigned int db_flag;
 
 	toep->ddp.active_count--;
 	if (toep->ddp.active_id == db_idx) {
 		if (toep->ddp.active_count == 0) {
 			KASSERT(toep->ddp.db[db_idx ^ 1].job == NULL,
 			    ("%s: active_count mismatch", __func__));
 			toep->ddp.active_id = -1;
 		} else
 			/* There are only two buffers: flip 0 <-> 1. */
 			toep->ddp.active_id ^= 1;
 #ifdef VERBOSE_TRACES
 		CTR3(KTR_CXGBE, "%s: tid %u, ddp_active_id = %d", __func__,
 		    toep->tid, toep->ddp.active_id);
 #endif
 	} else {
 		KASSERT(toep->ddp.active_count != 0 &&
 		    toep->ddp.active_id != -1,
 		    ("%s: active count mismatch", __func__));
 	}
 
 	db->cancel_pending = 0;
 	db->job = NULL;
 	recycle_pageset(toep, db->ps);
 	db->ps = NULL;
 
 	db_flag = db_idx == 1 ? DDP_BUF1_ACTIVE : DDP_BUF0_ACTIVE;
 	KASSERT(toep->ddp.flags & db_flag,
 	    ("%s: DDP buffer not active. toep %p, ddp_flags 0x%x",
 	    __func__, toep, toep->ddp.flags));
 	toep->ddp.flags &= ~db_flag;
 }
 
 /* XXX: handle_ddp_data code duplication */
 /*
  * Account for 'n' bytes that were placed into DDP buffers before the
  * connection fell out of DDP.  rcv_nxt (and rcv_wnd, unless
  * USE_DDP_RX_FLOW_CONTROL is defined) are advanced by n, then the bytes
  * are handed out to the active buffers in order, each job taking at most
  * the room it has left.  Every active buffer is retired; its job is
  * completed, requeued, cancelled, or left to a cancel that is already in
  * progress.  All n bytes must be consumed.
  *
  * Both the inp write lock and the DDP lock must be held.
  */
 void
 insert_ddp_data(struct toepcb *toep, uint32_t n)
 {
 	struct inpcb *inp = toep->inp;
 	struct tcpcb *tp = intotcpcb(inp);
 	struct ddp_buffer *db;
 	struct kaiocb *job;
 	size_t placed;
 	long copied;
 	unsigned int db_idx;
 #ifdef INVARIANTS
 	unsigned int db_flag;
 #endif
 
 	INP_WLOCK_ASSERT(inp);
 	DDP_ASSERT_LOCKED(toep);
 
 	tp->rcv_nxt += n;
 #ifndef USE_DDP_RX_FLOW_CONTROL
 	KASSERT(tp->rcv_wnd >= n, ("%s: negative window size", __func__));
 	tp->rcv_wnd -= n;
 #endif
 	CTR2(KTR_CXGBE, "%s: placed %u bytes before falling out of DDP",
 	    __func__, n);
 	while (toep->ddp.active_count > 0) {
 		MPASS(toep->ddp.active_id != -1);
 		db_idx = toep->ddp.active_id;
 #ifdef INVARIANTS
 		db_flag = db_idx == 1 ? DDP_BUF1_ACTIVE : DDP_BUF0_ACTIVE;
 #endif
 		MPASS((toep->ddp.flags & db_flag) != 0);
 		db = &toep->ddp.db[db_idx];
 		job = db->job;
 		copied = job->aio_received;
 		/* This job gets at most the room it has left. */
 		placed = n;
 		if (placed > job->uaiocb.aio_nbytes - copied)
 			placed = job->uaiocb.aio_nbytes - copied;
 		if (placed > 0)
 			job->msgrcv = 1;
 		if (!aio_clear_cancel_function(job)) {
 			/*
 			 * Update the copied length for when
 			 * t4_aio_cancel_active() completes this
 			 * request.
 			 */
 			job->aio_received += placed;
 		} else if (copied + placed != 0) {
 			CTR4(KTR_CXGBE,
 			    "%s: completing %p (copied %ld, placed %lu)",
 			    __func__, job, copied, placed);
 			/* XXX: This always completes if there is some data. */
 			aio_complete(job, copied + placed, 0);
 		} else if (aio_set_cancel_function(job, t4_aio_cancel_queued)) {
 			/* No data for this job at all: put it back in the queue. */
 			TAILQ_INSERT_HEAD(&toep->ddp.aiojobq, job, list);
 			toep->ddp.waiting_count++;
 		} else
 			aio_cancel(job);
 		n -= placed;
 		complete_ddp_buffer(toep, db, db_idx);
 	}
 
 	MPASS(n == 0);
 }
 
 /* SET_TCB_FIELD sent as a ULP command looks like this */
 #define LEN__SET_TCB_FIELD_ULP (sizeof(struct ulp_txpkt) + \
     sizeof(struct ulptx_idata) + sizeof(struct cpl_set_tcb_field_core))
 
 /* RX_DATA_ACK sent as a ULP command looks like this */
 #define LEN__RX_DATA_ACK_ULP (sizeof(struct ulp_txpkt) + \
     sizeof(struct ulptx_idata) + sizeof(struct cpl_rx_data_ack_core))
 
 /*
  * Write a no-reply CPL_SET_TCB_FIELD for toep->tid at 'ulpmc', wrapped as
  * a ULP_TX_PKT command with an immediate-data sub-command.  If the command
  * length is not a multiple of 16 bytes, a NOOP sub-command is appended as
  * padding.  Returns a pointer just past what was written, i.e. where the
  * next command goes.
  */
 static inline void *
 mk_set_tcb_field_ulp(struct ulp_txpkt *ulpmc, struct toepcb *toep,
     uint64_t word, uint64_t mask, uint64_t val)
 {
 	struct ulptx_idata *ulpsc;
 	struct cpl_set_tcb_field_core *req;
 
 	ulpmc->cmd_dest = htonl(V_ULPTX_CMD(ULP_TX_PKT) | V_ULP_TXPKT_DEST(0));
 	ulpmc->len = htobe32(howmany(LEN__SET_TCB_FIELD_ULP, 16));
 
 	ulpsc = (struct ulptx_idata *)(ulpmc + 1);
 	ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM));
 	ulpsc->len = htobe32(sizeof(*req));
 
 	req = (struct cpl_set_tcb_field_core *)(ulpsc + 1);
 	OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_SET_TCB_FIELD, toep->tid));
 	req->reply_ctrl = htobe16(V_NO_REPLY(1) |
 	    V_QUEUENO(toep->ofld_rxq->iq.abs_id));
 	req->word_cookie = htobe16(V_WORD(word) | V_COOKIE(0));
         req->mask = htobe64(mask);
         req->val = htobe64(val);
 
 	ulpsc = (struct ulptx_idata *)(req + 1);
 	if (LEN__SET_TCB_FIELD_ULP % 16) {
 		ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_NOOP));
 		ulpsc->len = htobe32(0);
 		return (ulpsc + 1);
 	}
 	return (ulpsc);
 }
 
 /*
  * Write a CPL_RX_DATA_ACK for toep->tid with F_RX_MODULATE_RX set at
  * 'ulpmc', wrapped as a ULP_TX_PKT command with an immediate-data
  * sub-command.  A NOOP sub-command is appended if the command length is
  * not a multiple of 16 bytes.  Returns a pointer just past what was
  * written.
  */
 static inline void *
 mk_rx_data_ack_ulp(struct ulp_txpkt *ulpmc, struct toepcb *toep)
 {
 	struct ulptx_idata *ulpsc;
 	struct cpl_rx_data_ack_core *req;
 
 	ulpmc->cmd_dest = htonl(V_ULPTX_CMD(ULP_TX_PKT) | V_ULP_TXPKT_DEST(0));
 	ulpmc->len = htobe32(howmany(LEN__RX_DATA_ACK_ULP, 16));
 
 	ulpsc = (struct ulptx_idata *)(ulpmc + 1);
 	ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM));
 	ulpsc->len = htobe32(sizeof(*req));
 
 	req = (struct cpl_rx_data_ack_core *)(ulpsc + 1);
 	OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tid));
 	req->credit_dack = htobe32(F_RX_MODULATE_RX);
 
 	ulpsc = (struct ulptx_idata *)(req + 1);
 	if (LEN__RX_DATA_ACK_ULP % 16) {
 		ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_NOOP));
 		ulpsc->len = htobe32(0);
 		return (ulpsc + 1);
 	}
 	return (ulpsc);
 }
 
 /*
  * Build (but do not send) the compound work request that programs DDP
  * buffer 'db_idx' (0 or 1) of a connection: the buffer's tag, its current
  * offset and total length taken from 'ps', and the DDP flags selected by
  * ddp_flags_mask/ddp_flags, followed by an RX_DATA_ACK.
  *
  * The work request is allocated on toep->ctrlq.  Returns NULL if that
  * allocation fails.
  */
 static struct wrqe *
 mk_update_tcb_for_ddp(struct adapter *sc, struct toepcb *toep, int db_idx,
     struct pageset *ps, int offset, uint64_t ddp_flags, uint64_t ddp_flags_mask)
 {
 	struct wrqe *wr;
 	struct work_request_hdr *wrh;
 	struct ulp_txpkt *ulpmc;
 	int len;
 
 	KASSERT(db_idx == 0 || db_idx == 1,
 	    ("%s: bad DDP buffer index %d", __func__, db_idx));
 
 	/*
 	 * We'll send a compound work request that has 3 SET_TCB_FIELDs and an
 	 * RX_DATA_ACK (with RX_MODULATE to speed up delivery).
 	 *
 	 * The work request header is 16B and always ends at a 16B boundary.
 	 * The ULPTX master commands that follow must all end at 16B boundaries
 	 * too so we round up the size to 16.
 	 */
 	len = sizeof(*wrh) + 3 * roundup2(LEN__SET_TCB_FIELD_ULP, 16) +
 	    roundup2(LEN__RX_DATA_ACK_ULP, 16);
 
 	wr = alloc_wrqe(len, toep->ctrlq);
 	if (wr == NULL)
 		return (NULL);
 	wrh = wrtod(wr);
 	INIT_ULPTX_WRH(wrh, len, 1, 0);	/* atomic */
 	ulpmc = (struct ulp_txpkt *)(wrh + 1);
 
 	/* Write the buffer's tag */
 	ulpmc = mk_set_tcb_field_ulp(ulpmc, toep,
 	    W_TCB_RX_DDP_BUF0_TAG + db_idx,
 	    V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG),
 	    V_TCB_RX_DDP_BUF0_TAG(ps->prsv.prsv_tag));
 
 	/* Update the current offset in the DDP buffer and its total length */
 	if (db_idx == 0)
 		ulpmc = mk_set_tcb_field_ulp(ulpmc, toep,
 		    W_TCB_RX_DDP_BUF0_OFFSET,
 		    V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |
 		    V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
 		    V_TCB_RX_DDP_BUF0_OFFSET(offset) |
 		    V_TCB_RX_DDP_BUF0_LEN(ps->len));
 	else
 		ulpmc = mk_set_tcb_field_ulp(ulpmc, toep,
 		    W_TCB_RX_DDP_BUF1_OFFSET,
 		    V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) |
 		    V_TCB_RX_DDP_BUF1_LEN((u64)M_TCB_RX_DDP_BUF1_LEN << 32),
 		    V_TCB_RX_DDP_BUF1_OFFSET(offset) |
 		    V_TCB_RX_DDP_BUF1_LEN((u64)ps->len << 32));
 
 	/* Update DDP flags */
 	ulpmc = mk_set_tcb_field_ulp(ulpmc, toep, W_TCB_RX_DDP_FLAGS,
 	    ddp_flags_mask, ddp_flags);
 
 	/* Gratuitous RX_DATA_ACK with RX_MODULATE set to speed up delivery. */
 	ulpmc = mk_rx_data_ack_ulp(ulpmc, toep);
 
 	return (wr);
 }
 
 /*
  * Process a DDP completion for the currently active DDP buffer.
  *
  * ddp_report selects the buffer (F_DDP_BUF_IDX) and must have F_DDP_INV
  * set.  rcv_nxt and len are interpreted as described in the comment in
  * the body.  The amount of data placed is credited to the AIO job
  * attached to the buffer.  The job is then completed here unless a cancel
  * is pending (completion is deferred to the TCB_RPL callback) or already
  * in progress (t4_aio_cancel_active() completes it).
  *
  * If the inp has been dropped the job is failed with ECONNRESET instead.
  * Always returns 0.
  */
 static int
 handle_ddp_data(struct toepcb *toep, __be32 ddp_report, __be32 rcv_nxt, int len)
 {
 	uint32_t report = be32toh(ddp_report);
 	unsigned int db_idx;
 	struct inpcb *inp = toep->inp;
 	struct ddp_buffer *db;
 	struct tcpcb *tp;
 	struct socket *so;
 	struct sockbuf *sb;
 	struct kaiocb *job;
 	long copied;
 
 	db_idx = report & F_DDP_BUF_IDX ? 1 : 0;
 
 	if (__predict_false(!(report & F_DDP_INV)))
 		CXGBE_UNIMPLEMENTED("DDP buffer still valid");
 
 	INP_WLOCK(inp);
 	so = inp_inpcbtosocket(inp);
 	sb = &so->so_rcv;
 	DDP_LOCK(toep);
 
 	KASSERT(toep->ddp.active_id == db_idx,
 	    ("completed DDP buffer (%d) != active_id (%d) for tid %d", db_idx,
 	    toep->ddp.active_id, toep->tid));
 	db = &toep->ddp.db[db_idx];
 	job = db->job;
 
-	if (__predict_false(inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT))) {
+	if (__predict_false(inp->inp_flags & INP_DROPPED)) {
 		/*
 		 * This can happen due to an administrative tcpdrop(8).
 		 * Just fail the request with ECONNRESET.
 		 */
 		CTR5(KTR_CXGBE, "%s: tid %u, seq 0x%x, len %d, inp_flags 0x%x",
 		    __func__, toep->tid, be32toh(rcv_nxt), len, inp->inp_flags);
 		if (aio_clear_cancel_function(job))
 			ddp_complete_one(job, ECONNRESET);
 		goto completed;
 	}
 
 	tp = intotcpcb(inp);
 
 	/*
 	 * For RX_DDP_COMPLETE, len will be zero and rcv_nxt is the
 	 * sequence number of the next byte to receive.  The length of
 	 * the data received for this message must be computed by
 	 * comparing the new and old values of rcv_nxt.
 	 *
 	 * For RX_DATA_DDP, len might be non-zero, but it is only the
 	 * length of the most recent DMA.  It does not include the
 	 * total length of the data received since the previous update
 	 * for this DDP buffer.  rcv_nxt is the sequence number of the
 	 * first received byte from the most recent DMA.
 	 */
 	len += be32toh(rcv_nxt) - tp->rcv_nxt;
 	tp->rcv_nxt += len;
 	tp->t_rcvtime = ticks;
 #ifndef USE_DDP_RX_FLOW_CONTROL
 	KASSERT(tp->rcv_wnd >= len, ("%s: negative window size", __func__));
 	tp->rcv_wnd -= len;
 #endif
 #ifdef VERBOSE_TRACES
 	CTR5(KTR_CXGBE, "%s: tid %u, DDP[%d] placed %d bytes (%#x)", __func__,
 	    toep->tid, db_idx, len, report);
 #endif
 
 	/* receive buffer autosize */
 	MPASS(toep->vnet == so->so_vnet);
 	CURVNET_SET(toep->vnet);
 	SOCKBUF_LOCK(sb);
 	if (sb->sb_flags & SB_AUTOSIZE &&
 	    V_tcp_do_autorcvbuf &&
 	    sb->sb_hiwat < V_tcp_autorcvbuf_max &&
 	    len > (sbspace(sb) / 8 * 7)) {
 		struct adapter *sc = td_adapter(toep->td);
 		unsigned int hiwat = sb->sb_hiwat;
 		unsigned int newsize = min(hiwat + sc->tt.autorcvbuf_inc,
 		    V_tcp_autorcvbuf_max);
 
 		/* Stop autosizing if the buffer cannot be grown. */
 		if (!sbreserve_locked(so, SO_RCV, newsize, NULL))
 			sb->sb_flags &= ~SB_AUTOSIZE;
 	}
 	SOCKBUF_UNLOCK(sb);
 	CURVNET_RESTORE();
 
 	job->msgrcv = 1;
 	if (db->cancel_pending) {
 		/*
 		 * Update the job's length but defer completion to the
 		 * TCB_RPL callback.
 		 */
 		job->aio_received += len;
 		goto out;
 	} else if (!aio_clear_cancel_function(job)) {
 		/*
 		 * Update the copied length for when
 		 * t4_aio_cancel_active() completes this request.
 		 */
 		job->aio_received += len;
 	} else {
 		copied = job->aio_received;
 #ifdef VERBOSE_TRACES
 		CTR5(KTR_CXGBE,
 		    "%s: tid %u, completing %p (copied %ld, placed %d)",
 		    __func__, toep->tid, job, copied, len);
 #endif
 		aio_complete(job, copied + len, 0);
 		t4_rcvd(&toep->td->tod, tp);
 	}
 
 completed:
 	/* Retire the buffer and let waiting jobs use the freed slot. */
 	complete_ddp_buffer(toep, db, db_idx);
 	if (toep->ddp.waiting_count > 0)
 		ddp_queue_toep(toep);
 out:
 	DDP_UNLOCK(toep);
 	INP_WUNLOCK(inp);
 
 	return (0);
 }
 
 /*
  * Process a DDP indicate for 'toep'.  The caller holds the DDP lock and
  * no DDP buffer may be active.  When AIO requests are still waiting the
  * toep is queued so that they get serviced.
  */
 void
 handle_ddp_indicate(struct toepcb *toep)
 {
 
 	DDP_ASSERT_LOCKED(toep);
 	MPASS(toep->ddp.active_count == 0);
 	MPASS((toep->ddp.flags & (DDP_BUF0_ACTIVE | DDP_BUF1_ACTIVE)) == 0);
 
 	/*
 	 * If nothing is waiting, the requests that asked for this indicate
 	 * were cancelled in the meantime.  Those cancels should have
 	 * disabled DDP already and the data is going into the socket
 	 * buffer anyway, so there is nothing to do.
 	 */
 	if (toep->ddp.waiting_count != 0) {
 		CTR3(KTR_CXGBE, "%s: tid %d indicated (%d waiting)", __func__,
 		    toep->tid, toep->ddp.waiting_count);
 		ddp_queue_toep(toep);
 	}
 }
 
 /*
  * do_ddp_tcb_rpl() turns a reply cookie into a DDP buffer index by
  * subtracting CPL_COOKIE_DDP0, so the two cookies must be consecutive.
  */
 CTASSERT(CPL_COOKIE_DDP0 + 1 == CPL_COOKIE_DDP1);
 
 /*
  * Handle a set-TCB reply (struct cpl_set_tcb_rpl) for a DDP connection.
  *
  * Only replies for W_TCB_RX_DDP_FLAGS that carry one of the two DDP
  * cookies are expected; any other cookie, or a status other than
  * CPL_ERR_NONE, panics.  The reply finishes the AIO job of the DDP
  * buffer whose cancellation was pending: the job is cancelled if it
  * has received no data and completed as a short read otherwise.
  *
  * Always returns 0.
  */
 static int
 do_ddp_tcb_rpl(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
 {
 	struct adapter *sc = iq->adapter;
 	const struct cpl_set_tcb_rpl *cpl = (const void *)(rss + 1);
 	unsigned int tid = GET_TID(cpl);
 	unsigned int db_idx;
 	struct toepcb *toep;
 	struct inpcb *inp;
 	struct ddp_buffer *db;
 	struct kaiocb *job;
 	long copied;
 
 	if (cpl->status != CPL_ERR_NONE)
 		panic("XXX: tcp_rpl failed: %d", cpl->status);
 
 	toep = lookup_tid(sc, tid);
 	inp = toep->inp;
 	switch (cpl->cookie) {
 	case V_WORD(W_TCB_RX_DDP_FLAGS) | V_COOKIE(CPL_COOKIE_DDP0):
 	case V_WORD(W_TCB_RX_DDP_FLAGS) | V_COOKIE(CPL_COOKIE_DDP1):
 		/*
 		 * XXX: This duplicates a lot of code with handle_ddp_data().
 		 */
 		/* The cookie identifies which of the two DDP buffers this is. */
 		db_idx = G_COOKIE(cpl->cookie) - CPL_COOKIE_DDP0;
 		MPASS(db_idx < nitems(toep->ddp.db));
 		/* Lock order: inp write lock first, then the DDP lock. */
 		INP_WLOCK(inp);
 		DDP_LOCK(toep);
 		db = &toep->ddp.db[db_idx];
 
 		/*
 		 * handle_ddp_data() should leave the job around until
 		 * this callback runs once a cancel is pending.
 		 */
 		MPASS(db != NULL);
 		MPASS(db->job != NULL);
 		MPASS(db->cancel_pending);
 
 		/*
 		 * XXX: It's not clear what happens if there is data
 		 * placed when the buffer is invalidated.  I suspect we
 		 * need to read the TCB to see how much data was placed.
 		 *
 		 * For now this just pretends like nothing was placed.
 		 *
 		 * XXX: Note that if we did check the PCB we would need to
 		 * also take care of updating the tp, etc.
 		 */
 		job = db->job;
 		copied = job->aio_received;
 		if (copied == 0) {
 			/* Nothing was received: report a plain cancellation. */
 			CTR2(KTR_CXGBE, "%s: cancelling %p", __func__, job);
 			aio_cancel(job);
 		} else {
 			/* Some data was received: finish as a short read. */
 			CTR3(KTR_CXGBE, "%s: completing %p (copied %ld)",
 			    __func__, job, copied);
 			aio_complete(job, copied, 0);
 			t4_rcvd(&toep->td->tod, intotcpcb(inp));
 		}
 
 		/* Retire the buffer and service any requests still waiting. */
 		complete_ddp_buffer(toep, db, db_idx);
 		if (toep->ddp.waiting_count > 0)
 			ddp_queue_toep(toep);
 		DDP_UNLOCK(toep);
 		INP_WUNLOCK(inp);
 		break;
 	default:
 		panic("XXX: unknown tcb_rpl offset %#x, cookie %#x",
 		    G_WORD(cpl->cookie), G_COOKIE(cpl->cookie));
 	}
 
 	return (0);
 }
 
 /*
  * Finish all DDP activity on a connection that is being closed.
  *
  * 'rcv_nxt' is the big-endian sequence number reported with the close; it
  * includes one byte for the FIN.  The data placed since tp->rcv_nxt is
  * credited to the active DDP buffers, each active job is completed (or
  * left for its cancel routine), and every queued request is then
  * completed with no error.
  *
  * The caller must hold the inp write lock and the DDP lock.
  */
 void
 handle_ddp_close(struct toepcb *toep, struct tcpcb *tp, __be32 rcv_nxt)
 {
 	struct ddp_buffer *db;
 	struct kaiocb *job;
 	long copied;
 	unsigned int db_idx;
 #ifdef INVARIANTS
 	unsigned int db_flag;
 #endif
 	int len, placed;
 
 	INP_WLOCK_ASSERT(toep->inp);
 	DDP_ASSERT_LOCKED(toep);
 
 	/* - 1 is to ignore the byte for FIN */
 	len = be32toh(rcv_nxt) - tp->rcv_nxt - 1;
 	tp->rcv_nxt += len;
 
 	/*
 	 * Walk the active buffers starting at active_id.
 	 * NOTE(review): this relies on complete_ddp_buffer() updating
 	 * active_count and active_id; that function is not visible here.
 	 */
 	while (toep->ddp.active_count > 0) {
 		MPASS(toep->ddp.active_id != -1);
 		db_idx = toep->ddp.active_id;
 #ifdef INVARIANTS
 		db_flag = db_idx == 1 ? DDP_BUF1_ACTIVE : DDP_BUF0_ACTIVE;
 #endif
 		MPASS((toep->ddp.flags & db_flag) != 0);
 		db = &toep->ddp.db[db_idx];
 		job = db->job;
 		copied = job->aio_received;
 		/* Credit this job with at most the room left in its buffer. */
 		placed = len;
 		if (placed > job->uaiocb.aio_nbytes - copied)
 			placed = job->uaiocb.aio_nbytes - copied;
 		if (placed > 0)
 			job->msgrcv = 1;
 		if (!aio_clear_cancel_function(job)) {
 			/*
 			 * Update the copied length for when
 			 * t4_aio_cancel_active() completes this
 			 * request.
 			 */
 			job->aio_received += placed;
 		} else {
 			CTR4(KTR_CXGBE, "%s: tid %d completed buf %d len %d",
 			    __func__, toep->tid, db_idx, placed);
 			aio_complete(job, copied + placed, 0);
 		}
 		len -= placed;
 		complete_ddp_buffer(toep, db, db_idx);
 	}
 
 	/* All placed data must have been attributed to some buffer. */
 	MPASS(len == 0);
 	ddp_complete_all(toep, 0);
 }
 
 /*
  * Error bits of the ddpvld word of a cpl_rx_data_ddp message.
  * do_rx_data_ddp() panics if any of them is set.
  */
 #define DDP_ERR (F_DDP_PPOD_MISMATCH | F_DDP_LLIMIT_ERR | F_DDP_ULIMIT_ERR |\
 	 F_DDP_PPOD_PARITY_ERR | F_DDP_PADDING_ERR | F_DDP_OFFSET_ERR |\
 	 F_DDP_INVALID_TAG | F_DDP_COLOR_ERR | F_DDP_TID_MISMATCH |\
 	 F_DDP_INVALID_PPOD | F_DDP_HDRCRC_ERR | F_DDP_DATACRC_ERR)
 
 /*
  * CPL handler table, defined elsewhere.  do_rx_data_ddp() uses it to
  * hand iSCSI connections to the CPL_RX_ISCSI_DDP handler.
  */
 extern cpl_handler_t t4_cpl_handler[];
 
 /*
  * Handler for cpl_rx_data_ddp messages.  Panics if the hardware reported
  * any DDP error.  iSCSI connections are passed to the CPL_RX_ISCSI_DDP
  * handler; everything else goes to handle_ddp_data().  Always returns 0.
  */
 static int
 do_rx_data_ddp(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
 {
 	const struct cpl_rx_data_ddp *cpl = (const void *)(rss + 1);
 	unsigned int tid = GET_TID(cpl);
 	struct toepcb *toep = lookup_tid(iq->adapter, tid);
 	uint32_t vld;
 
 	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
 	KASSERT(toep->tid == tid, ("%s: toep tid/atid mismatch", __func__));
 	KASSERT(!(toep->flags & TPF_SYNQE),
 	    ("%s: toep %p claims to be a synq entry", __func__, toep));
 
 	/* Any error bit in the validity word is fatal. */
 	vld = be32toh(cpl->ddpvld);
 	if (__predict_false((vld & DDP_ERR) != 0)) {
 		panic("%s: DDP error 0x%x (tid %d, toep %p)",
 		    __func__, vld, tid, toep);
 	}
 
 	if (ulp_mode(toep) != ULP_MODE_ISCSI) {
 		handle_ddp_data(toep, cpl->u.ddp_report, cpl->seq,
 		    be16toh(cpl->len));
 	} else {
 		/* iSCSI has its own handler for this message. */
 		t4_cpl_handler[CPL_RX_ISCSI_DDP](iq, rss, m);
 	}
 
 	return (0);
 }
 
 /*
  * Handler for cpl_rx_ddp_complete messages.  The report and the new
  * rcv_nxt are passed to handle_ddp_data() with a length of zero.
  * Always returns 0.
  */
 static int
 do_rx_ddp_complete(struct sge_iq *iq, const struct rss_header *rss,
     struct mbuf *m)
 {
 	const struct cpl_rx_ddp_complete *cpl = (const void *)(rss + 1);
 	unsigned int tid = GET_TID(cpl);
 	struct toepcb *toep = lookup_tid(iq->adapter, tid);
 
 	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
 	KASSERT(toep->tid == tid, ("%s: toep tid/atid mismatch", __func__));
 	KASSERT(!(toep->flags & TPF_SYNQE),
 	    ("%s: toep %p claims to be a synq entry", __func__, toep));
 
 	handle_ddp_data(toep, cpl->ddp_report, cpl->rcv_nxt, 0);
 
 	return (0);
 }
 
 static void
 enable_ddp(struct adapter *sc, struct toepcb *toep)
 {
 
 	KASSERT((toep->ddp.flags & (DDP_ON | DDP_OK | DDP_SC_REQ)) == DDP_OK,
 	    ("%s: toep %p has bad ddp_flags 0x%x",
 	    __func__, toep, toep->ddp.flags));
 
 	CTR3(KTR_CXGBE, "%s: tid %u (time %u)",
 	    __func__, toep->tid, time_uptime);
 
 	DDP_ASSERT_LOCKED(toep);
 	toep->ddp.flags |= DDP_SC_REQ;
 	t4_set_tcb_field(sc, toep->ctrlq, toep, W_TCB_RX_DDP_FLAGS,
 	    V_TF_DDP_OFF(1) | V_TF_DDP_INDICATE_OUT(1) |
 	    V_TF_DDP_BUF0_INDICATE(1) | V_TF_DDP_BUF1_INDICATE(1) |
 	    V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_BUF1_VALID(1),
 	    V_TF_DDP_BUF0_INDICATE(1) | V_TF_DDP_BUF1_INDICATE(1), 0, 0);
 	t4_set_tcb_field(sc, toep->ctrlq, toep, W_TCB_T_FLAGS,
 	    V_TF_RCV_COALESCE_ENABLE(1), 0, 0, 0);
 }
 
 /*
  * Return the highest common factor of n1 and n2 using Euclid's
  * algorithm.  If one argument is zero the other is returned.
  */
 static int
 calculate_hcf(int n1, int n2)
 {
 	int lo = (n1 <= n2) ? n1 : n2;
 	int hi = (n1 <= n2) ? n2 : n1;
 
 	/* Replace (lo, hi) with (hi mod lo, lo) until lo is exhausted. */
 	for (; lo != 0; ) {
 		int rem = hi % lo;
 
 		hi = lo;
 		lo = rem;
 	}
 
 	return (hi);
 }
 
 /*
  * Number of page pods needed for 'npages' VM pages when the DDP page
  * size is (1 << ddp_page_shift).  A DDP page is never smaller than a VM
  * page.
  */
 static inline int
 pages_to_nppods(int npages, int ddp_page_shift)
 {
 	int ddp_pages;
 
 	MPASS(ddp_page_shift >= PAGE_SHIFT);
 
 	/* Convert VM pages to DDP pages, then round up to whole pods. */
 	ddp_pages = npages >> (ddp_page_shift - PAGE_SHIFT);
 
 	return (howmany(ddp_pages, PPOD_PAGES));
 }
 
 /*
  * Reserve 'nppods' page pods from the region's arena and record the
  * result in 'prsv'.  The reservation tag combines the page size index
  * with the allocated offset.  Returns 0, or ENOMEM if the arena cannot
  * satisfy the request without sleeping.
  */
 static int
 alloc_page_pods(struct ppod_region *pr, u_int nppods, u_int pgsz_idx,
     struct ppod_reservation *prsv)
 {
 	vmem_addr_t addr;	/* offset from the start of the region */
 	int rc;
 
 	rc = vmem_alloc(pr->pr_arena, PPOD_SZ(nppods), M_NOWAIT | M_FIRSTFIT,
 	    &addr);
 	if (rc != 0)
 		return (ENOMEM);
 
 #ifdef VERBOSE_TRACES
 	CTR5(KTR_CXGBE, "%-17s arena %p, addr 0x%08x, nppods %d, pgsz %d",
 	    __func__, pr->pr_arena, (uint32_t)addr & pr->pr_tag_mask,
 	    nppods, 1 << pr->pr_page_shift[pgsz_idx]);
 #endif
 
 	/*
 	 * The arena only holds valid values, whereas the hardware tagmask
 	 * has one extra "invalid" bit.  Anything allocated here therefore
 	 * fits in the tagmask and has the invalid bit clear.
 	 */
 	MPASS((addr & pr->pr_tag_mask) == addr);
 	MPASS((addr & pr->pr_invalid_bit) == 0);
 
 	prsv->prsv_nppods = nppods;
 	prsv->prsv_tag = V_PPOD_PGSZ(pgsz_idx) | addr;
 	prsv->prsv_pr = pr;
 
 	return (0);
 }
 
 /*
  * Reserve page pods for an array of 'npages' VM pages.  The DDP page
  * size is chosen from the region's four sizes based on how the pages
  * group into physically contiguous runs.  Returns 0 or ENOMEM.
  */
 static int
 t4_alloc_page_pods_for_vmpages(struct ppod_region *pr, vm_page_t *pages,
     int npages, struct ppod_reservation *prsv)
 {
 	int i, hcf, run, idx, nppods;
 
 	/*
 	 * DDP pages need not be the size of VM pages.  Physically
 	 * contiguous VM pages are merged into runs, and the DDP page size
 	 * used is the largest of the four sizes in A_ULP_RX_TDDP_PSZ that
 	 * evenly divides the HCF of all run lengths.
 	 */
 	hcf = 0;
 	i = 0;
 	while (i < npages) {
 		/* Grow the run while the next page is physically adjacent. */
 		run = PAGE_SIZE;
 		for (; i + 1 < npages; i++) {
 			if (VM_PAGE_TO_PHYS(pages[i]) + PAGE_SIZE !=
 			    VM_PAGE_TO_PHYS(pages[i + 1]))
 				break;
 			run += PAGE_SIZE;
 		}
 		i++;
 
 		hcf = calculate_hcf(hcf, run);
 		if (hcf < (1 << pr->pr_page_shift[1])) {
 			/* Only the smallest size can work; stop scanning. */
 			idx = 0;
 			goto have_pgsz;
 		}
 	}
 
 #define PR_PAGE_MASK(x) ((1 << pr->pr_page_shift[(x)]) - 1)
 	MPASS((hcf & PR_PAGE_MASK(0)) == 0); /* PAGE_SIZE is >= 4K everywhere */
 	/* Step down from the largest size to the first one dividing hcf. */
 	idx = nitems(pr->pr_page_shift) - 1;
 	while (idx > 0 && (hcf & PR_PAGE_MASK(idx)) != 0)
 		idx--;
 #undef PR_PAGE_MASK
 
 have_pgsz:
 	MPASS(idx <= M_PPOD_PGSZ);
 
 	nppods = pages_to_nppods(npages, pr->pr_page_shift[idx]);
 	if (alloc_page_pods(pr, nppods, idx, prsv) != 0)
 		return (ENOMEM);
 	MPASS(prsv->prsv_nppods > 0);
 
 	return (0);
 }
 
 /*
  * Reserve page pods for the pages of a pageset.  The pageset must not
  * already hold a reservation.  Returns 0 or ENOMEM.
  */
 int
 t4_alloc_page_pods_for_ps(struct ppod_region *pr, struct pageset *ps)
 {
 
 	KASSERT(ps->prsv.prsv_nppods == 0,
 	    ("%s: page pods already allocated", __func__));
 
 	return (t4_alloc_page_pods_for_vmpages(pr, ps->pages, ps->npages,
 	    &ps->prsv));
 }
 
 int
 t4_alloc_page_pods_for_bio(struct ppod_region *pr, struct bio *bp,
     struct ppod_reservation *prsv)
 {
 
 	MPASS(bp->bio_flags & BIO_UNMAPPED);
 
 	return (t4_alloc_page_pods_for_vmpages(pr, bp->bio_ma, bp->bio_ma_n,
 	    prsv));
 }
 
 /*
  * Reserve page pods for the 'len' byte buffer at virtual address 'buf'.
  * Physical addresses are looked up with pmap_kextract().  The DDP page
  * size is picked from the region's four sizes and stored, together
  * with the arena offset, in 'prsv'.  Returns 0 or ENOMEM.
  */
 int
 t4_alloc_page_pods_for_buf(struct ppod_region *pr, vm_offset_t buf, int len,
     struct ppod_reservation *prsv)
 {
 	int hcf, seglen, idx, npages, nppods;
 	uintptr_t start_pva, end_pva, pva, p1;
 
 	MPASS(buf > 0);
 	MPASS(len > 0);
 
 	/*
 	 * The DDP page size is unrelated to the VM page size.  We combine
 	 * contiguous physical pages into larger segments to get the best DDP
 	 * page size possible.  This is the largest of the four sizes in
 	 * A_ULP_RX_ISCSI_PSZ that evenly divides the HCF of the segment sizes
 	 * in the page list.
 	 */
 	hcf = 0;
 	/* Page-aligned addresses of the first and the last page touched. */
 	start_pva = trunc_page(buf);
 	end_pva = trunc_page(buf + len - 1);
 	pva = start_pva;
 	while (pva <= end_pva) {
 		/* Measure one physically contiguous segment starting at pva. */
 		seglen = PAGE_SIZE;
 		p1 = pmap_kextract(pva);
 		pva += PAGE_SIZE;
 		while (pva <= end_pva && p1 + seglen == pmap_kextract(pva)) {
 			seglen += PAGE_SIZE;
 			pva += PAGE_SIZE;
 		}
 
 		hcf = calculate_hcf(hcf, seglen);
 		if (hcf < (1 << pr->pr_page_shift[1])) {
 			idx = 0;
 			goto have_pgsz;	/* give up, short circuit */
 		}
 	}
 
 #define PR_PAGE_MASK(x) ((1 << pr->pr_page_shift[(x)]) - 1)
 	MPASS((hcf & PR_PAGE_MASK(0)) == 0); /* PAGE_SIZE is >= 4K everywhere */
 	/* Search from the largest size down for one that divides hcf. */
 	for (idx = nitems(pr->pr_page_shift) - 1; idx > 0; idx--) {
 		if ((hcf & PR_PAGE_MASK(idx)) == 0)
 			break;
 	}
 #undef PR_PAGE_MASK
 
 have_pgsz:
 	MPASS(idx <= M_PPOD_PGSZ);
 
 	/* Count of DDP pages: the first one plus one per DDP page step. */
 	npages = 1;
 	npages += (end_pva - start_pva) >> pr->pr_page_shift[idx];
 	nppods = howmany(npages, PPOD_PAGES);
 	if (alloc_page_pods(pr, nppods, idx, prsv) != 0)
 		return (ENOMEM);
 	MPASS(prsv->prsv_nppods > 0);
 
 	return (0);
 }
 
 /*
  * Reserve page pods for a scatter/gather list of 'entries' virtual
  * buffers.  Physical addresses are looked up with pmap_kextract().  One
  * DDP page size is chosen for the whole list and the page counts of all
  * entries are added up to size the reservation.  Returns 0 or ENOMEM.
  */
 int
 t4_alloc_page_pods_for_sgl(struct ppod_region *pr, struct ctl_sg_entry *sgl,
     int entries, struct ppod_reservation *prsv)
 {
 	int hcf, seglen, idx = 0, npages, nppods, i, len;
 	uintptr_t start_pva, end_pva, pva, p1 ;
 	vm_offset_t buf;
 	struct ctl_sg_entry *sge;
 
 	MPASS(entries > 0);
 	MPASS(sgl);
 
 	/*
 	 * The DDP page size is unrelated to the VM page size.  We combine
 	 * contiguous physical pages into larger segments to get the best DDP
 	 * page size possible.  This is the largest of the four sizes in
 	 * A_ULP_RX_ISCSI_PSZ that evenly divides the HCF of the segment sizes
 	 * in the page list.
 	 */
 	hcf = 0;
 	/* The entries are scanned from the last one to the first. */
 	for (i = entries - 1; i >= 0; i--) {
 		sge = sgl + i;
 		buf = (vm_offset_t)sge->addr;
 		len = sge->len;
 		start_pva = trunc_page(buf);
 		end_pva = trunc_page(buf + len - 1);
 		pva = start_pva;
 		while (pva <= end_pva) {
 			/* Measure one physically contiguous segment. */
 			seglen = PAGE_SIZE;
 			p1 = pmap_kextract(pva);
 			pva += PAGE_SIZE;
 			while (pva <= end_pva && p1 + seglen ==
 			    pmap_kextract(pva)) {
 				seglen += PAGE_SIZE;
 				pva += PAGE_SIZE;
 			}
 
 			hcf = calculate_hcf(hcf, seglen);
 			if (hcf < (1 << pr->pr_page_shift[1])) {
 				idx = 0;
 				goto have_pgsz; /* give up, short circuit */
 			}
 		}
 	}
 #define PR_PAGE_MASK(x) ((1 << pr->pr_page_shift[(x)]) - 1)
 	MPASS((hcf & PR_PAGE_MASK(0)) == 0); /* PAGE_SIZE is >= 4K everywhere */
 	/* Search from the largest size down for one that divides hcf. */
 	for (idx = nitems(pr->pr_page_shift) - 1; idx > 0; idx--) {
 		if ((hcf & PR_PAGE_MASK(idx)) == 0)
 			break;
 	}
 #undef PR_PAGE_MASK
 
 have_pgsz:
 	MPASS(idx <= M_PPOD_PGSZ);
 
 	/*
 	 * Add up the DDP pages of every entry.  This loop consumes the
 	 * 'entries' and 'sgl' arguments.
 	 */
 	npages = 0;
 	while (entries--) {
 		npages++;
 		start_pva = trunc_page((vm_offset_t)sgl->addr);
 		end_pva = trunc_page((vm_offset_t)sgl->addr + sgl->len - 1);
 		npages += (end_pva - start_pva) >> pr->pr_page_shift[idx];
 		sgl = sgl + 1;
 	}
 	nppods = howmany(npages, PPOD_PAGES);
 	if (alloc_page_pods(pr, nppods, idx, prsv) != 0)
 		return (ENOMEM);
 	MPASS(prsv->prsv_nppods > 0);
 	return (0);
 }
 
 /*
  * Return the page pods of a reservation to the arena of the region they
  * came from and mark the reservation empty (prsv_nppods = 0).  The
  * reservation must be non-NULL and must currently hold page pods.
  */
 void
 t4_free_page_pods(struct ppod_reservation *prsv)
 {
 	struct ppod_region *pr;
 	vmem_addr_t addr;
 
 	/* Check the reservation before anything is read through it. */
 	MPASS(prsv != NULL);
 	MPASS(prsv->prsv_nppods != 0);
 
 	pr = prsv->prsv_pr;
 
 	/* The arena offset is the tag without the page size bits. */
 	addr = prsv->prsv_tag & pr->pr_tag_mask;
 	MPASS((addr & pr->pr_invalid_bit) == 0);
 
 #ifdef VERBOSE_TRACES
 	CTR4(KTR_CXGBE, "%-17s arena %p, addr 0x%08x, nppods %d", __func__,
 	    pr->pr_arena, addr, prsv->prsv_nppods);
 #endif
 
 	vmem_free(pr->pr_arena, addr, PPOD_SZ(prsv->prsv_nppods));
 	prsv->prsv_nppods = 0;
 }
 
 /*
  * Upper bound on the number of page pods carried as immediate data in
  * one work request; the t4_write_page_pods_for_*() loops cap each
  * chunk at this value.
  * NOTE(review): 256 is presumably the immediate data limit in bytes --
  * confirm against the hardware documentation.
  */
 #define NUM_ULP_TX_SC_IMM_PPODS (256 / PPOD_SIZE)
 
 /*
  * Write the page pods describing pageset 'ps' for connection 'tid'.
  *
  * The pods are sent in chunks of at most NUM_ULP_TX_SC_IMM_PPODS as
  * ULP_TX_MEM_WRITE work requests on 'wrq'.  On success the pageset is
  * flagged PS_PPODS_WRITTEN and 0 is returned.  ENOMEM is returned if a
  * work request cannot be allocated; requests already sent are not
  * undone.
  */
 int
 t4_write_page_pods_for_ps(struct adapter *sc, struct sge_wrq *wrq, int tid,
     struct pageset *ps)
 {
 	struct wrqe *wr;
 	struct ulp_mem_io *ulpmc;
 	struct ulptx_idata *ulpsc;
 	struct pagepod *ppod;
 	int i, j, k, n, chunk, len, ddp_pgsz, idx;
 	u_int ppod_addr;
 	uint32_t cmd;
 	struct ppod_reservation *prsv = &ps->prsv;
 	struct ppod_region *pr = prsv->prsv_pr;
 	vm_paddr_t pa;
 
 	KASSERT(!(ps->flags & PS_PPODS_WRITTEN),
 	    ("%s: page pods already written", __func__));
 	MPASS(prsv->prsv_nppods > 0);
 
 	/* The command word differs between T4 and later chips. */
 	cmd = htobe32(V_ULPTX_CMD(ULP_TX_MEM_WRITE));
 	if (is_t4(sc))
 		cmd |= htobe32(F_ULP_MEMIO_ORDER);
 	else
 		cmd |= htobe32(F_T5_ULP_MEMIO_IMM);
 	/* DDP page size in bytes and address of the first reserved pod. */
 	ddp_pgsz = 1 << pr->pr_page_shift[G_PPOD_PGSZ(prsv->prsv_tag)];
 	ppod_addr = pr->pr_start + (prsv->prsv_tag & pr->pr_tag_mask);
 	/* 'i' counts pods written so far; the inner loop advances it. */
 	for (i = 0; i < prsv->prsv_nppods; ppod_addr += chunk) {
 
 		/* How many page pods are we writing in this cycle */
 		n = min(prsv->prsv_nppods - i, NUM_ULP_TX_SC_IMM_PPODS);
 		chunk = PPOD_SZ(n);
 		len = roundup2(sizeof(*ulpmc) + sizeof(*ulpsc) + chunk, 16);
 
 		wr = alloc_wrqe(len, wrq);
 		if (wr == NULL)
 			return (ENOMEM);	/* ok to just bail out */
 		ulpmc = wrtod(wr);
 
 		/*
 		 * NOTE(review): the data length (chunk / 32) and the address
 		 * (ppod_addr >> 5) both look like 32-byte units -- confirm.
 		 */
 		INIT_ULPTX_WR(ulpmc, len, 0, 0);
 		ulpmc->cmd = cmd;
 		ulpmc->dlen = htobe32(V_ULP_MEMIO_DATA_LEN(chunk / 32));
 		ulpmc->len16 = htobe32(howmany(len - sizeof(ulpmc->wr), 16));
 		ulpmc->lock_addr = htobe32(V_ULP_MEMIO_ADDR(ppod_addr >> 5));
 
 		/* Immediate data sub-command, followed by the pods. */
 		ulpsc = (struct ulptx_idata *)(ulpmc + 1);
 		ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM));
 		ulpsc->len = htobe32(chunk);
 
 		ppod = (struct pagepod *)(ulpsc + 1);
 		for (j = 0; j < n; i++, j++, ppod++) {
 			ppod->vld_tid_pgsz_tag_color = htobe64(F_PPOD_VALID |
 			    V_PPOD_TID(tid) | prsv->prsv_tag);
 			ppod->len_offset = htobe64(V_PPOD_LEN(ps->len) |
 			    V_PPOD_OFST(ps->offset));
 			ppod->rsvd = 0;
 			/*
 			 * Index of the first VM page described by pod i;
 			 * consecutive pods start PPOD_PAGES DDP pages apart.
 			 */
 			idx = i * PPOD_PAGES * (ddp_pgsz / PAGE_SIZE);
 			for (k = 0; k < nitems(ppod->addr); k++) {
 				if (idx < ps->npages) {
 					pa = VM_PAGE_TO_PHYS(ps->pages[idx]);
 					ppod->addr[k] = htobe64(pa);
 					idx += ddp_pgsz / PAGE_SIZE;
 				} else
 					ppod->addr[k] = 0;
 #if 0
 				CTR5(KTR_CXGBE,
 				    "%s: tid %d ppod[%d]->addr[%d] = %p",
 				    __func__, tid, i, k,
 				    be64toh(ppod->addr[k]));
 #endif
 			}
 
 		}
 
 		t4_wrq_tx(sc, wr);
 	}
 	ps->flags |= PS_PPODS_WRITTEN;
 
 	return (0);
 }
 
 /*
  * Allocate, without sleeping, an mbuf that holds a raw work request of
  * 'len' bytes.  A plain header mbuf is used when the request fits in
  * MHLEN and a cluster when it fits in MCLBYTES.  Returns NULL if the
  * request is larger than that or if the allocation fails.
  */
 static struct mbuf *
 alloc_raw_wr_mbuf(int len)
 {
 	struct mbuf *m;
 
 	if (len <= MHLEN)
 		m = m_gethdr(M_NOWAIT, MT_DATA);
 	else if (len <= MCLBYTES)
 		m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
 	else
 		return (NULL);
 
 	if (m != NULL) {
 		m->m_pkthdr.len = len;
 		m->m_len = len;
 		set_mbuf_raw_wr(m, true);
 	}
 
 	return (m);
 }
 
 /*
  * Build the work requests that write the page pods of reservation
  * 'prsv' for the pages of the unmapped bio 'bp'.
  *
  * Each request is built in its own mbuf and appended to 'wrq'; nothing
  * is transmitted here.  Returns 0 on success or ENOMEM if an mbuf
  * cannot be allocated, in which case mbufs enqueued earlier are left on
  * 'wrq'.
  */
 int
 t4_write_page_pods_for_bio(struct adapter *sc, struct toepcb *toep,
     struct ppod_reservation *prsv, struct bio *bp, struct mbufq *wrq)
 {
 	struct ulp_mem_io *ulpmc;
 	struct ulptx_idata *ulpsc;
 	struct pagepod *ppod;
 	int i, j, k, n, chunk, len, ddp_pgsz, idx;
 	u_int ppod_addr;
 	uint32_t cmd;
 	struct ppod_region *pr = prsv->prsv_pr;
 	vm_paddr_t pa;
 	struct mbuf *m;
 
 	MPASS(bp->bio_flags & BIO_UNMAPPED);
 
 	/* The command word differs between T4 and later chips. */
 	cmd = htobe32(V_ULPTX_CMD(ULP_TX_MEM_WRITE));
 	if (is_t4(sc))
 		cmd |= htobe32(F_ULP_MEMIO_ORDER);
 	else
 		cmd |= htobe32(F_T5_ULP_MEMIO_IMM);
 	/* DDP page size in bytes and address of the first reserved pod. */
 	ddp_pgsz = 1 << pr->pr_page_shift[G_PPOD_PGSZ(prsv->prsv_tag)];
 	ppod_addr = pr->pr_start + (prsv->prsv_tag & pr->pr_tag_mask);
 	/* 'i' counts pods written so far; the inner loop advances it. */
 	for (i = 0; i < prsv->prsv_nppods; ppod_addr += chunk) {
 
 		/* How many page pods are we writing in this cycle */
 		n = min(prsv->prsv_nppods - i, NUM_ULP_TX_SC_IMM_PPODS);
 		MPASS(n > 0);
 		chunk = PPOD_SZ(n);
 		len = roundup2(sizeof(*ulpmc) + sizeof(*ulpsc) + chunk, 16);
 
 		m = alloc_raw_wr_mbuf(len);
 		if (m == NULL)
 			return (ENOMEM);
 
 		/*
 		 * NOTE(review): the data length (chunk / 32) and the address
 		 * (ppod_addr >> 5) both look like 32-byte units -- confirm.
 		 */
 		ulpmc = mtod(m, struct ulp_mem_io *);
 		INIT_ULPTX_WR(ulpmc, len, 0, toep->tid);
 		ulpmc->cmd = cmd;
 		ulpmc->dlen = htobe32(V_ULP_MEMIO_DATA_LEN(chunk / 32));
 		ulpmc->len16 = htobe32(howmany(len - sizeof(ulpmc->wr), 16));
 		ulpmc->lock_addr = htobe32(V_ULP_MEMIO_ADDR(ppod_addr >> 5));
 
 		/* Immediate data sub-command, followed by the pods. */
 		ulpsc = (struct ulptx_idata *)(ulpmc + 1);
 		ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM));
 		ulpsc->len = htobe32(chunk);
 
 		ppod = (struct pagepod *)(ulpsc + 1);
 		for (j = 0; j < n; i++, j++, ppod++) {
 			/* The page size bits of the tag are masked out here. */
 			ppod->vld_tid_pgsz_tag_color = htobe64(F_PPOD_VALID |
 			    V_PPOD_TID(toep->tid) |
 			    (prsv->prsv_tag & ~V_PPOD_PGSZ(M_PPOD_PGSZ)));
 			ppod->len_offset = htobe64(V_PPOD_LEN(bp->bio_bcount) |
 			    V_PPOD_OFST(bp->bio_ma_offset));
 			ppod->rsvd = 0;
 			/*
 			 * Index of the first VM page described by pod i;
 			 * consecutive pods start PPOD_PAGES DDP pages apart.
 			 */
 			idx = i * PPOD_PAGES * (ddp_pgsz / PAGE_SIZE);
 			for (k = 0; k < nitems(ppod->addr); k++) {
 				if (idx < bp->bio_ma_n) {
 					pa = VM_PAGE_TO_PHYS(bp->bio_ma[idx]);
 					ppod->addr[k] = htobe64(pa);
 					idx += ddp_pgsz / PAGE_SIZE;
 				} else
 					ppod->addr[k] = 0;
 #if 0
 				CTR5(KTR_CXGBE,
 				    "%s: tid %d ppod[%d]->addr[%d] = %p",
 				    __func__, toep->tid, i, k,
 				    be64toh(ppod->addr[k]));
 #endif
 			}
 		}
 
 		mbufq_enqueue(wrq, m);
 	}
 
 	return (0);
 }
 
 /*
  * Build the work requests that write the page pods of reservation
  * 'prsv' for the 'buflen' byte buffer at virtual address 'buf'.
  * Physical addresses are looked up with pmap_kextract().
  *
  * Each request is built in its own mbuf and appended to 'wrq'; nothing
  * is transmitted here.  Returns 0 on success or ENOMEM if an mbuf
  * cannot be allocated, in which case mbufs enqueued earlier are left on
  * 'wrq'.
  */
 int
 t4_write_page_pods_for_buf(struct adapter *sc, struct toepcb *toep,
     struct ppod_reservation *prsv, vm_offset_t buf, int buflen,
     struct mbufq *wrq)
 {
 	struct ulp_mem_io *ulpmc;
 	struct ulptx_idata *ulpsc;
 	struct pagepod *ppod;
 	int i, j, k, n, chunk, len, ddp_pgsz;
 	u_int ppod_addr, offset;
 	uint32_t cmd;
 	struct ppod_region *pr = prsv->prsv_pr;
 	uintptr_t end_pva, pva;
 	vm_paddr_t pa;
 	struct mbuf *m;
 
 	/* The command word differs between T4 and later chips. */
 	cmd = htobe32(V_ULPTX_CMD(ULP_TX_MEM_WRITE));
 	if (is_t4(sc))
 		cmd |= htobe32(F_ULP_MEMIO_ORDER);
 	else
 		cmd |= htobe32(F_T5_ULP_MEMIO_IMM);
 	ddp_pgsz = 1 << pr->pr_page_shift[G_PPOD_PGSZ(prsv->prsv_tag)];
 	/* Offset of the buffer within its first VM page. */
 	offset = buf & PAGE_MASK;
 	ppod_addr = pr->pr_start + (prsv->prsv_tag & pr->pr_tag_mask);
 	/* Page-aligned addresses of the first and the last page touched. */
 	pva = trunc_page(buf);
 	end_pva = trunc_page(buf + buflen - 1);
 	/* 'i' counts pods written so far; the inner loop advances it. */
 	for (i = 0; i < prsv->prsv_nppods; ppod_addr += chunk) {
 
 		/* How many page pods are we writing in this cycle */
 		n = min(prsv->prsv_nppods - i, NUM_ULP_TX_SC_IMM_PPODS);
 		MPASS(n > 0);
 		chunk = PPOD_SZ(n);
 		len = roundup2(sizeof(*ulpmc) + sizeof(*ulpsc) + chunk, 16);
 
 		m = alloc_raw_wr_mbuf(len);
 		if (m == NULL)
 			return (ENOMEM);
 		ulpmc = mtod(m, struct ulp_mem_io *);
 
 		/*
 		 * NOTE(review): the data length (chunk / 32) and the address
 		 * (ppod_addr >> 5) both look like 32-byte units -- confirm.
 		 */
 		INIT_ULPTX_WR(ulpmc, len, 0, toep->tid);
 		ulpmc->cmd = cmd;
 		ulpmc->dlen = htobe32(V_ULP_MEMIO_DATA_LEN(chunk / 32));
 		ulpmc->len16 = htobe32(howmany(len - sizeof(ulpmc->wr), 16));
 		ulpmc->lock_addr = htobe32(V_ULP_MEMIO_ADDR(ppod_addr >> 5));
 
 		/* Immediate data sub-command, followed by the pods. */
 		ulpsc = (struct ulptx_idata *)(ulpmc + 1);
 		ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM));
 		ulpsc->len = htobe32(chunk);
 
 		ppod = (struct pagepod *)(ulpsc + 1);
 		for (j = 0; j < n; i++, j++, ppod++) {
 			/* The page size bits of the tag are masked out here. */
 			ppod->vld_tid_pgsz_tag_color = htobe64(F_PPOD_VALID |
 			    V_PPOD_TID(toep->tid) |
 			    (prsv->prsv_tag & ~V_PPOD_PGSZ(M_PPOD_PGSZ)));
 			ppod->len_offset = htobe64(V_PPOD_LEN(buflen) |
 			    V_PPOD_OFST(offset));
 			ppod->rsvd = 0;
 
 			/* One address per DDP page; zero once past the end. */
 			for (k = 0; k < nitems(ppod->addr); k++) {
 				if (pva > end_pva)
 					ppod->addr[k] = 0;
 				else {
 					pa = pmap_kextract(pva);
 					ppod->addr[k] = htobe64(pa);
 					pva += ddp_pgsz;
 				}
 #if 0
 				CTR5(KTR_CXGBE,
 				    "%s: tid %d ppod[%d]->addr[%d] = %p",
 				    __func__, toep->tid, i, k,
 				    be64toh(ppod->addr[k]));
 #endif
 			}
 
 			/*
 			 * Walk back 1 segment so that the first address in the
 			 * next pod is the same as the last one in the current
 			 * pod.
 			 */
 			pva -= ddp_pgsz;
 		}
 
 		mbufq_enqueue(wrq, m);
 	}
 
 	MPASS(pva <= end_pva);
 
 	return (0);
 }
 
 /*
  * Build the work requests that write the page pods of reservation
  * 'prsv' for a scatter/gather list of 'entries' virtual buffers that
  * together carry 'xferlen' bytes.  Physical addresses are looked up
  * with pmap_kextract().
  *
  * Each request is built in its own mbuf and appended to 'wrq'; nothing
  * is transmitted here.  The 'sgl' and 'entries' arguments are consumed
  * as the list is walked.  Returns 0 on success or ENOMEM if an mbuf
  * cannot be allocated, in which case mbufs enqueued earlier are left on
  * 'wrq'.
  */
 int
 t4_write_page_pods_for_sgl(struct adapter *sc, struct toepcb *toep,
     struct ppod_reservation *prsv, struct ctl_sg_entry *sgl, int entries,
     int xferlen, struct mbufq *wrq)
 {
 	struct ulp_mem_io *ulpmc;
 	struct ulptx_idata *ulpsc;
 	struct pagepod *ppod;
 	int i, j, k, n, chunk, len, ddp_pgsz;
 	u_int ppod_addr, offset, sg_offset = 0;
 	uint32_t cmd;
 	struct ppod_region *pr = prsv->prsv_pr;
 	uintptr_t pva;
 	vm_paddr_t pa;
 	struct mbuf *m;
 
 	MPASS(sgl != NULL);
 	MPASS(entries > 0);
 	/* The command word differs between T4 and later chips. */
 	cmd = htobe32(V_ULPTX_CMD(ULP_TX_MEM_WRITE));
 	if (is_t4(sc))
 		cmd |= htobe32(F_ULP_MEMIO_ORDER);
 	else
 		cmd |= htobe32(F_T5_ULP_MEMIO_IMM);
 	ddp_pgsz = 1 << pr->pr_page_shift[G_PPOD_PGSZ(prsv->prsv_tag)];
 	/* Offset of the first entry within its first VM page. */
 	offset = (vm_offset_t)sgl->addr & PAGE_MASK;
 	ppod_addr = pr->pr_start + (prsv->prsv_tag & pr->pr_tag_mask);
 	pva = trunc_page((vm_offset_t)sgl->addr);
 	/* 'i' counts pods written so far; the inner loop advances it. */
 	for (i = 0; i < prsv->prsv_nppods; ppod_addr += chunk) {
 
 		/* How many page pods are we writing in this cycle */
 		n = min(prsv->prsv_nppods - i, NUM_ULP_TX_SC_IMM_PPODS);
 		MPASS(n > 0);
 		chunk = PPOD_SZ(n);
 		len = roundup2(sizeof(*ulpmc) + sizeof(*ulpsc) + chunk, 16);
 
 		m = alloc_raw_wr_mbuf(len);
 		if (m == NULL)
 			return (ENOMEM);
 		ulpmc = mtod(m, struct ulp_mem_io *);
 
 		/*
 		 * NOTE(review): the data length (chunk / 32) and the address
 		 * (ppod_addr >> 5) both look like 32-byte units -- confirm.
 		 */
 		INIT_ULPTX_WR(ulpmc, len, 0, toep->tid);
 		ulpmc->cmd = cmd;
 		ulpmc->dlen = htobe32(V_ULP_MEMIO_DATA_LEN(chunk / 32));
 		ulpmc->len16 = htobe32(howmany(len - sizeof(ulpmc->wr), 16));
 		ulpmc->lock_addr = htobe32(V_ULP_MEMIO_ADDR(ppod_addr >> 5));
 
 		/* Immediate data sub-command, followed by the pods. */
 		ulpsc = (struct ulptx_idata *)(ulpmc + 1);
 		ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM));
 		ulpsc->len = htobe32(chunk);
 
 		ppod = (struct pagepod *)(ulpsc + 1);
 		for (j = 0; j < n; i++, j++, ppod++) {
 			/* The page size bits of the tag are masked out here. */
 			ppod->vld_tid_pgsz_tag_color = htobe64(F_PPOD_VALID |
 			    V_PPOD_TID(toep->tid) |
 			    (prsv->prsv_tag & ~V_PPOD_PGSZ(M_PPOD_PGSZ)));
 			ppod->len_offset = htobe64(V_PPOD_LEN(xferlen) |
 			    V_PPOD_OFST(offset));
 			ppod->rsvd = 0;
 
 			for (k = 0; k < nitems(ppod->addr); k++) {
 				/* Zero fill once every entry has been used. */
 				if (entries != 0) {
 					pa = pmap_kextract(pva + sg_offset);
 					ppod->addr[k] = htobe64(pa);
 				} else
 					ppod->addr[k] = 0;
 
 #if 0
 				CTR5(KTR_CXGBE,
 				    "%s: tid %d ppod[%d]->addr[%d] = %p",
 				    __func__, toep->tid, i, k,
 				    be64toh(ppod->addr[k]));
 #endif
 
 				/*
 				 * If this is the last entry in a pod,
 				 * reuse the same entry for first address
 				 * in the next pod.
 				 */
 				if (k + 1 == nitems(ppod->addr))
 					break;
 
 				/*
 				 * Don't move to the next DDP page if the
 				 * sgl is already finished.
 				 */
 				if (entries == 0)
 					continue;
 
 				/* Step one DDP page further into this entry. */
 				sg_offset += ddp_pgsz;
 				if (sg_offset == sgl->len) {
 					/*
 					 * This sgl entry is done.  Go
 					 * to the next.
 					 */
 					entries--;
 					sgl++;
 					sg_offset = 0;
 					if (entries != 0)
 						pva = trunc_page(
 						    (vm_offset_t)sgl->addr);
 				}
 			}
 		}
 
 		mbufq_enqueue(wrq, m);
 	}
 
 	return (0);
 }
 
 /*
  * Get a pageset ready for DDP: reserve page pods if it has none yet and
  * write them if that has not been done.  Returns 1 when the pageset is
  * ready and 0 if either step failed.
  */
 static int
 prep_pageset(struct adapter *sc, struct toepcb *toep, struct pageset *ps)
 {
 	struct tom_data *td = sc->tom_softc;
 
 	if (ps->prsv.prsv_nppods == 0) {
 		if (t4_alloc_page_pods_for_ps(&td->pr, ps) != 0)
 			return (0);
 	}
 
 	if ((ps->flags & PS_PPODS_WRITTEN) == 0) {
 		if (t4_write_page_pods_for_ps(sc, toep->ctrlq, toep->tid,
 		    ps) != 0)
 			return (0);
 	}
 
 	return (1);
 }
 
 /*
  * Initialize page pod region 'pr' to cover range 'r'.  'psz' supplies
  * the four DDP page sizes and 'name' names the arena created for the
  * region.  Returns 0 on success, ENXIO if the page sizes are not
  * strictly increasing or the tag masks come out empty, and ENOMEM if
  * the arena cannot be created.
  */
 int
 t4_init_ppod_region(struct ppod_region *pr, struct t4_range *r, u_int psz,
     const char *name)
 {
 	int i;
 
 	MPASS(pr != NULL);
 	MPASS(r->size > 0);
 
 	pr->pr_start = r->start;
 	pr->pr_len = r->size;
 
 	/* Each HPZ field of psz is added to a base shift of 12. */
 	pr->pr_page_shift[0] = 12 + G_HPZ0(psz);
 	pr->pr_page_shift[1] = 12 + G_HPZ1(psz);
 	pr->pr_page_shift[2] = 12 + G_HPZ2(psz);
 	pr->pr_page_shift[3] = 12 + G_HPZ3(psz);
 
 	/* The SGL -> page pod algorithm requires the sizes to be in order. */
 	for (i = 0; i + 1 < nitems(pr->pr_page_shift); i++) {
 		if (pr->pr_page_shift[i + 1] <= pr->pr_page_shift[i])
 			return (ENXIO);
 	}
 
 	pr->pr_tag_mask = ((1 << fls(r->size)) - 1) & V_PPOD_TAG(M_PPOD_TAG);
 	pr->pr_alias_mask = V_PPOD_TAG(M_PPOD_TAG) & ~pr->pr_tag_mask;
 	if (pr->pr_tag_mask == 0)
 		return (ENXIO);
 	if (pr->pr_alias_mask == 0)
 		return (ENXIO);
 	pr->pr_alias_shift = fls(pr->pr_tag_mask);
 	pr->pr_invalid_bit = 1 << (pr->pr_alias_shift - 1);
 
 	pr->pr_arena = vmem_create(name, 0, pr->pr_len, PPOD_SIZE, 0,
 	    M_FIRSTFIT | M_NOWAIT);
 
 	return (pr->pr_arena != NULL ? 0 : ENOMEM);
 }
 
 /*
  * Tear down a page pod region: destroy its arena if one was created
  * and zero the whole structure.
  */
 void
 t4_free_ppod_region(struct ppod_region *pr)
 {
 
 	MPASS(pr != NULL);
 
 	if (pr->pr_arena != NULL)
 		vmem_destroy(pr->pr_arena);
 
 	bzero(pr, sizeof(*pr));
 }
 
 /*
  * Compare a pageset against the description of a buffer.  Returns 0
  * when the pageset matches and non-zero when it does not.
  */
 static int
 pscmp(struct pageset *ps, struct vmspace *vm, vm_offset_t start, int npages,
     int pgoff, int len)
 {
 
 	/* The buffer geometry has to be identical. */
 	if (ps->start != start)
 		return (1);
 	if (ps->npages != npages)
 		return (1);
 	if (ps->offset != pgoff)
 		return (1);
 	if (ps->len != len)
 		return (1);
 
 	/*
 	 * It must be the same vmspace, and the map timestamp must still
 	 * equal the one saved in the pageset.
 	 */
 	if (ps->vm != vm)
 		return (1);
 
 	return (ps->vm_timestamp != vm->vm_map.timestamp);
 }
 
 /*
  * Obtain a pageset that covers the user buffer of AIO request 'job'.
  *
  * A cached pageset is reused when one matches.  Otherwise a new one is
  * created by holding the user pages for writing, after evicting the
  * last cached pageset if the per-connection limit has been reached.
  * Requests larger than MAX_DDP_BUFFER_SIZE are truncated to a short
  * read by reducing job->uaiocb.aio_nbytes.
  *
  * The caller holds the DDP lock.  The lock is dropped while the new
  * pageset is allocated and its pages are held, and is held again on
  * return.  Returns 0 with the pageset stored in *pps, or EFAULT if the
  * pages cannot be held.
  */
 static int
 hold_aio(struct toepcb *toep, struct kaiocb *job, struct pageset **pps)
 {
 	struct vmspace *vm;
 	vm_map_t map;
 	vm_offset_t start, end, pgoff;
 	struct pageset *ps;
 	int n;
 
 	DDP_ASSERT_LOCKED(toep);
 
 	/*
 	 * The AIO subsystem will cancel and drain all requests before
 	 * permitting a process to exit or exec, so p_vmspace should
 	 * be stable here.
 	 */
 	vm = job->userproc->p_vmspace;
 	map = &vm->vm_map;
 	start = (uintptr_t)job->uaiocb.aio_buf;
 	pgoff = start & PAGE_MASK;
 	end = round_page(start + job->uaiocb.aio_nbytes);
 	start = trunc_page(start);
 
 	if (end - start > MAX_DDP_BUFFER_SIZE) {
 		/*
 		 * Truncate the request to a short read.
 		 * Alternatively, we could DDP in chunks to the larger
 		 * buffer, but that would be quite a bit more work.
 		 *
 		 * When truncating, round the request down to avoid
 		 * crossing a cache line on the final transaction.
 		 */
 		end = rounddown2(start + MAX_DDP_BUFFER_SIZE, CACHE_LINE_SIZE);
 #ifdef VERBOSE_TRACES
 		CTR4(KTR_CXGBE, "%s: tid %d, truncating size from %lu to %lu",
 		    __func__, toep->tid, (unsigned long)job->uaiocb.aio_nbytes,
 		    (unsigned long)(end - (start + pgoff)));
 #endif
 		/*
 		 * The length must shrink in every build, not only when
 		 * tracing is compiled in; otherwise the pageset length
 		 * would exceed the pages that are held below.
 		 */
 		job->uaiocb.aio_nbytes = end - (start + pgoff);
 		end = round_page(end);
 	}
 
 	n = atop(end - start);
 
 	/*
 	 * Try to reuse a cached pageset.
 	 */
 	TAILQ_FOREACH(ps, &toep->ddp.cached_pagesets, link) {
 		if (pscmp(ps, vm, start, n, pgoff,
 		    job->uaiocb.aio_nbytes) == 0) {
 			TAILQ_REMOVE(&toep->ddp.cached_pagesets, ps, link);
 			toep->ddp.cached_count--;
 			*pps = ps;
 			return (0);
 		}
 	}
 
 	/*
 	 * If there are too many cached pagesets to create a new one,
 	 * free a pageset before creating a new one.
 	 */
 	KASSERT(toep->ddp.active_count + toep->ddp.cached_count <=
 	    nitems(toep->ddp.db), ("%s: too many wired pagesets", __func__));
 	if (toep->ddp.active_count + toep->ddp.cached_count ==
 	    nitems(toep->ddp.db)) {
 		KASSERT(toep->ddp.cached_count > 0,
 		    ("no cached pageset to free"));
 		ps = TAILQ_LAST(&toep->ddp.cached_pagesets, pagesetq);
 		TAILQ_REMOVE(&toep->ddp.cached_pagesets, ps, link);
 		toep->ddp.cached_count--;
 		free_pageset(toep->td, ps);
 	}
 	/* The allocation below may sleep (M_WAITOK), so drop the lock. */
 	DDP_UNLOCK(toep);
 
 	/* Create a new pageset. */
 	ps = malloc(sizeof(*ps) + n * sizeof(vm_page_t), M_CXGBE, M_WAITOK |
 	    M_ZERO);
 	/* The page array lives directly behind the pageset header. */
 	ps->pages = (vm_page_t *)(ps + 1);
 	ps->vm_timestamp = map->timestamp;
 	ps->npages = vm_fault_quick_hold_pages(map, start, end - start,
 	    VM_PROT_WRITE, ps->pages, n);
 
 	DDP_LOCK(toep);
 	if (ps->npages < 0) {
 		free(ps, M_CXGBE);
 		return (EFAULT);
 	}
 
 	KASSERT(ps->npages == n, ("hold_aio: page count mismatch: %d vs %d",
 	    ps->npages, n));
 
 	ps->offset = pgoff;
 	ps->len = job->uaiocb.aio_nbytes;
 	/* The pageset keeps its own reference on the vmspace. */
 	refcount_acquire(&vm->vm_refcnt);
 	ps->vm = vm;
 	ps->start = start;
 
 	CTR5(KTR_CXGBE, "%s: tid %d, new pageset %p for job %p, npages %d",
 	    __func__, toep->tid, ps, job, ps->npages);
 	*pps = ps;
 	return (0);
 }
 
 static void
 ddp_complete_all(struct toepcb *toep, int error)
 {
 	struct kaiocb *job;
 
 	DDP_ASSERT_LOCKED(toep);
 	while (!TAILQ_EMPTY(&toep->ddp.aiojobq)) {
 		job = TAILQ_FIRST(&toep->ddp.aiojobq);
 		TAILQ_REMOVE(&toep->ddp.aiojobq, job, list);
 		toep->ddp.waiting_count--;
 		if (aio_clear_cancel_function(job))
 			ddp_complete_one(job, error);
 	}
 }
 
 /*
  * Finish a cancelled job.  A job that has already received data is
  * reported as a short read instead of an error; a job with no data is
  * cancelled outright.
  */
 static void
 aio_ddp_cancel_one(struct kaiocb *job)
 {
 	long received = job->aio_received;
 
 	if (received == 0) {
 		aio_cancel(job);
 		return;
 	}
 
 	aio_complete(job, received, 0);
 }
 
 /*
  * Put a job back at the head of the queue so that the main loop can
  * retry it later.  If DDP is dead for this connection, or the job was
  * cancelled while it was being examined, the job is finished through
  * aio_ddp_cancel_one() instead.  The caller holds the DDP lock.
  */
 static void
 aio_ddp_requeue_one(struct toepcb *toep, struct kaiocb *job)
 {
 
 	DDP_ASSERT_LOCKED(toep);
 
 	if ((toep->ddp.flags & DDP_DEAD) ||
 	    !aio_set_cancel_function(job, t4_aio_cancel_queued)) {
 		aio_ddp_cancel_one(job);
 		return;
 	}
 
 	TAILQ_INSERT_HEAD(&toep->ddp.aiojobq, job, list);
 	toep->ddp.waiting_count++;
 }
 
 /*
  * Main scheduling loop for AIO read jobs waiting on a DDP connection.
  *
  * Must be called with the DDP lock held.  Each pass (label 'restart')
  * looks at the job at the head of toep->ddp.aiojobq and either completes
  * it, leaves it queued, or posts it to the card as a DDP buffer:
  *
  *  - Returns at once if the toep is marked DDP_DEAD, if no jobs are
  *    waiting, or if every slot in toep->ddp.db is already active.
  *  - Completes every waiting job with ENOTCONN if the socket is neither
  *    connected nor disconnected.
  *  - With an empty socket buffer, reports a pending socket error (or a
  *    short read if the job already received data).
  *  - With an empty socket buffer and the receive side shut down,
  *    completes the waiting jobs with 0 once no DDP buffers are active.
  *  - With an empty socket buffer and DDP off, requests that DDP be
  *    enabled (unless a request is already outstanding) and returns.
  *  - Otherwise holds the job's pages via hold_aio(), copies any data
  *    already in the socket buffer into them, and, if the request is
  *    still not satisfied, posts the pageset to a free DDP buffer slot.
  *
  * toep->ddp.queueing is set to the job being prepared and reset to NULL
  * on every exit from that stage; a non-NULL value makes this function
  * return early so that only one job is prepared at a time.
  */
 static void
 aio_ddp_requeue(struct toepcb *toep)
 {
 	struct adapter *sc = td_adapter(toep->td);
 	struct socket *so;
 	struct sockbuf *sb;
 	struct inpcb *inp;
 	struct kaiocb *job;
 	struct ddp_buffer *db;
 	size_t copied, offset, resid;
 	struct pageset *ps;
 	struct mbuf *m;
 	uint64_t ddp_flags, ddp_flags_mask;
 	struct wrqe *wr;
 	int buf_flag, db_idx, error;
 
 	DDP_ASSERT_LOCKED(toep);
 
 restart:
 	if (toep->ddp.flags & DDP_DEAD) {
 		MPASS(toep->ddp.waiting_count == 0);
 		MPASS(toep->ddp.active_count == 0);
 		return;
 	}
 
 	if (toep->ddp.waiting_count == 0 ||
 	    toep->ddp.active_count == nitems(toep->ddp.db)) {
 		return;
 	}
 
 	job = TAILQ_FIRST(&toep->ddp.aiojobq);
 	so = job->fd_file->f_data;
 	sb = &so->so_rcv;
 	SOCKBUF_LOCK(sb);
 
 	/* We will never get anything unless we are or were connected. */
 	if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) {
 		SOCKBUF_UNLOCK(sb);
 		ddp_complete_all(toep, ENOTCONN);
 		return;
 	}
 
 	KASSERT(toep->ddp.active_count == 0 || sbavail(sb) == 0,
 	    ("%s: pending sockbuf data and DDP is active", __func__));
 
 	/* Abort if socket has reported problems. */
 	/* XXX: Wait for any queued DDP's to finish and/or flush them? */
 	if (so->so_error && sbavail(sb) == 0) {
 		toep->ddp.waiting_count--;
 		TAILQ_REMOVE(&toep->ddp.aiojobq, job, list);
 		if (!aio_clear_cancel_function(job)) {
 			SOCKBUF_UNLOCK(sb);
 			goto restart;
 		}
 
 		/*
 		 * If this job has previously copied some data, report
 		 * a short read and leave the error to be reported by
 		 * a future request.
 		 */
 		copied = job->aio_received;
 		if (copied != 0) {
 			SOCKBUF_UNLOCK(sb);
 			aio_complete(job, copied, 0);
 			goto restart;
 		}
 		error = so->so_error;
 		so->so_error = 0;
 		SOCKBUF_UNLOCK(sb);
 		aio_complete(job, -1, error);
 		goto restart;
 	}
 
 	/*
 	 * Door is closed.  If there is pending data in the socket buffer,
 	 * deliver it.  If there are pending DDP requests, wait for those
 	 * to complete.  Once they have completed, return EOF reads.
 	 */
 	if (sb->sb_state & SBS_CANTRCVMORE && sbavail(sb) == 0) {
 		SOCKBUF_UNLOCK(sb);
 		if (toep->ddp.active_count != 0)
 			return;
 		ddp_complete_all(toep, 0);
 		return;
 	}
 
 	/*
 	 * If DDP is not enabled and there is no pending socket buffer
 	 * data, try to enable DDP.
 	 */
 	if (sbavail(sb) == 0 && (toep->ddp.flags & DDP_ON) == 0) {
 		SOCKBUF_UNLOCK(sb);
 
 		/*
 		 * Wait for the card to ACK that DDP is enabled before
 		 * queueing any buffers.  Currently this waits for an
 		 * indicate to arrive.  This could use a TCB_SET_FIELD_RPL
 		 * message to know that DDP was enabled instead of waiting
 		 * for the indicate which would avoid copying the indicate
 		 * if no data is pending.
 		 *
 		 * XXX: Might want to limit the indicate size to the size
 		 * of the first queued request.
 		 */
 		if ((toep->ddp.flags & DDP_SC_REQ) == 0)
 			enable_ddp(sc, toep);
 		return;
 	}
 	SOCKBUF_UNLOCK(sb);
 
 	/*
 	 * If another thread is queueing a buffer for DDP, let it
 	 * drain any work and return.
 	 */
 	if (toep->ddp.queueing != NULL)
 		return;
 
 	/* Take the next job to prep it for DDP. */
 	toep->ddp.waiting_count--;
 	TAILQ_REMOVE(&toep->ddp.aiojobq, job, list);
 	if (!aio_clear_cancel_function(job))
 		goto restart;
 	toep->ddp.queueing = job;
 
 	/* NB: This drops DDP_LOCK while it holds the backing VM pages. */
 	error = hold_aio(toep, job, &ps);
 	if (error != 0) {
 		ddp_complete_one(job, error);
 		toep->ddp.queueing = NULL;
 		goto restart;
 	}
 
 	/*
 	 * Re-check the socket error and shutdown state under the socket
 	 * buffer lock now that hold_aio() has returned.
 	 */
 	SOCKBUF_LOCK(sb);
 	if (so->so_error && sbavail(sb) == 0) {
 		copied = job->aio_received;
 		if (copied != 0) {
 			SOCKBUF_UNLOCK(sb);
 			recycle_pageset(toep, ps);
 			aio_complete(job, copied, 0);
 			toep->ddp.queueing = NULL;
 			goto restart;
 		}
 
 		error = so->so_error;
 		so->so_error = 0;
 		SOCKBUF_UNLOCK(sb);
 		recycle_pageset(toep, ps);
 		aio_complete(job, -1, error);
 		toep->ddp.queueing = NULL;
 		goto restart;
 	}
 
 	if (sb->sb_state & SBS_CANTRCVMORE && sbavail(sb) == 0) {
 		SOCKBUF_UNLOCK(sb);
 		recycle_pageset(toep, ps);
 		if (toep->ddp.active_count != 0) {
 			/*
 			 * The door is closed, but there are still pending
 			 * DDP buffers.  Requeue.  These jobs will all be
 			 * completed once those buffers drain.
 			 */
 			aio_ddp_requeue_one(toep, job);
 			toep->ddp.queueing = NULL;
 			return;
 		}
 		ddp_complete_one(job, 0);
 		ddp_complete_all(toep, 0);
 		toep->ddp.queueing = NULL;
 		return;
 	}
 
 sbcopy:
 	/*
 	 * If the toep is dead, there shouldn't be any data in the socket
 	 * buffer, so the above case should have handled this.
 	 */
 	MPASS(!(toep->ddp.flags & DDP_DEAD));
 
 	/*
 	 * If there is pending data in the socket buffer (either
 	 * from before the requests were queued or a DDP indicate),
 	 * copy those mbufs out directly.
 	 */
 	copied = 0;
 	offset = ps->offset + job->aio_received;
 	MPASS(job->aio_received <= job->uaiocb.aio_nbytes);
 	resid = job->uaiocb.aio_nbytes - job->aio_received;
 	m = sb->sb_mb;
 	KASSERT(m == NULL || toep->ddp.active_count == 0,
 	    ("%s: sockbuf data with active DDP", __func__));
 	while (m != NULL && resid > 0) {
 		struct iovec iov[1];
 		struct uio uio;
 #ifdef INVARIANTS
 		/*
 		 * Shadows the function-level 'error'; it exists only so
 		 * the MPASS below can check the copy result.
 		 */
 		int error;
 #endif
 
 		/* Describe at most 'resid' bytes of this mbuf's data. */
 		iov[0].iov_base = mtod(m, void *);
 		iov[0].iov_len = m->m_len;
 		if (iov[0].iov_len > resid)
 			iov[0].iov_len = resid;
 		uio.uio_iov = iov;
 		uio.uio_iovcnt = 1;
 		uio.uio_offset = 0;
 		uio.uio_resid = iov[0].iov_len;
 		uio.uio_segflg = UIO_SYSSPACE;
 		uio.uio_rw = UIO_WRITE;
 #ifdef INVARIANTS
 		error = uiomove_fromphys(ps->pages, offset + copied,
 		    uio.uio_resid, &uio);
 #else
 		uiomove_fromphys(ps->pages, offset + copied, uio.uio_resid, &uio);
 #endif
 		MPASS(error == 0 && uio.uio_resid == 0);
 		copied += uio.uio_offset;
 		resid -= uio.uio_offset;
 		m = m->m_next;
 	}
 	if (copied != 0) {
 		sbdrop_locked(sb, copied);
 		job->aio_received += copied;
 		job->msgrcv = 1;
 		/* From here on 'copied' is the job's running total. */
 		copied = job->aio_received;
 		inp = sotoinpcb(so);
 		if (!INP_TRY_WLOCK(inp)) {
 			/*
 			 * The reference on the socket file descriptor in
 			 * the AIO job should keep 'sb' and 'inp' stable.
 			 * Our caller has a reference on the 'toep' that
 			 * keeps it stable.
 			 */
 			SOCKBUF_UNLOCK(sb);
 			DDP_UNLOCK(toep);
 			INP_WLOCK(inp);
 			DDP_LOCK(toep);
 			SOCKBUF_LOCK(sb);
 
 			/*
 			 * If the socket has been closed, we should detect
 			 * that and complete this request if needed on
 			 * the next trip around the loop.
 			 */
 		}
 		t4_rcvd_locked(&toep->td->tod, intotcpcb(inp));
 		INP_WUNLOCK(inp);
 		if (resid == 0 || toep->ddp.flags & DDP_DEAD) {
 			/*
 			 * We filled the entire buffer with socket
 			 * data, DDP is not being used, or the socket
 			 * is being shut down, so complete the
 			 * request.
 			 */
 			SOCKBUF_UNLOCK(sb);
 			recycle_pageset(toep, ps);
 			aio_complete(job, copied, 0);
 			toep->ddp.queueing = NULL;
 			goto restart;
 		}
 
 		/*
 		 * If DDP is not enabled, requeue this request and restart.
 		 * This will either enable DDP or wait for more data to
 		 * arrive on the socket buffer.
 		 */
 		if ((toep->ddp.flags & (DDP_ON | DDP_SC_REQ)) != DDP_ON) {
 			SOCKBUF_UNLOCK(sb);
 			recycle_pageset(toep, ps);
 			aio_ddp_requeue_one(toep, job);
 			toep->ddp.queueing = NULL;
 			goto restart;
 		}
 
 		/*
 		 * An indicate might have arrived and been added to
 		 * the socket buffer while it was unlocked after the
 		 * copy to lock the INP.  If so, restart the copy.
 		 */
 		if (sbavail(sb) != 0)
 			goto sbcopy;
 	}
 	SOCKBUF_UNLOCK(sb);
 
 	if (prep_pageset(sc, toep, ps) == 0) {
 		recycle_pageset(toep, ps);
 		aio_ddp_requeue_one(toep, job);
 		toep->ddp.queueing = NULL;
 
 		/*
 		 * XXX: Need to retry this later.  Mostly need a trigger
 		 * when page pods are freed up.
 		 */
 		printf("%s: prep_pageset failed\n", __func__);
 		return;
 	}
 
 	/* Determine which DDP buffer to use. */
 	if (toep->ddp.db[0].job == NULL) {
 		db_idx = 0;
 	} else {
 		MPASS(toep->ddp.db[1].job == NULL);
 		db_idx = 1;
 	}
 
 	/* Build the flag values and mask for the chosen buffer slot. */
 	ddp_flags = 0;
 	ddp_flags_mask = 0;
 	if (db_idx == 0) {
 		ddp_flags |= V_TF_DDP_BUF0_VALID(1);
 		if (so->so_state & SS_NBIO)
 			ddp_flags |= V_TF_DDP_BUF0_FLUSH(1);
 		ddp_flags_mask |= V_TF_DDP_PSH_NO_INVALIDATE0(1) |
 		    V_TF_DDP_PUSH_DISABLE_0(1) | V_TF_DDP_PSHF_ENABLE_0(1) |
 		    V_TF_DDP_BUF0_FLUSH(1) | V_TF_DDP_BUF0_VALID(1);
 		buf_flag = DDP_BUF0_ACTIVE;
 	} else {
 		ddp_flags |= V_TF_DDP_BUF1_VALID(1);
 		if (so->so_state & SS_NBIO)
 			ddp_flags |= V_TF_DDP_BUF1_FLUSH(1);
 		ddp_flags_mask |= V_TF_DDP_PSH_NO_INVALIDATE1(1) |
 		    V_TF_DDP_PUSH_DISABLE_1(1) | V_TF_DDP_PSHF_ENABLE_1(1) |
 		    V_TF_DDP_BUF1_FLUSH(1) | V_TF_DDP_BUF1_VALID(1);
 		buf_flag = DDP_BUF1_ACTIVE;
 	}
 	MPASS((toep->ddp.flags & buf_flag) == 0);
 	if ((toep->ddp.flags & (DDP_BUF0_ACTIVE | DDP_BUF1_ACTIVE)) == 0) {
 		MPASS(db_idx == 0);
 		MPASS(toep->ddp.active_id == -1);
 		MPASS(toep->ddp.active_count == 0);
 		ddp_flags_mask |= V_TF_DDP_ACTIVE_BUF(1);
 	}
 
 	/*
 	 * The TID for this connection should still be valid.  If DDP_DEAD
 	 * is set, SBS_CANTRCVMORE should be set, so we shouldn't be
 	 * this far anyway.  Even if the socket is closing on the other
 	 * end, the AIO job holds a reference on this end of the socket
 	 * which will keep it open and keep the TCP PCB attached until
 	 * after the job is completed.
 	 */
 	wr = mk_update_tcb_for_ddp(sc, toep, db_idx, ps, job->aio_received,
 	    ddp_flags, ddp_flags_mask);
 	if (wr == NULL) {
 		recycle_pageset(toep, ps);
 		aio_ddp_requeue_one(toep, job);
 		toep->ddp.queueing = NULL;
 
 		/*
 		 * XXX: Need a way to kick a retry here.
 		 *
 		 * XXX: We know the fixed size needed and could
 		 * preallocate this using a blocking request at the
 		 * start of the task to avoid having to handle this
 		 * edge case.
 		 */
 		printf("%s: mk_update_tcb_for_ddp failed\n", __func__);
 		return;
 	}
 
 	/*
 	 * If the cancel function cannot be installed, undo the setup and
 	 * finish the job via aio_ddp_cancel_one().
 	 */
 	if (!aio_set_cancel_function(job, t4_aio_cancel_active)) {
 		free_wrqe(wr);
 		recycle_pageset(toep, ps);
 		aio_ddp_cancel_one(job);
 		toep->ddp.queueing = NULL;
 		goto restart;
 	}
 
 #ifdef VERBOSE_TRACES
 	CTR6(KTR_CXGBE,
 	    "%s: tid %u, scheduling %p for DDP[%d] (flags %#lx/%#lx)", __func__,
 	    toep->tid, job, db_idx, ddp_flags, ddp_flags_mask);
 #endif
 	/* Give the chip the go-ahead. */
 	t4_wrq_tx(sc, wr);
 	db = &toep->ddp.db[db_idx];
 	db->cancel_pending = 0;
 	db->job = job;
 	db->ps = ps;
 	toep->ddp.queueing = NULL;
 	toep->ddp.flags |= buf_flag;
 	toep->ddp.active_count++;
 	if (toep->ddp.active_count == 1) {
 		MPASS(toep->ddp.active_id == -1);
 		toep->ddp.active_id = db_idx;
 		CTR2(KTR_CXGBE, "%s: ddp_active_id = %d", __func__,
 		    toep->ddp.active_id);
 	}
 	goto restart;
 }
 
 /*
  * Schedule the requeue task for this toep unless it is already marked
  * active.  A toepcb hold is taken before the task is enqueued.  The
  * DDP lock must be held.
  */
 void
 ddp_queue_toep(struct toepcb *toep)
 {
 
 	DDP_ASSERT_LOCKED(toep);
 	if ((toep->ddp.flags & DDP_TASK_ACTIVE) == 0) {
 		toep->ddp.flags |= DDP_TASK_ACTIVE;
 		hold_toepcb(toep);
 		soaio_enqueue(&toep->ddp.requeue_task);
 	}
 }
 
 /*
  * Task handler: run the requeue loop under the DDP lock, clear
  * DDP_TASK_ACTIVE, and then release a toepcb reference (presumably the
  * one taken by ddp_queue_toep() -- confirm against the TASK_INIT site).
  * 'pending' is unused.
  */
 static void
 aio_ddp_requeue_task(void *context, int pending)
 {
 	struct toepcb *toep;
 
 	toep = context;
 
 	DDP_LOCK(toep);
 	aio_ddp_requeue(toep);
 	toep->ddp.flags &= ~DDP_TASK_ACTIVE;
 	DDP_UNLOCK(toep);
 
 	free_toepcb(toep);
 }
 
 static void
 t4_aio_cancel_active(struct kaiocb *job)
 {
 	struct socket *so = job->fd_file->f_data;
 	struct tcpcb *tp = so_sototcpcb(so);
 	struct toepcb *toep = tp->t_toe;
 	struct adapter *sc = td_adapter(toep->td);
 	uint64_t valid_flag;
 	int i;
 
 	DDP_LOCK(toep);
 	if (aio_cancel_cleared(job)) {
 		DDP_UNLOCK(toep);
 		aio_ddp_cancel_one(job);
 		return;
 	}
 
 	for (i = 0; i < nitems(toep->ddp.db); i++) {
 		if (toep->ddp.db[i].job == job) {
 			/* Should only ever get one cancel request for a job. */
 			MPASS(toep->ddp.db[i].cancel_pending == 0);
 
 			/*
 			 * Invalidate this buffer.  It will be
 			 * cancelled or partially completed once the
 			 * card ACKs the invalidate.
 			 */
 			valid_flag = i == 0 ? V_TF_DDP_BUF0_VALID(1) :
 			    V_TF_DDP_BUF1_VALID(1);
 			t4_set_tcb_field(sc, toep->ctrlq, toep,
 			    W_TCB_RX_DDP_FLAGS, valid_flag, 0, 1,
 			    CPL_COOKIE_DDP0 + i);
 			toep->ddp.db[i].cancel_pending = 1;
 			CTR2(KTR_CXGBE, "%s: request %p marked pending",
 			    __func__, job);
 			break;
 		}
 	}
 	DDP_UNLOCK(toep);
 }
 
 /*
  * Cancel routine installed on jobs sitting in the waiting queue.
  * Unless the cancel was already cleared, the job is unlinked from the
  * queue; when that leaves the queue empty the requeue task is
  * scheduled.  The job is then finished via aio_ddp_cancel_one().
  */
 static void
 t4_aio_cancel_queued(struct kaiocb *job)
 {
 	struct socket *so;
 	struct tcpcb *tp;
 	struct toepcb *toep;
 
 	so = job->fd_file->f_data;
 	tp = so_sototcpcb(so);
 	toep = tp->t_toe;
 
 	DDP_LOCK(toep);
 	if (!aio_cancel_cleared(job)) {
 		TAILQ_REMOVE(&toep->ddp.aiojobq, job, list);
 		if (--toep->ddp.waiting_count == 0)
 			ddp_queue_toep(toep);
 	}
 	CTR2(KTR_CXGBE, "%s: request %p cancelled", __func__, job);
 	DDP_UNLOCK(toep);
 
 	aio_ddp_cancel_one(job);
 }
 
 /*
  * Queue an AIO read job on the connection's DDP waiting list and try to
  * service it right away.  Returns EOPNOTSUPP for anything other than
  * LIO_READ and 0 once the job has been queued.
  */
 int
 t4_aio_queue_ddp(struct socket *so, struct kaiocb *job)
 {
 	struct tcpcb *tp;
 	struct toepcb *toep;
 
 	tp = so_sototcpcb(so);
 	toep = tp->t_toe;
 
 	/* Only read requests are handled here. */
 	if (job->uaiocb.aio_lio_opcode != LIO_READ)
 		return (EOPNOTSUPP);
 
 	DDP_LOCK(toep);
 
 	/*
 	 * XXX: Think about possibly returning errors for ENOTCONN,
 	 * etc.  Perhaps the caller would only queue the request
 	 * if it failed with EOPNOTSUPP?
 	 */
 
 #ifdef VERBOSE_TRACES
 	CTR3(KTR_CXGBE, "%s: queueing %p for tid %u", __func__, job, toep->tid);
 #endif
 	if (!aio_set_cancel_function(job, t4_aio_cancel_queued))
 		panic("new job was cancelled");
 	TAILQ_INSERT_TAIL(&toep->ddp.aiojobq, job, list);
 	toep->ddp.waiting_count += 1;
 	toep->ddp.flags |= DDP_OK;
 
 	/*
 	 * Attempt to service the request synchronously.  If that would
 	 * have to block because the task is running, the call simply
 	 * bails and the task picks the job up instead.
 	 */
 	aio_ddp_requeue(toep);
 	DDP_UNLOCK(toep);
 
 	return (0);
 }
 
 /*
  * Module-load hook for the DDP code.  Registers do_ddp_tcb_rpl as the
  * shared CPL_SET_TCB_RPL handler for both DDP cookies, registers the
  * CPL_RX_DATA_DDP and CPL_RX_DDP_COMPLETE handlers, and initializes the
  * orphan pageset list, its mutex, and the task that runs
  * ddp_free_orphan_pagesets.
  */
 void
 t4_ddp_mod_load(void)
 {
 
 	t4_register_shared_cpl_handler(CPL_SET_TCB_RPL, do_ddp_tcb_rpl,
 	    CPL_COOKIE_DDP0);
 	t4_register_shared_cpl_handler(CPL_SET_TCB_RPL, do_ddp_tcb_rpl,
 	    CPL_COOKIE_DDP1);
 	t4_register_cpl_handler(CPL_RX_DATA_DDP, do_rx_data_ddp);
 	t4_register_cpl_handler(CPL_RX_DDP_COMPLETE, do_rx_ddp_complete);
 	TAILQ_INIT(&ddp_orphan_pagesets);
 	mtx_init(&ddp_orphan_pagesets_lock, "ddp orphans", NULL, MTX_DEF);
 	TASK_INIT(&ddp_orphan_task, 0, ddp_free_orphan_pagesets, NULL);
 }
 
 /*
  * Module-unload hook for the DDP code.  Drains the orphan pageset task,
  * asserts that no orphaned pagesets remain, destroys the list mutex, and
  * unregisters the CPL handlers installed by t4_ddp_mod_load() by
  * passing NULL handlers.
  */
 void
 t4_ddp_mod_unload(void)
 {
 
 	/* Let any queued orphan-freeing work finish before tearing down. */
 	taskqueue_drain(taskqueue_thread, &ddp_orphan_task);
 	MPASS(TAILQ_EMPTY(&ddp_orphan_pagesets));
 	mtx_destroy(&ddp_orphan_pagesets_lock);
 	t4_register_shared_cpl_handler(CPL_SET_TCB_RPL, NULL, CPL_COOKIE_DDP0);
 	t4_register_shared_cpl_handler(CPL_SET_TCB_RPL, NULL, CPL_COOKIE_DDP1);
 	t4_register_cpl_handler(CPL_RX_DATA_DDP, NULL);
 	t4_register_cpl_handler(CPL_RX_DDP_COMPLETE, NULL);
 }
 #endif
diff --git a/sys/dev/cxgbe/tom/t4_tls.c b/sys/dev/cxgbe/tom/t4_tls.c
index 447943b5b9db..9f9d4a48de93 100644
--- a/sys/dev/cxgbe/tom/t4_tls.c
+++ b/sys/dev/cxgbe/tom/t4_tls.c
@@ -1,1242 +1,1242 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2017-2018 Chelsio Communications, Inc.
  * All rights reserved.
  * Written by: John Baldwin <jhb@FreeBSD.org>
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include "opt_inet.h"
 #include "opt_kern_tls.h"
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #ifdef KERN_TLS
 #include <sys/param.h>
 #include <sys/ktr.h>
 #include <sys/ktls.h>
 #include <sys/sglist.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/systm.h>
 #include <netinet/in.h>
 #include <netinet/in_pcb.h>
 #include <netinet/tcp_var.h>
 #include <netinet/toecore.h>
 #include <opencrypto/cryptodev.h>
 #include <opencrypto/xform.h>
 
 #ifdef TCP_OFFLOAD
 #include "common/common.h"
 #include "common/t4_tcb.h"
 #include "crypto/t4_crypto.h"
 #include "tom/t4_tom_l2t.h"
 #include "tom/t4_tom.h"
 
 /*
  * The TCP sequence number of a CPL_TLS_DATA mbuf is saved here while
  * the mbuf is in the ulp_pdu_reclaimq.
  */
 #define	tls_tcp_seq	PH_loc.thirtytwo[0]
 
 /*
  * Convenience wrapper around t4_set_tcb_field() for TLS: the update is
  * sent on the connection's offload TX queue with the last two arguments
  * fixed at 0.
  */
 static void
 t4_set_tls_tcb_field(struct toepcb *toep, uint16_t word, uint64_t mask,
     uint64_t val)
 {
 
 	t4_set_tcb_field(td_adapter(toep->td), &toep->ofld_txq->wrq, toep,
 	    word, mask, val, 0, 0);
 }
 
 /* TLS and DTLS common routines */
 bool
 can_tls_offload(struct adapter *sc)
 {
 
 	return (sc->tt.tls && sc->cryptocaps & FW_CAPS_CONFIG_TLSKEYS);
 }
 
 /*
  * Returns non-zero when a TX key address has been recorded for this
  * connection (tx_key_addr is not negative).
  */
 int
 tls_tx_key(struct toepcb *toep)
 {
 
 	return (toep->tls.tx_key_addr >= 0);
 }
 
 /* Set TLS Key-Id in TCB */
 static void
 t4_set_tls_keyid(struct toepcb *toep, unsigned int key_id)
 {
 
 	t4_set_tls_tcb_field(toep, W_TCB_RX_TLS_KEY_TAG,
 			 V_TCB_RX_TLS_KEY_TAG(M_TCB_RX_TLS_BUF_TAG),
 			 V_TCB_RX_TLS_KEY_TAG(key_id));
 }
 
 /* Clear TF_RX_QUIESCE to re-enable receive. */
 static void
 t4_clear_rx_quiesce(struct toepcb *toep)
 {
 
 	t4_set_tls_tcb_field(toep, W_TCB_T_FLAGS, V_TF_RX_QUIESCE(1), 0);
 }
 
 /*
  * Drop a connection from TLS offload back to plain TOE mode: stop the
  * handshake timer, clear TLS_ENABLE and set the ULP type to none in the
  * TCB, re-enable receive, and update the software flags and ULP mode.
  * The connection must not have an RX key.
  */
 static void
 tls_clr_ofld_mode(struct toepcb *toep)
 {
 
 	tls_stop_handshake_timer(toep);
 
 	KASSERT(toep->tls.rx_key_addr == -1,
 	    ("%s: tid %d has RX key", __func__, toep->tid));
 
 	/* Tell the hardware to treat this as a plain TOE connection. */
 	t4_set_tls_tcb_field(toep, W_TCB_ULP_RAW,
 	    V_TCB_ULP_RAW(V_TF_TLS_ENABLE(1)),
 	    V_TCB_ULP_RAW(V_TF_TLS_ENABLE(0)));
 	t4_set_tls_tcb_field(toep, W_TCB_ULP_TYPE,
 	    V_TCB_ULP_TYPE(M_TCB_ULP_TYPE),
 	    V_TCB_ULP_TYPE(ULP_MODE_NONE));
 	t4_clear_rx_quiesce(toep);
 
 	/* Bring the software state in line with the TCB. */
 	toep->flags &= ~TPF_FORCE_CREDITS;
 	toep->flags &= ~TPF_TLS_ESTABLISHED;
 	toep->params.ulp_mode = ULP_MODE_NONE;
 }
 
 /*
  * Translate a TLS/DTLS record content type into the corresponding
  * CPL SFO type code.  Unrecognized content types map to the CUSTOM code.
  */
 static inline unsigned char
 tls_content_type(unsigned char content_type)
 {
 
 	if (content_type == CONTENT_TYPE_CCS)
 		return (CPL_TX_TLS_SFO_TYPE_CCS);
 	if (content_type == CONTENT_TYPE_ALERT)
 		return (CPL_TX_TLS_SFO_TYPE_ALERT);
 	if (content_type == CONTENT_TYPE_HANDSHAKE)
 		return (CPL_TX_TLS_SFO_TYPE_HANDSHAKE);
 	if (content_type == CONTENT_TYPE_APP_DATA)
 		return (CPL_TX_TLS_SFO_TYPE_DATA);
 	return (CPL_TX_TLS_SFO_TYPE_CUSTOM);
 }
 
 /* TLS Key memory management */
 /*
  * Release any RX and TX key ids held by the connection and reset the
  * stored addresses to -1.
  */
 static void
 clear_tls_keyid(struct toepcb *toep)
 {
 	struct adapter *sc = td_adapter(toep->td);
 
 	if (toep->tls.rx_key_addr >= 0) {
 		t4_free_tls_keyid(sc, toep->tls.rx_key_addr);
 		toep->tls.rx_key_addr = -1;
 	}
 
 	if (toep->tls.tx_key_addr >= 0) {
 		t4_free_tls_keyid(sc, toep->tls.tx_key_addr);
 		toep->tls.tx_key_addr = -1;
 	}
 }
 
 /*
  * Pick the maximum payload length for the session.  Sessions whose
  * maximum frame length is at most 8192 get the smaller of 3*4096 and
  * TP_TX_PG_SZ rounded down to a multiple of 1448; larger frame lengths
  * get FC_TP_PLEN_MAX.
  */
 static int
 get_tp_plen_max(struct ktls_session *tls)
 {
 	int plen = ((min(3*4096, TP_TX_PG_SZ))/1448) * 1448;
 
 	if (tls->params.max_frame_len <= 8192)
 		return (plen);
 	return (FC_TP_PLEN_MAX);
 }
 
 /*
  * Allocate a key id and send a work request that writes the session key
  * context for the given direction.
  *
  * Returns 0 on success, EAGAIN when no TX descriptor is available,
  * ENOSPC when no key id could be allocated, and ENOMEM when the work
  * request could not be allocated (the key id is freed in that case).
  * On success the key id is recorded in toep->tls as the TX or RX key
  * address and one TX descriptor is consumed to account for the credits.
  */
 static int
 tls_program_key_id(struct toepcb *toep, struct ktls_session *tls,
     int direction)
 {
 	struct adapter *sc = td_adapter(toep->td);
 	struct ofld_tx_sdesc *txsd;
 	struct tls_key_req *kwr;
 	struct tls_keyctx *kctx;
 	struct wrqe *wr;
 	int keyid;
 
 #ifdef INVARIANTS
 	int kwrlen, kctxlen, len;
 
 	/* Check that TLS_KEY_WR_SZ matches the layout written below. */
 	kwrlen = sizeof(*kwr);
 	kctxlen = roundup2(sizeof(*kctx), 32);
 	len = roundup2(kwrlen + kctxlen, 16);
 	MPASS(TLS_KEY_WR_SZ == len);
 #endif
 	if (toep->txsd_avail == 0)
 		return (EAGAIN);
 
 	keyid = t4_alloc_tls_keyid(sc);
 	if (keyid < 0)
 		return (ENOSPC);
 
 	wr = alloc_wrqe(TLS_KEY_WR_SZ, &toep->ofld_txq->wrq);
 	if (wr == NULL) {
 		t4_free_tls_keyid(sc, keyid);
 		return (ENOMEM);
 	}
 	kwr = wrtod(wr);
 	memset(kwr, 0, TLS_KEY_WR_SZ);
 
 	t4_write_tlskey_wr(tls, direction, toep->tid, F_FW_WR_COMPL, keyid,
 	    kwr);
 
 	/* The key context follows the request header directly. */
 	kctx = (struct tls_keyctx *)(kwr + 1);
 	if (direction == KTLS_TX)
 		toep->tls.tx_key_addr = keyid;
 	else
 		toep->tls.rx_key_addr = keyid;
 	t4_tls_key_ctx(tls, direction, kctx);
 
 	/* Charge the credits for this request to the next TX descriptor. */
 	txsd = &toep->txsd[toep->txsd_pidx];
 	txsd->tx_credits = DIV_ROUND_UP(TLS_KEY_WR_SZ, 16);
 	txsd->plen = 0;
 	toep->tx_credits -= txsd->tx_credits;
 	toep->txsd_pidx++;
 	if (__predict_false(toep->txsd_pidx == toep->txsd_total))
 		toep->txsd_pidx = 0;
 	toep->txsd_avail--;
 
 	t4_wrq_tx(sc, wr);
 
 	return (0);
 }
 
 /*
  * Handshake timer callback.
  *
  * A client connection can sometimes hang without the NIC delivering the
  * ServerHelloDone message to the host.  To unstick it, a dummy
  * RX_DATA_ACK with RX_MODULATE is sent and the timer is re-armed.  If an
  * RX timeout is configured and nothing has been received for at least
  * that long, the connection is downgraded to plain TOE mode instead and
  * the timer is not re-armed.
  */
 static void
 tls_send_handshake_ack(void *arg)
 {
 	struct toepcb *toep = arg;
 	struct adapter *sc = td_adapter(toep->td);
 	struct tcpcb *tp;
 
 	/* A closed connection needs no further nudging or rescheduling. */
 	if (toep->flags & (TPF_FIN_SENT | TPF_ABORT_SHUTDOWN))
 		return;
 
 	if (sc->tt.tls_rx_timeout != 0) {
 		tp = intotcpcb(toep->inp);
 		if ((ticks - tp->t_rcvtime) >= sc->tt.tls_rx_timeout) {
 			/* Timed out waiting for data: fall back to TOE. */
 			CTR2(KTR_CXGBE, "%s: tid %d clr_ofld_mode", __func__,
 			    toep->tid);
 			tls_clr_ofld_mode(toep);
 			return;
 		}
 	}
 
 	/*
 	 * XXX: Does not have the t4_get_tcb() checks to refine the
 	 * workaround.
 	 */
 	callout_schedule(&toep->tls.handshake_timer,
 	    TLS_SRV_HELLO_RD_TM * hz);
 
 	CTR2(KTR_CXGBE, "%s: tid %d sending RX_DATA_ACK", __func__, toep->tid);
 	send_rx_modulate(sc, toep);
 }
 
 /*
  * Arm the handshake timer to call tls_send_handshake_ack() after
  * TLS_SRV_HELLO_BKOFF_TM seconds.  The inpcb write lock must be held.
  */
 static void
 tls_start_handshake_timer(struct toepcb *toep)
 {
 
 	INP_WLOCK_ASSERT(toep->inp);
 	callout_reset(&toep->tls.handshake_timer,
 	    TLS_SRV_HELLO_BKOFF_TM * hz, tls_send_handshake_ack, toep);
 }
 
 /* Stop the handshake timer.  The inpcb write lock must be held. */
 void
 tls_stop_handshake_timer(struct toepcb *toep)
 {
 
 	INP_WLOCK_ASSERT(toep->inp);
 	callout_stop(&toep->tls.handshake_timer);
 }
 
 /*
  * Program a KTLS session key for one direction of an offloaded
  * connection.
  *
  * Checks that the adapter can offload TLS, that the connection's ULP
  * mode allows the requested direction (outside ULP_MODE_TLS only TX is
  * accepted), and that the cipher suite and TLS version are handled here
  * (AES-CBC with SHA1, SHA2-256 or SHA2-384 HMAC, or AES-GCM; TLS 1.1
  * and 1.2).  The key is then written with tls_program_key_id() and the
  * per-direction state is set up: TX caches the SCMD words and sizing
  * fields in toep->tls, RX stops the handshake timer and updates the TCB
  * before clearing RX quiesce.
  *
  * Returns 0 on success or an errno value.  Cipher, version, and RX
  * key-programming failures go through 'clr_ofld', which switches a
  * ULP_MODE_TLS connection back to plain TOE mode.  The capability and
  * ULP mode checks, the "already have a key" checks, and TX
  * key-programming failures return directly.
  */
 int
 tls_alloc_ktls(struct toepcb *toep, struct ktls_session *tls, int direction)
 {
 	struct adapter *sc = td_adapter(toep->td);
 	int error, explicit_iv_size, key_offset, mac_first;
 
 	if (!can_tls_offload(td_adapter(toep->td)))
 		return (EINVAL);
 	switch (ulp_mode(toep)) {
 	case ULP_MODE_TLS:
 		break;
 	case ULP_MODE_NONE:
 	case ULP_MODE_TCPDDP:
 		/* Only TX is accepted in these ULP modes. */
 		if (direction != KTLS_TX)
 			return (EINVAL);
 		break;
 	default:
 		return (EINVAL);
 	}
 
 	switch (tls->params.cipher_algorithm) {
 	case CRYPTO_AES_CBC:
 		/* XXX: Explicitly ignore any provided IV. */
 		switch (tls->params.cipher_key_len) {
 		case 128 / 8:
 		case 192 / 8:
 		case 256 / 8:
 			break;
 		default:
 			error = EINVAL;
 			goto clr_ofld;
 		}
 		switch (tls->params.auth_algorithm) {
 		case CRYPTO_SHA1_HMAC:
 		case CRYPTO_SHA2_256_HMAC:
 		case CRYPTO_SHA2_384_HMAC:
 			break;
 		default:
 			error = EPROTONOSUPPORT;
 			goto clr_ofld;
 		}
 		explicit_iv_size = AES_BLOCK_LEN;
 		mac_first = 1;
 		break;
 	case CRYPTO_AES_NIST_GCM_16:
 		if (tls->params.iv_len != SALT_SIZE) {
 			error = EINVAL;
 			goto clr_ofld;
 		}
 		switch (tls->params.cipher_key_len) {
 		case 128 / 8:
 		case 192 / 8:
 		case 256 / 8:
 			break;
 		default:
 			error = EINVAL;
 			goto clr_ofld;
 		}
 		explicit_iv_size = 8;
 		mac_first = 0;
 		break;
 	default:
 		error = EPROTONOSUPPORT;
 		goto clr_ofld;
 	}
 
 	/* Only TLS 1.1 and TLS 1.2 are currently supported. */
 	if (tls->params.tls_vmajor != TLS_MAJOR_VER_ONE ||
 	    tls->params.tls_vminor < TLS_MINOR_VER_ONE ||
 	    tls->params.tls_vminor > TLS_MINOR_VER_TWO) {
 		error = EPROTONOSUPPORT;
 		goto clr_ofld;
 	}
 
 	/* Bail if we already have a key. */
 	if (direction == KTLS_TX) {
 		if (toep->tls.tx_key_addr != -1)
 			return (EOPNOTSUPP);
 	} else {
 		if (toep->tls.rx_key_addr != -1)
 			return (EOPNOTSUPP);
 	}
 
 	error = tls_program_key_id(toep, tls, direction);
 	if (error) {
 		/* Only an RX failure drops the connection out of TLS mode. */
 		if (direction == KTLS_RX)
 			goto clr_ofld;
 		return (error);
 	}
 
 	if (direction == KTLS_TX) {
 		/* Cache the SCMD words used when building TX work requests. */
 		toep->tls.scmd0.seqno_numivs =
 			(V_SCMD_SEQ_NO_CTRL(3) |
 			 V_SCMD_PROTO_VERSION(t4_tls_proto_ver(tls)) |
 			 V_SCMD_ENC_DEC_CTRL(SCMD_ENCDECCTRL_ENCRYPT) |
 			 V_SCMD_CIPH_AUTH_SEQ_CTRL((mac_first == 0)) |
 			 V_SCMD_CIPH_MODE(t4_tls_cipher_mode(tls)) |
 			 V_SCMD_AUTH_MODE(t4_tls_auth_mode(tls)) |
 			 V_SCMD_HMAC_CTRL(t4_tls_hmac_ctrl(tls)) |
 			 V_SCMD_IV_SIZE(explicit_iv_size / 2));
 
 		toep->tls.scmd0.ivgen_hdrlen =
 			(V_SCMD_IV_GEN_CTRL(1) |
 			 V_SCMD_KEY_CTX_INLINE(0) |
 			 V_SCMD_TLS_FRAG_ENABLE(1));
 
 		toep->tls.iv_len = explicit_iv_size;
 		toep->tls.frag_size = tls->params.max_frame_len;
 		toep->tls.fcplenmax = get_tp_plen_max(tls);
 		toep->tls.expn_per_ulp = tls->params.tls_hlen +
 		    tls->params.tls_tlen;
 		toep->tls.pdus_per_ulp = 1;
 		toep->tls.adjusted_plen = toep->tls.expn_per_ulp +
 		    tls->params.max_frame_len;
 		toep->tls.tx_key_info_size = t4_tls_key_info_size(tls);
 	} else {
 		/* Stop timer on handshake completion */
 		tls_stop_handshake_timer(toep);
 
 		toep->flags &= ~TPF_FORCE_CREDITS;
 		toep->flags |= TPF_TLS_RECEIVE;
 		toep->tls.rx_version = tls->params.tls_vmajor << 8 |
 		    tls->params.tls_vminor;
 
 		/*
 		 * RX key tags are an index into the key portion of MA
 		 * memory stored as an offset from the base address in
 		 * units of 64 bytes.
 		 */
 		key_offset = toep->tls.rx_key_addr - sc->vres.key.start;
 		t4_set_tls_keyid(toep, key_offset / 64);
 		t4_set_tls_tcb_field(toep, W_TCB_ULP_RAW,
 				 V_TCB_ULP_RAW(M_TCB_ULP_RAW),
 				 V_TCB_ULP_RAW((V_TF_TLS_KEY_SIZE(3) |
 						V_TF_TLS_CONTROL(1) |
 						V_TF_TLS_ACTIVE(1) |
 						V_TF_TLS_ENABLE(1))));
 		t4_set_tls_tcb_field(toep, W_TCB_TLS_SEQ,
 				 V_TCB_TLS_SEQ(M_TCB_TLS_SEQ),
 				 V_TCB_TLS_SEQ(0));
 		t4_clear_rx_quiesce(toep);
 	}
 
 	return (0);
 
 clr_ofld:
 	/* Fall back to plain TOE mode if the connection was in TLS mode. */
 	if (ulp_mode(toep) == ULP_MODE_TLS) {
 		CTR2(KTR_CXGBE, "%s: tid %d clr_ofld_mode", __func__,
 		    toep->tid);
 		tls_clr_ofld_mode(toep);
 	}
 	return (error);
 }
 
 /* Mark both key slots of a new toep as unallocated (-1). */
 void
 tls_init_toep(struct toepcb *toep)
 {
 
 	toep->tls.rx_key_addr = -1;
 	toep->tls.tx_key_addr = -1;
 }
 
 /*
  * Put a newly established connection into TLS mode: set TLS_ENABLE in
  * the TCB, mark the toep as TLS-established with forced credits, and
  * start the handshake timer.
  */
 void
 tls_establish(struct toepcb *toep)
 {
 
 	/*
 	 * Enable PDU extraction.
 	 *
 	 * XXX: Supposedly this should be done by the firmware when
 	 * the ULP_MODE FLOWC parameter is set in send_flowc_wr(), but
 	 * in practice this seems to be required.
 	 */
 	CTR2(KTR_CXGBE, "%s: tid %d setting TLS_ENABLE", __func__, toep->tid);
 	t4_set_tls_tcb_field(toep, W_TCB_ULP_RAW,
 	    V_TCB_ULP_RAW(M_TCB_ULP_RAW),
 	    V_TCB_ULP_RAW(V_TF_TLS_ENABLE(1)));
 
 	toep->flags |= TPF_FORCE_CREDITS;
 	toep->flags |= TPF_TLS_ESTABLISHED;
 
 	/* The timer callout is tied to the inpcb lock. */
 	callout_init_rw(&toep->tls.handshake_timer, &toep->inp->inp_lock, 0);
 	tls_start_handshake_timer(toep);
 }
 
 /*
  * If the connection was established in TLS mode, stop the handshake
  * timer and clear the established flag.  Otherwise do nothing.
  */
 void
 tls_detach(struct toepcb *toep)
 {
 
 	if ((toep->flags & TPF_TLS_ESTABLISHED) == 0)
 		return;
 
 	tls_stop_handshake_timer(toep);
 	toep->flags &= ~TPF_TLS_ESTABLISHED;
 }
 
 /*
  * Release the key ids held by a toep.  The TPF_TLS_ESTABLISHED flag
  * must already be clear.
  */
 void
 tls_uninit_toep(struct toepcb *toep)
 {
 
 	MPASS(!(toep->flags & TPF_TLS_ESTABLISHED));
 	clear_tls_keyid(toep);
 }
 
 /* SGE_MAX_WR_LEN expressed in 16-byte units. */
 #define MAX_OFLD_TX_CREDITS (SGE_MAX_WR_LEN / 16)
 /*
  * Number of 16-byte units needed to hold the TLS TX work request
  * header, the SFO CPL, the ULPTX idata and memrd headers, and
  * AES_BLOCK_LEN + 1 further bytes.  The 'toep' argument is not used.
  */
 #define	MIN_OFLD_TLSTX_CREDITS(toep)					\
 	(howmany(sizeof(struct fw_tlstx_data_wr) + 			\
 	    sizeof(struct cpl_tx_tls_sfo) + sizeof(struct ulptx_idata) + \
 	    sizeof(struct ulptx_sc_memrd) +				\
 	    AES_BLOCK_LEN + 1, 16))
 
 /*
  * Fill in the fields of a FW_TLSTX_DATA_WR work request header.
  *
  * 'plen' and 'expn' are summed for the header's length field, and
  * 'expn' is also written on its own; 'credits' is stored as the LEN16
  * value and 'shove' as the TX_SHOVE bit.  The sizing fields come from
  * the connection's TLS state.
  */
 static void
 write_tlstx_wr(struct fw_tlstx_data_wr *txwr, struct toepcb *toep,
     unsigned int plen, unsigned int expn, uint8_t credits, int shove)
 {
 	struct tls_ofld_info *tls = &toep->tls;
 
 	txwr->op_to_immdlen = htobe32(V_WR_OP(FW_TLSTX_DATA_WR) |
 	    V_FW_TLSTX_DATA_WR_COMPL(1) | V_FW_TLSTX_DATA_WR_IMMDLEN(0));
 	txwr->flowid_len16 = htobe32(V_FW_TLSTX_DATA_WR_FLOWID(toep->tid) |
 	    V_FW_TLSTX_DATA_WR_LEN16(credits));
 	txwr->plen = htobe32(plen + expn);
 	txwr->lsodisable_to_flags = htobe32(V_TX_ULP_MODE(ULP_MODE_TLS) |
 	    V_TX_URG(0) | /* F_T6_TX_FORCE | */ V_TX_SHOVE(shove));
 	txwr->ctxloc_to_exp = htobe32(V_FW_TLSTX_DATA_WR_NUMIVS(1) |
 	    V_FW_TLSTX_DATA_WR_EXP(expn) |
 	    V_FW_TLSTX_DATA_WR_CTXLOC(TLS_SFO_WR_CONTEXTLOC_DDR) |
 	    V_FW_TLSTX_DATA_WR_IVDSGL(0) |
 	    V_FW_TLSTX_DATA_WR_KEYSIZE(tls->tx_key_info_size >> 4));
 	txwr->mfs = htobe16(tls->frag_size);
 	txwr->adjustedplen_pkd =
 	    htobe16(V_FW_TLSTX_DATA_WR_ADJUSTEDPLEN(tls->adjusted_plen));
 	txwr->expinplenmax_pkd =
 	    htobe16(V_FW_TLSTX_DATA_WR_EXPINPLENMAX(tls->expn_per_ulp));
 	/* Stored without a byte swap, unlike the fields above. */
 	txwr->pdusinplenmax_pkd =
 	    V_FW_TLSTX_DATA_WR_PDUSINPLENMAX(tls->pdus_per_ulp);
 }
 
 /*
  * Fill in a CPL_TX_TLS_SFO message for one TLS record.
  *
  * The data type is derived from the record header's content type.  The
  * raw content type is written to type_protover only when it maps to
  * the CUSTOM type; otherwise that field is left untouched.  'seqno' is
  * stored in scmd1.
  */
 static void
 write_tlstx_cpl(struct cpl_tx_tls_sfo *cpl, struct toepcb *toep,
     struct tls_hdr *tls_hdr, unsigned int plen, uint64_t seqno)
 {
 	struct tls_ofld_info *tls = &toep->tls;
 	int sfo_type, seglen;
 
 	sfo_type = tls_content_type(tls_hdr->type);
 	seglen = plen;
 
 	cpl->op_to_seg_len = htobe32(V_CPL_TX_TLS_SFO_OPCODE(CPL_TX_TLS_SFO) |
 	    V_CPL_TX_TLS_SFO_DATA_TYPE(sfo_type) |
 	    V_CPL_TX_TLS_SFO_CPL_LEN(2) |
 	    V_CPL_TX_TLS_SFO_SEG_LEN(seglen));
 	cpl->pld_len = htobe32(plen);
 	if (sfo_type == CPL_TX_TLS_SFO_TYPE_CUSTOM) {
 		cpl->type_protover =
 		    htobe32(V_CPL_TX_TLS_SFO_TYPE(tls_hdr->type));
 	}
 	cpl->seqno_numivs = htobe32(tls->scmd0.seqno_numivs |
 	    V_SCMD_NUM_IVS(1));
 	cpl->ivgen_hdrlen = htobe32(tls->scmd0.ivgen_hdrlen);
 	cpl->scmd1 = htobe64(seqno);
 }
 
 /*
  * Count the physically contiguous runs in the mbuf's m_epg_pa[] page
  * array.  A new run starts whenever a page does not begin exactly
  * PAGE_SIZE after the start of the previous page.  The mbuf must have
  * at least one page.
  */
 static int
 count_ext_pgs_segs(struct mbuf *m)
 {
 	u_int idx, runs;
 
 	MPASS(m->m_epg_npgs > 0);
 	runs = 1;
 	for (idx = 1; idx < m->m_epg_npgs; idx++) {
 		if (m->m_epg_pa[idx - 1] + PAGE_SIZE != m->m_epg_pa[idx])
 			runs++;
 	}
 	return (runs);
 }
 
 /*
  * Write a ULP_TX_SC_DSGL scatter/gather list at 'dst' describing the
  * pages in the m_epg_pa[] array of 'm'.  Pages that follow one another
  * in physical memory are merged into a single entry.  The first entry
  * goes in addr0/len0; later entries are packed two per element of
  * sge[].  'nsegs' is the number of entries the caller expects; it is
  * written into the command word and, under INVARIANTS, checked against
  * the number of entries actually produced.
  */
 static void
 write_ktlstx_sgl(void *dst, struct mbuf *m, int nsegs)
 {
 	struct ulptx_sgl *usgl = dst;
 	vm_paddr_t pa;
 	uint32_t len;
 	int i, j;
 
 	KASSERT(nsegs > 0, ("%s: nsegs 0", __func__));
 
 	usgl->cmd_nsge = htobe32(V_ULPTX_CMD(ULP_TX_SC_DSGL) |
 	    V_ULPTX_NSGE(nsegs));
 
 	/* Figure out the first S/G length. */
 	pa = m->m_epg_pa[0] + m->m_epg_1st_off;
 	usgl->addr0 = htobe64(pa);
 	len = m_epg_pagelen(m, 0, m->m_epg_1st_off);
 	pa += len;
 	/* Extend the first entry across every page that continues it. */
 	for (i = 1; i < m->m_epg_npgs; i++) {
 		if (m->m_epg_pa[i] != pa)
 			break;
 		len += m_epg_pagelen(m, i, 0);
 		pa += m_epg_pagelen(m, i, 0);
 	}
 	usgl->len0 = htobe32(len);
 #ifdef INVARIANTS
 	nsegs--;
 #endif
 
 	/*
 	 * 'j' indexes the entries after the first one (-1 means none yet);
 	 * entry j lives in sge[j / 2], slot j & 1.
 	 */
 	j = -1;
 	for (; i < m->m_epg_npgs; i++) {
 		if (j == -1 || m->m_epg_pa[i] != pa) {
 			/* Close the previous entry before starting a new one. */
 			if (j >= 0)
 				usgl->sge[j / 2].len[j & 1] = htobe32(len);
 			j++;
 #ifdef INVARIANTS
 			nsegs--;
 #endif
 			pa = m->m_epg_pa[i];
 			usgl->sge[j / 2].addr[j & 1] = htobe64(pa);
 			len = m_epg_pagelen(m, i, 0);
 			pa += len;
 		} else {
 			len += m_epg_pagelen(m, i, 0);
 			pa += m_epg_pagelen(m, i, 0);
 		}
 	}
 	if (j >= 0) {
 		/* Store the length of the last entry. */
 		usgl->sge[j / 2].len[j & 1] = htobe32(len);
 
 		/* Zero the unused second length of a half-filled pair. */
 		if ((j & 1) == 0)
 			usgl->sge[j / 2].len[1] = htobe32(0);
 	}
 	KASSERT(nsegs == 0, ("%s: nsegs %d, m %p", __func__, nsegs, m));
 }
 
 /*
  * Similar to t4_push_frames() but handles sockets that contain TLS
  * record mbufs.
  *
  * Each pass of the loop sends the next unsent mbuf in the socket's
  * send buffer as one work request made up of, in order: a
  * fw_tlstx_data_wr, a cpl_tx_tls_sfo, a no-op ulptx_idata, a
  * ulptx_sc_memrd referencing the TX key, AES_BLOCK_LEN bytes holding
  * the explicit IV, and a ulptx_sgl describing the mbuf's pages.
  *
  * 'drop' is the number of bytes to remove from the front of the send
  * buffer before looking for data to send.  The caller must hold the
  * inpcb write lock.
  *
  * The function returns when there is no ready data, after sending a
  * FIN, or after setting TPF_TX_SUSPENDED because credits ran out or a
  * work request could not be allocated.
  */
 void
 t4_push_ktls(struct adapter *sc, struct toepcb *toep, int drop)
 {
 	struct tls_hdr *thdr;
 	struct fw_tlstx_data_wr *txwr;
 	struct cpl_tx_tls_sfo *cpl;
 	struct ulptx_idata *idata;
 	struct ulptx_sc_memrd *memrd;
 	struct wrqe *wr;
 	struct mbuf *m;
 	u_int nsegs, credits, wr_len;
 	u_int expn_size;
 	struct inpcb *inp = toep->inp;
 	struct tcpcb *tp = intotcpcb(inp);
 	struct socket *so = inp->inp_socket;
 	struct sockbuf *sb = &so->so_snd;
 	int tls_size, tx_credits, shove, sowwakeup;
 	struct ofld_tx_sdesc *txsd;
 	char *buf;
 
 	INP_WLOCK_ASSERT(inp);
 	KASSERT(toep->flags & TPF_FLOWC_WR_SENT,
 	    ("%s: flowc_wr not sent for tid %u.", __func__, toep->tid));
 
 	KASSERT(ulp_mode(toep) == ULP_MODE_NONE ||
 	    ulp_mode(toep) == ULP_MODE_TCPDDP || ulp_mode(toep) == ULP_MODE_TLS,
 	    ("%s: ulp_mode %u for toep %p", __func__, ulp_mode(toep), toep));
 	KASSERT(tls_tx_key(toep),
 	    ("%s: TX key not set for toep %p", __func__, toep));
 
 #ifdef VERBOSE_TRACES
 	/* Five conversions in the format, so CTR5 with 'drop' last. */
 	CTR5(KTR_CXGBE, "%s: tid %d toep flags %#x tp flags %#x drop %d",
 	    __func__, toep->tid, toep->flags, tp->t_flags, drop);
 #endif
 	if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN))
 		return;
 
 #ifdef RATELIMIT
 	if (__predict_false(inp->inp_flags2 & INP_RATE_LIMIT_CHANGED) &&
 	    (update_tx_rate_limit(sc, toep, so->so_max_pacing_rate) == 0)) {
 		inp->inp_flags2 &= ~INP_RATE_LIMIT_CHANGED;
 	}
 #endif
 
 	/*
 	 * This function doesn't resume by itself.  Someone else must clear the
 	 * flag and call this function.
 	 */
 	if (__predict_false(toep->flags & TPF_TX_SUSPENDED)) {
 		KASSERT(drop == 0,
 		    ("%s: drop (%d) != 0 but tx is suspended", __func__, drop));
 		return;
 	}
 
 	txsd = &toep->txsd[toep->txsd_pidx];
 	for (;;) {
 		tx_credits = min(toep->tx_credits, MAX_OFLD_TX_CREDITS);
 
 		SOCKBUF_LOCK(sb);
 		/* Dropping data frees space, so writers must be woken. */
 		sowwakeup = drop;
 		if (drop) {
 			sbdrop_locked(sb, drop);
 			drop = 0;
 		}
 
 		/* Next unsent mbuf: after sb_sndptr, or the buffer head. */
 		m = sb->sb_sndptr != NULL ? sb->sb_sndptr->m_next : sb->sb_mb;
 
 		/*
 		 * Send a FIN if requested, but only if there's no
 		 * more data to send.
 		 */
 		if (m == NULL && toep->flags & TPF_SEND_FIN) {
 			if (sowwakeup)
 				sowwakeup_locked(so);
 			else
 				SOCKBUF_UNLOCK(sb);
 			SOCKBUF_UNLOCK_ASSERT(sb);
 			t4_close_conn(sc, toep);
 			return;
 		}
 
 		/*
 		 * If there is no ready data to send, wait until more
 		 * data arrives.
 		 */
 		if (m == NULL || (m->m_flags & M_NOTAVAIL) != 0) {
 			if (sowwakeup)
 				sowwakeup_locked(so);
 			else
 				SOCKBUF_UNLOCK(sb);
 			SOCKBUF_UNLOCK_ASSERT(sb);
 #ifdef VERBOSE_TRACES
 			CTR2(KTR_CXGBE, "%s: tid %d no ready data to send",
 			    __func__, toep->tid);
 #endif
 			return;
 		}
 
 		KASSERT(m->m_flags & M_EXTPG, ("%s: mbuf %p is not NOMAP",
 		    __func__, m));
 		KASSERT(m->m_epg_tls != NULL,
 		    ("%s: mbuf %p doesn't have TLS session", __func__, m));
 
 		/* Calculate WR length. */
 		wr_len = sizeof(struct fw_tlstx_data_wr) +
 		    sizeof(struct cpl_tx_tls_sfo) +
 		    sizeof(struct ulptx_idata) + sizeof(struct ulptx_sc_memrd);
 
 		/* Explicit IVs for AES-CBC and AES-GCM are <= 16. */
 		MPASS(toep->tls.iv_len <= AES_BLOCK_LEN);
 		wr_len += AES_BLOCK_LEN;
 
 		/* Account for SGL in work request length. */
 		nsegs = count_ext_pgs_segs(m);
 		wr_len += sizeof(struct ulptx_sgl) +
 		    ((3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1)) * 8;
 
 		/* Not enough credits for this work request. */
 		if (howmany(wr_len, 16) > tx_credits) {
 			if (sowwakeup)
 				sowwakeup_locked(so);
 			else
 				SOCKBUF_UNLOCK(sb);
 			SOCKBUF_UNLOCK_ASSERT(sb);
 #ifdef VERBOSE_TRACES
 			CTR5(KTR_CXGBE,
 	    "%s: tid %d mbuf %p requires %d credits, but only %d available",
 			    __func__, toep->tid, m, howmany(wr_len, 16),
 			    tx_credits);
 #endif
 			toep->flags |= TPF_TX_SUSPENDED;
 			return;
 		}
 
 		/* Shove if there is no additional data pending. */
 		shove = ((m->m_next == NULL ||
 		    (m->m_next->m_flags & M_NOTAVAIL) != 0)) &&
 		    (tp->t_flags & TF_MORETOCOME) == 0;
 
 		/* Grow the send buffer once it is at least 7/8 full. */
 		if (sb->sb_flags & SB_AUTOSIZE &&
 		    V_tcp_do_autosndbuf &&
 		    sb->sb_hiwat < V_tcp_autosndbuf_max &&
 		    sbused(sb) >= sb->sb_hiwat * 7 / 8) {
 			int newsize = min(sb->sb_hiwat + V_tcp_autosndbuf_inc,
 			    V_tcp_autosndbuf_max);
 
 			if (!sbreserve_locked(so, SO_SND, newsize, NULL))
 				sb->sb_flags &= ~SB_AUTOSIZE;
 			else
 				sowwakeup = 1;	/* room available */
 		}
 		if (sowwakeup)
 			sowwakeup_locked(so);
 		else
 			SOCKBUF_UNLOCK(sb);
 		SOCKBUF_UNLOCK_ASSERT(sb);
 
 		if (__predict_false(toep->flags & TPF_FIN_SENT))
 			panic("%s: excess tx.", __func__);
 
 		wr = alloc_wrqe(roundup2(wr_len, 16), &toep->ofld_txq->wrq);
 		if (wr == NULL) {
 			/* XXX: how will we recover from this? */
 			toep->flags |= TPF_TX_SUSPENDED;
 			return;
 		}
 
 		thdr = (struct tls_hdr *)&m->m_epg_hdr;
 #ifdef VERBOSE_TRACES
 		CTR5(KTR_CXGBE, "%s: tid %d TLS record %ju type %d len %#x",
 		    __func__, toep->tid, m->m_epg_seqno, thdr->type,
 		    m->m_len);
 #endif
 		txwr = wrtod(wr);
 		cpl = (struct cpl_tx_tls_sfo *)(txwr + 1);
 		memset(txwr, 0, roundup2(wr_len, 16));
 		credits = howmany(wr_len, 16);
 		/* Header plus trailer bytes; the rest of m_len is payload. */
 		expn_size = m->m_epg_hdrlen +
 		    m->m_epg_trllen;
 		tls_size = m->m_len - expn_size;
 		write_tlstx_wr(txwr, toep, tls_size, expn_size, credits, shove);
 		write_tlstx_cpl(cpl, toep, thdr, tls_size, m->m_epg_seqno);
 
 		idata = (struct ulptx_idata *)(cpl + 1);
 		idata->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_NOOP));
 		idata->len = htobe32(0);
 		memrd = (struct ulptx_sc_memrd *)(idata + 1);
 		memrd->cmd_to_len = htobe32(V_ULPTX_CMD(ULP_TX_SC_MEMRD) |
 		    V_ULP_TX_SC_MORE(1) |
 		    V_ULPTX_LEN16(toep->tls.tx_key_info_size >> 4));
 		memrd->addr = htobe32(toep->tls.tx_key_addr >> 5);
 
 		/*
 		 * Copy IV.  It is read from just past the TLS header and
 		 * always occupies a full AES_BLOCK_LEN slot in the WR.
 		 */
 		buf = (char *)(memrd + 1);
 		memcpy(buf, thdr + 1, toep->tls.iv_len);
 		buf += AES_BLOCK_LEN;
 
 		write_ktlstx_sgl(buf, m, nsegs);
 
 		KASSERT(toep->tx_credits >= credits,
 			("%s: not enough credits", __func__));
 
 		toep->tx_credits -= credits;
 
 		tp->snd_nxt += m->m_len;
 		tp->snd_max += m->m_len;
 
 		/* Record this mbuf as the last one handed to the hardware. */
 		SOCKBUF_LOCK(sb);
 		sb->sb_sndptr = m;
 		SOCKBUF_UNLOCK(sb);
 
 		toep->flags |= TPF_TX_DATA_SENT;
 		if (toep->tx_credits < MIN_OFLD_TLSTX_CREDITS(toep))
 			toep->flags |= TPF_TX_SUSPENDED;
 
 		/* Fill the next tx descriptor; the ring wraps at txsd_total. */
 		KASSERT(toep->txsd_avail > 0, ("%s: no txsd", __func__));
 		txsd->plen = m->m_len;
 		txsd->tx_credits = credits;
 		txsd++;
 		if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) {
 			toep->txsd_pidx = 0;
 			txsd = &toep->txsd[0];
 		}
 		toep->txsd_avail--;
 
 		counter_u64_add(toep->ofld_txq->tx_toe_tls_records, 1);
 		counter_u64_add(toep->ofld_txq->tx_toe_tls_octets, m->m_len);
 
 		t4_l2t_send(sc, wr, toep->l2te);
 	}
 }
 
 /*
  * For TLS data, mbufs received via CPL_TLS_DATA are placed into
  * an mbufq in the TLS offload state.  When CPL_RX_TLS_CMP is
  * received, the completed PDUs are placed into the socket receive
  * buffer.
  *
  * The TLS code reuses the ulp_pdu_reclaimq to hold the pending mbufs.
  *
  * This handler strips the CPL header, records the TCP sequence number
  * from the CPL in the mbuf's packet header, and queues the mbuf.  It
  * always returns 0; the mbuf is freed here if the connection has been
  * dropped or the enqueue fails.
  */
 static int
 do_tls_data(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
 {
 	struct adapter *sc = iq->adapter;
 	const struct cpl_tls_data *cpl = mtod(m, const void *);
 	unsigned int tid = GET_TID(cpl);
 	struct toepcb *toep = lookup_tid(sc, tid);
 	struct inpcb *inp = toep->inp;
 	struct tcpcb *tp;
 	int len;
 
 	/* XXX: Should this match do_rx_data instead? */
 	KASSERT(!(toep->flags & TPF_SYNQE),
 	    ("%s: toep %p claims to be a synq entry", __func__, toep));
 
 	KASSERT(toep->tid == tid, ("%s: toep tid/atid mismatch", __func__));
 
 	/* strip off CPL header */
 	m_adj(m, sizeof(*cpl));
 	len = m->m_pkthdr.len;
 
 	toep->ofld_rxq->rx_toe_tls_octets += len;
 
 	KASSERT(len == G_CPL_TLS_DATA_LENGTH(be32toh(cpl->length_pkd)),
 	    ("%s: payload length mismatch", __func__));
 
 	INP_WLOCK(inp);
-	if (inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) {
+	if (inp->inp_flags & INP_DROPPED) {
 		CTR4(KTR_CXGBE, "%s: tid %u, rx (%d bytes), inp_flags 0x%x",
 		    __func__, tid, len, inp->inp_flags);
 		INP_WUNLOCK(inp);
 		m_freem(m);
 		return (0);
 	}
 
 	/* Save TCP sequence number. */
 	m->m_pkthdr.tls_tcp_seq = be32toh(cpl->seq);
 
 	if (mbufq_enqueue(&toep->ulp_pdu_reclaimq, m)) {
 #ifdef INVARIANTS
 		panic("Failed to queue TLS data packet");
 #else
 		printf("%s: Failed to queue TLS data packet\n", __func__);
 		INP_WUNLOCK(inp);
 		m_freem(m);
 		return (0);
 #endif
 	}
 
 	/* Note the time of this receive on the connection. */
 	tp = intotcpcb(inp);
 	tp->t_rcvtime = ticks;
 
 #ifdef VERBOSE_TRACES
 	CTR4(KTR_CXGBE, "%s: tid %u len %d seq %u", __func__, tid, len,
 	    be32toh(cpl->seq));
 #endif
 
 	INP_WUNLOCK(inp);
 	return (0);
 }
 
 /*
  * Handler for CPL_RX_TLS_CMP.
  *
  * The CPL payload is a tlsrx_hdr_pkt describing one received TLS
  * record.  The matching record data, if any, is dequeued from
  * toep->ulp_pdu_reclaimq.  On success the data is appended to the
  * socket receive buffer together with a TLS_GET_RECORD control message
  * carrying the record type, version and length.
  *
  * A record flagged with M_TLSRX_HDR_PKT_ERROR is reported to the socket
  * as EBADMSG and a failed control-mbuf allocation as ENOBUFS; in both
  * cases the record is discarded.  If the receive side of the socket has
  * already been shut down, the connection is dropped with ECONNRESET.
  *
  * Always returns 0 and consumes 'm'.
  */
 static int
 do_rx_tls_cmp(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
 {
 	struct adapter *sc = iq->adapter;
 	const struct cpl_rx_tls_cmp *cpl = mtod(m, const void *);
 	struct tlsrx_hdr_pkt *tls_hdr_pkt;
 	unsigned int tid = GET_TID(cpl);
 	struct toepcb *toep = lookup_tid(sc, tid);
 	struct inpcb *inp = toep->inp;
 	struct tcpcb *tp;
 	struct socket *so;
 	struct sockbuf *sb;
 	struct mbuf *tls_data;
 	struct tls_get_record *tgr;
 	struct mbuf *control;
 	int pdu_length, rx_credits;
 #if defined(KTR) || defined(INVARIANTS)
 	int len;
 #endif
 
 	KASSERT(toep->tid == tid, ("%s: toep tid/atid mismatch", __func__));
 	KASSERT(!(toep->flags & TPF_SYNQE),
 	    ("%s: toep %p claims to be a synq entry", __func__, toep));
 
 	/* strip off CPL header */
 	m_adj(m, sizeof(*cpl));
 #if defined(KTR) || defined(INVARIANTS)
 	len = m->m_pkthdr.len;
 #endif
 
 	toep->ofld_rxq->rx_toe_tls_records++;
 
 	KASSERT(len == G_CPL_RX_TLS_CMP_LENGTH(be32toh(cpl->pdulength_length)),
 	    ("%s: payload length mismatch", __func__));
 
 	INP_WLOCK(inp);
-	if (inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) {
+	if (inp->inp_flags & INP_DROPPED) {
 		CTR4(KTR_CXGBE, "%s: tid %u, rx (%d bytes), inp_flags 0x%x",
 		    __func__, tid, len, inp->inp_flags);
 		INP_WUNLOCK(inp);
 		m_freem(m);
 		return (0);
 	}
 
 	pdu_length = G_CPL_RX_TLS_CMP_PDULENGTH(be32toh(cpl->pdulength_length));
 
 	so = inp_inpcbtosocket(inp);
 	tp = intotcpcb(inp);
 
 #ifdef VERBOSE_TRACES
 	CTR6(KTR_CXGBE, "%s: tid %u PDU len %d len %d seq %u, rcv_nxt %u",
 	    __func__, tid, pdu_length, len, be32toh(cpl->seq), tp->rcv_nxt);
 #endif
 
 	/* The whole PDU consumes sequence space and receive window. */
 	tp->rcv_nxt += pdu_length;
 	KASSERT(tp->rcv_wnd >= pdu_length,
 	    ("%s: negative window size", __func__));
 	tp->rcv_wnd -= pdu_length;
 
 	/* XXX: Not sure what to do about urgent data. */
 
 	/*
 	 * The payload of this CPL is the TLS header followed by
 	 * additional fields.
 	 */
 	KASSERT(m->m_len >= sizeof(*tls_hdr_pkt),
 	    ("%s: payload too small", __func__));
 	tls_hdr_pkt = mtod(m, void *);
 
 	/* May be NULL if no record data was queued for this completion. */
 	tls_data = mbufq_dequeue(&toep->ulp_pdu_reclaimq);
 	if (tls_data != NULL) {
 		KASSERT(be32toh(cpl->seq) == tls_data->m_pkthdr.tls_tcp_seq,
 		    ("%s: sequence mismatch", __func__));
 	}
 
 	/* Report decryption errors as EBADMSG. */
 	if ((tls_hdr_pkt->res_to_mac_error & M_TLSRX_HDR_PKT_ERROR) != 0) {
 		m_freem(m);
 		m_freem(tls_data);
 
 		CURVNET_SET(toep->vnet);
 		so->so_error = EBADMSG;
 		sorwakeup(so);
 
 		INP_WUNLOCK(inp);
 		CURVNET_RESTORE();
 
 		return (0);
 	}
 
 	/* Allocate the control message mbuf. */
 	control = sbcreatecontrol(NULL, sizeof(*tgr), TLS_GET_RECORD,
 	    IPPROTO_TCP, M_NOWAIT);
 	if (control == NULL) {
 		m_freem(m);
 		m_freem(tls_data);
 
 		CURVNET_SET(toep->vnet);
 		so->so_error = ENOBUFS;
 		sorwakeup(so);
 
 		INP_WUNLOCK(inp);
 		CURVNET_RESTORE();
 
 		return (0);
 	}
 
 	tgr = (struct tls_get_record *)
 	    CMSG_DATA(mtod(control, struct cmsghdr *));
 	memset(tgr, 0, sizeof(*tgr));
 	tgr->tls_type = tls_hdr_pkt->type;
 	tgr->tls_vmajor = be16toh(tls_hdr_pkt->version) >> 8;
 	tgr->tls_vminor = be16toh(tls_hdr_pkt->version) & 0xff;
 
 	/* Done with the CPL payload; 'm' is reused for the record data. */
 	m_freem(m);
 
 	if (tls_data != NULL) {
 		m_last(tls_data)->m_flags |= M_EOR;
 		tgr->tls_length = htobe16(tls_data->m_pkthdr.len);
 	} else
 		tgr->tls_length = 0;
 	m = tls_data;
 
 	sb = &so->so_rcv;
 	SOCKBUF_LOCK(sb);
 
 	if (__predict_false(sb->sb_state & SBS_CANTRCVMORE)) {
 		struct epoch_tracker et;
 
 		CTR3(KTR_CXGBE, "%s: tid %u, excess rx (%d bytes)",
 		    __func__, tid, pdu_length);
 		m_freem(m);
 		m_freem(control);
 		SOCKBUF_UNLOCK(sb);
 		INP_WUNLOCK(inp);
 
 		CURVNET_SET(toep->vnet);
 		NET_EPOCH_ENTER(et);
 		INP_WLOCK(inp);
 		tp = tcp_drop(tp, ECONNRESET);
 		if (tp)
 			INP_WUNLOCK(inp);
 		NET_EPOCH_EXIT(et);
 		CURVNET_RESTORE();
 
 		return (0);
 	}
 
 	/*
 	 * Not all of the bytes on the wire are included in the socket buffer
 	 * (e.g. the MAC of the TLS record).  However, those bytes are included
 	 * in the TCP sequence space.
 	 */
 
 	/*
 	 * receive buffer autosize
 	 *
 	 * 'm' is NULL when no record data was queued, so it must be
 	 * checked before its packet header length is examined.
 	 */
 	MPASS(toep->vnet == so->so_vnet);
 	CURVNET_SET(toep->vnet);
 	if (sb->sb_flags & SB_AUTOSIZE &&
 	    V_tcp_do_autorcvbuf &&
 	    sb->sb_hiwat < V_tcp_autorcvbuf_max &&
 	    m != NULL && m->m_pkthdr.len > (sbspace(sb) / 8 * 7)) {
 		unsigned int hiwat = sb->sb_hiwat;
 		unsigned int newsize = min(hiwat + sc->tt.autorcvbuf_inc,
 		    V_tcp_autorcvbuf_max);
 
 		if (!sbreserve_locked(so, SO_RCV, newsize, NULL))
 			sb->sb_flags &= ~SB_AUTOSIZE;
 	}
 
 	sbappendcontrol_locked(sb, m, control, 0);
 	/* Credits are the buffer space not already covered by rcv_wnd. */
 	rx_credits = sbspace(sb) > tp->rcv_wnd ? sbspace(sb) - tp->rcv_wnd : 0;
 #ifdef VERBOSE_TRACES
 	CTR4(KTR_CXGBE, "%s: tid %u rx_credits %u rcv_wnd %u",
 	    __func__, tid, rx_credits, tp->rcv_wnd);
 #endif
 	if (rx_credits > 0 && sbused(sb) + tp->rcv_wnd < sb->sb_lowat) {
 		rx_credits = send_rx_credits(sc, toep, rx_credits);
 		tp->rcv_wnd += rx_credits;
 		tp->rcv_adv += rx_credits;
 	}
 
 	sorwakeup_locked(so);
 	SOCKBUF_UNLOCK_ASSERT(sb);
 
 	INP_WUNLOCK(inp);
 	CURVNET_RESTORE();
 	return (0);
 }
 
 /*
  * Handle a struct cpl_rx_data payload 'm' for a connection with TLS
  * offload state.
  *
  * The payload is accounted against rcv_nxt and rcv_wnd, its TLS header
  * is checked, and in every case an error is posted on the socket: no
  * path through this function delivers the data to the receive buffer.
  *
  * The caller must hold the inpcb write lock.  This function releases
  * that lock and frees 'm' before returning.
  */
 void
 do_rx_data_tls(const struct cpl_rx_data *cpl, struct toepcb *toep,
     struct mbuf *m)
 {
 	struct inpcb *inp = toep->inp;
 	struct tls_ofld_info *tls_ofld = &toep->tls;
 	struct tls_hdr *hdr;
 	struct tcpcb *tp;
 	struct socket *so;
 	struct sockbuf *sb;
 	int len, rx_credits;
 
 	len = m->m_pkthdr.len;
 
 	INP_WLOCK_ASSERT(inp);
 
 	so = inp_inpcbtosocket(inp);
 	tp = intotcpcb(inp);
 	sb = &so->so_rcv;
 	SOCKBUF_LOCK(sb);
 	CURVNET_SET(toep->vnet);
 
 	tp->rcv_nxt += len;
 	KASSERT(tp->rcv_wnd >= len, ("%s: negative window size", __func__));
 	tp->rcv_wnd -= len;
 
 	/* Do we have a full TLS header? */
 	if (len < sizeof(*hdr)) {
 		CTR3(KTR_CXGBE, "%s: tid %u len %d: too short for a TLS header",
 		    __func__, toep->tid, len);
 		so->so_error = EMSGSIZE;
 		goto out;
 	}
 	hdr = mtod(m, struct tls_hdr *);
 
 	/* Is the header valid? */
 	if (be16toh(hdr->version) != tls_ofld->rx_version) {
 		CTR3(KTR_CXGBE, "%s: tid %u invalid version %04x",
 		    __func__, toep->tid, be16toh(hdr->version));
 		so->so_error = EINVAL;
 		goto out;
 	}
 	if (be16toh(hdr->length) < sizeof(*hdr)) {
 		CTR3(KTR_CXGBE, "%s: tid %u invalid length %u",
 		    __func__, toep->tid, be16toh(hdr->length));
 		so->so_error = EBADMSG;
 		goto out;
 	}
 
 	/* Did we get a truncated record? */
 	if (len < be16toh(hdr->length)) {
 		CTR4(KTR_CXGBE, "%s: tid %u truncated TLS record (%d vs %u)",
 		    __func__, toep->tid, len, be16toh(hdr->length));
 
 		so->so_error = EMSGSIZE;
 		goto out;
 	}
 
 	/* Is the header type unknown? */
 	switch (hdr->type) {
 	case CONTENT_TYPE_CCS:
 	case CONTENT_TYPE_ALERT:
 	case CONTENT_TYPE_APP_DATA:
 	case CONTENT_TYPE_HANDSHAKE:
 		break;
 	default:
 		CTR3(KTR_CXGBE, "%s: tid %u invalid TLS record type %u",
 		    __func__, toep->tid, hdr->type);
 		so->so_error = EBADMSG;
 		goto out;
 	}
 
 	/*
 	 * Just punt.  Although this could fall back to software
 	 * decryption, this case should never really happen.
 	 */
 	CTR4(KTR_CXGBE, "%s: tid %u dropping TLS record type %u, length %u",
 	    __func__, toep->tid, hdr->type, be16toh(hdr->length));
 	so->so_error = EBADMSG;
 
 out:
 	/*
 	 * This connection is going to die anyway, so probably don't
 	 * need to bother with returning credits.
 	 */
 	rx_credits = sbspace(sb) > tp->rcv_wnd ? sbspace(sb) - tp->rcv_wnd : 0;
 #ifdef VERBOSE_TRACES
 	CTR4(KTR_CXGBE, "%s: tid %u rx_credits %u rcv_wnd %u",
 	    __func__, toep->tid, rx_credits, tp->rcv_wnd);
 #endif
 	if (rx_credits > 0 && sbused(sb) + tp->rcv_wnd < sb->sb_lowat) {
 		rx_credits = send_rx_credits(toep->vi->adapter, toep,
 		    rx_credits);
 		tp->rcv_wnd += rx_credits;
 		tp->rcv_adv += rx_credits;
 	}
 
 	/* Wake any reader so it observes so_error; drops the sockbuf lock. */
 	sorwakeup_locked(so);
 	SOCKBUF_UNLOCK_ASSERT(sb);
 
 	INP_WUNLOCK(inp);
 	CURVNET_RESTORE();
 
 	m_freem(m);
 }
 
 /*
  * Install the CPL handlers for the two TLS receive messages handled in
  * this file.
  */
 void
 t4_tls_mod_load(void)
 {
 
 	/* Record completions. */
 	t4_register_cpl_handler(CPL_RX_TLS_CMP, do_rx_tls_cmp);
 	/* Record data. */
 	t4_register_cpl_handler(CPL_TLS_DATA, do_tls_data);
 }
 
 /*
  * Remove the CPL handlers installed by t4_tls_mod_load() by registering
  * NULL for each opcode.
  */
 void
 t4_tls_mod_unload(void)
 {
 
 	t4_register_cpl_handler(CPL_RX_TLS_CMP, NULL);
 	t4_register_cpl_handler(CPL_TLS_DATA, NULL);
 }
 #endif	/* TCP_OFFLOAD */
 #endif	/* KERN_TLS */
diff --git a/sys/kern/uipc_ktls.c b/sys/kern/uipc_ktls.c
index bf91b774c0dc..83fcc9e70546 100644
--- a/sys/kern/uipc_ktls.c
+++ b/sys/kern/uipc_ktls.c
@@ -1,3254 +1,3253 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause
  *
  * Copyright (c) 2014-2019 Netflix Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_kern_tls.h"
 #include "opt_ratelimit.h"
 #include "opt_rss.h"
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/domainset.h>
 #include <sys/endian.h>
 #include <sys/ktls.h>
 #include <sys/lock.h>
 #include <sys/mbuf.h>
 #include <sys/mutex.h>
 #include <sys/rmlock.h>
 #include <sys/proc.h>
 #include <sys/protosw.h>
 #include <sys/refcount.h>
 #include <sys/smp.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sysctl.h>
 #include <sys/taskqueue.h>
 #include <sys/kthread.h>
 #include <sys/uio.h>
 #include <sys/vmmeter.h>
 #if defined(__aarch64__) || defined(__amd64__) || defined(__i386__)
 #include <machine/pcb.h>
 #endif
 #include <machine/vmparam.h>
 #include <net/if.h>
 #include <net/if_var.h>
 #ifdef RSS
 #include <net/netisr.h>
 #include <net/rss_config.h>
 #endif
 #include <net/route.h>
 #include <net/route/nhop.h>
 #if defined(INET) || defined(INET6)
 #include <netinet/in.h>
 #include <netinet/in_pcb.h>
 #endif
 #include <netinet/tcp_var.h>
 #ifdef TCP_OFFLOAD
 #include <netinet/tcp_offload.h>
 #endif
 #include <opencrypto/cryptodev.h>
 #include <opencrypto/ktls.h>
 #include <vm/uma_dbg.h>
 #include <vm/vm.h>
 #include <vm/vm_pageout.h>
 #include <vm/vm_page.h>
 #include <vm/vm_pagequeue.h>
 
 /*
  * Per-CPU work queue.  ktls_init() allocates one for every CPU id up to
  * mp_maxid and starts a ktls_work_thread() for each present CPU.  The
  * structure is cache-line aligned.
  */
 struct ktls_wq {
 	struct mtx	mtx;		/* initialized as "ktls work queue" */
 	STAILQ_HEAD(, mbuf) m_head;	/* queued mbufs */
 	STAILQ_HEAD(, socket) so_head;	/* queued sockets */
 	bool		running;	/* presumably: worker is active -- TODO confirm */
 	int		lastallocfail;	/* NOTE(review): not used in this part of the file */
 } __aligned(CACHE_LINE_SIZE);
 
 /*
  * State for a buffer allocation thread; ktls_init() starts one per
  * eligible memory domain when ktls_sw_buffer_cache is enabled.
  */
 struct ktls_alloc_thread {
 	uint64_t wakeups;	/* presumably a wakeup count -- TODO confirm */
 	uint64_t allocs;	/* presumably an allocation count -- TODO confirm */
 	struct thread *td;	/* filled in by kproc_kthread_add() in ktls_init() */
 	int running;		/* NOTE(review): not used in this part of the file */
 };
 
 /*
  * Per-memory-domain information.  When ktls_bind_threads > 1,
  * ktls_init() records every CPU's id in the entry for that CPU's domain.
  */
 struct ktls_domain_info {
 	int count;			/* number of valid entries in cpu[] */
 	int cpu[MAXCPU];		/* ids of the CPUs in this domain */
 	struct ktls_alloc_thread alloc_td;	/* this domain's allocation thread */
 };
 
 /* Indexed by memory domain. */
 struct ktls_domain_info ktls_domains[MAXMEMDOM];
 /* Array of mp_maxid + 1 work queues, allocated in ktls_init(). */
 static struct ktls_wq *ktls_wq;
 /* Process that all KTLS kernel threads are added to. */
 static struct proc *ktls_proc;
 /* UMA zone for struct ktls_session. */
 static uma_zone_t ktls_session_zone;
 /* Cache zone of output buffers; only created if ktls_sw_buffer_cache. */
 static uma_zone_t ktls_buffer_zone;
 /* Maps a worker index in [0, ktls_number_threads) to a CPU id. */
 static uint16_t ktls_cpuid_lookup[MAXCPU];
 /* 0: not initialized, 1: ktls_init() succeeded, -1: ktls_init() failed. */
 static int ktls_init_state;
 /* Serializes the one-time call to ktls_init(). */
 static struct sx ktls_init_lock;
 SX_SYSINIT(ktls_init_lock, &ktls_init_lock, "ktls init");
 
 /* Root sysctl nodes: kern.ipc.tls and kern.ipc.tls.stats. */
 SYSCTL_NODE(_kern_ipc, OID_AUTO, tls, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "Kernel TLS offload");
 SYSCTL_NODE(_kern_ipc_tls, OID_AUTO, stats, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "Kernel TLS offload stats");
 
 /* Thread binding defaults to 1 on kernels built with RSS, 0 otherwise. */
 #ifdef RSS
 static int ktls_bind_threads = 1;
 #else
 static int ktls_bind_threads;
 #endif
 SYSCTL_INT(_kern_ipc_tls, OID_AUTO, bind_threads, CTLFLAG_RDTUN,
     &ktls_bind_threads, 0,
     "Bind crypto threads to cores (1) or cores and domains (2) at boot");
 
 /* Also determines the size of the buffers in ktls_buffer_zone. */
 static u_int ktls_maxlen = 16384;
 SYSCTL_UINT(_kern_ipc_tls, OID_AUTO, maxlen, CTLFLAG_RDTUN,
     &ktls_maxlen, 0, "Maximum TLS record size");
 
 /* Incremented once per CPU by ktls_init(). */
 static int ktls_number_threads;
 SYSCTL_INT(_kern_ipc_tls_stats, OID_AUTO, threads, CTLFLAG_RD,
     &ktls_number_threads, 0,
     "Number of TLS threads in thread-pool");
 
 unsigned int ktls_ifnet_max_rexmit_pct = 2;
 SYSCTL_UINT(_kern_ipc_tls, OID_AUTO, ifnet_max_rexmit_pct, CTLFLAG_RWTUN,
     &ktls_ifnet_max_rexmit_pct, 2,
     "Max percent bytes retransmitted before ifnet TLS is disabled");
 
 /* Off unless set: no initializer, so this starts out false. */
 static bool ktls_offload_enable;
 SYSCTL_BOOL(_kern_ipc_tls, OID_AUTO, enable, CTLFLAG_RWTUN,
     &ktls_offload_enable, 0,
     "Enable support for kernel TLS offload");
 
 static bool ktls_cbc_enable = true;
 SYSCTL_BOOL(_kern_ipc_tls, OID_AUTO, cbc_enable, CTLFLAG_RWTUN,
     &ktls_cbc_enable, 1,
     "Enable Support of AES-CBC crypto for kernel TLS");
 
 /* Gates creation of ktls_buffer_zone and the allocation threads. */
 static bool ktls_sw_buffer_cache = true;
 SYSCTL_BOOL(_kern_ipc_tls, OID_AUTO, sw_buffer_cache, CTLFLAG_RDTUN,
     &ktls_sw_buffer_cache, 1,
     "Enable caching of output buffers for SW encryption");
 
 static int ktls_max_alloc = 128;
 SYSCTL_INT(_kern_ipc_tls, OID_AUTO, max_alloc, CTLFLAG_RWTUN,
     &ktls_max_alloc, 128,
     "Max number of 16k buffers to allocate in thread context");
 
 /*
  * Read-only counters.  Each one is defined with
  * COUNTER_U64_DEFINE_EARLY() and exported through the sysctl that
  * immediately follows it.
  */
 static COUNTER_U64_DEFINE_EARLY(ktls_tasks_active);
 SYSCTL_COUNTER_U64(_kern_ipc_tls, OID_AUTO, tasks_active, CTLFLAG_RD,
     &ktls_tasks_active, "Number of active tasks");
 
 static COUNTER_U64_DEFINE_EARLY(ktls_cnt_tx_pending);
 SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, sw_tx_pending, CTLFLAG_RD,
     &ktls_cnt_tx_pending,
     "Number of TLS 1.0 records waiting for earlier TLS records");
 
 static COUNTER_U64_DEFINE_EARLY(ktls_cnt_tx_queued);
 SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, sw_tx_inqueue, CTLFLAG_RD,
     &ktls_cnt_tx_queued,
     "Number of TLS records in queue to tasks for SW encryption");
 
 static COUNTER_U64_DEFINE_EARLY(ktls_cnt_rx_queued);
 SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, sw_rx_inqueue, CTLFLAG_RD,
     &ktls_cnt_rx_queued,
     "Number of TLS sockets in queue to tasks for SW decryption");
 
 static COUNTER_U64_DEFINE_EARLY(ktls_offload_total);
 SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, offload_total,
     CTLFLAG_RD, &ktls_offload_total,
     "Total successful TLS setups (parameters set)");
 
 static COUNTER_U64_DEFINE_EARLY(ktls_offload_enable_calls);
 SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, enable_calls,
     CTLFLAG_RD, &ktls_offload_enable_calls,
     "Total number of TLS enable calls made");
 
 static COUNTER_U64_DEFINE_EARLY(ktls_offload_active);
 SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, active, CTLFLAG_RD,
     &ktls_offload_active, "Total Active TLS sessions");
 
 static COUNTER_U64_DEFINE_EARLY(ktls_offload_corrupted_records);
 SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, corrupted_records, CTLFLAG_RD,
     &ktls_offload_corrupted_records, "Total corrupted TLS records received");
 
 static COUNTER_U64_DEFINE_EARLY(ktls_offload_failed_crypto);
 SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, failed_crypto, CTLFLAG_RD,
     &ktls_offload_failed_crypto, "Total TLS crypto failures");
 
 static COUNTER_U64_DEFINE_EARLY(ktls_switch_to_ifnet);
 SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, switch_to_ifnet, CTLFLAG_RD,
     &ktls_switch_to_ifnet, "TLS sessions switched from SW to ifnet");
 
 static COUNTER_U64_DEFINE_EARLY(ktls_switch_to_sw);
 SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, switch_to_sw, CTLFLAG_RD,
     &ktls_switch_to_sw, "TLS sessions switched from ifnet to SW");
 
 static COUNTER_U64_DEFINE_EARLY(ktls_switch_failed);
 SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, switch_failed, CTLFLAG_RD,
     &ktls_switch_failed, "TLS sessions unable to switch between SW and ifnet");
 
 static COUNTER_U64_DEFINE_EARLY(ktls_ifnet_disable_fail);
 SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, ifnet_disable_failed, CTLFLAG_RD,
     &ktls_ifnet_disable_fail, "TLS sessions unable to switch to SW from ifnet");
 
 static COUNTER_U64_DEFINE_EARLY(ktls_ifnet_disable_ok);
 SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, ifnet_disable_ok, CTLFLAG_RD,
     &ktls_ifnet_disable_ok, "TLS sessions able to switch to SW from ifnet");
 
 /* Sub-nodes holding the per-backend session counters. */
 SYSCTL_NODE(_kern_ipc_tls, OID_AUTO, sw, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
     "Software TLS session stats");
 SYSCTL_NODE(_kern_ipc_tls, OID_AUTO, ifnet, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
     "Hardware (ifnet) TLS session stats");
 #ifdef TCP_OFFLOAD
 SYSCTL_NODE(_kern_ipc_tls, OID_AUTO, toe, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
     "TOE TLS session stats");
 #endif
 
 /* Software sessions, one counter per cipher (kern.ipc.tls.sw). */
 static COUNTER_U64_DEFINE_EARLY(ktls_sw_cbc);
 SYSCTL_COUNTER_U64(_kern_ipc_tls_sw, OID_AUTO, cbc, CTLFLAG_RD, &ktls_sw_cbc,
     "Active number of software TLS sessions using AES-CBC");
 
 static COUNTER_U64_DEFINE_EARLY(ktls_sw_gcm);
 SYSCTL_COUNTER_U64(_kern_ipc_tls_sw, OID_AUTO, gcm, CTLFLAG_RD, &ktls_sw_gcm,
     "Active number of software TLS sessions using AES-GCM");
 
 static COUNTER_U64_DEFINE_EARLY(ktls_sw_chacha20);
 SYSCTL_COUNTER_U64(_kern_ipc_tls_sw, OID_AUTO, chacha20, CTLFLAG_RD,
     &ktls_sw_chacha20,
     "Active number of software TLS sessions using Chacha20-Poly1305");
 
 /* ifnet sessions and send tag resets (kern.ipc.tls.ifnet). */
 static COUNTER_U64_DEFINE_EARLY(ktls_ifnet_cbc);
 SYSCTL_COUNTER_U64(_kern_ipc_tls_ifnet, OID_AUTO, cbc, CTLFLAG_RD,
     &ktls_ifnet_cbc,
     "Active number of ifnet TLS sessions using AES-CBC");
 
 static COUNTER_U64_DEFINE_EARLY(ktls_ifnet_gcm);
 SYSCTL_COUNTER_U64(_kern_ipc_tls_ifnet, OID_AUTO, gcm, CTLFLAG_RD,
     &ktls_ifnet_gcm,
     "Active number of ifnet TLS sessions using AES-GCM");
 
 static COUNTER_U64_DEFINE_EARLY(ktls_ifnet_chacha20);
 SYSCTL_COUNTER_U64(_kern_ipc_tls_ifnet, OID_AUTO, chacha20, CTLFLAG_RD,
     &ktls_ifnet_chacha20,
     "Active number of ifnet TLS sessions using Chacha20-Poly1305");
 
 static COUNTER_U64_DEFINE_EARLY(ktls_ifnet_reset);
 SYSCTL_COUNTER_U64(_kern_ipc_tls_ifnet, OID_AUTO, reset, CTLFLAG_RD,
     &ktls_ifnet_reset, "TLS sessions updated to a new ifnet send tag");
 
 static COUNTER_U64_DEFINE_EARLY(ktls_ifnet_reset_dropped);
 SYSCTL_COUNTER_U64(_kern_ipc_tls_ifnet, OID_AUTO, reset_dropped, CTLFLAG_RD,
     &ktls_ifnet_reset_dropped,
     "TLS sessions dropped after failing to update ifnet send tag");
 
 static COUNTER_U64_DEFINE_EARLY(ktls_ifnet_reset_failed);
 SYSCTL_COUNTER_U64(_kern_ipc_tls_ifnet, OID_AUTO, reset_failed, CTLFLAG_RD,
     &ktls_ifnet_reset_failed,
     "TLS sessions that failed to allocate a new ifnet send tag");
 
 /*
  * NOTE(review): this variable is declared int but exported with
  * SYSCTL_UINT, and it has no initializer, so it starts at 0 unless a
  * tunable sets it.  Confirm whether the 1 passed to SYSCTL_UINT below
  * was meant to be the default.
  */
 static int ktls_ifnet_permitted;
 SYSCTL_UINT(_kern_ipc_tls_ifnet, OID_AUTO, permitted, CTLFLAG_RWTUN,
     &ktls_ifnet_permitted, 1,
     "Whether to permit hardware (ifnet) TLS sessions");
 
 /* TOE sessions, only on TCP_OFFLOAD kernels (kern.ipc.tls.toe). */
 #ifdef TCP_OFFLOAD
 static COUNTER_U64_DEFINE_EARLY(ktls_toe_cbc);
 SYSCTL_COUNTER_U64(_kern_ipc_tls_toe, OID_AUTO, cbc, CTLFLAG_RD,
     &ktls_toe_cbc,
     "Active number of TOE TLS sessions using AES-CBC");
 
 static COUNTER_U64_DEFINE_EARLY(ktls_toe_gcm);
 SYSCTL_COUNTER_U64(_kern_ipc_tls_toe, OID_AUTO, gcm, CTLFLAG_RD,
     &ktls_toe_gcm,
     "Active number of TOE TLS sessions using AES-GCM");
 
 static COUNTER_U64_DEFINE_EARLY(ktls_toe_chacha20);
 SYSCTL_COUNTER_U64(_kern_ipc_tls_toe, OID_AUTO, chacha20, CTLFLAG_RD,
     &ktls_toe_chacha20,
     "Active number of TOE TLS sessions using Chacha20-Poly1305");
 #endif
 
 /* malloc(9) type used for this file's allocations, e.g. the ktls_wq array. */
 static MALLOC_DEFINE(M_KTLS, "ktls", "Kernel TLS");
 
 /* Forward declarations for functions defined later in the file. */
 static void ktls_cleanup(struct ktls_session *tls);
 #if defined(INET) || defined(INET6)
 static void ktls_reset_receive_tag(void *context, int pending);
 static void ktls_reset_send_tag(void *context, int pending);
 #endif
 /* Entry points of the kernel threads started by ktls_init(). */
 static void ktls_work_thread(void *ctx);
 static void ktls_alloc_thread(void *ctx);
 
 #if defined(INET) || defined(INET6)
 /*
  * Pick the CPU whose work queue should service socket 'so'.
  *
  * With RSS, the CPU that rss_hash2cpuid() maps the connection's flow to
  * is used when there is one.  Otherwise the flowid selects a CPU, either
  * from the connection's NUMA domain (NUMA kernels with
  * ktls_bind_threads > 1 and a known domain) or from all KTLS threads.
  */
 static u_int
 ktls_get_cpu(struct socket *so)
 {
 	struct inpcb *inp;
 #ifdef NUMA
 	struct ktls_domain_info *di;
 #endif
 #ifdef RSS
 	u_int cpuid;
 #endif
 
 	inp = sotoinpcb(so);
 #ifdef RSS
 	cpuid = rss_hash2cpuid(inp->inp_flowid, inp->inp_flowtype);
 	if (cpuid != NETISR_CPUID_NONE)
 		return (cpuid);
 #endif
 	/*
 	 * Just use the flowid to shard connections in a repeatable
 	 * fashion.  Note that TLS 1.0 sessions rely on the
 	 * serialization provided by having the same connection use
 	 * the same queue.
 	 */
 #ifdef NUMA
 	if (ktls_bind_threads > 1 && inp->inp_numa_domain != M_NODOM) {
 		di = &ktls_domains[inp->inp_numa_domain];
 		return (di->cpu[inp->inp_flowid % di->count]);
 	}
 #endif
 	return (ktls_cpuid_lookup[inp->inp_flowid % ktls_number_threads]);
 }
 #endif
 
 /*
  * Import function of ktls_buffer_zone: fill 'store' with up to 'count'
  * buffers from 'domain'.  Each buffer is a wired, physically contiguous
  * run of atop(ktls_maxlen) pages, returned by its direct-map address.
  *
  * Returns the number of buffers stored, which is less than 'count' if a
  * page allocation fails.
  */
 static int
 ktls_buffer_import(void *arg, void **store, int count, int domain, int flags)
 {
 	vm_page_t pg;
 	int allocflags, nfilled;
 
 	KASSERT((ktls_maxlen & PAGE_MASK) == 0,
 	    ("%s: ktls max length %d is not page size-aligned",
 	    __func__, ktls_maxlen));
 
 	allocflags = VM_ALLOC_WIRED | VM_ALLOC_NODUMP | malloc2vm_flags(flags);
 	nfilled = 0;
 	while (nfilled < count) {
 		pg = vm_page_alloc_noobj_contig_domain(domain, allocflags,
 		    atop(ktls_maxlen), 0, ~0ul, PAGE_SIZE, 0,
 		    VM_MEMATTR_DEFAULT);
 		if (pg == NULL)
 			break;
 		store[nfilled] = (void *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pg));
 		nfilled++;
 	}
 	return (nfilled);
 }
 
 /*
  * Release function of ktls_buffer_zone: for each of the 'count' buffers
  * in 'store', unwire and free every one of its atop(ktls_maxlen) pages.
  */
 static void
 ktls_buffer_release(void *arg __unused, void **store, int count)
 {
 	vm_page_t first, pg;
 	int buf, off;
 
 	for (buf = 0; buf < count; buf++) {
 		/* Map the direct-map address back to the buffer's first page. */
 		first = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)store[buf]));
 		for (off = 0; off < atop(ktls_maxlen); off++) {
 			pg = first + off;
 			(void)vm_page_unwire_noq(pg);
 			vm_page_free(pg);
 		}
 	}
 }
 
 /*
  * Return the buffer behind an M_EXTPG mbuf to ktls_buffer_zone.  The
  * buffer is identified by the direct-map address of the mbuf's first
  * page.
  */
 static void
 ktls_free_mext_contig(struct mbuf *m)
 {
 	void *buf;
 
 	M_ASSERTEXTPG(m);
 	buf = (void *)PHYS_TO_DMAP(m->m_epg_pa[0]);
 	uma_zfree(ktls_buffer_zone, buf);
 }
 
 /*
  * One-time setup of the KTLS infrastructure: allocate the per-CPU work
  * queues, create the session zone and (optionally) the buffer cache
  * zone, and start the worker and allocation kernel threads.
  *
  * Returns 0 on success or the error from kproc_kthread_add().
  * NOTE(review): on failure nothing created earlier in this function is
  * torn down here.
  */
 static int
 ktls_init(void)
 {
 	struct thread *td;
 	struct pcpu *pc;
 	int count, domain, error, i;
 
 	/* One zeroed slot per possible CPU id, so it can be indexed by id. */
 	ktls_wq = malloc(sizeof(*ktls_wq) * (mp_maxid + 1), M_KTLS,
 	    M_WAITOK | M_ZERO);
 
 	ktls_session_zone = uma_zcreate("ktls_session",
 	    sizeof(struct ktls_session),
 	    NULL, NULL, NULL, NULL,
 	    UMA_ALIGN_CACHE, 0);
 
 	if (ktls_sw_buffer_cache) {
 		ktls_buffer_zone = uma_zcache_create("ktls_buffers",
 		    roundup2(ktls_maxlen, PAGE_SIZE), NULL, NULL, NULL, NULL,
 		    ktls_buffer_import, ktls_buffer_release, NULL,
 		    UMA_ZONE_FIRSTTOUCH);
 	}
 
 	/*
 	 * Initialize the workqueues to run the TLS work.  We create a
 	 * work queue for each CPU.
 	 */
 	CPU_FOREACH(i) {
 		STAILQ_INIT(&ktls_wq[i].m_head);
 		STAILQ_INIT(&ktls_wq[i].so_head);
 		mtx_init(&ktls_wq[i].mtx, "ktls work queue", NULL, MTX_DEF);
 		if (ktls_bind_threads > 1) {
 			/* Record this CPU under its memory domain. */
 			pc = pcpu_find(i);
 			domain = pc->pc_domain;
 			count = ktls_domains[domain].count;
 			ktls_domains[domain].cpu[count] = i;
 			ktls_domains[domain].count++;
 		}
 		/* Dense worker index -> CPU id, used by ktls_get_cpu(). */
 		ktls_cpuid_lookup[ktls_number_threads] = i;
 		ktls_number_threads++;
 	}
 
 	/*
 	 * If we somehow have an empty domain, fall back to choosing
 	 * among all KTLS threads.
 	 */
 	if (ktls_bind_threads > 1) {
 		for (i = 0; i < vm_ndomains; i++) {
 			if (ktls_domains[i].count == 0) {
 				ktls_bind_threads = 1;
 				break;
 			}
 		}
 	}
 
 	/* Start kthreads for each workqueue. */
 	CPU_FOREACH(i) {
 		error = kproc_kthread_add(ktls_work_thread, &ktls_wq[i],
 		    &ktls_proc, &td, 0, 0, "KTLS", "thr_%d", i);
 		if (error) {
 			printf("Can't add KTLS thread %d error %d\n", i, error);
 			return (error);
 		}
 	}
 
 	/*
 	 * Start an allocation thread per-domain to perform blocking allocations
 	 * of 16k physically contiguous TLS crypto destination buffers.
 	 */
 	if (ktls_sw_buffer_cache) {
 		for (domain = 0; domain < vm_ndomains; domain++) {
 			/* Skip domains that have no memory or no CPUs. */
 			if (VM_DOMAIN_EMPTY(domain))
 				continue;
 			if (CPU_EMPTY(&cpuset_domain[domain]))
 				continue;
 			error = kproc_kthread_add(ktls_alloc_thread,
 			    &ktls_domains[domain], &ktls_proc,
 			    &ktls_domains[domain].alloc_td.td,
 			    0, 0, "KTLS", "alloc_%d", domain);
 			if (error) {
 				printf("Can't add KTLS alloc thread %d error %d\n",
 				    domain, error);
 				return (error);
 			}
 		}
 	}
 
 	if (bootverbose)
 		printf("KTLS: Initialized %d threads\n", ktls_number_threads);
 	return (0);
 }
 
 /*
  * Make sure ktls_init() has run exactly once.  ktls_init_state is 0
  * before initialization, positive after a successful ktls_init() and
  * negative after a failed one.
  *
  * Returns 0 if KTLS is initialized, ENXIO if an earlier attempt
  * failed, or the error from ktls_init() when this call performed the
  * (failed) initialization itself.
  */
 static int
 ktls_start_kthreads(void)
 {
 	int error, state;
 
 	for (;;) {
 		/* Lock-free fast path once the outcome is known. */
 		state = atomic_load_acq_int(&ktls_init_state);
 		if (__predict_true(state > 0))
 			return (0);
 		if (state < 0)
 			return (ENXIO);
 
 		/*
 		 * Still uninitialized: take the lock and re-check.  If
 		 * another thread finished in the meantime, drop the lock
 		 * and re-evaluate the published state.
 		 */
 		sx_xlock(&ktls_init_lock);
 		if (ktls_init_state == 0)
 			break;
 		sx_xunlock(&ktls_init_lock);
 	}
 
 	error = ktls_init();
 	state = (error == 0) ? 1 : -1;
 	atomic_store_rel_int(&ktls_init_state, state);
 	sx_xunlock(&ktls_init_lock);
 	return (error);
 }
 
 #if defined(INET) || defined(INET6)
 static int
 ktls_create_session(struct socket *so, struct tls_enable *en,
     struct ktls_session **tlsp, int direction)
 {
 	struct ktls_session *tls;
 	int error;
 
 	/* Only TLS 1.0 - 1.3 are supported. */
 	if (en->tls_vmajor != TLS_MAJOR_VER_ONE)
 		return (EINVAL);
 	if (en->tls_vminor < TLS_MINOR_VER_ZERO ||
 	    en->tls_vminor > TLS_MINOR_VER_THREE)
 		return (EINVAL);
 
 	if (en->auth_key_len < 0 || en->auth_key_len > TLS_MAX_PARAM_SIZE)
 		return (EINVAL);
 	if (en->cipher_key_len < 0 || en->cipher_key_len > TLS_MAX_PARAM_SIZE)
 		return (EINVAL);
 	if (en->iv_len < 0 || en->iv_len > sizeof(tls->params.iv))
 		return (EINVAL);
 
 	/* All supported algorithms require a cipher key. */
 	if (en->cipher_key_len == 0)
 		return (EINVAL);
 
 	/* No flags are currently supported. */
 	if (en->flags != 0)
 		return (EINVAL);
 
 	/* Common checks for supported algorithms. */
 	switch (en->cipher_algorithm) {
 	case CRYPTO_AES_NIST_GCM_16:
 		/*
 		 * auth_algorithm isn't used, but permit GMAC values
 		 * for compatibility.
 		 */
 		switch (en->auth_algorithm) {
 		case 0:
 #ifdef COMPAT_FREEBSD12
 		/* XXX: Really 13.0-current COMPAT. */
 		case CRYPTO_AES_128_NIST_GMAC:
 		case CRYPTO_AES_192_NIST_GMAC:
 		case CRYPTO_AES_256_NIST_GMAC:
 #endif
 			break;
 		default:
 			return (EINVAL);
 		}
 		if (en->auth_key_len != 0)
 			return (EINVAL);
 		switch (en->tls_vminor) {
 		case TLS_MINOR_VER_TWO:
 			if (en->iv_len != TLS_AEAD_GCM_LEN)
 				return (EINVAL);
 			break;
 		case TLS_MINOR_VER_THREE:
 			if (en->iv_len != TLS_1_3_GCM_IV_LEN)
 				return (EINVAL);
 			break;
 		default:
 			return (EINVAL);
 		}
 		break;
 	case CRYPTO_AES_CBC:
 		switch (en->auth_algorithm) {
 		case CRYPTO_SHA1_HMAC:
 			break;
 		case CRYPTO_SHA2_256_HMAC:
 		case CRYPTO_SHA2_384_HMAC:
 			if (en->tls_vminor != TLS_MINOR_VER_TWO)
 				return (EINVAL);
 			break;
 		default:
 			return (EINVAL);
 		}
 		if (en->auth_key_len == 0)
 			return (EINVAL);
 
 		/*
 		 * TLS 1.0 requires an implicit IV.  TLS 1.1 and 1.2
 		 * use explicit IVs.
 		 */
 		switch (en->tls_vminor) {
 		case TLS_MINOR_VER_ZERO:
 			if (en->iv_len != TLS_CBC_IMPLICIT_IV_LEN)
 				return (EINVAL);
 			break;
 		case TLS_MINOR_VER_ONE:
 		case TLS_MINOR_VER_TWO:
 			/* Ignore any supplied IV. */
 			en->iv_len = 0;
 			break;
 		default:
 			return (EINVAL);
 		}
 		break;
 	case CRYPTO_CHACHA20_POLY1305:
 		if (en->auth_algorithm != 0 || en->auth_key_len != 0)
 			return (EINVAL);
 		if (en->tls_vminor != TLS_MINOR_VER_TWO &&
 		    en->tls_vminor != TLS_MINOR_VER_THREE)
 			return (EINVAL);
 		if (en->iv_len != TLS_CHACHA20_IV_LEN)
 			return (EINVAL);
 		break;
 	default:
 		return (EINVAL);
 	}
 
 	error = ktls_start_kthreads();
 	if (error != 0)
 		return (error);
 
 	tls = uma_zalloc(ktls_session_zone, M_WAITOK | M_ZERO);
 
 	counter_u64_add(ktls_offload_active, 1);
 
 	refcount_init(&tls->refcount, 1);
 	if (direction == KTLS_RX)
 		TASK_INIT(&tls->reset_tag_task, 0, ktls_reset_receive_tag, tls);
 	else
 		TASK_INIT(&tls->reset_tag_task, 0, ktls_reset_send_tag, tls);
 
 	tls->wq_index = ktls_get_cpu(so);
 
 	tls->params.cipher_algorithm = en->cipher_algorithm;
 	tls->params.auth_algorithm = en->auth_algorithm;
 	tls->params.tls_vmajor = en->tls_vmajor;
 	tls->params.tls_vminor = en->tls_vminor;
 	tls->params.flags = en->flags;
 	tls->params.max_frame_len = min(TLS_MAX_MSG_SIZE_V10_2, ktls_maxlen);
 
 	/* Set the header and trailer lengths. */
 	tls->params.tls_hlen = sizeof(struct tls_record_layer);
 	switch (en->cipher_algorithm) {
 	case CRYPTO_AES_NIST_GCM_16:
 		/*
 		 * TLS 1.2 uses a 4 byte implicit IV with an explicit 8 byte
 		 * nonce.  TLS 1.3 uses a 12 byte implicit IV.
 		 */
 		if (en->tls_vminor < TLS_MINOR_VER_THREE)
 			tls->params.tls_hlen += sizeof(uint64_t);
 		tls->params.tls_tlen = AES_GMAC_HASH_LEN;
 		tls->params.tls_bs = 1;
 		break;
 	case CRYPTO_AES_CBC:
 		switch (en->auth_algorithm) {
 		case CRYPTO_SHA1_HMAC:
 			if (en->tls_vminor == TLS_MINOR_VER_ZERO) {
 				/* Implicit IV, no nonce. */
 				tls->sequential_records = true;
 				tls->next_seqno = be64dec(en->rec_seq);
 				STAILQ_INIT(&tls->pending_records);
 			} else {
 				tls->params.tls_hlen += AES_BLOCK_LEN;
 			}
 			tls->params.tls_tlen = AES_BLOCK_LEN +
 			    SHA1_HASH_LEN;
 			break;
 		case CRYPTO_SHA2_256_HMAC:
 			tls->params.tls_hlen += AES_BLOCK_LEN;
 			tls->params.tls_tlen = AES_BLOCK_LEN +
 			    SHA2_256_HASH_LEN;
 			break;
 		case CRYPTO_SHA2_384_HMAC:
 			tls->params.tls_hlen += AES_BLOCK_LEN;
 			tls->params.tls_tlen = AES_BLOCK_LEN +
 			    SHA2_384_HASH_LEN;
 			break;
 		default:
 			panic("invalid hmac");
 		}
 		tls->params.tls_bs = AES_BLOCK_LEN;
 		break;
 	case CRYPTO_CHACHA20_POLY1305:
 		/*
 		 * Chacha20 uses a 12 byte implicit IV.
 		 */
 		tls->params.tls_tlen = POLY1305_HASH_LEN;
 		tls->params.tls_bs = 1;
 		break;
 	default:
 		panic("invalid cipher");
 	}
 
 	/*
 	 * TLS 1.3 includes optional padding which we do not support,
 	 * and also puts the "real" record type at the end of the
 	 * encrypted data.
 	 */
 	if (en->tls_vminor == TLS_MINOR_VER_THREE)
 		tls->params.tls_tlen += sizeof(uint8_t);
 
 	KASSERT(tls->params.tls_hlen <= MBUF_PEXT_HDR_LEN,
 	    ("TLS header length too long: %d", tls->params.tls_hlen));
 	KASSERT(tls->params.tls_tlen <= MBUF_PEXT_TRAIL_LEN,
 	    ("TLS trailer length too long: %d", tls->params.tls_tlen));
 
 	if (en->auth_key_len != 0) {
 		tls->params.auth_key_len = en->auth_key_len;
 		tls->params.auth_key = malloc(en->auth_key_len, M_KTLS,
 		    M_WAITOK);
 		error = copyin(en->auth_key, tls->params.auth_key,
 		    en->auth_key_len);
 		if (error)
 			goto out;
 	}
 
 	tls->params.cipher_key_len = en->cipher_key_len;
 	tls->params.cipher_key = malloc(en->cipher_key_len, M_KTLS, M_WAITOK);
 	error = copyin(en->cipher_key, tls->params.cipher_key,
 	    en->cipher_key_len);
 	if (error)
 		goto out;
 
 	/*
 	 * This holds the implicit portion of the nonce for AEAD
 	 * ciphers and the initial implicit IV for TLS 1.0.  The
 	 * explicit portions of the IV are generated in ktls_frame().
 	 */
 	if (en->iv_len != 0) {
 		tls->params.iv_len = en->iv_len;
 		error = copyin(en->iv, tls->params.iv, en->iv_len);
 		if (error)
 			goto out;
 
 		/*
 		 * For TLS 1.2 with GCM, generate an 8-byte nonce as a
 		 * counter to generate unique explicit IVs.
 		 *
 		 * Store this counter in the last 8 bytes of the IV
 		 * array so that it is 8-byte aligned.
 		 */
 		if (en->cipher_algorithm == CRYPTO_AES_NIST_GCM_16 &&
 		    en->tls_vminor == TLS_MINOR_VER_TWO)
 			arc4rand(tls->params.iv + 8, sizeof(uint64_t), 0);
 	}
 
 	*tlsp = tls;
 	return (0);
 
 out:
 	ktls_cleanup(tls);
 	return (error);
 }
 
 /*
  * Allocate a new session that duplicates the parameters and work queue
  * index of 'tls'.  The key buffers are deep-copied, so the clone owns
  * its own key memory.  The clone starts with a single reference and
  * gets the receive or send reset-tag handler depending on 'direction'.
  */
 static struct ktls_session *
 ktls_clone_session(struct ktls_session *tls, int direction)
 {
 	struct ktls_session *copy;
 
 	copy = uma_zalloc(ktls_session_zone, M_WAITOK | M_ZERO);
 
 	counter_u64_add(ktls_offload_active, 1);
 
 	refcount_init(&copy->refcount, 1);
 	TASK_INIT(&copy->reset_tag_task, 0,
 	    direction == KTLS_RX ? ktls_reset_receive_tag :
 	    ktls_reset_send_tag, copy);
 
 	/*
 	 * Start from a shallow copy; the key pointers are replaced with
 	 * private buffers below.
 	 */
 	copy->params = tls->params;
 	copy->wq_index = tls->wq_index;
 
 	/* The authentication key is optional. */
 	if (tls->params.auth_key != NULL) {
 		copy->params.auth_key = malloc(tls->params.auth_key_len,
 		    M_KTLS, M_WAITOK);
 		memcpy(copy->params.auth_key, tls->params.auth_key,
 		    tls->params.auth_key_len);
 	}
 
 	/* The cipher key is copied unconditionally. */
 	copy->params.cipher_key = malloc(tls->params.cipher_key_len, M_KTLS,
 	    M_WAITOK);
 	memcpy(copy->params.cipher_key, tls->params.cipher_key,
 	    tls->params.cipher_key_len);
 
 	return (copy);
 }
 #endif
 
 /*
  * Release everything a session owns: undo the statistics counters
  * bumped for it, drop the send tag and receive interface reference in
  * ifnet mode, free the OCF session and key buffers, and wipe the IV.
  *
  * The session structure itself is NOT returned to ktls_session_zone
  * here.  A session whose mode matches none of the cases below only
  * gets the common teardown after the switch.
  */
 static void
 ktls_cleanup(struct ktls_session *tls)
 {
 
 	counter_u64_add(ktls_offload_active, -1);
 	/* Undo the per-mode, per-cipher counter taken when the mode was set. */
 	switch (tls->mode) {
 	case TCP_TLS_MODE_SW:
 		switch (tls->params.cipher_algorithm) {
 		case CRYPTO_AES_CBC:
 			counter_u64_add(ktls_sw_cbc, -1);
 			break;
 		case CRYPTO_AES_NIST_GCM_16:
 			counter_u64_add(ktls_sw_gcm, -1);
 			break;
 		case CRYPTO_CHACHA20_POLY1305:
 			counter_u64_add(ktls_sw_chacha20, -1);
 			break;
 		}
 		break;
 	case TCP_TLS_MODE_IFNET:
 		switch (tls->params.cipher_algorithm) {
 		case CRYPTO_AES_CBC:
 			counter_u64_add(ktls_ifnet_cbc, -1);
 			break;
 		case CRYPTO_AES_NIST_GCM_16:
 			counter_u64_add(ktls_ifnet_gcm, -1);
 			break;
 		case CRYPTO_CHACHA20_POLY1305:
 			counter_u64_add(ktls_ifnet_chacha20, -1);
 			break;
 		}
 		/* Only ifnet mode releases the tag and interface reference. */
 		if (tls->snd_tag != NULL)
 			m_snd_tag_rele(tls->snd_tag);
 		if (tls->rx_ifp != NULL)
 			if_rele(tls->rx_ifp);
 		break;
 #ifdef TCP_OFFLOAD
 	case TCP_TLS_MODE_TOE:
 		switch (tls->params.cipher_algorithm) {
 		case CRYPTO_AES_CBC:
 			counter_u64_add(ktls_toe_cbc, -1);
 			break;
 		case CRYPTO_AES_NIST_GCM_16:
 			counter_u64_add(ktls_toe_gcm, -1);
 			break;
 		case CRYPTO_CHACHA20_POLY1305:
 			counter_u64_add(ktls_toe_chacha20, -1);
 			break;
 		}
 		break;
 #endif
 	}
 	if (tls->ocf_session != NULL)
 		ktls_ocf_free(tls);
 	/* Clear pointers and lengths so a repeated call does not double free. */
 	if (tls->params.auth_key != NULL) {
 		zfree(tls->params.auth_key, M_KTLS);
 		tls->params.auth_key = NULL;
 		tls->params.auth_key_len = 0;
 	}
 	if (tls->params.cipher_key != NULL) {
 		zfree(tls->params.cipher_key, M_KTLS);
 		tls->params.cipher_key = NULL;
 		tls->params.cipher_key_len = 0;
 	}
 	/* The IV is stored inline, so it is wiped rather than freed. */
 	explicit_bzero(tls->params.iv, sizeof(tls->params.iv));
 }
 
 #if defined(INET) || defined(INET6)
 
 #ifdef TCP_OFFLOAD
 /*
  * Try to set up 'tls' on a TCP offload engine for the connection
  * behind 'so'.
  *
  * Returns ECONNRESET if the inpcb has been dropped or no longer has a
  * socket, EOPNOTSUPP if the connection is not flagged TF_TOE, and
  * otherwise the result of tcp_offload_alloc_tls_session().  On success
  * the session is switched to TCP_TLS_MODE_TOE and the matching
  * per-cipher counter is incremented.
  */
 static int
 ktls_try_toe(struct socket *so, struct ktls_session *tls, int direction)
 {
 	struct inpcb *inp;
 	struct tcpcb *tp;
 	int error;
 
 	inp = so->so_pcb;
 	INP_WLOCK(inp);
-	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
+	if (inp->inp_flags & INP_DROPPED) {
 		INP_WUNLOCK(inp);
 		return (ECONNRESET);
 	}
 	if (inp->inp_socket == NULL) {
 		INP_WUNLOCK(inp);
 		return (ECONNRESET);
 	}
 	tp = intotcpcb(inp);
 	if (!(tp->t_flags & TF_TOE)) {
 		INP_WUNLOCK(inp);
 		return (EOPNOTSUPP);
 	}
 
 	/* The offload request is made with the inpcb write-locked. */
 	error = tcp_offload_alloc_tls_session(tp, tls, direction);
 	INP_WUNLOCK(inp);
 	if (error == 0) {
 		tls->mode = TCP_TLS_MODE_TOE;
 		switch (tls->params.cipher_algorithm) {
 		case CRYPTO_AES_CBC:
 			counter_u64_add(ktls_toe_cbc, 1);
 			break;
 		case CRYPTO_AES_NIST_GCM_16:
 			counter_u64_add(ktls_toe_gcm, 1);
 			break;
 		case CRYPTO_CHACHA20_POLY1305:
 			counter_u64_add(ktls_toe_chacha20, 1);
 			break;
 		}
 	}
 	return (error);
 }
 #endif
 
 /*
  * Common code used when first enabling ifnet TLS on a connection or
  * when allocating a new ifnet TLS session due to a routing change.
  * This function allocates a new TLS send tag on whatever interface
  * the connection is currently routed over.
  */
 /*
  * Returns 0 with the new tag stored in '*mstp', ECONNRESET if the
  * inpcb is dropped or has no socket, ENXIO if ifnet TLS is
  * administratively denied or no cached route exists, EOPNOTSUPP if the
  * interface lacks a required capability, or the error from
  * m_snd_tag_alloc().
  */
 static int
 ktls_alloc_snd_tag(struct inpcb *inp, struct ktls_session *tls, bool force,
     struct m_snd_tag **mstp)
 {
 	union if_snd_tag_alloc_params params;
 	struct ifnet *ifp;
 	struct nhop_object *nh;
 	struct tcpcb *tp;
 	int error;
 
 	INP_RLOCK(inp);
-	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
+	if (inp->inp_flags & INP_DROPPED) {
 		INP_RUNLOCK(inp);
 		return (ECONNRESET);
 	}
 	if (inp->inp_socket == NULL) {
 		INP_RUNLOCK(inp);
 		return (ECONNRESET);
 	}
 	tp = intotcpcb(inp);
 
 	/*
 	 * Check administrative controls on ifnet TLS to determine if
 	 * ifnet TLS should be denied.
 	 *
 	 * - Always permit 'force' requests.
 	 * - ktls_ifnet_permitted == 0: always deny.
 	 */
 	if (!force && ktls_ifnet_permitted == 0) {
 		INP_RUNLOCK(inp);
 		return (ENXIO);
 	}
 
 	/*
 	 * XXX: Use the cached route in the inpcb to find the
 	 * interface.  This should perhaps instead use
 	 * rtalloc1_fib(dst, 0, 0, fibnum).  Since KTLS is only
 	 * enabled after a connection has completed key negotiation in
 	 * userland, the cached route will be present in practice.
 	 */
 	nh = inp->inp_route.ro_nh;
 	if (nh == NULL) {
 		INP_RUNLOCK(inp);
 		return (ENXIO);
 	}
 	ifp = nh->nh_ifp;
 	/* Hold the interface; the reference is dropped at 'out' below. */
 	if_ref(ifp);
 
 	/*
 	 * Allocate a TLS + ratelimit tag if the connection has an
 	 * existing pacing rate.
 	 */
 	if (tp->t_pacing_rate != -1 &&
 	    (ifp->if_capenable & IFCAP_TXTLS_RTLMT) != 0) {
 		params.hdr.type = IF_SND_TAG_TYPE_TLS_RATE_LIMIT;
 		params.tls_rate_limit.inp = inp;
 		params.tls_rate_limit.tls = tls;
 		params.tls_rate_limit.max_rate = tp->t_pacing_rate;
 	} else {
 		params.hdr.type = IF_SND_TAG_TYPE_TLS;
 		params.tls.inp = inp;
 		params.tls.tls = tls;
 	}
 	params.hdr.flowid = inp->inp_flowid;
 	params.hdr.flowtype = inp->inp_flowtype;
 	params.hdr.numa_domain = inp->inp_numa_domain;
 	INP_RUNLOCK(inp);
 
 	/* The capability checks below run after the inpcb lock is dropped. */
 	if ((ifp->if_capenable & IFCAP_MEXTPG) == 0) {
 		error = EOPNOTSUPP;
 		goto out;
 	}
 	if (inp->inp_vflag & INP_IPV6) {
 		if ((ifp->if_capenable & IFCAP_TXTLS6) == 0) {
 			error = EOPNOTSUPP;
 			goto out;
 		}
 	} else {
 		if ((ifp->if_capenable & IFCAP_TXTLS4) == 0) {
 			error = EOPNOTSUPP;
 			goto out;
 		}
 	}
 	error = m_snd_tag_alloc(ifp, &params, mstp);
 out:
 	if_rele(ifp);
 	return (error);
 }
 
 /*
  * Allocate an initial TLS receive tag for doing HW decryption of TLS
  * data.
  *
  * This function allocates a new TLS receive tag on whatever interface
  * the connection is currently routed over.  If the connection ends up
  * using a different interface for receive this will get fixed up via
  * ktls_input_ifp_mismatch as future packets arrive.
  */
 /*
  * Returns 0 with the new tag stored in '*mstp', ENXIO if re-encryption
  * is unsupported for this session, ifnet TLS is administratively
  * denied, or no cached route exists, ECONNRESET if the inpcb is
  * dropped or has no socket, EOPNOTSUPP if the interface lacks the
  * receive TLS capability, or the error from m_snd_tag_alloc().
  */
 static int
 ktls_alloc_rcv_tag(struct inpcb *inp, struct ktls_session *tls,
     struct m_snd_tag **mstp)
 {
 	union if_snd_tag_alloc_params params;
 	struct ifnet *ifp;
 	struct nhop_object *nh;
 	int error;
 
 	if (!ktls_ocf_recrypt_supported(tls))
 		return (ENXIO);
 
 	INP_RLOCK(inp);
-	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
+	if (inp->inp_flags & INP_DROPPED) {
 		INP_RUNLOCK(inp);
 		return (ECONNRESET);
 	}
 	if (inp->inp_socket == NULL) {
 		INP_RUNLOCK(inp);
 		return (ECONNRESET);
 	}
 
 	/*
 	 * Check administrative controls on ifnet TLS to determine if
 	 * ifnet TLS should be denied.
 	 */
 	if (ktls_ifnet_permitted == 0) {
 		INP_RUNLOCK(inp);
 		return (ENXIO);
 	}
 
 	/*
 	 * XXX: As with ktls_alloc_snd_tag, use the cached route in
 	 * the inpcb to find the interface.
 	 */
 	nh = inp->inp_route.ro_nh;
 	if (nh == NULL) {
 		INP_RUNLOCK(inp);
 		return (ENXIO);
 	}
 	ifp = nh->nh_ifp;
 	/*
 	 * The interface reference is stored in the session and is not
 	 * released in this function, not even on the error paths below.
 	 */
 	if_ref(ifp);
 	tls->rx_ifp = ifp;
 
 	params.hdr.type = IF_SND_TAG_TYPE_TLS_RX;
 	params.hdr.flowid = inp->inp_flowid;
 	params.hdr.flowtype = inp->inp_flowtype;
 	params.hdr.numa_domain = inp->inp_numa_domain;
 	params.tls_rx.inp = inp;
 	params.tls_rx.tls = tls;
 	params.tls_rx.vlan_id = 0;
 
 	INP_RUNLOCK(inp);
 
 	if (inp->inp_vflag & INP_IPV6) {
 		if ((ifp->if_capenable2 & IFCAP2_RXTLS6) == 0) {
 			error = EOPNOTSUPP;
 			goto out;
 		}
 	} else {
 		if ((ifp->if_capenable2 & IFCAP2_RXTLS4) == 0) {
 			error = EOPNOTSUPP;
 			goto out;
 		}
 	}
 	error = m_snd_tag_alloc(ifp, &params, mstp);
 
 	/*
 	 * If this connection is over a vlan, vlan_snd_tag_alloc
 	 * rewrites vlan_id with the saved interface.  Save the VLAN
 	 * ID for use in ktls_reset_receive_tag which allocates new
 	 * receive tags directly from the leaf interface bypassing
 	 * if_vlan.
 	 */
 	if (error == 0)
 		tls->rx_vlan_id = params.tls_rx.vlan_id;
 out:
 	return (error);
 }
 
 /*
  * Try to offload 'tls' to the network interface by allocating a send
  * tag (KTLS_TX) or a receive tag (KTLS_RX).  'force' is passed through
  * for transmit and must be false for receive.
  *
  * On success the tag is stored in the session, the session is switched
  * to TCP_TLS_MODE_IFNET, the per-cipher counter is incremented and 0
  * is returned.  Otherwise the allocation error is returned and the
  * session is left unmodified by this function.
  */
 static int
 ktls_try_ifnet(struct socket *so, struct ktls_session *tls, int direction,
     bool force)
 {
 	struct m_snd_tag *tag;
 	int error;
 
 	switch (direction) {
 	case KTLS_TX:
 		error = ktls_alloc_snd_tag(so->so_pcb, tls, force, &tag);
 		break;
 	case KTLS_RX:
 		KASSERT(!force, ("%s: forced receive tag", __func__));
 		error = ktls_alloc_rcv_tag(so->so_pcb, tls, &tag);
 		break;
 	default:
 		__assert_unreachable();
 	}
 	if (__predict_false(error != 0))
 		return (error);
 
 	tls->mode = TCP_TLS_MODE_IFNET;
 	tls->snd_tag = tag;
 
 	switch (tls->params.cipher_algorithm) {
 	case CRYPTO_AES_CBC:
 		counter_u64_add(ktls_ifnet_cbc, 1);
 		break;
 	case CRYPTO_AES_NIST_GCM_16:
 		counter_u64_add(ktls_ifnet_gcm, 1);
 		break;
 	case CRYPTO_CHACHA20_POLY1305:
 		counter_u64_add(ktls_ifnet_chacha20, 1);
 		break;
 	default:
 		break;
 	}
 	return (0);
 }
 
 /*
  * Switch the session to software mode and account for it in the
  * per-cipher software counter.
  */
 static void
 ktls_use_sw(struct ktls_session *tls)
 {
 	switch (tls->params.cipher_algorithm) {
 	case CRYPTO_AES_CBC:
 		counter_u64_add(ktls_sw_cbc, 1);
 		break;
 	case CRYPTO_AES_NIST_GCM_16:
 		counter_u64_add(ktls_sw_gcm, 1);
 		break;
 	case CRYPTO_CHACHA20_POLY1305:
 		counter_u64_add(ktls_sw_chacha20, 1);
 		break;
 	}
 	tls->mode = TCP_TLS_MODE_SW;
 }
 
 /*
  * Set up software crypto for the session through ktls_ocf_try() and,
  * if that succeeds, switch the session to software mode.  Returns 0 on
  * success or the error from ktls_ocf_try().
  */
 static int
 ktls_try_sw(struct socket *so, struct ktls_session *tls, int direction)
 {
 	int error;
 
 	error = ktls_ocf_try(so, tls, direction);
 	if (error == 0)
 		ktls_use_sw(tls);
 	return (error);
 }
 
 /*
  * KTLS RX stores data in the socket buffer as a list of TLS records,
  * where each record is stored as a control message containing the TLS
  * header followed by data mbufs containing the decrypted data.  This
  * is different from KTLS TX which always uses an mb_ext_pgs mbuf for
  * both encrypted and decrypted data.  TLS records decrypted by a NIC
  * should be queued to the socket buffer as records, but encrypted
  * data which needs to be decrypted by software arrives as a stream of
  * regular mbufs which need to be converted.  In addition, there may
  * already be pending encrypted data in the socket buffer when KTLS RX
  * is enabled.
  *
  * To manage not-yet-decrypted data for KTLS RX, the following scheme
  * is used:
  *
  * - A single chain of NOTREADY mbufs is hung off of sb_mtls.
  *
  * - ktls_check_rx checks this chain of mbufs reading the TLS header
  *   from the first mbuf.  Once all of the data for that TLS record is
  *   queued, the socket is queued to a worker thread.
  *
  * - The worker thread calls ktls_decrypt to decrypt TLS records in
  *   the TLS chain.  Each TLS record is detached from the TLS chain,
  *   decrypted, and inserted into the regular socket buffer chain as
  *   a record starting with a control message holding the TLS header
  *   and a chain of mbufs holding the decrypted data.
  */
 
 /*
  * Move every mbuf currently queued in 'sb' from the regular chain to
  * the TLS chain (sb_mtls), flagging each one M_NOTREADY and shifting
  * its length from sb_acc to sb_tlscc.
  */
 static void
 sb_mark_notready(struct sockbuf *sb)
 {
 	struct mbuf *cur;
 
 	/* Hand the whole chain over and empty the regular list. */
 	cur = sb->sb_mb;
 	sb->sb_mtls = cur;
 	sb->sb_mb = NULL;
 	sb->sb_mbtail = NULL;
 	sb->sb_lastrecord = NULL;
 
 	while (cur != NULL) {
 		KASSERT(cur->m_nextpkt == NULL, ("%s: m_nextpkt != NULL",
 		    __func__));
 		KASSERT((cur->m_flags & M_NOTAVAIL) == 0, ("%s: mbuf not avail",
 		    __func__));
 		KASSERT(sb->sb_acc >= cur->m_len, ("%s: sb_acc < m->m_len",
 		    __func__));
 		cur->m_flags |= M_NOTREADY;
 		sb->sb_acc -= cur->m_len;
 		sb->sb_tlscc += cur->m_len;
 		/* Ends up pointing at the last mbuf of the chain. */
 		sb->sb_mtlstail = cur;
 		cur = cur->m_next;
 	}
 
 	/* Everything that was available must now be counted as TLS data. */
 	KASSERT(sb->sb_acc == 0 && sb->sb_tlscc == sb->sb_ccc,
 	    ("%s: acc %u tlscc %u ccc %u", __func__, sb->sb_acc, sb->sb_tlscc,
 	    sb->sb_ccc));
 }
 
 /*
  * Return information about the pending TLS data in a socket
  * buffer.  On return, '*seqnop' is set to the sequence number
  * of the next TLS record to be received and '*residp' is set to
  * the number of bytes still needed for the last pending
  * record.  The function returns 'false' if the last pending
  * record contains a partial TLS header.  In that case, '*residp'
  * is the number of bytes needed to complete the TLS header.
  */
 bool
 ktls_pending_rx_info(struct sockbuf *sb, uint64_t *seqnop, size_t *residp)
 {
 	struct tls_record_layer hdr;
 	struct mbuf *m;
 	uint64_t seqno;
 	size_t resid;
 	u_int offset, record_len;
 
 	SOCKBUF_LOCK_ASSERT(sb);
 	MPASS(sb->sb_flags & SB_TLS_RX);
 	/*
 	 * 'resid' counts the not-yet-decrypted bytes still to be walked;
 	 * 'm' and 'offset' track the position in the TLS chain where the
 	 * next record header starts.
 	 */
 	seqno = sb->sb_tls_seqno;
 	resid = sb->sb_tlscc;
 	m = sb->sb_mtls;
 	offset = 0;
 
 	/* Nothing pending: report the current sequence number as is. */
 	if (resid == 0) {
 		*seqnop = seqno;
 		*residp = 0;
 		return (true);
 	}
 
 	/* Walk the pending data one TLS record at a time. */
 	for (;;) {
 		seqno++;
 
 		/* Fewer bytes than a full record header remain. */
 		if (resid < sizeof(hdr)) {
 			*seqnop = seqno;
 			*residp = sizeof(hdr) - resid;
 			return (false);
 		}
 
 		m_copydata(m, offset, sizeof(hdr), (void *)&hdr);
 
 		/* Total record size: header plus the payload length it declares. */
 		record_len = sizeof(hdr) + ntohs(hdr.tls_length);
 		if (resid <= record_len) {
 			*seqnop = seqno;
 			*residp = record_len - resid;
 			return (true);
 		}
 		resid -= record_len;
 
 		/* Advance 'm'/'offset' past this record to the next header. */
 		while (record_len != 0) {
 			if (m->m_len - offset > record_len) {
 				offset += record_len;
 				break;
 			}
 
 			record_len -= (m->m_len - offset);
 			offset = 0;
 			m = m->m_next;
 		}
 	}
 }
 
 /*
  * Enable kernel TLS receive processing on socket 'so' using the
  * parameters in 'en'.
  *
  * Returns ENOTSUP if KTLS offload is disabled or AES-CBC is requested
  * while CBC support is disabled, EINVAL for listening or non-TCP
  * sockets, EALREADY if a receive session is already installed, or the
  * error from ktls_create_session() / ktls_ocf_try().  Once the session
  * has been installed in the receive buffer the function always
  * returns 0: a failure to set up TOE or ifnet offload just selects
  * software mode.
  */
 int
 ktls_enable_rx(struct socket *so, struct tls_enable *en)
 {
 	struct ktls_session *tls;
 	int error;
 
 	if (!ktls_offload_enable)
 		return (ENOTSUP);
 	if (SOLISTENING(so))
 		return (EINVAL);
 
 	counter_u64_add(ktls_offload_enable_calls, 1);
 
 	/*
 	 * This should always be true since only the TCP socket option
 	 * invokes this function.
 	 */
 	if (so->so_proto->pr_protocol != IPPROTO_TCP)
 		return (EINVAL);
 
 	/*
 	 * XXX: Don't overwrite existing sessions.  We should permit
 	 * this to support rekeying in the future.
 	 */
 	if (so->so_rcv.sb_tls_info != NULL)
 		return (EALREADY);
 
 	if (en->cipher_algorithm == CRYPTO_AES_CBC && !ktls_cbc_enable)
 		return (ENOTSUP);
 
 	error = ktls_create_session(so, en, &tls, KTLS_RX);
 	if (error)
 		return (error);
 
 	error = ktls_ocf_try(so, tls, KTLS_RX);
 	if (error) {
 		/*
 		 * Drop the only reference instead of calling
 		 * ktls_cleanup(): ktls_cleanup() does not return the
 		 * session structure to ktls_session_zone, so using it
 		 * here would leak 'tls'.
 		 */
 		ktls_free(tls);
 		return (error);
 	}
 
 	/* Mark the socket as using TLS offload. */
 	SOCKBUF_LOCK(&so->so_rcv);
 	so->so_rcv.sb_tls_seqno = be64dec(en->rec_seq);
 	so->so_rcv.sb_tls_info = tls;
 	so->so_rcv.sb_flags |= SB_TLS_RX;
 
 	/* Mark existing data as not ready until it can be decrypted. */
 	sb_mark_notready(&so->so_rcv);
 	ktls_check_rx(&so->so_rcv);
 	SOCKBUF_UNLOCK(&so->so_rcv);
 
 	/* Prefer TOE -> ifnet TLS -> software TLS. */
 #ifdef TCP_OFFLOAD
 	error = ktls_try_toe(so, tls, KTLS_RX);
 	if (error)
 #endif
 		error = ktls_try_ifnet(so, tls, KTLS_RX, false);
 	/* The OCF session already exists, so only the mode is set here. */
 	if (error)
 		ktls_use_sw(tls);
 
 	counter_u64_add(ktls_offload_total, 1);
 
 	return (0);
 }
 
 /*
  * Enable kernel TLS transmit processing on socket 'so' using the
  * parameters in 'en'.
  *
  * Returns ENOTSUP if KTLS offload is disabled or AES-CBC is requested
  * while CBC support is disabled, EINVAL for listening or non-TCP
  * sockets, EALREADY if a send session is already installed, ENXIO if
  * ext_pgs mbufs are disabled, or the error from session creation,
  * backend selection or SOCK_IO_SEND_LOCK().  On success the session is
  * installed in the send buffer and 0 is returned.
  */
 int
 ktls_enable_tx(struct socket *so, struct tls_enable *en)
 {
 	struct ktls_session *tls;
 	struct inpcb *inp;
 	int error;
 
 	if (!ktls_offload_enable)
 		return (ENOTSUP);
 	if (SOLISTENING(so))
 		return (EINVAL);
 
 	counter_u64_add(ktls_offload_enable_calls, 1);
 
 	/*
 	 * This should always be true since only the TCP socket option
 	 * invokes this function.
 	 */
 	if (so->so_proto->pr_protocol != IPPROTO_TCP)
 		return (EINVAL);
 
 	/*
 	 * XXX: Don't overwrite existing sessions.  We should permit
 	 * this to support rekeying in the future.
 	 */
 	if (so->so_snd.sb_tls_info != NULL)
 		return (EALREADY);
 
 	if (en->cipher_algorithm == CRYPTO_AES_CBC && !ktls_cbc_enable)
 		return (ENOTSUP);
 
 	/* TLS requires ext pgs */
 	if (mb_use_ext_pgs == 0)
 		return (ENXIO);
 
 	error = ktls_create_session(so, en, &tls, KTLS_TX);
 	if (error)
 		return (error);
 
 	/* Prefer TOE -> ifnet TLS -> software TLS. */
 #ifdef TCP_OFFLOAD
 	error = ktls_try_toe(so, tls, KTLS_TX);
 	if (error)
 #endif
 		error = ktls_try_ifnet(so, tls, KTLS_TX, false);
 	if (error)
 		error = ktls_try_sw(so, tls, KTLS_TX);
 
 	/*
 	 * On either failure below, drop the only reference instead of
 	 * calling ktls_cleanup(): ktls_cleanup() does not return the
 	 * session structure to ktls_session_zone, so using it here would
 	 * leak 'tls'.
 	 */
 	if (error) {
 		ktls_free(tls);
 		return (error);
 	}
 
 	error = SOCK_IO_SEND_LOCK(so, SBL_WAIT);
 	if (error) {
 		ktls_free(tls);
 		return (error);
 	}
 
 	/*
 	 * Write lock the INP when setting sb_tls_info so that
 	 * routines in tcp_ratelimit.c can read sb_tls_info while
 	 * holding the INP lock.
 	 */
 	inp = so->so_pcb;
 	INP_WLOCK(inp);
 	SOCKBUF_LOCK(&so->so_snd);
 	so->so_snd.sb_tls_seqno = be64dec(en->rec_seq);
 	so->so_snd.sb_tls_info = tls;
 	/* Every mode other than software gets the SB_TLS_IFNET flag. */
 	if (tls->mode != TCP_TLS_MODE_SW)
 		so->so_snd.sb_flags |= SB_TLS_IFNET;
 	SOCKBUF_UNLOCK(&so->so_snd);
 	INP_WUNLOCK(inp);
 	SOCK_IO_SEND_UNLOCK(so);
 
 	counter_u64_add(ktls_offload_total, 1);
 
 	return (0);
 }
 
 /*
  * Report the TLS mode of the socket's receive side in '*modep':
  * TCP_TLS_MODE_NONE if no receive session is installed, otherwise the
  * session's mode.  Returns EINVAL for listening sockets and 0
  * otherwise.  The caller must hold the inpcb write lock (asserted).
  */
 int
 ktls_get_rx_mode(struct socket *so, int *modep)
 {
 	struct ktls_session *tls;
 	struct inpcb *inp __diagused;
 
 	if (SOLISTENING(so))
 		return (EINVAL);
 	inp = so->so_pcb;
 	INP_WLOCK_ASSERT(inp);
 
 	SOCK_RECVBUF_LOCK(so);
 	tls = so->so_rcv.sb_tls_info;
 	*modep = (tls != NULL) ? tls->mode : TCP_TLS_MODE_NONE;
 	SOCK_RECVBUF_UNLOCK(so);
 	return (0);
 }
 
 /*
  * ktls_get_rx_sequence - get the next TCP- and TLS- sequence number.
  *
  * This function gets information about the next TCP- and TLS-
  * sequence number to be processed by the TLS receive worker
  * thread. The information is extracted from the given "inpcb"
  * structure. The values are stored in host endian format at the two
  * given output pointer locations. The TCP sequence number points to
  * the beginning of the TLS header.
  *
  * This function returns zero on success, else a non-zero error code
  * is returned.
  */
 /*
  * Error returns: EINVAL if the inpcb has no socket, ECONNRESET if the
  * inpcb has been dropped.  The output locations are only written on
  * success.
  */
 int
 ktls_get_rx_sequence(struct inpcb *inp, uint32_t *tcpseq, uint64_t *tlsseq)
 {
 	struct socket *so;
 	struct tcpcb *tp;
 
 	INP_RLOCK(inp);
 	so = inp->inp_socket;
 	if (__predict_false(so == NULL)) {
 		INP_RUNLOCK(inp);
 		return (EINVAL);
 	}
-	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
+	if (inp->inp_flags & INP_DROPPED) {
 		INP_RUNLOCK(inp);
 		return (ECONNRESET);
 	}
 
 	tp = intotcpcb(inp);
 	MPASS(tp != NULL);
 
 	SOCKBUF_LOCK(&so->so_rcv);
 	/*
 	 * Back up from the next expected TCP sequence number by the
 	 * number of bytes still queued on the TLS chain.
 	 */
 	*tcpseq = tp->rcv_nxt - so->so_rcv.sb_tlscc;
 	*tlsseq = so->so_rcv.sb_tls_seqno;
 	SOCKBUF_UNLOCK(&so->so_rcv);
 
 	INP_RUNLOCK(inp);
 
 	return (0);
 }
 
 /*
  * Report the TLS mode of the socket's send side in '*modep':
  * TCP_TLS_MODE_NONE if no send session is installed, otherwise the
  * session's mode.  Returns EINVAL for listening sockets and 0
  * otherwise.  The caller must hold the inpcb write lock (asserted).
  */
 int
 ktls_get_tx_mode(struct socket *so, int *modep)
 {
 	struct ktls_session *tls;
 	struct inpcb *inp __diagused;
 
 	if (SOLISTENING(so))
 		return (EINVAL);
 	inp = so->so_pcb;
 	INP_WLOCK_ASSERT(inp);
 
 	SOCK_SENDBUF_LOCK(so);
 	tls = so->so_snd.sb_tls_info;
 	*modep = (tls != NULL) ? tls->mode : TCP_TLS_MODE_NONE;
 	SOCK_SENDBUF_UNLOCK(so);
 	return (0);
 }
 
 /*
  * Switch between SW and ifnet TLS sessions as requested.
  *
  * 'mode' must be TCP_TLS_MODE_SW or TCP_TLS_MODE_IFNET.  The caller
  * holds the inpcb write lock (asserted).  The lock is dropped while
  * the replacement session is built and is re-acquired before every
  * return that follows, so the function always returns with it held.
  *
  * Returns EINVAL for listening sockets or an unsupported mode, 0 if
  * there is no send session, the session is already in 'mode', or the
  * switch succeeded, EBUSY if the installed session changed while the
  * lock was dropped, or the error from setting up the new session or
  * from SOCK_IO_SEND_LOCK().
  */
 int
 ktls_set_tx_mode(struct socket *so, int mode)
 {
 	struct ktls_session *tls, *tls_new;
 	struct inpcb *inp;
 	int error;
 
 	if (SOLISTENING(so))
 		return (EINVAL);
 	switch (mode) {
 	case TCP_TLS_MODE_SW:
 	case TCP_TLS_MODE_IFNET:
 		break;
 	default:
 		return (EINVAL);
 	}
 
 	inp = so->so_pcb;
 	INP_WLOCK_ASSERT(inp);
 	SOCKBUF_LOCK(&so->so_snd);
 	tls = so->so_snd.sb_tls_info;
 	if (tls == NULL) {
 		SOCKBUF_UNLOCK(&so->so_snd);
 		return (0);
 	}
 
 	if (tls->mode == mode) {
 		SOCKBUF_UNLOCK(&so->so_snd);
 		return (0);
 	}
 
 	/*
 	 * Take our own reference on the current session before both
 	 * locks are dropped.
 	 */
 	tls = ktls_hold(tls);
 	SOCKBUF_UNLOCK(&so->so_snd);
 	INP_WUNLOCK(inp);
 
 	tls_new = ktls_clone_session(tls, KTLS_TX);
 
 	/* An explicit request for ifnet mode is a 'force' request. */
 	if (mode == TCP_TLS_MODE_IFNET)
 		error = ktls_try_ifnet(so, tls_new, KTLS_TX, true);
 	else
 		error = ktls_try_sw(so, tls_new, KTLS_TX);
 	if (error) {
 		counter_u64_add(ktls_switch_failed, 1);
 		ktls_free(tls_new);
 		ktls_free(tls);
 		INP_WLOCK(inp);
 		return (error);
 	}
 
 	error = SOCK_IO_SEND_LOCK(so, SBL_WAIT);
 	if (error) {
 		counter_u64_add(ktls_switch_failed, 1);
 		ktls_free(tls_new);
 		ktls_free(tls);
 		INP_WLOCK(inp);
 		return (error);
 	}
 
 	/*
 	 * If we raced with another session change, keep the existing
 	 * session.
 	 */
 	if (tls != so->so_snd.sb_tls_info) {
 		counter_u64_add(ktls_switch_failed, 1);
 		SOCK_IO_SEND_UNLOCK(so);
 		ktls_free(tls_new);
 		ktls_free(tls);
 		INP_WLOCK(inp);
 		return (EBUSY);
 	}
 
 	/* Install the new session with the same locks ktls_enable_tx uses. */
 	INP_WLOCK(inp);
 	SOCKBUF_LOCK(&so->so_snd);
 	so->so_snd.sb_tls_info = tls_new;
 	/* SB_TLS_IFNET is only ever set here, never cleared. */
 	if (tls_new->mode != TCP_TLS_MODE_SW)
 		so->so_snd.sb_flags |= SB_TLS_IFNET;
 	SOCKBUF_UNLOCK(&so->so_snd);
 	SOCK_IO_SEND_UNLOCK(so);
 
 	/*
 	 * Drop two references on 'tls'.  The first is for the
 	 * ktls_hold() above.  The second drops the reference from the
 	 * socket buffer.
 	 */
 	KASSERT(tls->refcount >= 2, ("too few references on old session"));
 	ktls_free(tls);
 	ktls_free(tls);
 
 	if (mode == TCP_TLS_MODE_IFNET)
 		counter_u64_add(ktls_switch_to_ifnet, 1);
 	else
 		counter_u64_add(ktls_switch_to_sw, 1);
 
 	return (0);
 }
 
 /*
  * Try to allocate a new TLS receive tag.  This task is scheduled when
  * sbappend_ktls_rx detects an input path change.  If a new tag is
  * allocated, replace the tag in the TLS session.  If a new tag cannot
  * be allocated, let the session fall back to software decryption.
  */
 /*
  * Task handler: 'context' is the TLS session.  Before returning, the
  * handler clears reset_pending and releases one socket reference and
  * one session reference; presumably both were taken when the task was
  * queued -- confirm against the scheduling code.
  */
 static void
 ktls_reset_receive_tag(void *context, int pending)
 {
 	union if_snd_tag_alloc_params params;
 	struct ktls_session *tls;
 	struct m_snd_tag *mst;
 	struct inpcb *inp;
 	struct ifnet *ifp;
 	struct socket *so;
 	int error;
 
 	MPASS(pending == 1);
 
 	tls = context;
 	so = tls->so;
 	inp = so->so_pcb;
 	/* Stays NULL if we bail out before taking an interface reference. */
 	ifp = NULL;
 
 	INP_RLOCK(inp);
-	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
+	if (inp->inp_flags & INP_DROPPED) {
 		INP_RUNLOCK(inp);
 		goto out;
 	}
 
 	/* Detach and release the old tag under the receive buffer lock. */
 	SOCKBUF_LOCK(&so->so_rcv);
 	mst = tls->snd_tag;
 	tls->snd_tag = NULL;
 	if (mst != NULL)
 		m_snd_tag_rele(mst);
 
 	/* Temporary reference for this function; dropped at 'out'. */
 	ifp = tls->rx_ifp;
 	if_ref(ifp);
 	SOCKBUF_UNLOCK(&so->so_rcv);
 
 	params.hdr.type = IF_SND_TAG_TYPE_TLS_RX;
 	params.hdr.flowid = inp->inp_flowid;
 	params.hdr.flowtype = inp->inp_flowtype;
 	params.hdr.numa_domain = inp->inp_numa_domain;
 	params.tls_rx.inp = inp;
 	params.tls_rx.tls = tls;
 	params.tls_rx.vlan_id = tls->rx_vlan_id;
 	INP_RUNLOCK(inp);
 
 	/*
 	 * Without the receive TLS capability no tag is requested, and
 	 * neither reset counter is updated.
 	 */
 	if (inp->inp_vflag & INP_IPV6) {
 		if ((ifp->if_capenable2 & IFCAP2_RXTLS6) == 0)
 			goto out;
 	} else {
 		if ((ifp->if_capenable2 & IFCAP2_RXTLS4) == 0)
 			goto out;
 	}
 
 	error = m_snd_tag_alloc(ifp, &params, &mst);
 	if (error == 0) {
 		SOCKBUF_LOCK(&so->so_rcv);
 		tls->snd_tag = mst;
 		SOCKBUF_UNLOCK(&so->so_rcv);
 
 		counter_u64_add(ktls_ifnet_reset, 1);
 	} else {
 		/*
 		 * Just fall back to software decryption if a tag
 		 * cannot be allocated leaving the connection intact.
 		 * If a future input path change switches to another
 		 * interface this connection will resume ifnet TLS.
 		 */
 		counter_u64_add(ktls_ifnet_reset_failed, 1);
 	}
 
 out:
 	/* reset_pending is protected by the pool mutex keyed on the session. */
 	mtx_pool_lock(mtxpool_sleep, tls);
 	tls->reset_pending = false;
 	mtx_pool_unlock(mtxpool_sleep, tls);
 
 	if (ifp != NULL)
 		if_rele(ifp);
 	sorele(so);
 	ktls_free(tls);
 }
 
 /*
  * Try to allocate a new TLS send tag.  This task is scheduled when
  * ip_output detects a route change while trying to transmit a packet
  * holding a TLS record.  If a new tag is allocated, replace the tag
  * in the TLS session.  Subsequent packets on the connection will use
  * the new tag.  If a new tag cannot be allocated, drop the
  * connection.
  */
 /*
  * Task handler: 'context' is the TLS session.  Before returning, the
  * handler releases one inpcb reference (in_pcbrele_wlocked) and one
  * session reference (ktls_free); presumably both were taken when the
  * task was queued -- confirm against the scheduling code.
  */
 static void
 ktls_reset_send_tag(void *context, int pending)
 {
 	struct epoch_tracker et;
 	struct ktls_session *tls;
 	struct m_snd_tag *old, *new;
 	struct inpcb *inp;
 	struct tcpcb *tp;
 	int error;
 
 	MPASS(pending == 1);
 
 	tls = context;
 	inp = tls->inp;
 
 	/*
 	 * Free the old tag first before allocating a new one.
 	 * ip[6]_output_send() will treat a NULL send tag the same as
 	 * an ifp mismatch and drop packets until a new tag is
 	 * allocated.
 	 *
 	 * Write-lock the INP when changing tls->snd_tag since
 	 * ip[6]_output_send() holds a read-lock when reading the
 	 * pointer.
 	 */
 	INP_WLOCK(inp);
 	old = tls->snd_tag;
 	tls->snd_tag = NULL;
 	INP_WUNLOCK(inp);
 	if (old != NULL)
 		m_snd_tag_rele(old);
 
 	/* 'force' is true: bypass the administrative ifnet TLS control. */
 	error = ktls_alloc_snd_tag(inp, tls, true, &new);
 
 	if (error == 0) {
 		INP_WLOCK(inp);
 		tls->snd_tag = new;
 		mtx_pool_lock(mtxpool_sleep, tls);
 		tls->reset_pending = false;
 		mtx_pool_unlock(mtxpool_sleep, tls);
 		/*
 		 * NOTE(review): a true return from in_pcbrele_wlocked()
 		 * presumably means the inpcb (and its lock) is gone, so
 		 * the lock is only released on a false return -- confirm.
 		 */
 		if (!in_pcbrele_wlocked(inp))
 			INP_WUNLOCK(inp);
 
 		counter_u64_add(ktls_ifnet_reset, 1);
 
 		/*
 		 * XXX: Should we kick tcp_output explicitly now that
 		 * the send tag is fixed or just rely on timers?
 		 */
 	} else {
 		/* No replacement tag: drop the connection if it is still alive. */
 		NET_EPOCH_ENTER(et);
 		INP_WLOCK(inp);
 		if (!in_pcbrele_wlocked(inp)) {
-			if (!(inp->inp_flags & INP_TIMEWAIT) &&
-			    !(inp->inp_flags & INP_DROPPED)) {
+			if (!(inp->inp_flags & INP_DROPPED)) {
 				tp = intotcpcb(inp);
 				CURVNET_SET(tp->t_vnet);
 				tp = tcp_drop(tp, ECONNABORTED);
 				CURVNET_RESTORE();
 				/*
 				 * NOTE(review): a NULL return from
 				 * tcp_drop() presumably means the inpcb
 				 * lock was already released -- confirm.
 				 */
 				if (tp != NULL)
 					INP_WUNLOCK(inp);
 				counter_u64_add(ktls_ifnet_reset_dropped, 1);
 			} else
 				INP_WUNLOCK(inp);
 		}
 		NET_EPOCH_EXIT(et);
 
 		counter_u64_add(ktls_ifnet_reset_failed, 1);
 
 		/*
 		 * Leave reset_pending true to avoid future tasks while
 		 * the socket goes away.
 		 */
 	}
 
 	ktls_free(tls);
 }
 
 /*
  * Record that TLS RX traffic for 'sb' is now arriving on 'ifp' and,
  * unless a reset is already pending, queue the session's reset task
  * so the receive tag can be updated.  The socket buffer lock must be
  * held by the caller.
  */
 void
 ktls_input_ifp_mismatch(struct sockbuf *sb, struct ifnet *ifp)
 {
 	struct socket *so;
 	struct ktls_session *tls;
 
 	SOCKBUF_LOCK_ASSERT(sb);
 	KASSERT(sb->sb_flags & SB_TLS_RX, ("%s: sockbuf %p isn't TLS RX",
 	    __func__, sb));
 
 	tls = sb->sb_tls_info;
 	so = __containerof(sb, struct socket, so_rcv);
 
 	/* Swap the interface reference held by the session. */
 	if_rele(tls->rx_ifp);
 	if_ref(ifp);
 	tls->rx_ifp = ifp;
 
 	mtx_pool_lock(mtxpool_sleep, tls);
 	if (tls->reset_pending) {
 		/* A reset task is already queued; nothing more to do. */
 		mtx_pool_unlock(mtxpool_sleep, tls);
 		return;
 	}
 
 	/*
 	 * Take a socket and a session reference, then queue the task.
 	 */
 	tls->so = so;
 	soref(so);
 	(void) ktls_hold(tls);
 	tls->reset_pending = true;
 	taskqueue_enqueue(taskqueue_thread, &tls->reset_tag_task);
 	mtx_pool_unlock(mtxpool_sleep, tls);
 }
 
 /*
  * Queue the session's reset task for 'tls' unless one is already
  * pending.  Always returns ENOBUFS; with a NULL 'inp' nothing is
  * scheduled.
  */
 int
 ktls_output_eagain(struct inpcb *inp, struct ktls_session *tls)
 {
 
 	if (inp != NULL) {
 		INP_LOCK_ASSERT(inp);
 
 		/*
 		 * Only one reset task may be outstanding per session;
 		 * reset_pending is guarded by the pool mutex.
 		 */
 		mtx_pool_lock(mtxpool_sleep, tls);
 		if (!tls->reset_pending) {
 			/* Reference the session and the inpcb, then queue. */
 			(void) ktls_hold(tls);
 			in_pcbref(inp);
 			tls->inp = inp;
 			tls->reset_pending = true;
 			taskqueue_enqueue(taskqueue_thread,
 			    &tls->reset_tag_task);
 		}
 		mtx_pool_unlock(mtxpool_sleep, tls);
 	}
 	return (ENOBUFS);
 }
 
 #ifdef RATELIMIT
 /*
  * Apply a new maximum pacing rate to the session's send tag.
  * Returns 0 without doing anything when the tag is currently NULL,
  * otherwise the result of the tag's snd_tag_modify method.
  */
 int
 ktls_modify_txrtlmt(struct ktls_session *tls, uint64_t max_pacing_rate)
 {
 	struct m_snd_tag *mst;
 	union if_snd_tag_modify_params params = {
 		.rate_limit.max_rate = max_pacing_rate,
 		.rate_limit.flags = M_NOWAIT,
 	};
 
 	/*
 	 * The inp is not reachable from here, so its lock cannot be
 	 * asserted; the caller is expected to hold it.
 	 */
 
 	MPASS(tls->mode == TCP_TLS_MODE_IFNET);
 
 	mst = tls->snd_tag;
 	if (mst == NULL) {
 		/*
 		 * Resetting send tag, ignore this change.  The
 		 * pending reset may or may not see this updated rate
 		 * in the tcpcb.  If it doesn't, we will just lose
 		 * this rate change.
 		 */
 		return (0);
 	}
 
 	MPASS(mst->sw->type == IF_SND_TAG_TYPE_TLS_RATE_LIMIT);
 
 	return (mst->sw->snd_tag_modify(mst, &params));
 }
 #endif
 #endif
 
 /*
  * Tear down a TLS session: free any records still parked on the
  * pending_records list (sequential-record sessions only), run
  * ktls_cleanup() and return the session to its UMA zone.
  */
 void
 ktls_destroy(struct ktls_session *tls)
 {
 	struct mbuf *rec, *next_rec;
 	int pages_left;
 
 	if (tls->sequential_records) {
 		STAILQ_FOREACH_SAFE(rec, &tls->pending_records, m_epg_stailq,
 		    next_rec) {
 			/*
 			 * Each list entry heads a run of mbufs covering
 			 * m_epg_enc_cnt pages; free them one at a time,
 			 * following the successor m_free() returns.
 			 */
 			for (pages_left = rec->m_epg_enc_cnt; pages_left > 0;) {
 				KASSERT(pages_left >= rec->m_epg_nrdy,
 				    ("%s: too few pages", __func__));
 				pages_left -= rec->m_epg_nrdy;
 				rec = m_free(rec);
 			}
 		}
 	}
 
 	ktls_cleanup(tls);
 	uma_zfree(ktls_session_zone, tls);
 }
 
 /*
  * Stamp every mbuf in the chain starting at 'm' with the next TLS
  * sequence number from the socket buffer, advancing the counter by
  * one per mbuf.  All mbufs must be M_EXTPG.
  */
 void
 ktls_seq(struct sockbuf *sb, struct mbuf *m)
 {
 	struct mbuf *rec;
 
 	rec = m;
 	while (rec != NULL) {
 		KASSERT((rec->m_flags & M_EXTPG) != 0,
 		    ("ktls_seq: mapped mbuf %p", rec));
 
 		rec->m_epg_seqno = sb->sb_tls_seqno++;
 		rec = rec->m_next;
 	}
 }
 
 /*
  * Add TLS framing (headers and trailers) to a chain of mbufs.  Each
  * mbuf in the chain must be an unmapped mbuf.  The payload of the
  * mbuf must be populated with the payload of each TLS record.
  *
  * The record_type argument specifies the TLS record type used when
  * populating the TLS header.
  *
  * The enq_cnt argument on return is set to the number of pages of
  * payload data for this entire chain that need to be encrypted via SW
  * encryption.  The returned value should be passed to ktls_enqueue
  * when scheduling encryption of this chain of mbufs.  To handle the
  * special case of empty fragments for TLS 1.0 sessions, an empty
  * fragment counts as one page.
  *
  * Each mbuf takes its own reference on the session via ktls_hold().
  * For sessions that are not in TCP_TLS_MODE_SW, *enq_cnt is left at
  * zero.
  */
 void
 ktls_frame(struct mbuf *top, struct ktls_session *tls, int *enq_cnt,
     uint8_t record_type)
 {
 	struct tls_record_layer *tlshdr;
 	struct mbuf *m;
 	uint64_t *noncep;
 	uint16_t tls_len;
 	int maxlen __diagused;
 
 	maxlen = tls->params.max_frame_len;
 	*enq_cnt = 0;
 	for (m = top; m != NULL; m = m->m_next) {
 		/*
 		 * All mbufs in the chain should be TLS records whose
 		 * payload does not exceed the maximum frame length.
 		 *
 		 * Empty TLS 1.0 records are permitted when using CBC.
 		 */
 		KASSERT(m->m_len <= maxlen && m->m_len >= 0 &&
 		    (m->m_len > 0 || ktls_permit_empty_frames(tls)),
 		    ("ktls_frame: m %p len %d", m, m->m_len));
 
 		/*
 		 * TLS frames require unmapped mbufs to store session
 		 * info.
 		 */
 		KASSERT((m->m_flags & M_EXTPG) != 0,
 		    ("ktls_frame: mapped mbuf %p (top = %p)", m, top));
 
 		/* Payload length before any framing is added. */
 		tls_len = m->m_len;
 
 		/* Save a reference to the session. */
 		m->m_epg_tls = ktls_hold(tls);
 
 		m->m_epg_hdrlen = tls->params.tls_hlen;
 		m->m_epg_trllen = tls->params.tls_tlen;
 		if (tls->params.cipher_algorithm == CRYPTO_AES_CBC) {
 			int bs, delta;
 
 			/*
 			 * AES-CBC pads messages to a multiple of the
 			 * block size.  Note that the padding is
 			 * applied after the digest and the encryption
 			 * is done on the "plaintext || mac || padding".
 			 * At least one byte of padding is always
 			 * present.
 			 *
 			 * Compute the final trailer length assuming
 			 * at most one block of padding.
 			 * tls->params.tls_tlen is the maximum
 			 * possible trailer length (padding + digest).
 			 * delta holds the number of excess padding
 			 * bytes if the maximum were used.  Those
 			 * extra bytes are removed.
 			 */
 			bs = tls->params.tls_bs;
 			/* The mask form assumes bs is a power of two. */
 			delta = (tls_len + tls->params.tls_tlen) & (bs - 1);
 			m->m_epg_trllen -= delta;
 		}
 		/* m_len now covers header + payload + trailer. */
 		m->m_len += m->m_epg_hdrlen + m->m_epg_trllen;
 
 		/* Populate the TLS header. */
 		tlshdr = (void *)m->m_epg_hdr;
 		tlshdr->tls_vmajor = tls->params.tls_vmajor;
 
 		/*
 		 * TLS 1.3 masquerades as TLS 1.2 with a record type
 		 * of TLS_RLTYPE_APP.
 		 */
 		if (tls->params.tls_vminor == TLS_MINOR_VER_THREE &&
 		    tls->params.tls_vmajor == TLS_MAJOR_VER_ONE) {
 			tlshdr->tls_vminor = TLS_MINOR_VER_TWO;
 			tlshdr->tls_type = TLS_RLTYPE_APP;
 			/* save the real record type for later */
 			m->m_epg_record_type = record_type;
 			m->m_epg_trail[0] = record_type;
 		} else {
 			tlshdr->tls_vminor = tls->params.tls_vminor;
 			tlshdr->tls_type = record_type;
 		}
 		/* The on-wire length excludes the record header itself. */
 		tlshdr->tls_length = htons(m->m_len - sizeof(*tlshdr));
 
 		/*
 		 * Store nonces / explicit IVs after the end of the
 		 * TLS header.
 		 *
 		 * For GCM with TLS 1.2, an 8 byte nonce is copied
 		 * from the end of the IV.  The nonce is then
 		 * incremented for use by the next record.
 		 *
 		 * For CBC, a random nonce is inserted for TLS 1.1+.
 		 */
 		if (tls->params.cipher_algorithm == CRYPTO_AES_NIST_GCM_16 &&
 		    tls->params.tls_vminor == TLS_MINOR_VER_TWO) {
 			noncep = (uint64_t *)(tls->params.iv + 8);
 			be64enc(tlshdr + 1, *noncep);
 			(*noncep)++;
 		} else if (tls->params.cipher_algorithm == CRYPTO_AES_CBC &&
 		    tls->params.tls_vminor >= TLS_MINOR_VER_ONE)
 			arc4rand(tlshdr + 1, AES_BLOCK_LEN, 0);
 
 		/*
 		 * When using SW encryption, mark the mbuf not ready.
 		 * It will be marked ready via sbready() after the
 		 * record has been encrypted.
 		 *
 		 * When using ifnet TLS, unencrypted TLS records are
 		 * sent down the stack to the NIC.
 		 */
 		if (tls->mode == TCP_TLS_MODE_SW) {
 			m->m_flags |= M_NOTREADY;
 			if (__predict_false(tls_len == 0)) {
 				/* TLS 1.0 empty fragment. */
 				m->m_epg_nrdy = 1;
 			} else
 				m->m_epg_nrdy = m->m_epg_npgs;
 			*enq_cnt += m->m_epg_nrdy;
 		}
 	}
 }
 
 bool
 ktls_permit_empty_frames(struct ktls_session *tls)
 {
 	return (tls->params.cipher_algorithm == CRYPTO_AES_CBC &&
 	    tls->params.tls_vminor == TLS_MINOR_VER_ZERO);
 }
 
 /*
  * Called with the receive socket buffer locked.  If a complete TLS
  * record is queued on the TLS chain and no decryption pass is already
  * running, mark the buffer SB_TLS_RX_RUNNING, take a socket reference
  * and hand the socket to the session's work queue.  If the peer can
  * send no more and only a partial record is queued, flag EMSGSIZE.
  */
 void
 ktls_check_rx(struct sockbuf *sb)
 {
 	struct tls_record_layer hdr;
 	struct ktls_wq *wq;
 	struct socket *so;
 	bool eof, running;
 
 	SOCKBUF_LOCK_ASSERT(sb);
 	KASSERT(sb->sb_flags & SB_TLS_RX, ("%s: sockbuf %p isn't TLS RX",
 	    __func__, sb));
 	so = __containerof(sb, struct socket, so_rcv);
 
 	/* A decryption pass is already scheduled or in progress. */
 	if ((sb->sb_flags & SB_TLS_RX_RUNNING) != 0)
 		return;
 
 	eof = (sb->sb_state & SBS_CANTRCVMORE) != 0;
 
 	/* Not even a full TLS header queued yet. */
 	if (sb->sb_tlscc < sizeof(hdr)) {
 		if (eof && sb->sb_tlscc != 0)
 			so->so_error = EMSGSIZE;
 		return;
 	}
 
 	m_copydata(sb->sb_mtls, 0, sizeof(hdr), (void *)&hdr);
 
 	/* Header present, but the record body is still incomplete. */
 	if (sb->sb_tlscc < sizeof(hdr) + ntohs(hdr.tls_length)) {
 		if (eof)
 			so->so_error = EMSGSIZE;
 		return;
 	}
 
 	sb->sb_flags |= SB_TLS_RX_RUNNING;
 
 	/* Reference the socket while it sits on the work queue. */
 	soref(so);
 	wq = &ktls_wq[so->so_rcv.sb_tls_info->wq_index];
 	mtx_lock(&wq->mtx);
 	STAILQ_INSERT_TAIL(&wq->so_head, so, so_ktls_rx_list);
 	running = wq->running;
 	mtx_unlock(&wq->mtx);
 	if (!running)
 		wakeup(wq);
 	counter_u64_add(ktls_cnt_rx_queued, 1);
 }
 
 /*
  * Detach the first 'len' bytes of the socket buffer's TLS chain
  * (sb_mtls) and return them as a separate mbuf chain.  The socket
  * buffer lock must be held and 'len' must not exceed sb_tlscc.
  *
  * If the record boundary falls inside an mbuf, a new mbuf is
  * allocated to hold the remainder.  That allocation may drop and
  * re-acquire the socket buffer lock; if sb_mtls changed meanwhile,
  * NULL is returned and the caller is expected to retry.
  *
  * On success the detached bytes are removed from the TLS chain
  * accounting via sbfree_ktls_rx() and are instead tracked in
  * sb_tlsdcc (set to 'len') and sb_ccc.
  */
 static struct mbuf *
 ktls_detach_record(struct sockbuf *sb, int len)
 {
 	struct mbuf *m, *n, *top;
 	int remain;
 
 	SOCKBUF_LOCK_ASSERT(sb);
 	MPASS(len <= sb->sb_tlscc);
 
 	/*
 	 * If TLS chain is the exact size of the record,
 	 * just grab the whole record.
 	 */
 	top = sb->sb_mtls;
 	if (sb->sb_tlscc == len) {
 		sb->sb_mtls = NULL;
 		sb->sb_mtlstail = NULL;
 		goto out;
 	}
 
 	/*
 	 * While it would be nice to use m_split() here, we need
 	 * to know exactly what m_split() allocates to update the
 	 * accounting, so do it inline instead.
 	 */
 	/* Find the mbuf 'm' containing the last byte of the record. */
 	remain = len;
 	for (m = top; remain > m->m_len; m = m->m_next)
 		remain -= m->m_len;
 
 	/* Easy case: don't have to split 'm'. */
 	if (remain == m->m_len) {
 		sb->sb_mtls = m->m_next;
 		if (sb->sb_mtls == NULL)
 			sb->sb_mtlstail = NULL;
 		m->m_next = NULL;
 		goto out;
 	}
 
 	/*
 	 * Need to allocate an mbuf to hold the remainder of 'm'.  Try
 	 * with M_NOWAIT first.
 	 */
 	n = m_get(M_NOWAIT, MT_DATA);
 	if (n == NULL) {
 		/*
 		 * Use M_WAITOK with socket buffer unlocked.  If
 		 * 'sb_mtls' changes while the lock is dropped, return
 		 * NULL to force the caller to retry.
 		 */
 		SOCKBUF_UNLOCK(sb);
 
 		n = m_get(M_WAITOK, MT_DATA);
 
 		SOCKBUF_LOCK(sb);
 		if (sb->sb_mtls != top) {
 			m_free(n);
 			return (NULL);
 		}
 	}
 	/* The remainder inherits the readiness/decryption state of 'm'. */
 	n->m_flags |= (m->m_flags & (M_NOTREADY | M_DECRYPTED));
 
 	/* Store remainder in 'n'. */
 	n->m_len = m->m_len - remain;
 	if (m->m_flags & M_EXT) {
 		/* Share the external storage instead of copying. */
 		n->m_data = m->m_data + remain;
 		mb_dupcl(n, m);
 	} else {
 		bcopy(mtod(m, caddr_t) + remain, mtod(n, caddr_t), n->m_len);
 	}
 
 	/* Trim 'm' and update accounting. */
 	m->m_len -= n->m_len;
 	sb->sb_tlscc -= n->m_len;
 	sb->sb_ccc -= n->m_len;
 
 	/* Account for 'n'. */
 	sballoc_ktls_rx(sb, n);
 
 	/* Insert 'n' into the TLS chain. */
 	sb->sb_mtls = n;
 	n->m_next = m->m_next;
 	if (sb->sb_mtlstail == m)
 		sb->sb_mtlstail = n;
 
 	/* Detach the record from the TLS chain. */
 	m->m_next = NULL;
 
 out:
 	MPASS(m_length(top, NULL) == len);
 	/* Move the detached bytes from the TLS chain to 'dcc' accounting. */
 	for (m = top; m != NULL; m = m->m_next)
 		sbfree_ktls_rx(sb, m);
 	sb->sb_tlsdcc = len;
 	sb->sb_ccc += len;
 	SBCHECK(sb);
 	return (top);
 }
 
 /*
  * Determine the length of the trailing zero padding and find the real
  * record type in the byte before the padding.
  *
  * Walking the mbuf chain backwards is clumsy, so another option would
  * be to scan forwards remembering the last non-zero byte before the
  * trailer.  However, it would be expensive to scan the entire record.
  * Instead, find the last non-zero byte of each mbuf in the chain
  * keeping track of the relative offset of that nonzero byte.
  *
  * *trailer_len is the size of the MAC/tag on input and is set to the
  * size of the full trailer including padding and the record type on
  * return.
  *
  * Returns 0 and stores the record type in *record_typep on success.
  * Returns EBADMSG, leaving both outputs untouched, when the offset
  * just past the last non-zero byte is smaller than the TLS header
  * length (which includes the case where no non-zero byte was found).
  */
 static int
 tls13_find_record_type(struct ktls_session *tls, struct mbuf *m, int tls_len,
     int *trailer_len, uint8_t *record_typep)
 {
 	char *cp;
 	u_int digest_start, last_offset, m_len, offset;
 	uint8_t record_type;
 
 	/*
 	 * record_type is only assigned when a non-zero byte is found.
 	 * Give it a defined value so the final read never depends on
 	 * tls_hlen being non-zero.
 	 */
 	record_type = 0;
 	digest_start = tls_len - *trailer_len;
 	last_offset = 0;
 	offset = 0;
 	for (; m != NULL && offset < digest_start;
 	     offset += m->m_len, m = m->m_next) {
 		/* Don't look for padding in the tag. */
 		m_len = min(digest_start - offset, m->m_len);
 		cp = mtod(m, char *);
 
 		/* Find last non-zero byte in this mbuf. */
 		while (m_len > 0 && cp[m_len - 1] == 0)
 			m_len--;
 		if (m_len > 0) {
 			record_type = cp[m_len - 1];
 			/* Offset one past the non-zero byte. */
 			last_offset = offset + m_len;
 		}
 	}
 	if (last_offset < tls->params.tls_hlen)
 		return (EBADMSG);
 
 	*record_typep = record_type;
 	/* Trailer = record type byte + zero padding + MAC/tag. */
 	*trailer_len = tls_len - last_offset + 1;
 	return (0);
 }
 
 /*
  * Classify the decryption state of the byte range [offset,
  * offset + len) of an mbuf chain by inspecting M_DECRYPTED on every
  * mbuf the range touches:
  *
  *   KTLS_MBUF_CRYPTO_ST_DECRYPTED - all of them carry the flag
  *   KTLS_MBUF_CRYPTO_ST_ENCRYPTED - none of them carry the flag
  *   KTLS_MBUF_CRYPTO_ST_MIXED     - some do and some do not
  */
 ktls_mbuf_crypto_st_t
 ktls_mbuf_crypto_state(struct mbuf *mb, int offset, int len)
 {
 	int flags_any, flags_all;
 
 	flags_any = 0;
 	flags_all = -1;
 
 	/* Skip the mbufs that lie entirely before 'offset'. */
 	while (mb != NULL && offset >= mb->m_len) {
 		offset -= mb->m_len;
 		mb = mb->m_next;
 	}
 
 	/* 'offset' now counts the bytes left to cover from this mbuf. */
 	offset += len;
 
 	while (mb != NULL) {
 		flags_any |= mb->m_flags;
 		flags_all &= mb->m_flags;
 
 		if (offset <= mb->m_len)
 			break;
 		offset -= mb->m_len;
 		mb = mb->m_next;
 	}
 	MPASS(mb != NULL || offset == 0);
 
 	/* The flag is set in some mbufs but not in all of them. */
 	if (((flags_any ^ flags_all) & M_DECRYPTED) != 0)
 		return (KTLS_MBUF_CRYPTO_ST_MIXED);
 	if ((flags_any & M_DECRYPTED) != 0)
 		return (KTLS_MBUF_CRYPTO_ST_DECRYPTED);
 	return (KTLS_MBUF_CRYPTO_ST_ENCRYPTED);
 }
 
 /*
  * ktls_resync_ifnet - get HW TLS RX back on track after packet loss
  *
  * Passes the TCP sequence number of a TLS header, the record length
  * 'tls_len' and the TLS record number 'tls_rcd_num' to the receive
  * tag's snd_tag_modify method.
  *
  * Returns EINVAL if the session has no tag or the socket has no
  * inpcb, ECONNRESET if the inpcb fails the flag check below, and
  * otherwise whatever snd_tag_modify returns.
  */
 static int
 ktls_resync_ifnet(struct socket *so, uint32_t tls_len, uint64_t tls_rcd_num)
 {
 	union if_snd_tag_modify_params params;
 	struct m_snd_tag *mst;
 	struct inpcb *inp;
 	struct tcpcb *tp;
 
 	/* NOTE(review): snd_tag is read here without the sockbuf lock. */
 	mst = so->so_rcv.sb_tls_info->snd_tag;
 	if (__predict_false(mst == NULL))
 		return (EINVAL);
 
 	inp = sotoinpcb(so);
 	if (__predict_false(inp == NULL))
 		return (EINVAL);
 
 	INP_RLOCK(inp);
-	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
+	if (inp->inp_flags & INP_DROPPED) {
 		INP_RUNLOCK(inp);
 		return (ECONNRESET);
 	}
 
 	tp = intotcpcb(inp);
 	MPASS(tp != NULL);
 
 	/* Get the TCP sequence number of the next valid TLS header. */
 	SOCKBUF_LOCK(&so->so_rcv);
 	/*
 	 * Start from rcv_nxt and step back over the bytes still queued
 	 * on the TLS chain (sb_tlscc) and over 'tls_len'.
 	 */
 	params.tls_rx.tls_hdr_tcp_sn =
 	    tp->rcv_nxt - so->so_rcv.sb_tlscc - tls_len;
 	params.tls_rx.tls_rec_length = tls_len;
 	params.tls_rx.tls_seq_number = tls_rcd_num;
 	SOCKBUF_UNLOCK(&so->so_rcv);
 
 	INP_RUNLOCK(inp);
 
 	/* The driver method is invoked with no locks held. */
 	MPASS(mst->sw->type == IF_SND_TAG_TYPE_TLS_RX);
 	return (mst->sw->snd_tag_modify(mst, &params));
 }
 
 /*
  * Receive-side worker for one socket.  Repeatedly detaches complete
  * TLS records from the head of so->so_rcv's TLS chain, decrypts them
  * in software when needed (records already fully marked M_DECRYPTED
  * are accepted as-is), and appends the plaintext together with a
  * TLS_GET_RECORD control message to the socket buffer.
  *
  * The loop ends when less than a full record is queued or when the
  * detached data was discarded while the lock was dropped.  On the
  * normal exit path SB_TLS_RX_RUNNING is cleared and readers are woken.
  * Every path drops one socket reference with sorele(); presumably the
  * one taken by ktls_check_rx() when the socket was queued.
  *
  * NOTE(review): the corrupted-header path aborts the connection and
  * jumps to 'deref' without clearing SB_TLS_RX_RUNNING.
  */
 static void
 ktls_decrypt(struct socket *so)
 {
 	char tls_header[MBUF_PEXT_HDR_LEN];
 	struct ktls_session *tls;
 	struct sockbuf *sb;
 	struct tls_record_layer *hdr;
 	struct tls_get_record tgr;
 	struct mbuf *control, *data, *m;
 	ktls_mbuf_crypto_st_t state;
 	uint64_t seqno;
 	int error, remain, tls_len, trail_len;
 	bool tls13;
 	uint8_t vminor, record_type;
 
 	hdr = (struct tls_record_layer *)tls_header;
 	sb = &so->so_rcv;
 	SOCKBUF_LOCK(sb);
 	KASSERT(sb->sb_flags & SB_TLS_RX_RUNNING,
 	    ("%s: socket %p not running", __func__, so));
 
 	tls = sb->sb_tls_info;
 	MPASS(tls != NULL);
 
 	/* TLS 1.3 records carry a 1.2 minor version on the wire. */
 	tls13 = (tls->params.tls_vminor == TLS_MINOR_VER_THREE);
 	if (tls13)
 		vminor = TLS_MINOR_VER_TWO;
 	else
 		vminor = tls->params.tls_vminor;
 	for (;;) {
 		/* Is there enough queued for a TLS header? */
 		if (sb->sb_tlscc < tls->params.tls_hlen)
 			break;
 
 		m_copydata(sb->sb_mtls, 0, tls->params.tls_hlen, tls_header);
 		/* Total record size: header plus the length it announces. */
 		tls_len = sizeof(*hdr) + ntohs(hdr->tls_length);
 
 		if (hdr->tls_vmajor != tls->params.tls_vmajor ||
 		    hdr->tls_vminor != vminor)
 			error = EINVAL;
 		else if (tls13 && hdr->tls_type != TLS_RLTYPE_APP)
 			error = EINVAL;
 		else if (tls_len < tls->params.tls_hlen || tls_len >
 		    tls->params.tls_hlen + TLS_MAX_MSG_SIZE_V10_2 +
 		    tls->params.tls_tlen)
 			error = EMSGSIZE;
 		else
 			error = 0;
 		if (__predict_false(error != 0)) {
 			/*
 			 * We have a corrupted record and are likely
 			 * out of sync.  The connection isn't
 			 * recoverable at this point, so abort it.
 			 */
 			SOCKBUF_UNLOCK(sb);
 			counter_u64_add(ktls_offload_corrupted_records, 1);
 
 			CURVNET_SET(so->so_vnet);
 			so->so_proto->pr_abort(so);
 			so->so_error = error;
 			CURVNET_RESTORE();
 			goto deref;
 		}
 
 		/* Is the entire record queued? */
 		if (sb->sb_tlscc < tls_len)
 			break;
 
 		/*
 		 * Split out the portion of the mbuf chain containing
 		 * this TLS record.
 		 */
 		data = ktls_detach_record(sb, tls_len);
 		/* NULL means the chain changed underneath us; start over. */
 		if (data == NULL)
 			continue;
 		MPASS(sb->sb_tlsdcc == tls_len);
 
 		seqno = sb->sb_tls_seqno;
 		sb->sb_tls_seqno++;
 		SBCHECK(sb);
 		SOCKBUF_UNLOCK(sb);
 
 		/* get crypto state for this TLS record */
 		state = ktls_mbuf_crypto_state(data, 0, tls_len);
 
 		switch (state) {
 		case KTLS_MBUF_CRYPTO_ST_MIXED:
 			/*
 			 * Partially decrypted record: ktls_ocf_recrypt()
 			 * is run first, then the record is decrypted as
 			 * a whole below.
 			 */
 			error = ktls_ocf_recrypt(tls, hdr, data, seqno);
 			if (error)
 				break;
 			/* FALLTHROUGH */
 		case KTLS_MBUF_CRYPTO_ST_ENCRYPTED:
 			error = ktls_ocf_decrypt(tls, hdr, data, seqno,
 			    &trail_len);
 			if (__predict_true(error == 0)) {
 				if (tls13) {
 					error = tls13_find_record_type(tls, data,
 					    tls_len, &trail_len, &record_type);
 				} else {
 					record_type = hdr->tls_type;
 				}
 			}
 			break;
 		case KTLS_MBUF_CRYPTO_ST_DECRYPTED:
 			/*
 			 * NIC TLS is only supported for AEAD
 			 * ciphersuites which used a fixed sized
 			 * trailer.
 			 */
 			if (tls13) {
 				trail_len = tls->params.tls_tlen - 1;
 				error = tls13_find_record_type(tls, data,
 				    tls_len, &trail_len, &record_type);
 			} else {
 				trail_len = tls->params.tls_tlen;
 				error = 0;
 				record_type = hdr->tls_type;
 			}
 			break;
 		default:
 			error = EINVAL;
 			break;
 		}
 		if (error) {
 			counter_u64_add(ktls_offload_failed_crypto, 1);
 
 			SOCKBUF_LOCK(sb);
 			if (sb->sb_tlsdcc == 0) {
 				/*
 				 * sbcut/drop/flush discarded these
 				 * mbufs.
 				 */
 				m_freem(data);
 				break;
 			}
 
 			/*
 			 * Drop this TLS record's data, but keep
 			 * decrypting subsequent records.
 			 */
 			sb->sb_ccc -= tls_len;
 			sb->sb_tlsdcc = 0;
 
 			CURVNET_SET(so->so_vnet);
 			so->so_error = EBADMSG;
 			/*
 			 * The lock is re-acquired below, so
 			 * sorwakeup_locked() evidently returns with it
 			 * released.
 			 */
 			sorwakeup_locked(so);
 			CURVNET_RESTORE();
 
 			m_freem(data);
 
 			SOCKBUF_LOCK(sb);
 			continue;
 		}
 
 		/* Allocate the control mbuf. */
 		memset(&tgr, 0, sizeof(tgr));
 		tgr.tls_type = record_type;
 		tgr.tls_vmajor = hdr->tls_vmajor;
 		tgr.tls_vminor = hdr->tls_vminor;
 		/* Payload length, stored big-endian. */
 		tgr.tls_length = htobe16(tls_len - tls->params.tls_hlen -
 		    trail_len);
 		control = sbcreatecontrol(&tgr, sizeof(tgr),
 		    TLS_GET_RECORD, IPPROTO_TCP, M_WAITOK);
 
 		SOCKBUF_LOCK(sb);
 		if (sb->sb_tlsdcc == 0) {
 			/* sbcut/drop/flush discarded these mbufs. */
 			MPASS(sb->sb_tlscc == 0);
 			m_freem(data);
 			m_freem(control);
 			break;
 		}
 
 		/*
 		 * Clear the 'dcc' accounting in preparation for
 		 * adding the decrypted record.
 		 */
 		sb->sb_ccc -= tls_len;
 		sb->sb_tlsdcc = 0;
 		SBCHECK(sb);
 
 		/* If there is no payload, drop all of the data. */
 		if (tgr.tls_length == htobe16(0)) {
 			m_freem(data);
 			data = NULL;
 		} else {
 			/* Trim header. */
 			remain = tls->params.tls_hlen;
 			while (remain > 0) {
 				if (data->m_len > remain) {
 					data->m_data += remain;
 					data->m_len -= remain;
 					break;
 				}
 				remain -= data->m_len;
 				data = m_free(data);
 			}
 
 			/* Trim trailer and clear M_NOTREADY. */
 			remain = be16toh(tgr.tls_length);
 			/*
 			 * NOTE(review): this assignment is redundant
 			 * with the for-loop initializer that follows.
 			 */
 			m = data;
 			for (m = data; remain > m->m_len; m = m->m_next) {
 				m->m_flags &= ~(M_NOTREADY | M_DECRYPTED);
 				remain -= m->m_len;
 			}
 			/* 'm' holds the last payload byte; cut after it. */
 			m->m_len = remain;
 			m_freem(m->m_next);
 			m->m_next = NULL;
 			m->m_flags &= ~(M_NOTREADY | M_DECRYPTED);
 
 			/* Set EOR on the final mbuf. */
 			m->m_flags |= M_EOR;
 		}
 
 		sbappendcontrol_locked(sb, data, control, 0);
 
 		/*
 		 * A record that was not fully decrypted on arrival sets
 		 * SB_TLS_RX_RESYNC and sends a resync request.  The
 		 * first fully decrypted record after that clears the
 		 * flag and sends one more request with a zero length.
 		 */
 		if (__predict_false(state != KTLS_MBUF_CRYPTO_ST_DECRYPTED)) {
 			sb->sb_flags |= SB_TLS_RX_RESYNC;
 			SOCKBUF_UNLOCK(sb);
 			ktls_resync_ifnet(so, tls_len, seqno);
 			SOCKBUF_LOCK(sb);
 		} else if (__predict_false(sb->sb_flags & SB_TLS_RX_RESYNC)) {
 			sb->sb_flags &= ~SB_TLS_RX_RESYNC;
 			SOCKBUF_UNLOCK(sb);
 			ktls_resync_ifnet(so, 0, seqno);
 			SOCKBUF_LOCK(sb);
 		}
 	}
 
 	sb->sb_flags &= ~SB_TLS_RX_RUNNING;
 
 	/* A partial record is queued and no more data can arrive. */
 	if ((sb->sb_state & SBS_CANTRCVMORE) != 0 && sb->sb_tlscc > 0)
 		so->so_error = EMSGSIZE;
 
 	sorwakeup_locked(so);
 
 deref:
 	SOCKBUF_UNLOCK_ASSERT(sb);
 
 	CURVNET_SET(so->so_vnet);
 	sorele(so);
 	CURVNET_RESTORE();
 }
 
 /*
  * Flag 'm' with EPG_FLAG_2FREE and append it to the mbuf list of its
  * session's work queue, waking the queue if it is not running.
  */
 void
 ktls_enqueue_to_free(struct mbuf *m)
 {
 	struct ktls_wq *queue;
 	bool awake;
 
 	/* Mark it for freeing. */
 	m->m_epg_flags |= EPG_FLAG_2FREE;
 
 	queue = &ktls_wq[m->m_epg_tls->wq_index];
 	mtx_lock(&queue->mtx);
 	STAILQ_INSERT_TAIL(&queue->m_head, m, m_epg_stailq);
 	awake = queue->running;
 	mtx_unlock(&queue->mtx);
 
 	if (awake)
 		return;
 	wakeup(queue);
 }
 
 /*
  * Try to get a buffer from ktls_buffer_zone to use as the encryption
  * destination for 'm'.  Returns NULL when the mbuf has two pages or
  * fewer, when the zone does not exist, when an allocation failed on
  * this work queue within the last hz ticks, or when the allocation
  * itself fails.
  */
 static void *
 ktls_buffer_alloc(struct ktls_wq *wq, struct mbuf *m)
 {
 	void *buf;
 	int domain, running;
 
 	if (m->m_epg_npgs <= 2 || ktls_buffer_zone == NULL)
 		return (NULL);
 
 	/*
 	 * Rate-limit allocation attempts after a failure.
 	 * ktls_buffer_import() will acquire a per-domain mutex to check
 	 * the free page queues and may fail consistently if memory is
 	 * fragmented.
 	 */
 	if ((u_int)(ticks - wq->lastallocfail) < hz)
 		return (NULL);
 
 	buf = uma_zalloc(ktls_buffer_zone, M_NOWAIT | M_NORECLAIM);
 	if (buf != NULL)
 		return (buf);
 
 	/* Allocation failed: remember when, then poke the alloc thread. */
 	domain = PCPU_GET(domain);
 	wq->lastallocfail = ticks;
 	if (VM_DOMAIN_EMPTY(domain))
 		return (NULL);
 
 	/*
 	 * Note that this check is "racy", but the races are
 	 * harmless, and are either a spurious wakeup if
 	 * multiple threads fail allocations before the alloc
 	 * thread wakes, or waiting an extra second in case we
 	 * see an old value of running == true.
 	 */
 	running = atomic_load_int(&ktls_domains[domain].alloc_td.running);
 	if (!running)
 		wakeup(&ktls_domains[domain].alloc_td);
 	return (NULL);
 }
 
 /*
  * Encrypt the single TLS record held in 'm'.
  *
  * Anonymous mbufs (EPG_FLAG_ANON) are encrypted in place.  For other
  * mbufs, destination memory is allocated first: either one buffer
  * from ktls_buffer_alloc() or one wired page per source page.  The
  * physical addresses are recorded in state->parray and the buffer
  * (or NULL) in state->cbuf.  On failure that destination memory is
  * released here.
  *
  * Returns the error code from ktls_ocf_encrypt().
  */
 static int
 ktls_encrypt_record(struct ktls_wq *wq, struct mbuf *m,
     struct ktls_session *tls, struct ktls_ocf_encrypt_state *state)
 {
 	vm_page_t pg;
 	int error, i, len, off;
 
 	KASSERT((m->m_flags & (M_EXTPG | M_NOTREADY)) == (M_EXTPG | M_NOTREADY),
 	    ("%p not unready & nomap mbuf\n", m));
 	KASSERT(ptoa(m->m_epg_npgs) <= ktls_maxlen,
 	    ("page count %d larger than maximum frame length %d", m->m_epg_npgs,
 	    ktls_maxlen));
 
 	/* Anonymous mbufs are encrypted in place. */
 	if ((m->m_epg_flags & EPG_FLAG_ANON) != 0)
 		return (ktls_ocf_encrypt(state, tls, m, NULL, 0));
 
 	/*
 	 * For file-backed mbufs (from sendfile), anonymous wired
 	 * pages are allocated and used as the encryption destination.
 	 */
 	if ((state->cbuf = ktls_buffer_alloc(wq, m)) != NULL) {
 		/*
 		 * One buffer: the whole payload is described by a
 		 * single iovec and only parray[0] is filled in.
 		 */
 		len = ptoa(m->m_epg_npgs - 1) + m->m_epg_last_len -
 		    m->m_epg_1st_off;
 		state->dst_iov[0].iov_base = (char *)state->cbuf +
 		    m->m_epg_1st_off;
 		state->dst_iov[0].iov_len = len;
 		state->parray[0] = DMAP_TO_PHYS((vm_offset_t)state->cbuf);
 		i = 1;
 	} else {
 		/*
 		 * Page-by-page fallback: one iovec per page.  Only the
 		 * first page starts at a non-zero offset.
 		 */
 		off = m->m_epg_1st_off;
 		for (i = 0; i < m->m_epg_npgs; i++, off = 0) {
 			pg = vm_page_alloc_noobj(VM_ALLOC_NODUMP |
 			    VM_ALLOC_WIRED | VM_ALLOC_WAITOK);
 			len = m_epg_pagelen(m, i, off);
 			state->parray[i] = VM_PAGE_TO_PHYS(pg);
 			state->dst_iov[i].iov_base =
 			    (char *)PHYS_TO_DMAP(state->parray[i]) + off;
 			state->dst_iov[i].iov_len = len;
 		}
 	}
 	/* 'i' is the number of payload iovecs; the trailer goes next. */
 	KASSERT(i + 1 <= nitems(state->dst_iov), ("dst_iov is too small"));
 	state->dst_iov[i].iov_base = m->m_epg_trail;
 	state->dst_iov[i].iov_len = m->m_epg_trllen;
 
 	error = ktls_ocf_encrypt(state, tls, m, state->dst_iov, i + 1);
 
 	if (__predict_false(error != 0)) {
 		/* Free the anonymous pages. */
 		if (state->cbuf != NULL)
 			uma_zfree(ktls_buffer_zone, state->cbuf);
 		else {
 			for (i = 0; i < m->m_epg_npgs; i++) {
 				pg = PHYS_TO_VM_PAGE(state->parray[i]);
 				(void)vm_page_unwire_noq(pg);
 				vm_page_free(pg);
 			}
 		}
 	}
 	return (error);
 }
 
 /*
  * Count the TLS records in a batch passed to ktls_enqueue().  Walks
  * the chain from 'm', subtracting each mbuf's m_epg_nrdy from the
  * head's m_epg_enc_cnt until the page budget is used up.
  */
 static u_int
 ktls_batched_records(struct mbuf *m)
 {
 	int nrecords, pages_left;
 
 	nrecords = 0;
 	for (pages_left = m->m_epg_enc_cnt; pages_left > 0; m = m->m_next) {
 		pages_left -= m->m_epg_nrdy;
 		nrecords++;
 	}
 	KASSERT(pages_left == 0, ("%s: mismatched page count", __func__));
 	return (nrecords);
 }
 
 /*
  * Queue a batch of not-yet-encrypted TLS records, headed by 'm', on
  * the software encryption work queue of the mbuf's session.
  *
  * 'so' is stored in the mbuf and 'page_count' is recorded as the
  * batch's page total (m_epg_enc_cnt).  For sessions with
  * sequential_records set, a batch that arrives ahead of its turn is
  * parked on tls->pending_records instead and queued later, once the
  * records preceding it have been queued.
  */
 void
 ktls_enqueue(struct mbuf *m, struct socket *so, int page_count)
 {
 	struct ktls_session *tls;
 	struct ktls_wq *wq;
 	int queued;
 	bool running;
 
 	KASSERT(((m->m_flags & (M_EXTPG | M_NOTREADY)) ==
 	    (M_EXTPG | M_NOTREADY)),
 	    ("ktls_enqueue: %p not unready & nomap mbuf\n", m));
 	KASSERT(page_count != 0, ("enqueueing TLS mbuf with zero page count"));
 
 	KASSERT(m->m_epg_tls->mode == TCP_TLS_MODE_SW, ("!SW TLS mbuf"));
 
 	m->m_epg_enc_cnt = page_count;
 
 	/*
 	 * Save a pointer to the socket.  The caller is responsible
 	 * for taking an additional reference via soref().
 	 */
 	m->m_epg_so = so;
 
 	/* Number of batches placed on the work queue by this call. */
 	queued = 1;
 	tls = m->m_epg_tls;
 	wq = &ktls_wq[tls->wq_index];
 	mtx_lock(&wq->mtx);
 	if (__predict_false(tls->sequential_records)) {
 		/*
 		 * For TLS 1.0, records must be encrypted
 		 * sequentially.  For a given connection, all records
 		 * queued to the associated work queue are processed
 		 * sequentially.  However, sendfile(2) might complete
 		 * I/O requests spanning multiple TLS records out of
 		 * order.  Here we ensure TLS records are enqueued to
 		 * the work queue in FIFO order.
 		 *
 		 * tls->next_seqno holds the sequence number of the
 		 * next TLS record that should be enqueued to the work
 		 * queue.  If this next record is not tls->next_seqno,
 		 * it must be a future record, so insert it, sorted by
 		 * TLS sequence number, into tls->pending_records and
 		 * return.
 		 *
 		 * If this TLS record matches tls->next_seqno, place
 		 * it in the work queue and then check
 		 * tls->pending_records to see if any
 		 * previously-queued records are now ready for
 		 * encryption.
 		 */
 		if (m->m_epg_seqno != tls->next_seqno) {
 			struct mbuf *n, *p;
 
 			/*
 			 * Find the first pending entry 'n' with a larger
 			 * sequence number; 'p' trails one entry behind.
 			 */
 			p = NULL;
 			STAILQ_FOREACH(n, &tls->pending_records, m_epg_stailq) {
 				if (n->m_epg_seqno > m->m_epg_seqno)
 					break;
 				p = n;
 			}
 			if (n == NULL)
 				STAILQ_INSERT_TAIL(&tls->pending_records, m,
 				    m_epg_stailq);
 			else if (p == NULL)
 				STAILQ_INSERT_HEAD(&tls->pending_records, m,
 				    m_epg_stailq);
 			else
 				STAILQ_INSERT_AFTER(&tls->pending_records, p, m,
 				    m_epg_stailq);
 			mtx_unlock(&wq->mtx);
 			counter_u64_add(ktls_cnt_tx_pending, 1);
 			return;
 		}
 
 		tls->next_seqno += ktls_batched_records(m);
 		STAILQ_INSERT_TAIL(&wq->m_head, m, m_epg_stailq);
 
 		/* Move over any pending batches that are now in order. */
 		while (!STAILQ_EMPTY(&tls->pending_records)) {
 			struct mbuf *n;
 
 			n = STAILQ_FIRST(&tls->pending_records);
 			if (n->m_epg_seqno != tls->next_seqno)
 				break;
 
 			queued++;
 			STAILQ_REMOVE_HEAD(&tls->pending_records, m_epg_stailq);
 			tls->next_seqno += ktls_batched_records(n);
 			STAILQ_INSERT_TAIL(&wq->m_head, n, m_epg_stailq);
 		}
 		/* 'queued - 1' batches just left the pending list. */
 		counter_u64_add(ktls_cnt_tx_pending, -(queued - 1));
 	} else
 		STAILQ_INSERT_TAIL(&wq->m_head, m, m_epg_stailq);
 
 	running = wq->running;
 	mtx_unlock(&wq->mtx);
 	if (!running)
 		wakeup(wq);
 	counter_u64_add(ktls_cnt_tx_queued, queued);
 }
 
 /*
  * After a file-backed mbuf (from sendfile) has been encrypted,
  * release its original pages and point it at the anonymous
  * destination pages recorded in 'state' by ktls_encrypt_record().
  * The mbuf is marked EPG_FLAG_ANON afterwards.
  */
 static void
 ktls_finish_nonanon(struct mbuf *m, struct ktls_ocf_encrypt_state *state)
 {
 	int pgidx;
 
 	MPASS((m->m_epg_flags & EPG_FLAG_ANON) == 0);
 
 	/* Release the original pages through the current free routine. */
 	m->m_ext.ext_free(m);
 
 	if (state->cbuf == NULL) {
 		/* Individually allocated pages: one address per slot. */
 		for (pgidx = 0; pgidx < m->m_epg_npgs; pgidx++)
 			m->m_epg_pa[pgidx] = state->parray[pgidx];
 
 		/* Use the basic free routine. */
 		m->m_ext.ext_free = mb_free_mext_pgs;
 	} else {
 		/* Contiguous buffer: derive each page from the base. */
 		for (pgidx = 0; pgidx < m->m_epg_npgs; pgidx++)
 			m->m_epg_pa[pgidx] = state->parray[0] + ptoa(pgidx);
 
 		/* Contig pages should go back to the cache. */
 		m->m_ext.ext_free = ktls_free_mext_contig;
 	}
 
 	/* Pages are now writable. */
 	m->m_epg_flags |= EPG_FLAG_ANON;
 }
 
 /*
  * Encrypt every TLS record of the batch headed by 'top', one record
  * at a time.  On success the records are handed to the protocol's
  * pr_ready method.  On failure the connection is aborted, so_error is
  * set to EIO and the batch is released with mb_free_notready().
  * Either way one socket reference is dropped with sorele();
  * ktls_enqueue() documents that its caller takes that reference.
  */
 static __noinline void
 ktls_encrypt(struct ktls_wq *wq, struct mbuf *top)
 {
 	struct ktls_ocf_encrypt_state state;
 	struct ktls_session *tls;
 	struct socket *so;
 	struct mbuf *m;
 	int error, npages, total_pages;
 
 	so = top->m_epg_so;
 	tls = top->m_epg_tls;
 	KASSERT(tls != NULL, ("tls = NULL, top = %p\n", top));
 	KASSERT(so != NULL, ("so = NULL, top = %p\n", top));
 #ifdef INVARIANTS
 	top->m_epg_so = NULL;
 #endif
 	total_pages = top->m_epg_enc_cnt;
 	npages = 0;
 
 	/*
 	 * Encrypt the TLS records in the chain of mbufs starting with
 	 * 'top'.  'total_pages' gives us a total count of pages and is
 	 * used to know when we have finished encrypting the TLS
 	 * records originally queued with 'top'.
 	 *
 	 * NB: These mbufs are queued in the socket buffer and
 	 * 'm_next' is traversing the mbufs in the socket buffer.  The
 	 * socket buffer lock is not held while traversing this chain.
 	 * Since the mbufs are all marked M_NOTREADY their 'm_next'
 	 * pointers should be stable.  However, the 'm_next' of the
 	 * last mbuf encrypted is not necessarily NULL.  It can point
 	 * to other mbufs appended while 'top' was on the TLS work
 	 * queue.
 	 *
 	 * Each mbuf holds an entire TLS record.
 	 */
 	error = 0;
 	for (m = top; npages != total_pages; m = m->m_next) {
 		KASSERT(m->m_epg_tls == tls,
 		    ("different TLS sessions in a single mbuf chain: %p vs %p",
 		    tls, m->m_epg_tls));
 		/*
 		 * NOTE(review): this asserts on m_epg_npgs while the
 		 * running total below advances by m_epg_nrdy;
 		 * ktls_frame() sets the two equal except for empty
 		 * fragments.
 		 */
 		KASSERT(npages + m->m_epg_npgs <= total_pages,
 		    ("page count mismatch: top %p, total_pages %d, m %p", top,
 		    total_pages, m));
 
 		error = ktls_encrypt_record(wq, m, tls, &state);
 		if (error) {
 			counter_u64_add(ktls_offload_failed_crypto, 1);
 			break;
 		}
 
 		/* Swap in the freshly encrypted destination pages. */
 		if ((m->m_epg_flags & EPG_FLAG_ANON) == 0)
 			ktls_finish_nonanon(m, &state);
 
 		npages += m->m_epg_nrdy;
 
 		/*
 		 * Drop a reference to the session now that it is no
 		 * longer needed.  Existing code depends on encrypted
 		 * records having no associated session vs
 		 * yet-to-be-encrypted records having an associated
 		 * session.
 		 */
 		m->m_epg_tls = NULL;
 		ktls_free(tls);
 	}
 
 	CURVNET_SET(so->so_vnet);
 	if (error == 0) {
 		(void)so->so_proto->pr_ready(so, top, npages);
 	} else {
 		so->so_proto->pr_abort(so);
 		so->so_error = EIO;
 		/* Release the whole batch, not just the encrypted part. */
 		mb_free_notready(top, total_pages);
 	}
 
 	sorele(so);
 	CURVNET_RESTORE();
 }
 
 /*
  * Finish one TLS record after the encryption request described by
  * 'state' has completed with status 'error' (0 on success).
  *
  * Frees 'state', drops the record's TLS session reference, and then
  * either marks the record ready via pr_ready() or, on failure, aborts
  * the connection, sets so_error to EIO and frees the not-ready record.
  * The socket reference carried in state->so is released before
  * returning.
  *
  * NOTE(review): presumably invoked by the OCF backend for requests
  * submitted from ktls_encrypt_async() -- confirm against the caller.
  */
 void
 ktls_encrypt_cb(struct ktls_ocf_encrypt_state *state, int error)
 {
 	struct ktls_session *tls;
 	struct socket *so;
 	struct mbuf *m;
 	int npages;
 
 	m = state->m;
 
 	if ((m->m_epg_flags & EPG_FLAG_ANON) == 0)
 		ktls_finish_nonanon(m, state);
 
 	/* Save the socket pointer first: 'state' is freed right below. */
 	so = state->so;
 	free(state, M_KTLS);
 
 	/*
 	 * Drop a reference to the session now that it is no longer
 	 * needed.  Existing code depends on encrypted records having
 	 * no associated session vs yet-to-be-encrypted records having
 	 * an associated session.
 	 */
 	tls = m->m_epg_tls;
 	m->m_epg_tls = NULL;
 	ktls_free(tls);
 
 	if (error != 0)
 		counter_u64_add(ktls_offload_failed_crypto, 1);
 
 	CURVNET_SET(so->so_vnet);
 	npages = m->m_epg_nrdy;
 
 	if (error == 0) {
 		(void)so->so_proto->pr_ready(so, m, npages);
 	} else {
 		so->so_proto->pr_abort(so);
 		so->so_error = EIO;
 		mb_free_notready(m, npages);
 	}
 
 	/* Release the socket reference that travelled with 'state'. */
 	sorele(so);
 	CURVNET_RESTORE();
 }
 
 /*
  * Similar to ktls_encrypt, but used with asynchronous OCF backends
  * (coprocessors) where encryption does not use host CPU resources and
  * it can be beneficial to queue more requests than CPUs.
  *
  * One heap-allocated state, carrying its own socket reference, is
  * passed to ktls_encrypt_record() for each TLS record in the chain
  * starting at 'top'.  If a submission fails, that record's state and
  * socket reference are released here, the connection is aborted with
  * so_error set to EIO, and mb_free_notready() is called on the failing
  * record with the page count that was not yet submitted.
  */
 static __noinline void
 ktls_encrypt_async(struct ktls_wq *wq, struct mbuf *top)
 {
 	struct ktls_ocf_encrypt_state *state;
 	struct ktls_session *tls;
 	struct socket *so;
 	struct mbuf *m, *n;
 	int error, mpages, npages, total_pages;
 
 	so = top->m_epg_so;
 	tls = top->m_epg_tls;
 	KASSERT(tls != NULL, ("tls = NULL, top = %p\n", top));
 	KASSERT(so != NULL, ("so = NULL, top = %p\n", top));
 #ifdef INVARIANTS
 	top->m_epg_so = NULL;
 #endif
 	total_pages = top->m_epg_enc_cnt;
 	npages = 0;
 
 	error = 0;
 	for (m = top; npages != total_pages; m = n) {
 		KASSERT(m->m_epg_tls == tls,
 		    ("different TLS sessions in a single mbuf chain: %p vs %p",
 		    tls, m->m_epg_tls));
 		KASSERT(npages + m->m_epg_npgs <= total_pages,
 		    ("page count mismatch: top %p, total_pages %d, m %p", top,
 		    total_pages, m));
 
 		/* Per-record state with its own socket reference. */
 		state = malloc(sizeof(*state), M_KTLS, M_WAITOK | M_ZERO);
 		soref(so);
 		state->so = so;
 		state->m = m;
 
 		/*
 		 * Sample the page count and the successor before
 		 * submitting the record.
 		 * NOTE(review): presumably 'm' may be completed and
 		 * handed off by the callback as soon as it is submitted,
 		 * so it must not be read afterwards -- confirm.
 		 */
 		mpages = m->m_epg_nrdy;
 		n = m->m_next;
 
 		error = ktls_encrypt_record(wq, m, tls, state);
 		if (error) {
 			/* Undo the per-record allocation and reference. */
 			counter_u64_add(ktls_offload_failed_crypto, 1);
 			free(state, M_KTLS);
 			CURVNET_SET(so->so_vnet);
 			sorele(so);
 			CURVNET_RESTORE();
 			break;
 		}
 
 		npages += mpages;
 	}
 
 	CURVNET_SET(so->so_vnet);
 	if (error != 0) {
 		so->so_proto->pr_abort(so);
 		so->so_error = EIO;
 		mb_free_notready(m, total_pages - npages);
 	}
 
 	/*
 	 * NOTE(review): releases the reference that came with
 	 * top->m_epg_so, presumably taken when 'top' was queued --
 	 * confirm against the enqueue path.
 	 */
 	sorele(so);
 	CURVNET_RESTORE();
 }
 
 /*
  * Restrict the calling thread to the CPUs of NUMA domain 'domain' and,
  * when that succeeds, make the domain the thread's preferred memory
  * allocation policy.  Returns 0 or the error from cpuset_setthread().
  */
 static int
 ktls_bind_domain(int domain)
 {
 	int rc;
 
 	rc = cpuset_setthread(curthread->td_tid, &cpuset_domain[domain]);
 	if (rc == 0)
 		curthread->td_domain.dr_policy = DOMAINSET_PREF(domain);
 	return (rc);
 }
 
 /*
  * Body of the per-domain KTLS buffer allocation thread.
  *
  * 'ctx' is the struct ktls_domain_info of the domain served; the domain
  * index is its position within ktls_domains[].  The thread tries to
  * bind itself to the domain, publishes "allocs", "wakeups" and
  * "running" under a per-domain sysctl node, and then loops forever:
  * it sleeps on 'sc' until woken, allocates ktls_max_alloc buffers from
  * ktls_buffer_zone with M_WAITOK and immediately frees them back to
  * the zone.  This function never returns.
  */
 static void
 ktls_alloc_thread(void *ctx)
 {
 	struct ktls_domain_info *ktls_domain = ctx;
 	struct ktls_alloc_thread *sc = &ktls_domain->alloc_td;
 	void **buf;
 	struct sysctl_oid *oid;
 	char name[80];
 	int domain, error, i, nbufs;
 
 	domain = ktls_domain - ktls_domains;
 	if (bootverbose)
 		printf("Starting KTLS alloc thread for domain %d\n", domain);
 	/* A binding failure is only reported; the thread keeps running. */
 	error = ktls_bind_domain(domain);
 	if (error)
 		printf("Unable to bind KTLS alloc thread for domain %d: error %d\n",
 		    domain, error);
 	snprintf(name, sizeof(name), "domain%d", domain);
 	oid = SYSCTL_ADD_NODE(NULL, SYSCTL_STATIC_CHILDREN(_kern_ipc_tls), OID_AUTO,
 	    name, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "");
 	SYSCTL_ADD_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "allocs",
 	    CTLFLAG_RD,  &sc->allocs, 0, "buffers allocated");
 	SYSCTL_ADD_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "wakeups",
 	    CTLFLAG_RD,  &sc->wakeups, 0, "thread wakeups");
 	SYSCTL_ADD_INT(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "running",
 	    CTLFLAG_RD,  &sc->running, 0, "thread running");
 
 	buf = NULL;
 	nbufs = 0;
 	for (;;) {
 		/* 'running' is 0 while asleep and 1 while allocating. */
 		atomic_store_int(&sc->running, 0);
 		tsleep(sc, PZERO | PNOLOCK, "-",  0);
 		atomic_store_int(&sc->running, 1);
 		sc->wakeups++;
 		/*
 		 * Resize the scratch pointer array whenever
 		 * ktls_max_alloc no longer matches its current size.
 		 */
 		if (nbufs != ktls_max_alloc) {
 			free(buf, M_KTLS);
 			nbufs = atomic_load_int(&ktls_max_alloc);
 			buf = malloc(sizeof(void *) * nbufs, M_KTLS,
 			    M_WAITOK | M_ZERO);
 		}
 		/*
 		 * Below we allocate nbufs with different allocation
 		 * flags than we use when allocating normally during
 		 * encryption in the ktls worker thread.  We specify
 		 * M_NORECLAIM in the worker thread. However, we omit
 		 * that flag here and add M_WAITOK so that the VM
 		 * system is permitted to perform expensive work to
 		 * defragment memory.  We do this here, as it does not
 		 * matter if this thread blocks.  If we block a ktls
 		 * worker thread, we risk developing backlogs of
 		 * buffers to be encrypted, leading to surges of
 		 * traffic and potential NIC output drops.
 		 */
 		for (i = 0; i < nbufs; i++) {
 			buf[i] = uma_zalloc(ktls_buffer_zone, M_WAITOK);
 			sc->allocs++;
 		}
 		/* Hand every buffer straight back to the zone. */
 		for (i = 0; i < nbufs; i++) {
 			uma_zfree(ktls_buffer_zone, buf[i]);
 			buf[i] = NULL;
 		}
 	}
 }
 
 /*
  * Body of a KTLS worker thread.
  *
  * 'ctx' is the struct ktls_wq served by this thread; the CPU number is
  * its position within ktls_wq[].  After optional CPU/domain binding the
  * thread loops forever: it sleeps until either the mbuf queue or the
  * socket queue is non-empty, moves both queues onto local lists while
  * holding wq->mtx, and then processes the local lists with the mutex
  * released.  This function never returns.
  */
 static void
 ktls_work_thread(void *ctx)
 {
 	struct ktls_wq *wq = ctx;
 	struct mbuf *m, *n;
 	struct socket *so, *son;
 	STAILQ_HEAD(, mbuf) local_m_head;
 	STAILQ_HEAD(, socket) local_so_head;
 	int cpu;
 
 	cpu = wq - ktls_wq;
 	if (bootverbose)
 		printf("Starting KTLS worker thread for CPU %d\n", cpu);
 
 	/*
 	 * Bind to a core.  If ktls_bind_threads is > 1, then
 	 * we bind to the NUMA domain instead.
 	 */
 	if (ktls_bind_threads) {
 		int error;
 
 		if (ktls_bind_threads > 1) {
 			struct pcpu *pc = pcpu_find(cpu);
 
 			error = ktls_bind_domain(pc->pc_domain);
 		} else {
 			cpuset_t mask;
 
 			CPU_SETOF(cpu, &mask);
 			error = cpuset_setthread(curthread->td_tid, &mask);
 		}
 		/* A binding failure is only reported, not fatal. */
 		if (error)
 			printf("Unable to bind KTLS worker thread for CPU %d: error %d\n",
 				cpu, error);
 	}
 #if defined(__aarch64__) || defined(__amd64__) || defined(__i386__)
 	fpu_kern_thread(0);
 #endif
 	for (;;) {
 		mtx_lock(&wq->mtx);
 		/* wq->running is false only while asleep on an empty queue. */
 		while (STAILQ_EMPTY(&wq->m_head) &&
 		    STAILQ_EMPTY(&wq->so_head)) {
 			wq->running = false;
 			mtx_sleep(wq, &wq->mtx, 0, "-", 0);
 			wq->running = true;
 		}
 
 		/* Take over everything queued so far, then drop the lock. */
 		STAILQ_INIT(&local_m_head);
 		STAILQ_CONCAT(&local_m_head, &wq->m_head);
 		STAILQ_INIT(&local_so_head);
 		STAILQ_CONCAT(&local_so_head, &wq->so_head);
 		mtx_unlock(&wq->mtx);
 
 		STAILQ_FOREACH_SAFE(m, &local_m_head, m_epg_stailq, n) {
 			if (m->m_epg_flags & EPG_FLAG_2FREE) {
 				/* Queued only to be freed: no encryption. */
 				ktls_free(m->m_epg_tls);
 				m_free_raw(m);
 			} else {
 				/* Dispatch mode is a per-session property. */
 				if (m->m_epg_tls->sync_dispatch)
 					ktls_encrypt(wq, m);
 				else
 					ktls_encrypt_async(wq, m);
 				counter_u64_add(ktls_cnt_tx_queued, -1);
 			}
 		}
 
 		STAILQ_FOREACH_SAFE(so, &local_so_head, so_ktls_rx_list, son) {
 			ktls_decrypt(so);
 			counter_u64_add(ktls_cnt_rx_queued, -1);
 		}
 	}
 }
 
 #if defined(INET) || defined(INET6)
 /*
  * Task handler queued by ktls_disable_ifnet().
  *
  * 'context' is the TLS session.  With the inpcb write-locked, the
  * socket's transmit TLS mode is switched to software through
  * ktls_set_tx_mode(so, TCP_TLS_MODE_SW), unless the inpcb has already
  * been dropped or the send buffer no longer has TLS info.  On success
  * the TCP stack's tfb_hwtls_change hook, if any, is called with 0.
  *
  * Except for the early return when tls->inp is NULL, the handler
  * releases one socket reference, one inpcb reference and one session
  * reference before returning; ktls_disable_ifnet() takes these before
  * queueing the task.
  */
 static void
 ktls_disable_ifnet_help(void *context, int pending __unused)
 {
 	struct ktls_session *tls;
 	struct inpcb *inp;
 	struct tcpcb *tp;
 	struct socket *so;
 	int err;
 
 	tls = context;
 	inp = tls->inp;
 	if (inp == NULL)
 		return;
 	INP_WLOCK(inp);
 	so = inp->inp_socket;
 	MPASS(so != NULL);
-	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
+	if (inp->inp_flags & INP_DROPPED) {
 		goto out;
 	}
 
 	/* Without TX TLS info there is nothing to switch: count a failure. */
 	if (so->so_snd.sb_tls_info != NULL)
 		err = ktls_set_tx_mode(so, TCP_TLS_MODE_SW);
 	else
 		err = ENXIO;
 	if (err == 0) {
 		counter_u64_add(ktls_ifnet_disable_ok, 1);
 		/* ktls_set_tx_mode() drops inp wlock, so recheck flags */
-		if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) == 0 &&
+		if ((inp->inp_flags & INP_DROPPED) == 0 &&
 		    (tp = intotcpcb(inp)) != NULL &&
 		    tp->t_fb->tfb_hwtls_change != NULL)
 			(*tp->t_fb->tfb_hwtls_change)(tp, 0);
 	} else {
 		counter_u64_add(ktls_ifnet_disable_fail, 1);
 	}
 
 out:
 	sorele(so);
 	/* Unlock only if in_pcbrele_wlocked() did not consume the lock. */
 	if (!in_pcbrele_wlocked(inp))
 		INP_WUNLOCK(inp);
 	ktls_free(tls);
 }
 
 /*
  * Called when re-transmits are becoming a substantial portion of the
  * sends on this connection.  When this happens, we transition the
  * connection to software TLS.  This is needed because most inline TLS
  * NICs keep crypto state only for in-order transmits.  This means
  * that to handle a TCP rexmit (which is out-of-order), the NIC must
  * re-DMA the entire TLS record up to and including the current
  * segment.  This means that when re-transmitting the last ~1448 byte
  * segment of a 16KB TLS record, we could wind up re-DMA'ing an order
  * of magnitude more data than we are sending.  This can cause the
  * PCIe link to saturate well before the network, which can cause
  * output drops, and a general loss of capacity.
  *
  * 'arg' is the connection's tcpcb; the caller must hold the inpcb
  * write lock.  The actual mode switch is deferred to
  * ktls_disable_ifnet_help(), run from taskqueue_thread.  References on
  * the session, the inpcb and the socket are taken here before the task
  * is queued.
  */
 void
 ktls_disable_ifnet(void *arg)
 {
 	struct tcpcb *tp;
 	struct inpcb *inp;
 	struct socket *so;
 	struct ktls_session *tls;
 
 	tp = arg;
 	inp = tp->t_inpcb;
 	INP_WLOCK_ASSERT(inp);
 	so = inp->inp_socket;
 	SOCK_LOCK(so);
 	/*
 	 * NOTE(review): sb_tls_info is dereferenced without a NULL check;
 	 * callers presumably only invoke this on sockets with TX TLS
 	 * enabled -- confirm.
 	 */
 	tls = so->so_snd.sb_tls_info;
 	if (tls->disable_ifnet_pending) {
 		SOCK_UNLOCK(so);
 		return;
 	}
 
 	/*
 	 * note that disable_ifnet_pending is never cleared; disabling
 	 * ifnet can only be done once per session, so we never want
 	 * to do it again
 	 */
 
 	(void)ktls_hold(tls);
 	in_pcbref(inp);
 	soref(so);
 	tls->disable_ifnet_pending = true;
 	tls->inp = inp;
 	SOCK_UNLOCK(so);
 	TASK_INIT(&tls->disable_ifnet_task, 0, ktls_disable_ifnet_help, tls);
 	(void)taskqueue_enqueue(taskqueue_thread, &tls->disable_ifnet_task);
 }
 #endif
diff --git a/sys/netinet/in_pcb.c b/sys/netinet/in_pcb.c
index 9dd6d3d019ca..75889b2011ac 100644
--- a/sys/netinet/in_pcb.c
+++ b/sys/netinet/in_pcb.c
@@ -1,3432 +1,3427 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1982, 1986, 1991, 1993, 1995
  *	The Regents of the University of California.
  * Copyright (c) 2007-2009 Robert N. M. Watson
  * Copyright (c) 2010-2011 Juniper Networks, Inc.
  * All rights reserved.
  *
  * Portions of this software were developed by Robert N. M. Watson under
  * contract to Juniper Networks, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)in_pcb.c	8.4 (Berkeley) 5/24/95
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_ddb.h"
 #include "opt_ipsec.h"
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_ratelimit.h"
 #include "opt_route.h"
 #include "opt_rss.h"
 
 #include <sys/param.h>
 #include <sys/hash.h>
 #include <sys/systm.h>
 #include <sys/libkern.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/callout.h>
 #include <sys/eventhandler.h>
 #include <sys/domain.h>
 #include <sys/protosw.h>
 #include <sys/smp.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sockio.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/refcount.h>
 #include <sys/jail.h>
 #include <sys/kernel.h>
 #include <sys/sysctl.h>
 
 #ifdef DDB
 #include <ddb/ddb.h>
 #endif
 
 #include <vm/uma.h>
 #include <vm/vm.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_types.h>
 #include <net/if_llatbl.h>
 #include <net/route.h>
 #include <net/rss_config.h>
 #include <net/vnet.h>
 
 #if defined(INET) || defined(INET6)
 #include <netinet/in.h>
 #include <netinet/in_pcb.h>
 #include <netinet/in_pcb_var.h>
 #ifdef INET
 #include <netinet/in_var.h>
 #include <netinet/in_fib.h>
 #endif
 #include <netinet/ip_var.h>
 #include <netinet/tcp_var.h>
 #ifdef TCPHPTS
 #include <netinet/tcp_hpts.h>
 #endif
 #include <netinet/udp.h>
 #include <netinet/udp_var.h>
 #ifdef INET6
 #include <netinet/ip6.h>
 #include <netinet6/in6_pcb.h>
 #include <netinet6/in6_var.h>
 #include <netinet6/ip6_var.h>
 #endif /* INET6 */
 #include <net/route/nhop.h>
 #endif
 
 #include <netipsec/ipsec_support.h>
 
 #include <security/mac/mac_framework.h>
 
 /*
  * Capacity limits, in il_inp[] slots, of a load-balance group: new
  * groups start at SIZMIN and are never shrunk when at or below it; a
  * full group whose capacity has reached SIZMAX is not expanded further.
  */
 #define	INPCBLBGROUP_SIZMIN	8
 #define	INPCBLBGROUP_SIZMAX	256
 #define	INP_FREED	0x00000200	/* See in_pcb.h. */
 
 static struct callout	ipport_tick_callout;
 
 /*
  * These configure the range of local port addresses assigned to
  * "unspecified" outgoing connections/packets/whatever.
  */
 VNET_DEFINE(int, ipport_lowfirstauto) = IPPORT_RESERVED - 1;	/* 1023 */
 VNET_DEFINE(int, ipport_lowlastauto) = IPPORT_RESERVEDSTART;	/* 600 */
 VNET_DEFINE(int, ipport_firstauto) = IPPORT_EPHEMERALFIRST;	/* 10000 */
 VNET_DEFINE(int, ipport_lastauto) = IPPORT_EPHEMERALLAST;	/* 65535 */
 VNET_DEFINE(int, ipport_hifirstauto) = IPPORT_HIFIRSTAUTO;	/* 49152 */
 VNET_DEFINE(int, ipport_hilastauto) = IPPORT_HILASTAUTO;	/* 65535 */
 
 /*
  * Reserved ports accessible only to root. There are significant
  * security considerations that must be accounted for when changing these,
  * but the security benefits can be great. Please be careful.
  */
 VNET_DEFINE(int, ipport_reservedhigh) = IPPORT_RESERVED - 1;	/* 1023 */
 VNET_DEFINE(int, ipport_reservedlow);
 
 /* Variables dealing with random ephemeral port allocation. */
 VNET_DEFINE(int, ipport_randomized) = 1;	/* user controlled via sysctl */
 VNET_DEFINE(int, ipport_randomcps) = 10;	/* user controlled via sysctl */
 VNET_DEFINE(int, ipport_randomtime) = 45;	/* user controlled via sysctl */
 VNET_DEFINE(int, ipport_stoprandom);		/* toggled by ipport_tick */
 VNET_DEFINE(int, ipport_tcpallocs);
 VNET_DEFINE_STATIC(int, ipport_tcplastcount);
 
 #define	V_ipport_tcplastcount		VNET(ipport_tcplastcount)
 
 #ifdef INET
 static struct inpcb	*in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo,
 			    struct in_addr faddr, u_int fport_arg,
 			    struct in_addr laddr, u_int lport_arg,
 			    int lookupflags, struct ifnet *ifp,
 			    uint8_t numa_domain);
 
 /*
  * Clamp the lvalue 'var' in place to the inclusive range [min, max].
  * 'var' may be evaluated more than once.  The body is wrapped in
  * do { } while (0) so that the macro behaves as a single statement and
  * cannot capture an 'else' that follows its use.
  */
 #define RANGECHK(var, min, max) do {					\
 	if ((var) < (min))						\
 		(var) = (min);						\
 	else if ((var) > (max))						\
 		(var) = (max);						\
 } while (0)
 
 /*
  * Sysctl handler for the automatic port range knobs.  Once
  * sysctl_handle_int() has succeeded, every range bound is clamped back
  * into its legal interval.  Returns the sysctl_handle_int() result.
  */
 static int
 sysctl_net_ipport_check(SYSCTL_HANDLER_ARGS)
 {
 	int rc;
 
 	rc = sysctl_handle_int(oidp, arg1, arg2, req);
 	if (rc != 0)
 		return (rc);
 
 	/* The low range stays below IPPORT_RESERVED. */
 	RANGECHK(V_ipport_lowfirstauto, 1, IPPORT_RESERVED - 1);
 	RANGECHK(V_ipport_lowlastauto, 1, IPPORT_RESERVED - 1);
 
 	/* The default and high ranges start at IPPORT_RESERVED. */
 	RANGECHK(V_ipport_firstauto, IPPORT_RESERVED, IPPORT_MAX);
 	RANGECHK(V_ipport_lastauto, IPPORT_RESERVED, IPPORT_MAX);
 	RANGECHK(V_ipport_hifirstauto, IPPORT_RESERVED, IPPORT_MAX);
 	RANGECHK(V_ipport_hilastauto, IPPORT_RESERVED, IPPORT_MAX);
 
 	return (0);
 }
 
 #undef RANGECHK
 
 static SYSCTL_NODE(_net_inet_ip, IPPROTO_IP, portrange,
     CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "IP Ports");
 
 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowfirst,
     CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
     &VNET_NAME(ipport_lowfirstauto), 0, &sysctl_net_ipport_check, "I",
     "");
 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowlast,
     CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
     &VNET_NAME(ipport_lowlastauto), 0, &sysctl_net_ipport_check, "I",
     "");
 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, first,
     CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
     &VNET_NAME(ipport_firstauto), 0, &sysctl_net_ipport_check, "I",
     "");
 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, last,
     CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
     &VNET_NAME(ipport_lastauto), 0, &sysctl_net_ipport_check, "I",
     "");
 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hifirst,
     CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
     &VNET_NAME(ipport_hifirstauto), 0, &sysctl_net_ipport_check, "I",
     "");
 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hilast,
     CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
     &VNET_NAME(ipport_hilastauto), 0, &sysctl_net_ipport_check, "I",
     "");
 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedhigh,
 	CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_SECURE,
 	&VNET_NAME(ipport_reservedhigh), 0, "");
 /*
  * ipport_reservedlow is defined with VNET_DEFINE, so its OID carries
  * CTLFLAG_VNET like the other VNET-backed portrange knobs.
  */
 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedlow,
 	CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_SECURE,
 	&VNET_NAME(ipport_reservedlow), 0, "");
 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomized,
 	CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(ipport_randomized), 0, "Enable random port allocation");
 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomcps,
 	CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(ipport_randomcps), 0, "Maximum number of random port "
 	"allocations before switching to a sequential one");
 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomtime,
 	CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(ipport_randomtime), 0,
 	"Minimum time to keep sequential port "
 	"allocation before switching to a random one");
 
 #ifdef RATELIMIT
 counter_u64_t rate_limit_new;
 counter_u64_t rate_limit_chg;
 counter_u64_t rate_limit_active;
 counter_u64_t rate_limit_alloc_fail;
 counter_u64_t rate_limit_set_ok;
 
 static SYSCTL_NODE(_net_inet_ip, OID_AUTO, rl, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
     "IP Rate Limiting");
 SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, active, CTLFLAG_RD,
     &rate_limit_active, "Active rate limited connections");
 SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, alloc_fail, CTLFLAG_RD,
    &rate_limit_alloc_fail, "Rate limited connection failures");
 SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, set_ok, CTLFLAG_RD,
    &rate_limit_set_ok, "Rate limited setting succeeded");
 SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, newrl, CTLFLAG_RD,
    &rate_limit_new, "Total Rate limit new attempts");
 SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, chgrl, CTLFLAG_RD,
    &rate_limit_chg, "Total Rate limited change attempts");
 
 #endif /* RATELIMIT */
 
 #endif /* INET */
 
 VNET_DEFINE(uint32_t, in_pcbhashseed);
 /*
  * Seed the per-VNET in_pcbhashseed with a random value from
  * arc4random().
  */
 static void
 in_pcbhashseed_init(void)
 {
 
 	V_in_pcbhashseed = arc4random();
 }
 VNET_SYSINIT(in_pcbhashseed_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_FIRST,
     in_pcbhashseed_init, 0);
 
 /*
  * in_pcb.c: manage the Protocol Control Blocks.
  *
  * NOTE: It is assumed that most of these functions will be called with
  * the pcbinfo lock held, and often, the inpcb lock held, as these utility
  * functions often modify hash chains or addresses in pcbs.
  */
 
 /*
  * Allocate a zeroed load-balance group with room for 'size' inpcbs,
  * record its identity (address family flags, local port, local address
  * and NUMA domain) and link it at the head of 'hdr'.  The allocation
  * does not sleep; NULL is returned when it fails.
  */
 static struct inpcblbgroup *
 in_pcblbgroup_alloc(struct inpcblbgrouphead *hdr, u_char vflag,
     uint16_t port, const union in_dependaddr *addr, int size,
     uint8_t numa_domain)
 {
 	struct inpcblbgroup *newgrp;
 
 	/* The il_inp[] array is a variable-length tail of the structure. */
 	newgrp = malloc(__offsetof(struct inpcblbgroup, il_inp[size]), M_PCB,
 	    M_ZERO | M_NOWAIT);
 	if (newgrp == NULL)
 		return (NULL);
 
 	newgrp->il_inpsiz = size;
 	newgrp->il_vflag = vflag;
 	newgrp->il_lport = port;
 	newgrp->il_dependladdr = *addr;
 	newgrp->il_numa_domain = numa_domain;
 
 	CK_LIST_INSERT_HEAD(hdr, newgrp, il_list);
 	return (newgrp);
 }
 
 /*
  * Epoch callback: recover the load-balance group that embeds 'ctx' as
  * its il_epoch_ctx member and release its memory.
  */
 static void
 in_pcblbgroup_free_deferred(epoch_context_t ctx)
 {
 
 	free(__containerof(ctx, struct inpcblbgroup, il_epoch_ctx), M_PCB);
 }
 
 /*
  * Unlink 'grp' from its hash chain and schedule its memory to be
  * released by in_pcblbgroup_free_deferred() through NET_EPOCH_CALL(),
  * rather than freeing it immediately.
  */
 static void
 in_pcblbgroup_free(struct inpcblbgroup *grp)
 {
 
 	CK_LIST_REMOVE(grp, il_list);
 	NET_EPOCH_CALL(in_pcblbgroup_free_deferred, &grp->il_epoch_ctx);
 }
 
 /*
  * Replace 'old_grp' with a new group of capacity 'size' that has the
  * same identity and the same members.  The old group is freed once its
  * members have been copied.  Returns the new group, or NULL (leaving
  * 'old_grp' untouched) when the allocation fails.
  */
 static struct inpcblbgroup *
 in_pcblbgroup_resize(struct inpcblbgrouphead *hdr,
     struct inpcblbgroup *old_grp, int size)
 {
 	struct inpcblbgroup *new_grp;
 	int slot;
 
 	new_grp = in_pcblbgroup_alloc(hdr, old_grp->il_vflag,
 	    old_grp->il_lport, &old_grp->il_dependladdr, size,
 	    old_grp->il_numa_domain);
 	if (new_grp == NULL)
 		return (NULL);
 
 	KASSERT(old_grp->il_inpcnt < new_grp->il_inpsiz,
 	    ("invalid new local group size %d and old local group count %d",
 	     new_grp->il_inpsiz, old_grp->il_inpcnt));
 
 	/* Carry every member over in its original order. */
 	slot = 0;
 	while (slot < old_grp->il_inpcnt) {
 		new_grp->il_inp[slot] = old_grp->il_inp[slot];
 		++slot;
 	}
 	new_grp->il_inpcnt = old_grp->il_inpcnt;
 
 	in_pcblbgroup_free(old_grp);
 	return (new_grp);
 }
 
 /*
  * The PCB in slot 'i' is leaving the group: close the gap by moving the
  * following entries down by one, then halve the group's capacity when
  * it is above the minimum and at most a quarter full.  '*grpp' is
  * updated if the group was replaced by a smaller one.
  */
 static void
 in_pcblbgroup_reorder(struct inpcblbgrouphead *hdr, struct inpcblbgroup **grpp,
     int i)
 {
 	struct inpcblbgroup *cur, *shrunk;
 
 	cur = *grpp;
 	while (i + 1 < cur->il_inpcnt) {
 		cur->il_inp[i] = cur->il_inp[i + 1];
 		++i;
 	}
 	cur->il_inpcnt--;
 
 	/* Nothing more to do unless the group qualifies for shrinking. */
 	if (cur->il_inpsiz <= INPCBLBGROUP_SIZMIN ||
 	    cur->il_inpcnt > cur->il_inpsiz / 4)
 		return;
 
 	/* A failed shrink is harmless: the current group stays in use. */
 	shrunk = in_pcblbgroup_resize(hdr, cur, cur->il_inpsiz / 2);
 	if (shrunk != NULL)
 		*grpp = shrunk;
 }
 
 /*
  * Add PCB to load balance group for SO_REUSEPORT_LB option.
  *
  * The group is looked up by address family flags, local port, NUMA
  * domain and local address; it is created when missing and doubled in
  * capacity when full.  The caller must hold both the inpcb write lock
  * and the pcbinfo hash write lock.
  *
  * Returns 0 on success, and also when the PCB is deliberately not
  * added (jailed socket, IPv4-mapped INET6 wildcard socket, or a full
  * group that already reached INPCBLBGROUP_SIZMAX).  Returns ENOBUFS
  * when a group could not be allocated or expanded.
  */
 static int
 in_pcbinslbgrouphash(struct inpcb *inp, uint8_t numa_domain)
 {
 	/* Rate limit for the "limit reached" message: once per 60 seconds. */
 	const static struct timeval interval = { 60, 0 };
 	static struct timeval lastprint;
 	struct inpcbinfo *pcbinfo;
 	struct inpcblbgrouphead *hdr;
 	struct inpcblbgroup *grp;
 	uint32_t idx;
 
 	pcbinfo = inp->inp_pcbinfo;
 
 	INP_WLOCK_ASSERT(inp);
 	INP_HASH_WLOCK_ASSERT(pcbinfo);
 
 	/*
 	 * Don't allow jailed socket to join local group.
 	 */
 	if (inp->inp_socket != NULL && jailed(inp->inp_socket->so_cred))
 		return (0);
 
 #ifdef INET6
 	/*
 	 * Don't allow IPv4 mapped INET6 wild socket.
 	 */
 	if ((inp->inp_vflag & INP_IPV4) &&
 	    inp->inp_laddr.s_addr == INADDR_ANY &&
 	    INP_CHECK_SOCKAF(inp->inp_socket, AF_INET6)) {
 		return (0);
 	}
 #endif
 
 	/* Groups are hashed by local port only. */
 	idx = INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask);
 	hdr = &pcbinfo->ipi_lbgrouphashbase[idx];
 	CK_LIST_FOREACH(grp, hdr, il_list) {
 		if (grp->il_vflag == inp->inp_vflag &&
 		    grp->il_lport == inp->inp_lport &&
 		    grp->il_numa_domain == numa_domain &&
 		    memcmp(&grp->il_dependladdr,
 		    &inp->inp_inc.inc_ie.ie_dependladdr,
 		    sizeof(grp->il_dependladdr)) == 0)
 			break;
 	}
 	if (grp == NULL) {
 		/* Create new load balance group. */
 		grp = in_pcblbgroup_alloc(hdr, inp->inp_vflag,
 		    inp->inp_lport, &inp->inp_inc.inc_ie.ie_dependladdr,
 		    INPCBLBGROUP_SIZMIN, numa_domain);
 		if (grp == NULL)
 			return (ENOBUFS);
 	} else if (grp->il_inpcnt == grp->il_inpsiz) {
 		/* The group is full: refuse at the cap, otherwise grow. */
 		if (grp->il_inpsiz >= INPCBLBGROUP_SIZMAX) {
 			if (ratecheck(&lastprint, &interval))
 				printf("lb group port %d, limit reached\n",
 				    ntohs(grp->il_lport));
 			return (0);
 		}
 
 		/* Expand this local group. */
 		grp = in_pcblbgroup_resize(hdr, grp, grp->il_inpsiz * 2);
 		if (grp == NULL)
 			return (ENOBUFS);
 	}
 
 	KASSERT(grp->il_inpcnt < grp->il_inpsiz,
 	    ("invalid local group size %d and count %d", grp->il_inpsiz,
 	    grp->il_inpcnt));
 
 	/* Append the PCB in the first free slot. */
 	grp->il_inp[grp->il_inpcnt] = inp;
 	grp->il_inpcnt++;
 	return (0);
 }
 
 /*
  * Drop 'inp' from the load-balance group on its local port's hash chain
  * that currently lists it.  The group is freed when inp was its only
  * member; otherwise the remaining members are compacted and the group
  * may be shrunk.  Nothing happens when inp is not found.  The caller
  * must hold the inpcb write lock and the pcbinfo hash write lock.
  */
 static void
 in_pcbremlbgrouphash(struct inpcb *inp)
 {
 	struct inpcbinfo *pcbinfo;
 	struct inpcblbgrouphead *chain;
 	struct inpcblbgroup *grp;
 	int slot;
 
 	pcbinfo = inp->inp_pcbinfo;
 
 	INP_WLOCK_ASSERT(inp);
 	INP_HASH_WLOCK_ASSERT(pcbinfo);
 
 	chain = &pcbinfo->ipi_lbgrouphashbase[
 	    INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask)];
 	CK_LIST_FOREACH(grp, chain, il_list) {
 		for (slot = 0; slot < grp->il_inpcnt; ++slot) {
 			if (grp->il_inp[slot] == inp) {
 				if (grp->il_inpcnt != 1) {
 					/* Close the gap, maybe shrink. */
 					in_pcblbgroup_reorder(chain, &grp,
 					    slot);
 				} else {
 					/* Last member: the group goes too. */
 					in_pcblbgroup_free(grp);
 				}
 				return;
 			}
 		}
 	}
 }
 
 /*
  * Move 'inp' to the load-balance group matching a NUMA domain.
  *
  * 'arg' is TCP_REUSPORT_LB_NUMA_NODOM (selects M_NODOM),
  * TCP_REUSPORT_LB_NUMA_CURDOM (selects the domain of the current CPU)
  * or an explicit domain index in [0, vm_ndomains).
  *
  * The caller must hold the inpcb write lock; the pcbinfo hash write
  * lock is acquired and released here.
  *
  * Returns 0 on success or when inp is already in a group of that
  * domain, EINVAL for an out-of-range domain, ENOENT when inp is not a
  * member of any load-balance group, or the error reported by
  * in_pcbinslbgrouphash() (such as ENOBUFS) when inp could not be added
  * to the new group.
  */
 int
 in_pcblbgroup_numa(struct inpcb *inp, int arg)
 {
 	struct inpcbinfo *pcbinfo;
 	struct inpcblbgrouphead *hdr;
 	struct inpcblbgroup *grp;
 	int err, i;
 	uint8_t numa_domain;
 
 	switch (arg) {
 	case TCP_REUSPORT_LB_NUMA_NODOM:
 		numa_domain = M_NODOM;
 		break;
 	case TCP_REUSPORT_LB_NUMA_CURDOM:
 		numa_domain = PCPU_GET(domain);
 		break;
 	default:
 		if (arg < 0 || arg >= vm_ndomains)
 			return (EINVAL);
 		numa_domain = arg;
 		break;
 	}
 
 	err = 0;
 	pcbinfo = inp->inp_pcbinfo;
 	INP_WLOCK_ASSERT(inp);
 	INP_HASH_WLOCK(pcbinfo);
 	hdr = &pcbinfo->ipi_lbgrouphashbase[
 	    INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask)];
 	CK_LIST_FOREACH(grp, hdr, il_list) {
 		for (i = 0; i < grp->il_inpcnt; ++i) {
 			if (grp->il_inp[i] != inp)
 				continue;
 
 			/* Already in a group of the requested domain. */
 			if (grp->il_numa_domain == numa_domain) {
 				goto abort_with_hash_wlock;
 			}
 
 			/* Remove it from the old group. */
 			in_pcbremlbgrouphash(inp);
 
 			/*
 			 * Add it to the new group based on numa domain.
 			 * A failure is reported to the caller rather than
 			 * silently leaving inp outside of every group.
 			 */
 			err = in_pcbinslbgrouphash(inp, numa_domain);
 			goto abort_with_hash_wlock;
 		}
 	}
 	err = ENOENT;
 abort_with_hash_wlock:
 	INP_HASH_WUNLOCK(pcbinfo);
 	return (err);
 }
 
 /* Make sure it is safe to use hashinit(9) on CK_LIST. */
 CTASSERT(sizeof(struct inpcbhead) == sizeof(LIST_HEAD(, inpcb)));
 
 /*
  * Initialize an inpcbinfo - a per-VNET instance of connections db.
  *
  * 'pcbstor' supplies the lock names and the UMA zones.
  * 'hash_nelements' sizes the connection hash.  'porthash_nelements',
  * capped at IPPORT_MAX + 1, sizes both the port hash and the
  * load-balance group hash.
  */
 void
 in_pcbinfo_init(struct inpcbinfo *pcbinfo, struct inpcbstorage *pcbstor,
     u_int hash_nelements, u_int porthash_nelements)
 {
 
 	mtx_init(&pcbinfo->ipi_lock, pcbstor->ips_infolock_name, NULL, MTX_DEF);
 	mtx_init(&pcbinfo->ipi_hash_lock, pcbstor->ips_hashlock_name,
 	    NULL, MTX_DEF);
 #ifdef VIMAGE
 	pcbinfo->ipi_vnet = curvnet;
 #endif
 	CK_LIST_INIT(&pcbinfo->ipi_listhead);
 	pcbinfo->ipi_count = 0;
 	pcbinfo->ipi_hashbase = hashinit(hash_nelements, M_PCB,
 	    &pcbinfo->ipi_hashmask);
 	/* More port hash buckets than there are ports would be wasted. */
 	porthash_nelements = imin(porthash_nelements, IPPORT_MAX + 1);
 	pcbinfo->ipi_porthashbase = hashinit(porthash_nelements, M_PCB,
 	    &pcbinfo->ipi_porthashmask);
 	pcbinfo->ipi_lbgrouphashbase = hashinit(porthash_nelements, M_PCB,
 	    &pcbinfo->ipi_lbgrouphashmask);
 	pcbinfo->ipi_zone = pcbstor->ips_zone;
 	pcbinfo->ipi_portzone = pcbstor->ips_portzone;
 	/* Cache the SMR state associated with the inpcb zone. */
 	pcbinfo->ipi_smr = uma_zone_get_smr(pcbinfo->ipi_zone);
 }
 
 /*
  * Destroy an inpcbinfo.
  *
  * Asserts that no inpcbs remain (ipi_count == 0), then releases the
  * three hash tables and destroys both mutexes.  The UMA zones are not
  * touched here.
  */
 void
 in_pcbinfo_destroy(struct inpcbinfo *pcbinfo)
 {
 
 	KASSERT(pcbinfo->ipi_count == 0,
 	    ("%s: ipi_count = %u", __func__, pcbinfo->ipi_count));
 
 	hashdestroy(pcbinfo->ipi_hashbase, M_PCB, pcbinfo->ipi_hashmask);
 	hashdestroy(pcbinfo->ipi_porthashbase, M_PCB,
 	    pcbinfo->ipi_porthashmask);
 	hashdestroy(pcbinfo->ipi_lbgrouphashbase, M_PCB,
 	    pcbinfo->ipi_lbgrouphashmask);
 	mtx_destroy(&pcbinfo->ipi_hash_lock);
 	mtx_destroy(&pcbinfo->ipi_lock);
 }
 
 /*
  * Initialize a pcbstorage - per protocol zones to allocate inpcbs.
  *
  * 'arg' is the struct inpcbstorage to set up.  The inpcb zone is
  * created with UMA_ZONE_SMR, and the port zone is made to share the
  * inpcb zone's SMR state.
  */
 static void inpcb_dtor(void *, int, void *);
 static void inpcb_fini(void *, int);
 void
 in_pcbstorage_init(void *arg)
 {
 	struct inpcbstorage *pcbstor = arg;
 
 	pcbstor->ips_zone = uma_zcreate(pcbstor->ips_zone_name,
 	    sizeof(struct inpcb), NULL, inpcb_dtor, pcbstor->ips_pcbinit,
 	    inpcb_fini, UMA_ALIGN_PTR, UMA_ZONE_SMR);
 	pcbstor->ips_portzone = uma_zcreate(pcbstor->ips_portzone_name,
 	    sizeof(struct inpcbport), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
 	uma_zone_set_smr(pcbstor->ips_portzone,
 	    uma_zone_get_smr(pcbstor->ips_zone));
 }
 
 /*
  * Destroy a pcbstorage - used by unloadable protocols.
  *
  * 'arg' is the struct inpcbstorage whose inpcb zone and port zone are
  * destroyed.
  */
 void
 in_pcbstorage_destroy(void *arg)
 {
 	struct inpcbstorage *pcbstor = arg;
 
 	uma_zdestroy(pcbstor->ips_zone);
 	uma_zdestroy(pcbstor->ips_portzone);
 }
 
 /*
  * Allocate a PCB and associate it with the socket.
  * On success return with the PCB locked.
  *
  * Returns 0 on success, ENOBUFS when the inpcb cannot be allocated, or
  * the error from mac_inpcb_init() / ipsec_init_pcbpolicy() when those
  * subsystems are compiled in and fail; in the failure cases the inpcb
  * is returned to its zone and so->so_pcb is not set.
  */
 int
 in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo)
 {
 	struct inpcb *inp;
 #if defined(IPSEC) || defined(IPSEC_SUPPORT) || defined(MAC)
 	int error;
 #endif
 
 	inp = uma_zalloc_smr(pcbinfo->ipi_zone, M_NOWAIT);
 	if (inp == NULL)
 		return (ENOBUFS);
 	/* Only the region starting at inp_start_zero is cleared. */
 	bzero(&inp->inp_start_zero, inp_zero_size);
 #ifdef NUMA
 	inp->inp_numa_domain = M_NODOM;
 #endif
 	inp->inp_pcbinfo = pcbinfo;
 	inp->inp_socket = so;
 	inp->inp_cred = crhold(so->so_cred);
 	inp->inp_inc.inc_fibnum = so->so_fibnum;
 #ifdef MAC
 	error = mac_inpcb_init(inp, M_NOWAIT);
 	if (error != 0)
 		goto out;
 	mac_inpcb_create(so, inp);
 #endif
 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
 	error = ipsec_init_pcbpolicy(inp);
 	if (error != 0) {
 #ifdef MAC
 		mac_inpcb_destroy(inp);
 #endif
 		goto out;
 	}
 #endif /*IPSEC*/
 #ifdef INET6
 	if (INP_SOCKAF(so) == AF_INET6) {
 		inp->inp_vflag |= INP_IPV6PROTO | INP_IPV6;
 		/* Unless v6only is set, an INET6 PCB also handles IPv4. */
 		if (V_ip6_v6only)
 			inp->inp_flags |= IN6P_IPV6_V6ONLY;
 #ifdef INET
 		else
 			inp->inp_vflag |= INP_IPV4;
 #endif
 		if (V_ip6_auto_flowlabel)
 			inp->inp_flags |= IN6P_AUTOFLOWLABEL;
 		inp->in6p_hops = -1;	/* use kernel default */
 	}
 #endif
 #if defined(INET) && defined(INET6)
 	else
 #endif
 #ifdef INET
 		inp->inp_vflag |= INP_IPV4;
 #endif
 	/*
 	 * Routes in inpcb's can cache L2 as well; they are guaranteed
 	 * to be cleaned up.
 	 */
 	inp->inp_route.ro_flags = RT_LLE_CACHE;
 #ifdef TCPHPTS
 	/*
 	 * If using hpts lets drop a random number in so
 	 * not all new connections fall on the same CPU.
 	 */
 	inp->inp_hpts_cpu = hpts_random_cpu(inp);
 #endif
 	refcount_init(&inp->inp_refcount, 1);   /* Reference from socket. */
 	/* Lock the new PCB before it becomes visible on the global list. */
 	INP_WLOCK(inp);
 	INP_INFO_WLOCK(pcbinfo);
 	pcbinfo->ipi_count++;
 	inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
 	CK_LIST_INSERT_HEAD(&pcbinfo->ipi_listhead, inp, inp_list);
 	INP_INFO_WUNLOCK(pcbinfo);
 	so->so_pcb = inp;
 
 	return (0);
 
 #if defined(IPSEC) || defined(IPSEC_SUPPORT) || defined(MAC)
 out:
 	/*
 	 * NOTE(review): inp_cred was crhold()'d above and is not released
 	 * explicitly here; presumably the zone destructor (inpcb_dtor)
 	 * drops it when the inpcb is freed -- confirm.
 	 */
 	uma_zfree_smr(pcbinfo->ipi_zone, inp);
 	return (error);
 #endif
 }
 
 #ifdef INET
 /*
  * Bind an unbound IPv4 PCB to the local address/port described by 'nam'
  * (NULL means let in_pcbbind_setup() choose) and insert it into the
  * hash.  The caller must hold the inpcb write lock and the pcbinfo hash
  * write lock.
  *
  * Returns 0 on success, EINVAL if the PCB already has a local port or
  * address, the error from in_pcbbind_setup(), or EAGAIN (with the
  * local address and port reset) if the hash insertion fails.
  */
 int
 in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred)
 {
 	int error, portless;
 
 	KASSERT(nam == NULL || nam->sa_family == AF_INET,
 	    ("%s: invalid address family for %p", __func__, nam));
 	KASSERT(nam == NULL || nam->sa_len == sizeof(struct sockaddr_in),
 	    ("%s: invalid address length for %p", __func__, nam));
 	INP_WLOCK_ASSERT(inp);
 	INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
 
 	if (inp->inp_lport != 0 || inp->inp_laddr.s_addr != INADDR_ANY)
 		return (EINVAL);
 
 	/* Remember whether the caller left the port unspecified. */
 	portless = 1;
 	if (nam != NULL && ((struct sockaddr_in *)nam)->sin_port != 0)
 		portless = 0;
 
 	error = in_pcbbind_setup(inp, nam, &inp->inp_laddr.s_addr,
 	    &inp->inp_lport, cred);
 	if (error)
 		return (error);
 
 	if (in_pcbinshash(inp) == 0) {
 		if (portless)
 			inp->inp_flags |= INP_ANONPORT;
 		return (0);
 	}
 
 	/* Hash insertion failed: undo the binding. */
 	inp->inp_laddr.s_addr = INADDR_ANY;
 	inp->inp_lport = 0;
 	return (EAGAIN);
 }
 #endif
 
 #if defined(INET) || defined(INET6)
 /*
  * Assign a local port like in_pcb_lport(), but also used with connect()
  * and a foreign address and port.  If fsa is non-NULL, choose a local port
  * that is unused with those, otherwise one that is completely unused.
  * lsa can be NULL for IPv6.
  *
  * The candidate range and the per-pcbinfo "last port" cursor are selected
  * by INP_HIGHPORT / INP_LOWPORT in inp_flags; the low range additionally
  * requires PRIV_NETINET_RESERVEDPORT.  The cursor is advanced as a side
  * effect even when the call fails.
  *
  * On success the selected port is stored in *lportp in network byte order
  * and 0 is returned.  EADDRNOTAVAIL is returned once every port in the
  * range has been tried; a priv_check_cred() error is returned unchanged.
  *
  * NOTE(review): when fsa != NULL the lookup dispatches on lsa->sa_family,
  * so lsa appears to be required in that case -- confirm against callers.
  */
 int
 in_pcb_lport_dest(struct inpcb *inp, struct sockaddr *lsa, u_short *lportp,
     struct sockaddr *fsa, u_short fport, struct ucred *cred, int lookupflags)
 {
 	struct inpcbinfo *pcbinfo;
 	struct inpcb *tmpinp;
 	unsigned short *lastport;
 	int count, dorandom, error;
 	u_short aux, first, last, lport;
 #ifdef INET
 	struct in_addr laddr, faddr;
 #endif
 #ifdef INET6
 	struct in6_addr *laddr6, *faddr6;
 #endif
 
 	pcbinfo = inp->inp_pcbinfo;
 
 	/*
 	 * Because no actual state changes occur here, a global write lock on
 	 * the pcbinfo isn't required.
 	 */
 	INP_LOCK_ASSERT(inp);
 	INP_HASH_LOCK_ASSERT(pcbinfo);
 
 	/* Pick the port range and the matching rotating cursor. */
 	if (inp->inp_flags & INP_HIGHPORT) {
 		first = V_ipport_hifirstauto;	/* sysctl */
 		last  = V_ipport_hilastauto;
 		lastport = &pcbinfo->ipi_lasthi;
 	} else if (inp->inp_flags & INP_LOWPORT) {
 		error = priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT);
 		if (error)
 			return (error);
 		first = V_ipport_lowfirstauto;	/* 1023 */
 		last  = V_ipport_lowlastauto;	/* 600 */
 		lastport = &pcbinfo->ipi_lastlow;
 	} else {
 		first = V_ipport_firstauto;	/* sysctl */
 		last  = V_ipport_lastauto;
 		lastport = &pcbinfo->ipi_lastport;
 	}
 	/*
 	 * For UDP(-Lite), use random port allocation as long as the user
 	 * allows it.  For TCP (and as of yet unknown) connections,
 	 * use random port allocation only if the user allows it AND
 	 * ipport_tick() allows it.
 	 */
 	if (V_ipport_randomized &&
 		(!V_ipport_stoprandom || pcbinfo == &V_udbinfo ||
 		pcbinfo == &V_ulitecbinfo))
 		dorandom = 1;
 	else
 		dorandom = 0;
 	/*
 	 * It makes no sense to do random port allocation if
 	 * we have the only port available.
 	 */
 	if (first == last)
 		dorandom = 0;
 	/* Make sure to not include UDP(-Lite) packets in the count. */
 	if (pcbinfo != &V_udbinfo && pcbinfo != &V_ulitecbinfo)
 		V_ipport_tcpallocs++;
 	/*
 	 * Instead of having two loops further down counting up or down
 	 * make sure that first is always <= last and go with only one
 	 * code path implementing all logic.
 	 */
 	if (first > last) {
 		aux = first;
 		first = last;
 		last = aux;
 	}
 
 #ifdef INET
 	laddr.s_addr = INADDR_ANY;	/* used by INET6+INET below too */
 	/* Only a pure IPv4 pcb takes its addresses from the sockaddrs. */
 	if ((inp->inp_vflag & (INP_IPV4|INP_IPV6)) == INP_IPV4) {
 		if (lsa != NULL)
 			laddr = ((struct sockaddr_in *)lsa)->sin_addr;
 		if (fsa != NULL)
 			faddr = ((struct sockaddr_in *)fsa)->sin_addr;
 	}
 #endif
 #ifdef INET6
 	laddr6 = NULL;
 	if ((inp->inp_vflag & INP_IPV6) != 0) {
 		if (lsa != NULL)
 			laddr6 = &((struct sockaddr_in6 *)lsa)->sin6_addr;
 		if (fsa != NULL)
 			faddr6 = &((struct sockaddr_in6 *)fsa)->sin6_addr;
 	}
 #endif
 
 	tmpinp = NULL;
 	lport = *lportp;
 
 	/*
 	 * Randomize the starting point of the scan.  dorandom was cleared
 	 * above when first == last, so the modulus is never zero here.
 	 */
 	if (dorandom)
 		*lastport = first + (arc4random() % (last - first));
 
 	/*
 	 * count starts at the width of the range; the loop below gives up
 	 * once it drops below zero, i.e. after (last - first + 1) candidates.
 	 */
 	count = last - first;
 
 	do {
 		if (count-- < 0)	/* completely used? */
 			return (EADDRNOTAVAIL);
 		/* Advance the cursor, wrapping back into [first, last]. */
 		++*lastport;
 		if (*lastport < first || *lastport > last)
 			*lastport = first;
 		lport = htons(*lastport);
 
 		if (fsa != NULL) {
 			/*
 			 * A foreign endpoint was given: the candidate only
 			 * conflicts with an existing pcb for the same
 			 * foreign/local address and port combination.
 			 */
 #ifdef INET
 			if (lsa->sa_family == AF_INET) {
 				tmpinp = in_pcblookup_hash_locked(pcbinfo,
 				    faddr, fport, laddr, lport, lookupflags,
 				    NULL, M_NODOM);
 			}
 #endif
 #ifdef INET6
 			if (lsa->sa_family == AF_INET6) {
 				tmpinp = in6_pcblookup_hash_locked(pcbinfo,
 				    faddr6, fport, laddr6, lport, lookupflags,
 				    NULL, M_NODOM);
 			}
 #endif
 		} else {
 			/*
 			 * No foreign endpoint: look for any pcb bound to the
 			 * candidate local port.  An IPv6 pcb that also has
 			 * INP_IPV4 set is checked against the IPv4 bindings
 			 * as well.
 			 */
 #ifdef INET6
 			if ((inp->inp_vflag & INP_IPV6) != 0) {
 				tmpinp = in6_pcblookup_local(pcbinfo,
 				    &inp->in6p_laddr, lport, lookupflags, cred);
 #ifdef INET
 				if (tmpinp == NULL &&
 				    (inp->inp_vflag & INP_IPV4))
 					tmpinp = in_pcblookup_local(pcbinfo,
 					    laddr, lport, lookupflags, cred);
 #endif
 			}
 #endif
 #if defined(INET) && defined(INET6)
 			else
 #endif
 #ifdef INET
 				tmpinp = in_pcblookup_local(pcbinfo, laddr,
 				    lport, lookupflags, cred);
 #endif
 		}
 	} while (tmpinp != NULL);	/* retry while the candidate is taken */
 
 	*lportp = lport;
 
 	return (0);
 }
 
 /*
  * Select a local port (number) to use.
  *
  * IPv4 convenience wrapper around in_pcb_lport_dest() without a foreign
  * endpoint: the optional local address is wrapped in a temporary
  * sockaddr_in and the result of in_pcb_lport_dest() is returned as is.
  */
 int
 in_pcb_lport(struct inpcb *inp, struct in_addr *laddrp, u_short *lportp,
     struct ucred *cred, int lookupflags)
 {
 	struct sockaddr_in sin;
 	struct sockaddr *lsa;
 
 	if (laddrp == NULL) {
 		lsa = NULL;
 	} else {
 		bzero(&sin, sizeof(sin));
 		sin.sin_family = AF_INET;
 		sin.sin_addr = *laddrp;
 		lsa = (struct sockaddr *)&sin;
 	}
 
 	return (in_pcb_lport_dest(inp, lsa, lportp, NULL, 0, cred,
 	    lookupflags));
 }
 
 /*
  * Return cached socket options.
  *
  * Translates the INP_REUSE* bits kept in inp_flags2 into the corresponding
  * SO_REUSE* socket option bits.
  */
 int
 inp_so_options(const struct inpcb *inp)
 {
 	int opts;
 
 	opts = (inp->inp_flags2 & INP_REUSEPORT_LB) != 0 ? SO_REUSEPORT_LB : 0;
 	if ((inp->inp_flags2 & INP_REUSEPORT) != 0)
 		opts |= SO_REUSEPORT;
 	if ((inp->inp_flags2 & INP_REUSEADDR) != 0)
 		opts |= SO_REUSEADDR;
 
 	return (opts);
 }
 #endif /* INET || INET6 */
 
 /*
  * Check if a new BINDMULTI socket is allowed to be created.
  *
  * ni points to the new inp.
  * oi points to the existing inp.
  *
  * Returns 1 when the bind may proceed and 0 when it must be refused.  The
  * bind is refused only if ni has INP_BINDMULTI set and either the owning
  * uids differ or oi does not have INP_BINDMULTI set.
  */
 int
 in_pcbbind_check_bindmulti(const struct inpcb *ni, const struct inpcb *oi)
 {
 	/* Without INP_BINDMULTI on the new inp there is nothing to check. */
 	if ((ni->inp_flags2 & INP_BINDMULTI) == 0)
 		return (1);
 
 	/* Both sockets must belong to the same uid. */
 	if (ni->inp_cred->cr_uid != oi->inp_cred->cr_uid)
 		return (0);
 
 	/* The existing inp must have been bound with BINDMULTI as well. */
 	if ((oi->inp_flags2 & INP_BINDMULTI) == 0)
 		return (0);
 
 	return (1);
 }
 
 #ifdef INET
 /*
  * Set up a bind operation on a PCB, performing port allocation
  * as required, but do not actually modify the PCB. Callers can
  * either complete the bind by setting inp_laddr/inp_lport and
  * calling in_pcbinshash(), or they can just use the resulting
  * port and address to authorise the sending of a once-off packet.
  *
  * nam is the requested sockaddr_in, or NULL to let the jail pick the
  * address and in_pcb_lport() pick the port.  *laddrp and *lportp hold the
  * current local address and port on entry and receive the result.
  *
  * Returns 0 on success.  Errors visible here: EINVAL (an address was
  * supplied while *laddrp is already set, or the port would change),
  * EADDRNOTAVAIL (address is not local and INP_BINDANY is not set),
  * EACCES (reserved port without privilege), EADDRINUSE (conflicting
  * binding), or whatever prison_local_ip4() / in_pcb_lport() return.
  *
  * On error, the values of *laddrp and *lportp are not changed.
  */
 int
 in_pcbbind_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp,
     u_short *lportp, struct ucred *cred)
 {
 	struct socket *so = inp->inp_socket;
 	struct sockaddr_in *sin;
 	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
 	struct in_addr laddr;
 	u_short lport = 0;
 	int lookupflags = 0, reuseport = (so->so_options & SO_REUSEPORT);
 	int error;
 
 	/*
 	 * XXX: Maybe we could let SO_REUSEPORT_LB set SO_REUSEPORT bit here
 	 * so that we don't have to add to the (already messy) code below.
 	 */
 	int reuseport_lb = (so->so_options & SO_REUSEPORT_LB);
 
 	/*
 	 * No state changes, so read locks are sufficient here.
 	 */
 	INP_LOCK_ASSERT(inp);
 	INP_HASH_LOCK_ASSERT(pcbinfo);
 
 	laddr.s_addr = *laddrp;
 	/* An explicit address cannot replace an already set local address. */
 	if (nam != NULL && laddr.s_addr != INADDR_ANY)
 		return (EINVAL);
 	/* With no reuse option set, conflict lookups also match wildcards. */
 	if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT|SO_REUSEPORT_LB)) == 0)
 		lookupflags = INPLOOKUP_WILDCARD;
 	if (nam == NULL) {
 		if ((error = prison_local_ip4(cred, &laddr)) != 0)
 			return (error);
 	} else {
 		sin = (struct sockaddr_in *)nam;
 		KASSERT(sin->sin_family == AF_INET,
 		    ("%s: invalid family for address %p", __func__, sin));
 		KASSERT(sin->sin_len == sizeof(*sin),
 		    ("%s: invalid length for address %p", __func__, sin));
 
 		error = prison_local_ip4(cred, &sin->sin_addr);
 		if (error)
 			return (error);
 		if (sin->sin_port != *lportp) {
 			/* Don't allow the port to change. */
 			if (*lportp != 0)
 				return (EINVAL);
 			lport = sin->sin_port;
 		}
 		/* NB: lport is left as 0 if the port isn't being changed. */
 		if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) {
 			/*
 			 * Treat SO_REUSEADDR as SO_REUSEPORT for multicast;
 			 * allow complete duplication of binding if
 			 * SO_REUSEPORT is set, or if SO_REUSEADDR is set
 			 * and a multicast address is bound on both
 			 * new and duplicated sockets.
 			 */
 			if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) != 0)
 				reuseport = SO_REUSEADDR|SO_REUSEPORT;
 			/*
 			 * XXX: How to deal with SO_REUSEPORT_LB here?
 			 * Treat same as SO_REUSEPORT for now.
 			 */
 			if ((so->so_options &
 			    (SO_REUSEADDR|SO_REUSEPORT_LB)) != 0)
 				reuseport_lb = SO_REUSEADDR|SO_REUSEPORT_LB;
 		} else if (sin->sin_addr.s_addr != INADDR_ANY) {
 			/*
 			 * The caller's sockaddr is modified in place here;
 			 * the requested port was already copied into lport.
 			 */
 			sin->sin_port = 0;		/* yech... */
 			bzero(&sin->sin_zero, sizeof(sin->sin_zero));
 			/*
 			 * Is the address a local IP address?
 			 * If INP_BINDANY is set, then the socket may be bound
 			 * to any endpoint address, local or not.
 			 */
 			if ((inp->inp_flags & INP_BINDANY) == 0 &&
 			    ifa_ifwithaddr_check((struct sockaddr *)sin) == 0)
 				return (EADDRNOTAVAIL);
 		}
 		laddr = sin->sin_addr;
 		if (lport) {
 			struct inpcb *t;
 
 			/* GROSS */
 			if (ntohs(lport) <= V_ipport_reservedhigh &&
 			    ntohs(lport) >= V_ipport_reservedlow &&
 			    priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT))
 				return (EACCES);
 			/*
 			 * Cross-uid check: it applies only to non-multicast
 			 * addresses and only when the pcb's credential lacks
 			 * PRIV_NETINET_REUSEPORT.
 			 */
 			if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr)) &&
 			    priv_check_cred(inp->inp_cred, PRIV_NETINET_REUSEPORT) != 0) {
 				t = in_pcblookup_local(pcbinfo, sin->sin_addr,
 				    lport, INPLOOKUP_WILDCARD, cred);
 	/*
 	 * XXX
 	 * This entire block sorely needs a rewrite.
 	 */
 				if (t &&
 				    ((inp->inp_flags2 & INP_BINDMULTI) == 0) &&
-				    ((t->inp_flags & INP_TIMEWAIT) == 0) &&
 				    (so->so_type != SOCK_STREAM ||
 				     ntohl(t->inp_faddr.s_addr) == INADDR_ANY) &&
 				    (ntohl(sin->sin_addr.s_addr) != INADDR_ANY ||
 				     ntohl(t->inp_laddr.s_addr) != INADDR_ANY ||
 				     (t->inp_flags2 & INP_REUSEPORT) ||
 				     (t->inp_flags2 & INP_REUSEPORT_LB) == 0) &&
 				    (inp->inp_cred->cr_uid !=
 				     t->inp_cred->cr_uid))
 					return (EADDRINUSE);
 
 				/*
 				 * If the socket is a BINDMULTI socket, then
 				 * the credentials need to match and the
 				 * original socket also has to have been bound
 				 * with BINDMULTI.
 				 */
 				if (t && (! in_pcbbind_check_bindmulti(inp, t)))
 					return (EADDRINUSE);
 			}
 			/*
 			 * General conflict check, using the lookup flags
 			 * derived from the socket's reuse options above.
 			 */
 			t = in_pcblookup_local(pcbinfo, sin->sin_addr,
 			    lport, lookupflags, cred);
 			if (t && ((inp->inp_flags2 & INP_BINDMULTI) == 0) &&
 			    (reuseport & inp_so_options(t)) == 0 &&
 			    (reuseport_lb & inp_so_options(t)) == 0) {
 #ifdef INET6
 				if (ntohl(sin->sin_addr.s_addr) !=
 				    INADDR_ANY ||
 				    ntohl(t->inp_laddr.s_addr) !=
 				    INADDR_ANY ||
 				    (inp->inp_vflag & INP_IPV6PROTO) == 0 ||
 				    (t->inp_vflag & INP_IPV6PROTO) == 0)
 #endif
 						return (EADDRINUSE);
 				if (t && (! in_pcbbind_check_bindmulti(inp, t)))
 					return (EADDRINUSE);
 			}
 		}
 	}
 	/* Keep an already assigned port; otherwise allocate one if needed. */
 	if (*lportp != 0)
 		lport = *lportp;
 	if (lport == 0) {
 		error = in_pcb_lport(inp, &laddr, &lport, cred, lookupflags);
 		if (error != 0)
 			return (error);
 	}
 	/* Success: only now are the caller's values updated. */
 	*laddrp = laddr.s_addr;
 	*lportp = lport;
 	return (0);
 }
 
 /*
  * Connect from a socket to a specified address.
  * Both address and port must be specified in argument sin.
  * If don't have a local address for this socket yet,
  * then pick one.
  *
  * The pcb and the pcbinfo hash must both be write locked.  Returns 0 on
  * success, the error from in_pcbconnect_setup(), or EAGAIN when the
  * initial hash insertion of a previously unbound pcb fails.
  *
  * NOTE(review): on the !rehash path the return value of in_pcbinshash()
  * is not checked -- confirm this is intended.
  */
 int
 in_pcbconnect(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred,
     bool rehash)
 {
 	in_addr_t local_addr, remote_addr;
 	u_short local_port, remote_port;
 	int rc, port_was_unset;
 
 	INP_WLOCK_ASSERT(inp);
 	INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
 
 	/* Start from the current binding and let the setup step refine it. */
 	local_addr = inp->inp_laddr.s_addr;
 	local_port = inp->inp_lport;
 	port_was_unset = (local_port == 0);
 
 	rc = in_pcbconnect_setup(inp, nam, &local_addr, &local_port,
 	    &remote_addr, &remote_port, NULL, cred);
 	if (rc != 0)
 		return (rc);
 
 	/* Do the initial binding of the local address if required. */
 	if (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0) {
 		KASSERT(rehash == true,
 		    ("Rehashing required for unbound inps"));
 		inp->inp_lport = local_port;
 		inp->inp_laddr.s_addr = local_addr;
 		if (in_pcbinshash(inp) != 0) {
 			/* Insertion failed: return the pcb to unbound state. */
 			inp->inp_laddr.s_addr = INADDR_ANY;
 			inp->inp_lport = 0;
 			return (EAGAIN);
 		}
 	}
 
 	/* Commit the remaining changes. */
 	inp->inp_lport = local_port;
 	inp->inp_laddr.s_addr = local_addr;
 	inp->inp_faddr.s_addr = remote_addr;
 	inp->inp_fport = remote_port;
 
 	if (rehash)
 		in_pcbrehash(inp);
 	else
 		in_pcbinshash(inp);
 
 	if (port_was_unset)
 		inp->inp_flags |= INP_ANONPORT;
 	return (0);
 }
 
 /*
  * Do proper source address selection on an unbound socket in case
  * of connect. Take jails into account as well.
  *
  * faddr is the destination; the selected source address is written to
  * *laddr.  Must be called inside the network epoch.  Returns 0 on
  * success, ENETUNREACH when no interface address can be found for the
  * destination, or the error from prison_get_ip4() when the jail fallback
  * fails.
  */
 int
 in_pcbladdr(struct inpcb *inp, struct in_addr *faddr, struct in_addr *laddr,
     struct ucred *cred)
 {
 	struct ifaddr *ifa;
 	struct sockaddr *sa;
 	struct sockaddr_in *sin, dst;
 	struct nhop_object *nh;
 	int error;
 
 	NET_EPOCH_ASSERT();
 	KASSERT(laddr != NULL, ("%s: laddr NULL", __func__));
 	/*
 	 * Bypass source address selection and use the primary jail IP
 	 * if requested.
 	 */
 	if (cred != NULL && !prison_saddrsel_ip4(cred, laddr))
 		return (0);
 
 	error = 0;
 
 	/* Build a sockaddr_in for the destination, used by the ifa lookups. */
 	nh = NULL;
 	bzero(&dst, sizeof(dst));
 	sin = &dst;
 	sin->sin_family = AF_INET;
 	sin->sin_len = sizeof(struct sockaddr_in);
 	sin->sin_addr.s_addr = faddr->s_addr;
 
 	/*
 	 * If route is known our src addr is taken from the i/f,
 	 * else punt.
 	 *
 	 * Find out route to destination.
 	 */
 	if ((inp->inp_socket->so_options & SO_DONTROUTE) == 0)
 		nh = fib4_lookup(inp->inp_inc.inc_fibnum, *faddr,
 		    0, NHR_NONE, 0);
 
 	/*
 	 * If we found a route, use the address corresponding to
 	 * the outgoing interface.
 	 *
 	 * Otherwise assume faddr is reachable on a directly connected
 	 * network and try to find a corresponding interface to take
 	 * the source address from.
 	 */
 	if (nh == NULL || nh->nh_ifp == NULL) {
 		struct in_ifaddr *ia;
 		struct ifnet *ifp;
 
 		/* Try a destination-address match first, then a network match. */
 		ia = ifatoia(ifa_ifwithdstaddr((struct sockaddr *)sin,
 					inp->inp_socket->so_fibnum));
 		if (ia == NULL) {
 			ia = ifatoia(ifa_ifwithnet((struct sockaddr *)sin, 0,
 						inp->inp_socket->so_fibnum));
 		}
 		if (ia == NULL) {
 			error = ENETUNREACH;
 			goto done;
 		}
 
 		/* Not restricted by a jail: use the address that was found. */
 		if (cred == NULL || !prison_flag(cred, PR_IP4)) {
 			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
 			goto done;
 		}
 
 		/* Jailed: take the first AF_INET address on ifp the jail owns. */
 		ifp = ia->ia_ifp;
 		ia = NULL;
 		CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 			sa = ifa->ifa_addr;
 			if (sa->sa_family != AF_INET)
 				continue;
 			sin = (struct sockaddr_in *)sa;
 			if (prison_check_ip4(cred, &sin->sin_addr) == 0) {
 				ia = (struct in_ifaddr *)ifa;
 				break;
 			}
 		}
 		if (ia != NULL) {
 			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
 			goto done;
 		}
 
 		/* 3. As a last resort return the 'default' jail address. */
 		error = prison_get_ip4(cred, laddr);
 		goto done;
 	}
 
 	/*
 	 * If the outgoing interface on the route found is not
 	 * a loopback interface, use the address from that interface.
 	 * In case of jails do those three steps:
 	 * 1. check if the interface address belongs to the jail. If so use it.
 	 * 2. check if we have any address on the outgoing interface
 	 *    belonging to this jail. If so use it.
 	 * 3. as a last resort return the 'default' jail address.
 	 */
 	if ((nh->nh_ifp->if_flags & IFF_LOOPBACK) == 0) {
 		struct in_ifaddr *ia;
 		struct ifnet *ifp;
 
 		/* If not jailed, use the default returned. */
 		if (cred == NULL || !prison_flag(cred, PR_IP4)) {
 			ia = (struct in_ifaddr *)nh->nh_ifa;
 			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
 			goto done;
 		}
 
 		/* Jailed. */
 		/* 1. Check if the iface address belongs to the jail. */
 		sin = (struct sockaddr_in *)nh->nh_ifa->ifa_addr;
 		if (prison_check_ip4(cred, &sin->sin_addr) == 0) {
 			ia = (struct in_ifaddr *)nh->nh_ifa;
 			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
 			goto done;
 		}
 
 		/*
 		 * 2. Check if we have any address on the outgoing interface
 		 *    belonging to this jail.
 		 */
 		ia = NULL;
 		ifp = nh->nh_ifp;
 		CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 			sa = ifa->ifa_addr;
 			if (sa->sa_family != AF_INET)
 				continue;
 			sin = (struct sockaddr_in *)sa;
 			if (prison_check_ip4(cred, &sin->sin_addr) == 0) {
 				ia = (struct in_ifaddr *)ifa;
 				break;
 			}
 		}
 		if (ia != NULL) {
 			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
 			goto done;
 		}
 
 		/* 3. As a last resort return the 'default' jail address. */
 		error = prison_get_ip4(cred, laddr);
 		goto done;
 	}
 
 	/*
 	 * The outgoing interface is marked with 'loopback net', so a route
 	 * to ourselves is here.
 	 * Try to find the interface of the destination address and then
 	 * take the address from there. That interface is not necessarily
 	 * a loopback interface.
 	 * In case of jails, check that it is an address of the jail
 	 * and if we cannot find, fall back to the 'default' jail address.
 	 */
 	if ((nh->nh_ifp->if_flags & IFF_LOOPBACK) != 0) {
 		struct in_ifaddr *ia;
 
 		/*
 		 * Three attempts, most specific first: destination address,
 		 * containing network, then exact interface address.  dst is
 		 * used directly because sin may have been repointed above.
 		 */
 		ia = ifatoia(ifa_ifwithdstaddr(sintosa(&dst),
 					inp->inp_socket->so_fibnum));
 		if (ia == NULL)
 			ia = ifatoia(ifa_ifwithnet(sintosa(&dst), 0,
 						inp->inp_socket->so_fibnum));
 		if (ia == NULL)
 			ia = ifatoia(ifa_ifwithaddr(sintosa(&dst)));
 
 		if (cred == NULL || !prison_flag(cred, PR_IP4)) {
 			if (ia == NULL) {
 				error = ENETUNREACH;
 				goto done;
 			}
 			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
 			goto done;
 		}
 
 		/* Jailed. */
 		if (ia != NULL) {
 			struct ifnet *ifp;
 
 			/* Search that interface for an address the jail owns. */
 			ifp = ia->ia_ifp;
 			ia = NULL;
 			CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 				sa = ifa->ifa_addr;
 				if (sa->sa_family != AF_INET)
 					continue;
 				sin = (struct sockaddr_in *)sa;
 				if (prison_check_ip4(cred,
 				    &sin->sin_addr) == 0) {
 					ia = (struct in_ifaddr *)ifa;
 					break;
 				}
 			}
 			if (ia != NULL) {
 				laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
 				goto done;
 			}
 		}
 
 		/* 3. As a last resort return the 'default' jail address. */
 		error = prison_get_ip4(cred, laddr);
 		goto done;
 	}
 
 done:
 	return (error);
 }
 
 /*
  * Set up for a connect from a socket to the specified address.
  * On entry, *laddrp and *lportp should contain the current local
  * address and port for the PCB; these are updated to the values
  * that should be placed in inp_laddr and inp_lport to complete
  * the connect.
  *
  * On success, *faddrp and *fportp will be set to the remote address
  * and port. These are not updated in the error case.
  *
  * If the operation fails because the connection already exists,
  * *oinpp will be set to the PCB of that connection so that the
  * caller can decide to override it. In all other cases, *oinpp
  * is set to NULL.
  *
  * Returns 0 on success, EADDRNOTAVAIL when the destination port is zero
  * or the multicast interface has no usable address, EADDRINUSE when the
  * four-tuple is already in use, or an error propagated from
  * prison_get_ip4(), in_pcbladdr() or in_pcb_lport_dest().
  */
 int
 in_pcbconnect_setup(struct inpcb *inp, struct sockaddr *nam,
     in_addr_t *laddrp, u_short *lportp, in_addr_t *faddrp, u_short *fportp,
     struct inpcb **oinpp, struct ucred *cred)
 {
 	struct sockaddr_in *sin = (struct sockaddr_in *)nam;
 	struct in_ifaddr *ia;
 	struct inpcb *oinp;
 	struct in_addr laddr, faddr;
 	u_short lport, fport;
 	int error;
 
 	KASSERT(sin->sin_family == AF_INET,
 	    ("%s: invalid address family for %p", __func__, sin));
 	KASSERT(sin->sin_len == sizeof(*sin),
 	    ("%s: invalid address length for %p", __func__, sin));
 
 	/*
 	 * Because a global state change doesn't actually occur here, a read
 	 * lock is sufficient.
 	 */
 	NET_EPOCH_ASSERT();
 	INP_LOCK_ASSERT(inp);
 	INP_HASH_LOCK_ASSERT(inp->inp_pcbinfo);
 
 	if (oinpp != NULL)
 		*oinpp = NULL;
 	/* A destination port is mandatory. */
 	if (sin->sin_port == 0)
 		return (EADDRNOTAVAIL);
 	/* Work on local copies; the caller's values are updated on success. */
 	laddr.s_addr = *laddrp;
 	lport = *lportp;
 	faddr = sin->sin_addr;
 	fport = sin->sin_port;
 #ifdef ROUTE_MPATH
 	/*
 	 * NOTE(review): unlike the address/port outputs, the flow id and
 	 * flow type are stored in the inpcb right here, before any of the
 	 * error returns below.
 	 */
 	if (CALC_FLOWID_OUTBOUND) {
 		uint32_t hash_val, hash_type;
 
 		hash_val = fib4_calc_software_hash(laddr, faddr, 0, fport,
 		    inp->inp_socket->so_proto->pr_protocol, &hash_type);
 
 		inp->inp_flowid = hash_val;
 		inp->inp_flowtype = hash_type;
 	}
 #endif
 	if (!CK_STAILQ_EMPTY(&V_in_ifaddrhead)) {
 		/*
 		 * If the destination address is INADDR_ANY,
 		 * use the primary local address.
 		 * If the supplied address is INADDR_BROADCAST,
 		 * and the primary interface supports broadcast,
 		 * choose the broadcast address for that interface.
 		 */
 		if (faddr.s_addr == INADDR_ANY) {
 			faddr =
 			    IA_SIN(CK_STAILQ_FIRST(&V_in_ifaddrhead))->sin_addr;
 			if (cred != NULL &&
 			    (error = prison_get_ip4(cred, &faddr)) != 0)
 				return (error);
 		} else if (faddr.s_addr == (u_long)INADDR_BROADCAST) {
 			if (CK_STAILQ_FIRST(&V_in_ifaddrhead)->ia_ifp->if_flags &
 			    IFF_BROADCAST)
 				faddr = satosin(&CK_STAILQ_FIRST(
 				    &V_in_ifaddrhead)->ia_broadaddr)->sin_addr;
 		}
 	}
 	if (laddr.s_addr == INADDR_ANY) {
 		/*
 		 * No local address yet: run source address selection.  The
 		 * error is only acted upon after the multicast override
 		 * below, which may replace both laddr and error.
 		 */
 		error = in_pcbladdr(inp, &faddr, &laddr, cred);
 		/*
 		 * If the destination address is multicast and an outgoing
 		 * interface has been set as a multicast option, prefer the
 		 * address of that interface as our source address.
 		 */
 		if (IN_MULTICAST(ntohl(faddr.s_addr)) &&
 		    inp->inp_moptions != NULL) {
 			struct ip_moptions *imo;
 			struct ifnet *ifp;
 
 			imo = inp->inp_moptions;
 			if (imo->imo_multicast_ifp != NULL) {
 				ifp = imo->imo_multicast_ifp;
 				/*
 				 * Find the first address on ifp that the
 				 * credential (if any) may use; ia is NULL
 				 * after the loop when none matched.
 				 */
 				CK_STAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) {
 					if ((ia->ia_ifp == ifp) &&
 					    (cred == NULL ||
 					    prison_check_ip4(cred,
 					    &ia->ia_addr.sin_addr) == 0))
 						break;
 				}
 				if (ia == NULL)
 					error = EADDRNOTAVAIL;
 				else {
 					laddr = ia->ia_addr.sin_addr;
 					error = 0;
 				}
 			}
 		}
 		if (error)
 			return (error);
 	}
 
 	if (lport != 0) {
 		/* Local port already fixed: the four-tuple must be unused. */
 		oinp = in_pcblookup_hash_locked(inp->inp_pcbinfo, faddr,
 		    fport, laddr, lport, 0, NULL, M_NODOM);
 		if (oinp != NULL) {
 			if (oinpp != NULL)
 				*oinpp = oinp;
 			return (EADDRINUSE);
 		}
 	} else {
 		struct sockaddr_in lsin, fsin;
 
 		/*
 		 * No local port yet: allocate one that is unused together
 		 * with the chosen local and foreign addresses.
 		 */
 		bzero(&lsin, sizeof(lsin));
 		bzero(&fsin, sizeof(fsin));
 		lsin.sin_family = AF_INET;
 		lsin.sin_addr = laddr;
 		fsin.sin_family = AF_INET;
 		fsin.sin_addr = faddr;
 		error = in_pcb_lport_dest(inp, (struct sockaddr *) &lsin,
 		    &lport, (struct sockaddr *)& fsin, fport, cred,
 		    INPLOOKUP_WILDCARD);
 		if (error)
 			return (error);
 	}
 	/* Success: publish the results to the caller. */
 	*laddrp = laddr.s_addr;
 	*lportp = lport;
 	*faddrp = faddr.s_addr;
 	*fportp = fport;
 	return (0);
 }
 
 /*
  * Forget the foreign endpoint of an inpcb and rehash it.  The pcb and the
  * pcbinfo hash must both be write locked by the caller.
  */
 void
 in_pcbdisconnect(struct inpcb *inp)
 {
 
 	INP_WLOCK_ASSERT(inp);
 	INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
 
 	/* Clear the foreign port and address, then update the hash. */
 	inp->inp_fport = 0;
 	inp->inp_faddr.s_addr = INADDR_ANY;
 	in_pcbrehash(inp);
 }
 #endif /* INET */
 
 /*
  * in_pcbdetach() is responsibe for disassociating a socket from an inpcb.
  * For most protocols, this will be invoked immediately prior to calling
  * in_pcbfree().  However, with TCP the inpcb may significantly outlive the
  * socket, in which case in_pcbfree() is deferred.
  */
 void
 in_pcbdetach(struct inpcb *inp)
 {
 
 	KASSERT(inp->inp_socket != NULL, ("%s: inp_socket == NULL", __func__));
 
 #ifdef RATELIMIT
 	if (inp->inp_snd_tag != NULL)
 		in_pcbdetach_txrtlmt(inp);
 #endif
 	inp->inp_socket->so_pcb = NULL;
 	inp->inp_socket = NULL;
 }
 
 /*
  * inpcb hash lookups are protected by SMR section.
  *
  * Once the desired pcb has been found, switching from the SMR section to a
  * pcb lock is performed with inp_smr_lock().  We can not use
  * INP_(W|R)LOCK here because SMR is a critical section.
  * In 99%+ cases inp_smr_lock() would obtain the lock immediately.
  *
  * inp_lock() acquires the pcb's rw lock in the mode named by 'lock':
  * a read lock for INPLOOKUP_RLOCKPCB, a write lock otherwise.
  */
 static inline void
 inp_lock(struct inpcb *inp, const inp_lookup_t lock)
 {
 
 	if (lock == INPLOOKUP_RLOCKPCB)
 		rw_rlock(&inp->inp_lock);
 	else
 		rw_wlock(&inp->inp_lock);
 }
 
 /*
  * Release the pcb's rw lock in the mode named by 'lock': a read unlock for
  * INPLOOKUP_RLOCKPCB, a write unlock otherwise.
  */
 static inline void
 inp_unlock(struct inpcb *inp, const inp_lookup_t lock)
 {
 
 	if (lock == INPLOOKUP_RLOCKPCB)
 		rw_runlock(&inp->inp_lock);
 	else
 		rw_wunlock(&inp->inp_lock);
 }
 
 /*
  * Try to take the pcb's rw lock in the mode named by 'lock' and return
  * the result of the underlying rw_try_(r|w)lock() call.
  */
 static inline int
 inp_trylock(struct inpcb *inp, const inp_lookup_t lock)
 {
 
 	if (lock == INPLOOKUP_RLOCKPCB)
 		return (rw_try_rlock(&inp->inp_lock));
 	return (rw_try_wlock(&inp->inp_lock));
 }
 
 /*
  * Drop a reference on a locked pcb using the in_pcbrele_(r|w)locked()
  * variant that matches the lock mode named by 'lock'.
  */
 static inline bool
 in_pcbrele(struct inpcb *inp, const inp_lookup_t lock)
 {
 
 	if (lock == INPLOOKUP_RLOCKPCB)
 		return (in_pcbrele_rlocked(inp));
 	return (in_pcbrele_wlocked(inp));
 }
 
 /*
  * Switch from the SMR section to holding the lock of 'inp'.
  *
  * Must be called inside the pcbinfo's SMR section (asserted); 'lock' must
  * be INPLOOKUP_RLOCKPCB or INPLOOKUP_WLOCKPCB.  The SMR section is exited
  * on every return path.
  *
  * Returns true with the pcb locked in the requested mode, or false with no
  * lock held when the pcb turned out to be freed (INP_FREED set, refcount
  * already zero, or our reference was the last one).
  */
 bool
 inp_smr_lock(struct inpcb *inp, const inp_lookup_t lock)
 {
 
 	MPASS(lock == INPLOOKUP_RLOCKPCB || lock == INPLOOKUP_WLOCKPCB);
 	SMR_ASSERT_ENTERED(inp->inp_pcbinfo->ipi_smr);
 
 	/* Fast path: the lock is available without blocking. */
 	if (__predict_true(inp_trylock(inp, lock))) {
 		if (__predict_false(inp->inp_flags & INP_FREED)) {
 			smr_exit(inp->inp_pcbinfo->ipi_smr);
 			inp_unlock(inp, lock);
 			return (false);
 		}
 		smr_exit(inp->inp_pcbinfo->ipi_smr);
 		return (true);
 	}
 
 	/*
 	 * Slow path: take a reference first so the pcb pointer stays valid
 	 * once the SMR section is exited, then acquire the lock with the
 	 * blocking inp_lock() and drop the temporary reference again.
 	 */
 	if (__predict_true(refcount_acquire_if_not_zero(&inp->inp_refcount))) {
 		smr_exit(inp->inp_pcbinfo->ipi_smr);
 		inp_lock(inp, lock);
 		/* true here means the pcb was unlocked and freed by the release. */
 		if (__predict_false(in_pcbrele(inp, lock)))
 			return (false);
 		/*
 		 * An inp acquired through refcount & lock has certainly not
 		 * gone through uma_zfree().  However, it may already have
 		 * gone through in_pcbfree() and have another reference that
 		 * prevented its release by our in_pcbrele().
 		 */
 		if (__predict_false(inp->inp_flags & INP_FREED)) {
 			inp_unlock(inp, lock);
 			return (false);
 		}
 		return (true);
 	} else {
 		/* Refcount already reached zero: the pcb is being freed. */
 		smr_exit(inp->inp_pcbinfo->ipi_smr);
 		return (false);
 	}
 }
 
 /*
  * inp_next() - inpcb hash/list traversal iterator
  *
  * Requires initialized struct inpcb_iterator for context.
  * The structure can be initialized with INP_ITERATOR() or INP_ALL_ITERATOR().
  *
  * - Iterator can have either write-lock or read-lock semantics, that can not
  *   be changed later.
  * - Iterator can iterate either over all pcbs list (INP_ALL_LIST), or through
  *   a single hash slot.  Note: only rip_input() does the latter.
  * - Iterator may have optional bool matching function.  The matching function
  *   will be executed for each inpcb in the SMR context, so it can not acquire
  *   locks and can safely access only immutable fields of inpcb.
  *
  * A fresh initialized iterator has NULL inpcb in its context and that
  * means that inp_next() call would return the very first inpcb on the list
  * locked with desired semantic.  In all following calls the context pointer
  * shall hold the current inpcb pointer.  The KPI user is not supposed to
  * unlock the current inpcb!  Upon end of traversal inp_next() will return NULL
  * and write NULL to its context.  After end of traversal an iterator can be
  * reused.
  *
  * List traversals have the following features/constraints:
  * - New entries won't be seen, as they are always added to the head of a list.
  * - Removed entries won't stop traversal as long as they are not added to
  *   a different list. This is violated by in_pcbrehash().
  */
 /*
  * Iterator helpers.  II_LIST_FIRST/II_LIST_NEXT pick the list head and the
  * successor linkage depending on whether the iterator walks the list of
  * all pcbs (hash == INP_ALL_LIST) or a single hash slot.  II_LOCK_ASSERT
  * asserts that inp's lock is held in the iterator's lock mode.
  */
 #define	II_LIST_FIRST(ipi, hash)					\
 		(((hash) == INP_ALL_LIST) ?				\
 		    CK_LIST_FIRST(&(ipi)->ipi_listhead) :		\
 		    CK_LIST_FIRST(&(ipi)->ipi_hashbase[(hash)]))
 #define	II_LIST_NEXT(inp, hash)						\
 		(((hash) == INP_ALL_LIST) ?				\
 		    CK_LIST_NEXT((inp), inp_list) :			\
 		    CK_LIST_NEXT((inp), inp_hash))
 #define	II_LOCK_ASSERT(inp, lock)					\
 		rw_assert(&(inp)->inp_lock,				\
 		    (lock) == INPLOOKUP_RLOCKPCB ?  RA_RLOCKED : RA_WLOCKED )
 /*
  * Advance the iterator and return the next matching pcb, locked in the
  * iterator's lock mode, or NULL at the end of the traversal.  On calls
  * after the first, the previously returned pcb (ii->inp) is unlocked
  * before returning.
  */
 struct inpcb *
 inp_next(struct inpcb_iterator *ii)
 {
 	const struct inpcbinfo *ipi = ii->ipi;
 	inp_match_t *match = ii->match;
 	void *ctx = ii->ctx;
 	inp_lookup_t lock = ii->lock;
 	int hash = ii->hash;
 	struct inpcb *inp;
 
 	if (ii->inp == NULL) {		/* First call. */
 		smr_enter(ipi->ipi_smr);
 		/* This is unrolled CK_LIST_FOREACH(). */
 		for (inp = II_LIST_FIRST(ipi, hash);
 		    inp != NULL;
 		    inp = II_LIST_NEXT(inp, hash)) {
 			if (match != NULL && (match)(inp, ctx) == false)
 				continue;
 			/* On success inp_smr_lock() has exited the SMR section. */
 			if (__predict_true(inp_smr_lock(inp, lock)))
 				break;
 			else {
 				/*
 				 * Locking failed and the SMR section was
 				 * exited, so re-enter it and start over from
 				 * the list head.
 				 *
 				 * NOTE(review): the for-increment still runs
 				 * after this branch, so the scan resumes with
 				 * the element after the refetched head --
 				 * confirm this is intended.
 				 */
 				smr_enter(ipi->ipi_smr);
 				MPASS(inp != II_LIST_FIRST(ipi, hash));
 				inp = II_LIST_FIRST(ipi, hash);
 				if (inp == NULL)
 					break;
 			}
 		}
 
 		/* Empty result: we are still inside the SMR section here. */
 		if (inp == NULL)
 			smr_exit(ipi->ipi_smr);
 		else
 			ii->inp = inp;
 
 		return (inp);
 	}
 
 	/* Not a first call. */
 	smr_enter(ipi->ipi_smr);
 restart:
 	/* ii->inp is still locked by us and anchors the traversal. */
 	inp = ii->inp;
 	II_LOCK_ASSERT(inp, lock);
 next:
 	inp = II_LIST_NEXT(inp, hash);
 	if (inp == NULL) {
 		/* End of list: 'found' stores NULL into the iterator. */
 		smr_exit(ipi->ipi_smr);
 		goto found;
 	}
 
 	if (match != NULL && (match)(inp, ctx) == false)
 		goto next;
 
 	if (__predict_true(inp_trylock(inp, lock))) {
 		if (__predict_false(inp->inp_flags & INP_FREED)) {
 			/*
 			 * Entries are never inserted in middle of a list, thus
 			 * as long as we are in SMR, we can continue traversal.
 			 * Jump to 'restart' should yield in the same result,
 			 * but could produce unnecessary looping.  Could this
 			 * looping be unbound?
 			 */
 			inp_unlock(inp, lock);
 			goto next;
 		} else {
 			smr_exit(ipi->ipi_smr);
 			goto found;
 		}
 	}
 
 	/*
 	 * Can't obtain lock immediately, thus going hard.  Once we exit the
 	 * SMR section we can no longer jump to 'next', and our only stable
 	 * anchoring point is ii->inp, which we keep locked for this case, so
 	 * we jump to 'restart'.
 	 */
 	if (__predict_true(refcount_acquire_if_not_zero(&inp->inp_refcount))) {
 		smr_exit(ipi->ipi_smr);
 		inp_lock(inp, lock);
 		/* true: the pcb was unlocked and freed by the release. */
 		if (__predict_false(in_pcbrele(inp, lock))) {
 			smr_enter(ipi->ipi_smr);
 			goto restart;
 		}
 		/*
 		 * See comment in inp_smr_lock().
 		 */
 		if (__predict_false(inp->inp_flags & INP_FREED)) {
 			inp_unlock(inp, lock);
 			smr_enter(ipi->ipi_smr);
 			goto restart;
 		}
 	} else
 		goto next;
 
 found:
 	/* Release the previous element and publish the new one (or NULL). */
 	inp_unlock(ii->inp, lock);
 	ii->inp = inp;
 
 	return (ii->inp);
 }
 
 /*
  * in_pcbref() bumps the reference count on an inpcb in order to maintain
  * stability of an inpcb pointer despite the inpcb lock being released or
  * SMR section exited.
  *
  * To free a reference later in_pcbrele_(r|w)locked() must be performed.
  */
 void
 in_pcbref(struct inpcb *inp)
 {
 	u_int old __diagused;
 
 	old = refcount_acquire(&inp->inp_refcount);
 	KASSERT(old > 0, ("%s: refcount 0", __func__));
 }
 
 /*
  * Drop a refcount on an inpcb elevated using in_pcbref(), potentially
  * freeing the pcb, if the reference was very last.
  *
  * The pcb must be read locked.  Returns false when other references remain
  * (the pcb stays locked), or true when this was the last reference, in
  * which case the pcb has been unlocked and handed to uma_zfree_smr().
  */
 bool
 in_pcbrele_rlocked(struct inpcb *inp)
 {
 
 	INP_RLOCK_ASSERT(inp);
 
 	if (refcount_release(&inp->inp_refcount)) {
 		/* Last reference: the pcb must already be torn down. */
 		MPASS(inp->inp_flags & INP_FREED);
 		MPASS(inp->inp_socket == NULL);
 		MPASS(inp->inp_in_hpts == 0);
 		INP_RUNLOCK(inp);
 		uma_zfree_smr(inp->inp_pcbinfo->ipi_zone, inp);
 		return (true);
 	}
 
 	return (false);
 }
 
 /*
  * Write-locked counterpart of in_pcbrele_rlocked(): drop one reference on
  * a write-locked pcb.  Returns false when other references remain (the pcb
  * stays locked), or true when this was the last reference, in which case
  * the pcb has been unlocked and handed to uma_zfree_smr().
  */
 bool
 in_pcbrele_wlocked(struct inpcb *inp)
 {
 
 	INP_WLOCK_ASSERT(inp);
 
 	if (refcount_release(&inp->inp_refcount)) {
 		/* Last reference: the pcb must already be torn down. */
 		MPASS(inp->inp_flags & INP_FREED);
 		MPASS(inp->inp_socket == NULL);
 		MPASS(inp->inp_in_hpts == 0);
 		INP_WUNLOCK(inp);
 		uma_zfree_smr(inp->inp_pcbinfo->ipi_zone, inp);
 		return (true);
 	}
 
 	return (false);
 }
 
 /*
  * Unconditionally schedule an inpcb to be freed by decrementing its
  * reference count, which should occur only after the inpcb has been detached
  * from its socket.  If another thread holds a temporary reference (acquired
  * using in_pcbref()) then the free is deferred until that reference is
  * released using in_pcbrele_(r|w)locked(), but the inpcb is still unlocked.
  * Almost all work, including removal from global lists, is done in this
  * context, where the pcbinfo lock is held.
  *
  * The pcb must be write locked on entry and is unlocked on return; it must
  * not be used by the caller afterwards.
  */
 void
 in_pcbfree(struct inpcb *inp)
 {
 	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
 #ifdef INET
 	struct ip_moptions *imo;
 #endif
 #ifdef INET6
 	struct ip6_moptions *im6o;
 #endif
 
 	INP_WLOCK_ASSERT(inp);
 	KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__));
 	KASSERT((inp->inp_flags & INP_FREED) == 0,
 	    ("%s: called twice for pcb %p", __func__, inp));
 
 	/* Mark the pcb dead and take it off the list of all pcbs. */
 	inp->inp_flags |= INP_FREED;
 	INP_INFO_WLOCK(pcbinfo);
 	inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
 	pcbinfo->ipi_count--;
 	CK_LIST_REMOVE(inp, inp_list);
 	INP_INFO_WUNLOCK(pcbinfo);
 
 	/* Remove the pcb from the lookup hashes if it is still there. */
 	if (inp->inp_flags & INP_INHASHLIST) {
 		struct inpcbport *phd = inp->inp_phd;
 
 		INP_HASH_WLOCK(pcbinfo);
 		/* XXX: Only do if SO_REUSEPORT_LB set? */
 		in_pcbremlbgrouphash(inp);
 
 		CK_LIST_REMOVE(inp, inp_hash);
 		CK_LIST_REMOVE(inp, inp_portlist);
 		/* Free the port head once its pcb list is empty. */
 		if (CK_LIST_FIRST(&phd->phd_pcblist) == NULL) {
 			CK_LIST_REMOVE(phd, phd_hash);
 			uma_zfree_smr(pcbinfo->ipi_portzone, phd);
 		}
 		INP_HASH_WUNLOCK(pcbinfo);
 		inp->inp_flags &= ~INP_INHASHLIST;
 	}
 
 	RO_INVALIDATE_CACHE(&inp->inp_route);
 #ifdef MAC
 	mac_inpcb_destroy(inp);
 #endif
 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
 	if (inp->inp_sp != NULL)
 		ipsec_delete_pcbpolicy(inp);
 #endif
 	/*
 	 * The multicast option pointers are saved in locals here because
 	 * they are released only after the reference drop below, at which
 	 * point the pcb may already have been handed to the allocator.
 	 */
 #ifdef INET
 	if (inp->inp_options)
 		(void)m_free(inp->inp_options);
 	imo = inp->inp_moptions;
 #endif
 #ifdef INET6
 	if (inp->inp_vflag & INP_IPV6PROTO) {
 		ip6_freepcbopts(inp->in6p_outputopts);
 		im6o = inp->in6p_moptions;
 	} else
 		im6o = NULL;
 #endif
 
 	/*
 	 * Drop our reference.  in_pcbrele_wlocked() unlocks the pcb itself
 	 * when it was the last reference; otherwise unlock it here.
 	 */
 	if (__predict_false(in_pcbrele_wlocked(inp) == false)) {
 		INP_WUNLOCK(inp);
 	}
 #ifdef INET6
 	ip6_freemoptions(im6o);
 #endif
 #ifdef INET
 	inp_freemoptions(imo);
 #endif
 	/* Destruction is finalized in inpcb_dtor(). */
 }
 
 /*
  * Destructor callback for inpcb memory: hands the pcb's credential to
  * crfree().  With INVARIANTS the stale pointer is cleared as well.
  */
 static void
 inpcb_dtor(void *mem, int size, void *arg)
 {
 	struct inpcb *pcb;
 
 	pcb = mem;
 	crfree(pcb->inp_cred);
 #ifdef INVARIANTS
 	pcb->inp_cred = NULL;
 #endif
 }
 
 /*
  * Protocols set up their inpcbs differently (each names the lock its own
  * way), yet teardown is common to all of them: destroy the inpcb lock.
  */
 static void
 inpcb_fini(void *mem, int size)
 {
 	struct inpcb *pcb;
 
 	pcb = mem;
 	INP_LOCK_DESTROY(pcb);
 }
 
 /*
  * in_pcbdrop() removes an inpcb from hashed lists, releasing its address and
  * port reservation, and preventing it from being returned by inpcb lookups.
  *
  * It is used by TCP to mark an inpcb as unused and avoid future packet
  * delivery or event notification when a socket remains open but TCP has
  * closed.  This might occur as a result of a shutdown()-initiated TCP close
  * or a RST on the wire, and allows the port binding to be reused while still
  * maintaining the invariant that so_pcb always points to a valid inpcb until
  * in_pcbdetach().
  *
  * XXXRW: Possibly in_pcbdrop() should also prevent future notifications by
  * in_pcbnotifyall() and in_pcbpurgeif0()?
  */
 void
 in_pcbdrop(struct inpcb *inp)
 {
 
 	INP_WLOCK_ASSERT(inp);
 #ifdef INVARIANTS
 	if (inp->inp_socket != NULL && inp->inp_ppcb != NULL)
 		MPASS(inp->inp_refcount > 1);
 #endif
 
 	/*
 	 * XXXRW: Possibly we should protect the setting of INP_DROPPED with
 	 * the hash lock...?
 	 */
 	inp->inp_flags |= INP_DROPPED;
 	if (inp->inp_flags & INP_INHASHLIST) {
 		struct inpcbport *phd = inp->inp_phd;
 
 		INP_HASH_WLOCK(inp->inp_pcbinfo);
 		in_pcbremlbgrouphash(inp);
 		CK_LIST_REMOVE(inp, inp_hash);
 		CK_LIST_REMOVE(inp, inp_portlist);
 		if (CK_LIST_FIRST(&phd->phd_pcblist) == NULL) {
 			CK_LIST_REMOVE(phd, phd_hash);
 			uma_zfree_smr(inp->inp_pcbinfo->ipi_portzone, phd);
 		}
 		INP_HASH_WUNLOCK(inp->inp_pcbinfo);
 		inp->inp_flags &= ~INP_INHASHLIST;
 	}
 }
 
 #ifdef INET
 /*
  * Common routines to return the socket addresses associated with inpcbs.
  *
  * in_sockaddr() allocates a zeroed sockaddr_in from M_SONAME (M_WAITOK)
  * and fills in the given port and IPv4 address, both stored as passed.
  */
 struct sockaddr *
 in_sockaddr(in_port_t port, struct in_addr *addr_p)
 {
 	struct sockaddr_in *sa;
 
 	sa = malloc(sizeof(*sa), M_SONAME, M_WAITOK | M_ZERO);
 	sa->sin_len = sizeof(*sa);
 	sa->sin_family = AF_INET;
 	sa->sin_port = port;
 	sa->sin_addr = *addr_p;
 
 	return ((struct sockaddr *)sa);
 }
 
 int
 in_getsockaddr(struct socket *so, struct sockaddr **nam)
 {
 	struct inpcb *inp;
 	struct in_addr addr;
 	in_port_t port;
 
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("in_getsockaddr: inp == NULL"));
 
 	INP_RLOCK(inp);
 	port = inp->inp_lport;
 	addr = inp->inp_laddr;
 	INP_RUNLOCK(inp);
 
 	*nam = in_sockaddr(port, &addr);
 	return 0;
 }
 
 int
 in_getpeeraddr(struct socket *so, struct sockaddr **nam)
 {
 	struct inpcb *inp;
 	struct in_addr addr;
 	in_port_t port;
 
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("in_getpeeraddr: inp == NULL"));
 
 	INP_RLOCK(inp);
 	port = inp->inp_fport;
 	addr = inp->inp_faddr;
 	INP_RUNLOCK(inp);
 
 	*nam = in_sockaddr(port, &addr);
 	return 0;
 }
 
 /*
  * Invoke notify(inp, errno) on every IPv4 inpcb of pcbinfo whose foreign
  * address equals faddr and which still has a socket.  The pcb list is
  * walked with the pcbinfo lock held and each pcb is write-locked for the
  * callback.  The pcb is unlocked here only when the callback returns
  * non-NULL; a NULL return is taken to mean the callback already dealt
  * with the lock (NOTE(review): confirm against the notify callbacks).
  */
 void
 in_pcbnotifyall(struct inpcbinfo *pcbinfo, struct in_addr faddr, int errno,
     struct inpcb *(*notify)(struct inpcb *, int))
 {
 	struct inpcb *cur, *next;
 
 	INP_INFO_WLOCK(pcbinfo);
 	CK_LIST_FOREACH_SAFE(cur, &pcbinfo->ipi_listhead, inp_list, next) {
 		INP_WLOCK(cur);
 		/* Skip non-IPv4, non-matching and socketless pcbs. */
 		if (
 #ifdef INET6
 		    (cur->inp_vflag & INP_IPV4) == 0 ||
 #endif
 		    cur->inp_faddr.s_addr != faddr.s_addr ||
 		    cur->inp_socket == NULL) {
 			INP_WUNLOCK(cur);
 			continue;
 		}
 		if ((*notify)(cur, errno) != NULL)
 			INP_WUNLOCK(cur);
 	}
 	INP_INFO_WUNLOCK(pcbinfo);
 }
 
 /*
  * Iterator match callback: accept only IPv4 inpcbs that carry multicast
  * options.  The opaque argument is unused.
  */
 static bool
 inp_v4_multi_match(const struct inpcb *inp, void *v __unused)
 {
 
 	return ((inp->inp_vflag & INP_IPV4) != 0 &&
 	    inp->inp_moptions != NULL);
 }
 
 /*
  * Purge references to the interface ifp from the IPv4 multicast options
  * of every inpcb in pcbinfo.  Only inpcbs accepted by inp_v4_multi_match()
  * are visited, so inp_moptions is non-NULL for each of them.  The caller
  * must hold the IN_MULTI lock (asserted below); each inpcb is handed out
  * write-locked by the iterator.
  */
 void
 in_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp)
 {
 	struct inpcb_iterator inpi = INP_ITERATOR(pcbinfo, INPLOOKUP_WLOCKPCB,
 	    inp_v4_multi_match, NULL);
 	struct inpcb *inp;
 	struct in_multi *inm;
 	struct in_mfilter *imf;
 	struct ip_moptions *imo;
 
 	IN_MULTI_LOCK_ASSERT();
 
 	while ((inp = inp_next(&inpi)) != NULL) {
 		INP_WLOCK_ASSERT(inp);
 
 		imo = inp->inp_moptions;
 		/*
 		 * Unselect the outgoing interface if it is being
 		 * detached.
 		 */
 		if (imo->imo_multicast_ifp == ifp)
 			imo->imo_multicast_ifp = NULL;
 
 		/*
 		 * Drop multicast group membership if we joined
 		 * through the interface being detached.
 		 *
 		 * XXX This can all be deferred to an epoch_call
 		 */
 		/*
 		 * Removing a filter modifies the list being walked, so the
 		 * scan is restarted from the head after every removal.
 		 */
 restart:
 		IP_MFILTER_FOREACH(imf, &imo->imo_head) {
 			if ((inm = imf->imf_inm) == NULL)
 				continue;
 			if (inm->inm_ifp != ifp)
 				continue;
 			ip_mfilter_remove(&imo->imo_head, imf);
 			in_leavegroup_locked(inm, NULL);
 			ip_mfilter_free(imf);
 			goto restart;
 		}
 	}
 }
 
 /*
  * Lookup a PCB based on the local address and port.  Caller must hold the
  * hash lock.  No inpcb locks or references are acquired.
  *
  * Without INPLOOKUP_WILDCARD only an unconnected pcb (foreign address
  * INADDR_ANY) whose local address and port match exactly is returned.
  * With INPLOOKUP_WILDCARD a best-fit search over all pcbs bound to lport
  * is performed: each candidate is given a "wildcard" cost and the pcb with
  * the lowest cost wins, a cost of zero ending the search at once.
  *
  * When cred is non-NULL, only pcbs for which prison_equal_ip4() holds
  * between the pcb's prison and cred's prison are considered.
  *
  * Returns the matching pcb, or NULL when there is none.
  */
 #define INP_LOOKUP_MAPPED_PCB_COST	3
 struct inpcb *
 in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr,
     u_short lport, int lookupflags, struct ucred *cred)
 {
 	struct inpcb *inp;
 	/*
 	 * matchwild starts above the highest cost a candidate can reach, so
 	 * that the first acceptable candidate always becomes the match.
 	 */
 #ifdef INET6
 	int matchwild = 3 + INP_LOOKUP_MAPPED_PCB_COST;
 #else
 	int matchwild = 3;
 #endif
 	int wildcard;
 
 	KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0,
 	    ("%s: invalid lookup flags %d", __func__, lookupflags));
 	INP_HASH_LOCK_ASSERT(pcbinfo);
 
 	if ((lookupflags & INPLOOKUP_WILDCARD) == 0) {
 		struct inpcbhead *head;
 		/*
 		 * Look for an unconnected (wildcard foreign addr) PCB that
 		 * matches the local address and port we're looking for.
 		 */
 		head = &pcbinfo->ipi_hashbase[INP_PCBHASH_WILD(lport,
 		    pcbinfo->ipi_hashmask)];
 		CK_LIST_FOREACH(inp, head, inp_hash) {
 #ifdef INET6
 			/* XXX inp locking */
 			if ((inp->inp_vflag & INP_IPV4) == 0)
 				continue;
 #endif
 			if (inp->inp_faddr.s_addr == INADDR_ANY &&
 			    inp->inp_laddr.s_addr == laddr.s_addr &&
 			    inp->inp_lport == lport) {
 				/*
 				 * Found?
 				 */
 				if (cred == NULL ||
 				    prison_equal_ip4(cred->cr_prison,
 					inp->inp_cred->cr_prison))
 					return (inp);
 			}
 		}
 		/*
 		 * Not found.
 		 */
 		return (NULL);
 	} else {
 		struct inpcbporthead *porthash;
 		struct inpcbport *phd;
 		struct inpcb *match = NULL;
 		/*
 		 * Best fit PCB lookup.
 		 *
 		 * First see if this local port is in use by looking on the
 		 * port hash list.
 		 */
 		porthash = &pcbinfo->ipi_porthashbase[INP_PCBPORTHASH(lport,
 		    pcbinfo->ipi_porthashmask)];
 		CK_LIST_FOREACH(phd, porthash, phd_hash) {
 			if (phd->phd_port == lport)
 				break;
 		}
 		if (phd != NULL) {
 			/*
 			 * Port is in use by one or more PCBs. Look for best
 			 * fit.
 			 */
 			CK_LIST_FOREACH(inp, &phd->phd_pcblist, inp_portlist) {
 				wildcard = 0;
 				if (cred != NULL &&
 				    !prison_equal_ip4(inp->inp_cred->cr_prison,
 					cred->cr_prison))
 					continue;
 #ifdef INET6
 				/* XXX inp locking */
 				if ((inp->inp_vflag & INP_IPV4) == 0)
 					continue;
 				/*
 				 * We never select the PCB that has
 				 * INP_IPV6 flag and is bound to :: if
 				 * we have another PCB which is bound
 				 * to 0.0.0.0.  If a PCB has the
 				 * INP_IPV6 flag, then we set its cost
 				 * higher than IPv4 only PCBs.
 				 *
 				 * Note that the case only happens
 				 * when a socket is bound to ::, under
 				 * the condition that the use of the
 				 * mapped address is allowed.
 				 */
 				if ((inp->inp_vflag & INP_IPV6) != 0)
 					wildcard += INP_LOOKUP_MAPPED_PCB_COST;
 #endif
 				/* A connected pcb costs one point. */
 				if (inp->inp_faddr.s_addr != INADDR_ANY)
 					wildcard++;
 				/*
 				 * One point when exactly one side has a
 				 * wildcard local address; two different
 				 * specific local addresses never match.
 				 */
 				if (inp->inp_laddr.s_addr != INADDR_ANY) {
 					if (laddr.s_addr == INADDR_ANY)
 						wildcard++;
 					else if (inp->inp_laddr.s_addr != laddr.s_addr)
 						continue;
 				} else {
 					if (laddr.s_addr != INADDR_ANY)
 						wildcard++;
 				}
 				/* Keep the cheapest candidate seen so far. */
 				if (wildcard < matchwild) {
 					match = inp;
 					matchwild = wildcard;
 					if (matchwild == 0)
 						break;
 				}
 			}
 		}
 		return (match);
 	}
 }
 #undef INP_LOOKUP_MAPPED_PCB_COST
 
 /*
  * Pick a member of a load balance group for the given 4-tuple.  The
  * groups on the hash chain selected by lport are scanned; within a
  * candidate group the member index is INP_PCBLBGROUP_PKTHASH(faddr, lport,
  * fport) modulo the group's member count.
  *
  * A group bound to laddr is returned immediately when numa_domain is
  * M_NODOM or equals the group's domain; a group bound to laddr in another
  * domain is remembered as a fallback.  Groups bound to INADDR_ANY are
  * remembered only when INPLOOKUP_WILDCARD is set.  Returns NULL when no
  * group qualifies.  The caller must hold the hash lock (asserted below).
  */
 static struct inpcb *
 in_pcblookup_lbgroup(const struct inpcbinfo *pcbinfo,
     const struct in_addr *laddr, uint16_t lport, const struct in_addr *faddr,
     uint16_t fport, int lookupflags, int numa_domain)
 {
 	struct inpcb *local_wild, *numa_wild;
 	const struct inpcblbgrouphead *hdr;
 	struct inpcblbgroup *grp;
 	uint32_t idx;
 
 	INP_HASH_LOCK_ASSERT(pcbinfo);
 
 	hdr = &pcbinfo->ipi_lbgrouphashbase[
 	    INP_PCBPORTHASH(lport, pcbinfo->ipi_lbgrouphashmask)];
 
 	/*
 	 * Order of socket selection:
 	 * 1. non-wild.
 	 * 2. wild (if lookupflags contains INPLOOKUP_WILDCARD).
 	 *
 	 * NOTE:
 	 * - Load balanced group does not contain jailed sockets
 	 * - Load balanced group does not contain IPv4 mapped INET6 wild sockets
 	 */
 	local_wild = NULL;
 	numa_wild = NULL;
 	CK_LIST_FOREACH(grp, hdr, il_list) {
 #ifdef INET6
 		if (!(grp->il_vflag & INP_IPV4))
 			continue;
 #endif
 		if (grp->il_lport != lport)
 			continue;
 
 		/*
 		 * NOTE(review): the modulo assumes il_inpcnt is never zero
 		 * for a group on the list -- confirm against the group
 		 * insertion/removal code.
 		 */
 		idx = INP_PCBLBGROUP_PKTHASH(faddr, lport, fport) %
 		    grp->il_inpcnt;
 		if (grp->il_laddr.s_addr == laddr->s_addr) {
 			if (numa_domain == M_NODOM ||
 			    grp->il_numa_domain == numa_domain) {
 				return (grp->il_inp[idx]);
 			} else {
 				/* Exact address, other domain: fallback. */
 				numa_wild = grp->il_inp[idx];
 			}
 		}
 		/*
 		 * Wildcard-bound group: the first one seen is always kept,
 		 * and a later one replaces it when no domain was requested
 		 * or its domain matches.
 		 */
 		if (grp->il_laddr.s_addr == INADDR_ANY &&
 		    (lookupflags & INPLOOKUP_WILDCARD) != 0 &&
 		    (local_wild == NULL || numa_domain == M_NODOM ||
 			grp->il_numa_domain == numa_domain)) {
 			local_wild = grp->il_inp[idx];
 		}
 	}
 	if (numa_wild != NULL)
 		return (numa_wild);
 
 	return (local_wild);
 }
 
 /*
  * Lookup PCB in hash list, using pcbinfo tables.  This variation assumes
  * that the caller has either locked the hash list, which usually happens
  * for bind(2) operations, or is in SMR section, which happens when sorting
  * out incoming packets.
  *
  * The search proceeds in three stages: an exact 4-tuple match, then (with
  * INPLOOKUP_WILDCARD) a load balance group, then (with INPLOOKUP_WILDCARD)
  * the wildcard hash chain for lport.  fport_arg and lport_arg are
  * narrowed to u_short before use.  The ifp argument is not referenced in
  * this function.  No inpcb lock or reference is taken; NULL is returned
  * when nothing matches.
  */
 static struct inpcb *
 in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr,
     u_int fport_arg, struct in_addr laddr, u_int lport_arg, int lookupflags,
     struct ifnet *ifp, uint8_t numa_domain)
 {
 	struct inpcbhead *head;
 	struct inpcb *inp, *tmpinp;
 	u_short fport = fport_arg, lport = lport_arg;
 
 	KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0,
 	    ("%s: invalid lookup flags %d", __func__, lookupflags));
 	INP_HASH_LOCK_ASSERT(pcbinfo);
 
 	/*
 	 * First look for an exact match.
 	 */
 	tmpinp = NULL;
 	head = &pcbinfo->ipi_hashbase[INP_PCBHASH(&faddr, lport, fport,
 	    pcbinfo->ipi_hashmask)];
 	CK_LIST_FOREACH(inp, head, inp_hash) {
 #ifdef INET6
 		/* XXX inp locking */
 		if ((inp->inp_vflag & INP_IPV4) == 0)
 			continue;
 #endif
 		if (inp->inp_faddr.s_addr == faddr.s_addr &&
 		    inp->inp_laddr.s_addr == laddr.s_addr &&
 		    inp->inp_fport == fport &&
 		    inp->inp_lport == lport) {
 			/*
 			 * XXX We should be able to directly return
 			 * the inp here, without any checks.
 			 * Well unless both bound with SO_REUSEPORT?
 			 */
 			/*
 			 * A pcb whose credential has PR_IP4 set wins at
 			 * once; otherwise the first match is remembered.
 			 */
 			if (prison_flag(inp->inp_cred, PR_IP4))
 				return (inp);
 			if (tmpinp == NULL)
 				tmpinp = inp;
 		}
 	}
 	if (tmpinp != NULL)
 		return (tmpinp);
 
 	/*
 	 * Then look in lb group (for wildcard match).
 	 */
 	if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
 		inp = in_pcblookup_lbgroup(pcbinfo, &laddr, lport, &faddr,
 		    fport, lookupflags, numa_domain);
 		if (inp != NULL)
 			return (inp);
 	}
 
 	/*
 	 * Then look for a wildcard match, if requested.
 	 */
 	if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
 		struct inpcb *local_wild = NULL, *local_exact = NULL;
 #ifdef INET6
 		struct inpcb *local_wild_mapped = NULL;
 #endif
 		struct inpcb *jail_wild = NULL;
 		int injail;
 
 		/*
 		 * Order of socket selection - we always prefer jails.
 		 *      1. jailed, non-wild.
 		 *      2. jailed, wild.
 		 *      3. non-jailed, non-wild.
 		 *      4. non-jailed, wild.
 		 */
 
 		head = &pcbinfo->ipi_hashbase[INP_PCBHASH_WILD(lport,
 		    pcbinfo->ipi_hashmask)];
 		CK_LIST_FOREACH(inp, head, inp_hash) {
 #ifdef INET6
 			/* XXX inp locking */
 			if ((inp->inp_vflag & INP_IPV4) == 0)
 				continue;
 #endif
 			if (inp->inp_faddr.s_addr != INADDR_ANY ||
 			    inp->inp_lport != lport)
 				continue;
 
 			/*
 			 * Jailed pcbs are skipped when laddr fails the
 			 * prison's IPv4 check; non-jailed pcbs are skipped
 			 * once a non-jailed exact match has been recorded.
 			 */
 			injail = prison_flag(inp->inp_cred, PR_IP4);
 			if (injail) {
 				if (prison_check_ip4_locked(
 				    inp->inp_cred->cr_prison, &laddr) != 0)
 					continue;
 			} else {
 				if (local_exact != NULL)
 					continue;
 			}
 
 			if (inp->inp_laddr.s_addr == laddr.s_addr) {
 				if (injail)
 					return (inp);
 				else
 					local_exact = inp;
 			} else if (inp->inp_laddr.s_addr == INADDR_ANY) {
 #ifdef INET6
 				/* XXX inp locking, NULL check */
 				if (inp->inp_vflag & INP_IPV6PROTO)
 					local_wild_mapped = inp;
 				else
 #endif
 					if (injail)
 						jail_wild = inp;
 					else
 						local_wild = inp;
 			}
 		} /* LIST_FOREACH */
 		/* Return the best candidate in the order listed above. */
 		if (jail_wild != NULL)
 			return (jail_wild);
 		if (local_exact != NULL)
 			return (local_exact);
 		if (local_wild != NULL)
 			return (local_wild);
 #ifdef INET6
 		if (local_wild_mapped != NULL)
 			return (local_wild_mapped);
 #endif
 	} /* if ((lookupflags & INPLOOKUP_WILDCARD) != 0) */
 
 	return (NULL);
 }
 
 /*
  * Lookup PCB in hash list, using pcbinfo tables.  This variation performs
  * the lookup inside an SMR section and then tries to lock the result as
  * requested by the INPLOOKUP_LOCKMASK bits of lookupflags (i.e., requires
  * INPLOOKUP_LOCKPCB).  Returns the pcb, or NULL when nothing matched or
  * the pcb could not be locked.
  *
  * The SMR section is exited here only when the lookup finds nothing;
  * otherwise leaving it is up to inp_smr_lock() (NOTE(review): confirm that
  * inp_smr_lock() exits the section on both success and failure).
  */
 static struct inpcb *
 in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr,
     u_int fport, struct in_addr laddr, u_int lport, int lookupflags,
     struct ifnet *ifp, uint8_t numa_domain)
 {
 	struct inpcb *found;
 
 	smr_enter(pcbinfo->ipi_smr);
 	found = in_pcblookup_hash_locked(pcbinfo, faddr, fport, laddr, lport,
 	    lookupflags & INPLOOKUP_WILDCARD, ifp, numa_domain);
 	if (found == NULL) {
 		smr_exit(pcbinfo->ipi_smr);
 		return (NULL);
 	}
 
 	if (__predict_false(!inp_smr_lock(found,
 	    (lookupflags & INPLOOKUP_LOCKMASK))))
 		return (NULL);
 
 	return (found);
 }
 
 /*
  * Public inpcb lookup routines, accepting a 4-tuple and, for the _mbuf
  * variant, an mbuf.
  *
  * in_pcblookup() looks up the 4-tuple without a NUMA domain preference
  * (M_NODOM).  lookupflags must stay within INPLOOKUP_MASK and must request
  * either a read or a write lock on the returned pcb.
  */
 struct inpcb *
 in_pcblookup(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport,
     struct in_addr laddr, u_int lport, int lookupflags, struct ifnet *ifp)
 {
 	struct inpcb *inp;
 
 	KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0,
 	    ("%s: invalid lookup flags %d", __func__, lookupflags));
 	KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0,
 	    ("%s: LOCKPCB not set", __func__));
 
 	inp = in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport,
 	    lookupflags, ifp, M_NODOM);
 	return (inp);
 }
 
 /*
  * Same as in_pcblookup(), except that the NUMA domain recorded in the
  * mbuf's packet header is passed on as the preferred domain.
  */
 struct inpcb *
 in_pcblookup_mbuf(struct inpcbinfo *pcbinfo, struct in_addr faddr,
     u_int fport, struct in_addr laddr, u_int lport, int lookupflags,
     struct ifnet *ifp, struct mbuf *m)
 {
 	struct inpcb *inp;
 
 	KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0,
 	    ("%s: invalid lookup flags %d", __func__, lookupflags));
 	KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0,
 	    ("%s: LOCKPCB not set", __func__));
 
 	inp = in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport,
 	    lookupflags, ifp, m->m_pkthdr.numa_domain);
 	return (inp);
 }
 #endif /* INET */
 
 /*
  * Insert PCB onto various hash lists.
  *
  * The caller must hold the inpcb write lock and the pcbinfo hash write
  * lock, and the inpcb must not be hashed yet.  On success the inpcb is on
  * the connection hash and on its port's pcb list (and in a load balance
  * group when SO_REUSEPORT_LB is set), INP_INHASHLIST is set and 0 is
  * returned.  On failure an errno value is returned and the inpcb is left
  * on none of these lists.
  */
 int
 in_pcbinshash(struct inpcb *inp)
 {
 	struct inpcbhead *pcbhash;
 	struct inpcbporthead *pcbporthash;
 	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
 	struct inpcbport *phd;
 	int so_options;
 
 	INP_WLOCK_ASSERT(inp);
 	INP_HASH_WLOCK_ASSERT(pcbinfo);
 
 	KASSERT((inp->inp_flags & INP_INHASHLIST) == 0,
 	    ("in_pcbinshash: INP_INHASHLIST"));
 
 #ifdef INET6
 	if (inp->inp_vflag & INP_IPV6)
 		pcbhash = &pcbinfo->ipi_hashbase[INP6_PCBHASH(&inp->in6p_faddr,
 		    inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)];
 	else
 #endif
 		pcbhash = &pcbinfo->ipi_hashbase[INP_PCBHASH(&inp->inp_faddr,
 		    inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)];
 
 	pcbporthash = &pcbinfo->ipi_porthashbase[
 	    INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_porthashmask)];
 
 	/*
 	 * Add entry to load balance group.
 	 * Only do this if SO_REUSEPORT_LB is set.
 	 */
 	so_options = inp_so_options(inp);
 	if (so_options & SO_REUSEPORT_LB) {
 		int ret = in_pcbinslbgrouphash(inp, M_NODOM);
 		if (ret) {
 			/* pcb lb group malloc fail (ret=ENOBUFS). */
 			return (ret);
 		}
 	}
 
 	/*
 	 * Go through port list and look for a head for this lport.
 	 */
 	CK_LIST_FOREACH(phd, pcbporthash, phd_hash) {
 		if (phd->phd_port == inp->inp_lport)
 			break;
 	}
 	/*
 	 * If none exists, malloc one and tack it on.
 	 */
 	if (phd == NULL) {
 		phd = uma_zalloc_smr(pcbinfo->ipi_portzone, M_NOWAIT);
 		if (phd == NULL) {
 			/*
 			 * Undo the load balance group insertion above.
 			 * INP_INHASHLIST is not set on this path, so
 			 * neither in_pcbfree() nor in_pcbdrop() would
 			 * ever remove the pcb from its group, which
 			 * would leave the group pointing at it.
 			 */
 			if (so_options & SO_REUSEPORT_LB)
 				in_pcbremlbgrouphash(inp);
 			return (ENOBUFS); /* XXX */
 		}
 		phd->phd_port = inp->inp_lport;
 		CK_LIST_INIT(&phd->phd_pcblist);
 		CK_LIST_INSERT_HEAD(pcbporthash, phd, phd_hash);
 	}
 	inp->inp_phd = phd;
 	CK_LIST_INSERT_HEAD(&phd->phd_pcblist, inp, inp_portlist);
 	CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash);
 	inp->inp_flags |= INP_INHASHLIST;
 
 	return (0);
 }
 
 /*
  * Move PCB to the proper hash bucket when { faddr, fport } have been
  * changed.  NOTE: This does not handle the case of the lport changing (the
  * hashed port list would have to be updated as well), so the lport must
  * not change after in_pcbinshash() has been called.
  *
  * The caller holds the inpcb write lock and the hash write lock, and the
  * pcb must already be hashed.
  *
  * XXXGL: a race between this function and SMR-protected hash iterator
  * will lead to iterator traversing a possibly wrong hash list.  However,
  * this race should have been here since change from rwlock to epoch.
  */
 void
 in_pcbrehash(struct inpcb *inp)
 {
 	struct inpcbinfo *pcbinfo;
 	struct inpcbhead *bucket;
 
 	pcbinfo = inp->inp_pcbinfo;
 
 	INP_WLOCK_ASSERT(inp);
 	INP_HASH_WLOCK_ASSERT(pcbinfo);
 
 	KASSERT(inp->inp_flags & INP_INHASHLIST,
 	    ("in_pcbrehash: !INP_INHASHLIST"));
 
 	/* Pick the bucket for the pcb's current addresses and ports. */
 #ifdef INET6
 	if (inp->inp_vflag & INP_IPV6) {
 		bucket = &pcbinfo->ipi_hashbase[INP6_PCBHASH(&inp->in6p_faddr,
 		    inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)];
 	} else
 #endif
 	{
 		bucket = &pcbinfo->ipi_hashbase[INP_PCBHASH(&inp->inp_faddr,
 		    inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)];
 	}
 
 	CK_LIST_REMOVE(inp, inp_hash);
 	CK_LIST_INSERT_HEAD(bucket, inp, inp_hash);
 }
 
 /*
  * Called when a higher level reports service problems, to look for
  * alternatives.  For now this only invalidates the cached routing
  * information, so that a route created dynamically (by a redirect) is
  * dropped and a default gateway gets tried again.
  */
 void
 in_losing(struct inpcb *inp)
 {
 	RO_INVALIDATE_CACHE(&inp->inp_route);
 }
 
 /*
  * The socket layer performed a set label operation; push the label change
  * down into the in_pcb of the socket.  Without MAC this is a no-op.  The
  * inpcb write lock is taken before the socket lock.
  */
 void
 in_pcbsosetlabel(struct socket *so)
 {
 #ifdef MAC
 	struct inpcb *pcb = sotoinpcb(so);
 
 	KASSERT(pcb != NULL, ("in_pcbsosetlabel: so->so_pcb == NULL"));
 
 	INP_WLOCK(pcb);
 	SOCK_LOCK(so);
 	mac_inpcb_sosetlabel(so, pcb);
 	SOCK_UNLOCK(so);
 	INP_WUNLOCK(pcb);
 #endif
 }
 
 /*
  * ipport_tick runs once per second and decides, per vnet, whether random
  * port allocation should go on.  When more than ipport_randomcps ports
  * were allocated during the last second, allocation falls back to
  * sequential mode; random allocation resumes only after the rate has
  * stayed at or below ipport_randomcps for ipport_randomtime seconds.
  * The callout re-arms itself for one second (hz ticks) later.
  */
 static void
 ipport_tick(void *xtp)
 {
 	VNET_ITERATOR_DECL(vnet_iter);
 
 	VNET_LIST_RLOCK_NOSLEEP();
 	VNET_FOREACH(vnet_iter) {
 		CURVNET_SET(vnet_iter);	/* XXX appease INVARIANTS here */
 		if (V_ipport_tcpallocs - V_ipport_tcplastcount >
 		    V_ipport_randomcps) {
 			/* Too busy: (re)start the sequential period. */
 			V_ipport_stoprandom = V_ipport_randomtime;
 		} else if (V_ipport_stoprandom > 0) {
 			/* Quiet second: count down towards random mode. */
 			V_ipport_stoprandom--;
 		}
 		V_ipport_tcplastcount = V_ipport_tcpallocs;
 		CURVNET_RESTORE();
 	}
 	VNET_LIST_RUNLOCK_NOSLEEP();
 	callout_reset(&ipport_tick_callout, hz, ipport_tick, NULL);
 }
 
 /*
  * Shutdown hook: stop the periodic ipport_tick callout.
  */
 static void
 ip_fini(void *xtp)
 {
 	callout_stop(&ipport_tick_callout);
 }
 
 /*
  * The ipport_callout should start running at about the time we attach the
  * inet or inet6 domains.  This initializes the callout, schedules the
  * first ipport_tick() one tick from now and registers ip_fini() to stop
  * it at shutdown_pre_sync.
  */
 static void
 ipport_tick_init(const void *unused __unused)
 {
 	/* Start ipport_tick. */
 	callout_init(&ipport_tick_callout, 1);
 	callout_reset(&ipport_tick_callout, 1, ipport_tick, NULL);
 
 	EVENTHANDLER_REGISTER(shutdown_pre_sync, ip_fini, NULL,
 	    SHUTDOWN_PRI_DEFAULT);
 }
 SYSINIT(ipport_tick_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_MIDDLE,
     ipport_tick_init, NULL);
 
 /* Function form of INP_WLOCK(): write-lock the inpcb. */
 void
 inp_wlock(struct inpcb *inp)
 {
 	INP_WLOCK(inp);
 }
 
 /* Function form of INP_WUNLOCK(): release the inpcb write lock. */
 void
 inp_wunlock(struct inpcb *inp)
 {
 	INP_WUNLOCK(inp);
 }
 
 /* Function form of INP_RLOCK(): read-lock the inpcb. */
 void
 inp_rlock(struct inpcb *inp)
 {
 	INP_RLOCK(inp);
 }
 
 /* Function form of INP_RUNLOCK(): release the inpcb read lock. */
 void
 inp_runlock(struct inpcb *inp)
 {
 	INP_RUNLOCK(inp);
 }
 
 #ifdef INVARIANT_SUPPORT
 /* Assert that the inpcb write lock is held (INP_WLOCK_ASSERT). */
 void
 inp_lock_assert(struct inpcb *inp)
 {
 	INP_WLOCK_ASSERT(inp);
 }
 
 /* Assert that the inpcb lock is not held (INP_UNLOCK_ASSERT). */
 void
 inp_unlock_assert(struct inpcb *inp)
 {
 	INP_UNLOCK_ASSERT(inp);
 }
 #endif
 
 /*
  * Call func(inp, arg) for every inpcb in V_tcbinfo.  The iterator is set
  * up with INPLOOKUP_WLOCKPCB, i.e. it is asked to write-lock each inpcb
  * it hands out.
  */
 void
 inp_apply_all(void (*func)(struct inpcb *, void *), void *arg)
 {
 	struct inpcb_iterator inpi = INP_ALL_ITERATOR(&V_tcbinfo,
 	    INPLOOKUP_WLOCKPCB);
 	struct inpcb *cur;
 
 	for (cur = inp_next(&inpi); cur != NULL; cur = inp_next(&inpi))
 		func(cur, arg);
 }
 
 /*
  * Return the socket of an inpcb.  The caller must hold the inpcb write
  * lock.
  */
 struct socket *
 inp_inpcbtosocket(struct inpcb *inp)
 {
 	struct socket *so;
 
 	INP_WLOCK_ASSERT(inp);
 	so = inp->inp_socket;
 	return (so);
 }
 
 /*
  * Return inp_ppcb of an inpcb, cast to a tcpcb pointer.  The caller must
  * hold the inpcb write lock.
  */
 struct tcpcb *
 inp_inpcbtotcpcb(struct inpcb *inp)
 {
 	struct tcpcb *tp;
 
 	INP_WLOCK_ASSERT(inp);
 	tp = (struct tcpcb *)inp->inp_ppcb;
 	return (tp);
 }
 
 /* Accessor: return the inpcb's inp_ip_tos value. */
 int
 inp_ip_tos_get(const struct inpcb *inp)
 {
 	return (inp->inp_ip_tos);
 }
 
 /* Accessor: store val into the inpcb's inp_ip_tos field. */
 void
 inp_ip_tos_set(struct inpcb *inp, int val)
 {
 	inp->inp_ip_tos = val;
 }
 
 /*
  * Copy the IPv4 4-tuple of an inpcb into the caller's variables.
  * Addresses and ports are copied exactly as stored in the inpcb.  The
  * inpcb must be locked.
  */
 void
 inp_4tuple_get(struct inpcb *inp, uint32_t *laddr, uint16_t *lp,
     uint32_t *faddr, uint16_t *fp)
 {
 	INP_LOCK_ASSERT(inp);
 
 	/* Local endpoint. */
 	*laddr = inp->inp_laddr.s_addr;
 	*lp = inp->inp_lport;
 
 	/* Foreign endpoint. */
 	*faddr = inp->inp_faddr.s_addr;
 	*fp = inp->inp_fport;
 }
 
 /* Function form of sotoinpcb(). */
 struct inpcb *
 so_sotoinpcb(struct socket *so)
 {
 	return (sotoinpcb(so));
 }
 
 /* Function form of sototcpcb(). */
 struct tcpcb *
 so_sototcpcb(struct socket *so)
 {
 	return (sototcpcb(so));
 }
 
 /*
  * Create an external-format (``xinpcb'') structure using the information in
  * the kernel-format in_pcb structure pointed to by inp.  This is done to
  * reduce the spew of irrelevant information over this interface, to isolate
  * user code from changes in the kernel structure, and potentially to provide
  * information-hiding if we decide that some of this information should be
  * hidden from users.
  */
 void
 in_pcbtoxinpcb(const struct inpcb *inp, struct xinpcb *xi)
 {
 
 	bzero(xi, sizeof(*xi));
 	xi->xi_len = sizeof(struct xinpcb);
 	if (inp->inp_socket)
 		sotoxsocket(inp->inp_socket, &xi->xi_socket);
 	bcopy(&inp->inp_inc, &xi->inp_inc, sizeof(struct in_conninfo));
 	xi->inp_gencnt = inp->inp_gencnt;
 	xi->inp_ppcb = (uintptr_t)inp->inp_ppcb;
 	xi->inp_flow = inp->inp_flow;
 	xi->inp_flowid = inp->inp_flowid;
 	xi->inp_flowtype = inp->inp_flowtype;
 	xi->inp_flags = inp->inp_flags;
 	xi->inp_flags2 = inp->inp_flags2;
 	xi->inp_rss_listen_bucket = inp->inp_rss_listen_bucket;
 	xi->in6p_cksum = inp->in6p_cksum;
 	xi->in6p_hops = inp->in6p_hops;
 	xi->inp_ip_tos = inp->inp_ip_tos;
 	xi->inp_vflag = inp->inp_vflag;
 	xi->inp_ip_ttl = inp->inp_ip_ttl;
 	xi->inp_ip_p = inp->inp_ip_p;
 	xi->inp_ip_minttl = inp->inp_ip_minttl;
 }
 
 /*
  * Sysctl handler helper that applies a socket option to one inpcb of
  * pcbinfo.  The new value written to the sysctl is a struct
  * sockopt_parameters, optionally followed by option data; the target pcb
  * is the one whose inp_gencnt equals sop_id.
  *
  * Returns EINVAL if an old value is requested or the input is shorter
  * than struct sockopt_parameters, EPERM if no new value is supplied,
  * ENOMEM if the input exceeds the local buffer, ECONNRESET if the pcb has
  * been dropped, ESRCH if no pcb matches, or otherwise the result of
  * ctloutput_set().
  */
 int
 sysctl_setsockopt(SYSCTL_HANDLER_ARGS, struct inpcbinfo *pcbinfo,
     int (*ctloutput_set)(struct inpcb *, struct sockopt *))
 {
 	struct sockopt sopt;
 	struct inpcb_iterator inpi = INP_ALL_ITERATOR(pcbinfo,
 	    INPLOOKUP_WLOCKPCB);
 	struct inpcb *inp;
 	struct sockopt_parameters *params;
 	struct socket *so;
 	int error;
 	char buf[1024];
 
 	/* This is a write-only sysctl with a bounded input size. */
 	if (req->oldptr != NULL || req->oldlen != 0)
 		return (EINVAL);
 	if (req->newptr == NULL)
 		return (EPERM);
 	if (req->newlen > sizeof(buf))
 		return (ENOMEM);
 	error = SYSCTL_IN(req, buf, req->newlen);
 	if (error != 0)
 		return (error);
 	if (req->newlen < sizeof(struct sockopt_parameters))
 		return (EINVAL);
 	/* Build the sockopt; the value length is whatever follows params. */
 	params = (struct sockopt_parameters *)buf;
 	sopt.sopt_level = params->sop_level;
 	sopt.sopt_name = params->sop_optname;
 	sopt.sopt_dir = SOPT_SET;
 	sopt.sopt_val = params->sop_optval;
 	sopt.sopt_valsize = req->newlen - sizeof(struct sockopt_parameters);
 	sopt.sopt_td = NULL;
 #ifdef INET6
 	/*
 	 * For link-local IPv6 addresses, store the low 16 bits of the zone
 	 * id in the second 16-bit word of the address.
 	 */
 	if (params->sop_inc.inc_flags & INC_ISIPV6) {
 		if (IN6_IS_SCOPE_LINKLOCAL(&params->sop_inc.inc6_laddr))
 			params->sop_inc.inc6_laddr.s6_addr16[1] =
 			    htons(params->sop_inc.inc6_zoneid & 0xffff);
 		if (IN6_IS_SCOPE_LINKLOCAL(&params->sop_inc.inc6_faddr))
 			params->sop_inc.inc6_faddr.s6_addr16[1] =
 			    htons(params->sop_inc.inc6_zoneid & 0xffff);
 	}
 #endif
 	/*
 	 * When a local port is given, set the iterator's hash from the
 	 * supplied connection info: the wildcard hash if there is no
 	 * foreign port, the full connection hash otherwise.
 	 */
 	if (params->sop_inc.inc_lport != htons(0)) {
 		if (params->sop_inc.inc_fport == htons(0))
 			inpi.hash = INP_PCBHASH_WILD(params->sop_inc.inc_lport,
 			    pcbinfo->ipi_hashmask);
 		else
 #ifdef INET6
 			if (params->sop_inc.inc_flags & INC_ISIPV6)
 				inpi.hash = INP6_PCBHASH(
 				    &params->sop_inc.inc6_faddr,
 				    params->sop_inc.inc_lport,
 				    params->sop_inc.inc_fport,
 				    pcbinfo->ipi_hashmask);
 			else
 #endif
 				inpi.hash = INP_PCBHASH(
 				    &params->sop_inc.inc_faddr,
 				    params->sop_inc.inc_lport,
 				    params->sop_inc.inc_fport,
 				    pcbinfo->ipi_hashmask);
 	}
 	/*
 	 * Find the pcb by generation count.  A socket reference is held
 	 * across the ctloutput_set() call.  NOTE(review): the matched pcb
 	 * is not unlocked here on the success path -- presumably
 	 * ctloutput_set() releases the lock; confirm against its
 	 * implementations.
 	 */
 	while ((inp = inp_next(&inpi)) != NULL)
 		if (inp->inp_gencnt == params->sop_id) {
-			if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
+			if (inp->inp_flags & INP_DROPPED) {
 				INP_WUNLOCK(inp);
 				return (ECONNRESET);
 			}
 			so = inp->inp_socket;
 			KASSERT(so != NULL, ("inp_socket == NULL"));
 			soref(so);
 			error = (*ctloutput_set)(inp, &sopt);
 			sorele(so);
 			break;
 		}
 	if (inp == NULL)
 		error = ESRCH;
 	return (error);
 }
 
 #ifdef DDB
 /* Print indent spaces to the debugger console (nothing if indent <= 0). */
 static void
 db_print_indent(int indent)
 {
 	int remaining;
 
 	for (remaining = indent; remaining > 0; remaining--)
 		db_printf(" ");
 }
 
 /*
  * Print an in_conninfo for DDB: a header line with name and address of
  * the structure, followed by the local and the foreign endpoint, each
  * indented two columns further.  Ports are shown in host byte order.
  */
 static void
 db_print_inconninfo(struct in_conninfo *inc, const char *name, int indent)
 {
 	char foreign[48], local[48];
 
 	db_print_indent(indent);
 	db_printf("%s at %p\n", name, inc);
 
 	/* Format both addresses according to the address family. */
 #ifdef INET6
 	if (inc->inc_flags & INC_ISIPV6) {
 		ip6_sprintf(local, &inc->inc6_laddr);
 		ip6_sprintf(foreign, &inc->inc6_faddr);
 	} else
 #endif
 	{
 		inet_ntoa_r(inc->inc_laddr, local);
 		inet_ntoa_r(inc->inc_faddr, foreign);
 	}
 
 	db_print_indent(indent + 2);
 	db_printf("inc_laddr %s   inc_lport %u\n", local,
 	    ntohs(inc->inc_lport));
 	db_print_indent(indent + 2);
 	db_printf("inc_faddr %s   inc_fport %u\n", foreign,
 	    ntohs(inc->inc_fport));
 }
 
 /*
  * Print the names of the flags set in inp_flags for DDB as a list
  * separated by ", ".  Nothing is printed when no known flag is set, and
  * no newline is emitted.  Every entry prints the separator in front of
  * its name once an earlier entry has been printed.
  */
 static void
 db_print_inpflags(int inp_flags)
 {
 	int comma;
 
 	comma = 0;
 	if (inp_flags & INP_RECVOPTS) {
 		db_printf("%sINP_RECVOPTS", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & INP_RECVRETOPTS) {
 		db_printf("%sINP_RECVRETOPTS", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & INP_RECVDSTADDR) {
 		db_printf("%sINP_RECVDSTADDR", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & INP_ORIGDSTADDR) {
 		db_printf("%sINP_ORIGDSTADDR", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & INP_HDRINCL) {
 		db_printf("%sINP_HDRINCL", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & INP_HIGHPORT) {
 		db_printf("%sINP_HIGHPORT", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & INP_LOWPORT) {
 		db_printf("%sINP_LOWPORT", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & INP_ANONPORT) {
 		db_printf("%sINP_ANONPORT", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & INP_RECVIF) {
 		db_printf("%sINP_RECVIF", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & INP_MTUDISC) {
 		db_printf("%sINP_MTUDISC", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & INP_RECVTTL) {
 		db_printf("%sINP_RECVTTL", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & INP_DONTFRAG) {
 		db_printf("%sINP_DONTFRAG", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & INP_RECVTOS) {
 		db_printf("%sINP_RECVTOS", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & IN6P_IPV6_V6ONLY) {
 		db_printf("%sIN6P_IPV6_V6ONLY", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & IN6P_PKTINFO) {
 		db_printf("%sIN6P_PKTINFO", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & IN6P_HOPLIMIT) {
 		db_printf("%sIN6P_HOPLIMIT", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & IN6P_HOPOPTS) {
 		db_printf("%sIN6P_HOPOPTS", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & IN6P_DSTOPTS) {
 		db_printf("%sIN6P_DSTOPTS", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & IN6P_RTHDR) {
 		db_printf("%sIN6P_RTHDR", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & IN6P_RTHDRDSTOPTS) {
 		db_printf("%sIN6P_RTHDRDSTOPTS", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & IN6P_TCLASS) {
 		db_printf("%sIN6P_TCLASS", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & IN6P_AUTOFLOWLABEL) {
 		db_printf("%sIN6P_AUTOFLOWLABEL", comma ? ", " : "");
 		comma = 1;
 	}
-	if (inp_flags & INP_TIMEWAIT) {
-		db_printf("%sINP_TIMEWAIT", comma ? ", " : "");
-		comma  = 1;
-	}
 	if (inp_flags & INP_ONESBCAST) {
 		db_printf("%sINP_ONESBCAST", comma ? ", " : "");
 		comma  = 1;
 	}
 	if (inp_flags & INP_DROPPED) {
 		db_printf("%sINP_DROPPED", comma ? ", " : "");
 		comma  = 1;
 	}
 	if (inp_flags & INP_SOCKREF) {
 		db_printf("%sINP_SOCKREF", comma ? ", " : "");
 		comma  = 1;
 	}
 	if (inp_flags & IN6P_RFC2292) {
 		db_printf("%sIN6P_RFC2292", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & IN6P_MTU) {
 		/* The separator goes in front of the name, as above. */
 		db_printf("%sIN6P_MTU", comma ? ", " : "");
 		comma = 1;
 	}
 }
 
 /*
  * Print the names of the flags set in inp_vflag for DDB as a list
  * separated by ", ".  No newline is emitted.
  */
 static void
 db_print_inpvflag(u_char inp_vflag)
 {
 	const char *sep;
 
 	/* Empty before the first name, ", " before every later one. */
 	sep = "";
 	if (inp_vflag & INP_IPV4) {
 		db_printf("%sINP_IPV4", sep);
 		sep = ", ";
 	}
 	if (inp_vflag & INP_IPV6) {
 		db_printf("%sINP_IPV6", sep);
 		sep = ", ";
 	}
 	if (inp_vflag & INP_IPV6PROTO) {
 		db_printf("%sINP_IPV6PROTO", sep);
 		sep = ", ";
 	}
 }
 
 /*
  * Print an inpcb for DDB: a header line with name and address of the
  * structure, followed by its fields indented two columns further.  Flag
  * words are shown both in hex and decoded by name.  The options line(s)
  * depend on whether the pcb has INP_IPV6 set.
  */
 static void
 db_print_inpcb(struct inpcb *inp, const char *name, int indent)
 {
 
 	db_print_indent(indent);
 	db_printf("%s at %p\n", name, inp);
 
 	indent += 2;
 
 	db_print_indent(indent);
 	db_printf("inp_flow: 0x%x\n", inp->inp_flow);
 
 	db_print_inconninfo(&inp->inp_inc, "inp_conninfo", indent);
 
 	db_print_indent(indent);
 	db_printf("inp_ppcb: %p   inp_pcbinfo: %p   inp_socket: %p\n",
 	    inp->inp_ppcb, inp->inp_pcbinfo, inp->inp_socket);
 
 	db_print_indent(indent);
 	db_printf("inp_label: %p   inp_flags: 0x%x (",
 	   inp->inp_label, inp->inp_flags);
 	db_print_inpflags(inp->inp_flags);
 	db_printf(")\n");
 
 	db_print_indent(indent);
 	db_printf("inp_sp: %p   inp_vflag: 0x%x (", inp->inp_sp,
 	    inp->inp_vflag);
 	db_print_inpvflag(inp->inp_vflag);
 	db_printf(")\n");
 
 	db_print_indent(indent);
 	db_printf("inp_ip_ttl: %d   inp_ip_p: %d   inp_ip_minttl: %d\n",
 	    inp->inp_ip_ttl, inp->inp_ip_p, inp->inp_ip_minttl);
 
 	db_print_indent(indent);
 #ifdef INET6
 	if (inp->inp_vflag & INP_IPV6) {
 		db_printf("in6p_options: %p   in6p_outputopts: %p   "
 		    "in6p_moptions: %p\n", inp->in6p_options,
 		    inp->in6p_outputopts, inp->in6p_moptions);
 		/* Second IPv6 line: indent it like every other line. */
 		db_print_indent(indent);
 		db_printf("in6p_icmp6filt: %p   in6p_cksum %d   "
 		    "in6p_hops %u\n", inp->in6p_icmp6filt, inp->in6p_cksum,
 		    inp->in6p_hops);
 	} else
 #endif
 	{
 		db_printf("inp_ip_tos: %d   inp_ip_options: %p   "
 		    "inp_ip_moptions: %p\n", inp->inp_ip_tos,
 		    inp->inp_options, inp->inp_moptions);
 	}
 
 	db_print_indent(indent);
 	db_printf("inp_phd: %p   inp_gencnt: %ju\n", inp->inp_phd,
 	    (uintmax_t)inp->inp_gencnt);
 }
 
 /*
  * DDB "show inpcb <addr>": treat the supplied address as a struct inpcb
  * pointer and dump it.  Prints a usage line when no address was given.
  */
 DB_SHOW_COMMAND(inpcb, db_show_inpcb)
 {
 
 	if (!have_addr) {
 		db_printf("usage: show inpcb <addr>\n");
 		return;
 	}
 
 	db_print_inpcb((struct inpcb *)addr, "inpcb", 0);
 }
 #endif /* DDB */
 
 #ifdef RATELIMIT
 /*
  * Change the TX rate limit of the send tag currently attached to
  * "inp->inp_snd_tag" to "max_pacing_rate".
  *
  * Returns EINVAL when no send tag is attached, EOPNOTSUPP when the tag
  * has no snd_tag_modify method, otherwise whatever that method returns.
  */
 int
 in_pcbmodify_txrtlmt(struct inpcb *inp, uint32_t max_pacing_rate)
 {
 	union if_snd_tag_modify_params params = {
 		.rate_limit.max_rate = max_pacing_rate,
 		.rate_limit.flags = M_NOWAIT,
 	};
 	struct m_snd_tag *tag;
 
 	tag = inp->inp_snd_tag;
 	if (tag == NULL)
 		return (EINVAL);
 	if (tag->sw->snd_tag_modify == NULL)
 		return (EOPNOTSUPP);
 
 	return (tag->sw->snd_tag_modify(tag, &params));
 }
 
 /*
  * Query the TX rate limit of the send tag currently attached to
  * "inp->inp_snd_tag".  On success the rate is stored through
  * "p_max_pacing_rate" unless that pointer is NULL.
  *
  * Returns EINVAL when no send tag is attached, EOPNOTSUPP when the tag
  * has no snd_tag_query method, otherwise whatever that method returns.
  */
 int
 in_pcbquery_txrtlmt(struct inpcb *inp, uint32_t *p_max_pacing_rate)
 {
 	union if_snd_tag_query_params params = { };
 	struct m_snd_tag *tag;
 	int error;
 
 	tag = inp->inp_snd_tag;
 	if (tag == NULL)
 		return (EINVAL);
 	if (tag->sw->snd_tag_query == NULL)
 		return (EOPNOTSUPP);
 
 	error = tag->sw->snd_tag_query(tag, &params);
 	if (error != 0 || p_max_pacing_rate == NULL)
 		return (error);
 
 	*p_max_pacing_rate = params.rate_limit.max_rate;
 	return (0);
 }
 
 /*
  * Query the TX queue level of the send tag currently attached to
  * "inp->inp_snd_tag".  On success the level is stored through
  * "p_txqueue_level" unless that pointer is NULL.
  *
  * Returns EINVAL when no send tag is attached, EOPNOTSUPP when the tag
  * has no snd_tag_query method, otherwise whatever that method returns.
  */
 int
 in_pcbquery_txrlevel(struct inpcb *inp, uint32_t *p_txqueue_level)
 {
 	union if_snd_tag_query_params params = { };
 	struct m_snd_tag *tag;
 	int error;
 
 	tag = inp->inp_snd_tag;
 	if (tag == NULL) {
 		error = EINVAL;
 	} else if (tag->sw->snd_tag_query == NULL) {
 		error = EOPNOTSUPP;
 	} else {
 		error = tag->sw->snd_tag_query(tag, &params);
 		if (error == 0 && p_txqueue_level != NULL)
 			*p_txqueue_level = params.rate_limit.queue_level;
 	}
 	return (error);
 }
 
 /*
  * Allocate a new TX rate limit send tag from the network interface
  * given by the "ifp" argument and store it through "st" (the caller
  * visible in this file passes "&inp->inp_snd_tag").
  *
  * A "max_pacing_rate" of -1U requests an IF_SND_TAG_TYPE_UNLIMITED tag,
  * any other value an IF_SND_TAG_TYPE_RATE_LIMIT tag.  The inpcb must be
  * write-locked.
  *
  * Returns EINVAL when "*st" is already set or the inpcb is being torn
  * down, otherwise the result of m_snd_tag_alloc().
  */
 int
 in_pcbattach_txrtlmt(struct inpcb *inp, struct ifnet *ifp,
     uint32_t flowtype, uint32_t flowid, uint32_t max_pacing_rate, struct m_snd_tag **st)
 
 {
 	union if_snd_tag_alloc_params params = {
 		.rate_limit.hdr.type = (max_pacing_rate == -1U) ?
 		    IF_SND_TAG_TYPE_UNLIMITED : IF_SND_TAG_TYPE_RATE_LIMIT,
 		.rate_limit.hdr.flowid = flowid,
 		.rate_limit.hdr.flowtype = flowtype,
 		.rate_limit.hdr.numa_domain = inp->inp_numa_domain,
 		.rate_limit.max_rate = max_pacing_rate,
 		.rate_limit.flags = M_NOWAIT,
 	};
 	int error;
 
 	INP_WLOCK_ASSERT(inp);
 
 	/*
 	 * If there is already a send tag, or the INP is being torn
 	 * down, allocating a new send tag is not allowed. Else send
 	 * tags may leak.
 	 */
-	if (*st != NULL || (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) != 0)
+	if (*st != NULL || (inp->inp_flags & INP_DROPPED) != 0)
 		return (EINVAL);
 
 	error = m_snd_tag_alloc(ifp, &params, st);
 #ifdef INET
 	/*
 	 * Statistics: count successful allocations and the number of
 	 * active tags; EOPNOTSUPP is not counted as an allocation failure.
 	 */
 	if (error == 0) {
 		counter_u64_add(rate_limit_set_ok, 1);
 		counter_u64_add(rate_limit_active, 1);
 	} else if (error != EOPNOTSUPP)
 		  counter_u64_add(rate_limit_alloc_fail, 1);
 #endif
 	return (error);
 }
 
 /*
  * Drop the send tag "mst" via m_snd_tag_rele() and, when INET is
  * configured, decrement the rate_limit_active counter.  "mst" is passed
  * to m_snd_tag_rele() unchecked.
  */
 void
 in_pcbdetach_tag(struct m_snd_tag *mst)
 {
 
 	m_snd_tag_rele(mst);
 #ifdef INET
 	counter_u64_add(rate_limit_active, -1);
 #endif
 }
 
 /*
  * Detach and release the TX rate limit send tag held in
  * "inp->inp_snd_tag", if there is one.  The pointer in the inpcb is
  * always cleared.  The inpcb must be write-locked.
  */
 void
 in_pcbdetach_txrtlmt(struct inpcb *inp)
 {
 	struct m_snd_tag *tag;
 
 	INP_WLOCK_ASSERT(inp);
 
 	/* Unhook the tag from the inpcb before releasing it. */
 	tag = inp->inp_snd_tag;
 	inp->inp_snd_tag = NULL;
 
 	if (tag != NULL) {
 		m_snd_tag_rele(tag);
 #ifdef INET
 		counter_u64_add(rate_limit_active, -1);
 #endif
 	}
 }
 
 /*
  * Bring the send tag in "inp->inp_snd_tag" in line with "max_pacing_rate"
  * for output on "ifp": detach a tag that belongs to another interface,
  * then attach, modify or detach as required.  "mb" supplies the hash type
  * and flowid used when a new tag is allocated.
  *
  * The helpers called here assert the inpcb write lock, so the caller
  * must hold it.
  *
  * Returns 0, EAGAIN when a tag is needed but "mb" carries no hash type
  * yet, or the error from attaching/modifying the tag.
  * INP_RATE_LIMIT_CHANGED is cleared when the result is 0 or EOPNOTSUPP.
  */
 int
 in_pcboutput_txrtlmt_locked(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb, uint32_t max_pacing_rate)
 {
 	int error;
 
 	/*
 	 * If the existing send tag is for the wrong interface due to
 	 * a route change, first drop the existing tag.  Set the
 	 * CHANGED flag so that we will keep trying to allocate a new
 	 * tag if we fail to allocate one this time.
 	 */
 	if (inp->inp_snd_tag != NULL && inp->inp_snd_tag->ifp != ifp) {
 		in_pcbdetach_txrtlmt(inp);
 		inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED;
 	}
 
 	/*
 	 * NOTE: When attaching to a network interface a reference is
 	 * made to ensure the network interface doesn't go away until
 	 * all ratelimit connections are gone. The network interface
 	 * pointers compared below represent valid network interfaces,
 	 * except when comparing towards NULL.
 	 */
 	if (max_pacing_rate == 0 && inp->inp_snd_tag == NULL) {
 		/* No rate requested and no tag attached: nothing to do. */
 		error = 0;
 	} else if (!(ifp->if_capenable & IFCAP_TXRTLMT)) {
 		/* Interface has TX rate limiting disabled: drop any tag. */
 		if (inp->inp_snd_tag != NULL)
 			in_pcbdetach_txrtlmt(inp);
 		error = 0;
 	} else if (inp->inp_snd_tag == NULL) {
 		/*
 		 * In order to utilize packet pacing with RSS, we need
 		 * to wait until there is a valid RSS hash before we
 		 * can proceed:
 		 */
 		if (M_HASHTYPE_GET(mb) == M_HASHTYPE_NONE) {
 			error = EAGAIN;
 		} else {
 			error = in_pcbattach_txrtlmt(inp, ifp, M_HASHTYPE_GET(mb),
 			    mb->m_pkthdr.flowid, max_pacing_rate, &inp->inp_snd_tag);
 		}
 	} else {
 		/* A tag for this interface exists: update its rate. */
 		error = in_pcbmodify_txrtlmt(inp, max_pacing_rate);
 	}
 	/* Any other error leaves the CHANGED flag set for a later retry. */
 	if (error == 0 || error == EOPNOTSUPP)
 		inp->inp_flags2 &= ~INP_RATE_LIMIT_CHANGED;
 
 	return (error);
 }
 
 /*
  * Fast-path hook for the INP_RATE_LIMIT_CHANGED flag: attach, detach or
  * modify the TX rate limit send tag according to the socket's
  * so_max_pacing_rate value.  Does nothing when "inp" or its socket is
  * NULL, or when the inpcb write lock cannot be obtained.
  */
 void
 in_pcboutput_txrtlmt(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb)
 {
 	struct socket *so;
 	uint32_t rate;
 	bool upgraded;
 
 	if (inp == NULL)
 		return;
 
 	so = inp->inp_socket;
 	if (so == NULL)
 		return;
 
 	/*
 	 * NOTE: If the write locking fails, we need to bail out and
 	 * use the non-ratelimited ring for the transmit until there
 	 * is a new chance to get the write lock.
 	 */
 	upgraded = !INP_WLOCKED(inp);
 	if (upgraded && !INP_TRY_UPGRADE(inp))
 		return;
 
 	/*
 	 * NOTE: The so_max_pacing_rate value is read unlocked,
 	 * because atomic updates are not required since the variable
 	 * is checked at every mbuf we send. It is assumed that the
 	 * variable read itself will be atomic.
 	 */
 	rate = so->so_max_pacing_rate;
 
 	in_pcboutput_txrtlmt_locked(inp, ifp, mb, rate);
 
 	/* Restore the lock state the caller entered with. */
 	if (upgraded)
 		INP_DOWNGRADE(inp);
 }
 
 /*
  * Track route changes for TX rate limiting: drop the current send tag
  * and set INP_RATE_LIMIT_CHANGED so that a new tag gets allocated.
  * Does nothing when "inp" is NULL, has no send tag, or the inpcb write
  * lock cannot be obtained.
  */
 void
 in_pcboutput_eagain(struct inpcb *inp)
 {
 	bool upgraded;
 
 	if (inp == NULL || inp->inp_snd_tag == NULL)
 		return;
 
 	/*
 	 * NOTE: If the write locking fails, we need to bail out and
 	 * use the non-ratelimited ring for the transmit until there
 	 * is a new chance to get the write lock.
 	 */
 	upgraded = !INP_WLOCKED(inp);
 	if (upgraded && !INP_TRY_UPGRADE(inp))
 		return;
 
 	/* detach rate limiting */
 	in_pcbdetach_txrtlmt(inp);
 
 	/* make sure new mbuf send tag allocation is made */
 	inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED;
 
 	/* Restore the lock state the caller entered with. */
 	if (upgraded)
 		INP_DOWNGRADE(inp);
 }
 
 #ifdef INET
 /*
  * Allocate the rate limit statistics counters.  Every allocation uses
  * M_WAITOK.  The "st" argument is not used.
  */
 static void
 rl_init(void *st)
 {
 	rate_limit_new = counter_u64_alloc(M_WAITOK);
 	rate_limit_chg = counter_u64_alloc(M_WAITOK);
 	rate_limit_active = counter_u64_alloc(M_WAITOK);
 	rate_limit_alloc_fail = counter_u64_alloc(M_WAITOK);
 	rate_limit_set_ok = counter_u64_alloc(M_WAITOK);
 }
 
 SYSINIT(rl, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, rl_init, NULL);
 #endif
 #endif /* RATELIMIT */
diff --git a/sys/netinet/in_pcb.h b/sys/netinet/in_pcb.h
index 331474999163..a72ae5742d4e 100644
--- a/sys/netinet/in_pcb.h
+++ b/sys/netinet/in_pcb.h
@@ -1,826 +1,825 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1982, 1986, 1990, 1993
  *	The Regents of the University of California.
  * Copyright (c) 2010-2011 Juniper Networks, Inc.
  * All rights reserved.
  *
  * Portions of this software were developed by Robert N. M. Watson under
  * contract to Juniper Networks, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)in_pcb.h	8.1 (Berkeley) 6/10/93
  * $FreeBSD$
  */
 
 #ifndef _NETINET_IN_PCB_H_
 #define _NETINET_IN_PCB_H_
 
 #include <sys/queue.h>
 #include <sys/epoch.h>
 #include <sys/_lock.h>
 #include <sys/_mutex.h>
 #include <sys/_rwlock.h>
 #include <net/route.h>
 
 #ifdef _KERNEL
 #include <sys/lock.h>
 #include <sys/proc.h>
 #include <sys/rwlock.h>
 #include <sys/smr.h>
 #include <sys/sysctl.h>
 #include <net/vnet.h>
 #include <vm/uma.h>
 #endif
 #include <sys/ck.h>
 
 /*
  * struct inpcb is the common protocol control block structure used in most
  * IP transport protocols.
  *
  * Pointers to local and foreign host table entries, local and foreign socket
  * numbers, and pointers up (to a socket structure) and down (to a
  * protocol-specific control block) are stored here.
  */
 CK_LIST_HEAD(inpcbhead, inpcb);
 CK_LIST_HEAD(inpcbporthead, inpcbport);
 CK_LIST_HEAD(inpcblbgrouphead, inpcblbgroup);
 typedef	uint64_t	inp_gen_t;
 
 /*
  * PCB with AF_INET6 null bind'ed laddr can receive AF_INET input packet.
  * So, AF_INET6 null laddr is also used as AF_INET null laddr, by utilizing
  * the following structure.  This requires padding always be zeroed out,
  * which is done right after inpcb allocation and stays through its lifetime.
  */
 struct in_addr_4in6 {
 	u_int32_t	ia46_pad32[3];
 	struct	in_addr	ia46_addr4;
 };
 
 union in_dependaddr {
 	struct in_addr_4in6 id46_addr;
 	struct in6_addr	id6_addr;
 };
 
 /*
  * NOTE: ipv6 addrs should be 64-bit aligned, per RFC 2553.  in_conninfo has
  * some extra padding to accomplish this.
  * NOTE 2: tcp_syncache.c uses first 5 32-bit words, which identify fport,
  * lport, faddr to generate hash, so these fields shouldn't be moved.
  */
 struct in_endpoints {
 	u_int16_t	ie_fport;		/* foreign port */
 	u_int16_t	ie_lport;		/* local port */
 	/* protocol dependent part, local and foreign addr */
 	union in_dependaddr ie_dependfaddr;	/* foreign host table entry */
 	union in_dependaddr ie_dependladdr;	/* local host table entry */
 #define	ie_faddr	ie_dependfaddr.id46_addr.ia46_addr4
 #define	ie_laddr	ie_dependladdr.id46_addr.ia46_addr4
 #define	ie6_faddr	ie_dependfaddr.id6_addr
 #define	ie6_laddr	ie_dependladdr.id6_addr
 	u_int32_t	ie6_zoneid;		/* scope zone id */
 };
 
 /*
  * XXX The defines for inc_* are hacks and should be changed to direct
  * references.
  */
 struct in_conninfo {
 	u_int8_t	inc_flags;
 	u_int8_t	inc_len;
 	u_int16_t	inc_fibnum;	/* XXX was pad, 16 bits is plenty */
 	/* protocol dependent part */
 	struct	in_endpoints inc_ie;
 };
 
 /*
  * Flags for inc_flags.
  */
 #define	INC_ISIPV6	0x01
 #define	INC_IPV6MINMTU	0x02
 
 #define	inc_fport	inc_ie.ie_fport
 #define	inc_lport	inc_ie.ie_lport
 #define	inc_faddr	inc_ie.ie_faddr
 #define	inc_laddr	inc_ie.ie_laddr
 #define	inc6_faddr	inc_ie.ie6_faddr
 #define	inc6_laddr	inc_ie.ie6_laddr
 #define	inc6_zoneid	inc_ie.ie6_zoneid
 
 #if defined(_KERNEL) || defined(_WANT_INPCB)
 /*
  * struct inpcb captures the network layer state for TCP, UDP, and raw IPv4 and
  * IPv6 sockets.  In the case of TCP and UDP, further per-connection state is
  * hung off of inp_ppcb most of the time.  Almost all fields of struct inpcb
  * are static after creation or protected by a per-inpcb rwlock, inp_lock.
  *
  * An inpcb database is indexed by an addresses/ports hash as well as a list
  * of all pcbs that belong to a certain proto. Database lookups or list
  * traversals are performed inside an SMR section. Once the desired PCB is
  * found, its own lock is to be obtained and the SMR section exited.
  *
  * Key:
  * (b) - Protected by the hpts lock.
  * (c) - Constant after initialization
  * (e) - Protected by the SMR section
  * (i) - Protected by the inpcb lock
  * (p) - Protected by the pcbinfo lock for the inpcb
  * (h) - Protected by the pcbhash lock for the inpcb
  * (s) - Protected by another subsystem's locks
  * (x) - Undefined locking
  *
  * Notes on the tcp_hpts:
  *
  * First Hpts lock order is
  * 1) INP_WLOCK()
  * 2) HPTS_LOCK() i.e. hpts->pmtx
  *
  * To insert a TCB on the hpts you *must* be holding the INP_WLOCK().
  * You may check the inp->inp_in_hpts flag without the hpts lock.
  * The hpts is the only one that will clear this flag holding
  * only the hpts lock. This means that in your tcp_output()
  * routine when you test for the inp_in_hpts flag to be 1
  * it may be transitioning to 0 (by the hpts).
  * That's ok since that will just mean an extra call to tcp_output
  * that most likely will find the call you executed
  * (when the mis-match occurred) will have put the TCB back
  * on the hpts and it will return. If your
  * call did not add the inp back to the hpts then you will either
  * over-send or the cwnd will block you from sending more.
  *
  * Note you should also be holding the INP_WLOCK() when you
  * call the remove from the hpts as well. Though usually
  * you are either doing this from a timer, where you need and have
  * the INP_WLOCK() or from destroying your TCB where again
  * you should already have the INP_WLOCK().
  *
  * The inp_hpts_cpu, inp_hpts_cpu_set, inp_input_cpu and
  * inp_input_cpu_set fields are controlled completely by
  * the hpts. Do not ever set these. The inp_hpts_cpu_set
  * and inp_input_cpu_set fields indicate if the hpts has
  * setup the respective cpu field. It is advised if this
  * field is 0, to enqueue the packet with the appropriate
  * hpts_immediate() call. If the _set field is 1, then
  * you may compare the inp_*_cpu field to the curcpu and
  * may want to again insert onto the hpts if these fields
  * are not equal (i.e. you are not on the expected CPU).
  *
  * A note on inp_hpts_calls and inp_input_calls, these
  * flags are set when the hpts calls either the output
  * or do_segment routines respectively. If the routine
  * being called wants to use this, then it needs to
  * clear the flag before returning. The hpts will not
  * clear the flag. The flags can be used to tell if
  * the hpts is the function calling the respective
  * routine.
  *
  * A few other notes:
  *
  * When a read lock is held, stability of the field is guaranteed; to write
  * to a field, a write lock must generally be held.
  *
  * netinet/netinet6-layer code should not assume that the inp_socket pointer
- * is safe to dereference without inp_lock being held, even for protocols
- * other than TCP (where the inpcb persists during TIMEWAIT even after the
- * socket has been freed), or there may be close(2)-related races.
+ * is safe to dereference without inp_lock being held, there may be
+ * close(2)-related races.
  *
  * The inp_vflag field is overloaded, and would otherwise ideally be (c).
  */
 struct icmp6_filter;
 struct inpcbpolicy;
 struct m_snd_tag;
 struct inpcb {
 	/* Cache line #1 (amd64) */
 	CK_LIST_ENTRY(inpcb) inp_hash;	/* (w:h/r:e)  hash list */
 	struct rwlock	inp_lock;
 	/* Cache line #2 (amd64) */
 #define	inp_start_zero	inp_hpts
 #define	inp_zero_size	(sizeof(struct inpcb) - \
 			    offsetof(struct inpcb, inp_start_zero))
 	TAILQ_ENTRY(inpcb) inp_hpts;	/* pacing out queue next lock(b) */
 	uint32_t inp_hpts_gencnt;	/* XXXGL */
 	uint32_t inp_hpts_request;	/* Current hpts request, zero if
 					 * fits in the pacing window (i&b). */
 	/*
 	 * Note the next fields are protected by a
 	 * different lock (hpts-lock). This means that
 	 * they must correspond in size to the smallest
 	 * protectable bit field (uint8_t on x86, and
 	 * other platforms potentially uint32_t?). Also
 	 * since CPU switches can occur at different times the two
 	 * fields can *not* be collapsed into a single bit field.
 	 */
 #if defined(__amd64__) || defined(__i386__)
 	uint8_t inp_in_hpts; /* on output hpts (lock b) */
 #else
 	uint32_t inp_in_hpts; /* on output hpts (lock b) */
 #endif
 	volatile uint16_t  inp_hpts_cpu; /* Lock (i) */
 	volatile uint16_t  inp_irq_cpu;	/* Set by LRO in behalf of or the driver */
 	u_int	inp_refcount;		/* (i) refcount */
 	int	inp_flags;		/* (i) generic IP/datagram flags */
 	int	inp_flags2;		/* (i) generic IP/datagram flags #2*/
 	uint8_t inp_hpts_cpu_set :1,  /* on output hpts (i) */
 			 inp_hpts_calls :1,	/* (i) from output hpts */
 			 inp_irq_cpu_set :1,	/* (i) from LRO/Driver */
 			 inp_spare_bits2 : 3;
 	uint8_t inp_numa_domain;	/* numa domain */
 	void	*inp_ppcb;		/* (i) pointer to per-protocol pcb */
 	struct	socket *inp_socket;	/* (i) back pointer to socket */
 	int32_t 	 inp_hptsslot;	/* Hpts wheel slot this tcb is Lock(i&b) */
 	uint32_t         inp_hpts_drop_reas;	/* reason we are dropping the PCB (lock i&b) */
 	struct	inpcbinfo *inp_pcbinfo;	/* (c) PCB list info */
 	struct	ucred	*inp_cred;	/* (c) cache of socket cred */
 	u_int32_t inp_flow;		/* (i) IPv6 flow information */
 	u_char	inp_vflag;		/* (i) IP version flag (v4/v6) */
 	u_char	inp_ip_ttl;		/* (i) time to live proto */
 	u_char	inp_ip_p;		/* (c) protocol proto */
 	u_char	inp_ip_minttl;		/* (i) minimum TTL or drop */
 	uint32_t inp_flowid;		/* (x) flow id / queue id */
 	struct m_snd_tag *inp_snd_tag;	/* (i) send tag for outgoing mbufs */
 	uint32_t inp_flowtype;		/* (x) M_HASHTYPE value */
 	uint32_t inp_rss_listen_bucket;	/* (x) overridden RSS listen bucket */
 
 	/* Local and foreign ports, local and foreign addr. */
 	struct	in_conninfo inp_inc;	/* (i) list for PCB's local port */
 
 	/* MAC and IPSEC policy information. */
 	struct	label *inp_label;	/* (i) MAC label */
 	struct	inpcbpolicy *inp_sp;    /* (s) for IPSEC */
 
 	/* Protocol-dependent part; options. */
 	struct {
 		u_char	inp_ip_tos;		/* (i) type of service proto */
 		struct mbuf		*inp_options;	/* (i) IP options */
 		struct ip_moptions	*inp_moptions;	/* (i) mcast options */
 	};
 	struct {
 		/* (i) IP options */
 		struct mbuf		*in6p_options;
 		/* (i) IP6 options for outgoing packets */
 		struct ip6_pktopts	*in6p_outputopts;
 		/* (i) IP multicast options */
 		struct ip6_moptions	*in6p_moptions;
 		/* (i) ICMPv6 code type filter */
 		struct icmp6_filter	*in6p_icmp6filt;
 		/* (i) IPV6_CHECKSUM setsockopt */
 		int	in6p_cksum;
 		short	in6p_hops;
 	};
 	CK_LIST_ENTRY(inpcb) inp_portlist;	/* (r:e/w:h) port list */
 	struct	inpcbport *inp_phd;	/* (r:e/w:h) head of this list */
 	inp_gen_t	inp_gencnt;	/* (c) generation count */
 	void		*spare_ptr;	/* Spare pointer. */
 	rt_gen_t	inp_rt_cookie;	/* generation for route entry */
 	union {				/* cached L3 information */
 		struct route inp_route;
 		struct route_in6 inp_route6;
 	};
 	CK_LIST_ENTRY(inpcb) inp_list;	/* (r:e/w:p) all PCBs for proto */
 };
 #endif	/* _KERNEL */
 
 #define	inp_fport	inp_inc.inc_fport
 #define	inp_lport	inp_inc.inc_lport
 #define	inp_faddr	inp_inc.inc_faddr
 #define	inp_laddr	inp_inc.inc_laddr
 
 #define	in6p_faddr	inp_inc.inc6_faddr
 #define	in6p_laddr	inp_inc.inc6_laddr
 #define	in6p_zoneid	inp_inc.inc6_zoneid
 
 #define	inp_vnet	inp_pcbinfo->ipi_vnet
 
 /*
  * The range of the generation count, as used in this implementation, is 9e19.
  * We would have to create 300 billion connections per second for this number
  * to roll over in a year.  This seems sufficiently unlikely that we simply
  * don't concern ourselves with that possibility.
  */
 
 /*
  * Interface exported to userland by various protocols which use inpcbs.  Hack
  * alert -- only define if struct xsocket is in scope.
  * Fields prefixed with "xi_" are unique to this structure, and the rest
  * match fields in the struct inpcb, to ease coding and porting.
  *
  * Legend:
  * (s) - used by userland utilities in src
  * (p) - used by utilities in ports
  * (3) - is known to be used by third party software not in ports
  * (n) - no known usage
  */
 #ifdef _SYS_SOCKETVAR_H_
 struct xinpcb {
 	ksize_t		xi_len;			/* length of this structure */
 	struct xsocket	xi_socket;		/* (s,p) */
 	struct in_conninfo inp_inc;		/* (s,p) */
 	uint64_t	inp_gencnt;		/* (s,p) */
 	kvaddr_t	inp_ppcb;		/* (s) netstat(1) */
 	int64_t		inp_spare64[4];
 	uint32_t	inp_flow;		/* (s) */
 	uint32_t	inp_flowid;		/* (s) */
 	uint32_t	inp_flowtype;		/* (s) */
 	int32_t		inp_flags;		/* (s,p) */
 	int32_t		inp_flags2;		/* (s) */
 	int32_t		inp_rss_listen_bucket;	/* (n) */
 	int32_t		in6p_cksum;		/* (n) */
 	int32_t		inp_spare32[4];
 	uint16_t	in6p_hops;		/* (n) */
 	uint8_t		inp_ip_tos;		/* (n) */
 	int8_t		pad8;
 	uint8_t		inp_vflag;		/* (s,p) */
 	uint8_t		inp_ip_ttl;		/* (n) */
 	uint8_t		inp_ip_p;		/* (n) */
 	uint8_t		inp_ip_minttl;		/* (n) */
 	int8_t		inp_spare8[4];
 } __aligned(8);
 
 struct xinpgen {
 	ksize_t	xig_len;	/* length of this structure */
 	u_int		xig_count;	/* number of PCBs at this time */
 	uint32_t	_xig_spare32;
 	inp_gen_t	xig_gen;	/* generation count at this time */
 	so_gen_t	xig_sogen;	/* socket generation count this time */
 	uint64_t	_xig_spare64[4];
 } __aligned(8);
 
 struct sockopt_parameters {
 	struct in_conninfo sop_inc;
 	uint64_t sop_id;
 	int sop_level;
 	int sop_optname;
 	char sop_optval[];
 };
 
 #ifdef	_KERNEL
 int	sysctl_setsockopt(SYSCTL_HANDLER_ARGS, struct inpcbinfo *pcbinfo,
 	    int (*ctloutput_set)(struct inpcb *, struct sockopt *));
 void	in_pcbtoxinpcb(const struct inpcb *, struct xinpcb *);
 #endif
 #endif /* _SYS_SOCKETVAR_H_ */
 
 #ifdef _KERNEL
 /*
  * Per-VNET pcb database for each high-level protocol (UDP, TCP, ...) in both
  * IPv4 and IPv6.
  *
  * The pcbs are protected with SMR section and thus all lists in inpcbinfo
  * are CK-lists.  Locking is required to insert a pcb into database. Two
  * locks are provided: one for the hash and one for the global list of pcbs,
  * as well as overall count and generation count.
  *
  * Locking key:
  *
  * (c) Constant or nearly constant after initialisation
  * (e) Protected by SMR section
  * (g) Locked by ipi_lock
  * (h) Locked by ipi_hash_lock
  */
 struct inpcbinfo {
 	/*
 	 * Global lock protecting inpcb list modification
 	 */
 	struct mtx		 ipi_lock;
 	struct inpcbhead	 ipi_listhead;		/* (r:e/w:g) */
 	u_int			 ipi_count;		/* (g) */
 
 	/*
 	 * Generation count -- incremented each time a connection is allocated
 	 * or freed.
 	 */
 	u_quad_t		 ipi_gencnt;		/* (g) */
 
 	/*
 	 * Fields associated with port lookup and allocation.
 	 */
 	u_short			 ipi_lastport;		/* (h) */
 	u_short			 ipi_lastlow;		/* (h) */
 	u_short			 ipi_lasthi;		/* (h) */
 
 	/*
 	 * UMA zone from which inpcbs are allocated for this protocol.
 	 */
 	uma_zone_t		 ipi_zone;		/* (c) */
 	uma_zone_t		 ipi_portzone;		/* (c) */
 	smr_t			 ipi_smr;		/* (c) */
 
 	/*
 	 * Global hash of inpcbs, hashed by local and foreign addresses and
 	 * port numbers.
 	 */
 	struct mtx		 ipi_hash_lock;
 	struct inpcbhead 	*ipi_hashbase;		/* (r:e/w:h) */
 	u_long			 ipi_hashmask;		/* (c) */
 
 	/*
 	 * Global hash of inpcbs, hashed by only local port number.
 	 */
 	struct inpcbporthead	*ipi_porthashbase;	/* (h) */
 	u_long			 ipi_porthashmask;	/* (h) */
 
 	/*
 	 * Load balance groups used for the SO_REUSEPORT_LB option,
 	 * hashed by local port.
 	 */
 	struct	inpcblbgrouphead *ipi_lbgrouphashbase;	/* (r:e/w:h) */
 	u_long			 ipi_lbgrouphashmask;	/* (h) */
 
 	/*
 	 * Pointer to network stack instance
 	 */
 	struct vnet		*ipi_vnet;		/* (c) */
 };
 
 /*
  * Global allocation storage for each high-level protocol (UDP, TCP, ...).
  * Each corresponding per-VNET inpcbinfo points into this one.
  */
 struct inpcbstorage {
 	uma_zone_t	ips_zone;
 	uma_zone_t	ips_portzone;
 	uma_init	ips_pcbinit;
 	const char *	ips_zone_name;
 	const char *	ips_portzone_name;
 	const char *	ips_infolock_name;
 	const char *	ips_hashlock_name;
 };
 
 /*
  * INPCBSTORAGE_DEFINE(prot, lname, zname, iname, hname) expands to:
  *
  *  - a static init function prot_inpcb_init(), used as the ips_pcbinit
  *    callback, that initializes the inpcb's inp_lock under the name
  *    "lname" with RW_RECURSE | RW_DUPOK and returns 0;
  *  - a static struct inpcbstorage named "prot" whose zone name is
  *    "zname", port zone name is zname " ports", info lock name is
  *    "iname" and hash lock name is "hname";
  *  - SYSINIT and SYSUNINIT registrations (SI_SUB_PROTO_DOMAIN,
  *    SI_ORDER_SECOND) that pass &prot to in_pcbstorage_init() and
  *    in_pcbstorage_destroy() respectively.
  *
  * The expansion ends without a semicolon; the invocation supplies it.
  */
 #define INPCBSTORAGE_DEFINE(prot, lname, zname, iname, hname)		\
 static int								\
 prot##_inpcb_init(void *mem, int size __unused, int flags __unused)	\
 {									\
 	struct inpcb *inp = mem;					\
 									\
 	rw_init_flags(&inp->inp_lock, lname, RW_RECURSE | RW_DUPOK);	\
 	return (0);							\
 }									\
 static struct inpcbstorage prot = {					\
 	.ips_pcbinit = prot##_inpcb_init,				\
 	.ips_zone_name = zname,						\
 	.ips_portzone_name = zname " ports",				\
 	.ips_infolock_name = iname,					\
 	.ips_hashlock_name = hname,					\
 };									\
 SYSINIT(prot##_inpcbstorage_init, SI_SUB_PROTO_DOMAIN,			\
     SI_ORDER_SECOND, in_pcbstorage_init, &prot);			\
 SYSUNINIT(prot##_inpcbstorage_uninit, SI_SUB_PROTO_DOMAIN,		\
     SI_ORDER_SECOND, in_pcbstorage_destroy, &prot)
 
 /*
  * Load balance groups used for the SO_REUSEPORT_LB socket option. Each group
  * (or unique address:port combination) can be re-used at most
  * INPCBLBGROUP_SIZMAX (256) times. The inpcbs are stored in il_inp which
  * is dynamically resized as processes bind/unbind to that specific group.
  */
 struct inpcblbgroup {
 	CK_LIST_ENTRY(inpcblbgroup) il_list;
 	struct epoch_context il_epoch_ctx;
 	uint16_t	il_lport;			/* (c) */
 	u_char		il_vflag;			/* (c) */
 	u_int8_t		il_numa_domain;
 	uint32_t	il_pad2;
 	union in_dependaddr il_dependladdr;		/* (c) */
 #define	il_laddr	il_dependladdr.id46_addr.ia46_addr4
 #define	il6_laddr	il_dependladdr.id6_addr
 	uint32_t	il_inpsiz; /* max count in il_inp[] (h) */
 	uint32_t	il_inpcnt; /* cur count in il_inp[] (h) */
 	struct inpcb	*il_inp[];			/* (h) */
 };
 
 #define INP_LOCK_DESTROY(inp)	rw_destroy(&(inp)->inp_lock)
 #define INP_RLOCK(inp)		rw_rlock(&(inp)->inp_lock)
 #define INP_WLOCK(inp)		rw_wlock(&(inp)->inp_lock)
 #define INP_TRY_RLOCK(inp)	rw_try_rlock(&(inp)->inp_lock)
 #define INP_TRY_WLOCK(inp)	rw_try_wlock(&(inp)->inp_lock)
 #define INP_RUNLOCK(inp)	rw_runlock(&(inp)->inp_lock)
 #define INP_WUNLOCK(inp)	rw_wunlock(&(inp)->inp_lock)
 #define INP_UNLOCK(inp)		rw_unlock(&(inp)->inp_lock)
 #define	INP_TRY_UPGRADE(inp)	rw_try_upgrade(&(inp)->inp_lock)
 #define	INP_DOWNGRADE(inp)	rw_downgrade(&(inp)->inp_lock)
 #define	INP_WLOCKED(inp)	rw_wowned(&(inp)->inp_lock)
 #define	INP_LOCK_ASSERT(inp)	rw_assert(&(inp)->inp_lock, RA_LOCKED)
 #define	INP_RLOCK_ASSERT(inp)	rw_assert(&(inp)->inp_lock, RA_RLOCKED)
 #define	INP_WLOCK_ASSERT(inp)	rw_assert(&(inp)->inp_lock, RA_WLOCKED)
 #define	INP_UNLOCK_ASSERT(inp)	rw_assert(&(inp)->inp_lock, RA_UNLOCKED)
 
 /*
  * These locking functions are for inpcb consumers outside of sys/netinet,
  * more specifically, they were added for the benefit of TOE drivers. The
  * macros are reserved for use by the stack.
  */
 void inp_wlock(struct inpcb *);
 void inp_wunlock(struct inpcb *);
 void inp_rlock(struct inpcb *);
 void inp_runlock(struct inpcb *);
 
 #ifdef INVARIANT_SUPPORT
 void inp_lock_assert(struct inpcb *);
 void inp_unlock_assert(struct inpcb *);
 #else
 #define	inp_lock_assert(inp)	do {} while (0)
 #define	inp_unlock_assert(inp)	do {} while (0)
 #endif
 
 void	inp_apply_all(void (*func)(struct inpcb *, void *), void *arg);
 int 	inp_ip_tos_get(const struct inpcb *inp);
 void 	inp_ip_tos_set(struct inpcb *inp, int val);
 struct socket *
 	inp_inpcbtosocket(struct inpcb *inp);
 struct tcpcb *
 	inp_inpcbtotcpcb(struct inpcb *inp);
 void 	inp_4tuple_get(struct inpcb *inp, uint32_t *laddr, uint16_t *lp,
 		uint32_t *faddr, uint16_t *fp);
 int	inp_so_options(const struct inpcb *inp);
 
 #endif /* _KERNEL */
 
 #define INP_INFO_WLOCK(ipi)	mtx_lock(&(ipi)->ipi_lock)
 #define INP_INFO_WLOCKED(ipi)	mtx_owned(&(ipi)->ipi_lock)
 #define INP_INFO_WUNLOCK(ipi)	mtx_unlock(&(ipi)->ipi_lock)
 #define	INP_INFO_LOCK_ASSERT(ipi)	MPASS(SMR_ENTERED((ipi)->ipi_smr) || \
 					mtx_owned(&(ipi)->ipi_lock))
 #define INP_INFO_WLOCK_ASSERT(ipi)	mtx_assert(&(ipi)->ipi_lock, MA_OWNED)
 #define INP_INFO_WUNLOCK_ASSERT(ipi)	\
 				mtx_assert(&(ipi)->ipi_lock, MA_NOTOWNED)
 
 #define	INP_HASH_WLOCK(ipi)		mtx_lock(&(ipi)->ipi_hash_lock)
 #define	INP_HASH_WUNLOCK(ipi)		mtx_unlock(&(ipi)->ipi_hash_lock)
 #define	INP_HASH_LOCK_ASSERT(ipi)	MPASS(SMR_ENTERED((ipi)->ipi_smr) || \
 					mtx_owned(&(ipi)->ipi_hash_lock))
 #define	INP_HASH_WLOCK_ASSERT(ipi)	mtx_assert(&(ipi)->ipi_hash_lock, \
 					MA_OWNED)
 
 /*
  * Wildcard matching hash is not just a microoptimisation!  The hash for
  * wildcard IPv4 and wildcard IPv6 must be the same, otherwise AF_INET6
  * wildcard bound pcb won't be able to receive AF_INET connections, while:
  * jenkins_hash(&zeroes, 1, s) != jenkins_hash(&zeroes, 4, s)
  * See also comment above struct in_addr_4in6.
  */
 #define	IN_ADDR_JHASH32(addr)						\
 	((addr)->s_addr == INADDR_ANY ? V_in_pcbhashseed :		\
 	    jenkins_hash32((&(addr)->s_addr), 1, V_in_pcbhashseed))
 #define	IN6_ADDR_JHASH32(addr)						\
 	(memcmp((addr), &in6addr_any, sizeof(in6addr_any)) == 0 ?	\
 	    V_in_pcbhashseed :						\
 	    jenkins_hash32((addr)->__u6_addr.__u6_addr32,		\
 	    nitems((addr)->__u6_addr.__u6_addr32), V_in_pcbhashseed))
 
 #define INP_PCBHASH(faddr, lport, fport, mask)				\
 	((IN_ADDR_JHASH32(faddr) ^ ntohs((lport) ^ (fport))) & (mask))
 #define	INP6_PCBHASH(faddr, lport, fport, mask)				\
 	((IN6_ADDR_JHASH32(faddr) ^ ntohs((lport) ^ (fport))) & (mask))
 
 #define	INP_PCBHASH_WILD(lport, mask)					\
 	((V_in_pcbhashseed ^ ntohs(lport)) & (mask))
 
 #define	INP_PCBLBGROUP_PKTHASH(faddr, lport, fport)			\
 	(IN_ADDR_JHASH32(faddr) ^ ntohs((lport) ^ (fport)))
 #define	INP6_PCBLBGROUP_PKTHASH(faddr, lport, fport)			\
 	(IN6_ADDR_JHASH32(faddr) ^ ntohs((lport) ^ (fport)))
 
 #define INP_PCBPORTHASH(lport, mask)	(ntohs((lport)) & (mask))
 
 /*
  * Flags for inp_vflags -- historically version flags only
  */
 #define	INP_IPV4	0x1
 #define	INP_IPV6	0x2
 #define	INP_IPV6PROTO	0x4		/* opened under IPv6 protocol */
 
 /*
  * Flags for inp_flags.
  */
 #define	INP_RECVOPTS		0x00000001 /* receive incoming IP options */
 #define	INP_RECVRETOPTS		0x00000002 /* receive IP options for reply */
 #define	INP_RECVDSTADDR		0x00000004 /* receive IP dst address */
 #define	INP_HDRINCL		0x00000008 /* user supplies entire IP header */
 #define	INP_HIGHPORT		0x00000010 /* user wants "high" port binding */
 #define	INP_LOWPORT		0x00000020 /* user wants "low" port binding */
 #define	INP_ANONPORT		0x00000040 /* port chosen for user */
 #define	INP_RECVIF		0x00000080 /* receive incoming interface */
 #define	INP_MTUDISC		0x00000100 /* user can do MTU discovery */
 /*	INP_FREED		0x00000200 private to in_pcb.c */
 #define	INP_RECVTTL		0x00000400 /* receive incoming IP TTL */
 #define	INP_DONTFRAG		0x00000800 /* don't fragment packet */
 #define	INP_BINDANY		0x00001000 /* allow bind to any address */
 #define	INP_INHASHLIST		0x00002000 /* in_pcbinshash() has been called */
 #define	INP_RECVTOS		0x00004000 /* receive incoming IP TOS */
 #define	IN6P_IPV6_V6ONLY	0x00008000 /* restrict AF_INET6 socket for v6 */
 #define	IN6P_PKTINFO		0x00010000 /* receive IP6 dst and I/F */
 #define	IN6P_HOPLIMIT		0x00020000 /* receive hoplimit */
 #define	IN6P_HOPOPTS		0x00040000 /* receive hop-by-hop options */
 #define	IN6P_DSTOPTS		0x00080000 /* receive dst options after rthdr */
 #define	IN6P_RTHDR		0x00100000 /* receive routing header */
 #define	IN6P_RTHDRDSTOPTS	0x00200000 /* receive dstoptions before rthdr */
 #define	IN6P_TCLASS		0x00400000 /* receive traffic class value */
 #define	IN6P_AUTOFLOWLABEL	0x00800000 /* attach flowlabel automatically */
-#define	INP_TIMEWAIT		0x01000000 /* in TIMEWAIT, ppcb is tcptw */
+/* was	INP_TIMEWAIT		0x01000000 */
 #define	INP_ONESBCAST		0x02000000 /* send all-ones broadcast */
 #define	INP_DROPPED		0x04000000 /* protocol drop flag */
 #define	INP_SOCKREF		0x08000000 /* strong socket reference */
 #define	INP_RESERVED_0          0x10000000 /* reserved field */
 #define	INP_RESERVED_1          0x20000000 /* reserved field */
 #define	IN6P_RFC2292		0x40000000 /* used RFC2292 API on the socket */
 #define	IN6P_MTU		0x80000000 /* receive path MTU */
 
 /*
  * Mask of the inp_flags bits OR-ed together below: the INP_RECV* receive
  * options plus the IN6P_* options.
  */
 #define	INP_CONTROLOPTS		(INP_RECVOPTS|INP_RECVRETOPTS|INP_RECVDSTADDR|\
 				 INP_RECVIF|INP_RECVTTL|INP_RECVTOS|\
 				 IN6P_PKTINFO|IN6P_HOPLIMIT|IN6P_HOPOPTS|\
 				 IN6P_DSTOPTS|IN6P_RTHDR|IN6P_RTHDRDSTOPTS|\
 				 IN6P_TCLASS|IN6P_AUTOFLOWLABEL|IN6P_RFC2292|\
 				 IN6P_MTU)
 
 /*
  * Flags for inp_flags2.
  */
 #define	INP_MBUF_L_ACKS		0x00000001 /* We need large mbufs for ack compression */
 #define	INP_MBUF_ACKCMP		0x00000002 /* TCP mbuf ack compression ok */
 /*				0x00000004 */
 #define	INP_REUSEPORT		0x00000008 /* SO_REUSEPORT option is set */
 /*				0x00000010 */
 #define	INP_REUSEADDR		0x00000020 /* SO_REUSEADDR option is set */
 #define	INP_BINDMULTI		0x00000040 /* IP_BINDMULTI option is set */
 #define	INP_RSS_BUCKET_SET	0x00000080 /* IP_RSS_LISTEN_BUCKET is set */
 #define	INP_RECVFLOWID		0x00000100 /* populate recv datagram with flow info */
 #define	INP_RECVRSSBUCKETID	0x00000200 /* populate recv datagram with bucket id */
 #define	INP_RATE_LIMIT_CHANGED	0x00000400 /* rate limit needs attention */
 #define	INP_ORIGDSTADDR		0x00000800 /* receive IP dst address/port */
 #define INP_CANNOT_DO_ECN	0x00001000 /* The stack does not do ECN */
 #define	INP_REUSEPORT_LB	0x00002000 /* SO_REUSEPORT_LB option is set */
 #define INP_SUPPORTS_MBUFQ	0x00004000 /* Supports the mbuf queue method of LRO */
 #define INP_MBUF_QUEUE_READY	0x00008000 /* The transport is pacing, inputs can be queued */
 #define INP_DONT_SACK_QUEUE	0x00010000 /* If a sack arrives do not wake me */
 #define INP_2PCP_SET		0x00020000 /* If the Eth PCP should be set explicitly */
 /*
  * The three PCP value bits are contiguous; INP_2PCP_BIT0 is bit 18, which
  * is the value of INP_2PCP_SHIFT below.
  */
 #define INP_2PCP_BIT0		0x00040000 /* Eth PCP Bit 0 */
 #define INP_2PCP_BIT1		0x00080000 /* Eth PCP Bit 1 */
 #define INP_2PCP_BIT2		0x00100000 /* Eth PCP Bit 2 */
 /* Lowest PCP value bit. */
 #define INP_2PCP_BASE	INP_2PCP_BIT0
 /* All three PCP value bits; INP_2PCP_SET is not part of the mask. */
 #define INP_2PCP_MASK	(INP_2PCP_BIT0 | INP_2PCP_BIT1 | INP_2PCP_BIT2)
 #define INP_2PCP_SHIFT		18         /* shift PCP field in/out of inp_flags2 */
 
 /*
  * Flags passed to in_pcblookup*(), inp_smr_lock() and inp_next().
  */
 /* Each enumerator is a distinct bit so the values can be OR-ed together. */
 typedef	enum {
 	INPLOOKUP_WILDCARD = 0x00000001,	/* Allow wildcard sockets. */
 	INPLOOKUP_RLOCKPCB = 0x00000002,	/* Return inpcb read-locked. */
 	INPLOOKUP_WLOCKPCB = 0x00000004,	/* Return inpcb write-locked. */
 } inp_lookup_t;
 
 /* Every bit defined by inp_lookup_t. */
 #define	INPLOOKUP_MASK	(INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB | \
 	    INPLOOKUP_WLOCKPCB)
 /* Only the two bits that select how the returned inpcb is locked. */
 #define	INPLOOKUP_LOCKMASK	(INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)
 
 /* The inpcb stored in a socket's so_pcb pointer. */
 #define	sotoinpcb(so)	((struct inpcb *)(so)->so_pcb)
 
 /*
  * Address family of the protocol domain a socket belongs to.  The argument
  * and the whole expansion are parenthesized so that expressions such as
  * INP_SOCKAF(so + 1) or !INP_SOCKAF(so) parse as intended.
  */
 #define	INP_SOCKAF(so)	((so)->so_proto->pr_domain->dom_family)
 
 /*
  * Non-zero when the socket's address family equals 'af'.  'af' is
  * parenthesized so lower-precedence expressions (e.g. a conditional) can be
  * passed safely.
  */
 #define	INP_CHECK_SOCKAF(so, af)	(INP_SOCKAF(so) == (af))
 
 #ifdef _KERNEL
 /*
  * Per-VNET port allocation variables.  Each one is declared with
  * VNET_DECLARE() here and reached through the matching V_ipport_* accessor
  * defined further down.
  * NOTE(review): the meaning of each variable is set where it is
  * instantiated, which is not visible here; the names suggest the reserved,
  * low, default and high automatic port ranges plus port randomization
  * controls -- confirm against the definitions.
  */
 VNET_DECLARE(int, ipport_reservedhigh);
 VNET_DECLARE(int, ipport_reservedlow);
 VNET_DECLARE(int, ipport_lowfirstauto);
 VNET_DECLARE(int, ipport_lowlastauto);
 VNET_DECLARE(int, ipport_firstauto);
 VNET_DECLARE(int, ipport_lastauto);
 VNET_DECLARE(int, ipport_hifirstauto);
 VNET_DECLARE(int, ipport_hilastauto);
 VNET_DECLARE(int, ipport_randomized);
 VNET_DECLARE(int, ipport_randomcps);
 VNET_DECLARE(int, ipport_randomtime);
 VNET_DECLARE(int, ipport_stoprandom);
 VNET_DECLARE(int, ipport_tcpallocs);
 
 /* Accessors: V_<name> expands to VNET(<name>) for each variable above. */
 #define	V_ipport_reservedhigh	VNET(ipport_reservedhigh)
 #define	V_ipport_reservedlow	VNET(ipport_reservedlow)
 #define	V_ipport_lowfirstauto	VNET(ipport_lowfirstauto)
 #define	V_ipport_lowlastauto	VNET(ipport_lowlastauto)
 #define	V_ipport_firstauto	VNET(ipport_firstauto)
 #define	V_ipport_lastauto	VNET(ipport_lastauto)
 #define	V_ipport_hifirstauto	VNET(ipport_hifirstauto)
 #define	V_ipport_hilastauto	VNET(ipport_hilastauto)
 #define	V_ipport_randomized	VNET(ipport_randomized)
 #define	V_ipport_randomcps	VNET(ipport_randomcps)
 #define	V_ipport_randomtime	VNET(ipport_randomtime)
 #define	V_ipport_stoprandom	VNET(ipport_stoprandom)
 #define	V_ipport_tcpallocs	VNET(ipport_tcpallocs)
 
 void	in_pcbinfo_init(struct inpcbinfo *, struct inpcbstorage *,
 	    u_int, u_int);
 void	in_pcbinfo_destroy(struct inpcbinfo *);
 void	in_pcbstorage_init(void *);
 void	in_pcbstorage_destroy(void *);
 
 int	in_pcbbind_check_bindmulti(const struct inpcb *ni,
 	    const struct inpcb *oi);
 
 void	in_pcbpurgeif0(struct inpcbinfo *, struct ifnet *);
 int	in_pcballoc(struct socket *, struct inpcbinfo *);
 int	in_pcbbind(struct inpcb *, struct sockaddr *, struct ucred *);
 int	in_pcbbind_setup(struct inpcb *, struct sockaddr *, in_addr_t *,
 	    u_short *, struct ucred *);
 int	in_pcbconnect(struct inpcb *, struct sockaddr *, struct ucred *, bool);
 int	in_pcbconnect_setup(struct inpcb *, struct sockaddr *, in_addr_t *,
 	    u_short *, in_addr_t *, u_short *, struct inpcb **,
 	    struct ucred *);
 void	in_pcbdetach(struct inpcb *);
 void	in_pcbdisconnect(struct inpcb *);
 void	in_pcbdrop(struct inpcb *);
 void	in_pcbfree(struct inpcb *);
 int	in_pcbinshash(struct inpcb *);
 int	in_pcbladdr(struct inpcb *, struct in_addr *, struct in_addr *,
 	    struct ucred *);
 int	in_pcblbgroup_numa(struct inpcb *, int arg);
 struct inpcb *
 	in_pcblookup(struct inpcbinfo *, struct in_addr, u_int,
 	    struct in_addr, u_int, int, struct ifnet *);
 struct inpcb *
 	in_pcblookup_mbuf(struct inpcbinfo *, struct in_addr, u_int,
 	    struct in_addr, u_int, int, struct ifnet *, struct mbuf *);
 void	in_pcbnotifyall(struct inpcbinfo *pcbinfo, struct in_addr,
 	    int, struct inpcb *(*)(struct inpcb *, int));
 void	in_pcbref(struct inpcb *);
 void	in_pcbrehash(struct inpcb *);
 bool	in_pcbrele_rlocked(struct inpcb *);
 bool	in_pcbrele_wlocked(struct inpcb *);
 
 /*
  * Predicate type stored in inpcb_iterator.match.
  * NOTE(review): presumably called by inp_next() with a candidate inpcb and
  * the iterator's ctx pointer -- confirm in in_pcb.c.
  */
 typedef bool inp_match_t(const struct inpcb *, void *);
 /*
  * State handed to inp_next().  Instances are normally built with the
  * INP_ITERATOR() / INP_ALL_ITERATOR() initializers, which leave .inp NULL.
  */
 struct inpcb_iterator {
 	/* The pcbinfo being iterated. */
 	const struct inpcbinfo	*ipi;
 	/* NULL in a freshly initialized iterator. */
 	struct inpcb		*inp;
 	/* Optional predicate; left NULL by INP_ALL_ITERATOR(). */
 	inp_match_t		*match;
 	/* Opaque pointer stored next to match. */
 	void			*ctx;
 	/* Both initializer macros set this to INP_ALL_LIST. */
 	int			hash;
 #define	INP_ALL_LIST		-1
 	/* inp_lookup_t flags supplied as the _lock initializer argument. */
 	const inp_lookup_t	lock;
 };
 
 /*
  * Static initializers for struct inpcb_iterator, with members listed in
  * declaration order.  Members that are not named (.inp, and for
  * INP_ALL_ITERATOR() also .match and .ctx) are zero-initialized, so a new
  * iterator always starts with .inp == NULL.
  */
 #define	INP_ITERATOR(_ipi, _lock, _match, _ctx)		\
 	{						\
 		.ipi = (_ipi),				\
 		.match = (_match),			\
 		.ctx = (_ctx),				\
 		.hash = INP_ALL_LIST,			\
 		.lock = (_lock),			\
 	}
 #define	INP_ALL_ITERATOR(_ipi, _lock)			\
 	{						\
 		.ipi = (_ipi),				\
 		.hash = INP_ALL_LIST,			\
 		.lock = (_lock),			\
 	}
 
 struct inpcb *inp_next(struct inpcb_iterator *);
 void	in_losing(struct inpcb *);
 void	in_pcbsetsolabel(struct socket *so);
 int	in_getpeeraddr(struct socket *so, struct sockaddr **nam);
 int	in_getsockaddr(struct socket *so, struct sockaddr **nam);
 struct sockaddr *
 	in_sockaddr(in_port_t port, struct in_addr *addr);
 void	in_pcbsosetlabel(struct socket *so);
 #ifdef RATELIMIT
 int
 in_pcboutput_txrtlmt_locked(struct inpcb *, struct ifnet *,
 	    struct mbuf *, uint32_t);
 int	in_pcbattach_txrtlmt(struct inpcb *, struct ifnet *, uint32_t, uint32_t,
 	    uint32_t, struct m_snd_tag **);
 void	in_pcbdetach_txrtlmt(struct inpcb *);
 void    in_pcbdetach_tag(struct m_snd_tag *);
 int	in_pcbmodify_txrtlmt(struct inpcb *, uint32_t);
 int	in_pcbquery_txrtlmt(struct inpcb *, uint32_t *);
 int	in_pcbquery_txrlevel(struct inpcb *, uint32_t *);
 void	in_pcboutput_txrtlmt(struct inpcb *, struct ifnet *, struct mbuf *);
 void	in_pcboutput_eagain(struct inpcb *);
 #endif
 #endif /* _KERNEL */
 
 #endif /* !_NETINET_IN_PCB_H_ */
diff --git a/sys/netinet/siftr.c b/sys/netinet/siftr.c
index b326218a8729..c025c06d7f32 100644
--- a/sys/netinet/siftr.c
+++ b/sys/netinet/siftr.c
@@ -1,1599 +1,1597 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2007-2009
  * 	Swinburne University of Technology, Melbourne, Australia.
  * Copyright (c) 2009-2010, The FreeBSD Foundation
  * All rights reserved.
  *
  * Portions of this software were developed at the Centre for Advanced
  * Internet Architectures, Swinburne University of Technology, Melbourne,
  * Australia by Lawrence Stewart under sponsorship from the FreeBSD Foundation.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 /******************************************************
  * Statistical Information For TCP Research (SIFTR)
  *
  * A FreeBSD kernel module that adds very basic instrumentation to the
  * TCP stack, allowing internal stats to be recorded to a log file
  * for experimental, debugging and performance analysis purposes.
  *
  * SIFTR was first released in 2007 by James Healy and Lawrence Stewart whilst
  * working on the NewTCP research project at Swinburne University of
  * Technology's Centre for Advanced Internet Architectures, Melbourne,
  * Australia, which was made possible in part by a grant from the Cisco
  * University Research Program Fund at Community Foundation Silicon Valley.
  * More details are available at:
  *   http://caia.swin.edu.au/urp/newtcp/
  *
  * Work on SIFTR v1.2.x was sponsored by the FreeBSD Foundation as part of
  * the "Enhancing the FreeBSD TCP Implementation" project 2008-2009.
  * More details are available at:
  *   http://www.freebsdfoundation.org/
  *   http://caia.swin.edu.au/freebsd/etcp09/
  *
  * Lawrence Stewart is the current maintainer, and all contact regarding
  * SIFTR should be directed to him via email: lastewart@swin.edu.au
  *
  * Initial release date: June 2007
  * Most recent update: September 2010
  ******************************************************/
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/alq.h>
 #include <sys/errno.h>
 #include <sys/eventhandler.h>
 #include <sys/hash.h>
 #include <sys/kernel.h>
 #include <sys/kthread.h>
 #include <sys/lock.h>
 #include <sys/mbuf.h>
 #include <sys/module.h>
 #include <sys/mutex.h>
 #include <sys/pcpu.h>
 #include <sys/proc.h>
 #include <sys/sbuf.h>
 #include <sys/sdt.h>
 #include <sys/smp.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sysctl.h>
 #include <sys/unistd.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/pfil.h>
 
 #include <netinet/in.h>
 #include <netinet/in_kdtrace.h>
 #include <netinet/in_pcb.h>
 #include <netinet/in_systm.h>
 #include <netinet/in_var.h>
 #include <netinet/ip.h>
 #include <netinet/ip_var.h>
 #include <netinet/tcp_var.h>
 
 #ifdef SIFTR_IPV6
 #include <netinet/ip6.h>
 #include <netinet6/ip6_var.h>
 #include <netinet6/in6_pcb.h>
 #endif /* SIFTR_IPV6 */
 
 #include <machine/in_cksum.h>
 
 /*
  * Three digit version number refers to X.Y.Z where:
  * X is the major version number
  * Y is bumped to mark backwards incompatible changes
  * Z is bumped to mark backwards compatible changes
  */
 #define V_MAJOR		1
 #define V_BACKBREAK	2
 #define V_BACKCOMPAT	4
 /*
  * Numeric form built by pasting the three components together.
  * NOTE(review): presumably 124 for 1.2.4; this relies on __CONCAT()
  * expanding its arguments before pasting -- confirm in sys/cdefs.h.
  */
 #define MODVERSION	__CONCAT(V_MAJOR, __CONCAT(V_BACKBREAK, V_BACKCOMPAT))
 /* Dotted string form assembled from the stringified components. */
 #define MODVERSION_STR	__XSTRING(V_MAJOR) "." __XSTRING(V_BACKBREAK) "." \
     __XSTRING(V_BACKCOMPAT)
 
 /* NOTE(review): presumably selectors for installing/removing the pfil hooks; the user is outside this chunk. */
 #define HOOK 0
 #define UNHOOK 1
 /* NOTE(review): presumably sizes the flow counter hash table -- confirm at the hashinit call site. */
 #define SIFTR_EXPECTED_MAX_TCP_FLOWS 65536
 #define SYS_NAME "FreeBSD"
 /* mbuf tag type and cookie that siftr_chkreinject() uses to recognise packets it has already seen. */
 #define PACKET_TAG_SIFTR 100
 #define PACKET_COOKIE_SIFTR 21749576
 /* Octal permission bits (rw-r--r--). */
 #define SIFTR_LOG_FILE_MODE 0644
 #define SIFTR_DISABLE 0
 #define SIFTR_ENABLE 1
 
 /*
  * Hard upper limit on the length of log messages. Bump this up if you add new
  * data fields such that the line length could exceed the below value.
  */
 #define MAX_LOG_MSG_LEN 300
 /* XXX: Make this a sysctl tunable. */
 #define SIFTR_ALQ_BUFLEN (1000*MAX_LOG_MSG_LEN)
 
 /*
  * 1 byte for IP version
  * IPv4: src/dst IP (4+4) + src/dst port (2+2) = 12 bytes
  * IPv6: src/dst IP (16+16) + src/dst port (2+2) = 36 bytes
  */
 /* Size in bytes of the key array built in siftr_process_pkt(). */
 #ifdef SIFTR_IPV6
 #define FLOW_KEY_LEN 37
 #else
 #define FLOW_KEY_LEN 13
 #endif
 
 /* 6 when the module is built with SIFTR_IPV6, 4 otherwise. */
 #ifdef SIFTR_IPV6
 #define SIFTR_IPMODE 6
 #else
 #define SIFTR_IPMODE 4
 #endif
 
 /*
  * Field extraction helpers for a 32-bit value: the two 16-bit halves and
  * the four octets, most significant first.  Each one shifts the wanted
  * field down to bit 0 and then masks it.
  */
 #define UPPER_SHORT(X)	(((X) >> 16) & 0xFFFFU)
 #define LOWER_SHORT(X)	((X) & 0xFFFF)
 
 #define FIRST_OCTET(X)	(((X) >> 24) & 0xFFU)
 #define SECOND_OCTET(X)	(((X) >> 16) & 0xFF)
 #define THIRD_OCTET(X)	(((X) >> 8) & 0xFF)
 #define FOURTH_OCTET(X)	((X) & 0xFF)
 
 /* malloc(9) types used by this module. */
 static MALLOC_DEFINE(M_SIFTR, "siftr", "dynamic memory used by SIFTR");
 /* Type under which the manager thread frees pkt_node records. */
 static MALLOC_DEFINE(M_SIFTR_PKTNODE, "siftr_pktnode",
     "SIFTR pkt_node struct");
 /* Type of the flow_hash_node entries allocated in siftr_process_pkt(). */
 static MALLOC_DEFINE(M_SIFTR_HASHNODE, "siftr_hashnode",
     "SIFTR flow_hash_node struct");
 
 /*
  * Snapshot of one packet and of the state of its TCP connection.
  * Used as links in the pkt manager queue: records sit on pkt_queue until
  * the manager thread passes them to siftr_process_pkt() and frees them.
  */
 struct pkt_node {
 	/* Timestamp of pkt as noted in the pfil hook. */
 	struct timeval		tval;
 	/* Direction pkt is travelling; also indexes the direction[] log characters. */
 	enum {
 		DIR_IN = 0,
 		DIR_OUT = 1,
 	}			direction;
 	/* IP version pkt_node relates to; either INP_IPV4 or INP_IPV6. */
 	uint8_t			ipver;
 	/* Hash of the pkt which triggered the log message. */
 	uint32_t		hash;
 	/*
 	 * Local/foreign IP address.
 	 * With SIFTR_IPV6 each address is four 32-bit words and an IPv4
 	 * address occupies word [3] only (see siftr_siftdata()).  The words
 	 * hold the values copied from the inpcb until siftr_process_pkt()
 	 * converts them with ntohl() for printing.  Without SIFTR_IPV6 the
 	 * address is stored as four octets.
 	 */
 #ifdef SIFTR_IPV6
 	uint32_t		ip_laddr[4];
 	uint32_t		ip_faddr[4];
 #else
 	uint8_t			ip_laddr[4];
 	uint8_t			ip_faddr[4];
 #endif
 	/* Local TCP port (converted with ntohs() when logged). */
 	uint16_t		tcp_localport;
 	/* Foreign TCP port (converted with ntohs() when logged). */
 	uint16_t		tcp_foreignport;
 	/* Congestion Window (bytes). */
 	uint32_t		snd_cwnd;
 	/* Sending Window (bytes). */
 	uint32_t		snd_wnd;
 	/* Receive Window (bytes). */
 	uint32_t		rcv_wnd;
 	/* More tcpcb flags storage */
 	uint32_t		t_flags2;
 	/* Slow Start Threshold (bytes). */
 	uint32_t		snd_ssthresh;
 	/* Current state of the TCP FSM. */
 	int			conn_state;
 	/* Max Segment Size (bytes). */
 	u_int			max_seg_size;
 	/*
 	 * Smoothed RTT stored as found in the TCP control block
 	 * in units of (TCP_RTT_SCALE*hz).
 	 */
 	int			smoothed_rtt;
 	/* Is SACK enabled? */
 	u_char			sack_enabled;
 	/* Window scaling for snd window. */
 	u_char			snd_scale;
 	/* Window scaling for recv window. */
 	u_char			rcv_scale;
 	/* TCP control block flags. */
 	u_int			flags;
 	/* Retransmit timeout length. */
 	int			rxt_length;
 	/* Size of the TCP send buffer in bytes. */
 	u_int			snd_buf_hiwater;
 	/* Current num bytes in the send socket buffer. */
 	u_int			snd_buf_cc;
 	/* Size of the TCP receive buffer in bytes. */
 	u_int			rcv_buf_hiwater;
 	/* Current num bytes in the receive socket buffer. */
 	u_int			rcv_buf_cc;
 	/* Number of bytes inflight that we are waiting on ACKs for. */
 	u_int			sent_inflight_bytes;
 	/* Number of segments currently in the reassembly queue. */
 	int			t_segqlen;
 	/* Flowid for the connection. */
 	u_int			flowid;
 	/* Flow type for the connection. */
 	u_int			flowtype;
 	/* Link to next pkt_node in the list. */
 	STAILQ_ENTRY(pkt_node)	nodes;
 };
 
 /*
  * Per-flow entry of the counter_hash table; created by siftr_process_pkt()
  * the first time a flow key is seen.
  */
 struct flow_hash_node
 {
 	/* Packets seen for the flow, kept modulo siftr_pkts_per_log. */
 	uint16_t counter;
 	/* Flow key: IP version, local addr, local port, foreign addr, foreign port. */
 	uint8_t key[FLOW_KEY_LEN];
 	/* Link to the other entries in the same hash bucket. */
 	LIST_ENTRY(flow_hash_node) nodes;
 };
 
 /*
  * Packet counters, split by direction.  One instance per CPU is created
  * by the DPCPU_DEFINE_STATIC() that follows the structure.
  */
 struct siftr_stats
 {
 	/* # TCP pkts seen by the SIFTR PFIL hooks, including any skipped. */
 	uint64_t n_in;
 	uint64_t n_out;
 	/* # pkts skipped due to failed malloc calls. */
 	uint32_t nskip_in_malloc;
 	uint32_t nskip_out_malloc;
 	/* # pkts skipped due to failed mtx acquisition. */
 	uint32_t nskip_in_mtx;
 	uint32_t nskip_out_mtx;
 	/* # pkts skipped due to failed inpcb lookups. */
 	uint32_t nskip_in_inpcb;
 	uint32_t nskip_out_inpcb;
 	/* # pkts skipped due to failed tcpcb lookups. */
 	uint32_t nskip_in_tcpcb;
 	uint32_t nskip_out_tcpcb;
 	/* # pkts skipped due to stack reinjection. */
 	uint32_t nskip_in_dejavu;
 	uint32_t nskip_out_dejavu;
 };
 
 /* Per-CPU instance; obtained with DPCPU_PTR(ss). */
 DPCPU_DEFINE_STATIC(struct siftr_stats, ss);
 
 /* Polled by the pkt manager thread; once non-zero the thread drains and exits. */
 static volatile unsigned int siftr_exit_pkt_manager_thread = 0;
 /* Backing variable of the net.inet.siftr.enabled sysctl. */
 static unsigned int siftr_enabled = 0;
 /* net.inet.siftr.ppl: a known flow is logged once every this many packets. */
 static unsigned int siftr_pkts_per_log = 1;
 /* net.inet.siftr.genhashes. */
 static unsigned int siftr_generate_hashes = 0;
 /* net.inet.siftr.port_filter. */
 static uint16_t     siftr_port_filter = 0;
 /* static unsigned int siftr_binary_log = 0; */
 /* NOTE(review): presumably the path of the log currently in use -- confirm in the logfile sysctl handler. */
 static char siftr_logfile[PATH_MAX] = "/var/log/siftr.log";
 /* Buffer exposed through the net.inet.siftr.logfile sysctl. */
 static char siftr_logfile_shadow[PATH_MAX] = "/var/log/siftr.log";
 /* ANDed with the flow key hash to select a counter_hash bucket. */
 static u_long siftr_hashmask;
 /* Records waiting for the manager thread; accessed under siftr_pkt_queue_mtx. */
 STAILQ_HEAD(pkthead, pkt_node) pkt_queue = STAILQ_HEAD_INITIALIZER(pkt_queue);
 /* Array of flow_hash_node lists, indexed by (flow key hash & siftr_hashmask). */
 LIST_HEAD(listhead, flow_hash_node) *counter_hash;
 /* Only its address is used: it is the manager thread's sleep channel. */
 static int wait_for_pkt;
 /* ALQ handle that log messages are written through. */
 static struct alq *siftr_alq = NULL;
 static struct mtx siftr_pkt_queue_mtx;
 static struct mtx siftr_pkt_mgr_mtx;
 static struct thread *siftr_pkt_manager_thr = NULL;
 /* Log character for each pkt_node direction: DIR_IN -> 'i', DIR_OUT -> 'o'. */
 static char direction[2] = {'i','o'};
 
 /* Required function prototypes. */
 static int siftr_sysctl_enabled_handler(SYSCTL_HANDLER_ARGS);
 static int siftr_sysctl_logfile_name_handler(SYSCTL_HANDLER_ARGS);
 
 /* Declare the net.inet.siftr sysctl tree and populate it. */
 
 SYSCTL_DECL(_net_inet_siftr);
 
 SYSCTL_NODE(_net_inet, OID_AUTO, siftr, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
     "siftr related settings");
 
 SYSCTL_PROC(_net_inet_siftr, OID_AUTO, enabled,
     CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
     &siftr_enabled, 0, &siftr_sysctl_enabled_handler, "IU",
     "switch siftr module operations on/off");
 
 SYSCTL_PROC(_net_inet_siftr, OID_AUTO, logfile,
     CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_NEEDGIANT, &siftr_logfile_shadow,
     sizeof(siftr_logfile_shadow), &siftr_sysctl_logfile_name_handler, "A",
     "file to save siftr log messages to");
 
 SYSCTL_UINT(_net_inet_siftr, OID_AUTO, ppl, CTLFLAG_RW,
     &siftr_pkts_per_log, 1,
     "number of packets between generating a log message");
 
 SYSCTL_UINT(_net_inet_siftr, OID_AUTO, genhashes, CTLFLAG_RW,
     &siftr_generate_hashes, 0,
     "enable packet hash generation");
 
 SYSCTL_U16(_net_inet_siftr, OID_AUTO, port_filter, CTLFLAG_RW,
     &siftr_port_filter, 0,
     "enable packet filter on a TCP port");
 
 /* XXX: TODO
 SYSCTL_UINT(_net_inet_siftr, OID_AUTO, binary, CTLFLAG_RW,
     &siftr_binary_log, 0,
     "write log files in binary instead of ascii");
 */
 
 /* Begin functions. */
 
 static void
 siftr_process_pkt(struct pkt_node * pkt_node)
 {
 	struct flow_hash_node *hash_node;
 	struct listhead *counter_list;
 	struct siftr_stats *ss;
 	struct ale *log_buf;
 	uint8_t key[FLOW_KEY_LEN];
 	uint8_t found_match, key_offset;
 
 	hash_node = NULL;
 	ss = DPCPU_PTR(ss);
 	found_match = 0;
 	key_offset = 1;
 
 	/*
 	 * Create the key that will be used to create a hash index
 	 * into our hash table. Our key consists of:
 	 * ipversion, localip, localport, foreignip, foreignport
 	 */
 	key[0] = pkt_node->ipver;
 	memcpy(key + key_offset, &pkt_node->ip_laddr,
 	    sizeof(pkt_node->ip_laddr));
 	key_offset += sizeof(pkt_node->ip_laddr);
 	memcpy(key + key_offset, &pkt_node->tcp_localport,
 	    sizeof(pkt_node->tcp_localport));
 	key_offset += sizeof(pkt_node->tcp_localport);
 	memcpy(key + key_offset, &pkt_node->ip_faddr,
 	    sizeof(pkt_node->ip_faddr));
 	key_offset += sizeof(pkt_node->ip_faddr);
 	memcpy(key + key_offset, &pkt_node->tcp_foreignport,
 	    sizeof(pkt_node->tcp_foreignport));
 
 	counter_list = counter_hash +
 	    (hash32_buf(key, sizeof(key), 0) & siftr_hashmask);
 
 	/*
 	 * If the list is not empty i.e. the hash index has
 	 * been used by another flow previously.
 	 */
 	if (LIST_FIRST(counter_list) != NULL) {
 		/*
 		 * Loop through the hash nodes in the list.
 		 * There should normally only be 1 hash node in the list,
 		 * except if there have been collisions at the hash index
 		 * computed by hash32_buf().
 		 */
 		LIST_FOREACH(hash_node, counter_list, nodes) {
 			/*
 			 * Check if the key for the pkt we are currently
 			 * processing is the same as the key stored in the
 			 * hash node we are currently processing.
 			 * If they are the same, then we've found the
 			 * hash node that stores the counter for the flow
 			 * the pkt belongs to.
 			 */
 			if (memcmp(hash_node->key, key, sizeof(key)) == 0) {
 				found_match = 1;
 				break;
 			}
 		}
 	}
 
 	/* If this flow hash hasn't been seen before or we have a collision. */
 	if (hash_node == NULL || !found_match) {
 		/* Create a new hash node to store the flow's counter. */
 		hash_node = malloc(sizeof(struct flow_hash_node),
 		    M_SIFTR_HASHNODE, M_WAITOK);
 
 		if (hash_node != NULL) {
 			/* Initialise our new hash node list entry. */
 			hash_node->counter = 0;
 			memcpy(hash_node->key, key, sizeof(key));
 			LIST_INSERT_HEAD(counter_list, hash_node, nodes);
 		} else {
 			/* Malloc failed. */
 			if (pkt_node->direction == DIR_IN)
 				ss->nskip_in_malloc++;
 			else
 				ss->nskip_out_malloc++;
 
 			return;
 		}
 	} else if (siftr_pkts_per_log > 1) {
 		/*
 		 * Taking the remainder of the counter divided
 		 * by the current value of siftr_pkts_per_log
 		 * and storing that in counter provides a neat
 		 * way to modulate the frequency of log
 		 * messages being written to the log file.
 		 */
 		hash_node->counter = (hash_node->counter + 1) %
 		    siftr_pkts_per_log;
 
 		/*
 		 * If we have not seen enough packets since the last time
 		 * we wrote a log message for this connection, return.
 		 */
 		if (hash_node->counter > 0)
 			return;
 	}
 
 	log_buf = alq_getn(siftr_alq, MAX_LOG_MSG_LEN, ALQ_WAITOK);
 
 	if (log_buf == NULL)
 		return; /* Should only happen if the ALQ is shutting down. */
 
 #ifdef SIFTR_IPV6
 	pkt_node->ip_laddr[3] = ntohl(pkt_node->ip_laddr[3]);
 	pkt_node->ip_faddr[3] = ntohl(pkt_node->ip_faddr[3]);
 
 	if (pkt_node->ipver == INP_IPV6) { /* IPv6 packet */
 		pkt_node->ip_laddr[0] = ntohl(pkt_node->ip_laddr[0]);
 		pkt_node->ip_laddr[1] = ntohl(pkt_node->ip_laddr[1]);
 		pkt_node->ip_laddr[2] = ntohl(pkt_node->ip_laddr[2]);
 		pkt_node->ip_faddr[0] = ntohl(pkt_node->ip_faddr[0]);
 		pkt_node->ip_faddr[1] = ntohl(pkt_node->ip_faddr[1]);
 		pkt_node->ip_faddr[2] = ntohl(pkt_node->ip_faddr[2]);
 
 		/* Construct an IPv6 log message. */
 		log_buf->ae_bytesused = snprintf(log_buf->ae_data,
 		    MAX_LOG_MSG_LEN,
 		    "%c,0x%08x,%zd.%06ld,%x:%x:%x:%x:%x:%x:%x:%x,%u,%x:%x:%x:"
 		    "%x:%x:%x:%x:%x,%u,%u,%u,%u,%u,%u,%u,%u,%u,%u,%u,%u,"
 		    "%u,%d,%u,%u,%u,%u,%u,%u,%u,%u\n",
 		    direction[pkt_node->direction],
 		    pkt_node->hash,
 		    pkt_node->tval.tv_sec,
 		    pkt_node->tval.tv_usec,
 		    UPPER_SHORT(pkt_node->ip_laddr[0]),
 		    LOWER_SHORT(pkt_node->ip_laddr[0]),
 		    UPPER_SHORT(pkt_node->ip_laddr[1]),
 		    LOWER_SHORT(pkt_node->ip_laddr[1]),
 		    UPPER_SHORT(pkt_node->ip_laddr[2]),
 		    LOWER_SHORT(pkt_node->ip_laddr[2]),
 		    UPPER_SHORT(pkt_node->ip_laddr[3]),
 		    LOWER_SHORT(pkt_node->ip_laddr[3]),
 		    ntohs(pkt_node->tcp_localport),
 		    UPPER_SHORT(pkt_node->ip_faddr[0]),
 		    LOWER_SHORT(pkt_node->ip_faddr[0]),
 		    UPPER_SHORT(pkt_node->ip_faddr[1]),
 		    LOWER_SHORT(pkt_node->ip_faddr[1]),
 		    UPPER_SHORT(pkt_node->ip_faddr[2]),
 		    LOWER_SHORT(pkt_node->ip_faddr[2]),
 		    UPPER_SHORT(pkt_node->ip_faddr[3]),
 		    LOWER_SHORT(pkt_node->ip_faddr[3]),
 		    ntohs(pkt_node->tcp_foreignport),
 		    pkt_node->snd_ssthresh,
 		    pkt_node->snd_cwnd,
 		    pkt_node->t_flags2,
 		    pkt_node->snd_wnd,
 		    pkt_node->rcv_wnd,
 		    pkt_node->snd_scale,
 		    pkt_node->rcv_scale,
 		    pkt_node->conn_state,
 		    pkt_node->max_seg_size,
 		    pkt_node->smoothed_rtt,
 		    pkt_node->sack_enabled,
 		    pkt_node->flags,
 		    pkt_node->rxt_length,
 		    pkt_node->snd_buf_hiwater,
 		    pkt_node->snd_buf_cc,
 		    pkt_node->rcv_buf_hiwater,
 		    pkt_node->rcv_buf_cc,
 		    pkt_node->sent_inflight_bytes,
 		    pkt_node->t_segqlen,
 		    pkt_node->flowid,
 		    pkt_node->flowtype);
 	} else { /* IPv4 packet */
 		pkt_node->ip_laddr[0] = FIRST_OCTET(pkt_node->ip_laddr[3]);
 		pkt_node->ip_laddr[1] = SECOND_OCTET(pkt_node->ip_laddr[3]);
 		pkt_node->ip_laddr[2] = THIRD_OCTET(pkt_node->ip_laddr[3]);
 		pkt_node->ip_laddr[3] = FOURTH_OCTET(pkt_node->ip_laddr[3]);
 		pkt_node->ip_faddr[0] = FIRST_OCTET(pkt_node->ip_faddr[3]);
 		pkt_node->ip_faddr[1] = SECOND_OCTET(pkt_node->ip_faddr[3]);
 		pkt_node->ip_faddr[2] = THIRD_OCTET(pkt_node->ip_faddr[3]);
 		pkt_node->ip_faddr[3] = FOURTH_OCTET(pkt_node->ip_faddr[3]);
 #endif /* SIFTR_IPV6 */
 
 		/* Construct an IPv4 log message. */
 		log_buf->ae_bytesused = snprintf(log_buf->ae_data,
 		    MAX_LOG_MSG_LEN,
 		    "%c,0x%08x,%jd.%06ld,%u.%u.%u.%u,%u,%u.%u.%u.%u,%u,%u,%u,"
 		    "%u,%u,%u,%u,%u,%u,%u,%u,%u,%u,%d,%u,%u,%u,%u,%u,%u,%u,%u\n",
 		    direction[pkt_node->direction],
 		    pkt_node->hash,
 		    (intmax_t)pkt_node->tval.tv_sec,
 		    pkt_node->tval.tv_usec,
 		    pkt_node->ip_laddr[0],
 		    pkt_node->ip_laddr[1],
 		    pkt_node->ip_laddr[2],
 		    pkt_node->ip_laddr[3],
 		    ntohs(pkt_node->tcp_localport),
 		    pkt_node->ip_faddr[0],
 		    pkt_node->ip_faddr[1],
 		    pkt_node->ip_faddr[2],
 		    pkt_node->ip_faddr[3],
 		    ntohs(pkt_node->tcp_foreignport),
 		    pkt_node->snd_ssthresh,
 		    pkt_node->snd_cwnd,
 		    pkt_node->t_flags2,
 		    pkt_node->snd_wnd,
 		    pkt_node->rcv_wnd,
 		    pkt_node->snd_scale,
 		    pkt_node->rcv_scale,
 		    pkt_node->conn_state,
 		    pkt_node->max_seg_size,
 		    pkt_node->smoothed_rtt,
 		    pkt_node->sack_enabled,
 		    pkt_node->flags,
 		    pkt_node->rxt_length,
 		    pkt_node->snd_buf_hiwater,
 		    pkt_node->snd_buf_cc,
 		    pkt_node->rcv_buf_hiwater,
 		    pkt_node->rcv_buf_cc,
 		    pkt_node->sent_inflight_bytes,
 		    pkt_node->t_segqlen,
 		    pkt_node->flowid,
 		    pkt_node->flowtype);
 #ifdef SIFTR_IPV6
 	}
 #endif
 
 	alq_post_flags(siftr_alq, log_buf, 0);
 }
 
 static void
 siftr_pkt_manager_thread(void *arg)
 {
 	STAILQ_HEAD(pkthead, pkt_node) tmp_pkt_queue =
 	    STAILQ_HEAD_INITIALIZER(tmp_pkt_queue);
 	struct pkt_node *pkt_node, *pkt_node_temp;
 	uint8_t draining;
 
 	draining = 2;
 
 	mtx_lock(&siftr_pkt_mgr_mtx);
 
 	/* draining == 0 when queue has been flushed and it's safe to exit. */
 	while (draining) {
 		/*
 		 * Sleep until we are signalled to wake because thread has
 		 * been told to exit or until 1 tick has passed.
 		 */
 		mtx_sleep(&wait_for_pkt, &siftr_pkt_mgr_mtx, PWAIT, "pktwait",
 		    1);
 
 		/* Gain exclusive access to the pkt_node queue. */
 		mtx_lock(&siftr_pkt_queue_mtx);
 
 		/*
 		 * Move pkt_queue to tmp_pkt_queue, which leaves
 		 * pkt_queue empty and ready to receive more pkt_nodes.
 		 */
 		STAILQ_CONCAT(&tmp_pkt_queue, &pkt_queue);
 
 		/*
 		 * We've finished making changes to the list. Unlock it
 		 * so the pfil hooks can continue queuing pkt_nodes.
 		 */
 		mtx_unlock(&siftr_pkt_queue_mtx);
 
 		/*
 		 * We can't hold a mutex whilst calling siftr_process_pkt
 		 * because ALQ might sleep waiting for buffer space.
 		 */
 		mtx_unlock(&siftr_pkt_mgr_mtx);
 
 		/* Flush all pkt_nodes to the log file. */
 		STAILQ_FOREACH_SAFE(pkt_node, &tmp_pkt_queue, nodes,
 		    pkt_node_temp) {
 			siftr_process_pkt(pkt_node);
 			STAILQ_REMOVE_HEAD(&tmp_pkt_queue, nodes);
 			free(pkt_node, M_SIFTR_PKTNODE);
 		}
 
 		KASSERT(STAILQ_EMPTY(&tmp_pkt_queue),
 		    ("SIFTR tmp_pkt_queue not empty after flush"));
 
 		mtx_lock(&siftr_pkt_mgr_mtx);
 
 		/*
 		 * If siftr_exit_pkt_manager_thread gets set during the window
 		 * where we are draining the tmp_pkt_queue above, there might
 		 * still be pkts in pkt_queue that need to be drained.
 		 * Allow one further iteration to occur after
 		 * siftr_exit_pkt_manager_thread has been set to ensure
 		 * pkt_queue is completely empty before we kill the thread.
 		 *
 		 * siftr_exit_pkt_manager_thread is set only after the pfil
 		 * hooks have been removed, so only 1 extra iteration
 		 * is needed to drain the queue.
 		 */
 		if (siftr_exit_pkt_manager_thread)
 			draining--;
 	}
 
 	mtx_unlock(&siftr_pkt_mgr_mtx);
 
 	/* Calls wakeup on this thread's struct thread ptr. */
 	kthread_exit();
 }
 
 /*
  * Hash the bytes of an mbuf chain that lie at or beyond 'offset' bytes from
  * the start of the chain, chaining hash32_buf() across the mbufs.  Returns
  * 0 when there is nothing to hash.
  */
 static uint32_t
 hash_pkt(struct mbuf *m, uint32_t offset)
 {
 	uint32_t digest;
 
 	digest = 0;
 
 	/*
 	 * Skip the mbufs that end before the requested offset, reducing
 	 * the offset so it becomes relative to the mbuf we stop at.
 	 */
 	for (; m != NULL && offset > m->m_len; m = m->m_next)
 		offset -= m->m_len;
 
 	/* Only the first mbuf hashed starts part-way in; the rest start at 0. */
 	for (; m != NULL; m = m->m_next, offset = 0) {
 		/* Ensure there is data in the mbuf */
 		if ((m->m_len - offset) > 0)
 			digest = hash32_buf(m->m_data + offset,
 			    m->m_len - offset, digest);
 	}
 
 	return (digest);
 }
 
 /*
  * Detect packets that pass through the SIFTR hooks more than once.
  *
  * If the mbuf already carries the SIFTR tag, the matching "dejavu" counter
  * in ss is bumped.  Otherwise a tag is allocated (M_NOWAIT) and prepended
  * to the mbuf; if that allocation fails, the matching malloc-skip counter
  * is bumped instead.
  *
  * Returns 0 when the mbuf was freshly tagged and should be processed, and
  * 1 when the caller should skip it (already tagged, or tagging failed).
  */
 static inline int
 siftr_chkreinject(struct mbuf *m, int dir, struct siftr_stats *ss)
 {
 	struct m_tag *tag;
 
 	/* Already tagged: this is a reinjected packet. */
 	if (m_tag_locate(m, PACKET_COOKIE_SIFTR, PACKET_TAG_SIFTR, NULL)
 	    != NULL) {
 		if (dir == PFIL_IN)
 			ss->nskip_in_dejavu++;
 		else
 			ss->nskip_out_dejavu++;
 
 		return (1);
 	}
 
 	/* First sighting: mark the mbuf so a later pass can recognise it. */
 	tag = m_tag_alloc(PACKET_COOKIE_SIFTR, PACKET_TAG_SIFTR, 0, M_NOWAIT);
 	if (tag == NULL) {
 		if (dir == PFIL_IN)
 			ss->nskip_in_malloc++;
 		else
 			ss->nskip_out_malloc++;
 
 		return (1);
 	}
 
 	m_tag_prepend(m, tag);
 
 	return (0);
 }
 
 /*
  * Look up the inpcb for a packet in V_tcbinfo, requesting it read-locked
  * (INPLOOKUP_RLOCKPCB).
  *
  * For an inbound packet the packet's source is passed as the first
  * address/port pair and its destination as the second; for an outbound
  * packet the two pairs are swapped.  When ipver is not INP_IPV4 the
  * header is treated as a struct ip6_hdr and in6_pcblookup() is used if the
  * module was built with SIFTR_IPV6; without SIFTR_IPV6 the result is NULL.
  *
  * Returns the inpcb, or NULL after bumping the matching nskip_*_inpcb
  * counter in ss.
  */
 static inline struct inpcb *
 siftr_findinpcb(int ipver, struct ip *ip, struct mbuf *m, uint16_t sport,
     uint16_t dport, int dir, struct siftr_stats *ss)
 {
 	struct inpcb *inp;
 
 	/*
 	 * NOTE(review): the historical comment here read "We need the
 	 * tcbinfo lock", while the macro name suggests it asserts that the
 	 * write lock is NOT held -- confirm the intended invariant.
 	 */
 	INP_INFO_WUNLOCK_ASSERT(&V_tcbinfo);
 
 	if (dir == PFIL_IN) {
 		if (ipver == INP_IPV4)
 			inp = in_pcblookup(&V_tcbinfo, ip->ip_src, sport,
 			    ip->ip_dst, dport, INPLOOKUP_RLOCKPCB,
 			    m->m_pkthdr.rcvif);
 		else
 #ifdef SIFTR_IPV6
 			inp = in6_pcblookup(&V_tcbinfo,
 			    &((struct ip6_hdr *)ip)->ip6_src, sport,
 			    &((struct ip6_hdr *)ip)->ip6_dst, dport,
 			    INPLOOKUP_RLOCKPCB, m->m_pkthdr.rcvif);
 #else
 			inp = NULL;
 #endif
 	} else {
 		if (ipver == INP_IPV4)
 			inp = in_pcblookup(&V_tcbinfo, ip->ip_dst, dport,
 			    ip->ip_src, sport, INPLOOKUP_RLOCKPCB,
 			    m->m_pkthdr.rcvif);
 		else
 #ifdef SIFTR_IPV6
 			inp = in6_pcblookup(&V_tcbinfo,
 			    &((struct ip6_hdr *)ip)->ip6_dst, dport,
 			    &((struct ip6_hdr *)ip)->ip6_src, sport,
 			    INPLOOKUP_RLOCKPCB, m->m_pkthdr.rcvif);
 #else
 			inp = NULL;
 #endif
 	}
 
 	/* If we can't find the inpcb, account for the skipped packet. */
 	if (inp == NULL) {
 		if (dir == PFIL_IN)
 			ss->nskip_in_inpcb++;
 		else
 			ss->nskip_out_inpcb++;
 	}
 
 	return (inp);
 }
 
 /*
  * Copy a snapshot of the connection state held in inp and tp into pn.
  *
  * pn:   packet node to fill in.
  * inp:  inpcb of the connection; read for addresses, ports, flow info and
  *       socket buffer state.
  * tp:   tcpcb of the connection; read for the TCP state variables.
  * ipver, dir: recorded in pn (dir is mapped to DIR_IN / DIR_OUT).
  * inp_locally_locked: when non-zero, INP_RUNLOCK(inp) is called once all
  *       reads of inp and tp are done, so the caller must not unlock again.
  *
  * The node is timestamped with microtime() and handed to the siftr probe.
  */
 static inline void
 siftr_siftdata(struct pkt_node *pn, struct inpcb *inp, struct tcpcb *tp,
     int ipver, int dir, int inp_locally_locked)
 {
 #ifdef SIFTR_IPV6
 	int w;
 
 	if (ipver == INP_IPV4) {
 		/* An IPv4 address occupies the last word of the 4-word array. */
 		pn->ip_laddr[3] = inp->inp_laddr.s_addr;
 		pn->ip_faddr[3] = inp->inp_faddr.s_addr;
 	} else {
 		for (w = 0; w < 4; w++)
 			pn->ip_laddr[w] = inp->in6p_laddr.s6_addr32[w];
 		for (w = 0; w < 4; w++)
 			pn->ip_faddr[w] = inp->in6p_faddr.s6_addr32[w];
 	}
 #else
 	*((uint32_t *)pn->ip_laddr) = inp->inp_laddr.s_addr;
 	*((uint32_t *)pn->ip_faddr) = inp->inp_faddr.s_addr;
 #endif
 
 	/* Values taken from the inpcb. */
 	pn->tcp_localport = inp->inp_lport;
 	pn->tcp_foreignport = inp->inp_fport;
 	pn->flowid = inp->inp_flowid;
 	pn->flowtype = inp->inp_flowtype;
 
 	/* Values taken from the socket buffers. */
 	pn->snd_buf_hiwater = inp->inp_socket->so_snd.sb_hiwat;
 	pn->snd_buf_cc = sbused(&inp->inp_socket->so_snd);
 	pn->rcv_buf_hiwater = inp->inp_socket->so_rcv.sb_hiwat;
 	pn->rcv_buf_cc = sbused(&inp->inp_socket->so_rcv);
 
 	/* Values taken from the tcpcb. */
 	pn->conn_state = tp->t_state;
 	pn->flags = tp->t_flags;
 	pn->t_flags2 = tp->t_flags2;
 	pn->sack_enabled = (tp->t_flags & TF_SACK_PERMIT) != 0;
 	pn->snd_cwnd = tp->snd_cwnd;
 	pn->snd_ssthresh = tp->snd_ssthresh;
 	pn->snd_wnd = tp->snd_wnd;
 	pn->rcv_wnd = tp->rcv_wnd;
 	pn->snd_scale = tp->snd_scale;
 	pn->rcv_scale = tp->rcv_scale;
 	pn->max_seg_size = tp->t_maxseg;
 	pn->smoothed_rtt = tp->t_srtt;
 	pn->rxt_length = tp->t_rxtcur;
 	pn->sent_inflight_bytes = tp->snd_max - tp->snd_una;
 	pn->t_segqlen = tp->t_segqlen;
 
 	/* Nothing below touches inp or tp, so drop the lock we took. */
 	if (inp_locally_locked)
 		INP_RUNLOCK(inp);
 
 	pn->ipver = ipver;
 	if (dir == PFIL_IN)
 		pn->direction = DIR_IN;
 	else
 		pn->direction = DIR_OUT;
 
 	/*
 	 * microtime() is significantly more accurate than getmicrotime() but
 	 * slower: true microsecond resolution is bought with a hit to the
 	 * maximum pps throughput while SIFTR is loaded and enabled.
 	 */
 	microtime(&pn->tval);
 	TCP_PROBE1(siftr, &pn);
 }
 
 /*
  * pfil hook that is called for each IPv4 packet making its way through the
  * stack in either direction.
  * The pfil subsystem holds a non-sleepable mutex somewhere when
  * calling our hook function, so we can't sleep at all.
  * It's very important to use the M_NOWAIT flag with all function calls
  * that support it so that they won't sleep, otherwise you get a panic.
  *
  * For a TCP packet that has not been seen before, the hook locates the
  * connection's inpcb/tcpcb, samples the connection state into a freshly
  * allocated pkt_node, optionally hashes the packet, and appends the node to
  * pkt_queue. Every reason for not logging a packet is counted in the
  * per-CPU siftr_stats.
  *
  * m:       the packet; only *m is read (and its TCP checksum field/flags may
  *          be rewritten when hashing is enabled).
  * ifp:     not used by this hook.
  * flags:   pfil flags; PFIL_DIR(flags) gives the direction.
  * ruleset: unused.
  * inp:     inpcb supplied by pfil, or NULL, in which case it is looked up
  *          (and locked) here.
  *
  * Always returns PFIL_PASS.
  */
 static pfil_return_t
 siftr_chkpkt(struct mbuf **m, struct ifnet *ifp, int flags,
     void *ruleset __unused, struct inpcb *inp)
 {
 	struct pkt_node *pn;
 	struct ip *ip;
 	struct tcphdr *th;
 	struct tcpcb *tp;
 	struct siftr_stats *ss;
 	unsigned int ip_hl;
 	int inp_locally_locked, dir;
 
 	inp_locally_locked = 0;
 	dir = PFIL_DIR(flags);
 	ss = DPCPU_PTR(ss);
 
 	/*
 	 * m_pullup is not required here because ip_{input|output}
 	 * already do the heavy lifting for us.
 	 */
 
 	ip = mtod(*m, struct ip *);
 
 	/* Only continue processing if the packet is TCP. */
 	if (ip->ip_p != IPPROTO_TCP)
 		goto ret;
 
 	/*
 	 * If a kernel subsystem reinjects packets into the stack, our pfil
 	 * hook will be called multiple times for the same packet.
 	 * Make sure we only process unique packets.
 	 */
 	if (siftr_chkreinject(*m, dir, ss))
 		goto ret;
 
 	if (dir == PFIL_IN)
 		ss->n_in++;
 	else
 		ss->n_out++;
 
 	/*
 	 * Create a tcphdr struct starting at the correct offset
 	 * in the IP packet. ip->ip_hl gives the ip header length
 	 * in 4-byte words, so multiply it to get the size in bytes.
 	 */
 	ip_hl = (ip->ip_hl << 2);
 	th = (struct tcphdr *)((caddr_t)ip + ip_hl);
 
 	/*
 	 * If the pfil hooks don't provide a pointer to the
 	 * inpcb, we need to find it ourselves and lock it.
 	 */
 	if (!inp) {
 		/* Find the corresponding inpcb for this pkt. */
 		inp = siftr_findinpcb(INP_IPV4, ip, *m, th->th_sport,
 		    th->th_dport, dir, ss);
 
 		if (inp == NULL)
 			goto ret;
 		else
 			inp_locally_locked = 1;
 	}
 
 	INP_LOCK_ASSERT(inp);
 
 	/* Find the TCP control block that corresponds with this packet */
 	tp = intotcpcb(inp);
 
 	/*
 	 * If we can't find the TCP control block (happens occasionally for a
-	 * packet sent during the shutdown phase of a TCP connection),
-	 * or we're in the timewait state, bail
+	 * packet sent during the shutdown phase of a TCP connection), bail
 	 */
-	if (tp == NULL || inp->inp_flags & INP_TIMEWAIT) {
+	if (tp == NULL) {
 		if (dir == PFIL_IN)
 			ss->nskip_in_tcpcb++;
 		else
 			ss->nskip_out_tcpcb++;
 
 		goto inp_unlock;
 	}
 
 	/*
 	 * Only pkts selected by the tcp port filter
 	 * can be inserted into the pkt_queue
 	 */
 	if ((siftr_port_filter != 0) &&
 	    (siftr_port_filter != ntohs(inp->inp_lport)) &&
 	    (siftr_port_filter != ntohs(inp->inp_fport))) {
 		goto inp_unlock;
 	}
 
 	pn = malloc(sizeof(struct pkt_node), M_SIFTR_PKTNODE, M_NOWAIT|M_ZERO);
 
 	if (pn == NULL) {
 		if (dir == PFIL_IN)
 			ss->nskip_in_malloc++;
 		else
 			ss->nskip_out_malloc++;
 
 		goto inp_unlock;
 	}
 
 	/*
 	 * Sample the connection state. siftr_siftdata() also drops the inpcb
 	 * lock when inp_locally_locked is set, so inp and tp must not be
 	 * touched after this call.
 	 */
 	siftr_siftdata(pn, inp, tp, INP_IPV4, dir, inp_locally_locked);
 
 	if (siftr_generate_hashes) {
 		if ((*m)->m_pkthdr.csum_flags & CSUM_TCP) {
 			/*
 			 * For outbound packets, the TCP checksum isn't
 			 * calculated yet. This is a problem for our packet
 			 * hashing as the receiver will calc a different hash
 			 * to ours if we don't include the correct TCP checksum
 			 * in the bytes being hashed. To work around this
 			 * problem, we manually calc the TCP checksum here in
 			 * software. We unset the CSUM_TCP flag so the lower
 			 * layers don't recalc it.
 			 */
 			(*m)->m_pkthdr.csum_flags &= ~CSUM_TCP;
 
 			/*
 			 * Calculate the TCP checksum in software and assign
 			 * to correct TCP header field, which will follow the
 			 * packet mbuf down the stack. The trick here is that
 			 * tcp_output() sets th->th_sum to the checksum of the
 			 * pseudo header for us already. Because of the nature
 			 * of the checksumming algorithm, we can sum over the
 			 * entire IP payload (i.e. TCP header and data), which
 			 * will include the already calculated pseudo header
 			 * checksum, thus giving us the complete TCP checksum.
 			 *
 			 * To put it in simple terms, if checksum(1,2,3,4)=10,
 			 * then checksum(1,2,3,4,5) == checksum(10,5).
 			 * This property is what allows us to "cheat" and
 			 * checksum only the IP payload which has the TCP
 			 * th_sum field populated with the pseudo header's
 			 * checksum, and not need to futz around checksumming
 			 * pseudo header bytes and TCP header/data in one hit.
 			 * Refer to RFC 1071 for more info.
 			 *
 			 * NB: in_cksum_skip(struct mbuf *m, int len, int skip)
 			 * in_cksum_skip 2nd argument is NOT the number of
 			 * bytes to read from the mbuf at "skip" bytes offset
 			 * from the start of the mbuf (very counter intuitive!).
 			 * The number of bytes to read is calculated internally
 			 * by the function as len-skip i.e. to sum over the IP
 			 * payload (TCP header + data) bytes, it is INCORRECT
 			 * to call the function like this:
 			 * in_cksum_skip(at, ip->ip_len - offset, offset)
 			 * Rather, it should be called like this:
 			 * in_cksum_skip(at, ip->ip_len, offset)
 			 * which means read "ip->ip_len - offset" bytes from
 			 * the mbuf cluster "at" at offset "offset" bytes from
 			 * the beginning of the "at" mbuf's data pointer.
 			 */
 			th->th_sum = in_cksum_skip(*m, ntohs(ip->ip_len),
 			    ip_hl);
 		}
 
 		/*
 		 * XXX: Having to calculate the checksum in software and then
 		 * hash over all bytes is really inefficient. Would be nice to
 		 * find a way to create the hash and checksum in the same pass
 		 * over the bytes.
 		 */
 		pn->hash = hash_pkt(*m, ip_hl);
 	}
 
 	/* Append the finished node to pkt_queue under its mutex. */
 	mtx_lock(&siftr_pkt_queue_mtx);
 	STAILQ_INSERT_TAIL(&pkt_queue, pn, nodes);
 	mtx_unlock(&siftr_pkt_queue_mtx);
 	/* Skip inp_unlock: siftr_siftdata() already released the lock. */
 	goto ret;
 
 inp_unlock:
 	/* Only release the inpcb lock if this hook acquired it. */
 	if (inp_locally_locked)
 		INP_RUNLOCK(inp);
 
 ret:
 	return (PFIL_PASS);
 }
 
 #ifdef SIFTR_IPV6
 /*
  * pfil hook for IPv6 packets; the IPv6 counterpart of siftr_chkpkt().
  *
  * Only packets whose first next-header value is TCP are examined. For such
  * a packet that has not been seen before, the hook locates the connection's
  * inpcb/tcpcb, samples the connection state into a freshly allocated
  * pkt_node and appends it to pkt_queue. No packet hash is generated for
  * IPv6. All allocations use M_NOWAIT.
  *
  * m:       the packet; only *m is read.
  * ifp:     not used by this hook.
  * flags:   pfil flags; PFIL_DIR(flags) gives the direction.
  * ruleset: unused.
  * inp:     inpcb supplied by pfil, or NULL, in which case it is looked up
  *          (and locked) here.
  *
  * Always returns PFIL_PASS.
  */
 static pfil_return_t
 siftr_chkpkt6(struct mbuf **m, struct ifnet *ifp, int flags,
     void *ruleset __unused, struct inpcb *inp)
 {
 	struct pkt_node *pn;
 	struct ip6_hdr *ip6;
 	struct tcphdr *th;
 	struct tcpcb *tp;
 	struct siftr_stats *ss;
 	unsigned int ip6_hl;
 	int inp_locally_locked, dir;
 
 	inp_locally_locked = 0;
 	dir = PFIL_DIR(flags);
 	ss = DPCPU_PTR(ss);
 
 	/*
 	 * m_pullup is not required here because ip6_{input|output}
 	 * already do the heavy lifting for us.
 	 */
 
 	ip6 = mtod(*m, struct ip6_hdr *);
 
 	/*
 	 * Only continue processing if the packet is TCP
 	 * XXX: We should follow the next header fields
 	 * as shown on Pg 6 RFC 2460, but right now we'll
 	 * only check pkts that have no extension headers.
 	 */
 	if (ip6->ip6_nxt != IPPROTO_TCP)
 		goto ret6;
 
 	/*
 	 * If a kernel subsystem reinjects packets into the stack, our pfil
 	 * hook will be called multiple times for the same packet.
 	 * Make sure we only process unique packets.
 	 */
 	if (siftr_chkreinject(*m, dir, ss))
 		goto ret6;
 
 	if (dir == PFIL_IN)
 		ss->n_in++;
 	else
 		ss->n_out++;
 
 	ip6_hl = sizeof(struct ip6_hdr);
 
 	/*
 	 * Create a tcphdr struct starting at the correct offset
 	 * in the ipv6 packet. Packets with extension headers were
 	 * filtered out above, so the TCP header directly follows the
 	 * fixed-size IPv6 header.
 	 */
 	th = (struct tcphdr *)((caddr_t)ip6 + ip6_hl);
 
 	/*
 	 * For inbound packets, the pfil hooks don't provide a pointer to the
 	 * inpcb, so we need to find it ourselves and lock it.
 	 */
 	if (!inp) {
 		/* Find the corresponding inpcb for this pkt. */
 		inp = siftr_findinpcb(INP_IPV6, (struct ip *)ip6, *m,
 		    th->th_sport, th->th_dport, dir, ss);
 
 		if (inp == NULL)
 			goto ret6;
 		else
 			inp_locally_locked = 1;
 	}
 
 	/* Find the TCP control block that corresponds with this packet. */
 	tp = intotcpcb(inp);
 
 	/*
 	 * If we can't find the TCP control block (happens occasionally for a
-	 * packet sent during the shutdown phase of a TCP connection),
-	 * or we're in the timewait state, bail.
+	 * packet sent during the shutdown phase of a TCP connection), bail
 	 */
-	if (tp == NULL || inp->inp_flags & INP_TIMEWAIT) {
+	if (tp == NULL) {
 		if (dir == PFIL_IN)
 			ss->nskip_in_tcpcb++;
 		else
 			ss->nskip_out_tcpcb++;
 
 		goto inp_unlock6;
 	}
 
 	/*
 	 * Only pkts selected by the tcp port filter
 	 * can be inserted into the pkt_queue
 	 */
 	if ((siftr_port_filter != 0) &&
 	    (siftr_port_filter != ntohs(inp->inp_lport)) &&
 	    (siftr_port_filter != ntohs(inp->inp_fport))) {
 		goto inp_unlock6;
 	}
 
 	pn = malloc(sizeof(struct pkt_node), M_SIFTR_PKTNODE, M_NOWAIT|M_ZERO);
 
 	if (pn == NULL) {
 		if (dir == PFIL_IN)
 			ss->nskip_in_malloc++;
 		else
 			ss->nskip_out_malloc++;
 
 		goto inp_unlock6;
 	}
 
 	/*
 	 * Sample the connection state. siftr_siftdata() also drops the inpcb
 	 * lock when inp_locally_locked is set, so inp and tp must not be
 	 * touched after this call.
 	 */
 	siftr_siftdata(pn, inp, tp, INP_IPV6, dir, inp_locally_locked);
 
 	/* XXX: Figure out how to generate hashes for IPv6 packets. */
 
 	/* Append the finished node to pkt_queue under its mutex. */
 	mtx_lock(&siftr_pkt_queue_mtx);
 	STAILQ_INSERT_TAIL(&pkt_queue, pn, nodes);
 	mtx_unlock(&siftr_pkt_queue_mtx);
 	/* Skip inp_unlock6: siftr_siftdata() already released the lock. */
 	goto ret6;
 
 inp_unlock6:
 	/* Only release the inpcb lock if this hook acquired it. */
 	if (inp_locally_locked)
 		INP_RUNLOCK(inp);
 
 ret6:
 	return (PFIL_PASS);
 }
 #endif /* #ifdef SIFTR_IPV6 */
 
 /*
  * Handles for the pfil hooks that siftr_pfil() obtains from pfil_add_hook()
  * and later hands to pfil_remove_hook(). They are accessed through the V_
  * macros while siftr_pfil() has a vnet selected with CURVNET_SET().
  */
 VNET_DEFINE_STATIC(pfil_hook_t, siftr_inet_hook);
 #define	V_siftr_inet_hook	VNET(siftr_inet_hook)
 #ifdef SIFTR_IPV6
 /* IPv6 counterpart; only present when SIFTR_IPV6 is defined. */
 VNET_DEFINE_STATIC(pfil_hook_t, siftr_inet6_hook);
 #define	V_siftr_inet6_hook	VNET(siftr_inet6_hook)
 #endif
 /*
  * Install (action == HOOK) or remove (action == UNHOOK) the SIFTR packet
  * hooks in every vnet on the vnet list. Any other action walks the list
  * without changing anything.
  *
  * On HOOK, siftr_chkpkt is registered for PFIL_TYPE_IP4 and linked to
  * V_inet_pfil_head; with SIFTR_IPV6, siftr_chkpkt6 is likewise registered
  * for PFIL_TYPE_IP6 and linked to V_inet6_pfil_head. Both directions
  * (PFIL_IN | PFIL_OUT) are requested.
  *
  * Always returns 0; the result of pfil_link() is deliberately discarded.
  */
 static int
 siftr_pfil(int action)
 {
 	struct pfil_link_args pla;
 	struct pfil_hook_args pha;
 	VNET_ITERATOR_DECL(vnet_iter);
 
 	/* Link request; pa_head and pa_hook are filled in per family. */
 	pla.pa_version = PFIL_VERSION;
 	pla.pa_flags = PFIL_IN | PFIL_OUT |
 	    PFIL_HEADPTR | PFIL_HOOKPTR;
 
 	/* Hook description; pa_func and pa_type are filled in per family. */
 	pha.pa_version = PFIL_VERSION;
 	pha.pa_flags = PFIL_IN | PFIL_OUT;
 	pha.pa_modname = "siftr";
 	pha.pa_rulname = "default";
 	pha.pa_ruleset = NULL;
 
 	VNET_LIST_RLOCK();
 	VNET_FOREACH(vnet_iter) {
 		CURVNET_SET(vnet_iter);
 
 		if (action == HOOK) {
 			/* IPv4 hook. */
 			pha.pa_type = PFIL_TYPE_IP4;
 			pha.pa_func = siftr_chkpkt;
 			V_siftr_inet_hook = pfil_add_hook(&pha);
 			pla.pa_head = V_inet_pfil_head;
 			pla.pa_hook = V_siftr_inet_hook;
 			(void)pfil_link(&pla);
 #ifdef SIFTR_IPV6
 			/* IPv6 hook. */
 			pha.pa_type = PFIL_TYPE_IP6;
 			pha.pa_func = siftr_chkpkt6;
 			V_siftr_inet6_hook = pfil_add_hook(&pha);
 			pla.pa_head = V_inet6_pfil_head;
 			pla.pa_hook = V_siftr_inet6_hook;
 			(void)pfil_link(&pla);
 #endif
 		} else if (action == UNHOOK) {
 			pfil_remove_hook(V_siftr_inet_hook);
 #ifdef SIFTR_IPV6
 			pfil_remove_hook(V_siftr_inet6_hook);
 #endif
 		}
 
 		CURVNET_RESTORE();
 	}
 	VNET_LIST_RUNLOCK();
 
 	return (0);
 }
 
 /*
  * sysctl handler for the SIFTR log file name.
  *
  * The string is first processed by sysctl_handle_string() on arg1/arg2.
  * When a new value was supplied and it differs from siftr_logfile, the new
  * file is opened with alq_open() to prove that it is usable. If SIFTR is
  * currently logging (siftr_alq != NULL) the old alq is closed and replaced
  * by the new one; otherwise the new alq is simply closed again. Only then
  * is siftr_logfile updated.
  *
  * Returns 0 on success or when nothing changed, otherwise the error from
  * sysctl_handle_string() or alq_open().
  */
 static int
 siftr_sysctl_logfile_name_handler(SYSCTL_HANDLER_ARGS)
 {
 	struct alq *fresh_alq;
 	int rc;
 
 	rc = sysctl_handle_string(oidp, arg1, arg2, req);
 	if (rc != 0)
 		return (rc);
 
 	/* Nothing was written, or the name is unchanged: we are done. */
 	if (req->newptr == NULL ||
 	    strncmp(siftr_logfile, arg1, arg2) == 0)
 		return (0);
 
 	/* The file name changed; make sure the new file can be opened. */
 	rc = alq_open(&fresh_alq, arg1, curthread->td_ucred,
 	    SIFTR_LOG_FILE_MODE, SIFTR_ALQ_BUFLEN, 0);
 	if (rc != 0)
 		return (rc);
 
 	if (siftr_alq != NULL) {
 		/* Logging is active: swap the old alq for the new one. */
 		alq_close(siftr_alq);
 		siftr_alq = fresh_alq;
 	} else {
 		/* Not logging: the open was only a validity check. */
 		alq_close(fresh_alq);
 	}
 
 	/* Remember the new name now that it is known to work. */
 	strlcpy(siftr_logfile, arg1, arg2);
 
 	return (0);
 }
 
 /*
  * Enable or disable SIFTR.
  *
  * SIFTR_ENABLE (only while no pkt manager thread exists): opens the log
  * alq, resets the packet queue and per-CPU stats, starts the pkt manager
  * thread, installs the pfil hooks and writes the "enable" header line.
  *
  * SIFTR_DISABLE (only while a pkt manager thread exists): removes the pfil
  * hooks, asks the pkt manager thread to exit and waits for it, writes the
  * "disable" summary line (totals plus the list of flows seen, freeing the
  * flow hash nodes as it goes) and closes the alq.
  *
  * Returns 0 on success; ENOMEM if the sbuf cannot be allocated; the error
  * from alq_open() or kthread_add() if enabling fails (in which case nothing
  * is left enabled); EINVAL for any other action/state combination.
  */
 static int
 siftr_manage_ops(uint8_t action)
 {
 	struct siftr_stats totalss;
 	struct timeval tval;
 	struct flow_hash_node *counter, *tmp_counter;
 	struct sbuf *s;
 	int i, key_index, error;
 	uint32_t bytes_to_write, total_skipped_pkts;
 	uint16_t lport, fport;
 	uint8_t *key, ipver __unused;
 
 #ifdef SIFTR_IPV6
 	uint32_t laddr[4];
 	uint32_t faddr[4];
 #else
 	uint8_t laddr[4];
 	uint8_t faddr[4];
 #endif
 
 	error = 0;
 	total_skipped_pkts = 0;
 
 	/*
 	 * Init an autosizing sbuf that initially holds 200 chars.
 	 * Report failure with a real errno: -1 is not one, and callers hand
 	 * our return value back to the sysctl machinery.
 	 */
 	if ((s = sbuf_new(NULL, NULL, 200, SBUF_AUTOEXTEND)) == NULL)
 		return (ENOMEM);
 
 	if (action == SIFTR_ENABLE && siftr_pkt_manager_thr == NULL) {
 		/*
 		 * Create our alq. Without it there is nowhere to log to, and
 		 * the writes below would use an invalid alq, so abort.
 		 */
 		error = alq_open(&siftr_alq, siftr_logfile,
 		    curthread->td_ucred, SIFTR_LOG_FILE_MODE,
 		    SIFTR_ALQ_BUFLEN, 0);
 		if (error != 0) {
 			siftr_alq = NULL;
 			goto done;
 		}
 
 		STAILQ_INIT(&pkt_queue);
 
 		DPCPU_ZERO(ss);
 
 		siftr_exit_pkt_manager_thread = 0;
 
 		/*
 		 * Start the pkt manager thread before hooking into pfil; if
 		 * it cannot be started, undo the alq_open() and stay disabled
 		 * rather than queueing packets that nobody will consume.
 		 */
 		error = kthread_add(&siftr_pkt_manager_thread, NULL, NULL,
 		    &siftr_pkt_manager_thr, RFNOWAIT, 0,
 		    "siftr_pkt_manager_thr");
 		if (error != 0) {
 			siftr_pkt_manager_thr = NULL;
 			alq_close(siftr_alq);
 			siftr_alq = NULL;
 			goto done;
 		}
 
 		siftr_pfil(HOOK);
 
 		microtime(&tval);
 
 		sbuf_printf(s,
 		    "enable_time_secs=%jd\tenable_time_usecs=%06ld\t"
 		    "siftrver=%s\thz=%u\ttcp_rtt_scale=%u\tsysname=%s\t"
 		    "sysver=%u\tipmode=%u\n",
 		    (intmax_t)tval.tv_sec, tval.tv_usec, MODVERSION_STR, hz,
 		    TCP_RTT_SCALE, SYS_NAME, __FreeBSD_version, SIFTR_IPMODE);
 
 		sbuf_finish(s);
 		alq_writen(siftr_alq, sbuf_data(s), sbuf_len(s), ALQ_WAITOK);
 
 	} else if (action == SIFTR_DISABLE && siftr_pkt_manager_thr != NULL) {
 		/*
 		 * Remove the pfil hook functions. All threads currently in
 		 * the hook functions are allowed to exit before siftr_pfil()
 		 * returns.
 		 */
 		siftr_pfil(UNHOOK);
 
 		/* This will block until the pkt manager thread unlocks it. */
 		mtx_lock(&siftr_pkt_mgr_mtx);
 
 		/* Tell the pkt manager thread that it should exit now. */
 		siftr_exit_pkt_manager_thread = 1;
 
 		/*
 		 * Wake the pkt_manager thread so it realises that
 		 * siftr_exit_pkt_manager_thread == 1 and exits gracefully.
 		 * The wakeup won't be delivered until we unlock
 		 * siftr_pkt_mgr_mtx so this isn't racy.
 		 */
 		wakeup(&wait_for_pkt);
 
 		/* Wait for the pkt_manager thread to exit. */
 		mtx_sleep(siftr_pkt_manager_thr, &siftr_pkt_mgr_mtx, PWAIT,
 		    "thrwait", 0);
 
 		siftr_pkt_manager_thr = NULL;
 		mtx_unlock(&siftr_pkt_mgr_mtx);
 
 		/* Fold the per-CPU counters into a single set of totals. */
 		totalss.n_in = DPCPU_VARSUM(ss, n_in);
 		totalss.n_out = DPCPU_VARSUM(ss, n_out);
 		totalss.nskip_in_malloc = DPCPU_VARSUM(ss, nskip_in_malloc);
 		totalss.nskip_out_malloc = DPCPU_VARSUM(ss, nskip_out_malloc);
 		totalss.nskip_in_mtx = DPCPU_VARSUM(ss, nskip_in_mtx);
 		totalss.nskip_out_mtx = DPCPU_VARSUM(ss, nskip_out_mtx);
 		totalss.nskip_in_tcpcb = DPCPU_VARSUM(ss, nskip_in_tcpcb);
 		totalss.nskip_out_tcpcb = DPCPU_VARSUM(ss, nskip_out_tcpcb);
 		totalss.nskip_in_inpcb = DPCPU_VARSUM(ss, nskip_in_inpcb);
 		totalss.nskip_out_inpcb = DPCPU_VARSUM(ss, nskip_out_inpcb);
 
 		/*
 		 * NOTE(review): the nskip_*_dejavu counters maintained by
 		 * siftr_chkreinject() are not part of this summary.
 		 */
 		total_skipped_pkts = totalss.nskip_in_malloc +
 		    totalss.nskip_out_malloc + totalss.nskip_in_mtx +
 		    totalss.nskip_out_mtx + totalss.nskip_in_tcpcb +
 		    totalss.nskip_out_tcpcb + totalss.nskip_in_inpcb +
 		    totalss.nskip_out_inpcb;
 
 		microtime(&tval);
 
 		sbuf_printf(s,
 		    "disable_time_secs=%jd\tdisable_time_usecs=%06ld\t"
 		    "num_inbound_tcp_pkts=%ju\tnum_outbound_tcp_pkts=%ju\t"
 		    "total_tcp_pkts=%ju\tnum_inbound_skipped_pkts_malloc=%u\t"
 		    "num_outbound_skipped_pkts_malloc=%u\t"
 		    "num_inbound_skipped_pkts_mtx=%u\t"
 		    "num_outbound_skipped_pkts_mtx=%u\t"
 		    "num_inbound_skipped_pkts_tcpcb=%u\t"
 		    "num_outbound_skipped_pkts_tcpcb=%u\t"
 		    "num_inbound_skipped_pkts_inpcb=%u\t"
 		    "num_outbound_skipped_pkts_inpcb=%u\t"
 		    "total_skipped_tcp_pkts=%u\tflow_list=",
 		    (intmax_t)tval.tv_sec,
 		    tval.tv_usec,
 		    (uintmax_t)totalss.n_in,
 		    (uintmax_t)totalss.n_out,
 		    (uintmax_t)(totalss.n_in + totalss.n_out),
 		    totalss.nskip_in_malloc,
 		    totalss.nskip_out_malloc,
 		    totalss.nskip_in_mtx,
 		    totalss.nskip_out_mtx,
 		    totalss.nskip_in_tcpcb,
 		    totalss.nskip_out_tcpcb,
 		    totalss.nskip_in_inpcb,
 		    totalss.nskip_out_inpcb,
 		    total_skipped_pkts);
 
 		/*
 		 * Iterate over the flow hash, printing a summary of each
 		 * flow seen and freeing any malloc'd memory.
 		 * The hash consists of an array of LISTs (man 3 queue).
 		 */
 		for (i = 0; i <= siftr_hashmask; i++) {
 			LIST_FOREACH_SAFE(counter, counter_hash + i, nodes,
 			    tmp_counter) {
 				/*
 				 * Key layout as decoded here: one byte of IP
 				 * version, then local address, local port,
 				 * foreign address and foreign port.
 				 */
 				key = counter->key;
 				key_index = 1;
 
 				ipver = key[0];
 
 				memcpy(laddr, key + key_index, sizeof(laddr));
 				key_index += sizeof(laddr);
 				memcpy(&lport, key + key_index, sizeof(lport));
 				key_index += sizeof(lport);
 				memcpy(faddr, key + key_index, sizeof(faddr));
 				key_index += sizeof(faddr);
 				memcpy(&fport, key + key_index, sizeof(fport));
 
 #ifdef SIFTR_IPV6
 				laddr[3] = ntohl(laddr[3]);
 				faddr[3] = ntohl(faddr[3]);
 
 				if (ipver == INP_IPV6) {
 					laddr[0] = ntohl(laddr[0]);
 					laddr[1] = ntohl(laddr[1]);
 					laddr[2] = ntohl(laddr[2]);
 					faddr[0] = ntohl(faddr[0]);
 					faddr[1] = ntohl(faddr[1]);
 					faddr[2] = ntohl(faddr[2]);
 
 					sbuf_printf(s,
 					    "%x:%x:%x:%x:%x:%x:%x:%x;%u-"
 					    "%x:%x:%x:%x:%x:%x:%x:%x;%u,",
 					    UPPER_SHORT(laddr[0]),
 					    LOWER_SHORT(laddr[0]),
 					    UPPER_SHORT(laddr[1]),
 					    LOWER_SHORT(laddr[1]),
 					    UPPER_SHORT(laddr[2]),
 					    LOWER_SHORT(laddr[2]),
 					    UPPER_SHORT(laddr[3]),
 					    LOWER_SHORT(laddr[3]),
 					    ntohs(lport),
 					    UPPER_SHORT(faddr[0]),
 					    LOWER_SHORT(faddr[0]),
 					    UPPER_SHORT(faddr[1]),
 					    LOWER_SHORT(faddr[1]),
 					    UPPER_SHORT(faddr[2]),
 					    LOWER_SHORT(faddr[2]),
 					    UPPER_SHORT(faddr[3]),
 					    LOWER_SHORT(faddr[3]),
 					    ntohs(fport));
 				} else {
 					/*
 					 * IPv4: the address is in word 3;
 					 * split it into octets, reusing the
 					 * array (word 3 is overwritten last).
 					 */
 					laddr[0] = FIRST_OCTET(laddr[3]);
 					laddr[1] = SECOND_OCTET(laddr[3]);
 					laddr[2] = THIRD_OCTET(laddr[3]);
 					laddr[3] = FOURTH_OCTET(laddr[3]);
 					faddr[0] = FIRST_OCTET(faddr[3]);
 					faddr[1] = SECOND_OCTET(faddr[3]);
 					faddr[2] = THIRD_OCTET(faddr[3]);
 					faddr[3] = FOURTH_OCTET(faddr[3]);
 #endif
 					sbuf_printf(s,
 					    "%u.%u.%u.%u;%u-%u.%u.%u.%u;%u,",
 					    laddr[0],
 					    laddr[1],
 					    laddr[2],
 					    laddr[3],
 					    ntohs(lport),
 					    faddr[0],
 					    faddr[1],
 					    faddr[2],
 					    faddr[3],
 					    ntohs(fport));
 #ifdef SIFTR_IPV6
 				}
 #endif
 
 				free(counter, M_SIFTR_HASHNODE);
 			}
 
 			/* Every node in this bucket was freed; reset the head. */
 			LIST_INIT(counter_hash + i);
 		}
 
 		sbuf_printf(s, "\n");
 		sbuf_finish(s);
 
 		/* Write the summary out in chunks of at most SIFTR_ALQ_BUFLEN. */
 		i = 0;
 		do {
 			bytes_to_write = min(SIFTR_ALQ_BUFLEN, sbuf_len(s)-i);
 			alq_writen(siftr_alq, sbuf_data(s)+i, bytes_to_write, ALQ_WAITOK);
 			i += bytes_to_write;
 		} while (i < sbuf_len(s));
 
 		alq_close(siftr_alq);
 		siftr_alq = NULL;
 	} else
 		error = EINVAL;
 
 done:
 	sbuf_delete(s);
 
 	/*
 	 * XXX: The remaining calls above (sbuf_printf, alq_writen, ...) are
 	 * still not checked for failure.
 	 */
 
 	return (error);
 }
 
 /*
  * sysctl handler for the SIFTR enabled flag.
  *
  * Only the values 0 and 1 are accepted; anything larger yields EINVAL.
  * When the requested value differs from siftr_enabled it is passed to
  * siftr_manage_ops(); siftr_enabled is updated only if that succeeds. On
  * failure siftr_manage_ops(SIFTR_DISABLE) is invoked to clean up and the
  * original error is returned.
  */
 static int
 siftr_sysctl_enabled_handler(SYSCTL_HANDLER_ARGS)
 {
 	uint32_t requested;
 	int rc;
 
 	requested = siftr_enabled;
 	rc = sysctl_handle_int(oidp, &requested, 0, req);
 
 	/* Handler error, or a read-only request: nothing more to do. */
 	if (rc != 0 || req->newptr == NULL)
 		return (rc);
 
 	if (requested > 1)
 		return (EINVAL);
 
 	/* Already in the requested state. */
 	if (requested == siftr_enabled)
 		return (0);
 
 	rc = siftr_manage_ops(requested);
 	if (rc == 0)
 		siftr_enabled = requested;
 	else
 		siftr_manage_ops(SIFTR_DISABLE);
 
 	return (rc);
 }
 
 /*
  * Shutdown event callback: turn SIFTR off if it is currently enabled.
  * arg is unused.
  */
 static void
 siftr_shutdown_handler(void *arg)
 {
 	if (siftr_enabled != 1)
 		return;
 
 	siftr_manage_ops(SIFTR_DISABLE);
 }
 
 /*
  * Module is being unloaded or machine is shutting down. Take care of cleanup.
  * Called by siftr_load_handler() for MOD_QUIESCE and MOD_SHUTDOWN.
  * Always returns 0.
  */
 static int
 deinit_siftr(void)
 {
 	/* Cleanup. */
 	/* Stop logging first; the return value is deliberately ignored. */
 	siftr_manage_ops(SIFTR_DISABLE);
 	/* Release the flow counter hash created in init_siftr(). */
 	hashdestroy(counter_hash, M_SIFTR, siftr_hashmask);
 	/* Destroy the mutexes only after SIFTR has been disabled. */
 	mtx_destroy(&siftr_pkt_queue_mtx);
 	mtx_destroy(&siftr_pkt_mgr_mtx);
 
 	return (0);
 }
 
 /*
  * Module has just been loaded into the kernel.
  * Registers the shutdown handler, creates the flow counter hash table,
  * initialises the two SIFTR mutexes and prints a banner. Always returns 0.
  */
 static int
 init_siftr(void)
 {
 	/*
 	 * NOTE(review): the tag returned by EVENTHANDLER_REGISTER is
 	 * discarded and deinit_siftr() does not deregister the handler --
 	 * confirm this is safe across module unload.
 	 */
 	EVENTHANDLER_REGISTER(shutdown_pre_sync, siftr_shutdown_handler, NULL,
 	    SHUTDOWN_PRI_FIRST);
 
 	/* Initialise our flow counter hash table. */
 	counter_hash = hashinit(SIFTR_EXPECTED_MAX_TCP_FLOWS, M_SIFTR,
 	    &siftr_hashmask);
 
 	/* Both mutexes are default (MTX_DEF) mutexes. */
 	mtx_init(&siftr_pkt_queue_mtx, "siftr_pkt_queue_mtx", NULL, MTX_DEF);
 	mtx_init(&siftr_pkt_mgr_mtx, "siftr_pkt_mgr_mtx", NULL, MTX_DEF);
 
 	/* Print message to the user's current terminal. */
 	uprintf("\nStatistical Information For TCP Research (SIFTR) %s\n"
 	    "          http://caia.swin.edu.au/urp/newtcp\n\n",
 	    MODVERSION_STR);
 
 	return (0);
 }
 
 /*
  * Module event handler: the entry point used to load and unload the module.
  *
  * MOD_LOAD                   -> init_siftr().
  * MOD_QUIESCE / MOD_SHUTDOWN -> deinit_siftr().
  * MOD_UNLOAD                 -> nothing to do, returns 0.
  * anything else              -> EINVAL.
  *
  * As described by the original author: on load this is called once with
  * MOD_LOAD; on unload it is called twice, first with MOD_QUIESCE and then
  * with MOD_UNLOAD, which is why all cleanup happens on MOD_QUIESCE. On
  * system shutdown (e.g. CTRL-ALT-DEL or the shutdown command) it is called
  * once with MOD_SHUTDOWN, and not until the very end of the shutdown
  * sequence, i.e. after the disks have been synced.
  */
 static int
 siftr_load_handler(module_t mod, int what, void *arg)
 {
 	switch (what) {
 	case MOD_LOAD:
 		return (init_siftr());
 
 	case MOD_QUIESCE:
 	case MOD_SHUTDOWN:
 		return (deinit_siftr());
 
 	case MOD_UNLOAD:
 		return (0);
 
 	default:
 		return (EINVAL);
 	}
 }
 
 /* Module description: name plus the event handler defined above. */
 static moduledata_t siftr_mod = {
 	.name = "siftr",
 	.evhand = siftr_load_handler,
 };
 
 /*
  * Param 1: name of the kernel module
  * Param 2: moduledata_t struct containing info about the kernel module
  *          and the execution entry point for the module
  * Param 3: From sysinit_sub_id enumeration in /usr/include/sys/kernel.h
  *          Defines the module initialisation order
  * Param 4: From sysinit_elem_order enumeration in /usr/include/sys/kernel.h
  *          Defines the initialisation order of this kld relative to others
  *          within the same subsystem as defined by param 3
  */
 DECLARE_MODULE(siftr, siftr_mod, SI_SUB_LAST, SI_ORDER_ANY);
 /* SIFTR writes its log through alq, so depend on that module. */
 MODULE_DEPEND(siftr, alq, 1, 1, 1);
 MODULE_VERSION(siftr, MODVERSION);
diff --git a/sys/netinet/tcp_hpts.c b/sys/netinet/tcp_hpts.c
index 828ce16e8dad..3ed0c1e27c58 100644
--- a/sys/netinet/tcp_hpts.c
+++ b/sys/netinet/tcp_hpts.c
@@ -1,2043 +1,2043 @@
 /*-
  * Copyright (c) 2016-2018 Netflix, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  */
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_rss.h"
 #include "opt_tcpdebug.h"
 
 /**
  * Some notes about usage.
  *
  * The tcp_hpts system is designed to provide a high precision timer
  * system for tcp. Its main purpose is to provide a mechanism for
  * pacing packets out onto the wire. It can be used in two ways
  * by a given TCP stack (and those two methods can be used simultaneously).
  *
  * First, and probably the main thing it is used for by Rack and BBR, it can
  * be used to call tcp_output() of a transport stack at some time in the future.
  * The normal way this is done is that tcp_output() of the stack schedules
  * itself to be called again by calling tcp_hpts_insert(tcpcb, slot). The
  * slot is the time from now that the stack wants to be called but it
  * must be converted to tcp_hpts's notion of slot. This is done with
  * one of the macros HPTS_MS_TO_SLOTS or HPTS_USEC_TO_SLOTS. So a typical
  * call from the tcp_output() routine might look like:
  *
  * tcp_hpts_insert(tp, HPTS_USEC_TO_SLOTS(550));
  *
  * The above would schedule tcp_output() to be called in 550 microseconds.
  * Note that if using this mechanism the stack will want to add near
  * its top a check to prevent unwanted calls (from user land or the
  * arrival of incoming ack's). So it would add something like:
  *
  * if (tcp_in_hpts(inp))
  *    return;
  *
  * to prevent output processing until the time allotted has gone by.
  * Of course this is a bare bones example and the stack will probably
  * have more consideration than just the above.
  *
  * In order to run input queued segments from the HPTS context the
  * tcp stack must define an input function for
  * tfb_do_queued_segments(). This function understands
  * how to dequeue an array of packets that were input and
  * knows how to call the correct processing routine.
  *
  * Locking in this is important as well so most likely the
  * stack will need to define the tfb_do_segment_nounlock()
  * splitting tfb_do_segment() into two parts. The main processing
  * part that does not unlock the INP and returns a value of 1 or 0.
  * It returns 0 if all is well and the lock was not released. It
  * returns 1 if we had to destroy the TCB (a reset received etc).
  * The remains of tfb_do_segment() then become just a simple call
  * to the tfb_do_segment_nounlock() function and check the return
  * code and possibly unlock.
  *
  * The stack must also set the flag on the INP that it supports this
  * feature i.e. INP_SUPPORTS_MBUFQ. The LRO code recognizes
  * this flag as well and will queue packets when it is set.
  * There are other flags as well INP_MBUF_QUEUE_READY and
  * INP_DONT_SACK_QUEUE. The first flag tells the LRO code
  * that we are in the pacer for output so there is no
  * need to wake up the hpts system to get immediate
  * input. The second tells the LRO code that its okay
  * if a SACK arrives you can still defer input and let
  * the current hpts timer run (this is usually set when
  * a rack timer is up so we know SACK's are happening
  * on the connection already and don't want to wakeup yet).
  *
  * There is a common function within the rack_bbr_common code
  * version i.e. ctf_do_queued_segments(). This function
  * knows how to take the input queue of packets from
  * tp->t_in_pkts and process them digging out
  * all the arguments, calling any bpf tap and
  * calling into tfb_do_segment_nounlock(). The common
  * function (ctf_do_queued_segments())  requires that
  * you have defined the tfb_do_segment_nounlock() as
  * described above.
  */
 
 #include <sys/param.h>
 #include <sys/bus.h>
 #include <sys/interrupt.h>
 #include <sys/module.h>
 #include <sys/kernel.h>
 #include <sys/hhook.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/proc.h>		/* for proc0 declaration */
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sysctl.h>
 #include <sys/systm.h>
 #include <sys/refcount.h>
 #include <sys/sched.h>
 #include <sys/queue.h>
 #include <sys/smp.h>
 #include <sys/counter.h>
 #include <sys/time.h>
 #include <sys/kthread.h>
 #include <sys/kern_prefetch.h>
 
 #include <vm/uma.h>
 #include <vm/vm.h>
 
 #include <net/route.h>
 #include <net/vnet.h>
 
 #ifdef RSS
 #include <net/netisr.h>
 #include <net/rss_config.h>
 #endif
 
 #define TCPSTATES		/* for logging */
 
 #include <netinet/in.h>
 #include <netinet/in_kdtrace.h>
 #include <netinet/in_pcb.h>
 #include <netinet/ip.h>
 #include <netinet/ip_icmp.h>	/* required for icmp_var.h */
 #include <netinet/icmp_var.h>	/* for ICMP_BANDLIM */
 #include <netinet/ip_var.h>
 #include <netinet/ip6.h>
 #include <netinet6/in6_pcb.h>
 #include <netinet6/ip6_var.h>
 #include <netinet/tcp.h>
 #include <netinet/tcp_fsm.h>
 #include <netinet/tcp_seq.h>
 #include <netinet/tcp_timer.h>
 #include <netinet/tcp_var.h>
 #include <netinet/tcpip.h>
 #include <netinet/cc/cc.h>
 #include <netinet/tcp_hpts.h>
 #include <netinet/tcp_log_buf.h>
 
 #ifdef tcpdebug
 #include <netinet/tcp_debug.h>
 #endif				/* tcpdebug */
 #ifdef tcp_offload
 #include <netinet/tcp_offload.h>
 #endif
 
 /*
  * The hpts uses a 102400 wheel. The wheel
  * defines the time in 10 usec increments (102400 x 10).
  * This gives a range of 10usec - 1024ms to place
  * an entry within. If the user requests more than
  * 1.024 second, a remainder is attached and the hpts
  * when seeing the remainder will re-insert the
  * inpcb forward in time from where it is until
  * the remainder is zero.
  */
 
 #define NUM_OF_HPTSI_SLOTS 102400
 
 /* Each hpts has its own p_mtx which is used for locking */
 #define	HPTS_MTX_ASSERT(hpts)	mtx_assert(&(hpts)->p_mtx, MA_OWNED)
 #define	HPTS_LOCK(hpts)		mtx_lock(&(hpts)->p_mtx)
 #define	HPTS_UNLOCK(hpts)	mtx_unlock(&(hpts)->p_mtx)
 /*
  * State of one pacer: its mutex, the bookkeeping that tracks where the
  * pacer is on the wheel, the wheel itself (p_hptss) and the software
  * interrupt / callout handles used to wake the pacer up.
  */
 struct tcp_hpts_entry {
 	/* Cache line 0x00 */
 	struct mtx p_mtx;	/* Mutex for hpts */
 	struct timeval p_mysleep;	/* Our min sleep time */
 	uint64_t syscall_cnt;
 	uint64_t sleeping;	/* What the actual sleep was (if sleeping) */
 	uint16_t p_hpts_active; /* Flag that says hpts is awake  */
 	uint8_t p_wheel_complete; /* have we completed the wheel arc walk? */
 	uint32_t p_curtick;	/* Tick in 10 us the hpts is going to */
 	uint32_t p_runningslot; /* Current tick we are at if we are running */
 	uint32_t p_prev_slot;	/* Previous slot we were on */
 	uint32_t p_cur_slot;	/* Current slot in wheel hpts is draining */
 	uint32_t p_nxt_slot;	/* The next slot outside the current range of
 				 * slots that the hpts is running on. */
 	int32_t p_on_queue_cnt;	/* Count on queue in this hpts */
 	uint32_t p_lasttick;	/* Last tick before the current one */
 	uint8_t p_direct_wake :1, /* boolean */
 		p_on_min_sleep:1, /* boolean */
 		p_hpts_wake_scheduled:1, /* boolean */
 		p_avail:5;
 	uint8_t p_fill[3];	  /* Fill to 32 bits */
 	/* Cache line 0x40 */
 	struct hptsh {
 		TAILQ_HEAD(, inpcb)	head;	/* inpcbs queued in this slot */
 		uint32_t		count;	/* number of inpcbs on head */
 		uint32_t		gencnt;	/* compared with inp_hpts_gencnt
 						 * to detect that the list was
 						 * taken over by tcp_hptsi() */
 	} *p_hptss;			/* Hptsi wheel */
 	uint32_t p_hpts_sleep_time;	/* Current sleep interval having a max
 					 * of 255ms */
 	uint32_t overidden_sleep;	/* what was overridden by min-sleep (for logging) */
 	uint32_t saved_lasttick;	/* for logging */
 	uint32_t saved_curtick;		/* for logging */
 	uint32_t saved_curslot;		/* for logging */
 	uint32_t saved_prev_slot;       /* for logging */
 	uint32_t p_delayed_by;	/* How much were we delayed by */
 	/* Cache line 0x80 */
 	struct sysctl_ctx_list hpts_ctx;
 	struct sysctl_oid *hpts_root;
 	struct intr_event *ie;
 	void *ie_cookie;	/* handle passed to swi_sched() to wake the pacer */
 	uint16_t p_num;		/* The hpts number one per cpu */
 	uint16_t p_cpu;		/* The hpts CPU */
 	/* There is extra space in here */
 	/* Cache line 0x100 */
 	struct callout co __aligned(CACHE_LINE_SIZE);
 }               __aligned(CACHE_LINE_SIZE);
 
 /*
  * File-scope state shared by all pacers.  rp_ent is indexed with
  * inp->inp_hpts_cpu to find the pacer that owns a given inpcb.
  */
 static struct tcp_hptsi {
 	struct cpu_group **grps;
 	struct tcp_hpts_entry **rp_ent;	/* Array of hptss */
 	uint32_t *cts_last_ran;
 	uint32_t grp_cnt;	/* presumably the number of entries in grps -- TODO confirm */
 	uint32_t rp_num_hptss;	/* Number of hpts threads */
 } tcp_pace;
 
 MALLOC_DEFINE(M_TCPHPTS, "tcp_hpts", "TCP hpts");
 /*
  * Thread binding policy, exported read-only as net.inet.tcp.hpts.bind_hptss.
  * The default depends on whether the kernel is built with RSS.
  */
 #ifdef RSS
 static int tcp_bind_threads = 1;
 #else
 static int tcp_bind_threads = 2;
 #endif
 /* When non-zero, hpts_cpuid() selects the CPU recorded in inp_irq_cpu. */
 static int tcp_use_irq_cpu = 0;
 /*
  * NOTE(review): a member with the same name exists in tcp_pace; confirm
  * which of the two is the one actually used.
  */
 static uint32_t *cts_last_ran;
 /* Exported as net.inet.tcp.hpts.logging. */
 static int hpts_does_tp_logging = 0;
 
 static int32_t tcp_hptsi(struct tcp_hpts_entry *hpts, int from_callout);
 static void tcp_hpts_thread(void *ctx);
 static void tcp_init_hptsi(void *st);
 
 /* Tunable sleep bounds and thresholds; each is exported through a sysctl. */
 int32_t tcp_min_hptsi_time = DEFAULT_MIN_SLEEP;
 static int conn_cnt_thresh = DEFAULT_CONNECTION_THESHOLD;
 static int32_t dynamic_min_sleep = DYNAMIC_MIN_SLEEP;
 static int32_t dynamic_max_sleep = DYNAMIC_MAX_SLEEP;
 
 
 /* Sysctl tree roots: net.inet.tcp.hpts and net.inet.tcp.hpts.stats. */
 SYSCTL_NODE(_net_inet_tcp, OID_AUTO, hpts, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "TCP Hpts controls");
 SYSCTL_NODE(_net_inet_tcp_hpts, OID_AUTO, stats, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
     "TCP Hpts statistics");
 
 /*
  * Compute *vvp = *tvp - *uvp for struct timeval operands.  When the
  * microsecond difference is negative, one second is borrowed so that
  * tv_usec ends up non-negative.
  */
 #define	timersub(tvp, uvp, vvp)						\
 	do {								\
 		(vvp)->tv_sec = (tvp)->tv_sec - (uvp)->tv_sec;		\
 		(vvp)->tv_usec = (tvp)->tv_usec - (uvp)->tv_usec;	\
 		if ((vvp)->tv_usec < 0) {				\
 			(vvp)->tv_sec--;				\
 			(vvp)->tv_usec += 1000000;			\
 		}							\
 	} while (0)
 
 /* Passed to C_PREL() when the pacer callout is (re)armed. */
 static int32_t tcp_hpts_precision = 120;
 
 /*
  * Per memory-domain list of CPUs.  hpts_cpuid() indexes this by
  * inp_numa_domain and picks cpu[flowid % count].
  */
 static struct hpts_domain_info {
 	int count;
 	int cpu[MAXCPU];
 } hpts_domains[MAXMEMDOM];
 
 /* Values stored in inp->inp_in_hpts. */
 enum {
 	IHPTS_NONE = 0,		/* not on the wheel */
 	IHPTS_ONQUEUE,		/* linked on a wheel slot; a pcb reference is held */
 	IHPTS_MOVING,		/* removal requested while tcp_hptsi() owned the
 				 * slot's list, so it could not be unlinked */
 };
 
 /*
  * Statistics counters, each exported read-only under
  * net.inet.tcp.hpts.stats.
  */
 counter_u64_t hpts_hopelessly_behind;
 
 SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, hopeless, CTLFLAG_RD,
     &hpts_hopelessly_behind,
     "Number of times hpts could not catch up and was behind hopelessly");
 
 counter_u64_t hpts_loops;
 
 SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, loops, CTLFLAG_RD,
     &hpts_loops, "Number of times hpts had to loop to catch up");
 
 /*
  * Note the naming: back_tosleep is exported as "no_tcbsfound", while the
  * sysctl called "back_tosleep" further down is backed by hpts_back_tosleep.
  */
 counter_u64_t back_tosleep;
 
 SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, no_tcbsfound, CTLFLAG_RD,
     &back_tosleep, "Number of times hpts found no tcbs");
 
 /* Bumped by max_slots_available() when an insert cannot fit on the wheel. */
 counter_u64_t combined_wheel_wrap;
 
 SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, comb_wheel_wrap, CTLFLAG_RD,
     &combined_wheel_wrap, "Number of times the wheel lagged enough to have an insert see wrap");
 
 /*
  * NOTE(review): this description is identical to the one for
  * comb_wheel_wrap above; confirm what distinguishes the two counters.
  */
 counter_u64_t wheel_wrap;
 
 SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, wheel_wrap, CTLFLAG_RD,
     &wheel_wrap, "Number of times the wheel lagged enough to have an insert see wrap");
 
 counter_u64_t hpts_direct_call;
 SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, direct_call, CTLFLAG_RD,
     &hpts_direct_call, "Number of times hpts was called by syscall/trap or other entry");
 
 counter_u64_t hpts_wake_timeout;
 
 SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, timeout_wakeup, CTLFLAG_RD,
     &hpts_wake_timeout, "Number of times hpts threads woke up via the callout expiring");
 
 /*
  * NOTE(review): the description below repeats the timeout_wakeup text;
  * judging by the name it presumably counts direct (non-callout) wakeups.
  * Verify against where the counter is incremented.
  */
 counter_u64_t hpts_direct_awakening;
 
 SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, direct_awakening, CTLFLAG_RD,
     &hpts_direct_awakening, "Number of times hpts threads woke up via the callout expiring");
 
 counter_u64_t hpts_back_tosleep;
 
 SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, back_tosleep, CTLFLAG_RD,
     &hpts_back_tosleep, "Number of times hpts threads woke up via the callout expiring and went back to sleep no work");
 
 /* How the pacer CPU for a connection was chosen. */
 counter_u64_t cpu_uses_flowid;
 counter_u64_t cpu_uses_random;
 
 SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, cpusel_flowid, CTLFLAG_RD,
     &cpu_uses_flowid, "Number of times when setting cpuid we used the flowid field");
 SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, cpusel_random, CTLFLAG_RD,
     &cpu_uses_random, "Number of times when setting cpuid we used the a random value");
 
 /*
  * Loader tunables.
  * NOTE(review): these names have no ".hpts." component, unlike the
  * read-only sysctls for the same variables declared just below
  * (net.inet.tcp.hpts.bind_hptss / use_irq); confirm that is intended.
  */
 TUNABLE_INT("net.inet.tcp.bind_hptss", &tcp_bind_threads);
 TUNABLE_INT("net.inet.tcp.use_irq", &tcp_use_irq_cpu);
 SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, bind_hptss, CTLFLAG_RD,
     &tcp_bind_threads, 2,
     "Thread Binding tunable");
 SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, use_irq, CTLFLAG_RD,
     &tcp_use_irq_cpu, 0,
     "Use of irq CPU  tunable");
 /* Read-write knobs under net.inet.tcp.hpts. */
 SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, precision, CTLFLAG_RW,
     &tcp_hpts_precision, 120,
     "Value for PRE() precision of callout");
 SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, cnt_thresh, CTLFLAG_RW,
     &conn_cnt_thresh, 0,
     "How many connections (below) make us use the callout based mechanism");
 SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, logging, CTLFLAG_RW,
     &hpts_does_tp_logging, 0,
     "Do we add to any tp that has logging on pacer logs");
 SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, dyn_minsleep, CTLFLAG_RW,
     &dynamic_min_sleep, 250,
     "What is the dynamic minsleep value?");
 SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, dyn_maxsleep, CTLFLAG_RW,
     &dynamic_max_sleep, 5000,
     "What is the dynamic maxsleep value?");
 
 static int32_t max_pacer_loops = 10;
 SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, loopmax, CTLFLAG_RW,
     &max_pacer_loops, 10,
     "What is the maximum number of times the pacer will loop trying to catch up");
 
 /* Upper bound for hpts_sleep_max: half of the wheel, in slots. */
 #define HPTS_MAX_SLEEP_ALLOWED (NUM_OF_HPTSI_SLOTS/2)
 
 /* Changed only through sysctl_net_inet_tcp_hpts_max_sleep(), which range-checks it. */
 static uint32_t hpts_sleep_max = HPTS_MAX_SLEEP_ALLOWED;
 
 /*
  * Sysctl handler for the maximum pacer sleep (in slots).
  *
  * Reads report the current hpts_sleep_max.  A write is accepted only when
  * the new value lies between dynamic_min_sleep / HPTS_TICKS_PER_SLOT and
  * HPTS_MAX_SLEEP_ALLOWED inclusive; anything else yields EINVAL and leaves
  * hpts_sleep_max untouched.  Errors from sysctl_handle_int() are passed
  * through unchanged.
  */
 static int
 sysctl_net_inet_tcp_hpts_max_sleep(SYSCTL_HANDLER_ARGS)
 {
 	uint32_t val;
 	int rc;
 
 	val = hpts_sleep_max;
 	rc = sysctl_handle_int(oidp, &val, 0, req);
 	/* Copy failure, or a plain read: nothing to validate. */
 	if (rc != 0 || !req->newptr)
 		return (rc);
 
 	if ((val < (dynamic_min_sleep/HPTS_TICKS_PER_SLOT)) ||
 	    (val > HPTS_MAX_SLEEP_ALLOWED))
 		return (EINVAL);
 
 	hpts_sleep_max = val;
 	return (rc);
 }
 
 /*
  * Sysctl handler for the minimum pacer sleep time.
  *
  * Reads report the current tcp_min_hptsi_time.  A write below
  * LOWEST_SLEEP_ALLOWED is rejected with EINVAL; otherwise the new value
  * is stored.  Errors from sysctl_handle_int() are passed through.
  */
 static int
 sysctl_net_inet_tcp_hpts_min_sleep(SYSCTL_HANDLER_ARGS)
 {
 	uint32_t val;
 	int rc;
 
 	val = tcp_min_hptsi_time;
 	rc = sysctl_handle_int(oidp, &val, 0, req);
 	/* Copy failure, or a plain read: nothing to validate. */
 	if (rc != 0 || !req->newptr)
 		return (rc);
 
 	if (val < LOWEST_SLEEP_ALLOWED)
 		return (EINVAL);
 
 	tcp_min_hptsi_time = val;
 	return (rc);
 }
 
 /*
  * net.inet.tcp.hpts.maxsleep: writes go through
  * sysctl_net_inet_tcp_hpts_max_sleep(), which range-checks the value.
  */
 SYSCTL_PROC(_net_inet_tcp_hpts, OID_AUTO, maxsleep,
     CTLTYPE_UINT | CTLFLAG_RW,
     &hpts_sleep_max, 0,
     &sysctl_net_inet_tcp_hpts_max_sleep, "IU",
     "Maximum time hpts will sleep in slots");
 
 /*
  * net.inet.tcp.hpts.minsleep: writes go through
  * sysctl_net_inet_tcp_hpts_min_sleep(), which enforces the lower bound.
  */
 SYSCTL_PROC(_net_inet_tcp_hpts, OID_AUTO, minsleep,
     CTLTYPE_UINT | CTLFLAG_RW,
     &tcp_min_hptsi_time, 0,
     &sysctl_net_inet_tcp_hpts_min_sleep, "IU",
     "The minimum time the hpts must sleep before processing more slots");
 
 /* Thresholds for adapting the sleep time; see the sysctl descriptions below. */
 static int ticks_indicate_more_sleep = TICKS_INDICATE_MORE_SLEEP;
 static int ticks_indicate_less_sleep = TICKS_INDICATE_LESS_SLEEP;
 /*
  * When non-zero, tcp_wakehpts() declines to wake a pacer whose queue
  * count has reached conn_cnt_thresh.
  */
 static int tcp_hpts_no_wake_over_thresh = 1;
 
 SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, more_sleep, CTLFLAG_RW,
     &ticks_indicate_more_sleep, 0,
     "If we only process this many or less on a timeout, we need longer sleep on the next callout");
 SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, less_sleep, CTLFLAG_RW,
     &ticks_indicate_less_sleep, 0,
     "If we process this many or more on a timeout, we need less sleep on the next callout");
 SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, nowake_over_thresh, CTLFLAG_RW,
     &tcp_hpts_no_wake_over_thresh, 0,
     "When we are over the threshold on the pacer do we prohibit wakeups?");
 
 /*
  * Emit a BBR_LOG_HPTSDIAG log event for tp that snapshots the pacer state.
  *
  * hpts         - pacer whose slot/tick bookkeeping is recorded.
  * tp           - connection the event is logged against.
  * tv           - timestamp; stored (converted to a usec tick) in the record
  *                and also handed to the logging macro.
  * slots_to_run - recorded in the "inflight" field.
  * idx          - recorded in flex4.
  * from_callout - recorded in flex8, truncated to 8 bits.
  *
  * The u_bbr fields are reused as generic slots here; their names do not
  * describe what is stored in them.
  */
 static void
 tcp_hpts_log(struct tcp_hpts_entry *hpts, struct tcpcb *tp, struct timeval *tv,
 	     int slots_to_run, int idx, int from_callout)
 {
 	union tcp_log_stackspecific log;
 	/*
 	 * Unused logs are
 	 * 64 bit - delRate, rttProp, bw_inuse
 	 * 16 bit - cwnd_gain
 	 *  8 bit - bbr_state, bbr_substate, inhpts;
 	 */
 	memset(&log.u_bbr, 0, sizeof(log.u_bbr));
 	log.u_bbr.flex1 = hpts->p_nxt_slot;
 	log.u_bbr.flex2 = hpts->p_cur_slot;
 	log.u_bbr.flex3 = hpts->p_prev_slot;
 	log.u_bbr.flex4 = idx;
 	log.u_bbr.flex5 = hpts->p_curtick;
 	log.u_bbr.flex6 = hpts->p_on_queue_cnt;
 	log.u_bbr.flex7 = hpts->p_cpu;
 	log.u_bbr.flex8 = (uint8_t)from_callout;
 	log.u_bbr.inflight = slots_to_run;
 	log.u_bbr.applimited = hpts->overidden_sleep;
 	log.u_bbr.delivered = hpts->saved_curtick;
 	log.u_bbr.timeStamp = tcp_tv_to_usectick(tv);
 	log.u_bbr.epoch = hpts->saved_curslot;
 	log.u_bbr.lt_epoch = hpts->saved_prev_slot;
 	log.u_bbr.pkts_out = hpts->p_delayed_by;
 	log.u_bbr.lost = hpts->p_hpts_sleep_time;
 	/* p_cpu is recorded a second time here (it is also in flex7). */
 	log.u_bbr.pacing_gain = hpts->p_cpu;
 	log.u_bbr.pkt_epoch = hpts->p_runningslot;
 	log.u_bbr.use_lt_bw = 1;
 	TCP_LOG_EVENTP(tp, NULL,
 		       &tp->t_inpcb->inp_socket->so_rcv,
 		       &tp->t_inpcb->inp_socket->so_snd,
 		       BBR_LOG_HPTSDIAG, 0,
 		       0, &log, false, tv);
 }
 
 /*
  * Schedule the pacer's software interrupt unless a wakeup is already
  * pending.  The caller must hold the hpts mutex.
  *
  * When tcp_hpts_no_wake_over_thresh is set and the pacer already has at
  * least conn_cnt_thresh entries queued, no wakeup is issued and the
  * p_direct_wake request is cleared instead.
  */
 static void
 tcp_wakehpts(struct tcp_hpts_entry *hpts)
 {
 	HPTS_MTX_ASSERT(hpts);
 
 	if (tcp_hpts_no_wake_over_thresh != 0 &&
 	    hpts->p_on_queue_cnt >= conn_cnt_thresh) {
 		hpts->p_direct_wake = 0;
 		return;
 	}
 
 	/* A wakeup is already on its way; do not schedule another. */
 	if (hpts->p_hpts_wake_scheduled != 0)
 		return;
 
 	hpts->p_hpts_wake_scheduled = 1;
 	swi_sched(hpts->ie_cookie, 0);
 }
 
 /*
  * Callout handler: arg is the struct tcp_hpts_entry whose timer fired.
  * All it does is schedule that pacer's software interrupt.
  */
 static void
 hpts_timeout_swi(void *arg)
 {
 	struct tcp_hpts_entry *pacer = arg;
 
 	swi_sched(pacer->ie_cookie, 0);
 }
 
 /*
  * Link inp onto the tail of the wheel slot named by inp->inp_hptsslot and
  * update the per-slot and per-pacer counts.
  *
  * The caller holds both the inpcb write lock and the hpts mutex, and hpts
  * must be the pacer for inp->inp_hpts_cpu (all asserted).
  */
 static void
 inp_hpts_insert(struct inpcb *inp, struct tcp_hpts_entry *hpts)
 {
 	struct hptsh *hptsh;
 
 	INP_WLOCK_ASSERT(inp);
 	HPTS_MTX_ASSERT(hpts);
 	MPASS(hpts->p_cpu == inp->inp_hpts_cpu);
-	MPASS(!(inp->inp_flags & (INP_DROPPED|INP_TIMEWAIT)));
+	MPASS(!(inp->inp_flags & INP_DROPPED));
 
 	hptsh = &hpts->p_hptss[inp->inp_hptsslot];
 
 	if (inp->inp_in_hpts == IHPTS_NONE) {
 		/* First time on the wheel: take a pcb reference. */
 		inp->inp_in_hpts = IHPTS_ONQUEUE;
 		in_pcbref(inp);
 	} else if (inp->inp_in_hpts == IHPTS_MOVING) {
 		/* Back on the queue; no new reference is taken here. */
 		inp->inp_in_hpts = IHPTS_ONQUEUE;
 	} else
 		MPASS(inp->inp_in_hpts == IHPTS_ONQUEUE);
 	/* Remember the slot's generation; tcp_hpts_remove() compares it later. */
 	inp->inp_hpts_gencnt = hptsh->gencnt;
 
 	TAILQ_INSERT_TAIL(&hptsh->head, inp, inp_hpts);
 	hptsh->count++;
 	hpts->p_on_queue_cnt++;
 }
 
 /*
  * Look up the pacer that owns inp (by inp->inp_hpts_cpu), lock its mutex
  * and return it.  The caller must hold the inpcb lock and is responsible
  * for the matching HPTS_UNLOCK().
  */
 static struct tcp_hpts_entry *
 tcp_hpts_lock(struct inpcb *inp)
 {
 	struct tcp_hpts_entry *owner;
 
 	INP_LOCK_ASSERT(inp);
 
 	owner = tcp_pace.rp_ent[inp->inp_hpts_cpu];
 	HPTS_LOCK(owner);
 	return (owner);
 }
 
 /*
  * Mark inp as no longer on the wheel and drop the pcb reference that
  * inp_hpts_insert() took.  The release is expected not to be the final
  * one (checked with MPASS).
  */
 static void
 inp_hpts_release(struct inpcb *inp)
 {
 	/*
 	 * The result is only consumed by the assertion below.  Keep the call
 	 * as its own statement rather than folding it into MPASS() --
 	 * presumably MPASS() can compile to nothing, which would drop the
 	 * release; TODO confirm.
 	 */
 	bool released __diagused;
 
 	inp->inp_in_hpts = IHPTS_NONE;
 	released = in_pcbrele_wlocked(inp);
 	MPASS(released == false);
 }
 
 /*
  * Take an inpcb off the pacer wheel.
  *
  * The inpcb write lock must be held (asserted); the lock order allows
  * the hpts lock to be taken while the INP lock is held, which is what
  * happens here.  An inpcb that is not on the wheel at all is left alone.
  */
 void
 tcp_hpts_remove(struct inpcb *inp)
 {
 	struct tcp_hpts_entry *hpts;
 	struct hptsh *bucket;
 
 	INP_WLOCK_ASSERT(inp);
 
 	hpts = tcp_hpts_lock(inp);
 	switch (inp->inp_in_hpts) {
 	case IHPTS_ONQUEUE:
 		bucket = &hpts->p_hptss[inp->inp_hptsslot];
 		inp->inp_hpts_request = 0;
 		if (__predict_true(inp->inp_hpts_gencnt == bucket->gencnt)) {
 			/* Still linked on the slot's own list: unlink it. */
 			TAILQ_REMOVE(&bucket->head, inp, inp_hpts);
 			MPASS(bucket->count > 0);
 			bucket->count--;
 			MPASS(hpts->p_on_queue_cnt > 0);
 			hpts->p_on_queue_cnt--;
 			inp_hpts_release(inp);
 		} else {
 			/*
 			 * The generation changed, so tcp_hptsi() has taken
 			 * over the TAILQ head this inp is linked on.  It
 			 * cannot be unlinked from here; flag it instead.
 			 */
 #ifdef INVARIANTS
 			struct inpcb *tmp;
 
 			TAILQ_FOREACH(tmp, &bucket->head, inp_hpts)
 				MPASS(tmp != inp);
 #endif
 			inp->inp_in_hpts = IHPTS_MOVING;
 			inp->inp_hptsslot = -1;
 		}
 		break;
 	case IHPTS_MOVING:
 		/*
 		 * Special race:
 		 *  1. tcp_hptsi() moves the inpcb to a detached tailq
 		 *  2. tcp_hpts_remove() marks it IHPTS_MOVING, slot = -1
 		 *  3. tcp_hpts_insert() gives it a meaningful slot again
 		 *  4. tcp_hpts_remove() runs again (this case), followed
 		 *     by in_pcbdrop()
 		 *  5. tcp_hptsi() would find a pcb with a meaningful slot
 		 *     and INP_DROPPED
 		 * Resetting the slot here prevents step 5.
 		 */
 		inp->inp_hptsslot = -1;
 		break;
 	default:
 		/* Not on the wheel; nothing to undo. */
 		break;
 	}
 	HPTS_UNLOCK(hpts);
 }
 
 /*
  * Report whether inp is currently queued on a pacer wheel.  Only the
  * IHPTS_ONQUEUE state counts; IHPTS_MOVING and IHPTS_NONE yield false.
  */
 bool
 tcp_in_hpts(struct inpcb *inp)
 {
 	bool queued;
 
 	queued = (inp->inp_in_hpts == IHPTS_ONQUEUE);
 	return (queued);
 }
 
 /*
  * Return the wheel slot that lies "plus" slots after wheel_slot, wrapping
  * around the end of the wheel.  wheel_slot must already be a valid slot
  * (checked with KASSERT).
  */
 static inline int
 hpts_slot(uint32_t wheel_slot, uint32_t plus)
 {
 	uint32_t advanced;
 
 	KASSERT(wheel_slot < NUM_OF_HPTSI_SLOTS, ("Invalid tick %u not on wheel", wheel_slot));
 	advanced = wheel_slot + plus;
 	return (advanced % NUM_OF_HPTSI_SLOTS);
 }
 
 /*
  * Map a timestamp expressed in wheel ticks onto the limited space of the
  * wheel by reducing it modulo the number of slots.  (Multiplying a tick
  * count by the number of ticks per slot, 10 by default, gives real time.)
  */
 static inline int
 tick_to_wheel(uint32_t cts_in_wticks)
 {
 	uint32_t slot;
 
 	slot = cts_in_wticks % NUM_OF_HPTSI_SLOTS;
 	return (slot);
 }
 
 /*
  * Distance, in slots, travelled going forward around the wheel from
  * prev_slot to slot_now.
  */
 static inline int
 hpts_slots_diff(int prev_slot, int slot_now)
 {
 	/*
 	 * Identical slots are treated as a trip around the whole wheel
 	 * less one slot, not as a distance of zero.
 	 */
 	if (slot_now == prev_slot)
 		return (NUM_OF_HPTSI_SLOTS - 1);
 
 	/* No wrap needed. */
 	if (slot_now > prev_slot)
 		return (slot_now - prev_slot);
 
 	/* slot_now is behind prev_slot: go through the end of the wheel. */
 	return ((NUM_OF_HPTSI_SLOTS - prev_slot) + slot_now);
 }
 
 /*
  * Given a slot on the wheel that is the current time
  * mapped to the wheel (wheel_slot), what is the maximum
  * distance forward that can be obtained without
  * wrapping past either prev_slot or running_slot
  * depending on the hpts state?
  *
  * hpts        - the pacer being inserted into.
  * wheel_slot  - the caller's idea of "now", mapped onto the wheel.
  * target_slot - optional (may be NULL); when given it is filled with the
  *               furthest slot that may be used, or with p_nxt_slot in
  *               the wheel-wrap case.
  *
  * Returns the number of slots available for insertion, or the special
  * value 0 when the pacer is so far behind that nothing fits (wheel wrap).
  *
  * Note if you do not give this function the current
  * time (that you think it is) mapped to the wheel slot
  * then the results will not be what you expect and
  * could lead to invalid inserts.
  */
 static inline int32_t
 max_slots_available(struct tcp_hpts_entry *hpts, uint32_t wheel_slot, uint32_t *target_slot)
 {
 	uint32_t dis_to_travel, end_slot, pacer_to_now, avail_on_wheel;
 
 	if ((hpts->p_hpts_active == 1) &&
 	    (hpts->p_wheel_complete == 0)) {
 		end_slot = hpts->p_runningslot;
 		/* Back up one tick */
 		if (end_slot == 0)
 			end_slot = NUM_OF_HPTSI_SLOTS - 1;
 		else
 			end_slot--;
 		if (target_slot)
 			*target_slot = end_slot;
 	} else {
 		/*
 		 * For the case where we are
 		 * not active, or we have
 		 * completed the pass over
 		 * the wheel, we can use the
 		 * prev tick and subtract one from it. This puts us
 		 * as far out as possible on the wheel.
 		 */
 		end_slot = hpts->p_prev_slot;
 		if (end_slot == 0)
 			end_slot = NUM_OF_HPTSI_SLOTS - 1;
 		else
 			end_slot--;
 		if (target_slot)
 			*target_slot = end_slot;
 		/*
 		 * Now we have close to the full wheel left minus the
 		 * time it has been since the pacer went to sleep. Note
 		 * that wheel_slot, passed in, should be the current time
 		 * from the perspective of the caller, mapped to the wheel.
 		 */
 		if (hpts->p_prev_slot != wheel_slot)
 			dis_to_travel = hpts_slots_diff(hpts->p_prev_slot, wheel_slot);
 		else
 			dis_to_travel = 1;
 		/*
 		 * dis_to_travel in this case is the space from when the
 		 * pacer stopped (p_prev_slot) and where our wheel_slot
 		 * is now. To know how many slots we can put it in we
 		 * subtract from the wheel size. We would not want
 		 * to place something after p_prev_slot or it will
 		 * get ran too soon.
 		 */
 		return (NUM_OF_HPTSI_SLOTS - dis_to_travel);
 	}
 	/*
 	 * So how many slots are open between p_runningslot -> p_cur_slot
 	 * that is what is currently un-available for insertion. Special
 	 * case when we are at the last slot, this gets 1, so that
 	 * the answer to how many slots are available is all but 1.
 	 */
 	if (hpts->p_runningslot == hpts->p_cur_slot)
 		dis_to_travel = 1;
 	else
 		dis_to_travel = hpts_slots_diff(hpts->p_runningslot, hpts->p_cur_slot);
 	/*
 	 * How long has the pacer been running?
 	 */
 	if (hpts->p_cur_slot != wheel_slot) {
 		/* The pacer is a bit late */
 		pacer_to_now = hpts_slots_diff(hpts->p_cur_slot, wheel_slot);
 	} else {
 		/* The pacer is right on time, now == pacers start time */
 		pacer_to_now = 0;
 	}
 	/*
 	 * To get the number left we can insert into we simply
 	 * subtract the distance the pacer has to run from how
 	 * many slots there are.
 	 */
 	avail_on_wheel = NUM_OF_HPTSI_SLOTS - dis_to_travel;
 	/*
 	 * Now how many of those we will eat due to the pacer's
 	 * time (p_cur_slot) of start being behind the
 	 * real time (wheel_slot)?
 	 */
 	if (avail_on_wheel <= pacer_to_now) {
 		/*
 		 * Wheel wrap, we can't fit on the wheel, that
 		 * is unusual the system must be way overloaded!
 		 * Insert into the assured slot, and return special
 		 * "0".
 		 */
 		counter_u64_add(combined_wheel_wrap, 1);
 		/*
 		 * target_slot is optional everywhere else in this
 		 * function, so it must not be dereferenced blindly here.
 		 */
 		if (target_slot)
 			*target_slot = hpts->p_nxt_slot;
 		return (0);
 	} else {
 		/*
 		 * We know how many slots are open
 		 * on the wheel (the reverse of what
 		 * is left to run. Take away the time
 		 * the pacer started to now (wheel_slot)
 		 * and that tells you how many slots are
 		 * open that can be inserted into that won't
 		 * be touched by the pacer until later.
 		 */
 		return (avail_on_wheel - pacer_to_now);
 	}
 }
 
 
 #ifdef INVARIANTS
 /*
  * INVARIANTS-only sanity check run before an insert: the chosen slot must
  * be on the wheel and, while the pacer is part-way through an arc, must
  * not fall inside the part of the arc that has yet to be run.
  */
 static void
 check_if_slot_would_be_wrong(struct tcp_hpts_entry *hpts, struct inpcb *inp, uint32_t inp_hptsslot, int line)
 {
 	int gap, pending;
 
 	KASSERT(inp_hptsslot < NUM_OF_HPTSI_SLOTS,
 		("hpts:%p inp:%p slot:%d > max",
 		 hpts, inp, inp_hptsslot));
 
 	/* Only an active pacer that has not finished its arc needs the check. */
 	if (!hpts->p_hpts_active || hpts->p_wheel_complete != 0)
 		return;
 
 	/* How far ahead of the running slot the insert would land. */
 	gap = hpts_slots_diff(hpts->p_runningslot, inp_hptsslot);
 	/* Slots still to be run in this arc; none when on the last slot. */
 	pending = (hpts->p_runningslot == hpts->p_cur_slot) ? 0 :
 	    hpts_slots_diff(hpts->p_runningslot, hpts->p_cur_slot);
 	KASSERT(pending <= gap,
 		("hpts:%p inp:%p slot:%d distance:%d yet_to_run:%d rs:%d cs:%d",
 		 hpts, inp, inp_hptsslot,
 		 gap, pending,
 		 hpts->p_runningslot, hpts->p_cur_slot));
 }
 #endif
 
 /*
  * Schedule an inpcb on its pacer's wheel "slot" slots from now.
  *
  * inp  - write-locked inpcb; MPASS checks that it is not already queued
  *        and not flagged INP_DROPPED.
  * slot - requested delay in wheel slots; 0 asks for the earliest slot
  *        the pacer will service.
  * line - caller's line number, forwarded to the INVARIANTS slot check.
  * diag - optional; when non-NULL it is zeroed and then filled with a
  *        snapshot of the pacer state and of the decisions made here.
  *
  * If the request does not fit on the wheel, the part that does not fit is
  * left in inp->inp_hpts_request.  A sleeping pacer is woken, or has its
  * callout shortened, when that is needed for the new entry to run on time.
  *
  * Returns hpts->p_nxt_slot as read just before the hpts lock is dropped.
  */
 uint32_t
 tcp_hpts_insert_diag(struct inpcb *inp, uint32_t slot, int32_t line, struct hpts_diag *diag)
 {
 	struct tcp_hpts_entry *hpts;
 	struct timeval tv;
 	uint32_t slot_on, wheel_cts, last_slot, need_new_to = 0;
 	int32_t wheel_slot, maxslots;
 	bool need_wakeup = false;
 
 	INP_WLOCK_ASSERT(inp);
 	MPASS(!tcp_in_hpts(inp));
-	MPASS(!(inp->inp_flags & (INP_DROPPED|INP_TIMEWAIT)));
+	MPASS(!(inp->inp_flags & INP_DROPPED));
 
 	/*
 	 * We now return the next-slot the hpts will be on, beyond its
 	 * current run (if up) or where it was when it stopped if it is
 	 * sleeping.
 	 */
 	hpts = tcp_hpts_lock(inp);
 	microuptime(&tv);
 	if (diag) {
 		memset(diag, 0, sizeof(struct hpts_diag));
 		diag->p_hpts_active = hpts->p_hpts_active;
 		diag->p_prev_slot = hpts->p_prev_slot;
 		diag->p_runningslot = hpts->p_runningslot;
 		diag->p_nxt_slot = hpts->p_nxt_slot;
 		diag->p_cur_slot = hpts->p_cur_slot;
 		diag->p_curtick = hpts->p_curtick;
 		diag->p_lasttick = hpts->p_lasttick;
 		diag->slot_req = slot;
 		diag->p_on_min_sleep = hpts->p_on_min_sleep;
 		diag->hpts_sleep_time = hpts->p_hpts_sleep_time;
 	}
 	if (slot == 0) {
 		/* Ok we need to set it on the hpts in the current slot */
 		inp->inp_hpts_request = 0;
 		if ((hpts->p_hpts_active == 0) || (hpts->p_wheel_complete)) {
 			/*
 			 * A sleeping hpts we want in next slot to run
 			 * note that in this state p_prev_slot == p_cur_slot
 			 */
 			inp->inp_hptsslot = hpts_slot(hpts->p_prev_slot, 1);
 			if ((hpts->p_on_min_sleep == 0) &&
 			    (hpts->p_hpts_active == 0))
 				need_wakeup = true;
 		} else
 			inp->inp_hptsslot = hpts->p_runningslot;
 		/*
 		 * An inp in IHPTS_MOVING state is not linked again here;
 		 * only its slot was updated above.
 		 */
 		if (__predict_true(inp->inp_in_hpts != IHPTS_MOVING))
 			inp_hpts_insert(inp, hpts);
 		if (need_wakeup) {
 			/*
 			 * Activate the hpts if it is sleeping and its
 			 * timeout is not 1.
 			 */
 			hpts->p_direct_wake = 1;
 			tcp_wakehpts(hpts);
 		}
 		slot_on = hpts->p_nxt_slot;
 		HPTS_UNLOCK(hpts);
 
 		return (slot_on);
 	}
 	/* Get the current time relative to the wheel */
 	wheel_cts = tcp_tv_to_hptstick(&tv);
 	/* Map it onto the wheel */
 	wheel_slot = tick_to_wheel(wheel_cts);
 	/* Now what's the max we can place it at? */
 	maxslots = max_slots_available(hpts, wheel_slot, &last_slot);
 	if (diag) {
 		diag->wheel_slot = wheel_slot;
 		diag->maxslots = maxslots;
 		diag->wheel_cts = wheel_cts;
 	}
 	if (maxslots == 0) {
 		/* The pacer is in a wheel wrap behind, yikes! */
 		if (slot > 1) {
 			/*
 			 * Reduce by 1 to prevent a forever loop in
 			 * case something else is wrong. Note this
 			 * probably does not hurt because the pacer
 			 * if its true is so far behind we will be
 			 * > 1second late calling anyway.
 			 */
 			slot--;
 		}
 		inp->inp_hptsslot = last_slot;
 		inp->inp_hpts_request = slot;
 	} else 	if (maxslots >= slot) {
 		/* It all fits on the wheel */
 		inp->inp_hpts_request = 0;
 		inp->inp_hptsslot = hpts_slot(wheel_slot, slot);
 	} else {
 		/* It does not fit */
 		inp->inp_hpts_request = slot - maxslots;
 		inp->inp_hptsslot = last_slot;
 	}
 	if (diag) {
 		diag->slot_remaining = inp->inp_hpts_request;
 		diag->inp_hptsslot = inp->inp_hptsslot;
 	}
 #ifdef INVARIANTS
 	check_if_slot_would_be_wrong(hpts, inp, inp->inp_hptsslot, line);
 #endif
 	if (__predict_true(inp->inp_in_hpts != IHPTS_MOVING))
 		inp_hpts_insert(inp, hpts);
 	if ((hpts->p_hpts_active == 0) &&
 	    (inp->inp_hpts_request == 0) &&
 	    (hpts->p_on_min_sleep == 0)) {
 		/*
 		 * The hpts is sleeping and NOT on a minimum
 		 * sleep time, we need to figure out where
 		 * it will wake up at and if we need to reschedule
 		 * its time-out.
 		 */
 		uint32_t have_slept, yet_to_sleep;
 
 		/* Now do we need to restart the hpts's timer? */
 		have_slept = hpts_slots_diff(hpts->p_prev_slot, wheel_slot);
 		if (have_slept < hpts->p_hpts_sleep_time)
 			yet_to_sleep = hpts->p_hpts_sleep_time - have_slept;
 		else {
 			/* We are over-due */
 			yet_to_sleep = 0;
 			need_wakeup = 1;
 		}
 		if (diag) {
 			diag->have_slept = have_slept;
 			diag->yet_to_sleep = yet_to_sleep;
 		}
 		if (yet_to_sleep &&
 		    (yet_to_sleep > slot)) {
 			/*
 			 * We need to reschedule the hpts's time-out.
 			 */
 			hpts->p_hpts_sleep_time = slot;
 			need_new_to = slot * HPTS_TICKS_PER_SLOT;
 		}
 	}
 	/*
 	 * Now how far is the hpts sleeping to? if active is 1, its
 	 * up and ticking we do nothing, otherwise we may need to
 	 * reschedule its callout if need_new_to is set from above.
 	 */
 	if (need_wakeup) {
 		hpts->p_direct_wake = 1;
 		tcp_wakehpts(hpts);
 		if (diag) {
 			diag->need_new_to = 0;
 			diag->co_ret = 0xffff0000;
 		}
 	} else if (need_new_to) {
 		int32_t co_ret;
 		/* NOTE(review): this tv shadows the function-level tv declared above. */
 		struct timeval tv;
 		sbintime_t sb;
 
 		/* Split the new timeout (in usec) into seconds and microseconds. */
 		tv.tv_sec = 0;
 		tv.tv_usec = 0;
 		while (need_new_to > HPTS_USEC_IN_SEC) {
 			tv.tv_sec++;
 			need_new_to -= HPTS_USEC_IN_SEC;
 		}
 		tv.tv_usec = need_new_to;
 		sb = tvtosbt(tv);
 		co_ret = callout_reset_sbt_on(&hpts->co, sb, 0,
 					      hpts_timeout_swi, hpts, hpts->p_cpu,
 					      (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision)));
 		if (diag) {
 			diag->need_new_to = need_new_to;
 			diag->co_ret = co_ret;
 		}
 	}
 	slot_on = hpts->p_nxt_slot;
 	HPTS_UNLOCK(hpts);
 
 	return (slot_on);
 }
 
 uint16_t
 hpts_random_cpu(struct inpcb *inp){
 	/*
 	 * No flow type set distribute the load randomly.
 	 */
 	uint16_t cpuid;
 	uint32_t ran;
 
 	/*
 	 * Shortcut if it is already set. XXXGL: does it happen?
 	 */
 	if (inp->inp_hpts_cpu_set) {
 		return (inp->inp_hpts_cpu);
 	}
 	/* Nothing set use a random number */
 	ran = arc4random();
 	cpuid = (((ran & 0xffff) % mp_ncpus) % tcp_pace.rp_num_hptss);
 	return (cpuid);
 }
 
 /*
  * Select the hpts index that should service this inp.
  *
  * *failed is cleared on entry and is set to 1 only when tcp_use_irq_cpu is
  * enabled but the inp has no irq cpu recorded yet; 0 is returned in that
  * case.  An inp that already has inp_hpts_cpu_set simply gets its stored
  * inp_hpts_cpu back.
  */
 static uint16_t
 hpts_cpuid(struct inpcb *inp, int *failed)
 {
 	u_int cpuid;
 #ifdef NUMA
 	struct hpts_domain_info *di;
 #endif
 
 	*failed = 0;
 	if (inp->inp_hpts_cpu_set) {
 		return (inp->inp_hpts_cpu);
 	}
 	/*
 	 * If we are using the irq cpu set by LRO or
 	 * the driver then it overrides all other domains.
 	 */
 	if (tcp_use_irq_cpu) {
 		if (inp->inp_irq_cpu_set == 0) {
 			*failed = 1;
 			return(0);
 		}
 		return(inp->inp_irq_cpu);
 	}
 	/* If one is set the other must be the same */
 #ifdef RSS
 	/*
 	 * Both branches below return, so when RSS is compiled in none of
 	 * the code after this block is reached.
 	 */
 	cpuid = rss_hash2cpuid(inp->inp_flowid, inp->inp_flowtype);
 	if (cpuid == NETISR_CPUID_NONE)
 		return (hpts_random_cpu(inp));
 	else
 		return (cpuid);
 #endif
 	/*
 	 * We don't have a flowid -> cpuid mapping, so cheat and just map
 	 * unknown cpuids to curcpu.  Not the best, but apparently better
 	 * than defaulting to swi 0.
 	 */
 	if (inp->inp_flowtype == M_HASHTYPE_NONE) {
 		counter_u64_add(cpu_uses_random, 1);
 		return (hpts_random_cpu(inp));
 	}
 	/*
 	 * Hash to a thread based on the flowid.  If we are using numa,
 	 * then restrict the hash to the numa domain where the inp lives.
 	 */
 
 #ifdef NUMA
 	if ((vm_ndomains == 1) ||
 	    (inp->inp_numa_domain == M_NODOM)) {
 #endif
 		cpuid = inp->inp_flowid % mp_ncpus;
 #ifdef NUMA
 	} else {
 		/*
 		 * Hash into the cpu's that use that domain.
 		 * NOTE(review): di->count is used as a divisor here; it is
 		 * presumably non-zero for every domain an inp can carry --
 		 * confirm against how hpts_domains[] is populated.
 		 */
 		di = &hpts_domains[inp->inp_numa_domain];
 		cpuid = di->cpu[inp->inp_flowid % di->count];
 	}
 #endif
 	counter_u64_add(cpu_uses_flowid, 1);
 	return (cpuid);
 }
 
 #ifdef not_longer_used_gleb
 /*
  * Release every packet queued on tp->t_in_pkt.  The queue head is cleared
  * and each packet on the m_nextpkt chain is handed to m_freem().
  */
 static void
 tcp_drop_in_pkts(struct tcpcb *tp)
 {
 	struct mbuf *pkt, *next_pkt;
 
 	pkt = tp->t_in_pkt;
 	tp->t_in_pkt = NULL;
 	for (; pkt != NULL; pkt = next_pkt) {
 		/* Remember the successor before this packet is freed. */
 		next_pkt = pkt->m_nextpkt;
 		m_freem(pkt);
 	}
 }
 #endif
 
 /*
  * Compute hpts->p_hpts_sleep_time (in slots).
  *
  * With nothing queued, or once the wheel has wrapped at least twice
  * (wrap_loop_cnt >= 2), the sleep time is hpts_sleep_max.  Otherwise the
  * wheel is scanned forward from the slot after p_cur_slot until an
  * occupied slot is found and the distance to it, capped at
  * hpts_sleep_max, is used.
  */
 static void
 tcp_hpts_set_max_sleep(struct tcp_hpts_entry *hpts, int wrap_loop_cnt)
 {
 	uint32_t slot, scanned;
 
 	if (hpts->p_on_queue_cnt == 0 || wrap_loop_cnt >= 2) {
 		/* No one on the wheel sleep for all but 400 slots or sleep max  */
 		hpts->p_hpts_sleep_time = hpts_sleep_max;
 		return;
 	}
 
 	/* Walk forward until the first slot that has something queued. */
 	slot = hpts_slot(hpts->p_cur_slot, 1);
 	scanned = 0;
 	while (scanned < NUM_OF_HPTSI_SLOTS &&
 	    TAILQ_EMPTY(&hpts->p_hptss[slot].head)) {
 		slot = (slot + 1) % NUM_OF_HPTSI_SLOTS;
 		scanned++;
 	}
 	KASSERT((scanned != NUM_OF_HPTSI_SLOTS), ("Hpts:%p cnt:%d but none found", hpts, hpts->p_on_queue_cnt));
 	hpts->p_hpts_sleep_time = min((scanned + 1), hpts_sleep_max);
 }
 
 /*
  * Run the timing wheel of one hpts.
  *
  * Must be called with the hpts mutex held and inside the network epoch
  * (both are asserted below).  The slots between p_prev_slot and the slot
  * matching the current tick are processed; for every inp found, queued
  * input is handed to the stack's tfb_do_queued_segments() when the inp
  * supports it and then tcp_output() is called.  When called from the
  * callout (from_callout != 0) the wheel is re-run until it has caught up
  * or max_pacer_loops is exceeded, and the next sleep time is computed.
  *
  * Returns the slot distance between the first slot in which an endpoint
  * was seen and orig_exit_slot, or 0 if no endpoint was seen.
  */
 static int32_t
 tcp_hptsi(struct tcp_hpts_entry *hpts, int from_callout)
 {
 	struct tcpcb *tp;
 	struct inpcb *inp;
 	struct timeval tv;
 	int32_t slots_to_run, i, error;
 	int32_t loop_cnt = 0;
 	int32_t did_prefetch = 0;
 	int32_t prefetch_ninp = 0;
 	int32_t prefetch_tp = 0;
 	int32_t wrap_loop_cnt = 0;
 	int32_t slot_pos_of_endpoint = 0;
 	int32_t orig_exit_slot;
 	int8_t completed_measure = 0, seen_endpoint = 0;
 
 	HPTS_MTX_ASSERT(hpts);
 	NET_EPOCH_ASSERT();
 	/* record previous info for any logging */
 	hpts->saved_lasttick = hpts->p_lasttick;
 	hpts->saved_curtick = hpts->p_curtick;
 	hpts->saved_curslot = hpts->p_cur_slot;
 	hpts->saved_prev_slot = hpts->p_prev_slot;
 
 	hpts->p_lasttick = hpts->p_curtick;
 	hpts->p_curtick = tcp_gethptstick(&tv);
 	cts_last_ran[hpts->p_num] = tcp_tv_to_usectick(&tv);
 	orig_exit_slot = hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick);
 	if ((hpts->p_on_queue_cnt == 0) ||
 	    (hpts->p_lasttick == hpts->p_curtick)) {
 		/*
 		 * No time has yet passed,
 		 * or nothing to do.
 		 */
 		hpts->p_prev_slot = hpts->p_cur_slot;
 		hpts->p_lasttick = hpts->p_curtick;
 		goto no_run;
 	}
 again:
 	hpts->p_wheel_complete = 0;
 	HPTS_MTX_ASSERT(hpts);
 	slots_to_run = hpts_slots_diff(hpts->p_prev_slot, hpts->p_cur_slot);
 	if (((hpts->p_curtick - hpts->p_lasttick) >
 	     ((NUM_OF_HPTSI_SLOTS-1) * HPTS_TICKS_PER_SLOT)) &&
 	    (hpts->p_on_queue_cnt != 0)) {
 		/*
 		 * Wheel wrap is occuring, basically we
 		 * are behind and the distance between
 		 * run's has spread so much it has exceeded
 		 * the time on the wheel (1.024 seconds). This
 		 * is ugly and should NOT be happening. We
 		 * need to run the entire wheel. We last processed
 		 * p_prev_slot, so that needs to be the last slot
 		 * we run. The next slot after that should be our
 		 * reserved first slot for new, and then starts
 		 * the running position. Now the problem is the
 		 * reserved "not to yet" place does not exist
 		 * and there may be inp's in there that need
 		 * running. We can merge those into the
 		 * first slot at the head.
 		 */
 		wrap_loop_cnt++;
 		hpts->p_nxt_slot = hpts_slot(hpts->p_prev_slot, 1);
 		hpts->p_runningslot = hpts_slot(hpts->p_prev_slot, 2);
 		/*
 		 * Adjust p_cur_slot to be where we are starting from
 		 * hopefully we will catch up (fat chance if something
 		 * is broken this bad :( )
 		 */
 		hpts->p_cur_slot = hpts->p_prev_slot;
 		/*
 		 * The next slot has guys to run too, and that would
 		 * be where we would normally start, lets move them into
 		 * the next slot (p_prev_slot + 2) so that we will
 		 * run them, the extra 10usecs of late (by being
 		 * put behind) does not really matter in this situation.
 		 */
 		TAILQ_FOREACH(inp, &hpts->p_hptss[hpts->p_nxt_slot].head,
 		    inp_hpts) {
 			MPASS(inp->inp_hptsslot == hpts->p_nxt_slot);
 			MPASS(inp->inp_hpts_gencnt ==
 			    hpts->p_hptss[hpts->p_nxt_slot].gencnt);
 			MPASS(inp->inp_in_hpts == IHPTS_ONQUEUE);
 
 			/*
 			 * Update gencnt and nextslot accordingly to match
 			 * the new location. This is safe since it takes both
 			 * the INP lock and the pacer mutex to change the
 			 * inp_hptsslot and inp_hpts_gencnt.
 			 */
 			inp->inp_hpts_gencnt =
 			    hpts->p_hptss[hpts->p_runningslot].gencnt;
 			inp->inp_hptsslot = hpts->p_runningslot;
 		}
 		TAILQ_CONCAT(&hpts->p_hptss[hpts->p_runningslot].head,
 		    &hpts->p_hptss[hpts->p_nxt_slot].head, inp_hpts);
 		hpts->p_hptss[hpts->p_runningslot].count +=
 		    hpts->p_hptss[hpts->p_nxt_slot].count;
 		hpts->p_hptss[hpts->p_nxt_slot].count = 0;
 		hpts->p_hptss[hpts->p_nxt_slot].gencnt++;
 		slots_to_run = NUM_OF_HPTSI_SLOTS - 1;
 		counter_u64_add(wheel_wrap, 1);
 	} else {
 		/*
 		 * Nxt slot is always one after p_runningslot though
 		 * its not used usually unless we are doing wheel wrap.
 		 */
 		hpts->p_nxt_slot = hpts->p_prev_slot;
 		hpts->p_runningslot = hpts_slot(hpts->p_prev_slot, 1);
 	}
 	if (hpts->p_on_queue_cnt == 0) {
 		goto no_one;
 	}
 	for (i = 0; i < slots_to_run; i++) {
 		struct inpcb *inp, *ninp;
 		TAILQ_HEAD(, inpcb) head = TAILQ_HEAD_INITIALIZER(head);
 		struct hptsh *hptsh;
 		uint32_t runningslot;
 
 		/*
 		 * Calculate our delay, if there are no extra ticks there
 		 * was not any (i.e. if slots_to_run == 1, no delay).
 		 */
 		hpts->p_delayed_by = (slots_to_run - (i + 1)) *
 		    HPTS_TICKS_PER_SLOT;
 
 		/*
 		 * Move the whole slot onto a private list and account for
 		 * it while the pacer mutex is still held; the mutex is then
 		 * dropped for the duration of the per-inp processing.
 		 */
 		runningslot = hpts->p_runningslot;
 		hptsh = &hpts->p_hptss[runningslot];
 		TAILQ_SWAP(&head, &hptsh->head, inpcb, inp_hpts);
 		hpts->p_on_queue_cnt -= hptsh->count;
 		hptsh->count = 0;
 		hptsh->gencnt++;
 
 		HPTS_UNLOCK(hpts);
 
 		TAILQ_FOREACH_SAFE(inp, &head, inp_hpts, ninp) {
 			bool set_cpu;
 
 			if (ninp != NULL) {
 				/* We prefetch the next inp if possible */
 				kern_prefetch(ninp, &prefetch_ninp);
 				prefetch_ninp = 1;
 			}
 
 			/* For debugging */
 			if (seen_endpoint == 0) {
 				seen_endpoint = 1;
 				orig_exit_slot = slot_pos_of_endpoint =
 				    runningslot;
 			} else if (completed_measure == 0) {
 				/* Record the new position */
 				orig_exit_slot = runningslot;
 			}
 
 			INP_WLOCK(inp);
 			if (inp->inp_hpts_cpu_set == 0) {
 				set_cpu = true;
 			} else {
 				set_cpu = false;
 			}
 
 			/*
 			 * An inp marked IHPTS_MOVING is not run here: with
 			 * slot -1 it is taken off the hpts and the reference
 			 * is dropped, otherwise it is re-inserted.
 			 */
 			if (__predict_false(inp->inp_in_hpts == IHPTS_MOVING)) {
 				if (inp->inp_hptsslot == -1) {
 					inp->inp_in_hpts = IHPTS_NONE;
 					if (in_pcbrele_wlocked(inp) == false)
 						INP_WUNLOCK(inp);
 				} else {
 					HPTS_LOCK(hpts);
 					inp_hpts_insert(inp, hpts);
 					HPTS_UNLOCK(hpts);
 					INP_WUNLOCK(inp);
 				}
 				continue;
 			}
 
 			MPASS(inp->inp_in_hpts == IHPTS_ONQUEUE);
-			MPASS(!(inp->inp_flags & (INP_DROPPED|INP_TIMEWAIT)));
+			MPASS(!(inp->inp_flags & INP_DROPPED));
 			KASSERT(runningslot == inp->inp_hptsslot,
 				("Hpts:%p inp:%p slot mis-aligned %u vs %u",
 				 hpts, inp, runningslot, inp->inp_hptsslot));
 
 			if (inp->inp_hpts_request) {
 				/*
 				 * This guy is deferred out further in time
 				 * then our wheel had available on it.
 				 * Push him back on the wheel or run it
 				 * depending.
 				 */
 				uint32_t maxslots, last_slot, remaining_slots;
 
 				remaining_slots = slots_to_run - (i + 1);
 				if (inp->inp_hpts_request > remaining_slots) {
 					HPTS_LOCK(hpts);
 					/*
 					 * How far out can we go?
 					 */
 					maxslots = max_slots_available(hpts,
 					    hpts->p_cur_slot, &last_slot);
 					if (maxslots >= inp->inp_hpts_request) {
 						/* We can place it finally to
 						 * be processed.  */
 						inp->inp_hptsslot = hpts_slot(
 						    hpts->p_runningslot,
 						    inp->inp_hpts_request);
 						inp->inp_hpts_request = 0;
 					} else {
 						/* Work off some more time */
 						inp->inp_hptsslot = last_slot;
 						inp->inp_hpts_request -=
 						    maxslots;
 					}
 					inp_hpts_insert(inp, hpts);
 					HPTS_UNLOCK(hpts);
 					INP_WUNLOCK(inp);
 					continue;
 				}
 				inp->inp_hpts_request = 0;
 				/* Fall through we will so do it now */
 			}
 
 			inp_hpts_release(inp);
 			tp = intotcpcb(inp);
 			MPASS(tp);
 			if (set_cpu) {
 				/*
 				 * Setup so the next time we will move to
 				 * the right CPU. This should be a rare
 				 * event. It will sometimes happens when we
 				 * are the client side (usually not the
 				 * server). Somehow tcp_output() gets called
 				 * before the tcp_do_segment() sets the
 				 * intial state. This means the r_cpu and
 				 * r_hpts_cpu is 0. We get on the hpts, and
 				 * then tcp_input() gets called setting up
 				 * the r_cpu to the correct value. The hpts
 				 * goes off and sees the mis-match. We
 				 * simply correct it here and the CPU will
 				 * switch to the new hpts nextime the tcb
 				 * gets added to the hpts (not this one)
 				 * :-)
 				 */
 				tcp_set_hpts(inp);
 			}
 			CURVNET_SET(inp->inp_vnet);
 			/* Lets do any logging that we might want to */
 			if (hpts_does_tp_logging && (tp->t_logstate != TCP_LOG_STATE_OFF)) {
 				tcp_hpts_log(hpts, tp, &tv, slots_to_run, i, from_callout);
 			}
 
 			if (tp->t_fb_ptr != NULL) {
 				kern_prefetch(tp->t_fb_ptr, &did_prefetch);
 				did_prefetch = 1;
 			}
 			if ((inp->inp_flags2 & INP_SUPPORTS_MBUFQ) && tp->t_in_pkt) {
 				error = (*tp->t_fb->tfb_do_queued_segments)(inp->inp_socket, tp, 0);
 				if (error) {
 					/* The input killed the connection */
 					goto skip_pacing;
 				}
 			}
 			inp->inp_hpts_calls = 1;
 			error = tcp_output(tp);
 			if (error < 0)
 				goto skip_pacing;
 			inp->inp_hpts_calls = 0;
 			if (ninp && ninp->inp_ppcb) {
 				/*
 				 * If we have a nxt inp, see if we can
 				 * prefetch its ppcb. Note this may seem
 				 * "risky" since we have no locks (other
 				 * than the previous inp) and there no
 				 * assurance that ninp was not pulled while
 				 * we were processing inp and freed. If this
 				 * occurred it could mean that either:
 				 *
 				 * a) Its NULL (which is fine we won't go
 				 * here) <or> b) Its valid (which is cool we
 				 * will prefetch it) <or> c) The inp got
 				 * freed back to the slab which was
 				 * reallocated. Then the piece of memory was
 				 * re-used and something else (not an
 				 * address) is in inp_ppcb. If that occurs
 				 * we don't crash, but take a TLB shootdown
 				 * performance hit (same as if it was NULL
 				 * and we tried to pre-fetch it).
 				 *
 				 * Considering that the likelyhood of <c> is
 				 * quite rare we will take a risk on doing
 				 * this. If performance drops after testing
 				 * we can always take this out. NB: the
 				 * kern_prefetch on amd64 actually has
 				 * protection against a bad address now via
 				 * the DMAP_() tests. This will prevent the
 				 * TLB hit, and instead if <c> occurs just
 				 * cause us to load cache with a useless
 				 * address (to us).
 				 */
 				kern_prefetch(ninp->inp_ppcb, &prefetch_tp);
 				prefetch_tp = 1;
 			}
 			INP_WUNLOCK(inp);
 			/*
 			 * The two goto paths above land here without the
 			 * INP_WUNLOCK().  NOTE(review): presumably the callee
 			 * has already released the inp lock when it reports
 			 * such an error -- confirm against tcp_output() and
 			 * tfb_do_queued_segments().
 			 */
 		skip_pacing:
 			CURVNET_RESTORE();
 		}
 		if (seen_endpoint) {
 			/*
 			 * We now have a accurate distance between
 			 * slot_pos_of_endpoint <-> orig_exit_slot
 			 * to tell us how late we were, orig_exit_slot
 			 * is where we calculated the end of our cycle to
 			 * be when we first entered.
 			 */
 			completed_measure = 1;
 		}
 		/* Re-take the pacer mutex before advancing to the next slot. */
 		HPTS_LOCK(hpts);
 		hpts->p_runningslot++;
 		if (hpts->p_runningslot >= NUM_OF_HPTSI_SLOTS) {
 			hpts->p_runningslot = 0;
 		}
 	}
 no_one:
 	HPTS_MTX_ASSERT(hpts);
 	hpts->p_delayed_by = 0;
 	/*
 	 * Check to see if we took an excess amount of time and need to run
 	 * more ticks (if we did not hit eno-bufs).
 	 */
 	hpts->p_prev_slot = hpts->p_cur_slot;
 	hpts->p_lasttick = hpts->p_curtick;
 	if ((from_callout == 0) || (loop_cnt > max_pacer_loops)) {
 		/*
 		 * Something is serious slow we have
 		 * looped through processing the wheel
 		 * and by the time we cleared the
 		 * needs to run max_pacer_loops time
 		 * we still needed to run. That means
 		 * the system is hopelessly behind and
 		 * can never catch up :(
 		 *
 		 * We will just lie to this thread
 		 * and let it thing p_curtick is
 		 * correct. When it next awakens
 		 * it will find itself further behind.
 		 */
 		if (from_callout)
 			counter_u64_add(hpts_hopelessly_behind, 1);
 		goto no_run;
 	}
 	hpts->p_curtick = tcp_gethptstick(&tv);
 	hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick);
 	if (seen_endpoint == 0) {
 		/* We saw no endpoint but we may be looping */
 		orig_exit_slot = hpts->p_cur_slot;
 	}
 	if ((wrap_loop_cnt < 2) &&
 	    (hpts->p_lasttick != hpts->p_curtick)) {
 		counter_u64_add(hpts_loops, 1);
 		loop_cnt++;
 		goto again;
 	}
 no_run:
 	cts_last_ran[hpts->p_num] = tcp_tv_to_usectick(&tv);
 	/*
 	 * Set flag to tell that we are done for
 	 * any slot input that happens during
 	 * input.
 	 */
 	hpts->p_wheel_complete = 1;
 	/*
 	 * Now did we spend too long running input and need to run more ticks?
 	 * Note that if wrap_loop_cnt < 2 then we should have the conditions
 	 * in the KASSERT's true. But if the wheel is behind i.e. wrap_loop_cnt
 	 * is greater than 2, then the condtion most likely are *not* true.
 	 * Also if we are called not from the callout, we don't run the wheel
 	 * multiple times so the slots may not align either.
 	 */
 	KASSERT(((hpts->p_prev_slot == hpts->p_cur_slot) ||
 		 (wrap_loop_cnt >= 2) || (from_callout == 0)),
 		("H:%p p_prev_slot:%u not equal to p_cur_slot:%u", hpts,
 		 hpts->p_prev_slot, hpts->p_cur_slot));
 	KASSERT(((hpts->p_lasttick == hpts->p_curtick)
 		 || (wrap_loop_cnt >= 2) || (from_callout == 0)),
 		("H:%p p_lasttick:%u not equal to p_curtick:%u", hpts,
 		 hpts->p_lasttick, hpts->p_curtick));
 	if (from_callout && (hpts->p_lasttick != hpts->p_curtick)) {
 		hpts->p_curtick = tcp_gethptstick(&tv);
 		counter_u64_add(hpts_loops, 1);
 		hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick);
 		goto again;
 	}
 
 	/* Only the callout path recomputes how long the hpts may sleep. */
 	if (from_callout){
 		tcp_hpts_set_max_sleep(hpts, wrap_loop_cnt);
 	}
 	if (seen_endpoint)
 		return(hpts_slots_diff(slot_pos_of_endpoint, orig_exit_slot));
 	else
 		return (0);
 }
 
 /*
  * Assign an hpts cpu to the inp if it has none yet and is not currently
  * on an hpts.  The caller must hold the inp write lock.  The assignment
  * is only marked as set when hpts_cpuid() did not report a failure.
  */
 void
 __tcp_set_hpts(struct inpcb *inp, int32_t line)
 {
 	struct tcp_hpts_entry *entry;
 	int lookup_failed;
 
 	INP_WLOCK_ASSERT(inp);
 	entry = tcp_hpts_lock(inp);
 	if (inp->inp_in_hpts == 0 && inp->inp_hpts_cpu_set == 0) {
 		inp->inp_hpts_cpu = hpts_cpuid(inp, &lookup_failed);
 		if (!lookup_failed)
 			inp->inp_hpts_cpu_set = 1;
 	}
 	mtx_unlock(&entry->p_mtx);
 }
 
 static void
 __tcp_run_hpts(struct tcp_hpts_entry *hpts)
 {
 	int ticks_ran;
 
 	if (hpts->p_hpts_active) {
 		/* Already active */
 		return;
 	}
 	if (mtx_trylock(&hpts->p_mtx) == 0) {
 		/* Someone else got the lock */
 		return;
 	}
 	if (hpts->p_hpts_active)
 		goto out_with_mtx;
 	hpts->syscall_cnt++;
 	counter_u64_add(hpts_direct_call, 1);
 	hpts->p_hpts_active = 1;
 	ticks_ran = tcp_hptsi(hpts, 0);
 	/* We may want to adjust the sleep values here */
 	if (hpts->p_on_queue_cnt >= conn_cnt_thresh) {
 		if (ticks_ran > ticks_indicate_less_sleep) {
 			struct timeval tv;
 			sbintime_t sb;
 
 			hpts->p_mysleep.tv_usec /= 2;
 			if (hpts->p_mysleep.tv_usec < dynamic_min_sleep)
 				hpts->p_mysleep.tv_usec = dynamic_min_sleep;
 			/* Reschedule with new to value */
 			tcp_hpts_set_max_sleep(hpts, 0);
 			tv.tv_usec = hpts->p_hpts_sleep_time * HPTS_TICKS_PER_SLOT;
 			/* Validate its in the right ranges */
 			if (tv.tv_usec < hpts->p_mysleep.tv_usec) {
 				hpts->overidden_sleep = tv.tv_usec;
 				tv.tv_usec = hpts->p_mysleep.tv_usec;
 			} else if (tv.tv_usec > dynamic_max_sleep) {
 				/* Lets not let sleep get above this value */
 				hpts->overidden_sleep = tv.tv_usec;
 				tv.tv_usec = dynamic_max_sleep;
 			}
 			/*
 			 * In this mode the timer is a backstop to
 			 * all the userret/lro_flushes so we use
 			 * the dynamic value and set the on_min_sleep
 			 * flag so we will not be awoken.
 			 */
 			sb = tvtosbt(tv);
 			/* Store off to make visible the actual sleep time */
 			hpts->sleeping = tv.tv_usec;
 			callout_reset_sbt_on(&hpts->co, sb, 0,
 					     hpts_timeout_swi, hpts, hpts->p_cpu,
 					     (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision)));
 		} else if (ticks_ran < ticks_indicate_more_sleep) {
 			/* For the further sleep, don't reschedule  hpts */
 			hpts->p_mysleep.tv_usec *= 2;
 			if (hpts->p_mysleep.tv_usec > dynamic_max_sleep)
 				hpts->p_mysleep.tv_usec = dynamic_max_sleep;
 		}
 		hpts->p_on_min_sleep = 1;
 	}
 	hpts->p_hpts_active = 0;
 out_with_mtx:
 	HPTS_MTX_ASSERT(hpts);
 	mtx_unlock(&hpts->p_mtx);
 }
 
 /*
  * Pick the hpts that has gone the longest without running, judged by
  * cts_last_ran[].  When more than one cpu group is configured only the
  * hpts indices of the group containing curcpu are considered.  If no
  * candidate shows a positive age, the hpts at curcpu modulo the number
  * of hpts entries is returned.
  */
 static struct tcp_hpts_entry *
 tcp_choose_hpts_to_run(void)
 {
 	int i, pick, first, limit;
 	uint32_t now, best_age, age;
 
 	now = tcp_get_usecs(NULL);
 
 	/* Default is all one group */
 	first = 0;
 	limit = tcp_pace.rp_num_hptss;
 	/*
 	 * With several L3 groups, narrow the range to the group this
 	 * CPU belongs to.
 	 */
 	if (tcp_pace.grp_cnt > 1) {
 		for (i = 0; i < tcp_pace.grp_cnt; i++) {
 			if (!CPU_ISSET(curcpu, &tcp_pace.grps[i]->cg_mask))
 				continue;
 			first = tcp_pace.grps[i]->cg_first;
 			limit = (tcp_pace.grps[i]->cg_last + 1);
 			break;
 		}
 	}
 
 	/* Find the entry whose last run lies furthest in the past. */
 	pick = -1;
 	best_age = 0;
 	for (i = first; i < limit; i++) {
 		age = TSTMP_GT(now, cts_last_ran[i]) ?
 		    now - cts_last_ran[i] : 0;
 		if (age > best_age) {
 			pick = i;
 			best_age = age;
 		}
 	}
 	if (pick < 0)
 		return(tcp_pace.rp_ent[(curcpu % tcp_pace.rp_num_hptss)]);
 	return(tcp_pace.rp_ent[pick]);
 }
 
 
 /*
  * Choose an hpts and run it from the caller's context, inside the
  * network epoch.
  */
 void
 tcp_run_hpts(void)
 {
 	/*
 	 * Must be an automatic variable: a function-local static would be
 	 * shared by every CPU calling in here concurrently, letting one
 	 * caller overwrite another's choice between the assignment and
 	 * the call below.
 	 */
 	struct tcp_hpts_entry *hpts;
 	struct epoch_tracker et;
 
 	NET_EPOCH_ENTER(et);
 	hpts = tcp_choose_hpts_to_run();
 	__tcp_run_hpts(hpts);
 	NET_EPOCH_EXIT(et);
 }
 
 
 static void
 tcp_hpts_thread(void *ctx)
 {
 	struct tcp_hpts_entry *hpts;
 	struct epoch_tracker et;
 	struct timeval tv;
 	sbintime_t sb;
 	int ticks_ran;
 
 	hpts = (struct tcp_hpts_entry *)ctx;
 	mtx_lock(&hpts->p_mtx);
 	if (hpts->p_direct_wake) {
 		/* Signaled by input or output with low occupancy count. */
 		callout_stop(&hpts->co);
 		counter_u64_add(hpts_direct_awakening, 1);
 	} else {
 		/* Timed out, the normal case. */
 		counter_u64_add(hpts_wake_timeout, 1);
 		if (callout_pending(&hpts->co) ||
 		    !callout_active(&hpts->co)) {
 			mtx_unlock(&hpts->p_mtx);
 			return;
 		}
 	}
 	callout_deactivate(&hpts->co);
 	hpts->p_hpts_wake_scheduled = 0;
 	NET_EPOCH_ENTER(et);
 	if (hpts->p_hpts_active) {
 		/*
 		 * We are active already. This means that a syscall
 		 * trap or LRO is running in behalf of hpts. In that case
 		 * we need to double our timeout since there seems to be
 		 * enough activity in the system that we don't need to
 		 * run as often (if we were not directly woken).
 		 */
 		if (hpts->p_direct_wake == 0) {
 			counter_u64_add(hpts_back_tosleep, 1);
 			if (hpts->p_on_queue_cnt >= conn_cnt_thresh) {
 				hpts->p_mysleep.tv_usec *= 2;
 				if (hpts->p_mysleep.tv_usec > dynamic_max_sleep)
 					hpts->p_mysleep.tv_usec = dynamic_max_sleep;
 				tv.tv_usec = hpts->p_mysleep.tv_usec;
 				hpts->p_on_min_sleep = 1;
 			} else {
 				/*
 				 * Here we have low count on the wheel, but
 				 * somehow we still collided with one of the
 				 * connections. Lets go back to sleep for a
 				 * min sleep time, but clear the flag so we
 				 * can be awoken by insert.
 				 */
 				hpts->p_on_min_sleep = 0;
 				tv.tv_usec = tcp_min_hptsi_time;
 			}
 		} else {
 			/*
 			 * Directly woken most likely to reset the
 			 * callout time.
 			 */
 			tv.tv_sec = 0;
 			tv.tv_usec = hpts->p_mysleep.tv_usec;
 		}
 		goto back_to_sleep;
 	}
 	hpts->sleeping = 0;
 	hpts->p_hpts_active = 1;
 	ticks_ran = tcp_hptsi(hpts, 1);
 	tv.tv_sec = 0;
 	tv.tv_usec = hpts->p_hpts_sleep_time * HPTS_TICKS_PER_SLOT;
 	if (hpts->p_on_queue_cnt >= conn_cnt_thresh) {
 		if(hpts->p_direct_wake == 0) {
 			/*
 			 * Only adjust sleep time if we were
 			 * called from the callout i.e. direct_wake == 0.
 			 */
 			if (ticks_ran < ticks_indicate_more_sleep) {
 				hpts->p_mysleep.tv_usec *= 2;
 				if (hpts->p_mysleep.tv_usec > dynamic_max_sleep)
 					hpts->p_mysleep.tv_usec = dynamic_max_sleep;
 			} else if (ticks_ran > ticks_indicate_less_sleep) {
 				hpts->p_mysleep.tv_usec /= 2;
 				if (hpts->p_mysleep.tv_usec < dynamic_min_sleep)
 					hpts->p_mysleep.tv_usec = dynamic_min_sleep;
 			}
 		}
 		if (tv.tv_usec < hpts->p_mysleep.tv_usec) {
 			hpts->overidden_sleep = tv.tv_usec;
 			tv.tv_usec = hpts->p_mysleep.tv_usec;
 		} else if (tv.tv_usec > dynamic_max_sleep) {
 			/* Lets not let sleep get above this value */
 			hpts->overidden_sleep = tv.tv_usec;
 			tv.tv_usec = dynamic_max_sleep;
 		}
 		/*
 		 * In this mode the timer is a backstop to
 		 * all the userret/lro_flushes so we use
 		 * the dynamic value and set the on_min_sleep
 		 * flag so we will not be awoken.
 		 */
 		hpts->p_on_min_sleep = 1;
 	} else if (hpts->p_on_queue_cnt == 0)  {
 		/*
 		 * No one on the wheel, please wake us up
 		 * if you insert on the wheel.
 		 */
 		hpts->p_on_min_sleep = 0;
 		hpts->overidden_sleep = 0;
 	} else {
 		/*
 		 * We hit here when we have a low number of
 		 * clients on the wheel (our else clause).
 		 * We may need to go on min sleep, if we set
 		 * the flag we will not be awoken if someone
 		 * is inserted ahead of us. Clearing the flag
 		 * means we can be awoken. This is "old mode"
 		 * where the timer is what runs hpts mainly.
 		 */
 		if (tv.tv_usec < tcp_min_hptsi_time) {
 			/*
 			 * Yes on min sleep, which means
 			 * we cannot be awoken.
 			 */
 			hpts->overidden_sleep = tv.tv_usec;
 			tv.tv_usec = tcp_min_hptsi_time;
 			hpts->p_on_min_sleep = 1;
 		} else {
 			/* Clear the min sleep flag */
 			hpts->overidden_sleep = 0;
 			hpts->p_on_min_sleep = 0;
 		}
 	}
 	HPTS_MTX_ASSERT(hpts);
 	hpts->p_hpts_active = 0;
 back_to_sleep:
 	hpts->p_direct_wake = 0;
 	sb = tvtosbt(tv);
 	/* Store off to make visible the actual sleep time */
 	hpts->sleeping = tv.tv_usec;
 	callout_reset_sbt_on(&hpts->co, sb, 0,
 			     hpts_timeout_swi, hpts, hpts->p_cpu,
 			     (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision)));
 	NET_EPOCH_EXIT(et);
 	mtx_unlock(&hpts->p_mtx);
 }
 
 #undef	timersub
 
 /*
  * Count the cpu groups at level CG_SHARE_L3 in the topology subtree
  * rooted at cg (the root itself included).
  */
 static int32_t
 hpts_count_level(struct cpu_group *cg)
 {
 	int32_t child, total;
 
 	total = (cg->cg_level == CG_SHARE_L3) ? 1 : 0;
 	/* Add whatever each child subtree contributes. */
 	for (child = 0; child < cg->cg_children; child++)
 		total += hpts_count_level(&cg->cg_child[child]);
 	return (total);
 }
 
 /*
  * Collect pointers to the CG_SHARE_L3 groups of the subtree rooted at cg
  * into grps[], using *at as the next free index.  The walk of this
  * subtree stops descending as soon as storing an entry brings *at up
  * to max.
  */
 static void
 hpts_gather_grps(struct cpu_group **grps, int32_t *at, int32_t max, struct cpu_group *cg)
 {
 	int32_t child;
 
 	if (cg->cg_level == CG_SHARE_L3) {
 		grps[*at] = cg;
 		(*at)++;
 		if (*at == max)
 			return;
 	}
 	/* Walk all the children looking for L3 */
 	for (child = 0; child < cg->cg_children; child++)
 		hpts_gather_grps(grps, at, max, &cg->cg_child[child]);
 }
 
 /*
  * One-time initialization of the hpts subsystem.
  *
  * Allocates the statistics counters, one tcp_hpts_entry (with its wheel
  * of NUM_OF_HPTSI_SLOTS slots) per cpu, the cts_last_ran[] array and the
  * table of L3 cpu groups; registers a sysctl node per hpts; creates one
  * swi handler per hpts, optionally binds it according to
  * tcp_bind_threads, and arms the initial callout.  The st argument is
  * not used.
  */
 static void
 tcp_init_hptsi(void *st)
 {
 	struct cpu_group *cpu_top;
 	int32_t error __diagused;
 	int32_t i, j, bound = 0, created = 0;
 	size_t sz, asz;
 	struct timeval tv;
 	sbintime_t sb;
 	struct tcp_hpts_entry *hpts;
 	struct pcpu *pc;
 	char unit[16];
 	uint32_t ncpus = mp_ncpus ? mp_ncpus : MAXCPU;
 	int count, domain;
 
 #ifdef SMP
 	cpu_top = smp_topo();
 #else
 	cpu_top = NULL;
 #endif
 	tcp_pace.rp_num_hptss = ncpus;
 	hpts_hopelessly_behind = counter_u64_alloc(M_WAITOK);
 	hpts_loops = counter_u64_alloc(M_WAITOK);
 	back_tosleep = counter_u64_alloc(M_WAITOK);
 	combined_wheel_wrap = counter_u64_alloc(M_WAITOK);
 	wheel_wrap = counter_u64_alloc(M_WAITOK);
 	hpts_wake_timeout = counter_u64_alloc(M_WAITOK);
 	hpts_direct_awakening = counter_u64_alloc(M_WAITOK);
 	hpts_back_tosleep = counter_u64_alloc(M_WAITOK);
 	hpts_direct_call = counter_u64_alloc(M_WAITOK);
 	cpu_uses_flowid = counter_u64_alloc(M_WAITOK);
 	cpu_uses_random = counter_u64_alloc(M_WAITOK);
 
 	sz = (tcp_pace.rp_num_hptss * sizeof(struct tcp_hpts_entry *));
 	tcp_pace.rp_ent = malloc(sz, M_TCPHPTS, M_WAITOK | M_ZERO);
 	sz = (sizeof(uint32_t) * tcp_pace.rp_num_hptss);
 	/* Not zeroed here; every element is assigned in the loop below. */
 	cts_last_ran = malloc(sz, M_TCPHPTS, M_WAITOK);
 	tcp_pace.grp_cnt = 0;
 	if (cpu_top == NULL) {
 		/*
 		 * No topology information: a single group is assumed and
 		 * tcp_pace.grps is left unallocated on this path.
 		 */
 		tcp_pace.grp_cnt = 1;
 	} else {
 		/* Find out how many cache level 3 domains we have */
 		count = 0;
 		tcp_pace.grp_cnt = hpts_count_level(cpu_top);
 		if (tcp_pace.grp_cnt == 0) {
 			tcp_pace.grp_cnt = 1;
 		}
 		sz = (tcp_pace.grp_cnt * sizeof(struct cpu_group *));
 		tcp_pace.grps = malloc(sz, M_TCPHPTS, M_WAITOK);
 		/* Now populate the groups */
 		if (tcp_pace.grp_cnt == 1) {
 			/*
 			 * All we need is the top level all cpu's are in
 			 * the same cache so when we use grp[0]->cg_mask
 			 * with the cg_first <-> cg_last it will include
 			 * all cpu's in it. The level here is probably
 			 * zero which is ok.
 			 */
 			tcp_pace.grps[0] = cpu_top;
 		} else {
 			/*
 			 * Here we must find all the level three cache domains
 			 * and setup our pointers to them.
 			 */
 			count = 0;
 			hpts_gather_grps(tcp_pace.grps, &count, tcp_pace.grp_cnt, cpu_top);
 		}
 	}
 	asz = sizeof(struct hptsh) * NUM_OF_HPTSI_SLOTS;
 	for (i = 0; i < tcp_pace.rp_num_hptss; i++) {
 		tcp_pace.rp_ent[i] = malloc(sizeof(struct tcp_hpts_entry),
 		    M_TCPHPTS, M_WAITOK | M_ZERO);
 		tcp_pace.rp_ent[i]->p_hptss = malloc(asz, M_TCPHPTS, M_WAITOK);
 		hpts = tcp_pace.rp_ent[i];
 		/*
 		 * Init all the hpts structures that are not specifically
 		 * zero'd by the allocations. Also lets attach them to the
 		 * appropriate sysctl block as well.
 		 */
 		mtx_init(&hpts->p_mtx, "tcp_hpts_lck",
 		    "hpts", MTX_DEF | MTX_DUPOK);
 		for (j = 0; j < NUM_OF_HPTSI_SLOTS; j++) {
 			TAILQ_INIT(&hpts->p_hptss[j].head);
 			hpts->p_hptss[j].count = 0;
 			hpts->p_hptss[j].gencnt = 0;
 		}
 		sysctl_ctx_init(&hpts->hpts_ctx);
 		/* The sysctl node is named after the hpts index. */
 		sprintf(unit, "%d", i);
 		hpts->hpts_root = SYSCTL_ADD_NODE(&hpts->hpts_ctx,
 		    SYSCTL_STATIC_CHILDREN(_net_inet_tcp_hpts),
 		    OID_AUTO,
 		    unit,
 		    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
 		    "");
 		SYSCTL_ADD_INT(&hpts->hpts_ctx,
 		    SYSCTL_CHILDREN(hpts->hpts_root),
 		    OID_AUTO, "out_qcnt", CTLFLAG_RD,
 		    &hpts->p_on_queue_cnt, 0,
 		    "Count TCB's awaiting output processing");
 		SYSCTL_ADD_U16(&hpts->hpts_ctx,
 		    SYSCTL_CHILDREN(hpts->hpts_root),
 		    OID_AUTO, "active", CTLFLAG_RD,
 		    &hpts->p_hpts_active, 0,
 		    "Is the hpts active");
 		SYSCTL_ADD_UINT(&hpts->hpts_ctx,
 		    SYSCTL_CHILDREN(hpts->hpts_root),
 		    OID_AUTO, "curslot", CTLFLAG_RD,
 		    &hpts->p_cur_slot, 0,
 		    "What the current running pacers goal");
 		SYSCTL_ADD_UINT(&hpts->hpts_ctx,
 		    SYSCTL_CHILDREN(hpts->hpts_root),
 		    OID_AUTO, "runtick", CTLFLAG_RD,
 		    &hpts->p_runningslot, 0,
 		    "What the running pacers current slot is");
 		SYSCTL_ADD_UINT(&hpts->hpts_ctx,
 		    SYSCTL_CHILDREN(hpts->hpts_root),
 		    OID_AUTO, "curtick", CTLFLAG_RD,
 		    &hpts->p_curtick, 0,
 		    "What the running pacers last tick mapped to the wheel was");
 		SYSCTL_ADD_UINT(&hpts->hpts_ctx,
 		    SYSCTL_CHILDREN(hpts->hpts_root),
 		    OID_AUTO, "lastran", CTLFLAG_RD,
 		    &cts_last_ran[i], 0,
 		    "The last usec tick that this hpts ran");
 		SYSCTL_ADD_LONG(&hpts->hpts_ctx,
 		    SYSCTL_CHILDREN(hpts->hpts_root),
 		    OID_AUTO, "cur_min_sleep", CTLFLAG_RD,
 		    &hpts->p_mysleep.tv_usec,
 		    "What the running pacers is using for p_mysleep.tv_usec");
 		SYSCTL_ADD_U64(&hpts->hpts_ctx,
 		    SYSCTL_CHILDREN(hpts->hpts_root),
 		    OID_AUTO, "now_sleeping", CTLFLAG_RD,
 		    &hpts->sleeping, 0,
 		    "What the running pacers is actually sleeping for");
 		SYSCTL_ADD_U64(&hpts->hpts_ctx,
 		    SYSCTL_CHILDREN(hpts->hpts_root),
 		    OID_AUTO, "syscall_cnt", CTLFLAG_RD,
 		    &hpts->syscall_cnt, 0,
 		    "How many times we had syscalls on this hpts");
 
 		hpts->p_hpts_sleep_time = hpts_sleep_max;
 		hpts->p_num = i;
 		hpts->p_curtick = tcp_gethptstick(&tv);
 		cts_last_ran[i] = tcp_tv_to_usectick(&tv);
 		hpts->p_prev_slot = hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick);
 		/* Placeholder; the real cpu is assigned in the loop below. */
 		hpts->p_cpu = 0xffff;
 		hpts->p_nxt_slot = hpts_slot(hpts->p_cur_slot, 1);
 		callout_init(&hpts->co, 1);
 	}
 	/* Don't try to bind to NUMA domains if we don't have any */
 	if (vm_ndomains == 1 && tcp_bind_threads == 2)
 		tcp_bind_threads = 0;
 
 	/*
 	 * Now lets start ithreads to handle the hptss.
 	 */
 	for (i = 0; i < tcp_pace.rp_num_hptss; i++) {
 		hpts = tcp_pace.rp_ent[i];
 		hpts->p_cpu = i;
 
 		error = swi_add(&hpts->ie, "hpts",
 		    tcp_hpts_thread, (void *)hpts,
 		    SWI_NET, INTR_MPSAFE, &hpts->ie_cookie);
 		KASSERT(error == 0,
 			("Can't add hpts:%p i:%d err:%d",
 			 hpts, i, error));
 		created++;
 		hpts->p_mysleep.tv_sec = 0;
 		hpts->p_mysleep.tv_usec = tcp_min_hptsi_time;
 		/*
 		 * tcp_bind_threads == 1 binds the handler to cpu i;
 		 * == 2 binds it to the cpu set of the group containing
 		 * cpu i and records i in that cpu's hpts_domains[] entry.
 		 */
 		if (tcp_bind_threads == 1) {
 			if (intr_event_bind(hpts->ie, i) == 0)
 				bound++;
 		} else if (tcp_bind_threads == 2) {
 			/* Find the group for this CPU (i) and bind into it */
 			for (j = 0; j < tcp_pace.grp_cnt; j++) {
 				if (CPU_ISSET(i, &tcp_pace.grps[j]->cg_mask)) {
 					if (intr_event_bind_ithread_cpuset(hpts->ie,
 						&tcp_pace.grps[j]->cg_mask) == 0) {
 						bound++;
 						pc = pcpu_find(i);
 						domain = pc->pc_domain;
 						count = hpts_domains[domain].count;
 						hpts_domains[domain].cpu[count] = i;
 						hpts_domains[domain].count++;
 						break;
 					}
 				}
 			}
 		}
 		/* Arm the first timeout for this hpts. */
 		tv.tv_sec = 0;
 		tv.tv_usec = hpts->p_hpts_sleep_time * HPTS_TICKS_PER_SLOT;
 		hpts->sleeping = tv.tv_usec;
 		sb = tvtosbt(tv);
 		callout_reset_sbt_on(&hpts->co, sb, 0,
 				     hpts_timeout_swi, hpts, hpts->p_cpu,
 				     (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision)));
 	}
 	/*
 	 * If we somehow have an empty domain, fall back to choosing
 	 * among all htps threads.
 	 */
 	for (i = 0; i < vm_ndomains; i++) {
 		if (hpts_domains[i].count == 0) {
 			tcp_bind_threads = 0;
 			break;
 		}
 	}
 	printf("TCP Hpts created %d swi interrupt threads and bound %d to %s\n",
 	    created, bound,
 	    tcp_bind_threads == 2 ? "NUMA domains" : "cpus");
 #ifdef INVARIANTS
 	printf("HPTS is in INVARIANT mode!!\n");
 #endif
 }
 
 /*
  * Register tcp_init_hptsi() as the initializer for the hpts code at
  * SI_SUB_SOFTINTR / SI_ORDER_ANY, and declare version 1 of the tcphpts
  * module.
  */
 SYSINIT(tcphptsi, SI_SUB_SOFTINTR, SI_ORDER_ANY, tcp_init_hptsi, NULL);
 MODULE_VERSION(tcphpts, 1);
diff --git a/sys/netinet/tcp_log_buf.c b/sys/netinet/tcp_log_buf.c
index 5ec4acf367d5..6baf1bce4623 100644
--- a/sys/netinet/tcp_log_buf.c
+++ b/sys/netinet/tcp_log_buf.c
@@ -1,2639 +1,2639 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2016-2018 Netflix, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/arb.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/qmath.h>
 #include <sys/queue.h>
 #include <sys/refcount.h>
 #include <sys/rwlock.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sysctl.h>
 #include <sys/tree.h>
 #include <sys/stats.h> /* Must come after qmath.h and tree.h */
 #include <sys/counter.h>
 
 #include <dev/tcp_log/tcp_log_dev.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_pcb.h>
 #include <netinet/in_var.h>
 #include <netinet/tcp_var.h>
 #include <netinet/tcp_log_buf.h>
 
 /* Default expiry time (60 seconds, expressed as an sbintime_t) */
 #define	TCP_LOG_EXPIRE_TIME	((sbintime_t)60 * SBT_1S)
 
 /* Max interval at which to run the expiry timer (5 seconds) */
 #define	TCP_LOG_EXPIRE_INTVL	((sbintime_t)5 * SBT_1S)
 
 /* Exported read/write as the net.inet.tcp.bb.log_verbose sysctl. */
 bool	tcp_log_verbose;
 /* UMA zones backing ID buckets, ID nodes, and individual log entries. */
 static uma_zone_t tcp_log_id_bucket_zone, tcp_log_id_node_zone, tcp_log_zone;
 /* Exported as the net.inet.tcp.bb.log_session_limit sysctl. */
 static int	tcp_log_session_limit = TCP_LOG_BUF_DEFAULT_SESSION_LIMIT;
 /* Exported read-only as the net.inet.tcp.bb.log_version sysctl. */
 static uint32_t	tcp_log_version = TCP_LOG_BUF_VER;
 /* Red-black tree of ID buckets, ordered by tcp_log_id_cmp(). */
 RB_HEAD(tcp_log_id_tree, tcp_log_id_bucket);
 static struct tcp_log_id_tree tcp_log_id_head;
 /* Queue of ID nodes awaiting expiry; see TCPLOG_EXPIREQ_LOCK(). */
 static STAILQ_HEAD(, tcp_log_id_node) tcp_log_expireq_head =
     STAILQ_HEAD_INITIALIZER(tcp_log_expireq_head);
 static struct mtx tcp_log_expireq_mtx;
 static struct callout tcp_log_expireq_callout;
 /* Auto-capture 1 out of N sessions; 0 disables auto-capturing. */
 static u_long tcp_log_auto_ratio = 0;
 /* Running counter advanced atomically by tcp_log_selectauto(). */
 static volatile u_long tcp_log_auto_ratio_cur = 0;
 /* Log state stored in a new bucket when it is auto-selected. */
 static uint32_t tcp_log_auto_mode = TCP_LOG_STATE_TAIL;
 /* Auto-select from all sessions rather than just those with IDs. */
 static bool tcp_log_auto_all = false;
 /* When non-zero, tcp_log_selectauto() never selects a session. */
 static uint32_t tcp_disable_all_bb_logs = 0;
 
 /* Prototypes for the ID-bucket tree operations; generated further below. */
 RB_PROTOTYPE_STATIC(tcp_log_id_tree, tcp_log_id_bucket, tlb_rb, tcp_log_id_cmp)
 
 /* Everything below hangs off the net.inet.tcp.bb sysctl node. */
 SYSCTL_NODE(_net_inet_tcp, OID_AUTO, bb, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "TCP Black Box controls");
 
 SYSCTL_BOOL(_net_inet_tcp_bb, OID_AUTO, log_verbose, CTLFLAG_RW, &tcp_log_verbose,
     0, "Force verbose logging for TCP traces");
 
 SYSCTL_INT(_net_inet_tcp_bb, OID_AUTO, log_session_limit,
     CTLFLAG_RW, &tcp_log_session_limit, 0,
     "Maximum number of events maintained for each TCP session");
 
 /* Limits and current usage of the three UMA zones. */
 SYSCTL_UMA_MAX(_net_inet_tcp_bb, OID_AUTO, log_global_limit, CTLFLAG_RW,
     &tcp_log_zone, "Maximum number of events maintained for all TCP sessions");
 
 SYSCTL_UMA_CUR(_net_inet_tcp_bb, OID_AUTO, log_global_entries, CTLFLAG_RD,
     &tcp_log_zone, "Current number of events maintained for all TCP sessions");
 
 SYSCTL_UMA_MAX(_net_inet_tcp_bb, OID_AUTO, log_id_limit, CTLFLAG_RW,
     &tcp_log_id_bucket_zone, "Maximum number of log IDs");
 
 SYSCTL_UMA_CUR(_net_inet_tcp_bb, OID_AUTO, log_id_entries, CTLFLAG_RD,
     &tcp_log_id_bucket_zone, "Current number of log IDs");
 
 SYSCTL_UMA_MAX(_net_inet_tcp_bb, OID_AUTO, log_id_tcpcb_limit, CTLFLAG_RW,
     &tcp_log_id_node_zone, "Maximum number of tcpcbs with log IDs");
 
 SYSCTL_UMA_CUR(_net_inet_tcp_bb, OID_AUTO, log_id_tcpcb_entries, CTLFLAG_RD,
     &tcp_log_id_node_zone, "Current number of tcpcbs with log IDs");
 
 SYSCTL_U32(_net_inet_tcp_bb, OID_AUTO, log_version, CTLFLAG_RD, &tcp_log_version,
     0, "Version of log formats exported");
 
 SYSCTL_U32(_net_inet_tcp_bb, OID_AUTO, disable_all, CTLFLAG_RW,
     &tcp_disable_all_bb_logs, TCP_LOG_STATE_HEAD_AUTO,
     "Disable all BB logging for all connections");
 
 SYSCTL_ULONG(_net_inet_tcp_bb, OID_AUTO, log_auto_ratio, CTLFLAG_RW,
     &tcp_log_auto_ratio, 0, "Do auto capturing for 1 out of N sessions");
 
 /*
  * NOTE(review): the description below names TCP_LOG_STATE_HEAD_AUTO as the
  * default, but tcp_log_auto_mode is initialized to TCP_LOG_STATE_TAIL in
  * this file -- confirm which of the two is intended.
  */
 SYSCTL_U32(_net_inet_tcp_bb, OID_AUTO, log_auto_mode, CTLFLAG_RW,
     &tcp_log_auto_mode, TCP_LOG_STATE_HEAD_AUTO,
     "Logging mode for auto-selected sessions (default is TCP_LOG_STATE_HEAD_AUTO)");
 
 SYSCTL_BOOL(_net_inet_tcp_bb, OID_AUTO, log_auto_all, CTLFLAG_RW,
     &tcp_log_auto_all, false,
     "Auto-select from all sessions (rather than just those with IDs)");
 
 #ifdef TCPLOG_DEBUG_COUNTERS
 /*
  * Optional debugging counters, compiled in only when TCPLOG_DEBUG_COUNTERS
  * is defined.  Each one is exported read-only under net.inet.tcp.bb.
  */
 counter_u64_t tcp_log_queued;
 counter_u64_t tcp_log_que_fail1;
 counter_u64_t tcp_log_que_fail2;
 counter_u64_t tcp_log_que_fail3;
 counter_u64_t tcp_log_que_fail4;
 counter_u64_t tcp_log_que_fail5;
 counter_u64_t tcp_log_que_copyout;
 counter_u64_t tcp_log_que_read;
 counter_u64_t tcp_log_que_freed;
 
 SYSCTL_COUNTER_U64(_net_inet_tcp_bb, OID_AUTO, queued, CTLFLAG_RD,
     &tcp_log_queued, "Number of entries queued");
 SYSCTL_COUNTER_U64(_net_inet_tcp_bb, OID_AUTO, fail1, CTLFLAG_RD,
     &tcp_log_que_fail1, "Number of entries queued but fail 1");
 SYSCTL_COUNTER_U64(_net_inet_tcp_bb, OID_AUTO, fail2, CTLFLAG_RD,
     &tcp_log_que_fail2, "Number of entries queued but fail 2");
 SYSCTL_COUNTER_U64(_net_inet_tcp_bb, OID_AUTO, fail3, CTLFLAG_RD,
     &tcp_log_que_fail3, "Number of entries queued but fail 3");
 SYSCTL_COUNTER_U64(_net_inet_tcp_bb, OID_AUTO, fail4, CTLFLAG_RD,
     &tcp_log_que_fail4, "Number of entries queued but fail 4");
 /* The description must name failure point 5, matching the counter. */
 SYSCTL_COUNTER_U64(_net_inet_tcp_bb, OID_AUTO, fail5, CTLFLAG_RD,
     &tcp_log_que_fail5, "Number of entries queued but fail 5");
 SYSCTL_COUNTER_U64(_net_inet_tcp_bb, OID_AUTO, copyout, CTLFLAG_RD,
     &tcp_log_que_copyout, "Number of entries copied out");
 SYSCTL_COUNTER_U64(_net_inet_tcp_bb, OID_AUTO, read, CTLFLAG_RD,
     &tcp_log_que_read, "Number of entries read from the queue");
 SYSCTL_COUNTER_U64(_net_inet_tcp_bb, OID_AUTO, freed, CTLFLAG_RD,
     &tcp_log_que_freed, "Number of entries freed after reading");
 #endif
 
 /*
  * INVARIANTS kernels also enable the per-entry reference count checks that
  * are guarded by TCPLOG_DEBUG_RINGBUF.
  */
 #ifdef INVARIANTS
 #define	TCPLOG_DEBUG_RINGBUF
 #endif
 /* Number of requests to consider a PBCID "active". */
 #define	ACTIVE_REQUEST_COUNT	10
 
 /*
  * Statistic tracking for "active" PBCIDs.  Both counters are bumped when a
  * new ID bucket is allocated; only the "cur" counter is decremented when a
  * bucket is removed from the tree.
  */
 static counter_u64_t tcp_log_pcb_ids_cur;
 static counter_u64_t tcp_log_pcb_ids_tot;
 
 SYSCTL_COUNTER_U64(_net_inet_tcp_bb, OID_AUTO, pcb_ids_cur, CTLFLAG_RD,
     &tcp_log_pcb_ids_cur, "Number of pcb IDs allocated in the system");
 SYSCTL_COUNTER_U64(_net_inet_tcp_bb, OID_AUTO, pcb_ids_tot, CTLFLAG_RD,
     &tcp_log_pcb_ids_tot, "Total number of pcb IDs that have been allocated");
 
 /*
  * One log entry as stored on a per-connection log queue.  Entries are
  * returned to tcp_log_zone when they are freed.
  */
 struct tcp_log_mem
 {
 	STAILQ_ENTRY(tcp_log_mem) tlm_queue;	/* Linkage on the log queue */
 	struct tcp_log_buffer	tlm_buf;
 	struct tcp_log_verbose	tlm_v;
 #ifdef TCPLOG_DEBUG_RINGBUF
 	/* Debug-only count; the helpers panic unless it toggles 0 <-> 1. */
 	volatile int		tlm_refcnt;
 #endif
 };
 
 /* 60 bytes for the header, + 16 bytes for padding */
 static uint8_t	zerobuf[76];
 
 /*
  * Lock order:
  * 1. TCPID_TREE
  * 2. TCPID_BUCKET
  * 3. INP
  *
  * Rules:
  * A. You need a lock on the Tree to add/remove buckets.
  * B. You need a lock on the bucket to add/remove nodes from the bucket.
  * C. To change information in a node, you need the INP lock if the tln_closed
  *    field is false. Otherwise, you need the bucket lock. (Note that the
  *    tln_closed field can change at any point, so you need to recheck the
  *    entry after acquiring the INP lock.)
  * D. To remove a node from the bucket, you must have that entry locked,
  *    according to the criteria of Rule C. Also, the node must not be on
  *    the expiry queue.
  * E. The exception to C is the expiry queue fields, which are locked by
  *    the TCPLOG_EXPIREQ lock.
  *
  * Buckets have a reference count. Each node is a reference. Further,
  * other callers may add reference counts to keep a bucket from disappearing.
  * You can add a reference as long as you own a lock sufficient to keep the
  * bucket from disappearing. For example, a common use is:
  *   a. Have a locked INP, but need to lock the TCPID_BUCKET.
  *   b. Add a refcount on the bucket. (Safe because the INP lock prevents
  *      the TCPID_BUCKET from going away.)
  *   c. Drop the INP lock.
  *   d. Acquire a lock on the TCPID_BUCKET.
  *   e. Acquire a lock on the INP.
  *   f. Drop the refcount on the bucket.
  *      (At this point, the bucket may disappear.)
  *
  * Expire queue lock:
  * You can acquire this with either the bucket or INP lock. Don't reverse it.
  * When the expire code has committed to freeing a node, it resets the expiry
  * time to SBT_MAX. That is the signal to everyone else that they should
  * leave that node alone.
  */
 /* Read/write lock protecting the ID bucket tree (TCPID_TREE). */
 static struct rwlock tcp_id_tree_lock;
 #define	TCPID_TREE_WLOCK()		rw_wlock(&tcp_id_tree_lock)
 #define	TCPID_TREE_RLOCK()		rw_rlock(&tcp_id_tree_lock)
 /* Try to turn a held read lock into a write lock; non-zero on success. */
 #define	TCPID_TREE_UPGRADE()		rw_try_upgrade(&tcp_id_tree_lock)
 #define	TCPID_TREE_WUNLOCK()		rw_wunlock(&tcp_id_tree_lock)
 #define	TCPID_TREE_RUNLOCK()		rw_runlock(&tcp_id_tree_lock)
 #define	TCPID_TREE_WLOCK_ASSERT()	rw_assert(&tcp_id_tree_lock, RA_WLOCKED)
 #define	TCPID_TREE_RLOCK_ASSERT()	rw_assert(&tcp_id_tree_lock, RA_RLOCKED)
 #define	TCPID_TREE_UNLOCK_ASSERT()	rw_assert(&tcp_id_tree_lock, RA_UNLOCKED)
 
 /* Per-bucket mutex (TCPID_BUCKET), stored in the bucket's tlb_mtx. */
 #define	TCPID_BUCKET_LOCK_INIT(tlb)	mtx_init(&((tlb)->tlb_mtx), "tcp log id bucket", NULL, MTX_DEF)
 #define	TCPID_BUCKET_LOCK_DESTROY(tlb)	mtx_destroy(&((tlb)->tlb_mtx))
 #define	TCPID_BUCKET_LOCK(tlb)		mtx_lock(&((tlb)->tlb_mtx))
 #define	TCPID_BUCKET_UNLOCK(tlb)	mtx_unlock(&((tlb)->tlb_mtx))
 #define	TCPID_BUCKET_LOCK_ASSERT(tlb)	mtx_assert(&((tlb)->tlb_mtx), MA_OWNED)
 #define	TCPID_BUCKET_UNLOCK_ASSERT(tlb) mtx_assert(&((tlb)->tlb_mtx), MA_NOTOWNED)
 
 /*
  * Bucket reference counting.  TCPID_BUCKET_UNREF() evaluates to the result
  * of refcount_release(), which callers treat as "last reference dropped".
  */
 #define	TCPID_BUCKET_REF(tlb)		refcount_acquire(&((tlb)->tlb_refcnt))
 #define	TCPID_BUCKET_UNREF(tlb)		refcount_release(&((tlb)->tlb_refcnt))
 
 /* Mutex protecting the expiry queue and its per-node fields. */
 #define	TCPLOG_EXPIREQ_LOCK()		mtx_lock(&tcp_log_expireq_mtx)
 #define	TCPLOG_EXPIREQ_UNLOCK()		mtx_unlock(&tcp_log_expireq_mtx)
 
 /* List head type for the nodes that belong to one bucket. */
 SLIST_HEAD(tcp_log_id_head, tcp_log_id_node);
 
 /*
  * One bucket per TCP log ID.  Buckets live in tcp_log_id_tree, keyed by
  * tlb_id, and hold the list of nodes (connections) sharing that ID.
  */
 struct tcp_log_id_bucket
 {
 	/*
 	 * tlb_id must be first. This lets us use strcmp on
 	 * (struct tcp_log_id_bucket *) and (char *) interchangeably.
 	 */
 	char				tlb_id[TCP_LOG_ID_LEN];
 	char				tlb_tag[TCP_LOG_TAG_LEN]; /* Set by tcp_log_set_tag() */
 	RB_ENTRY(tcp_log_id_bucket)	tlb_rb;		/* Tree linkage */
 	struct tcp_log_id_head		tlb_head;	/* Nodes in this bucket */
 	struct mtx			tlb_mtx;	/* TCPID_BUCKET lock */
 	volatile u_int			tlb_refcnt;	/* See TCPID_BUCKET_REF() */
 	volatile u_int			tlb_reqcnt;	/* Count of ID requests */
 	/* When non-zero, these are cloned into the tcpcb on assignment. */
 	uint32_t			tlb_loglimit;
 	uint8_t				tlb_logstate;
 };
 
 /*
  * One node per connection that has a log ID.  Nodes are linked on their
  * bucket's tlb_head list and, separately, on the expiry queue.
  */
 struct tcp_log_id_node
 {
 	SLIST_ENTRY(tcp_log_id_node) tln_list;	/* Linkage on the bucket list */
 	STAILQ_ENTRY(tcp_log_id_node) tln_expireq; /* Locked by the expireq lock */
 	sbintime_t		tln_expiretime;	/* Locked by the expireq lock */
 
 	/*
 	 * If INP is NULL, that means the connection has closed. We've
 	 * saved the connection endpoint information and the log entries
 	 * in the tln_ie and tln_entries members. We've also saved a pointer
 	 * to the enclosing bucket here. If INP is not NULL, the information is
 	 * in the PCB and not here.
 	 */
 	struct inpcb		*tln_inp;
 	struct tcpcb		*tln_tp;
 	struct tcp_log_id_bucket *tln_bucket;
 	struct in_endpoints	tln_ie;
 	struct tcp_log_stailq	tln_entries;
 	int			tln_count;	/* presumably the entry count; confirm */
 	volatile int		tln_closed;	/* See rule C of the lock order */
 	uint8_t			tln_af;
 };
 
 /*
  * How the caller currently holds the ID tree lock.  Helpers take a pointer
  * to a value of this kind and update it when they change the lock state.
  */
 enum tree_lock_state {
 	TREE_UNLOCKED = 0,	/* Tree lock not held */
 	TREE_RLOCKED,		/* Tree lock held for reading */
 	TREE_WLOCKED,		/* Tree lock held for writing */
 };
 
 /* Do we want to select this session for auto-logging? */
 static __inline bool
 tcp_log_selectauto(void)
 {
 
 	/*
 	 * If we are doing auto-capturing, figure out whether we will capture
 	 * this session.
 	 */
 	if (tcp_log_auto_ratio &&
 	    (tcp_disable_all_bb_logs == 0) &&
 	    (atomic_fetchadd_long(&tcp_log_auto_ratio_cur, 1) %
 	    tcp_log_auto_ratio) == 0)
 		return (true);
 	return (false);
 }
 
 /*
  * Ordering function for the ID bucket tree: buckets compare by their
  * tlb_id strings, looking at no more than TCP_LOG_ID_LEN bytes.
  */
 static __inline int
 tcp_log_id_cmp(struct tcp_log_id_bucket *a, struct tcp_log_id_bucket *b)
 {
 	int order;
 
 	KASSERT(a != NULL, ("tcp_log_id_cmp: argument a is unexpectedly NULL"));
 	KASSERT(b != NULL, ("tcp_log_id_cmp: argument b is unexpectedly NULL"));
 	order = strncmp(a->tlb_id, b->tlb_id, TCP_LOG_ID_LEN);
 	return (order);
 }
 
 RB_GENERATE_STATIC(tcp_log_id_tree, tcp_log_id_bucket, tlb_rb, tcp_log_id_cmp)
 
 /*
  * Under INVARIANTS, assert that the tree lock really is in the state the
  * caller believes it to be (one of enum tree_lock_state); any other value
  * is reported through kassert_panic().  Does nothing otherwise.
  */
 static __inline void
 tcp_log_id_validate_tree_lock(int tree_locked)
 {
 
 #ifdef INVARIANTS
 	if (tree_locked == TREE_WLOCKED) {
 		TCPID_TREE_WLOCK_ASSERT();
 	} else if (tree_locked == TREE_RLOCKED) {
 		TCPID_TREE_RLOCK_ASSERT();
 	} else if (tree_locked == TREE_UNLOCKED) {
 		TCPID_TREE_UNLOCK_ASSERT();
 	} else {
 		kassert_panic("%s:%d: unknown tree lock state", __func__,
 		    __LINE__);
 	}
 #endif
 }
 
 static __inline void
 tcp_log_remove_bucket(struct tcp_log_id_bucket *tlb)
 {
 
 	TCPID_TREE_WLOCK_ASSERT();
 	KASSERT(SLIST_EMPTY(&tlb->tlb_head),
 	    ("%s: Attempt to remove non-empty bucket", __func__));
 	if (RB_REMOVE(tcp_log_id_tree, &tcp_log_id_head, tlb) == NULL) {
 #ifdef INVARIANTS
 		kassert_panic("%s:%d: error removing element from tree",
 			    __func__, __LINE__);
 #endif
 	}
 	TCPID_BUCKET_LOCK_DESTROY(tlb);
 	counter_u64_add(tcp_log_pcb_ids_cur, (int64_t)-1);
 	uma_zfree(tcp_log_id_bucket_zone, tlb);
 }
 
 /*
  * Drop one reference on a bucket, freeing the bucket if it was the last.
  *
  * Call with a referenced and locked bucket.
  * Will return true if the bucket was freed; otherwise, false. When true is
  * returned the bucket (and its lock) no longer exist; when false is
  * returned the bucket is still locked.
  * tlb: The bucket to unreference.
  * tree_locked: A pointer to the state of the tree lock. If the tree lock
  *    state changes, the function will update it.
  * inp: If not NULL and the function needs to drop the inp lock to relock the
  *    tree, it will do so. (The caller must ensure inp will not become invalid,
  *    probably by holding a reference to it.)
  */
 static bool
 tcp_log_unref_bucket(struct tcp_log_id_bucket *tlb, int *tree_locked,
     struct inpcb *inp)
 {
 
 	KASSERT(tlb != NULL, ("%s: called with NULL tlb", __func__));
 	KASSERT(tree_locked != NULL, ("%s: called with NULL tree_locked",
 	    __func__));
 
 	tcp_log_id_validate_tree_lock(*tree_locked);
 
 	/*
 	 * Did we hold the last reference on the tlb? If so, we may need
 	 * to free it. (Note that we can realistically only execute the
 	 * loop twice: once without a write lock and once with a write
 	 * lock.)
 	 */
 	while (TCPID_BUCKET_UNREF(tlb)) {
 		/*
 		 * We need a write lock on the tree to free this.
 		 * If we can upgrade the tree lock, this is "easy". If we
 		 * can't upgrade the tree lock, we need to do this the
 		 * "hard" way: unwind all our locks and relock everything.
 		 * In the meantime, anything could have changed. We even
 		 * need to validate that we still need to free the bucket.
 		 */
 		if (*tree_locked == TREE_RLOCKED && TCPID_TREE_UPGRADE())
 			*tree_locked = TREE_WLOCKED;
 		else if (*tree_locked != TREE_WLOCKED) {
 			/*
 			 * Re-take the reference we just dropped so the
 			 * bucket survives while every lock is released,
 			 * then reacquire in lock order: tree, bucket, INP.
 			 * The loop condition drops the reference again.
 			 */
 			TCPID_BUCKET_REF(tlb);
 			if (inp != NULL)
 				INP_WUNLOCK(inp);
 			TCPID_BUCKET_UNLOCK(tlb);
 			if (*tree_locked == TREE_RLOCKED)
 				TCPID_TREE_RUNLOCK();
 			TCPID_TREE_WLOCK();
 			*tree_locked = TREE_WLOCKED;
 			TCPID_BUCKET_LOCK(tlb);
 			if (inp != NULL)
 				INP_WLOCK(inp);
 			continue;
 		}
 
 		/*
 		 * We have an empty bucket and a write lock on the tree.
 		 * Remove the empty bucket.
 		 */
 		tcp_log_remove_bucket(tlb);
 		return (true);
 	}
 	return (false);
 }
 
 /*
  * Remove a node from its bucket and drop the node's bucket reference.
  *
  * Call with a locked bucket. This function will release the lock on the
  * bucket before returning (or the bucket will have been freed).
  *
  * The caller is responsible for freeing the tp->t_lin/tln node!
  *
  * Note: one of tp or both tlb and tln must be supplied.
  *
  * inp: A pointer to the inp. If the function needs to drop the inp lock to
  *    acquire the tree write lock, it will do so. (The caller must ensure inp
  *    will not become invalid, probably by holding a reference to it.)
  * tp: A pointer to the tcpcb. (optional; if specified, tlb and tln are ignored)
  * tlb: A pointer to the bucket. (optional; ignored if tp is specified)
  * tln: A pointer to the node. (optional; ignored if tp is specified)
  * tree_locked: A pointer to the state of the tree lock. If the tree lock
  *    state changes, the function will update it.
  *
  * Will return true if the tree lock state changed during the call, which
  * covers every case in which the INP lock was dropped and reacquired;
  * otherwise, false.
  */
 static bool
 tcp_log_remove_id_node(struct inpcb *inp, struct tcpcb *tp,
     struct tcp_log_id_bucket *tlb, struct tcp_log_id_node *tln,
     int *tree_locked)
 {
 	int orig_tree_locked;
 
 	KASSERT(tp != NULL || (tlb != NULL && tln != NULL),
 	    ("%s: called with tp=%p, tlb=%p, tln=%p", __func__,
 	    tp, tlb, tln));
 	KASSERT(tree_locked != NULL, ("%s: called with NULL tree_locked",
 	    __func__));
 
 	/* When a tcpcb is supplied, it is the authority for tlb and tln. */
 	if (tp != NULL) {
 		tlb = tp->t_lib;
 		tln = tp->t_lin;
 		KASSERT(tlb != NULL, ("%s: unexpectedly NULL tlb", __func__));
 		KASSERT(tln != NULL, ("%s: unexpectedly NULL tln", __func__));
 	}
 
 	tcp_log_id_validate_tree_lock(*tree_locked);
 	TCPID_BUCKET_LOCK_ASSERT(tlb);
 
 	/*
 	 * Remove the node, clear the log bucket and node from the TCPCB, and
 	 * decrement the bucket refcount. In the process, if this is the
 	 * last reference, the bucket will be freed.
 	 */
 	SLIST_REMOVE(&tlb->tlb_head, tln, tcp_log_id_node, tln_list);
 	if (tp != NULL) {
 		tp->t_lib = NULL;
 		tp->t_lin = NULL;
 	}
 	orig_tree_locked = *tree_locked;
 	/* Only unlock the bucket if it still exists. */
 	if (!tcp_log_unref_bucket(tlb, tree_locked, inp))
 		TCPID_BUCKET_UNLOCK(tlb);
 	return (*tree_locked != orig_tree_locked);
 }
 
 /*
  * Re-validate the inp after its lock has been dropped and reacquired.
  * Expects `inp', `tp' and `rv' variables and a `done' label in the
  * enclosing function. If the inp is flagged as dropped, set rv to
  * ECONNRESET, run `cleanup', and jump to done; otherwise reload tp from
  * the inp.
  */
 #define	RECHECK_INP_CLEAN(cleanup)	do {			\
-	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {	\
+	if (inp->inp_flags & INP_DROPPED) {			\
 		rv = ECONNRESET;				\
 		cleanup;					\
 		goto done;					\
 	}							\
 	tp = intotcpcb(inp);					\
 } while (0)
 
 /* Same check with no extra cleanup step. */
 #define	RECHECK_INP()	RECHECK_INP_CLEAN(/* noop */)
 
 /*
  * Hook run when a tcpcb is about to be attached to the bucket named tlb_id.
  * The INP must be write-locked. With STATS compiled in, and only when
  * per-connection stats are in mode 2 and the tcpcb has no stats blob yet,
  * hand the ID to tcp_stats_sample_rollthedice(); its result is ignored.
  */
 static void
 tcp_log_grow_tlb(char *tlb_id, struct tcpcb *tp)
 {
 
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 
 #ifdef STATS
 	if (V_tcp_perconn_stats_enable != 2)
 		return;
 	if (tp->t_stats != NULL)
 		return;
 	(void)tcp_stats_sample_rollthedice(tp, tlb_id, strlen(tlb_id));
 #endif
 }
 
 /* Atomically count one more request against the bucket's tlb_reqcnt. */
 static void
 tcp_log_increment_reqcnt(struct tcp_log_id_bucket *tlb)
 {
 
 	(void)atomic_fetchadd_int(&tlb->tlb_reqcnt, 1);
 }
 
 /*
  * Store `tag' in the bucket that holds this connection's TCP log ID.
  *
  * The INPCB must be write-locked on entry; it is always unlocked on return.
  * Returns EOPNOTSUPP when the connection has no TCP log ID, 0 otherwise.
  */
 int
 tcp_log_set_tag(struct tcpcb *tp, char *tag)
 {
 	struct tcp_log_id_bucket *bucket;
 	struct inpcb *inp;
 	int tree_state;
 
 	inp = tp->t_inpcb;
 	INP_WLOCK_ASSERT(inp);
 
 	bucket = tp->t_lib;
 	if (bucket == NULL) {
 		INP_WUNLOCK(inp);
 		return (EOPNOTSUPP);
 	}
 
 	/*
 	 * Pin the bucket with a reference, then trade the INP lock for the
 	 * bucket lock before touching the tag.
 	 */
 	TCPID_BUCKET_REF(bucket);
 	INP_WUNLOCK(inp);
 	TCPID_BUCKET_LOCK(bucket);
 	strlcpy(bucket->tlb_tag, tag, TCP_LOG_TAG_LEN);
 
 	/* Dropping our reference may free the bucket and lock the tree. */
 	tree_state = TREE_UNLOCKED;
 	if (!tcp_log_unref_bucket(bucket, &tree_state, NULL))
 		TCPID_BUCKET_UNLOCK(bucket);
 
 	switch (tree_state) {
 	case TREE_WLOCKED:
 		TCPID_TREE_WLOCK_ASSERT();
 		TCPID_TREE_WUNLOCK();
 		break;
 	case TREE_RLOCKED:
 		TCPID_TREE_RLOCK_ASSERT();
 		TCPID_TREE_RUNLOCK();
 		break;
 	default:
 		TCPID_TREE_UNLOCK_ASSERT();
 		break;
 	}
 
 	return (0);
 }
 
 /*
  * Set the TCP log ID for a TCPCB.
  * Called with INPCB locked. Returns with it unlocked.
  *
  * An empty id detaches the TCPCB from its current bucket (if any). A
  * non-empty id moves the TCPCB into the bucket with that ID, creating the
  * bucket if it does not exist yet.
  *
  * Returns 0 on success, ECONNRESET if the INP is found to be dropped after
  * relocking, or ENOBUFS if a node or bucket could not be allocated.
  */
 int
 tcp_log_set_id(struct tcpcb *tp, char *id)
 {
 	struct tcp_log_id_bucket *tlb, *tmp_tlb;
 	struct tcp_log_id_node *tln;
 	struct inpcb *inp;
 	int tree_locked, rv;
 	bool bucket_locked;
 
 	tlb = NULL;
 	tln = NULL;
 	inp = tp->t_inpcb;
 	tree_locked = TREE_UNLOCKED;
 	bucket_locked = false;
 
 restart:
 	INP_WLOCK_ASSERT(inp);
 
 	/* See if the ID is unchanged. */
 	if ((tp->t_lib != NULL && !strcmp(tp->t_lib->tlb_id, id)) ||
 	    (tp->t_lib == NULL && *id == 0)) {
 		if (tp->t_lib != NULL) {
 			tcp_log_increment_reqcnt(tp->t_lib);
 			if ((tp->t_lib->tlb_logstate) &&
 			    (tp->t_log_state_set == 0)) {
 				/* Clone in any logging */
 
 				tp->t_logstate = tp->t_lib->tlb_logstate;
 			}
 			if ((tp->t_lib->tlb_loglimit) &&
 			    (tp->t_log_state_set == 0)) {
 				/* We also have a limit set */
 
 				tp->t_loglimit = tp->t_lib->tlb_loglimit;
 			}
 		}
 		rv = 0;
 		goto done;
 	}
 
 	/*
 	 * If the TCPCB had a previous ID, we need to extricate it from
 	 * the previous list.
 	 *
 	 * Drop the TCPCB lock and lock the tree and the bucket.
 	 * Because this is called in the socket context, we (theoretically)
 	 * don't need to worry about the INPCB completely going away
 	 * while we are gone.
 	 */
 	if (tp->t_lib != NULL) {
 		tlb = tp->t_lib;
 		TCPID_BUCKET_REF(tlb);
 		INP_WUNLOCK(inp);
 
 		if (tree_locked == TREE_UNLOCKED) {
 			TCPID_TREE_RLOCK();
 			tree_locked = TREE_RLOCKED;
 		}
 		TCPID_BUCKET_LOCK(tlb);
 		bucket_locked = true;
 		INP_WLOCK(inp);
 
 		/*
 		 * Unreference the bucket. If our bucket went away, it is no
 		 * longer locked or valid.
 		 */
 		if (tcp_log_unref_bucket(tlb, &tree_locked, inp)) {
 			bucket_locked = false;
 			tlb = NULL;
 		}
 
 		/* Validate the INP. */
 		RECHECK_INP();
 
 		/*
 		 * Evaluate whether the bucket changed while we were unlocked.
 		 *
 		 * Possible scenarios here:
 		 * 1. Bucket is unchanged and the same one we started with.
 		 * 2. The TCPCB no longer has a bucket and our bucket was
 		 *    freed.
 		 * 3. The TCPCB has a new bucket, whether or not ours was
 		 *    freed.
 		 * 4. The TCPCB no longer has a bucket and our bucket was
 		 *    not freed.
 		 *
 		 * In cases 2-4, we will start over. In case 1, we will
 		 * proceed here to remove the bucket.
 		 */
 		if (tlb == NULL || tp->t_lib != tlb) {
 			KASSERT(bucket_locked || tlb == NULL,
 			    ("%s: bucket_locked (%d) and tlb (%p) are "
 			    "inconsistent", __func__, bucket_locked, tlb));
 
 			if (bucket_locked) {
 				TCPID_BUCKET_UNLOCK(tlb);
 				bucket_locked = false;
 				tlb = NULL;
 			}
 			goto restart;
 		}
 
 		/*
 		 * Store the (struct tcp_log_id_node) for reuse. Then, remove
 		 * it from the bucket. In the process, we may end up relocking.
 		 * If so, we need to validate that the INP is still valid, and
 		 * the TCPCB entries match what we expect.
 		 *
 		 * We will clear tlb and change the bucket_locked state just
 		 * before calling tcp_log_remove_id_node(), since that function
 		 * will unlock the bucket.
 		 */
 		if (tln != NULL)
 			uma_zfree(tcp_log_id_node_zone, tln);
 		tln = tp->t_lin;
 		tlb = NULL;
 		bucket_locked = false;
 		if (tcp_log_remove_id_node(inp, tp, NULL, NULL, &tree_locked)) {
 			RECHECK_INP();
 
 			/*
 			 * If the TCPCB moved to a new bucket while we had
 			 * dropped the lock, restart.
 			 */
 			if (tp->t_lib != NULL || tp->t_lin != NULL)
 				goto restart;
 		}
 
 		/*
 		 * Yay! We successfully removed the TCPCB from its old
 		 * bucket. Phew!
 		 *
 		 * On to bigger and better things...
 		 */
 	}
 
 	/* At this point, the TCPCB should not be in any bucket. */
 	KASSERT(tp->t_lib == NULL, ("%s: tp->t_lib is not NULL", __func__));
 
 	/*
 	 * If the new ID is not empty, we need to now assign this TCPCB to a
 	 * new bucket.
 	 */
 	if (*id) {
 		/* Get a new tln, if we don't already have one to reuse. */
 		if (tln == NULL) {
 			tln = uma_zalloc(tcp_log_id_node_zone,
 				M_NOWAIT | M_ZERO);
 			if (tln == NULL) {
 				rv = ENOBUFS;
 				goto done;
 			}
 			tln->tln_inp = inp;
 			tln->tln_tp = tp;
 		}
 
 		/*
 		 * Drop the INP lock for a bit. We don't need it, and dropping
 		 * it prevents lock order reversals.
 		 */
 		INP_WUNLOCK(inp);
 
 		/* Make sure we have at least a read lock on the tree. */
 		tcp_log_id_validate_tree_lock(tree_locked);
 		if (tree_locked == TREE_UNLOCKED) {
 			TCPID_TREE_RLOCK();
 			tree_locked = TREE_RLOCKED;
 		}
 
 refind:
 		/*
 		 * Remember that we constructed (struct tcp_log_id_bucket) with
 		 * tlb_id as its first member, so we can safely cast the id to
 		 * it for the purposes of finding.
 		 */
 		KASSERT(tlb == NULL, ("%s:%d tlb unexpectedly non-NULL",
 		    __func__, __LINE__));
 		tmp_tlb = RB_FIND(tcp_log_id_tree, &tcp_log_id_head,
 		    (struct tcp_log_id_bucket *) id);
 
 		/*
 		 * If we didn't find a matching bucket, we need to add a new
 		 * one. This requires a write lock. But, of course, we will
 		 * need to recheck some things when we re-acquire the lock.
 		 */
 		if (tmp_tlb == NULL && tree_locked != TREE_WLOCKED) {
 			tree_locked = TREE_WLOCKED;
 			if (!TCPID_TREE_UPGRADE()) {
 				TCPID_TREE_RUNLOCK();
 				TCPID_TREE_WLOCK();
 
 				/*
 				 * The tree may have changed while we were
 				 * unlocked.
 				 */
 				goto refind;
 			}
 		}
 
 		/* If we need to add a new bucket, do it now. */
 		if (tmp_tlb == NULL) {
 			/* Allocate new bucket. */
 			tlb = uma_zalloc(tcp_log_id_bucket_zone, M_NOWAIT);
 			if (tlb == NULL) {
 				rv = ENOBUFS;
 				/* The INP lock is not held on this path. */
 				goto done_noinp;
 			}
 			counter_u64_add(tcp_log_pcb_ids_cur, 1);
 			counter_u64_add(tcp_log_pcb_ids_tot, 1);
 
 			if ((tcp_log_auto_all == false) &&
 			    tcp_log_auto_mode &&
 			    tcp_log_selectauto()) {
 				/* Save off the log state */
 				tlb->tlb_logstate = tcp_log_auto_mode;
 			} else
 				tlb->tlb_logstate = TCP_LOG_STATE_OFF;
 			tlb->tlb_loglimit = 0;
 			tlb->tlb_tag[0] = '\0'; /* Default to an empty tag. */
 
 			/*
 			 * Copy the ID to the bucket.
 			 * NB: Don't use strlcpy() unless you are sure
 			 * we've always validated NULL termination.
 			 *
 			 * TODO: When I'm done writing this, see if we
 			 * have correctly validated NULL termination and
 			 * can use strlcpy(). :-)
 			 */
 			strncpy(tlb->tlb_id, id, TCP_LOG_ID_LEN - 1);
 			tlb->tlb_id[TCP_LOG_ID_LEN - 1] = '\0';
 
 			/*
 			 * Take the refcount for the first node and go ahead
 			 * and lock this. Note that we zero the tlb_mtx
 			 * structure, since 0xdeadc0de flips the right bits
 			 * for the code to think that this mutex has already
 			 * been initialized. :-(
 			 */
 			SLIST_INIT(&tlb->tlb_head);
 			refcount_init(&tlb->tlb_refcnt, 1);
 			tlb->tlb_reqcnt = 1;
 			memset(&tlb->tlb_mtx, 0, sizeof(struct mtx));
 			TCPID_BUCKET_LOCK_INIT(tlb);
 			TCPID_BUCKET_LOCK(tlb);
 			bucket_locked = true;
 
 /*
  * Throw away a bucket that was allocated above but never inserted into the
  * tree: destroy its lock, free it, and undo both counter increments.
  */
 #define	FREE_NEW_TLB()	do {				\
 	TCPID_BUCKET_LOCK_DESTROY(tlb);			\
 	uma_zfree(tcp_log_id_bucket_zone, tlb);		\
 	counter_u64_add(tcp_log_pcb_ids_cur, (int64_t)-1);	\
 	counter_u64_add(tcp_log_pcb_ids_tot, (int64_t)-1);	\
 	bucket_locked = false;				\
 	tlb = NULL;					\
 } while (0)
 			/*
 			 * Relock the INP and make sure we are still
 			 * unassigned.
 			 */
 			INP_WLOCK(inp);
 			RECHECK_INP_CLEAN(FREE_NEW_TLB());
 			if (tp->t_lib != NULL) {
 				FREE_NEW_TLB();
 				goto restart;
 			}
 
 			/* Add the new bucket to the tree. */
 			tmp_tlb = RB_INSERT(tcp_log_id_tree, &tcp_log_id_head,
 			    tlb);
 			KASSERT(tmp_tlb == NULL,
 			    ("%s: Unexpected conflicting bucket (%p) while "
 			    "adding new bucket (%p)", __func__, tmp_tlb, tlb));
 
 			/*
 			 * If we found a conflicting bucket, free the new
 			 * one we made and fall through to use the existing
 			 * bucket.
 			 */
 			if (tmp_tlb != NULL) {
 				FREE_NEW_TLB();
 				INP_WUNLOCK(inp);
 			}
 #undef	FREE_NEW_TLB
 		}
 
 		/* If we found an existing bucket, use it. */
 		if (tmp_tlb != NULL) {
 			tlb = tmp_tlb;
 			TCPID_BUCKET_LOCK(tlb);
 			bucket_locked = true;
 
 			/*
 			 * Relock the INP and make sure we are still
 			 * unassigned.
 			 */
 			INP_UNLOCK_ASSERT(inp);
 			INP_WLOCK(inp);
 			RECHECK_INP();
 			if (tp->t_lib != NULL) {
 				TCPID_BUCKET_UNLOCK(tlb);
 				bucket_locked = false;
 				tlb = NULL;
 				goto restart;
 			}
 
 			/* Take a reference on the bucket. */
 			TCPID_BUCKET_REF(tlb);
 
 			/* Record the request. */
 			tcp_log_increment_reqcnt(tlb);
 		}
 
 		tcp_log_grow_tlb(tlb->tlb_id, tp);
 
 		/* Add the new node to the list. */
 		SLIST_INSERT_HEAD(&tlb->tlb_head, tln, tln_list);
 		tp->t_lib = tlb;
 		tp->t_lin = tln;
 		if (tp->t_lib->tlb_logstate) {
 			/* Clone in any logging */
 
 			tp->t_logstate = tp->t_lib->tlb_logstate;
 		}
 		if (tp->t_lib->tlb_loglimit) {
 			/* The loglimit too */
 
 			tp->t_loglimit = tp->t_lib->tlb_loglimit;
 		}
 		/* The node now belongs to the bucket; don't free it below. */
 		tln = NULL;
 	}
 
 	rv = 0;
 
 done:
 	/* Unlock things, as needed, and return. */
 	INP_WUNLOCK(inp);
 done_noinp:
 	INP_UNLOCK_ASSERT(inp);
 	if (bucket_locked) {
 		TCPID_BUCKET_LOCK_ASSERT(tlb);
 		TCPID_BUCKET_UNLOCK(tlb);
 	} else if (tlb != NULL)
 		TCPID_BUCKET_UNLOCK_ASSERT(tlb);
 	if (tree_locked == TREE_WLOCKED) {
 		TCPID_TREE_WLOCK_ASSERT();
 		TCPID_TREE_WUNLOCK();
 	} else if (tree_locked == TREE_RLOCKED) {
 		TCPID_TREE_RLOCK_ASSERT();
 		TCPID_TREE_RUNLOCK();
 	} else
 		TCPID_TREE_UNLOCK_ASSERT();
 	/* Free a node that was allocated or detached but never re-attached. */
 	if (tln != NULL)
 		uma_zfree(tcp_log_id_node_zone, tln);
 	return (rv);
 }
 
 /*
  * Copy the TCPCB's TCP log ID into 'buf'.
  * The INPCB must be locked by the caller and stays locked.
  * 'buf' must have room for at least TCP_LOG_ID_LEN bytes.
  * Returns the length of the ID; a TCPCB without an ID yields an empty
  * string and a length of 0.
  */
 size_t
 tcp_log_get_id(struct tcpcb *tp, char *buf)
 {
 	size_t copied;
 
 	INP_LOCK_ASSERT(tp->t_inpcb);
 
 	/* No bucket means no ID. */
 	if (tp->t_lib == NULL) {
 		buf[0] = '\0';
 		return (0);
 	}
 
 	copied = strlcpy(buf, tp->t_lib->tlb_id, TCP_LOG_ID_LEN);
 	KASSERT(copied < TCP_LOG_ID_LEN,
 	    ("%s:%d: tp->t_lib->tlb_id too long (%zu)",
 	    __func__, __LINE__, copied));
 	return (copied);
 }
 
 /*
  * Copy the tag of the TCPCB's log ID bucket into 'buf'.
  * The INPCB must be write-locked on entry; it is always unlocked on return.
  * 'buf' must have room for at least TCP_LOG_TAG_LEN bytes.
  * Returns the length of the tag; a TCPCB without a log ID yields an empty
  * string and a length of 0.
  */
 size_t
 tcp_log_get_tag(struct tcpcb *tp, char *buf)
 {
 	struct tcp_log_id_bucket *bucket;
 	struct inpcb *inp;
 	size_t copied;
 	int tree_state;
 
 	inp = tp->t_inpcb;
 	INP_WLOCK_ASSERT(inp);
 
 	bucket = tp->t_lib;
 	if (bucket == NULL) {
 		INP_WUNLOCK(inp);
 		buf[0] = '\0';
 		return (0);
 	}
 
 	/*
 	 * Pin the bucket with a reference, then trade the INP lock for the
 	 * bucket lock before reading the tag.
 	 */
 	TCPID_BUCKET_REF(bucket);
 	INP_WUNLOCK(inp);
 	TCPID_BUCKET_LOCK(bucket);
 	copied = strlcpy(buf, bucket->tlb_tag, TCP_LOG_TAG_LEN);
 	KASSERT(copied < TCP_LOG_TAG_LEN,
 	    ("%s:%d: tp->t_lib->tlb_tag too long (%zu)",
 	    __func__, __LINE__, copied));
 
 	/* Dropping our reference may free the bucket and lock the tree. */
 	tree_state = TREE_UNLOCKED;
 	if (!tcp_log_unref_bucket(bucket, &tree_state, NULL))
 		TCPID_BUCKET_UNLOCK(bucket);
 
 	switch (tree_state) {
 	case TREE_WLOCKED:
 		TCPID_TREE_WLOCK_ASSERT();
 		TCPID_TREE_WUNLOCK();
 		break;
 	case TREE_RLOCKED:
 		TCPID_TREE_RLOCK_ASSERT();
 		TCPID_TREE_RUNLOCK();
 		break;
 	default:
 		TCPID_TREE_UNLOCK_ASSERT();
 		break;
 	}
 
 	return (copied);
 }
 
 /*
  * Report how many connections share the given TCPCB's log ID, taken from
  * the reference count of its bucket. A TCPCB without a log ID reports 0.
  * Called with INPCB locked.
  */
 u_int
 tcp_log_get_id_cnt(struct tcpcb *tp)
 {
 
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 	if (tp->t_lib == NULL)
 		return (0);
 	return (tp->t_lib->tlb_refcnt);
 }
 
 #ifdef TCPLOG_DEBUG_RINGBUF
 /*
  * Debug-only reference counting for log entries. An entry's count must go
  * from 0 to 1 when it is added and from 1 to 0 when it is removed; any
  * other transition (double-add, double-free/double-remove) panics, naming
  * the calling function and line.
  */
 static inline void
 _tcp_log_entry_refcnt_add(struct tcp_log_mem *log_entry, const char *func,
     int line)
 {
 	int prev;
 
 	prev = atomic_fetchadd_int(&log_entry->tlm_refcnt, 1);
 	if (prev == 0)
 		return;
 	panic("%s:%d: log_entry(%p)->tlm_refcnt is %d (expected 0)",
 	    func, line, log_entry, prev);
 }
 #define	tcp_log_entry_refcnt_add(l)	\
     _tcp_log_entry_refcnt_add((l), __func__, __LINE__)
 
 static inline void
 _tcp_log_entry_refcnt_rem(struct tcp_log_mem *log_entry, const char *func,
     int line)
 {
 	int prev;
 
 	prev = atomic_fetchadd_int(&log_entry->tlm_refcnt, -1);
 	if (prev == 1)
 		return;
 	panic("%s:%d: log_entry(%p)->tlm_refcnt is %d (expected 1)",
 	    func, line, log_entry, prev);
 }
 #define	tcp_log_entry_refcnt_rem(l)	\
     _tcp_log_entry_refcnt_rem((l), __func__, __LINE__)
 
 #else /* !TCPLOG_DEBUG_RINGBUF */
 
 /* Without the debug option the checks compile away to nothing. */
 #define	tcp_log_entry_refcnt_add(l)
 #define	tcp_log_entry_refcnt_rem(l)
 
 #endif
 
 /*
  * Return a detached log entry to tcp_log_zone. The caller's entry counter
  * is only decremented (and checked for going negative) when the kernel is
  * built with INVARIANTS; otherwise it is left untouched.
  */
 static inline void
 tcp_log_free_log_common(struct tcp_log_mem *log_entry, int *count __unused)
 {
 
 	uma_zfree(tcp_log_zone, log_entry);
 #ifdef INVARIANTS
 	*count -= 1;
 	KASSERT(*count >= 0,
 	    ("%s: count unexpectedly negative", __func__));
 #endif
 }
 
 /*
  * Empty a queue of log entries: repeatedly unlink the first entry, drop
  * its debug reference, and free it via tcp_log_free_log_common().
  */
 static void
 tcp_log_free_entries(struct tcp_log_stailq *head, int *count)
 {
 	struct tcp_log_mem *victim;
 
 	for (;;) {
 		victim = STAILQ_FIRST(head);
 		if (victim == NULL)
 			break;
 		STAILQ_REMOVE_HEAD(head, tlm_queue);
 		tcp_log_entry_refcnt_rem(victim);
 		tcp_log_free_log_common(victim, count);
 	}
 }
 
 /*
  * Second half of removing a log entry from a TCPCB: free the entry and
  * account for it in tp->t_lognum. Unlinking from tp->t_logs is left to
  * the caller.
  */
 static inline void
 tcp_log_remove_log_cleanup(struct tcpcb *tp, struct tcp_log_mem *log_entry)
 {
 
 	uma_zfree(tcp_log_zone, log_entry);
 	tp->t_lognum -= 1;
 	KASSERT(tp->t_lognum >= 0,
 	    ("%s: tp->t_lognum unexpectedly negative", __func__));
 }
 
 /*
  * Unlink and free the first entry of tp->t_logs. The entry passed in must
  * be the current head of that list (checked with KASSERT).
  */
 static inline void
 tcp_log_remove_log_head(struct tcpcb *tp, struct tcp_log_mem *log_entry)
 {
 	struct tcp_log_stailq *logs;
 
 	logs = &tp->t_logs;
 	KASSERT(STAILQ_FIRST(logs) == log_entry,
 	    ("%s: attempt to remove non-HEAD log entry", __func__));
 	STAILQ_REMOVE_HEAD(logs, tlm_queue);
 	tcp_log_entry_refcnt_rem(log_entry);
 	tcp_log_remove_log_cleanup(tp, log_entry);
 }
 
 #ifdef TCPLOG_DEBUG_RINGBUF
 /*
  * UMA init hook: start every log entry with a zero reference count. The
  * count is set here, rather than in the constructor, so that it survives
  * individual allocations.
  */
 static int
 tcp_log_zone_init(void *mem, int size, int flags __unused)
 {
 	struct tcp_log_mem *tlm;
 
 	KASSERT(size >= sizeof(struct tcp_log_mem),
 	    ("%s: unexpectedly short (%d) allocation", __func__, size));
 	tlm = mem;
 	tlm->tlm_refcnt = 0;
 	return (0);
 }
 
 /*
  * UMA constructor hook: an entry being handed out must not still be
  * referenced; panic if its reference count is non-zero.
  */
 static int
 tcp_log_zone_ctor(void *mem, int size, void *args __unused, int flags __unused)
 {
 	struct tcp_log_mem *tlm;
 
 	KASSERT(size >= sizeof(struct tcp_log_mem),
 	    ("%s: unexpectedly short (%d) allocation", __func__, size));
 	tlm = mem;
 	if (tlm->tlm_refcnt != 0) {
 		panic("%s:%d: tlm(%p)->tlm_refcnt is %d (expected 0)",
 		    __func__, __LINE__, tlm, tlm->tlm_refcnt);
 	}
 	return (0);
 }
 
 /*
  * UMA destructor hook: an entry being returned must no longer be
  * referenced; panic if its reference count is non-zero.
  */
 static void
 tcp_log_zone_dtor(void *mem, int size, void *args __unused)
 {
 	struct tcp_log_mem *tlm;
 
 	KASSERT(size >= sizeof(struct tcp_log_mem),
 	    ("%s: unexpectedly short (%d) allocation", __func__, size));
 	tlm = mem;
 	if (tlm->tlm_refcnt != 0) {
 		panic("%s:%d: tlm(%p)->tlm_refcnt is %d (expected 0)",
 		    __func__, __LINE__, tlm, tlm->tlm_refcnt);
 	}
 }
 #endif /* TCPLOG_DEBUG_RINGBUF */
 
 /*
  * One-time global setup for TCP logging: create the UMA zones, allocate
  * the counters, and initialize the ID-tree lock, the expiry-queue mutex
  * and the expiry callout.
  */
 void
 tcp_log_init(void)
 {
 
 	/*
 	 * Zone for the log entries themselves. With TCPLOG_DEBUG_RINGBUF
 	 * the zone gets ctor/dtor/init hooks that check tlm_refcnt.
 	 */
 #ifdef TCPLOG_DEBUG_RINGBUF
 	tcp_log_zone = uma_zcreate("tcp_log", sizeof(struct tcp_log_mem),
 	    tcp_log_zone_ctor, tcp_log_zone_dtor, tcp_log_zone_init,
 	    NULL, UMA_ALIGN_PTR, 0);
 #else
 	tcp_log_zone = uma_zcreate("tcp_log", sizeof(struct tcp_log_mem),
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
 #endif
 	(void)uma_zone_set_max(tcp_log_zone, TCP_LOG_BUF_DEFAULT_GLOBAL_LIMIT);
 
 	/* Zones for the log-ID buckets and nodes. */
 	tcp_log_id_bucket_zone = uma_zcreate("tcp_log_id_bucket",
 	    sizeof(struct tcp_log_id_bucket), NULL, NULL, NULL, NULL,
 	    UMA_ALIGN_PTR, 0);
 	tcp_log_id_node_zone = uma_zcreate("tcp_log_id_node",
 	    sizeof(struct tcp_log_id_node), NULL, NULL, NULL, NULL,
 	    UMA_ALIGN_PTR, 0);
 
 	/* Statistics counters. */
 #ifdef TCPLOG_DEBUG_COUNTERS
 	tcp_log_queued = counter_u64_alloc(M_WAITOK);
 	tcp_log_que_fail1 = counter_u64_alloc(M_WAITOK);
 	tcp_log_que_fail2 = counter_u64_alloc(M_WAITOK);
 	tcp_log_que_fail3 = counter_u64_alloc(M_WAITOK);
 	tcp_log_que_fail4 = counter_u64_alloc(M_WAITOK);
 	tcp_log_que_fail5 = counter_u64_alloc(M_WAITOK);
 	tcp_log_que_copyout = counter_u64_alloc(M_WAITOK);
 	tcp_log_que_read = counter_u64_alloc(M_WAITOK);
 	tcp_log_que_freed = counter_u64_alloc(M_WAITOK);
 #endif
 	tcp_log_pcb_ids_cur = counter_u64_alloc(M_WAITOK);
 	tcp_log_pcb_ids_tot = counter_u64_alloc(M_WAITOK);
 
 	/* Locks and the expiry callout. */
 	rw_init_flags(&tcp_id_tree_lock, "TCP ID tree", RW_NEW);
 	mtx_init(&tcp_log_expireq_mtx, "TCP log expireq", NULL, MTX_DEF);
 	callout_init(&tcp_log_expireq_callout, 1);
 }
 
 /*
  * Per-TCPCB logging setup: initialize the (empty) log queue, apply the
  * session limit, and decide whether this session is auto-captured.
  */
 void
 tcp_log_tcpcbinit(struct tcpcb *tp)
 {
 
 	/* A new TCPCB should start out zero-initialized. */
 	STAILQ_INIT(&tp->t_logs);
 	tp->t_loglimit = tcp_log_session_limit;
 
 	/*
 	 * Auto-capture applies only when it is enabled for all sessions
 	 * and an auto mode is configured; tcp_log_selectauto() is only
 	 * consulted once both of those hold.
 	 */
 	if (tcp_log_auto_all != true || !tcp_log_auto_mode)
 		return;
 	if (!tcp_log_selectauto())
 		return;
 
 	tp->t_logstate = tcp_log_auto_mode;
 	tp->t_flags2 |= TF2_LOG_AUTO;
 }
 
 /*
  * Callout handler for the log-ID node expiry queue.
  *
  * Walks tcp_log_expireq_head from the front and, for every node whose
  * tln_expiretime is no more than one second in the future, removes the
  * node from its bucket, drops the node's INP reference, and frees the node
  * together with its log entries. If nodes remain on the queue afterwards,
  * the callout is re-armed for the later of now + TCP_LOG_EXPIRE_INTVL and
  * the next node's expiry time.
  *
  * The argument is unused.
  */
 static void
 tcp_log_expire(void *unused __unused)
 {
 	struct tcp_log_id_bucket *tlb;
 	struct tcp_log_id_node *tln;
 	sbintime_t expiry_limit;
 	int tree_locked;
 
 	TCPLOG_EXPIREQ_LOCK();
 	if (callout_pending(&tcp_log_expireq_callout)) {
 		/* Callout was reset. */
 		TCPLOG_EXPIREQ_UNLOCK();
 		return;
 	}
 
 	/*
 	 * Process entries until we reach one that expires too far in the
 	 * future. Look one second in the future.
 	 */
 	expiry_limit = getsbinuptime() + SBT_1S;
 	tree_locked = TREE_UNLOCKED;
 
 	/* The expiry queue lock is held at the top of every iteration. */
 	while ((tln = STAILQ_FIRST(&tcp_log_expireq_head)) != NULL &&
 	    tln->tln_expiretime <= expiry_limit) {
 		if (!callout_active(&tcp_log_expireq_callout)) {
 			/*
 			 * Callout was stopped. I guess we should
 			 * just quit at this point.
 			 */
 			TCPLOG_EXPIREQ_UNLOCK();
 			return;
 		}
 
 		/*
 		 * Remove the node from the head of the list and unlock
 		 * the list. Change the expiry time to SBT_MAX as a signal
 		 * to other threads that we now own this.
 		 */
 		STAILQ_REMOVE_HEAD(&tcp_log_expireq_head, tln_expireq);
 		tln->tln_expiretime = SBT_MAX;
 		TCPLOG_EXPIREQ_UNLOCK();
 
 		/*
 		 * Remove the node from the bucket.
 		 *
 		 * NOTE(review): a true return from tcp_log_remove_id_node()
 		 * is handled as "the ID tree lock is held, in the mode
 		 * reported through tree_locked", and that lock is released
 		 * here. What happens to the bucket lock taken just above is
 		 * not visible in this function -- confirm against
 		 * tcp_log_remove_id_node().
 		 */
 		tlb = tln->tln_bucket;
 		TCPID_BUCKET_LOCK(tlb);
 		if (tcp_log_remove_id_node(NULL, NULL, tlb, tln, &tree_locked)) {
 			tcp_log_id_validate_tree_lock(tree_locked);
 			if (tree_locked == TREE_WLOCKED)
 				TCPID_TREE_WUNLOCK();
 			else
 				TCPID_TREE_RUNLOCK();
 			tree_locked = TREE_UNLOCKED;
 		}
 
 		/*
 		 * Drop the INP reference. The INP is only unlocked here when
 		 * in_pcbrele_wlocked() returns false (presumably a true
 		 * return means the INP, and its lock, are already gone).
 		 */
 		INP_WLOCK(tln->tln_inp);
 		if (!in_pcbrele_wlocked(tln->tln_inp))
 			INP_WUNLOCK(tln->tln_inp);
 
 		/* Free the log records. */
 		tcp_log_free_entries(&tln->tln_entries, &tln->tln_count);
 
 		/* Free the node. */
 		uma_zfree(tcp_log_id_node_zone, tln);
 
 		/* Relock the expiry queue. */
 		TCPLOG_EXPIREQ_LOCK();
 	}
 
 	/*
 	 * We've expired all the entries we can. Do we need to reschedule
 	 * ourselves? (tln is non-NULL here only if the loop stopped at a
 	 * node that is not yet due.)
 	 */
 	callout_deactivate(&tcp_log_expireq_callout);
 	if (tln != NULL) {
 		/*
 		 * Get max(now + TCP_LOG_EXPIRE_INTVL, tln->tln_expiretime) and
 		 * set the next callout to that. (This helps ensure we generally
 		 * run the callout no more often than desired.)
 		 */
 		expiry_limit = getsbinuptime() + TCP_LOG_EXPIRE_INTVL;
 		if (expiry_limit < tln->tln_expiretime)
 			expiry_limit = tln->tln_expiretime;
 		callout_reset_sbt(&tcp_log_expireq_callout, expiry_limit,
 		    SBT_1S, tcp_log_expire, NULL, C_ABSOLUTE);
 	}
 
 	/* We're done. */
 	TCPLOG_EXPIREQ_UNLOCK();
 	return;
 }
 
 /*
  * Move log data from the TCPCB to a new node. This will reset the TCPCB log
  * entries and log count; however, it will not touch other things from the
  * TCPCB (e.g. t_lin, t_lib).
  *
  * NOTE: Must hold a lock on the INP.
  */
 static void
 tcp_log_move_tp_to_node(struct tcpcb *tp, struct tcp_log_id_node *tln)
 {
 
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 
 	tln->tln_ie = tp->t_inpcb->inp_inc.inc_ie;
 	if (tp->t_inpcb->inp_inc.inc_flags & INC_ISIPV6)
 		tln->tln_af = AF_INET6;
 	else
 		tln->tln_af = AF_INET;
 	tln->tln_entries = tp->t_logs;
 	tln->tln_count = tp->t_lognum;
 	tln->tln_bucket = tp->t_lib;
 
 	/* Clear information from the PCB. */
 	STAILQ_INIT(&tp->t_logs);
 	tp->t_lognum = 0;
 }
 
 /*
  * Per-TCPCB teardown of logging state.
  *
  * Logs a final TCP_LOG_CONNEND event, attempts an automatic dump of the
  * buffer for the HEAD_AUTO, TAIL_AUTO and CONTINUAL states, and then either
  *  - hands the remaining entries to the connection's log-ID node
  *    (tp->t_lin) and places that node on the expiry queue, or
  *  - frees the entries when the TCPCB has no ID node.
  * Finally the log state is forced to TCP_LOG_STATE_OFF.
  *
  * The caller must hold the INP write lock.
  */
 void
 tcp_log_tcpcbfini(struct tcpcb *tp)
 {
 	struct tcp_log_id_node *tln, *tln_first;
 	struct tcp_log_mem *log_entry;
 	sbintime_t callouttime;
 
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 
 	TCP_LOG_EVENT(tp, NULL, NULL, NULL, TCP_LOG_CONNEND, 0, 0, NULL, false);
 
 	/*
 	 * If we were gathering packets to be automatically dumped, try to do
 	 * it now. If this succeeds, the log information in the TCPCB will be
 	 * cleared. Otherwise, we'll handle the log information as we do
 	 * for other states.
 	 *
 	 * The dump results are deliberately ignored, and states not listed
 	 * in the switch are not dumped at all.
 	 */
 	switch(tp->t_logstate) {
 	case TCP_LOG_STATE_HEAD_AUTO:
 		(void)tcp_log_dump_tp_logbuf(tp, "auto-dumped from head",
 		    M_NOWAIT, false);
 		break;
 	case TCP_LOG_STATE_TAIL_AUTO:
 		(void)tcp_log_dump_tp_logbuf(tp, "auto-dumped from tail",
 		    M_NOWAIT, false);
 		break;
 	case TCP_LOG_STATE_CONTINUAL:
 		(void)tcp_log_dump_tp_logbuf(tp, "auto-dumped from continual",
 		    M_NOWAIT, false);
 		break;
 	}
 
 	/*
 	 * There are two ways we could keep logs: per-socket or per-ID. If
 	 * we are tracking logs with an ID, then the logs survive the
 	 * destruction of the TCPCB.
 	 *
 	 * If the TCPCB is associated with an ID node, move the logs from the
 	 * TCPCB to the ID node. In theory, this is safe, for reasons which I
 	 * will now explain for my own benefit when I next need to figure out
 	 * this code. :-)
 	 *
 	 * We own the INP lock. Therefore, no one else can change the contents
 	 * of this node (Rule C). Further, no one can remove this node from
 	 * the bucket while we hold the lock (Rule D). Basically, no one can
 	 * mess with this node. That leaves two states in which we could be:
 	 *
 	 * 1. Another thread is currently waiting to acquire the INP lock, with
 	 *    plans to do something with this node. When we drop the INP lock,
 	 *    they will have a chance to do that. They will recheck the
 	 *    tln_closed field (see note to Rule C) and then acquire the
 	 *    bucket lock before proceeding further.
 	 *
 	 * 2. Another thread will try to acquire a lock at some point in the
 	 *    future. If they try to acquire a lock before we set the
 	 *    tln_closed field, they will follow state #1. If they try to
 	 *    acquire a lock after we set the tln_closed field, they will be
 	 *    able to make changes to the node, at will, following Rule C.
 	 *
 	 * Therefore, we currently own this node and can make any changes
 	 * we want. But, as soon as we set the tln_closed field to true, we
 	 * have effectively dropped our lock on the node. (For this reason, we
 	 * also need to make sure our writes are ordered correctly. An atomic
 	 * operation with "release" semantics should be sufficient.)
 	 */
 
 	if (tp->t_lin != NULL) {
 		/* Copy the relevant information to the log entry. */
 		tln = tp->t_lin;
 		KASSERT(tln->tln_inp == tp->t_inpcb,
 		    ("%s: Mismatched inp (tln->tln_inp=%p, tp->t_inpcb=%p)",
 		    __func__, tln->tln_inp, tp->t_inpcb));
 		tcp_log_move_tp_to_node(tp, tln);
 
 		/* Clear information from the PCB. */
 		tp->t_lin = NULL;
 		tp->t_lib = NULL;
 
 		/*
 		 * Take a reference on the INP. This ensures that the INP
 		 * remains valid while the node is on the expiry queue. This
 		 * ensures the INP is valid for other threads that may be
 		 * racing to lock this node when we move it to the expire
 		 * queue.
 		 */
 		in_pcbref(tp->t_inpcb);
 
 		/*
 		 * Store the entry on the expiry list. The exact behavior
 		 * depends on whether we have entries to keep. If so, we
 		 * put the entry at the tail of the list and expire in
 		 * TCP_LOG_EXPIRE_TIME. Otherwise, we expire "now" and put
 		 * the entry at the head of the list. (Handling the cleanup
 		 * via the expiry timer lets us avoid locking messy-ness here.)
 		 */
 		tln->tln_expiretime = getsbinuptime();
 		TCPLOG_EXPIREQ_LOCK();
 		if (tln->tln_count) {
 			/* Entries to keep: expire TCP_LOG_EXPIRE_TIME from now. */
 			tln->tln_expiretime += TCP_LOG_EXPIRE_TIME;
 			if (STAILQ_EMPTY(&tcp_log_expireq_head) &&
 			    !callout_active(&tcp_log_expireq_callout)) {
 				/*
 				 * We are adding the first entry and a callout
 				 * is not currently scheduled; therefore, we
 				 * need to schedule one.
 				 */
 				callout_reset_sbt(&tcp_log_expireq_callout,
 				    tln->tln_expiretime, SBT_1S, tcp_log_expire,
 				    NULL, C_ABSOLUTE);
 			}
 			STAILQ_INSERT_TAIL(&tcp_log_expireq_head, tln,
 			    tln_expireq);
 		} else {
 			/*
 			 * Nothing to keep: the node expires "now", and the
 			 * callout should run TCP_LOG_EXPIRE_INTVL from now.
 			 */
 			callouttime = tln->tln_expiretime +
 			    TCP_LOG_EXPIRE_INTVL;
 			tln_first = STAILQ_FIRST(&tcp_log_expireq_head);
 
 			if ((tln_first == NULL ||
 			    callouttime < tln_first->tln_expiretime) &&
 			    (callout_pending(&tcp_log_expireq_callout) ||
 			    !callout_active(&tcp_log_expireq_callout))) {
 				/*
 				 * The list is empty, or we want to run the
 				 * expire code before the first entry's timer
 				 * fires. Also, we are in a case where a callout
 				 * is not actively running. We want to reset
 				 * the callout to occur sooner.
 				 */
 				callout_reset_sbt(&tcp_log_expireq_callout,
 				    callouttime, SBT_1S, tcp_log_expire, NULL,
 				    C_ABSOLUTE);
 			}
 
 			/*
 			 * Insert to the head, or just after the head, as
 			 * appropriate. (This might result in small
 			 * mis-orderings as a bunch of "expire now" entries
 			 * gather at the start of the list, but that should
 			 * not produce big problems, since the expire timer
 			 * will walk through all of them.)
 			 */
 			if (tln_first == NULL ||
 			    tln->tln_expiretime < tln_first->tln_expiretime)
 				STAILQ_INSERT_HEAD(&tcp_log_expireq_head, tln,
 				    tln_expireq);
 			else
 				STAILQ_INSERT_AFTER(&tcp_log_expireq_head,
 				    tln_first, tln, tln_expireq);
 		}
 		TCPLOG_EXPIREQ_UNLOCK();
 
 		/*
 		 * We are done messing with the tln. After this point, we
 		 * can't touch it. (Note that the "release" semantics should
 		 * be included with the TCPLOG_EXPIREQ_UNLOCK() call above.
 		 * Therefore, they should be unnecessary here. However, it
 		 * seems like a good idea to include them anyway, since we
 		 * really are releasing a lock here.)
 		 */
 		atomic_store_rel_int(&tln->tln_closed, 1);
 	} else {
 		/* No ID node: nothing outlives the TCPCB. Remove log entries. */
 		while ((log_entry = STAILQ_FIRST(&tp->t_logs)) != NULL)
 			tcp_log_remove_log_head(tp, log_entry);
 		KASSERT(tp->t_lognum == 0,
 		    ("%s: After freeing entries, tp->t_lognum=%d (expected 0)",
 			__func__, tp->t_lognum));
 	}
 
 	/*
 	 * Change the log state to off (just in case anything tries to sneak
 	 * in a last-minute log).
 	 */
 	tp->t_logstate = TCP_LOG_STATE_OFF;
 }
 
 /*
  * Discard every log entry queued on the TCPCB and switch its logging off.
  * When no entries are queued (t_lognum == 0) the TCPCB is left untouched,
  * including its log state. The caller must hold the INP write lock.
  */
 static void
 tcp_log_purge_tp_logbuf(struct tcpcb *tp)
 {
 	struct tcp_log_mem *log_entry;
 	struct inpcb *inp __diagused;
 
 	inp = tp->t_inpcb;
 	INP_WLOCK_ASSERT(inp);
 
 	if (tp->t_lognum != 0) {
 		for (log_entry = STAILQ_FIRST(&tp->t_logs); log_entry != NULL;
 		    log_entry = STAILQ_FIRST(&tp->t_logs))
 			tcp_log_remove_log_head(tp, log_entry);
 		KASSERT(tp->t_lognum == 0,
 		    ("%s: After freeing entries, tp->t_lognum=%d (expected 0)",
 		    __func__, tp->t_lognum));
 		tp->t_logstate = TCP_LOG_STATE_OFF;
 	}
 }
 
 /*
  * This logs an event for a TCP socket. Normally, this is called via
  * TCP_LOG_EVENT or TCP_LOG_EVENT_VERBOSE. See the documentation for
  * TCP_LOG_EVENT().
  *
  * Arguments, as used by this function:
  *   tp             TCPCB to log for; its INP write lock must be held.
  *   th             Optional TCP header. When non-NULL, the header and any
  *                  options that follow it are copied into the entry.
  *   rxbuf, txbuf   Optional socket buffers; sb_acc and sb_ccc are recorded.
  *   eventid        Stored in tlb_eventid.
  *   errornum       Stored in tlb_errno.
  *   len            Stored in tlb_len.
  *   stackinfo      Optional stack-specific data, copied when non-NULL.
  *   th_hostorder   When non-zero, tcp_fields_to_net() is applied to the
  *                  copied header.
  *   output_caller, func, line
  *                  Verbose trace information; recorded only when func is
  *                  non-NULL (func and line must be given together).
  *   itv            Optional timestamp; when NULL, getmicrouptime() is used.
  *
  * Returns a pointer to the log buffer inside the entry that was queued on
  * tp->t_logs, or NULL if no entry was recorded.
  */
 
 struct tcp_log_buffer *
 tcp_log_event_(struct tcpcb *tp, struct tcphdr *th, struct sockbuf *rxbuf,
     struct sockbuf *txbuf, uint8_t eventid, int errornum, uint32_t len,
     union tcp_log_stackspecific *stackinfo, int th_hostorder,
     const char *output_caller, const char *func, int line, const struct timeval *itv)
 {
 	struct tcp_log_mem *log_entry;
 	struct tcp_log_buffer *log_buf;
 	int attempt_count = 0;
 	struct tcp_log_verbose *log_verbose;
 	uint32_t logsn;
 
 	KASSERT((func == NULL && line == 0) || (func != NULL && line > 0),
 	    ("%s called with inconsistent func (%p) and line (%d) arguments",
 		__func__, func, line));
 
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 	if (tcp_disable_all_bb_logs) {
 		/*
 		 * The global switch that shuts down logging has been
 		 * thrown. Call the purge function, which frees the
 		 * queued log entries and turns off logging for this
 		 * TCPCB.
 		 */
 		tcp_log_purge_tp_logbuf(tp);
 		return (NULL);
 	}
 	KASSERT(tp->t_logstate == TCP_LOG_STATE_HEAD ||
 	    tp->t_logstate == TCP_LOG_STATE_TAIL ||
 	    tp->t_logstate == TCP_LOG_STATE_CONTINUAL ||
 	    tp->t_logstate == TCP_LOG_STATE_HEAD_AUTO ||
 	    tp->t_logstate == TCP_LOG_STATE_TAIL_AUTO,
 	    ("%s called with unexpected tp->t_logstate (%d)", __func__,
 		tp->t_logstate));
 
 	/*
 	 * Get the serial number. We do this early so it will
 	 * increment even if we end up skipping the log entry for some
 	 * reason.
 	 */
 	logsn = tp->t_logsn++;
 
 	/*
 	 * Can we get a new log entry? If so, increment the lognum counter
 	 * here.
 	 *
 	 * (This is also the target of the single retry made in CONTINUAL
 	 * state after a successful dump; see attempt_count below.)
 	 */
 retry:
 	if (tp->t_lognum < tp->t_loglimit) {
 		if ((log_entry = uma_zalloc(tcp_log_zone, M_NOWAIT)) != NULL)
 			tp->t_lognum++;
 	} else
 		log_entry = NULL;
 
 	/* Do we need to try to reuse? */
 	if (log_entry == NULL) {
 		/*
 		 * Sacrifice auto-logged sessions without a log ID if
 		 * tcp_log_auto_all is false. (If they don't have a log
 		 * ID by now, it is probable that either they won't get one
 		 * or we are resource-constrained.)
 		 */
 		if (tp->t_lib == NULL && (tp->t_flags2 & TF2_LOG_AUTO) &&
 		    !tcp_log_auto_all) {
 			if (tcp_log_state_change(tp, TCP_LOG_STATE_CLEAR)) {
 #ifdef INVARIANTS
 				panic("%s:%d: tcp_log_state_change() failed "
 				    "to set tp %p to TCP_LOG_STATE_CLEAR",
 				    __func__, __LINE__, tp);
 #endif
 				tp->t_logstate = TCP_LOG_STATE_OFF;
 			}
 			return (NULL);
 		}
 		/*
 		 * If we are in TCP_LOG_STATE_HEAD_AUTO state, try to dump
 		 * the buffers. If successful, deactivate tracing. Otherwise,
 		 * leave it active so we will retry.
 		 *
 		 * (tcp_log_dump_tp_logbuf() is treated as successful when it
 		 * returns 0. In CONTINUAL state a successful dump is
 		 * followed by one more allocation attempt; if that attempt
 		 * ends up here again, the event is dropped.)
 		 */
 		if (tp->t_logstate == TCP_LOG_STATE_HEAD_AUTO &&
 		    !tcp_log_dump_tp_logbuf(tp, "auto-dumped from head",
 		    M_NOWAIT, false)) {
 			tp->t_logstate = TCP_LOG_STATE_OFF;
 			return(NULL);
 		} else if ((tp->t_logstate == TCP_LOG_STATE_CONTINUAL) &&
 		    !tcp_log_dump_tp_logbuf(tp, "auto-dumped from continual",
 		    M_NOWAIT, false)) {
 			if (attempt_count == 0) {
 				attempt_count++;
 				goto retry;
 			}
 #ifdef TCPLOG_DEBUG_COUNTERS
 			counter_u64_add(tcp_log_que_fail4, 1);
 #endif
 			return(NULL);
 		} else if (tp->t_logstate == TCP_LOG_STATE_HEAD_AUTO)
 			return(NULL);
 
 		/* If in HEAD state, just deactivate the tracing and return. */
 		if (tp->t_logstate == TCP_LOG_STATE_HEAD) {
 			tp->t_logstate = TCP_LOG_STATE_OFF;
 			return(NULL);
 		}
 
 		/*
 		 * Get a buffer to reuse. If that fails, just give up.
 		 * (We can't log anything without a buffer in which to
 		 * put it.)
 		 *
 		 * Note that we don't change the t_lognum counter
 		 * here. Because we are re-using the buffer, the total
 		 * number won't change.
 		 */
 		if ((log_entry = STAILQ_FIRST(&tp->t_logs)) == NULL)
 			return(NULL);
 		STAILQ_REMOVE_HEAD(&tp->t_logs, tlm_queue);
 		tcp_log_entry_refcnt_rem(log_entry);
 	}
 
 	KASSERT(log_entry != NULL,
 	    ("%s: log_entry unexpectedly NULL", __func__));
 
 	/* Extract the log buffer and verbose buffer pointers. */
 	log_buf = &log_entry->tlm_buf;
 	log_verbose = &log_entry->tlm_v;
 
 	/* Basic entries. */
 	if (itv == NULL)
 		getmicrouptime(&log_buf->tlb_tv);
 	else
 		memcpy(&log_buf->tlb_tv, itv, sizeof(struct timeval));
 	log_buf->tlb_ticks = ticks;
 	log_buf->tlb_sn = logsn;
 	log_buf->tlb_stackid = tp->t_fb->tfb_id;
 	log_buf->tlb_eventid = eventid;
 	log_buf->tlb_eventflags = 0;
 	log_buf->tlb_errno = errornum;
 
 	/* Socket buffers */
 	if (rxbuf != NULL) {
 		log_buf->tlb_eventflags |= TLB_FLAG_RXBUF;
 		log_buf->tlb_rxbuf.tls_sb_acc = rxbuf->sb_acc;
 		log_buf->tlb_rxbuf.tls_sb_ccc = rxbuf->sb_ccc;
 		log_buf->tlb_rxbuf.tls_sb_spare = 0;
 	}
 	if (txbuf != NULL) {
 		log_buf->tlb_eventflags |= TLB_FLAG_TXBUF;
 		log_buf->tlb_txbuf.tls_sb_acc = txbuf->sb_acc;
 		log_buf->tlb_txbuf.tls_sb_ccc = txbuf->sb_ccc;
 		log_buf->tlb_txbuf.tls_sb_spare = 0;
 	}
 	/*
 	 * Copy values from tp to the log entry. COPY_STAT(f) copies tp->f
 	 * to log_buf->tlb_f; COPY_STAT_T(f) copies tp->t_f to log_buf->tlb_f.
 	 */
 #define	COPY_STAT(f)	log_buf->tlb_ ## f = tp->f
 #define	COPY_STAT_T(f)	log_buf->tlb_ ## f = tp->t_ ## f
 	COPY_STAT_T(state);
 	COPY_STAT_T(starttime);
 	COPY_STAT(iss);
 	COPY_STAT_T(flags);
 	COPY_STAT(snd_una);
 	COPY_STAT(snd_max);
 	COPY_STAT(snd_cwnd);
 	COPY_STAT(snd_nxt);
 	COPY_STAT(snd_recover);
 	COPY_STAT(snd_wnd);
 	COPY_STAT(snd_ssthresh);
 	COPY_STAT_T(srtt);
 	COPY_STAT_T(rttvar);
 	COPY_STAT(rcv_up);
 	COPY_STAT(rcv_adv);
 	COPY_STAT(rcv_nxt);
 	COPY_STAT(rcv_wnd);
 	COPY_STAT_T(dupacks);
 	COPY_STAT_T(segqlen);
 	COPY_STAT(snd_numholes);
 	COPY_STAT(snd_scale);
 	COPY_STAT(rcv_scale);
 	COPY_STAT_T(flags2);
 	COPY_STAT_T(fbyte_in);
 	COPY_STAT_T(fbyte_out);
 #undef COPY_STAT
 #undef COPY_STAT_T
 	log_buf->tlb_flex1 = 0;
 	log_buf->tlb_flex2 = 0;
 	/* Copy stack-specific info. */
 	if (stackinfo != NULL) {
 		memcpy(&log_buf->tlb_stackinfo, stackinfo,
 		    sizeof(log_buf->tlb_stackinfo));
 		log_buf->tlb_eventflags |= TLB_FLAG_STACKINFO;
 	}
 
 	/* The packet */
 	log_buf->tlb_len = len;
 	if (th) {
 		int optlen;
 
 		log_buf->tlb_eventflags |= TLB_FLAG_HDR;
 		log_buf->tlb_th = *th;
 		if (th_hostorder)
 			tcp_fields_to_net(&log_buf->tlb_th);
 		/*
 		 * th_off is the header length in 32-bit words; whatever
 		 * exceeds the fixed header is option data, which is taken
 		 * from the bytes immediately following *th.
 		 */
 		optlen = (th->th_off << 2) - sizeof (struct tcphdr);
 		if (optlen > 0)
 			memcpy(log_buf->tlb_opts, th + 1, optlen);
 	}
 
 	/* Verbose information */
 	if (func != NULL) {
 		log_buf->tlb_eventflags |= TLB_FLAG_VERBOSE;
 		if (output_caller != NULL)
 			strlcpy(log_verbose->tlv_snd_frm, output_caller,
 			    TCP_FUNC_LEN);
 		else
 			*log_verbose->tlv_snd_frm = 0;
 		strlcpy(log_verbose->tlv_trace_func, func, TCP_FUNC_LEN);
 		log_verbose->tlv_trace_line = line;
 	}
 
 	/* Insert the new log at the tail. */
 	STAILQ_INSERT_TAIL(&tp->t_logs, log_entry, tlm_queue);
 	tcp_log_entry_refcnt_add(log_entry);
 	return (log_buf);
 }
 
 /*
  * Set the logging state of a TCPCB.
  *
  * TCP_LOG_STATE_CLEAR frees all queued entries and leaves the state OFF;
  * the other recognised states are stored as given. Whatever was asked for,
  * the state ends up OFF while tcp_disable_all_bb_logs is set. A successful
  * change also clears TF2_LOG_AUTO.
  *
  * Returns 0 on success, or EINVAL (with the TCPCB untouched) when the
  * requested state is not recognised. The caller must hold the INP write
  * lock.
  */
 int
 tcp_log_state_change(struct tcpcb *tp, int state)
 {
 	struct tcp_log_mem *log_entry;
 	int new_state;
 
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 
 	/* Validate the request and work out the state to record. */
 	switch (state) {
 	case TCP_LOG_STATE_CLEAR:
 	case TCP_LOG_STATE_OFF:
 		new_state = TCP_LOG_STATE_OFF;
 		break;
 
 	case TCP_LOG_STATE_TAIL:
 	case TCP_LOG_STATE_HEAD:
 	case TCP_LOG_STATE_CONTINUAL:
 	case TCP_LOG_STATE_HEAD_AUTO:
 	case TCP_LOG_STATE_TAIL_AUTO:
 		new_state = state;
 		break;
 
 	default:
 		return (EINVAL);
 	}
 
 	/* CLEAR additionally throws away everything logged so far. */
 	if (state == TCP_LOG_STATE_CLEAR) {
 		for (;;) {
 			log_entry = STAILQ_FIRST(&tp->t_logs);
 			if (log_entry == NULL)
 				break;
 			tcp_log_remove_log_head(tp, log_entry);
 		}
 	}
 
 	tp->t_logstate = new_state;
 	if (tcp_disable_all_bb_logs) {
 		/* We are prohibited from doing any logs */
 		tp->t_logstate = TCP_LOG_STATE_OFF;
 	}
 	tp->t_flags2 &= ~(TF2_LOG_AUTO);
 
 	return (0);
 }
 
 /*
  * If tcp_drain() is called, flush half the log entries.
  *
  * The number of entries to release is tp->t_lognum / 2; nothing is done
  * when that is zero. Which entries go depends on tp->t_logstate:
  *  - HEAD / HEAD_AUTO: the entries at the tail of the queue are discarded,
  *    so the ones at the head are kept.
  *  - CONTINUAL: instead of discarding, an attempt is made to dump the
  *    buffer via tcp_log_dump_tp_logbuf() (the result is ignored).
  *  - any other state: entries are discarded from the head of the queue.
  *
  * The caller must hold the INP write lock.
  */
 void
 tcp_log_drain(struct tcpcb *tp)
 {
 	struct tcp_log_mem *log_entry, *next;
 	int target, skip;
 
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 	if ((target = tp->t_lognum / 2) == 0)
 		return;
 
 	/*
 	 * If we are logging the "head" packets, we want to discard
 	 * from the tail of the queue. Otherwise, we want to discard
 	 * from the head.
 	 */
 	if (tp->t_logstate == TCP_LOG_STATE_HEAD ||
 	    tp->t_logstate == TCP_LOG_STATE_HEAD_AUTO) {
 		/* Walk to the last entry that is to be kept. */
 		skip = tp->t_lognum - target;
 		STAILQ_FOREACH(log_entry, &tp->t_logs, tlm_queue)
 			if (!--skip)
 				break;
 		KASSERT(log_entry != NULL,
 		    ("%s: skipped through all entries!", __func__));
 		if (log_entry == NULL)
 			return;
 		/* Free everything queued after that entry. */
 		while ((next = STAILQ_NEXT(log_entry, tlm_queue)) != NULL) {
 			STAILQ_REMOVE_AFTER(&tp->t_logs, log_entry, tlm_queue);
 			tcp_log_entry_refcnt_rem(next);
 			tcp_log_remove_log_cleanup(tp, next);
 #ifdef INVARIANTS
 			/* target is only tracked for the assertion below. */
 			target--;
 #endif
 		}
 		KASSERT(target == 0,
 		    ("%s: After removing from tail, target was %d", __func__,
 			target));
 	} else if (tp->t_logstate == TCP_LOG_STATE_CONTINUAL) {
 		(void)tcp_log_dump_tp_logbuf(tp, "auto-dumped from continual",
 		    M_NOWAIT, false);
 	} else {
 		/*
 		 * The post-decrement leaves target at -1 when the loop ends
 		 * because the quota was used up, hence "<= 0" below.
 		 */
 		while ((log_entry = STAILQ_FIRST(&tp->t_logs)) != NULL &&
 		    target--)
 			tcp_log_remove_log_head(tp, log_entry);
 		KASSERT(target <= 0,
 		    ("%s: After removing from head, target was %d", __func__,
 			target));
 		/* Report the value the message names (t_lognum, not target). */
 		KASSERT(tp->t_lognum > 0,
 		    ("%s: After removing from head, tp->t_lognum was %d",
 			__func__, tp->t_lognum));
 		KASSERT(log_entry != NULL,
 		    ("%s: After removing from head, the tailq was empty",
 			__func__));
 	}
 }
 
 /*
  * Copy len bytes from src to dst on behalf of a socket option request.
  * When the request has no owning thread (sopt_td == NULL) a plain bcopy()
  * is used and 0 is returned; otherwise the copy goes through copyout()
  * and its result is returned.
  */
 static inline int
 tcp_log_copyout(struct sockopt *sopt, void *src, void *dst, size_t len)
 {
 
 	if (sopt->sopt_td == NULL) {
 		bcopy(src, dst, len);
 		return (0);
 	}
 	return (copyout(src, dst, len));
 }
 
 /*
  * Serialize a queue of log entries into the buffer described by sopt.
  *
  * Every entry is written as a full struct tcp_log_buffer. When an entry
  * carries no TCP header (TLB_FLAG_HDR clear), the part from tlb_th onward
  * is filled from zerobuf instead of from the entry. Entries flagged
  * TLB_FLAG_VERBOSE are followed by a struct tcp_log_verbose.
  *
  *   sopt        Destination: sopt_val is the output buffer, and sopt_td
  *               selects how the copy is made (see tcp_log_copyout()).
  *   log_tailqp  Entries to copy. The list itself is not modified.
  *   end         Set to the output position just past the last entry that
  *               was written completely.
  *   count       Number of entries expected on the list (checked with
  *               KASSERT only).
  *
  * Returns 0 on success, or the error from the first copy that fails.
  */
 static int
 tcp_log_logs_to_buf(struct sockopt *sopt, struct tcp_log_stailq *log_tailqp,
     struct tcp_log_buffer **end, int count)
 {
 	struct tcp_log_buffer *out_entry;
 	struct tcp_log_mem *log_entry;
 	size_t entrysize;
 	int error;
 #ifdef INVARIANTS
 	int orig_count = count;
 #endif
 
 	/* Copy the data out. */
 	error = 0;
 	out_entry = (struct tcp_log_buffer *) sopt->sopt_val;
 	STAILQ_FOREACH(log_entry, log_tailqp, tlm_queue) {
 		count--;
 		KASSERT(count >= 0,
 		    ("%s:%d: Exceeded expected count (%d) processing list %p",
 		    __func__, __LINE__, orig_count, log_tailqp));
 
 #ifdef TCPLOG_DEBUG_COUNTERS
 		counter_u64_add(tcp_log_que_copyout, 1);
 #endif
 
 		/*
 		 * Skip copying out the header if it isn't present.
 		 * Instead, copy out zeros (to ensure we don't leak info).
 		 * TODO: Make sure we truly do zero everything we don't
 		 * explicitly set.
 		 */
 		if (log_entry->tlm_buf.tlb_eventflags & TLB_FLAG_HDR)
 			entrysize = sizeof(struct tcp_log_buffer);
 		else
 			entrysize = offsetof(struct tcp_log_buffer, tlb_th);
 		error = tcp_log_copyout(sopt, &log_entry->tlm_buf, out_entry,
 		    entrysize);
 		if (error)
 			break;
 		if (!(log_entry->tlm_buf.tlb_eventflags & TLB_FLAG_HDR)) {
 			error = tcp_log_copyout(sopt, zerobuf,
 			    ((uint8_t *)out_entry) + entrysize,
 			    sizeof(struct tcp_log_buffer) - entrysize);
 			/*
 			 * Stop here on failure; otherwise the next copy
 			 * would overwrite, and so hide, this error.
 			 */
 			if (error)
 				break;
 		}
 
 		/*
 		 * Copy out the verbose bit, if needed. Either way,
 		 * increment the output pointer the correct amount.
 		 */
 		if (log_entry->tlm_buf.tlb_eventflags & TLB_FLAG_VERBOSE) {
 			error = tcp_log_copyout(sopt, &log_entry->tlm_v,
 			    out_entry->tlb_verbose,
 			    sizeof(struct tcp_log_verbose));
 			if (error)
 				break;
 			out_entry = (struct tcp_log_buffer *)
 			    (((uint8_t *) (out_entry + 1)) +
 			    sizeof(struct tcp_log_verbose));
 		} else
 			out_entry++;
 	}
 	*end = out_entry;
 	KASSERT(error || count == 0,
 	    ("%s:%d: Less than expected count (%d) processing list %p"
 	    " (%d remain)", __func__, __LINE__, orig_count,
 	    log_tailqp, count));
 
 	return (error);
 }
 
 /*
  * Copy out the buffer. Note that we do incremental copying, so
  * sooptcopyout() won't work. However, the goal is to produce the same
  * end result as if we copied in the entire user buffer, updated it,
  * and then used sooptcopyout() to copy it out.
  *
  * On return sopt->sopt_valsize holds the number of bytes written (or,
  * when sopt->sopt_val is NULL, the size estimate if that is smaller than
  * the size passed in). Entries that were copied out successfully are
  * removed from the TCPCB and freed.
  *
  * Returns 0 on success or the error from the failing copy.
  *
  * NOTE: This should be called with a write lock on the PCB; however,
  * the function will drop it after it extracts the data from the TCPCB.
  */
 int
 tcp_log_getlogbuf(struct sockopt *sopt, struct tcpcb *tp)
 {
 	struct tcp_log_stailq log_tailq;
 	struct tcp_log_mem *log_entry, *log_next;
 	struct tcp_log_buffer *out_entry;
 	struct inpcb *inp;
 	size_t outsize, entrysize;
 	int error, outnum;
 
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 	inp = tp->t_inpcb;
 
 	/*
 	 * Determine which log entries will fit in the buffer. As an
 	 * optimization, skip this if all the entries will clearly fit
 	 * in the buffer. (However, get an exact size if we are using
 	 * INVARIANTS.)
 	 *
 	 * Afterwards log_entry is the last entry that will be copied
 	 * (NULL if none) and log_next is the first entry that will not
 	 * (NULL if everything fits). On the fast path outsize is left
 	 * at 0, meaning "not computed".
 	 */
 #ifndef INVARIANTS
 	if (sopt->sopt_valsize / (sizeof(struct tcp_log_buffer) +
 	    sizeof(struct tcp_log_verbose)) >= tp->t_lognum) {
 		log_entry = STAILQ_LAST(&tp->t_logs, tcp_log_mem, tlm_queue);
 		log_next = NULL;
 		outsize = 0;
 		outnum = tp->t_lognum;
 	} else {
 #endif
 		outsize = outnum = 0;
 		log_entry = NULL;
 		STAILQ_FOREACH(log_next, &tp->t_logs, tlm_queue) {
 			entrysize = sizeof(struct tcp_log_buffer);
 			if (log_next->tlm_buf.tlb_eventflags &
 			    TLB_FLAG_VERBOSE)
 				entrysize += sizeof(struct tcp_log_verbose);
 			if ((sopt->sopt_valsize - outsize) < entrysize)
 				break;
 			outsize += entrysize;
 			outnum++;
 			log_entry = log_next;
 		}
 		KASSERT(outsize <= sopt->sopt_valsize,
 		    ("%s: calculated output size (%zu) greater than available"
 			"space (%zu)", __func__, outsize, sopt->sopt_valsize));
 #ifndef INVARIANTS
 	}
 #endif
 
 	/*
 	 * Copy traditional sooptcopyout() behavior: if sopt->sopt_val
 	 * is NULL, silently skip the copy. However, in this case, we
 	 * will leave the list alone and return. Functionally, this
 	 * gives userspace a way to poll for an approximate buffer
 	 * size they will need to get the log entries.
 	 */
 	if (sopt->sopt_val == NULL) {
 		INP_WUNLOCK(inp);
 		if (outsize == 0) {
 			/* Size was not computed above; use the worst case. */
 			outsize = outnum * (sizeof(struct tcp_log_buffer) +
 			    sizeof(struct tcp_log_verbose));
 		}
 		if (sopt->sopt_valsize > outsize)
 			sopt->sopt_valsize = outsize;
 		return (0);
 	}
 
 	/*
 	 * Break apart the list. We'll save the ones we want to copy
 	 * out locally and remove them from the TCPCB list. We can
 	 * then drop the INPCB lock while we do the copyout.
 	 *
 	 * There are roughly three cases:
 	 * 1. There was nothing to copy out. That's easy: drop the
 	 * lock and return.
 	 * 2. We are copying out the entire list. Again, that's easy:
 	 * move the whole list.
 	 * 3. We are copying out a partial list. That's harder. We
 	 * need to update the list book-keeping entries.
 	 */
 	if (log_entry != NULL && log_next == NULL) {
 		/* Move entire list. */
 		KASSERT(outnum == tp->t_lognum,
 		    ("%s:%d: outnum (%d) should match tp->t_lognum (%d)",
 			__func__, __LINE__, outnum, tp->t_lognum));
 		log_tailq = tp->t_logs;
 		tp->t_lognum = 0;
 		STAILQ_INIT(&tp->t_logs);
 	} else if (log_entry != NULL) {
 		/*
 		 * Move partial list: log_tailq takes the entries up to and
 		 * including log_entry; tp->t_logs keeps the rest.
 		 */
 		KASSERT(outnum < tp->t_lognum,
 		    ("%s:%d: outnum (%d) not less than tp->t_lognum (%d)",
 			__func__, __LINE__, outnum, tp->t_lognum));
 		STAILQ_FIRST(&log_tailq) = STAILQ_FIRST(&tp->t_logs);
 		STAILQ_FIRST(&tp->t_logs) = STAILQ_NEXT(log_entry, tlm_queue);
 		KASSERT(STAILQ_NEXT(log_entry, tlm_queue) != NULL,
 		    ("%s:%d: tp->t_logs is unexpectedly shorter than expected"
 		    "(tp: %p, log_tailq: %p, outnum: %d, tp->t_lognum: %d)",
 		    __func__, __LINE__, tp, &log_tailq, outnum, tp->t_lognum));
 		STAILQ_NEXT(log_entry, tlm_queue) = NULL;
 		log_tailq.stqh_last = &STAILQ_NEXT(log_entry, tlm_queue);
 		tp->t_lognum -= outnum;
 	} else
 		STAILQ_INIT(&log_tailq);
 
 	/* Drop the PCB lock. */
 	INP_WUNLOCK(inp);
 
 	/* Copy the data out. */
 	error = tcp_log_logs_to_buf(sopt, &log_tailq, &out_entry, outnum);
 
 	if (error) {
 		/*
 		 * Restore list: put the detached entries back in front of
 		 * whatever was logged while the lock was dropped. tp is
 		 * looked up again because the lock was released in between.
 		 *
 		 * NOTE(review): when the INP has been dropped, the detached
 		 * entries are neither re-attached nor freed in this
 		 * function -- confirm whether they are reclaimed elsewhere.
 		 */
 		INP_WLOCK(inp);
-		if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) == 0) {
+		if ((inp->inp_flags & INP_DROPPED) == 0) {
 			tp = intotcpcb(inp);
 
 			/* Merge the two lists. */
 			STAILQ_CONCAT(&log_tailq, &tp->t_logs);
 			tp->t_logs = log_tailq;
 			tp->t_lognum += outnum;
 		}
 		INP_WUNLOCK(inp);
 	} else {
 		/* Sanity check entries */
 		KASSERT(((caddr_t)out_entry - (caddr_t)sopt->sopt_val)  ==
 		    outsize, ("%s: Actual output size (%zu) != "
 			"calculated output size (%zu)", __func__,
 			(size_t)((caddr_t)out_entry - (caddr_t)sopt->sopt_val),
 			outsize));
 
 		/* Free the entries we just copied out. */
 		STAILQ_FOREACH_SAFE(log_entry, &log_tailq, tlm_queue, log_next) {
 			tcp_log_entry_refcnt_rem(log_entry);
 			uma_zfree(tcp_log_zone, log_entry);
 		}
 	}
 
 	/* Report how many bytes were actually produced. */
 	sopt->sopt_valsize = (size_t)((caddr_t)out_entry -
 	    (caddr_t)sopt->sopt_val);
 	return (error);
 }
 
 /*
  * Destructor for a queued log-dump entry (installed as tldq_dtor).
  * Releases the log entries still held by the entry, the attached buffer
  * when one is set, and finally the queue entry itself.
  */
 static void
 tcp_log_free_queue(struct tcp_log_dev_queue *param)
 {
 	struct tcp_log_dev_log_queue *ldq;
 
 	KASSERT(param != NULL, ("%s: called with NULL param", __func__));
 	if (param != NULL) {
 		ldq = (struct tcp_log_dev_log_queue *)param;
 
 		/* Release any log entries still attached to the entry. */
 		tcp_log_free_entries(&ldq->tldl_entries, &ldq->tldl_count);
 
 		/* Release the buffer when one has been attached. */
 		if (ldq->tldl_common.tldq_buf != NULL)
 			free(ldq->tldl_common.tldq_buf, M_TCPLOGDEV);
 
 		/* The queue entry itself goes last. */
 		free(ldq, M_TCPLOGDEV);
 	}
 }
 
 static struct tcp_log_common_header *
 tcp_log_expandlogbuf(struct tcp_log_dev_queue *param)
 {
 	struct tcp_log_dev_log_queue *entry;
 	struct tcp_log_header *hdr;
 	uint8_t *end;
 	struct sockopt sopt;
 	int error;
 
 	entry = (struct tcp_log_dev_log_queue *)param;
 
 	/* Take a worst-case guess at space needs. */
 	sopt.sopt_valsize = sizeof(struct tcp_log_header) +
 	    entry->tldl_count * (sizeof(struct tcp_log_buffer) +
 	    sizeof(struct tcp_log_verbose));
 	hdr = malloc(sopt.sopt_valsize, M_TCPLOGDEV, M_NOWAIT);
 	if (hdr == NULL) {
 #ifdef TCPLOG_DEBUG_COUNTERS
 		counter_u64_add(tcp_log_que_fail5, entry->tldl_count);
 #endif
 		return (NULL);
 	}
 	sopt.sopt_val = hdr + 1;
 	sopt.sopt_valsize -= sizeof(struct tcp_log_header);
 	sopt.sopt_td = NULL;
 
 	error = tcp_log_logs_to_buf(&sopt, &entry->tldl_entries,
 	    (struct tcp_log_buffer **)&end, entry->tldl_count);
 	if (error) {
 		free(hdr, M_TCPLOGDEV);
 		return (NULL);
 	}
 
 	/* Free the entries. */
 	tcp_log_free_entries(&entry->tldl_entries, &entry->tldl_count);
 	entry->tldl_count = 0;
 
 	memset(hdr, 0, sizeof(struct tcp_log_header));
 	hdr->tlh_version = TCP_LOG_BUF_VER;
 	hdr->tlh_type = TCP_LOG_DEV_TYPE_BBR;
 	hdr->tlh_length = end - (uint8_t *)hdr;
 	hdr->tlh_ie = entry->tldl_ie;
 	hdr->tlh_af = entry->tldl_af;
 	getboottime(&hdr->tlh_offset);
 	strlcpy(hdr->tlh_id, entry->tldl_id, TCP_LOG_ID_LEN);
 	strlcpy(hdr->tlh_tag, entry->tldl_tag, TCP_LOG_TAG_LEN);
 	strlcpy(hdr->tlh_reason, entry->tldl_reason, TCP_LOG_REASON_LEN);
 	return ((struct tcp_log_common_header *)hdr);
 }
 
 /*
  * Queue the tcpcb's log buffer for transmission via the log buffer facility.
  *
  * NOTE: This should be called with a write lock on the PCB.
  *
  * how should be M_WAITOK or M_NOWAIT. If M_WAITOK, the function will drop
  * and reacquire the INP lock if it needs to do so.
  *
  * If force is false, this will only dump auto-logged sessions if
  * tcp_log_auto_all is true or if there is a log ID defined for the session.
  */
 int
 tcp_log_dump_tp_logbuf(struct tcpcb *tp, char *reason, int how, bool force)
 {
 	struct tcp_log_dev_log_queue *entry;
 	struct inpcb *inp;
 #ifdef TCPLOG_DEBUG_COUNTERS
 	int num_entries;
 #endif
 
 	inp = tp->t_inpcb;
 	INP_WLOCK_ASSERT(inp);
 
 	/* If there are no log entries, there is nothing to do. */
 	if (tp->t_lognum == 0)
 		return (0);
 
 	/* Check for a log ID. */
 	if (tp->t_lib == NULL && (tp->t_flags2 & TF2_LOG_AUTO) &&
 	    !tcp_log_auto_all && !force) {
 		struct tcp_log_mem *log_entry;
 
 		/*
 		 * We needed a log ID and none was found. Free the log entries
 		 * and return success. Also, cancel further logging. If the
 		 * session doesn't have a log ID by now, we'll assume it isn't
 		 * going to get one.
 		 */
 		while ((log_entry = STAILQ_FIRST(&tp->t_logs)) != NULL)
 			tcp_log_remove_log_head(tp, log_entry);
 		KASSERT(tp->t_lognum == 0,
 		    ("%s: After freeing entries, tp->t_lognum=%d (expected 0)",
 			__func__, tp->t_lognum));
 		tp->t_logstate = TCP_LOG_STATE_OFF;
 		return (0);
 	}
 
 	/*
 	 * Allocate memory. If we must wait, we'll need to drop the locks
 	 * and reacquire them (and do all the related business that goes
 	 * along with that).
 	 */
 	entry = malloc(sizeof(struct tcp_log_dev_log_queue), M_TCPLOGDEV,
 	    M_NOWAIT);
 	if (entry == NULL && (how & M_NOWAIT)) {
 #ifdef TCPLOG_DEBUG_COUNTERS
 		counter_u64_add(tcp_log_que_fail3, 1);
 #endif
 		return (ENOBUFS);
 	}
 	if (entry == NULL) {
 		INP_WUNLOCK(inp);
 		entry = malloc(sizeof(struct tcp_log_dev_log_queue),
 		    M_TCPLOGDEV, M_WAITOK);
 		INP_WLOCK(inp);
 		/*
 		 * Note that this check is slightly overly-restrictive in
 		 * that the TCB can survive either of these events.
 		 * However, there is currently not a good way to ensure
 		 * that is the case. So, if we hit this M_WAIT path, we
 		 * may end up dropping some entries. That seems like a
 		 * small price to pay for safety.
 		 */
-		if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
+		if (inp->inp_flags & INP_DROPPED) {
 			free(entry, M_TCPLOGDEV);
 #ifdef TCPLOG_DEBUG_COUNTERS
 			counter_u64_add(tcp_log_que_fail2, 1);
 #endif
 			return (ECONNRESET);
 		}
 		tp = intotcpcb(inp);
 		if (tp->t_lognum == 0) {
 			free(entry, M_TCPLOGDEV);
 			return (0);
 		}
 	}
 
 	/* Fill in the unique parts of the queue entry. */
 	if (tp->t_lib != NULL) {
 		strlcpy(entry->tldl_id, tp->t_lib->tlb_id, TCP_LOG_ID_LEN);
 		strlcpy(entry->tldl_tag, tp->t_lib->tlb_tag, TCP_LOG_TAG_LEN);
 	} else {
 		strlcpy(entry->tldl_id, "UNKNOWN", TCP_LOG_ID_LEN);
 		strlcpy(entry->tldl_tag, "UNKNOWN", TCP_LOG_TAG_LEN);
 	}
 	if (reason != NULL)
 		strlcpy(entry->tldl_reason, reason, TCP_LOG_REASON_LEN);
 	else
 		strlcpy(entry->tldl_reason, "UNKNOWN", TCP_LOG_ID_LEN);
 	entry->tldl_ie = inp->inp_inc.inc_ie;
 	if (inp->inp_inc.inc_flags & INC_ISIPV6)
 		entry->tldl_af = AF_INET6;
 	else
 		entry->tldl_af = AF_INET;
 	entry->tldl_entries = tp->t_logs;
 	entry->tldl_count = tp->t_lognum;
 
 	/* Fill in the common parts of the queue entry. */
 	entry->tldl_common.tldq_buf = NULL;
 	entry->tldl_common.tldq_xform = tcp_log_expandlogbuf;
 	entry->tldl_common.tldq_dtor = tcp_log_free_queue;
 
 	/* Clear the log data from the TCPCB. */
 #ifdef TCPLOG_DEBUG_COUNTERS
 	num_entries = tp->t_lognum;
 #endif
 	tp->t_lognum = 0;
 	STAILQ_INIT(&tp->t_logs);
 
 	/* Add the entry. If no one is listening, free the entry. */
 	if (tcp_log_dev_add_log((struct tcp_log_dev_queue *)entry)) {
 		tcp_log_free_queue((struct tcp_log_dev_queue *)entry);
 #ifdef TCPLOG_DEBUG_COUNTERS
 		counter_u64_add(tcp_log_que_fail1, num_entries);
 	} else {
 		counter_u64_add(tcp_log_queued, num_entries);
 #endif
 	}
 	return (0);
 }
 
 /*
  * Queue the log_id_node's log buffers for transmission via the log buffer
  * facility.
  *
  * NOTE: This should be called with the bucket locked and referenced.
  *
  * how should be M_WAITOK or M_NOWAIT. If M_WAITOK, the function will drop
  * and reacquire the bucket lock if it needs to do so. (The caller must
  * ensure that the tln is no longer on any lists so no one else will mess
  * with this while the lock is dropped!)
  */
 static int
 tcp_log_dump_node_logbuf(struct tcp_log_id_node *tln, char *reason, int how)
 {
 	struct tcp_log_dev_log_queue *entry;
 	struct tcp_log_id_bucket *tlb;
 
 	tlb = tln->tln_bucket;
 	TCPID_BUCKET_LOCK_ASSERT(tlb);
 	KASSERT(tlb->tlb_refcnt > 0,
 	    ("%s:%d: Called with unreferenced bucket (tln=%p, tlb=%p)",
 	    __func__, __LINE__, tln, tlb));
 	KASSERT(tln->tln_closed,
 	    ("%s:%d: Called for node with tln_closed==false (tln=%p)",
 	    __func__, __LINE__, tln));
 
 	/* If there are no log entries, there is nothing to do. */
 	if (tln->tln_count == 0)
 		return (0);
 
 	/*
 	 * Allocate memory. If we must wait, we'll need to drop the locks
 	 * and reacquire them (and do all the related business that goes
 	 * along with that).
 	 */
 	entry = malloc(sizeof(struct tcp_log_dev_log_queue), M_TCPLOGDEV,
 	    M_NOWAIT);
 	if (entry == NULL && (how & M_NOWAIT))
 		return (ENOBUFS);
 	if (entry == NULL) {
 		TCPID_BUCKET_UNLOCK(tlb);
 		entry = malloc(sizeof(struct tcp_log_dev_log_queue),
 		    M_TCPLOGDEV, M_WAITOK);
 		TCPID_BUCKET_LOCK(tlb);
 	}
 
 	/* Fill in the common parts of the queue entry.. */
 	entry->tldl_common.tldq_buf = NULL;
 	entry->tldl_common.tldq_xform = tcp_log_expandlogbuf;
 	entry->tldl_common.tldq_dtor = tcp_log_free_queue;
 
 	/* Fill in the unique parts of the queue entry. */
 	strlcpy(entry->tldl_id, tlb->tlb_id, TCP_LOG_ID_LEN);
 	strlcpy(entry->tldl_tag, tlb->tlb_tag, TCP_LOG_TAG_LEN);
 	if (reason != NULL)
 		strlcpy(entry->tldl_reason, reason, TCP_LOG_REASON_LEN);
 	else
 		strlcpy(entry->tldl_reason, "UNKNOWN", TCP_LOG_ID_LEN);
 	entry->tldl_ie = tln->tln_ie;
 	entry->tldl_entries = tln->tln_entries;
 	entry->tldl_count = tln->tln_count;
 	entry->tldl_af = tln->tln_af;
 
 	/* Add the entry. If no one is listening, free the entry. */
 	if (tcp_log_dev_add_log((struct tcp_log_dev_queue *)entry))
 		tcp_log_free_queue((struct tcp_log_dev_queue *)entry);
 
 	return (0);
 }
 
 /*
  * Queue the log buffers for all sessions in a bucket for transmissions via
  * the log buffer facility.
  *
  * tlb is the bucket whose nodes are walked; reason is passed through to
  * the per-node and per-tcpcb dump routines.
  *
  * NOTE: This should be called with a locked bucket; however, the function
  * will drop the lock.
  */
 /* Maximum number of nodes stashed on the stack for the M_WAITOK retry. */
 #define	LOCAL_SAVE	10
 static void
 tcp_log_dumpbucketlogs(struct tcp_log_id_bucket *tlb, char *reason)
 {
 	struct tcp_log_id_node local_entries[LOCAL_SAVE];
 	struct inpcb *inp;
 	struct tcpcb *tp;
 	struct tcp_log_id_node *cur_tln, *prev_tln, *tmp_tln;
 	int i, num_local_entries, tree_locked;
 	bool expireq_locked;
 
 	TCPID_BUCKET_LOCK_ASSERT(tlb);
 
 	/*
 	 * Take a reference on the bucket to keep it from disappearing until
 	 * we are done.
 	 */
 	TCPID_BUCKET_REF(tlb);
 
 	/*
 	 * We'll try to create these without dropping locks. However, we
 	 * might very well need to drop locks to get memory. If that's the
 	 * case, we'll save up to 10 on the stack, and sacrifice the rest.
 	 * (Otherwise, we need to worry about finding our place again in a
 	 * potentially changed list. It just doesn't seem worth the trouble
 	 * to do that.)
 	 */
 	expireq_locked = false;
 	num_local_entries = 0;
 	prev_tln = NULL;
 	tree_locked = TREE_UNLOCKED;
 	SLIST_FOREACH_SAFE(cur_tln, &tlb->tlb_head, tln_list, tmp_tln) {
 		/*
 		 * If this isn't associated with a TCPCB, we can pull it off
 		 * the list now. We need to be careful that the expire timer
 		 * hasn't already taken ownership (tln_expiretime == SBT_MAX).
 		 * If so, we let the expire timer code free the data.
 		 */
 		if (cur_tln->tln_closed) {
 no_inp:
 			/*
 			 * Get the expireq lock so we can get a consistent
 			 * read of tln_expiretime and so we can remove this
 			 * from the expireq.
 			 */
 			if (!expireq_locked) {
 				TCPLOG_EXPIREQ_LOCK();
 				expireq_locked = true;
 			}
 
 			/*
 			 * We ignore entries with tln_expiretime == SBT_MAX.
 			 * The expire timer code already owns those.
 			 */
 			KASSERT(cur_tln->tln_expiretime > (sbintime_t) 0,
 			    ("%s:%d: node on the expire queue without positive "
 			    "expire time", __func__, __LINE__));
 			if (cur_tln->tln_expiretime == SBT_MAX) {
 				prev_tln = cur_tln;
 				continue;
 			}
 
 			/* Remove the entry from the expireq. */
 			STAILQ_REMOVE(&tcp_log_expireq_head, cur_tln,
 			    tcp_log_id_node, tln_expireq);
 
 			/* Remove the entry from the bucket. */
 			if (prev_tln != NULL)
 				SLIST_REMOVE_AFTER(prev_tln, tln_list);
 			else
 				SLIST_REMOVE_HEAD(&tlb->tlb_head, tln_list);
 
 			/*
 			 * Drop the INP and bucket reference counts. Due to
 			 * lock-ordering rules, we need to drop the expire
 			 * queue lock.
 			 */
 			TCPLOG_EXPIREQ_UNLOCK();
 			expireq_locked = false;
 
 			/* Drop the INP reference. */
 			INP_WLOCK(cur_tln->tln_inp);
 			if (!in_pcbrele_wlocked(cur_tln->tln_inp))
 				INP_WUNLOCK(cur_tln->tln_inp);
 
 			if (tcp_log_unref_bucket(tlb, &tree_locked, NULL)) {
 #ifdef INVARIANTS
 				panic("%s: Bucket refcount unexpectedly 0.",
 				    __func__);
 #endif
 				/*
 				 * Recover as best we can: free the entry we
 				 * own.
 				 */
 				tcp_log_free_entries(&cur_tln->tln_entries,
 				    &cur_tln->tln_count);
 				uma_zfree(tcp_log_id_node_zone, cur_tln);
 				goto done;
 			}
 
 			if (tcp_log_dump_node_logbuf(cur_tln, reason,
 			    M_NOWAIT)) {
 				/*
 				 * If we have space, save the entries locally.
 				 * Otherwise, free them.
 				 */
 				if (num_local_entries < LOCAL_SAVE) {
 					local_entries[num_local_entries] =
 					    *cur_tln;
 					num_local_entries++;
 				} else {
 					tcp_log_free_entries(
 					    &cur_tln->tln_entries,
 					    &cur_tln->tln_count);
 				}
 			}
 
 			/* No matter what, we are done with the node now. */
 			uma_zfree(tcp_log_id_node_zone, cur_tln);
 
 			/*
 			 * Because we removed this entry from the list, prev_tln
 			 * (which tracks the previous entry still on the tlb
 			 * list) remains unchanged.
 			 */
 			continue;
 		}
 
 		/*
 		 * If we get to this point, the session data is still held in
 		 * the TCPCB. So, we need to pull the data out of that.
 		 *
 		 * We will need to drop the expireq lock so we can lock the INP.
 		 * We can then try to extract the data the "easy" way. If that
 		 * fails, we'll save the log entries for later.
 		 */
 		if (expireq_locked) {
 			TCPLOG_EXPIREQ_UNLOCK();
 			expireq_locked = false;
 		}
 
 		/* Lock the INP and then re-check the state. */
 		inp = cur_tln->tln_inp;
 		INP_WLOCK(inp);
 		/*
 		 * If we caught this while it was transitioning, the data
 		 * might have moved from the TCPCB to the tln (signified by
 		 * setting tln_closed to true). If so, treat this like an
 		 * inactive connection.
 		 */
 		if (cur_tln->tln_closed) {
 			/*
 			 * It looks like we may have caught this connection
 			 * while it was transitioning from active to inactive.
 			 * Treat this like an inactive connection.
 			 */
 			INP_WUNLOCK(inp);
 			goto no_inp;
 		}
 
 		/*
 		 * Try to dump the data from the tp without dropping the lock.
 		 * If this fails, try to save off the data locally.
 		 */
 		tp = cur_tln->tln_tp;
 		if (tcp_log_dump_tp_logbuf(tp, reason, M_NOWAIT, true) &&
 		    num_local_entries < LOCAL_SAVE) {
 			tcp_log_move_tp_to_node(tp,
 			    &local_entries[num_local_entries]);
 			local_entries[num_local_entries].tln_closed = 1;
 			KASSERT(local_entries[num_local_entries].tln_bucket ==
 			    tlb, ("%s: %d: bucket mismatch for node %p",
 			    __func__, __LINE__, cur_tln));
 			num_local_entries++;
 		}
 
 		INP_WUNLOCK(inp);
 
 		/*
 		 * We are going to leave the current tln on the list. It will
 		 * become the previous tln.
 		 */
 		prev_tln = cur_tln;
 	}
 
 	/* Drop our locks, if any. */
 	KASSERT(tree_locked == TREE_UNLOCKED,
 	    ("%s: %d: tree unexpectedly locked", __func__, __LINE__));
 	switch (tree_locked) {
 	case TREE_WLOCKED:
 		TCPID_TREE_WUNLOCK();
 		tree_locked = TREE_UNLOCKED;
 		break;
 	case TREE_RLOCKED:
 		TCPID_TREE_RUNLOCK();
 		tree_locked = TREE_UNLOCKED;
 		break;
 	}
 	if (expireq_locked) {
 		TCPLOG_EXPIREQ_UNLOCK();
 		expireq_locked = false;
 	}
 
 	/*
 	 * Try again for any saved entries. tcp_log_dump_node_logbuf() is
 	 * guaranteed to free the log entries within the node. And, since
 	 * the node itself is on our stack, we don't need to free it.
 	 */
 	for (i = 0; i < num_local_entries; i++)
 		tcp_log_dump_node_logbuf(&local_entries[i], reason, M_WAITOK);
 
 	/* Drop our reference. */
 	if (!tcp_log_unref_bucket(tlb, &tree_locked, NULL))
 		TCPID_BUCKET_UNLOCK(tlb);
 
 done:
 	/* Drop our locks, if any. */
 	switch (tree_locked) {
 	case TREE_WLOCKED:
 		TCPID_TREE_WUNLOCK();
 		break;
 	case TREE_RLOCKED:
 		TCPID_TREE_RUNLOCK();
 		break;
 	}
 	if (expireq_locked)
 		TCPLOG_EXPIREQ_UNLOCK();
 }
 #undef	LOCAL_SAVE
 
 /*
  * Dump the log buffers of every session sharing this tcpcb's bucket via
  * the log buffer facility.  When the tcpcb has no bucket, only its own
  * log buffer is dumped.
  *
  * NOTE: The INP must be write-locked on entry; it is unlocked before this
  * function returns.
  */
 void
 tcp_log_dump_tp_bucket_logbufs(struct tcpcb *tp, char *reason)
 {
 	struct tcp_log_id_bucket *bucket;
 	int tree_state;
 
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 	bucket = tp->t_lib;
 
 	if (bucket != NULL) {
 		/*
 		 * Hold a reference across the window in which we swap the
 		 * INP lock for the bucket lock.
 		 */
 		TCPID_BUCKET_REF(bucket);
 		INP_WUNLOCK(tp->t_inpcb);
 		TCPID_BUCKET_LOCK(bucket);
 
 		tree_state = TREE_UNLOCKED;
 		if (!tcp_log_unref_bucket(bucket, &tree_state, NULL)) {
 			/* Others still reference it: let the walker finish. */
 			tcp_log_dumpbucketlogs(bucket, reason);
 			return;
 		}
 
 		/* Ours was the last reference; just release the tree lock. */
 		if (tree_state == TREE_WLOCKED) {
 			TCPID_TREE_WUNLOCK();
 		} else if (tree_state == TREE_RLOCKED) {
 			TCPID_TREE_RUNLOCK();
 		}
 		return;
 	}
 
 	/* No bucket: behave like a request to dump this one session. */
 	(void)tcp_log_dump_tp_logbuf(tp, reason, M_WAITOK, true);
 	INP_WUNLOCK(tp->t_inpcb);
 }
 
 /*
  * Emit a TCP_LOG_FLOWEND event for the connection, unless logging is
  * switched off for it.  A stack can add stack-specific info to this
  * trace event by overriding this function (see bbr_log_flowend() for
  * example).
  */
 void
 tcp_log_flowend(struct tcpcb *tp)
 {
 	struct socket *so;
 
 	if (tp->t_logstate == TCP_LOG_STATE_OFF)
 		return;
 
 	so = tp->t_inpcb->inp_socket;
 	TCP_LOG_EVENT(tp, NULL, &so->so_rcv, &so->so_snd,
 	    TCP_LOG_FLOWEND, 0, 0, NULL, false);
 }
diff --git a/sys/netinet/tcp_lro.c b/sys/netinet/tcp_lro.c
index a4fc5580dfc7..9ec7736aef2f 100644
--- a/sys/netinet/tcp_lro.c
+++ b/sys/netinet/tcp_lro.c
@@ -1,2063 +1,2062 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2007, Myricom Inc.
  * Copyright (c) 2008, Intel Corporation.
  * Copyright (c) 2012 The FreeBSD Foundation
  * Copyright (c) 2016-2021 Mellanox Technologies.
  * All rights reserved.
  *
  * Portions of this software were developed by Bjoern Zeeb
  * under sponsorship from the FreeBSD Foundation.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sockbuf.h>
 #include <sys/sysctl.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/ethernet.h>
 #include <net/bpf.h>
 #include <net/vnet.h>
 #include <net/if_dl.h>
 #include <net/if_media.h>
 #include <net/if_types.h>
 #include <net/infiniband.h>
 #include <net/if_lagg.h>
 
 #include <netinet/in_systm.h>
 #include <netinet/in.h>
 #include <netinet/ip6.h>
 #include <netinet/ip.h>
 #include <netinet/ip_var.h>
 #include <netinet/in_pcb.h>
 #include <netinet6/in6_pcb.h>
 #include <netinet/tcp.h>
 #include <netinet/tcp_seq.h>
 #include <netinet/tcp_lro.h>
 #include <netinet/tcp_var.h>
 #include <netinet/tcpip.h>
 #include <netinet/tcp_hpts.h>
 #include <netinet/tcp_log_buf.h>
 #include <netinet/udp.h>
 #include <netinet6/ip6_var.h>
 
 #include <machine/in_cksum.h>
 
 /* Malloc type used for the LRO hash table and the mbuf/entry array. */
 static MALLOC_DEFINE(M_LRO, "LRO", "LRO control structures");
 
 /*
  * The 32-bit word formed by NOP, NOP, TIMESTAMP kind and TIMESTAMP length,
  * passed through ntohl().
  */
 #define	TCP_LRO_TS_OPTION \
     ntohl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | \
 	  (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)
 
 /* Forward declarations for routines defined later in this file. */
 static void	tcp_lro_rx_done(struct lro_ctrl *lc);
 static int	tcp_lro_rx_common(struct lro_ctrl *lc, struct mbuf *m,
 		    uint32_t csum, bool use_hash);
 
 #ifdef TCPHPTS
 static bool	do_bpf_strip_and_compress(struct inpcb *, struct lro_ctrl *,
 		struct lro_entry *, struct mbuf **, struct mbuf **, struct mbuf **,
  		bool *, bool, bool, struct ifnet *, bool);
 
 #endif
 
 /* Root of the net.inet.tcp.lro sysctl subtree. */
 SYSCTL_NODE(_net_inet_tcp, OID_AUTO, lro,  CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "TCP LRO");
 
 /*
  * Adjusted atomically by tcp_lro_reg_mbufq() (+1) and
  * tcp_lro_dereg_mbufq() (-1).
  */
 static long tcplro_stacks_wanting_mbufq;
 /* Statistics counters; each is exported read-only by a sysctl below. */
 counter_u64_t tcp_inp_lro_direct_queue;
 counter_u64_t tcp_inp_lro_wokeup_queue;
 counter_u64_t tcp_inp_lro_compressed;
 counter_u64_t tcp_inp_lro_locks_taken;
 counter_u64_t tcp_extra_mbuf;
 counter_u64_t tcp_would_have_but;
 counter_u64_t tcp_comp_total;
 counter_u64_t tcp_uncomp_total;
 counter_u64_t tcp_bad_csums;
 
 /* Entry count that tcp_lro_init() hands to tcp_lro_init_args(). */
 static unsigned	tcp_lro_entries = TCP_LRO_ENTRIES;
 SYSCTL_UINT(_net_inet_tcp_lro, OID_AUTO, entries,
     CTLFLAG_RDTUN | CTLFLAG_MPSAFE, &tcp_lro_entries, 0,
     "default number of LRO entries");
 
 static uint32_t tcp_lro_cpu_set_thresh = TCP_LRO_CPU_DECLARATION_THRESH;
 SYSCTL_UINT(_net_inet_tcp_lro, OID_AUTO, lro_cpu_threshold,
     CTLFLAG_RDTUN | CTLFLAG_MPSAFE, &tcp_lro_cpu_set_thresh, 0,
     "Number of interrupts in a row on the same CPU that will make us declare an 'affinity' cpu?");
 
 static uint32_t tcp_less_accurate_lro_ts = 0;
 SYSCTL_UINT(_net_inet_tcp_lro, OID_AUTO, lro_less_accurate,
     CTLFLAG_MPSAFE, &tcp_less_accurate_lro_ts, 0,
     "Do we trade off efficency by doing less timestamp operations for time accuracy?");
 
 /* Read-only exports of the statistics counters declared above. */
 SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, fullqueue, CTLFLAG_RD,
     &tcp_inp_lro_direct_queue, "Number of lro's fully queued to transport");
 SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, wokeup, CTLFLAG_RD,
     &tcp_inp_lro_wokeup_queue, "Number of lro's where we woke up transport via hpts");
 SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, compressed, CTLFLAG_RD,
     &tcp_inp_lro_compressed, "Number of lro's compressed and sent to transport");
 SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, lockcnt, CTLFLAG_RD,
     &tcp_inp_lro_locks_taken, "Number of lro's inp_wlocks taken");
 SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, extra_mbuf, CTLFLAG_RD,
     &tcp_extra_mbuf, "Number of times we had an extra compressed ack dropped into the tp");
 SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, would_have_but, CTLFLAG_RD,
     &tcp_would_have_but, "Number of times we would have had an extra compressed, but mget failed");
 SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, with_m_ackcmp, CTLFLAG_RD,
     &tcp_comp_total, "Number of mbufs queued with M_ACKCMP flags set");
 SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, without_m_ackcmp, CTLFLAG_RD,
     &tcp_uncomp_total, "Number of mbufs queued without M_ACKCMP");
 SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, lro_badcsum, CTLFLAG_RD,
     &tcp_bad_csums, "Number of packets that the common code saw with bad csums");
 
 /* Atomically add one to tcplro_stacks_wanting_mbufq. */
 void
 tcp_lro_reg_mbufq(void)
 {
 
 	(void)atomic_fetchadd_long(&tcplro_stacks_wanting_mbufq, 1);
 }
 
 /* Atomically subtract one from tcplro_stacks_wanting_mbufq. */
 void
 tcp_lro_dereg_mbufq(void)
 {
 
 	(void)atomic_fetchadd_long(&tcplro_stacks_wanting_mbufq, -1);
 }
 
 /*
  * Link an LRO entry at the head of both the given hash bucket and the
  * control block's active list.
  */
 static __inline void
 tcp_lro_active_insert(struct lro_ctrl *lc, struct lro_head *bucket,
     struct lro_entry *le)
 {
 
 	/* The two lists use separate link fields, so order is irrelevant. */
 	LIST_INSERT_HEAD(bucket, le, hash_next);
 	LIST_INSERT_HEAD(&lc->lro_active, le, next);
 }
 
 /* Unlink an LRO entry from its hash bucket and from the active list. */
 static __inline void
 tcp_lro_active_remove(struct lro_entry *le)
 {
 
 	/* hash bucket linkage */
 	LIST_REMOVE(le, hash_next);
 	/* active list linkage */
 	LIST_REMOVE(le, next);
 }
 
 /*
  * Initialize an LRO control block with no ifnet, tcp_lro_entries entries
  * and no mbuf slots.  Returns whatever tcp_lro_init_args() returns.
  */
 int
 tcp_lro_init(struct lro_ctrl *lc)
 {
 	int rc;
 
 	rc = tcp_lro_init_args(lc, NULL, tcp_lro_entries, 0);
 	return (rc);
 }
 
 /*
  * Initialize an LRO control block.
  *
  * lc           control block to set up
  * ifp          interface pointer stored in the control block
  * lro_entries  number of LRO entries to allocate and put on the free list
  * lro_mbufs    number of mbuf sort slots to allocate
  *
  * Returns 0 on success.  Returns ENOMEM if the hash table or the
  * mbuf/entry array cannot be allocated; in that case *lc is zeroed.
  */
 int
 tcp_lro_init_args(struct lro_ctrl *lc, struct ifnet *ifp,
     unsigned lro_entries, unsigned lro_mbufs)
 {
 	struct lro_entry *entries;
 	size_t alloc_len;
 	unsigned idx, hash_elems;
 
 	/* Counters and limits. */
 	lc->lro_bad_csum = 0;
 	lc->lro_queued = 0;
 	lc->lro_flushed = 0;
 	lc->lro_mbuf_count = 0;
 	lc->lro_mbuf_max = lro_mbufs;
 	lc->lro_cnt = lro_entries;
 	lc->lro_ackcnt_lim = TCP_LRO_ACKCNT_MAX;
 	lc->lro_length_lim = TCP_LRO_LENGTH_MAX;
 	lc->ifp = ifp;
 	LIST_INIT(&lc->lro_active);
 	LIST_INIT(&lc->lro_free);
 
 	/* The lookup hash is sized for the larger of the two counts. */
 	hash_elems = (lro_entries > lro_mbufs) ? lro_entries : lro_mbufs;
 	lc->lro_hash = phashinit_flags(hash_elems, M_LRO, &lc->lro_hashsz,
 	    HASH_NOWAIT);
 	if (lc->lro_hash == NULL) {
 		memset(lc, 0, sizeof(*lc));
 		return (ENOMEM);
 	}
 
 	/* One allocation holds the mbuf sort array, then the entries. */
 	alloc_len = (lro_mbufs * sizeof(struct lro_mbuf_sort)) +
 	    (lro_entries * sizeof(*entries));
 	lc->lro_mbuf_data = (struct lro_mbuf_sort *)
 	    malloc(alloc_len, M_LRO, M_NOWAIT | M_ZERO);
 	if (lc->lro_mbuf_data == NULL) {
 		free(lc->lro_hash, M_LRO);
 		memset(lc, 0, sizeof(*lc));
 		return (ENOMEM);
 	}
 
 	/* The entries start right after the last mbuf sort slot. */
 	entries = (struct lro_entry *)
 	    (lc->lro_mbuf_data + lro_mbufs);
 
 	/* Put every entry on the free list. */
 	for (idx = 0; idx != lro_entries; idx++)
 		LIST_INSERT_HEAD(&lc->lro_free, &entries[idx], next);
 
 	return (0);
 }
 
 /*
  * Two 32-bit words that tcp_lro_low_level_parser() overlays on the start
  * of the data when asked to parse a VXLAN-encapsulated frame.
  */
 struct vxlan_header {
 	uint32_t	vxlh_flags;	/* presumably the VXLAN flags word; verify */
 	uint32_t	vxlh_vni;	/* parser keeps this & htonl(0xffffff00) */
 };
 
 /*
  * Parse one layer of headers: an optional VXLAN header, an Ethernet
  * header (with optional VLAN encapsulation), an IPv4 or IPv6 header and
  * a UDP or TCP header.
  *
  * ptr          start of the headers to parse
  * parser       receives pointers to the headers found; when update_data
  *              is true it is zeroed first and its data (addresses,
  *              ports, type, VLAN ID, VNI) is filled in
  * update_data  false means only the header pointers and total_hdr_len
  *              are refreshed
  * is_vxlan     true when ptr points at a VXLAN header
  * mlen         number of bytes available starting at ptr
  *
  * Returns a pointer just past the UDP/TCP header and sets
  * parser->total_hdr_len to the number of bytes consumed from ptr.
  * Returns NULL if the headers do not fit in mlen, or the packet is not
  * option-less, unfragmented IPv4 or IPv6 carrying UDP or TCP.
  *
  * No header is dereferenced before mlen has been checked to cover it,
  * and the VXLAN header is deducted from mlen like every other header.
  */
 static inline void *
 tcp_lro_low_level_parser(void *ptr, struct lro_parser *parser, bool update_data, bool is_vxlan, int mlen)
 {
 	const struct ether_vlan_header *eh;
 	void *old;
 	uint16_t eth_type;
 
 	if (update_data)
 		memset(parser, 0, sizeof(*parser));
 
 	old = ptr;
 
 	if (is_vxlan) {
 		const struct vxlan_header *vxh;
 
 		/* The VXLAN header must fit before it is read. */
 		if (__predict_false(mlen < (int)sizeof(*vxh)))
 			return (NULL);
 		vxh = ptr;
 		ptr = (uint8_t *)ptr + sizeof(*vxh);
 		mlen -= sizeof(*vxh);
 		if (update_data) {
 			parser->data.vxlan_vni =
 			    vxh->vxlh_vni & htonl(0xffffff00);
 		}
 	}
 
 	/*
 	 * Parsing cannot succeed unless the Ethernet header is followed by
 	 * at least one more byte (see the mlen <= 0 check below), so reject
 	 * short input before reading the header.
 	 */
 	if (__predict_false(mlen <= (int)ETHER_HDR_LEN))
 		return (NULL);
 
 	eh = ptr;
 	if (__predict_false(eh->evl_encap_proto == htons(ETHERTYPE_VLAN))) {
 		/* Same reasoning for the longer VLAN-encapsulated header. */
 		if (__predict_false(mlen <=
 		    (int)(ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN)))
 			return (NULL);
 		eth_type = eh->evl_proto;
 		if (update_data) {
 			/* strip priority and keep VLAN ID only */
 			parser->data.vlan_id = eh->evl_tag & htons(EVL_VLID_MASK);
 		}
 		/* advance to next header */
 		ptr = (uint8_t *)ptr + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
 		mlen -= (ETHER_HDR_LEN  + ETHER_VLAN_ENCAP_LEN);
 	} else {
 		eth_type = eh->evl_encap_proto;
 		/* advance to next header */
 		mlen -= ETHER_HDR_LEN;
 		ptr = (uint8_t *)ptr + ETHER_HDR_LEN;
 	}
 	if (__predict_false(mlen <= 0))
 		return (NULL);
 	switch (eth_type) {
 #ifdef INET
 	case htons(ETHERTYPE_IP):
 		parser->ip4 = ptr;
 		if (__predict_false(mlen < sizeof(struct ip)))
 			return (NULL);
 		/* Ensure there are no IPv4 options. */
 		if ((parser->ip4->ip_hl << 2) != sizeof (*parser->ip4))
 			break;
 		/* .. and the packet is not fragmented. */
 		if (parser->ip4->ip_off & htons(IP_MF|IP_OFFMASK))
 			break;
 		ptr = (uint8_t *)ptr + (parser->ip4->ip_hl << 2);
 		mlen -= sizeof(struct ip);
 		if (update_data) {
 			parser->data.s_addr.v4 = parser->ip4->ip_src;
 			parser->data.d_addr.v4 = parser->ip4->ip_dst;
 		}
 		switch (parser->ip4->ip_p) {
 		case IPPROTO_UDP:
 			if (__predict_false(mlen < sizeof(struct udphdr)))
 				return (NULL);
 			parser->udp = ptr;
 			if (update_data) {
 				parser->data.lro_type = LRO_TYPE_IPV4_UDP;
 				parser->data.s_port = parser->udp->uh_sport;
 				parser->data.d_port = parser->udp->uh_dport;
 			} else {
 				MPASS(parser->data.lro_type == LRO_TYPE_IPV4_UDP);
 			}
 			ptr = ((uint8_t *)ptr + sizeof(*parser->udp));
 			parser->total_hdr_len = (uint8_t *)ptr - (uint8_t *)old;
 			return (ptr);
 		case IPPROTO_TCP:
 			parser->tcp = ptr;
 			if (__predict_false(mlen < sizeof(struct tcphdr)))
 				return (NULL);
 			if (update_data) {
 				parser->data.lro_type = LRO_TYPE_IPV4_TCP;
 				parser->data.s_port = parser->tcp->th_sport;
 				parser->data.d_port = parser->tcp->th_dport;
 			} else {
 				MPASS(parser->data.lro_type == LRO_TYPE_IPV4_TCP);
 			}
 			/* The TCP options must fit as well. */
 			if (__predict_false(mlen < (parser->tcp->th_off << 2)))
 				return (NULL);
 			ptr = (uint8_t *)ptr + (parser->tcp->th_off << 2);
 			parser->total_hdr_len = (uint8_t *)ptr - (uint8_t *)old;
 			return (ptr);
 		default:
 			break;
 		}
 		break;
 #endif
 #ifdef INET6
 	case htons(ETHERTYPE_IPV6):
 		parser->ip6 = ptr;
 		if (__predict_false(mlen < sizeof(struct ip6_hdr)))
 			return (NULL);
 		ptr = (uint8_t *)ptr + sizeof(*parser->ip6);
 		if (update_data) {
 			parser->data.s_addr.v6 = parser->ip6->ip6_src;
 			parser->data.d_addr.v6 = parser->ip6->ip6_dst;
 		}
 		mlen -= sizeof(struct ip6_hdr);
 		switch (parser->ip6->ip6_nxt) {
 		case IPPROTO_UDP:
 			if (__predict_false(mlen < sizeof(struct udphdr)))
 				return (NULL);
 			parser->udp = ptr;
 			if (update_data) {
 				parser->data.lro_type = LRO_TYPE_IPV6_UDP;
 				parser->data.s_port = parser->udp->uh_sport;
 				parser->data.d_port = parser->udp->uh_dport;
 			} else {
 				MPASS(parser->data.lro_type == LRO_TYPE_IPV6_UDP);
 			}
 			ptr = (uint8_t *)ptr + sizeof(*parser->udp);
 			parser->total_hdr_len = (uint8_t *)ptr - (uint8_t *)old;
 			return (ptr);
 		case IPPROTO_TCP:
 			if (__predict_false(mlen < sizeof(struct tcphdr)))
 				return (NULL);
 			parser->tcp = ptr;
 			if (update_data) {
 				parser->data.lro_type = LRO_TYPE_IPV6_TCP;
 				parser->data.s_port = parser->tcp->th_sport;
 				parser->data.d_port = parser->tcp->th_dport;
 			} else {
 				MPASS(parser->data.lro_type == LRO_TYPE_IPV6_TCP);
 			}
 			/* The TCP options must fit as well. */
 			if (__predict_false(mlen < (parser->tcp->th_off << 2)))
 				return (NULL);
 			ptr = (uint8_t *)ptr + (parser->tcp->th_off << 2);
 			parser->total_hdr_len = (uint8_t *)ptr - (uint8_t *)old;
 			return (ptr);
 		default:
 			break;
 		}
 		break;
 #endif
 	default:
 		break;
 	}
 	/* Invalid packet - cannot parse */
 	return (NULL);
 }
 
 /*
  * Checksum flags that must all be set in m_pkthdr.csum_flags before
  * tcp_lro_parser() attempts to parse inner (VXLAN) headers.
  */
 static const int vxlan_csum = CSUM_INNER_L3_CALC | CSUM_INNER_L3_VALID |
     CSUM_INNER_L4_CALC | CSUM_INNER_L4_VALID;
 
 static inline struct lro_parser *
 tcp_lro_parser(struct mbuf *m, struct lro_parser *po, struct lro_parser *pi, bool update_data)
 {
 	void *data_ptr;
 
 	/* Try to parse outer headers first. */
 	data_ptr = tcp_lro_low_level_parser(m->m_data, po, update_data, false, m->m_len);
 	if (data_ptr == NULL || po->total_hdr_len > m->m_len)
 		return (NULL);
 
 	if (update_data) {
 		/* Store VLAN ID, if any. */
 		if (__predict_false(m->m_flags & M_VLANTAG)) {
 			po->data.vlan_id =
 			    htons(m->m_pkthdr.ether_vtag) & htons(EVL_VLID_MASK);
 		}
 		/* Store decrypted flag, if any. */
 		if (__predict_false((m->m_pkthdr.csum_flags &
 		    CSUM_TLS_MASK) == CSUM_TLS_DECRYPTED))
 			po->data.lro_flags |= LRO_FLAG_DECRYPTED;
 	}
 
 	switch (po->data.lro_type) {
 	case LRO_TYPE_IPV4_UDP:
 	case LRO_TYPE_IPV6_UDP:
 		/* Check for VXLAN headers. */
 		if ((m->m_pkthdr.csum_flags & vxlan_csum) != vxlan_csum)
 			break;
 
 		/* Try to parse inner headers. */
 		data_ptr = tcp_lro_low_level_parser(data_ptr, pi, update_data, true,
 						    (m->m_len - ((caddr_t)data_ptr - m->m_data)));
 		if (data_ptr == NULL || (pi->total_hdr_len + po->total_hdr_len) > m->m_len)
 			break;
 
 		/* Verify supported header types. */
 		switch (pi->data.lro_type) {
 		case LRO_TYPE_IPV4_TCP:
 		case LRO_TYPE_IPV6_TCP:
 			return (pi);
 		default:
 			break;
 		}
 		break;
 	case LRO_TYPE_IPV4_TCP:
 	case LRO_TYPE_IPV6_TCP:
 		if (update_data)
 			memset(pi, 0, sizeof(*pi));
 		return (po);
 	default:
 		break;
 	}
 	return (NULL);
 }
 
 /*
  * Trim bytes that follow the end of the IP packet (frame padding) from
  * the mbuf chain.
  *
  * Returns 0 when the chain already has, or was trimmed to, the length
  * implied by the IP header.  Returns TCP_LRO_CANNOT when the chain is
  * shorter than that, or po does not describe an IPv4/IPv6 TCP packet.
  */
 static inline int
 tcp_lro_trim_mbuf_chain(struct mbuf *m, const struct lro_parser *po)
 {
 	int ip_end;
 
 	/* Offset of the end of the IP packet from the start of the data. */
 	switch (po->data.lro_type) {
 #ifdef INET
 	case LRO_TYPE_IPV4_TCP:
 		ip_end = ((uint8_t *)po->ip4 - (uint8_t *)m->m_data) +
 		    ntohs(po->ip4->ip_len);
 		break;
 #endif
 #ifdef INET6
 	case LRO_TYPE_IPV6_TCP:
 		ip_end = ((uint8_t *)po->ip6 - (uint8_t *)m->m_data) +
 		    ntohs(po->ip6->ip6_plen) + sizeof(*po->ip6);
 		break;
 #endif
 	default:
 		return (TCP_LRO_CANNOT);
 	}
 
 	/* Common case: nothing to trim. */
 	if (__predict_true(m->m_pkthdr.len == ip_end))
 		return (0);
 
 	/* A chain shorter than the IP packet cannot be handled. */
 	if (m->m_pkthdr.len < ip_end)
 		return (TCP_LRO_CANNOT);
 
 	/* A negative count makes m_adj() trim from the tail. */
 	m_adj(m, ip_end - m->m_pkthdr.len);
 	return (0);
 }
 
 static struct tcphdr *
 tcp_lro_get_th(struct mbuf *m)
 {
 	return ((struct tcphdr *)((uint8_t *)m->m_data + m->m_pkthdr.lro_tcp_h_off));
 }
 
 /*
  * Free every packet on a list of packets linked through m_nextpkt.
  * Each packet is unlinked from its successor before it is freed.
  */
 static void
 lro_free_mbuf_chain(struct mbuf *m)
 {
 	struct mbuf *next_pkt;
 
 	for (; m != NULL; m = next_pkt) {
 		/* Remember the successor before the packet goes away. */
 		next_pkt = m->m_nextpkt;
 		m->m_nextpkt = NULL;
 		m_freem(m);
 	}
 }
 
 /*
  * Release everything held by the LRO control structure "lc": packets
  * still attached to active entries, the hash table, queued mbufs in
  * the sort array and the array itself.  Counters and pointers are
  * reset so the structure no longer refers to freed memory.
  */
 void
 tcp_lro_free(struct lro_ctrl *lc)
 {
 	struct lro_entry *entry;
 	unsigned idx;
 
 	/* Start over with an empty free list. */
 	LIST_INIT(&lc->lro_free);
 
 	/* Drop the packets of every entry that is still active. */
 	for (;;) {
 		entry = LIST_FIRST(&lc->lro_active);
 		if (entry == NULL)
 			break;
 		tcp_lro_active_remove(entry);
 		lro_free_mbuf_chain(entry->m_head);
 	}
 
 	/* Release the hash table. */
 	free(lc->lro_hash, M_LRO);
 	lc->lro_hash = NULL;
 	lc->lro_hashsz = 0;
 
 	/* Release mbufs that were queued but not yet processed. */
 	idx = 0;
 	while (idx != lc->lro_mbuf_count) {
 		m_freem(lc->lro_mbuf_data[idx].mb);
 		idx++;
 	}
 	lc->lro_mbuf_count = 0;
 
 	/* Release the array that held them. */
 	free(lc->lro_mbuf_data, M_LRO);
 	lc->lro_mbuf_data = NULL;
 }
 
 /*
  * Sum the 16-bit words of the TCP header "th", th_off 32-bit words in
  * total (so any TCP options are included), leaving out the value of
  * the th_sum field.
  *
  * Returns the sum folded down to 16 bits.  The result is not
  * complemented.
  */
 static uint16_t
 tcp_lro_rx_csum_tcphdr(const struct tcphdr *th)
 {
 	const uint16_t *ptr;
 	uint32_t csum;
 	uint16_t len;
 
 	/*
 	 * Start at minus th_sum (modulo 2^32).  The loop below adds the
 	 * th_sum word back in when it walks over that field, which
 	 * cancels it out of the total.
 	 */
 	csum = -th->th_sum;	/* exclude checksum field */
 	len = th->th_off;	/* number of 32-bit words to sum */
 	ptr = (const uint16_t *)th;
 	while (len--) {
 		/* Each 32-bit header word contributes two 16-bit words. */
 		csum += *ptr;
 		ptr++;
 		csum += *ptr;
 		ptr++;
 	}
 	/* Fold the carries back into the low 16 bits. */
 	while (csum > 0xffff)
 		csum = (csum >> 16) + (csum & 0xffff);
 
 	return (csum);
 }
 
 /*
  * Starting from "tcp_csum", add the complement of the pseudo header
  * checksum (IPv4 or IPv6, selected by pa->data.lro_type) and the
  * complement of the TCP header sum, then fold the total to 16 bits.
  *
  * For any other lro_type the pseudo header checksum is taken as zero.
  */
 static uint16_t
 tcp_lro_rx_csum_data(const struct lro_parser *pa, uint16_t tcp_csum)
 {
 	uint32_t c;
 	uint16_t cs;
 
 	c = tcp_csum;
 
 	switch (pa->data.lro_type) {
 #ifdef INET6
 	case LRO_TYPE_IPV6_TCP:
 		/* Compute full pseudo IPv6 header checksum. */
 		cs = in6_cksum_pseudo(pa->ip6, ntohs(pa->ip6->ip6_plen), pa->ip6->ip6_nxt, 0);
 		break;
 #endif
 #ifdef INET
 	case LRO_TYPE_IPV4_TCP:
 		/* Compute full pseudo IPv4 header checksum. */
 		/* Length used is ip_len minus the size of struct ip. */
 		cs = in_addword(ntohs(pa->ip4->ip_len) - sizeof(*pa->ip4), IPPROTO_TCP);
 		cs = in_pseudo(pa->ip4->ip_src.s_addr, pa->ip4->ip_dst.s_addr, htons(cs));
 		break;
 #endif
 	default:
 		cs = 0;		/* Keep compiler happy. */
 		break;
 	}
 
 	/* Complement checksum. */
 	cs = ~cs;
 	c += cs;
 
 	/* Remove TCP header checksum. */
 	cs = ~tcp_lro_rx_csum_tcphdr(pa->tcp);
 	c += cs;
 
 	/* Compute checksum remainder. */
 	while (c > 0xffff)
 		c = (c >> 16) + (c & 0xffff);
 
 	return (c);
 }
 
 /*
  * Flush every entry on the active list of "lc".  Each entry is taken
  * off the active list first and then handed to tcp_lro_flush().
  */
 static void
 tcp_lro_rx_done(struct lro_ctrl *lc)
 {
 	struct lro_entry *entry;
 
 	for (;;) {
 		entry = LIST_FIRST(&lc->lro_active);
 		if (entry == NULL)
 			break;
 		tcp_lro_active_remove(entry);
 		tcp_lro_flush(lc, entry);
 	}
 }
 
 /*
  * Flush every active entry of "lc" that currently holds packets.
  */
 static void
 tcp_lro_flush_active(struct lro_ctrl *lc)
 {
 	struct lro_entry *le, *le_tmp;
 
 	/*
 	 * Walk through the list of le entries and flush every one
 	 * that has packets.  This is called because we have an
 	 * inbound packet (e.g. SYN) that has to have all others
 	 * flushed in front of it.  Note we have to do the remove
 	 * because tcp_lro_flush() assumes that the entry is being
 	 * freed.  This is ok, it will just get reallocated again
 	 * like it was new.
 	 *
 	 * The safe iterator is required: tcp_lro_flush() zeroes the
 	 * entry and links it onto the free list, so the successor on
 	 * the active list must be fetched before the entry is flushed.
 	 */
 	LIST_FOREACH_SAFE(le, &lc->lro_active, next, le_tmp) {
 		if (le->m_head != NULL) {
 			tcp_lro_active_remove(le);
 			tcp_lro_flush(lc, le);
 		}
 	}
 }
 
 void
 tcp_lro_flush_inactive(struct lro_ctrl *lc, const struct timeval *timeout)
 {
 	struct lro_entry *le, *le_tmp;
 	uint64_t now, tov;
 	struct bintime bt;
 
 	NET_EPOCH_ASSERT();
 	if (LIST_EMPTY(&lc->lro_active))
 		return;
 
 	/* get timeout time and current time in ns */
 	binuptime(&bt);
 	now = bintime2ns(&bt);
 	tov = ((timeout->tv_sec * 1000000000) + (timeout->tv_usec * 1000));
 	LIST_FOREACH_SAFE(le, &lc->lro_active, next, le_tmp) {
 		if (now >= (bintime2ns(&le->alloc_time) + tov)) {
 			tcp_lro_active_remove(le);
 			tcp_lro_flush(lc, le);
 		}
 	}
 }
 
 #ifdef INET
 /*
  * Validate the IPv4 header checksum of a received packet.
  *
  * When the mbuf is flagged CSUM_IP_CHECKED the verdict recorded in
  * CSUM_IP_VALID is used; otherwise the header checksum is computed
  * here.  On a bad checksum lc->lro_bad_csum is incremented and
  * TCP_LRO_CANNOT is returned; otherwise 0 is returned.
  */
 static int
 tcp_lro_rx_ipv4(struct lro_ctrl *lc, struct mbuf *m, struct ip *ip4)
 {
 	bool hdr_ok;
 
 	/* Legacy IP has a header checksum that needs to be correct. */
 	if ((m->m_pkthdr.csum_flags & CSUM_IP_CHECKED) != 0)
 		hdr_ok = ((m->m_pkthdr.csum_flags & CSUM_IP_VALID) != 0);
 	else
 		hdr_ok = (in_cksum_hdr(ip4) == 0);
 
 	if (__predict_false(!hdr_ok)) {
 		lc->lro_bad_csum++;
 		return (TCP_LRO_CANNOT);
 	}
 	return (0);
 }
 #endif
 
 #ifdef TCPHPTS
 /*
  * Emit a TCP_LOG_LRO event for "tp" describing the state of the LRO
  * entry "le" and, optionally, the mbuf "m" (may be NULL).
  *
  * Nothing is logged when tp->t_logstate is TCP_LOG_STATE_OFF.  "frm"
  * identifies the call site and is stored in flex8; the remaining
  * arguments are copied into the log record as shown below.
  */
 static void
 tcp_lro_log(struct tcpcb *tp, const struct lro_ctrl *lc,
     const struct lro_entry *le, const struct mbuf *m,
     int frm, int32_t tcp_data_len, uint32_t th_seq,
     uint32_t th_ack, uint16_t th_win)
 {
 	if (tp->t_logstate != TCP_LOG_STATE_OFF) {
 		union tcp_log_stackspecific log;
 		struct timeval tv, btv;
 		uint32_t cts;
 
 		cts = tcp_get_usecs(&tv);
 		memset(&log, 0, sizeof(union tcp_log_stackspecific));
 		log.u_bbr.flex8 = frm;
 		log.u_bbr.flex1 = tcp_data_len;
 		if (m)
 			log.u_bbr.flex2 = m->m_pkthdr.len;
 		else
 			log.u_bbr.flex2 = 0;
 		/* Head mbuf details are only logged when the entry has one. */
 		if (le->m_head) {
 			log.u_bbr.flex3 = le->m_head->m_pkthdr.lro_nsegs;
 			log.u_bbr.flex4 = le->m_head->m_pkthdr.lro_tcp_d_len;
 			log.u_bbr.flex5 = le->m_head->m_pkthdr.len;
 			log.u_bbr.delRate = le->m_head->m_flags;
 			log.u_bbr.rttProp = le->m_head->m_pkthdr.rcv_tstmp;
 		}
 		log.u_bbr.inflight = th_seq;
 		log.u_bbr.delivered = th_ack;
 		log.u_bbr.timeStamp = cts;
 		log.u_bbr.epoch = le->next_seq;
 		log.u_bbr.lt_epoch = le->ack_seq;
 		/*
 		 * NOTE(review): pacing_gain is assigned again further
 		 * down, so th_win never reaches the log record.  Confirm
 		 * which of the two values is meant to be logged.
 		 */
 		log.u_bbr.pacing_gain = th_win;
 		log.u_bbr.cwnd_gain = le->window;
 		log.u_bbr.lost = curcpu;
 		/* The mbuf pointers themselves are logged as integers. */
 		log.u_bbr.cur_del_rate = (uintptr_t)m;
 		log.u_bbr.bw_inuse = (uintptr_t)le->m_head;
 		bintime2timeval(&lc->lro_last_queue_time, &btv);
 		log.u_bbr.flex6 = tcp_tv_to_usectick(&btv);
 		log.u_bbr.flex7 = le->compressed;
 		log.u_bbr.pacing_gain = le->uncompressed;
 		/* Record whether we are running inside the network epoch. */
 		if (in_epoch(net_epoch_preempt))
 			log.u_bbr.inhpts = 1;
 		else
 			log.u_bbr.inhpts = 0;
 		TCP_LOG_EVENTP(tp, NULL,
 			       &tp->t_inpcb->inp_socket->so_rcv,
 			       &tp->t_inpcb->inp_socket->so_snd,
 			       TCP_LOG_LRO, 0,
 			       0, &log, false, &tv);
 	}
 }
 #endif
 
 /*
  * Store "value" into the 16-bit field at "ptr" and report through
  * "psum" the change this makes to a one's complement sum covering the
  * field: the complement of the old contents plus the new value,
  * folded to 16 bits.
  */
 static inline void
 tcp_lro_assign_and_checksum_16(uint16_t *ptr, uint16_t value, uint16_t *psum)
 {
 	uint32_t delta;
 
 	/* ~old + new; the 16-bit complement equals 0xffff - old. */
 	delta = (uint32_t)(uint16_t)~*ptr + value;
 
 	/* End-around carry. */
 	while (delta > 0xffff)
 		delta = (delta & 0xffff) + (delta >> 16);
 
 	*ptr = value;
 	*psum = delta;
 }
 
 /*
  * Rewrite the length fields (and, for TCP, the ACK, window and
  * timestamp fields) of the headers described by "pa" for a merged
  * packet carrying "payload_len" bytes behind those headers, and
  * adjust the affected checksums incrementally.
  *
  * "le" supplies the latest ACK, window and timestamp values; it is
  * only dereferenced on the TCP path.  "delta_sum" is the checksum
  * contribution handed in from the previous stage.
  *
  * Returns the folded sum of all changes made here plus "delta_sum",
  * for use by the next (outer) stage, or 0 for an unsupported
  * lro_type.
  *
  * Note that "pa" is const but the headers it points at are modified.
  *
  * Usage of the temp[] slots:
  *   temp[0]  delta from rewriting the IP length field
  *   temp[1]  delta from rewriting the IPv4 header checksum (IPv4 only)
  *   temp[2]  TCP: header sum before the update; UDP: uh_ulen delta
  *   temp[3]  TCP: header sum after the update; UDP: uh_sum delta
  *   temp[4]  TCP: delta from rewriting th_sum
  */
 static uint16_t
 tcp_lro_update_checksum(const struct lro_parser *pa, const struct lro_entry *le,
     uint16_t payload_len, uint16_t delta_sum)
 {
 	uint32_t csum;
 	uint16_t tlen;
 	uint16_t temp[5] = {};
 
 	switch (pa->data.lro_type) {
 	case LRO_TYPE_IPV4_TCP:
 		/* Compute new IPv4 length. */
 		tlen = (pa->ip4->ip_hl << 2) + (pa->tcp->th_off << 2) + payload_len;
 		tcp_lro_assign_and_checksum_16(&pa->ip4->ip_len, htons(tlen), &temp[0]);
 
 		/* Subtract delta from current IPv4 checksum. */
 		csum = pa->ip4->ip_sum + 0xffff - temp[0];
 		while (csum > 0xffff)
 			csum = (csum >> 16) + (csum & 0xffff);
 		tcp_lro_assign_and_checksum_16(&pa->ip4->ip_sum, csum, &temp[1]);
 		goto update_tcp_header;
 
 	case LRO_TYPE_IPV6_TCP:
 		/* Compute new IPv6 length. */
 		tlen = (pa->tcp->th_off << 2) + payload_len;
 		tcp_lro_assign_and_checksum_16(&pa->ip6->ip6_plen, htons(tlen), &temp[0]);
 		goto update_tcp_header;
 
 	case LRO_TYPE_IPV4_UDP:
 		/* Compute new IPv4 length. */
 		tlen = (pa->ip4->ip_hl << 2) + sizeof(*pa->udp) + payload_len;
 		tcp_lro_assign_and_checksum_16(&pa->ip4->ip_len, htons(tlen), &temp[0]);
 
 		/* Subtract delta from current IPv4 checksum. */
 		csum = pa->ip4->ip_sum + 0xffff - temp[0];
 		while (csum > 0xffff)
 			csum = (csum >> 16) + (csum & 0xffff);
 		tcp_lro_assign_and_checksum_16(&pa->ip4->ip_sum, csum, &temp[1]);
 		goto update_udp_header;
 
 	case LRO_TYPE_IPV6_UDP:
 		/* Compute new IPv6 length. */
 		tlen = sizeof(*pa->udp) + payload_len;
 		tcp_lro_assign_and_checksum_16(&pa->ip6->ip6_plen, htons(tlen), &temp[0]);
 		goto update_udp_header;
 
 	default:
 		return (0);
 	}
 
 update_tcp_header:
 	/* Compute current TCP header checksum. */
 	temp[2] = tcp_lro_rx_csum_tcphdr(pa->tcp);
 
 	/* Incorporate the latest ACK into the TCP header. */
 	pa->tcp->th_ack = le->ack_seq;
 	pa->tcp->th_win = le->window;
 
 	/* Incorporate latest timestamp into the TCP header. */
 	if (le->timestamp != 0) {
 		uint32_t *ts_ptr;
 
 		/*
 		 * Words 1 and 2 after the TCP header hold the timestamp
 		 * value and echo reply.  tsval is converted with htonl();
 		 * tsecr is stored as is.
 		 */
 		ts_ptr = (uint32_t *)(pa->tcp + 1);
 		ts_ptr[1] = htonl(le->tsval);
 		ts_ptr[2] = le->tsecr;
 	}
 
 	/* Compute new TCP header checksum. */
 	temp[3] = tcp_lro_rx_csum_tcphdr(pa->tcp);
 
 	/* Compute new TCP checksum. */
 	csum = pa->tcp->th_sum + 0xffff - delta_sum +
 	    0xffff - temp[0] + 0xffff - temp[3] + temp[2];
 	while (csum > 0xffff)
 		csum = (csum >> 16) + (csum & 0xffff);
 
 	/* Assign new TCP checksum. */
 	tcp_lro_assign_and_checksum_16(&pa->tcp->th_sum, csum, &temp[4]);
 
 	/* Compute all modifications affecting next checksum. */
 	csum = temp[0] + temp[1] + 0xffff - temp[2] +
 	    temp[3] + temp[4] + delta_sum;
 	while (csum > 0xffff)
 		csum = (csum >> 16) + (csum & 0xffff);
 
 	/* Return delta checksum to next stage, if any. */
 	return (csum);
 
 update_udp_header:
 	tlen = sizeof(*pa->udp) + payload_len;
 	/* Assign new UDP length and compute checksum delta. */
 	tcp_lro_assign_and_checksum_16(&pa->udp->uh_ulen, htons(tlen), &temp[2]);
 
 	/* Check if there is a UDP checksum. */
 	if (__predict_false(pa->udp->uh_sum != 0)) {
 		/* Compute new UDP checksum. */
 		csum = pa->udp->uh_sum + 0xffff - delta_sum +
 		    0xffff - temp[0] + 0xffff - temp[2];
 		while (csum > 0xffff)
 			csum = (csum >> 16) + (csum & 0xffff);
 		/* Assign new UDP checksum. */
 		tcp_lro_assign_and_checksum_16(&pa->udp->uh_sum, csum, &temp[3]);
 	}
 
 	/* Compute all modifications affecting next checksum. */
 	csum = temp[0] + temp[1] + temp[2] + temp[3] + delta_sum;
 	while (csum > 0xffff)
 		csum = (csum >> 16) + (csum & 0xffff);
 
 	/* Return delta checksum to next stage, if any. */
 	return (csum);
 }
 
 /*
  * Hand the head packet of "le" to the interface input routine.
  *
  * When segments were merged into the head (le->needs_merge), the
  * headers and checksums are brought up to date first and the mbuf is
  * marked as having valid checksums.  For a tunneled packet (inner type
  * is TCP) the inner headers are updated first and the resulting delta
  * is fed into the update of the outer headers.
  */
 static void
 tcp_flush_out_entry(struct lro_ctrl *lc, struct lro_entry *le)
 {
 	/* Check if we need to recompute any checksums. */
 	if (le->needs_merge) {
 		uint16_t csum;
 
 		switch (le->inner.data.lro_type) {
 		case LRO_TYPE_IPV4_TCP:
 			/* Inner headers first, then the outer ones. */
 			csum = tcp_lro_update_checksum(&le->inner, le,
 			    le->m_head->m_pkthdr.lro_tcp_d_len,
 			    le->m_head->m_pkthdr.lro_tcp_d_csum);
 			csum = tcp_lro_update_checksum(&le->outer, NULL,
 			    le->m_head->m_pkthdr.lro_tcp_d_len +
 			    le->inner.total_hdr_len, csum);
 			le->m_head->m_pkthdr.csum_flags = CSUM_DATA_VALID |
 			    CSUM_PSEUDO_HDR | CSUM_IP_CHECKED | CSUM_IP_VALID;
 			le->m_head->m_pkthdr.csum_data = 0xffff;
 			if (__predict_false(le->outer.data.lro_flags & LRO_FLAG_DECRYPTED))
 				le->m_head->m_pkthdr.csum_flags |= CSUM_TLS_DECRYPTED;
 			break;
 		case LRO_TYPE_IPV6_TCP:
 			/* Inner headers first, then the outer ones. */
 			csum = tcp_lro_update_checksum(&le->inner, le,
 			    le->m_head->m_pkthdr.lro_tcp_d_len,
 			    le->m_head->m_pkthdr.lro_tcp_d_csum);
 			csum = tcp_lro_update_checksum(&le->outer, NULL,
 			    le->m_head->m_pkthdr.lro_tcp_d_len +
 			    le->inner.total_hdr_len, csum);
 			le->m_head->m_pkthdr.csum_flags = CSUM_DATA_VALID |
 			    CSUM_PSEUDO_HDR;
 			le->m_head->m_pkthdr.csum_data = 0xffff;
 			if (__predict_false(le->outer.data.lro_flags & LRO_FLAG_DECRYPTED))
 				le->m_head->m_pkthdr.csum_flags |= CSUM_TLS_DECRYPTED;
 			break;
 		case LRO_TYPE_NONE:
 			/* No inner headers: only the outer ones are updated. */
 			switch (le->outer.data.lro_type) {
 			case LRO_TYPE_IPV4_TCP:
 				csum = tcp_lro_update_checksum(&le->outer, le,
 				    le->m_head->m_pkthdr.lro_tcp_d_len,
 				    le->m_head->m_pkthdr.lro_tcp_d_csum);
 				le->m_head->m_pkthdr.csum_flags = CSUM_DATA_VALID |
 				    CSUM_PSEUDO_HDR | CSUM_IP_CHECKED | CSUM_IP_VALID;
 				le->m_head->m_pkthdr.csum_data = 0xffff;
 				if (__predict_false(le->outer.data.lro_flags & LRO_FLAG_DECRYPTED))
 					le->m_head->m_pkthdr.csum_flags |= CSUM_TLS_DECRYPTED;
 				break;
 			case LRO_TYPE_IPV6_TCP:
 				csum = tcp_lro_update_checksum(&le->outer, le,
 				    le->m_head->m_pkthdr.lro_tcp_d_len,
 				    le->m_head->m_pkthdr.lro_tcp_d_csum);
 				le->m_head->m_pkthdr.csum_flags = CSUM_DATA_VALID |
 				    CSUM_PSEUDO_HDR;
 				le->m_head->m_pkthdr.csum_data = 0xffff;
 				if (__predict_false(le->outer.data.lro_flags & LRO_FLAG_DECRYPTED))
 					le->m_head->m_pkthdr.csum_flags |= CSUM_TLS_DECRYPTED;
 				break;
 			default:
 				break;
 			}
 			break;
 		default:
 			break;
 		}
 	}
 
 	/*
 	 * Break any chain.  In the singleton case m_nextpkt has not
 	 * been cleared yet; in the other cases tcp_push_and_replace()
 	 * has already set it to NULL.
 	 */
 	le->m_head->m_nextpkt = NULL;
 	/* Account for every segment that went into this packet. */
 	lc->lro_queued += le->m_head->m_pkthdr.lro_nsegs;
 	(*lc->ifp->if_input)(lc->ifp, le->m_head);
 }
 
 /*
  * Make "m" the head packet of the LRO entry "le" and load the entry's
  * TCP state (timestamp, next expected sequence number, ACK, window,
  * flags) from the TCP header "th" of that packet.  The entry starts
  * out with nothing to merge.
  */
 static void
 tcp_set_entry_to_mbuf(struct lro_ctrl *lc, struct lro_entry *le,
     struct mbuf *m, struct tcphdr *th)
 {
 	uint32_t *opt;
 	uint16_t optlen;
 	uint16_t payload;
 
 	/* TCP options, if any, start right behind the fixed header. */
 	opt = (uint32_t *)(th + 1);
 	optlen = (th->th_off << 2);
 	optlen -= sizeof(*th);
 
 	/* Only a lone timestamp option is recognised. */
 	if (optlen != 0 &&
 	    __predict_true(optlen == TCPOLEN_TSTAMP_APPA &&
 	    *opt == TCP_LRO_TS_OPTION)) {
 		le->timestamp = 1;
 		le->tsval = ntohl(opt[1]);
 		le->tsecr = opt[2];
 	} else {
 		/* We failed to find the timestamp option. */
 		le->timestamp = 0;
 	}
 
 	payload = m->m_pkthdr.lro_tcp_d_len;
 
 	/* Sequence number expected after this segment. */
 	le->next_seq = ntohl(th->th_seq) + payload;
 
 	/* ACK and window are kept exactly as found in the header. */
 	le->ack_seq = th->th_ack;
 	le->window = th->th_win;
 	le->flags = tcp_get_flags(th);
 	le->needs_merge = 0;
 
 	/* The new packet is both head and, via m_last(), tail. */
 	le->m_head = m;
 	le->m_tail = m_last(m);
 }
 
 /*
  * Send the current head packet of "le" up the stack and make "m" the
  * new head of the entry.  The list of packets that followed the old
  * head is preserved and re-attached behind "m".
  */
 static void
 tcp_push_and_replace(struct lro_ctrl *lc, struct lro_entry *le, struct mbuf *m)
 {
 	struct lro_parser *pa;
 
 	/*
 	 * Push up the stack of the current entry
 	 * and replace it with "m".
 	 */
 	struct mbuf *msave;
 
 	/* Grab off the next and save it */
 	msave = le->m_head->m_nextpkt;
 	le->m_head->m_nextpkt = NULL;
 
 	/* Now push out the old entry */
 	tcp_flush_out_entry(lc, le);
 
 	/*
 	 * Re-parse new header, should not fail.  The parser is called
 	 * with update_data == false, so le->outer/le->inner get their
 	 * header pointers refreshed for "m" without the data portion
 	 * being rewritten.
 	 */
 	pa = tcp_lro_parser(m, &le->outer, &le->inner, false);
 	KASSERT(pa != NULL,
 	    ("tcp_push_and_replace: LRO parser failed on m=%p\n", m));
 
 	/*
 	 * Now to replace the data properly in the entry
 	 * we have to reset the TCP header and
 	 * other fields.
 	 */
 	tcp_set_entry_to_mbuf(lc, le, m, pa->tcp);
 
 	/* Restore the next list */
 	m->m_nextpkt = msave;
 }
 
 static void
 tcp_lro_mbuf_append_pkthdr(struct lro_entry *le, const struct mbuf *p)
 {
 	struct mbuf *m;
 	uint32_t csum;
 
 	m = le->m_head;
 	if (m->m_pkthdr.lro_nsegs == 1) {
 		/* Compute relative checksum. */
 		csum = p->m_pkthdr.lro_tcp_d_csum;
 	} else {
 		/* Merge TCP data checksums. */
 		csum = (uint32_t)m->m_pkthdr.lro_tcp_d_csum +
 		    (uint32_t)p->m_pkthdr.lro_tcp_d_csum;
 		while (csum > 0xffff)
 			csum = (csum >> 16) + (csum & 0xffff);
 	}
 
 	/* Update various counters. */
 	m->m_pkthdr.len += p->m_pkthdr.lro_tcp_d_len;
 	m->m_pkthdr.lro_tcp_d_csum = csum;
 	m->m_pkthdr.lro_tcp_d_len += p->m_pkthdr.lro_tcp_d_len;
 	m->m_pkthdr.lro_nsegs += p->m_pkthdr.lro_nsegs;
 	le->needs_merge = 1;
 }
 
 /*
  * Merge the packets queued behind the head of "le" (linked through
  * m_nextpkt) into the head wherever that is permitted.
  *
  * A queued packet that cannot be merged causes the current head to be
  * sent up the stack and the packet to become the new head
  * (tcp_push_and_replace()), after which processing restarts.  On
  * return exactly one packet, the head, is left in the entry; flushing
  * it is up to the caller.
  */
 static void
 tcp_lro_condense(struct lro_ctrl *lc, struct lro_entry *le)
 {
 	/*
 	 * Walk through the mbuf chain we
 	 * have on tap and compress/condense
 	 * as required.
 	 */
 	uint32_t *ts_ptr;
 	struct mbuf *m;
 	struct tcphdr *th;
 	uint32_t tcp_data_len_total;
 	uint32_t tcp_data_seg_total;
 	uint16_t tcp_data_len;
 	uint16_t tcp_opt_len;
 
 	/*
 	 * First we must check the lead (m_head)
 	 * we must make sure that it is *not*
 	 * something that should be sent up
 	 * right away (sack etc).
 	 */
 again:
 	m = le->m_head->m_nextpkt;
 	if (m == NULL) {
 		/* Just one left. */
 		return;
 	}
 
 	/* Look at the packet right behind the head. */
 	th = tcp_lro_get_th(m);
 	tcp_opt_len = (th->th_off << 2);
 	tcp_opt_len -= sizeof(*th);
 	ts_ptr = (uint32_t *)(th + 1);
 
 	if (tcp_opt_len != 0 && __predict_false(tcp_opt_len != TCPOLEN_TSTAMP_APPA ||
 	    *ts_ptr != TCP_LRO_TS_OPTION)) {
 		/*
 		 * Its not the timestamp. We can't
 		 * use this guy as the head.
 		 */
 		le->m_head->m_nextpkt = m->m_nextpkt;
 		tcp_push_and_replace(lc, le, m);
 		goto again;
 	}
 	if ((tcp_get_flags(th) & ~(TH_ACK | TH_PUSH)) != 0) {
 		/*
 		 * Make sure that previously seen segments/ACKs are delivered
 		 * before this segment, e.g. FIN.
 		 */
 		le->m_head->m_nextpkt = m->m_nextpkt;
 		tcp_push_and_replace(lc, le, m);
 		goto again;
 	}
 	while((m = le->m_head->m_nextpkt) != NULL) {
 		/*
 		 * condense m into le, first
 		 * pull m out of the list.
 		 */
 		le->m_head->m_nextpkt = m->m_nextpkt;
 		m->m_nextpkt = NULL;
 		/* Setup my data */
 		tcp_data_len = m->m_pkthdr.lro_tcp_d_len;
 		th = tcp_lro_get_th(m);
 		ts_ptr = (uint32_t *)(th + 1);
 		tcp_opt_len = (th->th_off << 2);
 		tcp_opt_len -= sizeof(*th);
 		tcp_data_len_total = le->m_head->m_pkthdr.lro_tcp_d_len + tcp_data_len;
 		tcp_data_seg_total = le->m_head->m_pkthdr.lro_nsegs + m->m_pkthdr.lro_nsegs;
 
 		if (tcp_data_seg_total >= lc->lro_ackcnt_lim ||
 		    tcp_data_len_total >= lc->lro_length_lim) {
 			/* Flush now if appending will result in overflow. */
 			tcp_push_and_replace(lc, le, m);
 			goto again;
 		}
 		if (tcp_opt_len != 0 &&
 		    __predict_false(tcp_opt_len != TCPOLEN_TSTAMP_APPA ||
 		    *ts_ptr != TCP_LRO_TS_OPTION)) {
 			/*
 			 * Maybe a sack in the new one? We need to
 			 * start all over after flushing the
 			 * current le. We will go up to the beginning
 			 * and flush it (calling the replace again possibly
 			 * or just returning).
 			 */
 			tcp_push_and_replace(lc, le, m);
 			goto again;
 		}
 		if ((tcp_get_flags(th) & ~(TH_ACK | TH_PUSH)) != 0) {
 			/* Flags other than ACK/PUSH are never merged. */
 			tcp_push_and_replace(lc, le, m);
 			goto again;
 		}
 		if (tcp_opt_len != 0) {
 			uint32_t tsval = ntohl(*(ts_ptr + 1));
 			/* Make sure timestamp values are increasing. */
 			if (TSTMP_GT(le->tsval, tsval))  {
 				tcp_push_and_replace(lc, le, m);
 				goto again;
 			}
 			le->tsval = tsval;
 			le->tsecr = *(ts_ptr + 2);
 		}
 		/* Try to append the new segment. */
 		if (__predict_false(ntohl(th->th_seq) != le->next_seq ||
 				    ((tcp_get_flags(th) & TH_ACK) !=
 				      (le->flags & TH_ACK)) ||
 				    (tcp_data_len == 0 &&
 				     le->ack_seq == th->th_ack &&
 				     le->window == th->th_win))) {
 			/* Out of order packet, non-ACK + ACK or dup ACK. */
 			tcp_push_and_replace(lc, le, m);
 			goto again;
 		}
 		/*
 		 * Take over ACK and window when the segment carries data
 		 * or acknowledges more than we have seen; with an equal
 		 * ACK only a larger window is taken over.
 		 */
 		if (tcp_data_len != 0 ||
 		    SEQ_GT(ntohl(th->th_ack), ntohl(le->ack_seq))) {
 			le->next_seq += tcp_data_len;
 			le->ack_seq = th->th_ack;
 			le->window = th->th_win;
 			le->needs_merge = 1;
 		} else if (th->th_ack == le->ack_seq) {
 			if (WIN_GT(th->th_win, le->window)) {
 				le->window = th->th_win;
 				le->needs_merge = 1;
 			}
 		}
 
 		/* A pure ACK has been absorbed into the entry; drop it. */
 		if (tcp_data_len == 0) {
 			m_freem(m);
 			continue;
 		}
 
 		/* Merge TCP data checksum and length to head mbuf. */
 		tcp_lro_mbuf_append_pkthdr(le, m);
 
 		/*
 		 * Adjust the mbuf so that m_data points to the first byte of
 		 * the ULP payload.  Adjust the mbuf to avoid complications and
 		 * append new segment to existing mbuf chain.
 		 */
 		m_adj(m, m->m_pkthdr.len - tcp_data_len);
 		m_demote_pkthdr(m);
 		le->m_tail->m_next = m;
 		le->m_tail = m_last(m);
 	}
 }
 
 #ifdef TCPHPTS
 /*
  * Move the packet list of "le" (m_head .. m_last_mbuf) onto the tail
  * of the input queue of "tp" (t_in_pkt / t_tail_pkt).  The entry no
  * longer references the packets afterwards.
  *
  * The inpcb must be write-locked by the caller.
  */
 static void
 tcp_queue_pkts(struct inpcb *inp, struct tcpcb *tp, struct lro_entry *le)
 {
 	INP_WLOCK_ASSERT(inp);
 
 	if (tp->t_in_pkt != NULL) {
 		/* Queue is in use: link behind the current tail. */
 		tp->t_tail_pkt->m_nextpkt = le->m_head;
 	} else {
 		/* Queue is empty: our list becomes the queue. */
 		tp->t_in_pkt = le->m_head;
 	}
 	tp->t_tail_pkt = le->m_last_mbuf;
 
 	/* The packets now belong to the tcpcb. */
 	le->m_head = NULL;
 	le->m_last_mbuf = NULL;
 }
 
 /*
  * Find or allocate the mbuf into which the next compressed ACK entry
  * (struct tcp_ackent) is to be stored.
  *
  * When "can_append_old_cmp" is true and the last mbuf on the tcpcb's
  * input queue is an M_ACKCMP mbuf with room for one more entry, that
  * mbuf is returned and *new_m is set to 0.  Otherwise a fresh mbuf is
  * allocated, flagged M_ACKCMP, and *new_m is set to 1.
  *
  * Returns NULL when the inpcb has no tcpcb or the allocation fails;
  * *new_m is left untouched in that case.
  */
 static struct mbuf *
 tcp_lro_get_last_if_ackcmp(struct lro_ctrl *lc, struct lro_entry *le,
     struct inpcb *inp, int32_t *new_m, bool can_append_old_cmp)
 {
 	struct tcpcb *tp;
 	struct mbuf *m;
 
 	tp = intotcpcb(inp);
 	if (__predict_false(tp == NULL))
 		return (NULL);
 
 	/* Look at the last mbuf if any in queue */
  	if (can_append_old_cmp) {
 		m = tp->t_tail_pkt;
 		if (m != NULL && (m->m_flags & M_ACKCMP) != 0) {
 			if (M_TRAILINGSPACE(m) >= sizeof(struct tcp_ackent)) {
 				tcp_lro_log(tp, lc, le, NULL, 23, 0, 0, 0, 0);
 				*new_m = 0;
 				counter_u64_add(tcp_extra_mbuf, 1);
 				return (m);
 			} else {
 				/*
 				 * Mark we ran out of space; from now on
 				 * cluster mbufs are allocated (see below).
 				 */
 				inp->inp_flags2 |= INP_MBUF_L_ACKS;
 			}
 		}
 	}
 	/* Decide mbuf size. */
 	tcp_lro_log(tp, lc, le, NULL, 21, 0, 0, 0, 0);
 	if (inp->inp_flags2 & INP_MBUF_L_ACKS)
 		m = m_getcl(M_NOWAIT, MT_DATA, M_ACKCMP | M_PKTHDR);
 	else
 		m = m_gethdr(M_NOWAIT, MT_DATA);
 
 	if (__predict_false(m == NULL)) {
 		counter_u64_add(tcp_would_have_but, 1);
 		return (NULL);
 	}
 	counter_u64_add(tcp_comp_total, 1);
  	m->m_pkthdr.rcvif = lc->ifp;
 	m->m_flags |= M_ACKCMP;
 	*new_m = 1;
 	return (m);
 }
 
 /*
  * Look up the inpcb matching the addresses and ports recorded in
  * "pa", restricted to the interface "ifp".  The lookup is done with
  * INPLOOKUP_WLOCKPCB.
  *
  * Returns the result of the lookup, or NULL when "pa" does not
  * describe TCP over a compiled-in address family.
  */
 static struct inpcb *
 tcp_lro_lookup(struct ifnet *ifp, struct lro_parser *pa)
 {
 	switch (pa->data.lro_type) {
 #ifdef INET6
 	case LRO_TYPE_IPV6_TCP:
 		return (in6_pcblookup(&V_tcbinfo,
 		    &pa->data.s_addr.v6, pa->data.s_port,
 		    &pa->data.d_addr.v6, pa->data.d_port,
 		    INPLOOKUP_WLOCKPCB, ifp));
 #endif
 #ifdef INET
 	case LRO_TYPE_IPV4_TCP:
 		return (in_pcblookup(&V_tcbinfo,
 		    pa->data.s_addr.v4, pa->data.s_port,
 		    pa->data.d_addr.v4, pa->data.d_port,
 		    INPLOOKUP_WLOCKPCB, ifp));
 #endif
 	default:
 		return (NULL);
 	}
 }
 
 /*
  * Decide whether the segment in "m" with TCP header "th" may be
  * ACK-compressed.
  *
  * Outputs:
  *   *ppts        points at the option words behind the TCP header when
  *                the option area has the size of an aligned timestamp
  *                option, otherwise NULL.
  *   *other_opts  true when options other than a lone timestamp are
  *                present (e.g. SACK); used by the caller to decide
  *                whether a wakeup is wanted.
  *
  * Returns true only when the options are absent or a lone timestamp,
  * the flags are limited to ACK, PUSH, ECE and CWR with ACK set, and
  * the segment carries no data.
  */
 static inline bool
 tcp_lro_ack_valid(struct mbuf *m, struct tcphdr *th, uint32_t **ppts, bool *other_opts)
 {
 	bool compressible;
 
 	if ((th->th_off << 2) == sizeof(*th)) {
 		/* No options. */
 		*ppts = NULL;
 		*other_opts = false;
 		compressible = true;
 	} else if ((th->th_off << 2) == (sizeof(*th) + TCPOLEN_TSTAMP_APPA)) {
 		/* Right size for a timestamp; check it really is one. */
 		*ppts = (uint32_t *)(th + 1);
 		compressible = (**ppts == TCP_LRO_TS_OPTION);
 		*other_opts = !compressible;
 	} else {
 		/* Some other option layout. */
 		*ppts = NULL;
 		*other_opts = true;
 		compressible = false;
 	}
 
 	/*
 	 * For ACKCMP we only accept ACK, PUSH, ECE and CWR, the
 	 * segment must not carry data, and ACK must be set.
 	 */
 	if ((tcp_get_flags(th) & ~(TH_ACK | TH_PUSH | TH_ECE | TH_CWR)) != 0 ||
 	    m->m_pkthdr.lro_tcp_d_len != 0 ||
 	    (tcp_get_flags(th) & TH_ACK) == 0)
 		compressible = false;
 
 	return (compressible);
 }
 
 /*
  * Try to deliver the packets held by "le" directly to the input queue
  * of the owning TCP connection instead of merging them.
  *
  * Returns 0 when the packets were compressed and/or queued on the
  * connection, and TCP_LRO_CANNOT when this path is not usable (no
  * stack wants mbuf queueing, VLAN or tunneled traffic, no matching or
  * a dropped inpcb, transport without mbuf queue / ACK compression
  * support).  On TCP_LRO_CANNOT the entry is left untouched.
  */
 static int
 tcp_lro_flush_tcphpts(struct lro_ctrl *lc, struct lro_entry *le)
 {
 	struct inpcb *inp;
 	struct tcpcb *tp;
 	struct mbuf **pp, *cmp, *mv_to;
 	struct ifnet *lagg_ifp;
  	bool bpf_req, lagg_bpf_req, should_wake, can_append_old_cmp;
 
 	/*
 	 * Bail out unless some stack wants mbuf queueing and the packet
 	 * is untagged and not tunneled.
 	 */
 	if ((tcplro_stacks_wanting_mbufq == 0) ||
 	    (le->outer.data.vlan_id != 0) ||
 	    (le->inner.data.lro_type != LRO_TYPE_NONE))
 		return (TCP_LRO_CANNOT);
 
 #ifdef INET6
 	/*
 	 * Be proactive about unspecified IPv6 address in source. As
 	 * we use all-zero to indicate unbounded/unconnected pcb,
 	 * unspecified IPv6 address can be used to confuse us.
 	 *
 	 * Note that packets with unspecified IPv6 destination is
 	 * already dropped in ip6_input.
 	 */
 	if (__predict_false(le->outer.data.lro_type == LRO_TYPE_IPV6_TCP &&
 	    IN6_IS_ADDR_UNSPECIFIED(&le->outer.data.s_addr.v6)))
 		return (TCP_LRO_CANNOT);
 
 	/*
 	 * NOTE(review): the inner type was required to be LRO_TYPE_NONE
 	 * above, so this check cannot trigger here.
 	 */
 	if (__predict_false(le->inner.data.lro_type == LRO_TYPE_IPV6_TCP &&
 	    IN6_IS_ADDR_UNSPECIFIED(&le->inner.data.s_addr.v6)))
 		return (TCP_LRO_CANNOT);
 #endif
 	/* Lookup inp, if any. */
 	inp = tcp_lro_lookup(lc->ifp,
 	    (le->inner.data.lro_type == LRO_TYPE_NONE) ? &le->outer : &le->inner);
 	if (inp == NULL)
 		return (TCP_LRO_CANNOT);
 
 	counter_u64_add(tcp_inp_lro_locks_taken, 1);
 
 	/* Get TCP control structure. */
 	tp = intotcpcb(inp);
 
 	/* Check if the inp is dead, Jim. */
-	if (tp == NULL ||
-	    (inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT))) {
+	if (tp == NULL || (inp->inp_flags & INP_DROPPED)) {
 		INP_WUNLOCK(inp);
 		return (TCP_LRO_CANNOT);
 	}
 	/* Record the CPU once LRO has settled on one for this queue. */
 	if ((inp->inp_irq_cpu_set == 0)  && (lc->lro_cpu_is_set == 1)) {
 		inp->inp_irq_cpu = lc->lro_last_cpu;
 		inp->inp_irq_cpu_set = 1;
 	}
 	/* Check if the transport doesn't support the needed optimizations. */
 	if ((inp->inp_flags2 & (INP_SUPPORTS_MBUFQ | INP_MBUF_ACKCMP)) == 0) {
 		INP_WUNLOCK(inp);
 		return (TCP_LRO_CANNOT);
 	}
 
 	/* Default wakeup decision; the strip/compress step may change it. */
 	if (inp->inp_flags2 & INP_MBUF_QUEUE_READY)
 		should_wake = false;
 	else
 		should_wake = true;
 	/* Check if packets should be tapped to BPF. */
 	bpf_req = bpf_peers_present(lc->ifp->if_bpf);
 	lagg_bpf_req = false;
 	lagg_ifp = NULL;
 	if (lc->ifp->if_type == IFT_IEEE8023ADLAG ||
 	    lc->ifp->if_type == IFT_INFINIBANDLAG) {
 		/*
 		 * NOTE(review): if_lagg is dereferenced without a NULL
 		 * check; presumably it is always set for these interface
 		 * types -- confirm.
 		 */
 		struct lagg_port *lp = lc->ifp->if_lagg;
 		struct lagg_softc *sc = lp->lp_softc;
 
 		lagg_ifp = sc->sc_ifp;
 		if (lagg_ifp != NULL)
 			lagg_bpf_req = bpf_peers_present(lagg_ifp->if_bpf);
 	}
 
 	/* Strip and compress all the incoming packets. */
  	can_append_old_cmp = true;
 	cmp = NULL;
 	for (pp = &le->m_head; *pp != NULL; ) {
 		mv_to = NULL;
 		if (do_bpf_strip_and_compress(inp, lc, le, pp,
 			&cmp, &mv_to, &should_wake, bpf_req,
  			lagg_bpf_req, lagg_ifp, can_append_old_cmp) == false) {
 			/* Advance to next mbuf. */
 			pp = &(*pp)->m_nextpkt;
  			/*
  			 * Once we have appended we can't look in the pending
  			 * inbound packets for a compressed ack to append to.
  			 */
  			can_append_old_cmp = false;
  			/*
  			 * Once we append we also need to stop adding to any
  			 * compressed ack we were remembering. A new cmp
  			 * ack will be required.
  			 */
  			cmp = NULL;
  			tcp_lro_log(tp, lc, le, NULL, 25, 0, 0, 0, 0);
 		} else if (mv_to != NULL) {
 			/* We are asked to move pp up */
 			pp = &mv_to->m_nextpkt;
  			tcp_lro_log(tp, lc, le, NULL, 24, 0, 0, 0, 0);
 		} else
  			tcp_lro_log(tp, lc, le, NULL, 26, 0, 0, 0, 0);
 	}
 	/*
 	 * Update "m_last_mbuf", if any.  "pp" either still points at
 	 * le->m_head (which is then NULL, as the loop has ended) or at
 	 * the m_nextpkt field of the last mbuf left on the list.
 	 */
 	if (pp == &le->m_head)
 		le->m_last_mbuf = *pp;
 	else
 		le->m_last_mbuf = __containerof(pp, struct mbuf, m_nextpkt);
 
 	/* Check if any data mbufs left. */
 	if (le->m_head != NULL) {
 		counter_u64_add(tcp_inp_lro_direct_queue, 1);
 		tcp_lro_log(tp, lc, le, NULL, 22, 1, inp->inp_flags2, 0, 1);
 		tcp_queue_pkts(inp, tp, le);
 	}
 	if (should_wake) {
 		/* Wakeup */
 		counter_u64_add(tcp_inp_lro_wokeup_queue, 1);
 		/*
 		 * A non-zero return means the inp must not be unlocked
 		 * here (presumably the callee already released it --
 		 * confirm against the stack's handler).
 		 */
 		if ((*tp->t_fb->tfb_do_queued_segments)(inp->inp_socket, tp, 0))
 			inp = NULL;
 	}
 	if (inp != NULL)
 		INP_WUNLOCK(inp);
 	return (0);	/* Success. */
 }
 #endif
 
 /*
  * Deliver all packets held by the LRO entry "le" and recycle the
  * entry.
  *
  * With TCPHPTS the packets are first offered to
  * tcp_lro_flush_tcphpts(); when that fails (or without TCPHPTS) they
  * are condensed and passed to the interface input routine.
  * Afterwards the entry is zeroed and placed on the free list of "lc",
  * so the caller must already have taken it off the active list and
  * must not use it again.
  *
  * Must be called inside the network epoch.
  */
 void
 tcp_lro_flush(struct lro_ctrl *lc, struct lro_entry *le)
 {
 	/* Only optimise if there are multiple packets waiting. */
 #ifdef TCPHPTS
 	int error;
 #endif
 
 	NET_EPOCH_ASSERT();
 #ifdef TCPHPTS
 	/* The direct-queue path runs in the vnet of the interface. */
 	CURVNET_SET(lc->ifp->if_vnet);
 	error = tcp_lro_flush_tcphpts(lc, le);
 	CURVNET_RESTORE();
 	if (error != 0) {
 #endif
 		/* Regular path: merge what can be merged, then input. */
 		tcp_lro_condense(lc, le);
 		tcp_flush_out_entry(lc, le);
 #ifdef TCPHPTS
 	}
 #endif
 	lc->lro_flushed++;
 	/* Recycle the entry. */
 	bzero(le, sizeof(*le));
 	LIST_INSERT_HEAD(&lc->lro_free, le, next);
 }
 
 #ifdef HAVE_INLINE_FLSLL
 #define	tcp_lro_msb_64(x) (1ULL << (flsll(x) - 1))
 #else
 /*
  * Return a value with only the most significant set bit of "x" set.
  * This fallback returns 0 for an input of 0.
  */
 static inline uint64_t
 tcp_lro_msb_64(uint64_t x)
 {
 	unsigned shift;
 
 	/* Smear the top set bit into every lower bit position. */
 	for (shift = 1; shift < 64; shift <<= 1)
 		x |= (x >> shift);
 
 	/* Strip everything below the top bit. */
 	return (x ^ (x >> 1));
 }
 #endif
 
 /*
  * Sort "size" elements of "parray" by ascending 64-bit "seq" key.
  *
  * Comparable to qsort(), but with a worst case complexity bounded by
  * O(MIN(N,64)*N), N being the number of elements and 64 the number of
  * key bits.  The key is bit-sliced: the array is partitioned on the
  * most significant bit that is not constant across all keys, and the
  * two halves are then processed the same way, skipping constant bits.
  * This is typically called a radix sort.  Small arrays are finished
  * with an insertion sort.
  */
 static void
 tcp_lro_sort(struct lro_mbuf_sort *parray, uint32_t size)
 {
 	struct lro_mbuf_sort hold;
 	uint64_t seen_set;
 	uint64_t seen_clr;
 	uint64_t pivot;
 	uint32_t split;
 	uint32_t i;
 
 	for (;;) {
 		/* for small arrays insertion sort is faster */
 		if (size <= 12) {
 			for (i = 1; i < size; i++) {
 				hold = parray[i];
 				split = i;
 				while (split > 0 &&
 				    hold.seq < parray[split - 1].seq) {
 					parray[split] = parray[split - 1];
 					split--;
 				}
 				parray[split] = hold;
 			}
 			return;
 		}
 
 		/* collect which key bits are ever set and ever clear */
 		seen_set = 0;
 		seen_clr = 0;
 		for (i = 0; i != size; i++) {
 			seen_set |= parray[i].seq;
 			seen_clr |= ~parray[i].seq;
 		}
 
 		/* bits seen both set and clear are the varying ones */
 		pivot = seen_set & seen_clr;
 		if (pivot == 0)
 			return;		/* all keys are equal */
 
 		/* pick the most significant bit which is not constant */
 		pivot = tcp_lro_msb_64(pivot);
 
 		/*
 		 * Move entries having the pivot bit cleared to the
 		 * beginning of the array:
 		 */
 		split = 0;
 		for (i = 0; i != size; i++) {
 			if ((parray[i].seq & pivot) != 0)
 				continue;
 			hold = parray[split];
 			parray[split] = parray[i];
 			parray[i] = hold;
 			split++;
 		}
 
 		KASSERT(split != 0 && split != size, ("Memory is corrupted\n"));
 
 		/* sort the part with the pivot bit cleared */
 		tcp_lro_sort(parray, split);
 
 		/* continue with the part having the pivot bit set */
 		parray += split;
 		size -= split;
 	}
 }
 
 /*
  * Process every mbuf queued in lc->lro_mbuf_data and flush all LRO
  * state.
  *
  * The queued mbufs are sorted by their "seq" key so that packets of
  * the same stream are adjacent, then fed to the LRO engine stream by
  * stream; active entries are flushed whenever the stream changes and
  * once more at the end.  Packets the engine rejects are passed to the
  * interface input routine directly.
  *
  * Must be called inside the network epoch.
  */
 void
 tcp_lro_flush_all(struct lro_ctrl *lc)
 {
 	uint64_t seq;
 	uint64_t nseq;
 	unsigned x;
 
 	NET_EPOCH_ASSERT();
 	/* check if no mbufs to flush */
 	if (lc->lro_mbuf_count == 0)
 		goto done;
 	/*
 	 * Until a CPU has been declared, count consecutive calls made
 	 * on the same CPU; a call on a different CPU restarts the count.
 	 */
 	if (lc->lro_cpu_is_set == 0) {
 		if (lc->lro_last_cpu == curcpu) {
 			lc->lro_cnt_of_same_cpu++;
 			/* Have we reached the threshold to declare a cpu? */
 			if (lc->lro_cnt_of_same_cpu > tcp_lro_cpu_set_thresh)
 				lc->lro_cpu_is_set = 1;
 		} else {
 			lc->lro_last_cpu = curcpu;
 			lc->lro_cnt_of_same_cpu = 0;
 		}
 	}
 	CURVNET_SET(lc->ifp->if_vnet);
 
 	/* get current time */
 	binuptime(&lc->lro_last_queue_time);
 
 	/* sort all mbufs according to stream */
 	tcp_lro_sort(lc->lro_mbuf_data, lc->lro_mbuf_count);
 
 	/* input data into LRO engine, stream by stream */
 	seq = 0;
 	for (x = 0; x != lc->lro_mbuf_count; x++) {
 		struct mbuf *mb;
 
 		/* get mbuf */
 		mb = lc->lro_mbuf_data[x].mb;
 
 		/* get sequence number, masking away the packet index */
 		nseq = lc->lro_mbuf_data[x].seq & (-1ULL << 24);
 
 		/* check for new stream */
 		if (seq != nseq) {
 			seq = nseq;
 
 			/* flush active streams */
 			tcp_lro_rx_done(lc);
 		}
 
 		/* add packet to LRO engine */
 		if (tcp_lro_rx_common(lc, mb, 0, false) != 0) {
  			/* Flush anything we have accumulated */
  			tcp_lro_flush_active(lc);
 			/* input packet to network layer */
 			(*lc->ifp->if_input)(lc->ifp, mb);
 			lc->lro_queued++;
 			lc->lro_flushed++;
 		}
 	}
 	CURVNET_RESTORE();
 done:
 	/* flush active streams */
 	tcp_lro_rx_done(lc);
 
 #ifdef TCPHPTS
 	tcp_run_hpts();
 #endif
 	/* The array has been consumed. */
 	lc->lro_mbuf_count = 0;
 }
 
 #ifdef TCPHPTS
 /*
  * Condense the TCP ACK described by "th" (carried in mbuf "m") into
  * the compact ACK entry "ae".
  *
  * "ts_ptr" points at the timestamp option words or is NULL when the
  * segment has none; "iptos" is stored as the entry's codepoint.
  * Sequence, ACK, window and timestamp values are stored in host byte
  * order.
  */
 static void
 build_ack_entry(struct tcp_ackent *ae, struct tcphdr *th, struct mbuf *m,
     uint32_t *ts_ptr, uint16_t iptos)
 {
 	/* Fields copied straight from the TCP header. */
 	ae->seq = ntohl(th->th_seq);
 	ae->ack = ntohl(th->th_ack);
 	ae->win = ntohs(th->th_win);
 	ae->codepoint = iptos;
 
 	/* Receive timestamp of the packet. */
 	ae->timestamp = m->m_pkthdr.rcv_tstmp;
 
 	/* TCP flags plus markers describing the entry itself. */
 	ae->flags = tcp_get_flags(th);
 	if ((m->m_flags & M_TSTMP_LRO) != 0)
 		ae->flags |= TSTMP_LRO;
 	else if ((m->m_flags & M_TSTMP) != 0)
 		ae->flags |= TSTMP_HDWR;
 
 	if (ts_ptr != NULL) {
 		ae->ts_value = ntohl(ts_ptr[1]);
 		ae->ts_echo = ntohl(ts_ptr[2]);
 		ae->flags |= HAS_TSTMP;
 	}
 }
 
 /*
  * Do the BPF tap for either ACK_CMP packets or MBUF QUEUE type packets,
  * strip everything in front of the IPv4/IPv6 header and, when possible,
  * fold the ACK into a compressed ACK mbuf.
  *
  * inp                - inpcb of the connection; its inp_flags2 is consulted
  *                      for INP_DONT_SACK_QUEUE and INP_MBUF_ACKCMP, and
  *                      INP_MBUF_L_ACKS is set when a compressed mbuf fills up
  * lc                 - LRO control block; lc->ifp is the BPF tap interface
  * le                 - LRO entry the packet belongs to
  * pp                 - address of the chain link that points at the packet
  * cmp                - in/out: compressed ACK mbuf currently being filled
  * mv_to              - out: set to a compressed ACK mbuf that was linked into
  *                      the chain in front of the packet
  * should_wake        - out: set to true when the ACK carries other options
  *                      and INP_DONT_SACK_QUEUE is not set
  * bpf_req            - tap the packet on lc->ifp
  * lagg_bpf_req       - tap the packet on lagg_ifp
  * lagg_ifp           - interface used for the second tap
  * can_append_old_cmp - passed through to tcp_lro_get_last_if_ackcmp()
  *
  * Returns true when the packet was unlinked from the chain and freed
  * (either compressed or of an unsupported LRO type), false when it was
  * left in the chain in its stripped form.
  */
 static bool
 do_bpf_strip_and_compress(struct inpcb *inp, struct lro_ctrl *lc,
     struct lro_entry *le, struct mbuf **pp, struct mbuf **cmp, struct mbuf **mv_to,
     bool *should_wake, bool bpf_req, bool lagg_bpf_req, struct ifnet *lagg_ifp, bool can_append_old_cmp)
 {
 	union {
 		void *ptr;
 		struct ip *ip4;
 		struct ip6_hdr *ip6;
 	} l3;
 	struct mbuf *m;
 	struct mbuf *nm;
 	struct tcphdr *th;
 	struct tcp_ackent *ack_ent;
 	uint32_t *ts_ptr;
 	int32_t n_mbuf;
 	bool other_opts, can_compress;
 	uint8_t lro_type;
 	uint16_t iptos;
 	int tcp_hdr_offset;
 	int idx;
 
 	/* Get current mbuf. */
 	m = *pp;
 
 	/* Let the BPF see the packet */
 	if (__predict_false(bpf_req))
 		ETHER_BPF_MTAP(lc->ifp, m);
 
 	if (__predict_false(lagg_bpf_req))
 		ETHER_BPF_MTAP(lagg_ifp, m);
 
 	/*
 	 * Work out how many bytes to trim so that the mbuf starts at the
 	 * IP header: the TCP header offset minus the size of the IP header.
 	 * The inner LRO type is used when set, otherwise the outer one.
 	 * Any other type sends the packet to the "compressed" label, where
 	 * it is unlinked and freed.
 	 */
 	tcp_hdr_offset = m->m_pkthdr.lro_tcp_h_off;
 	lro_type = le->inner.data.lro_type;
 	switch (lro_type) {
 	case LRO_TYPE_NONE:
 		lro_type = le->outer.data.lro_type;
 		switch (lro_type) {
 		case LRO_TYPE_IPV4_TCP:
 			tcp_hdr_offset -= sizeof(*le->outer.ip4);
 			m->m_pkthdr.lro_etype = ETHERTYPE_IP;
 			break;
 		case LRO_TYPE_IPV6_TCP:
 			tcp_hdr_offset -= sizeof(*le->outer.ip6);
 			m->m_pkthdr.lro_etype = ETHERTYPE_IPV6;
 			break;
 		default:
 			goto compressed;
 		}
 		break;
 	case LRO_TYPE_IPV4_TCP:
 		tcp_hdr_offset -= sizeof(*le->outer.ip4);
 		m->m_pkthdr.lro_etype = ETHERTYPE_IP;
 		break;
 	case LRO_TYPE_IPV6_TCP:
 		tcp_hdr_offset -= sizeof(*le->outer.ip6);
 		m->m_pkthdr.lro_etype = ETHERTYPE_IPV6;
 		break;
 	default:
 		goto compressed;
 	}
 
 	MPASS(tcp_hdr_offset >= 0);
 
 	/*
 	 * Trim the leading bytes, flag the mbuf as stripped and keep the
 	 * recorded TCP header offset in step with the new start of data.
 	 */
 	m_adj(m, tcp_hdr_offset);
 	m->m_flags |= M_LRO_EHDRSTRP;
 	m->m_flags &= ~M_ACKCMP;
 	m->m_pkthdr.lro_tcp_h_off -= tcp_hdr_offset;
 
 	th = tcp_lro_get_th(m);
 
 	th->th_sum = 0;		/* TCP checksum is valid. */
 
 	/* Check if ACK can be compressed */
 	can_compress = tcp_lro_ack_valid(m, th, &ts_ptr, &other_opts);
 
 	/* Now let's look at the should wake states */
 	if ((other_opts == true) &&
 	    ((inp->inp_flags2 & INP_DONT_SACK_QUEUE) == 0)) {
 		/*
 		 * If there are other options (SACK?) and the
 		 * tcp endpoint has not expressly told us it does
 		 * not care about SACKS, then we should wake up.
 		 */
 		*should_wake = true;
 	}
 	/* Is the ack compressible? */
 	if (can_compress == false)
 		goto done;
 	/* Does the TCP endpoint support ACK compression? */
 	if ((inp->inp_flags2 & INP_MBUF_ACKCMP) == 0)
 		goto done;
 
 	/* Let's get the TOS/traffic class field */
 	l3.ptr = mtod(m, void *);
 	switch (lro_type) {
 	case LRO_TYPE_IPV4_TCP:
 		iptos = l3.ip4->ip_tos;
 		break;
 	case LRO_TYPE_IPV6_TCP:
 		iptos = IPV6_TRAFFIC_CLASS(l3.ip6);
 		break;
 	default:
 		iptos = 0;	/* Keep compiler happy. */
 		break;
 	}
 	/* Now let's get space if we don't have some already */
 	if (*cmp == NULL) {
 new_one:
 		/*
 		 * Also reached by goto from the else branch below when the
 		 * current compressed mbuf has no room for another entry.
 		 */
 		nm = tcp_lro_get_last_if_ackcmp(lc, le, inp, &n_mbuf, can_append_old_cmp);
 		if (__predict_false(nm == NULL))
 			goto done;
 		*cmp = nm;
 		/* NOTE(review): n_mbuf presumably flags a newly allocated mbuf - confirm */
 		if (n_mbuf) {
 			/*
 			 * Link in the new cmp ack to our in-order place,
 			 * first set our cmp ack's next to where we are.
 			 */
 			nm->m_nextpkt = m;
 			(*pp) = nm;
 			/*
 			 * Set it up so mv_to is advanced to our
 			 * compressed ack. This way the caller can
 			 * advance pp to the right place.
 			 */
 			*mv_to = nm;
 			/*
 			 * Advance it here locally as well.
 			 */
 			pp = &nm->m_nextpkt;
 		}
 	} else {
 		/* We have one already we are working on */
 		nm = *cmp;
 		if (M_TRAILINGSPACE(nm) < sizeof(struct tcp_ackent)) {
 			/* We ran out of space */
 			inp->inp_flags2 |= INP_MBUF_L_ACKS;
 			goto new_one;
 		}
 	}
 	MPASS(M_TRAILINGSPACE(nm) >= sizeof(struct tcp_ackent));
 	counter_u64_add(tcp_inp_lro_compressed, 1);
 	le->compressed++;
 	/* We can add in to the one on the tail */
 	ack_ent = mtod(nm, struct tcp_ackent *);
 	/* The next free slot follows the entries already stored in nm. */
 	idx = (nm->m_len / sizeof(struct tcp_ackent));
 	build_ack_entry(&ack_ent[idx], th, m, ts_ptr, iptos);
 
 	/* Bump the size of both pkt-hdr and len */
 	nm->m_len += sizeof(struct tcp_ackent);
 	nm->m_pkthdr.len += sizeof(struct tcp_ackent);
 compressed:
 	/* Advance to next mbuf before freeing. */
 	*pp = m->m_nextpkt;
 	m->m_nextpkt = NULL;
 	m_freem(m);
 	return (true);
 done:
 	/* The packet stays in the chain; account for it as uncompressed. */
 	counter_u64_add(tcp_uncomp_total, 1);
 	le->uncompressed++;
 	return (false);
 }
 #endif
 
 /*
  * Select the LRO hash bucket for a packet.
  *
  * When the mbuf carries a hash, its flow ID is used directly; otherwise
  * the raw address words recorded by the parser are summed.  The value is
  * reduced modulo the size of the hash table.
  */
 static struct lro_head *
 tcp_lro_rx_get_bucket(struct lro_ctrl *lc, struct mbuf *m, struct lro_parser *parser)
 {
 	u_long key;
 	unsigned word;
 
 	if (M_HASHTYPE_ISHASH(m)) {
 		key = m->m_pkthdr.flowid;
 	} else {
 		key = 0;
 		word = 0;
 		while (word != LRO_RAW_ADDRESS_MAX) {
 			key += parser->data.raw[word];
 			word++;
 		}
 	}
 	return (&lc->lro_hash[key % lc->lro_hashsz]);
 }
 
 /*
  * Try to hand one received packet to the LRO engine.
  *
  * lc       - LRO control block
  * m        - packet to process
  * csum     - when non-zero, used in place of the TCP header checksum
  *            for computing the data-only checksum
  * use_hash - when false, bucket 0 of the hash table is always used;
  *            otherwise the bucket is picked by tcp_lro_rx_get_bucket()
  *
  * Returns 0 when the mbuf has been consumed: appended to an LRO entry,
  * or freed because it was a data-less ACK older than the one already
  * recorded for its entry.  Any non-zero return is an error code (the
  * TCP_LRO_* values returned directly here, or whatever the called
  * helpers returned); the mbuf is not freed on those paths in this
  * function.
  */
 static int
 tcp_lro_rx_common(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum, bool use_hash)
 {
 	struct lro_parser pi;	/* inner address data */
 	struct lro_parser po;	/* outer address data */
 	struct lro_parser *pa;	/* current parser for TCP stream */
 	struct lro_entry *le;
 	struct lro_head *bucket;
 	struct tcphdr *th;
 	int tcp_data_len;
 	int tcp_opt_len;
 	int error;
 	uint16_t tcp_data_sum;
 
 #ifdef INET
 	/* Quickly decide if packet cannot be LRO'ed */
 	if (__predict_false(V_ipforwarding != 0))
 		return (TCP_LRO_CANNOT);
 #endif
 #ifdef INET6
 	/* Quickly decide if packet cannot be LRO'ed */
 	if (__predict_false(V_ip6_forwarding != 0))
 		return (TCP_LRO_CANNOT);
 #endif
 	if (((m->m_pkthdr.csum_flags & (CSUM_DATA_VALID | CSUM_PSEUDO_HDR)) !=
 	     ((CSUM_DATA_VALID | CSUM_PSEUDO_HDR))) || 
 	    (m->m_pkthdr.csum_data != 0xffff)) {
 		/* 
 		 * The checksum either did not have hardware offload
 		 * or it was a bad checksum. We can't LRO such
 		 * a packet.
 		 */
 		counter_u64_add(tcp_bad_csums, 1);
 		return (TCP_LRO_CANNOT);
 	}
 	/* We expect a contiguous header [eh, ip, tcp]. */
 	pa = tcp_lro_parser(m, &po, &pi, true);
 	if (__predict_false(pa == NULL))
 		return (TCP_LRO_NOT_SUPPORTED);
 
 	/* We don't expect any padding. */
 	error = tcp_lro_trim_mbuf_chain(m, pa);
 	if (__predict_false(error != 0))
 		return (error);
 
 #ifdef INET
 	/* IPv4 packets get an additional IPv4-specific check. */
 	switch (pa->data.lro_type) {
 	case LRO_TYPE_IPV4_TCP:
 		error = tcp_lro_rx_ipv4(lc, m, pa->ip4);
 		if (__predict_false(error != 0))
 			return (error);
 		break;
 	default:
 		break;
 	}
 #endif
 	/* If no hardware or arrival stamp on the packet add timestamp */
 	if ((m->m_flags & (M_TSTMP_LRO | M_TSTMP)) == 0) {
 		m->m_pkthdr.rcv_tstmp = bintime2ns(&lc->lro_last_queue_time); 
 		m->m_flags |= M_TSTMP_LRO;
 	}
 
 	/* Get pointer to TCP header. */
 	th = pa->tcp;
 
 	/* Don't process SYN packets. */
 	if (__predict_false(tcp_get_flags(th) & TH_SYN))
 		return (TCP_LRO_CANNOT);
 
 	/*
 	 * Get total TCP header length and compute payload length.
 	 * After the subtraction below tcp_opt_len holds only the
 	 * length of the TCP options.
 	 */
 	tcp_opt_len = (th->th_off << 2);
 	tcp_data_len = m->m_pkthdr.len - ((uint8_t *)th -
 	    (uint8_t *)m->m_data) - tcp_opt_len;
 	tcp_opt_len -= sizeof(*th);
 
 	/* Don't process invalid TCP headers. */
 	if (__predict_false(tcp_opt_len < 0 || tcp_data_len < 0))
 		return (TCP_LRO_CANNOT);
 
 	/* Compute TCP data only checksum. */
 	if (tcp_data_len == 0)
 		tcp_data_sum = 0;	/* no data, no checksum */
 	else if (__predict_false(csum != 0))
 		tcp_data_sum = tcp_lro_rx_csum_data(pa, ~csum);
 	else
 		tcp_data_sum = tcp_lro_rx_csum_data(pa, ~th->th_sum);
 
 	/* Save TCP info in mbuf. */
 	m->m_nextpkt = NULL;
 	m->m_pkthdr.rcvif = lc->ifp;
 	m->m_pkthdr.lro_tcp_d_csum = tcp_data_sum;
 	m->m_pkthdr.lro_tcp_d_len = tcp_data_len;
 	m->m_pkthdr.lro_tcp_h_off = ((uint8_t *)th - (uint8_t *)m->m_data);
 	m->m_pkthdr.lro_nsegs = 1;
 
 	/* Get hash bucket. */
 	if (!use_hash) {
 		bucket = &lc->lro_hash[0];
 	} else {
 		bucket = tcp_lro_rx_get_bucket(lc, m, pa);
 	}
 
 	/* Try to find a matching previous segment. */
 	LIST_FOREACH(le, bucket, hash_next) {
 		/* Compare addresses and ports. */
 		if (lro_address_compare(&po.data, &le->outer.data) == false ||
 		    lro_address_compare(&pi.data, &le->inner.data) == false)
 			continue;
 
 		/* Check if no data and old ACK. */
 		if (tcp_data_len == 0 &&
 		    SEQ_LT(ntohl(th->th_ack), ntohl(le->ack_seq))) {
 			/* Consumed here, so success is reported. */
 			m_freem(m);
 			return (0);
 		}
 
 		/* Mark "m" in the last spot. */
 		le->m_last_mbuf->m_nextpkt = m;
 		/* Now set the tail to "m". */
 		le->m_last_mbuf = m;
 		return (0);
 	}
 
 	/* Try to find an empty slot. */
 	if (LIST_EMPTY(&lc->lro_free))
 		return (TCP_LRO_NO_ENTRIES);
 
 	/* Start a new segment chain. */
 	le = LIST_FIRST(&lc->lro_free);
 	LIST_REMOVE(le, next);
 	tcp_lro_active_insert(lc, bucket, le);
 
 	/* Make sure the headers are set. */
 	le->inner = pi;
 	le->outer = po;
 
 	/* Store time this entry was allocated. */
 	le->alloc_time = lc->lro_last_queue_time;
 
 	tcp_set_entry_to_mbuf(lc, le, m, th);
 
 	/* Now set the tail to "m". */
 	le->m_last_mbuf = m;
 
 	return (0);
 }
 
 /*
  * Entry point for handing a single received packet to LRO.
  *
  * Returns 0 when the packet was taken by the LRO engine, otherwise the
  * error code explaining why it was not.
  */
 int
 tcp_lro_rx(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum)
 {
 	int rc;
 
 	/*
 	 * Only packets whose checksum was verified by hardware offload
 	 * can be LRO'ed; anything else is counted and refused.
 	 */
 	if ((m->m_pkthdr.csum_data != 0xffff) ||
 	    ((m->m_pkthdr.csum_flags & (CSUM_DATA_VALID | CSUM_PSEUDO_HDR)) !=
 	     (CSUM_DATA_VALID | CSUM_PSEUDO_HDR))) {
 		counter_u64_add(tcp_bad_csums, 1);
 		return (TCP_LRO_CANNOT);
 	}
 
 	/* Record the current time for this packet. */
 	binuptime(&lc->lro_last_queue_time);
 
 	CURVNET_SET(lc->ifp->if_vnet);
 	rc = tcp_lro_rx_common(lc, m, csum, true);
 	/*
 	 * A packet that can't be LRO'd must not overtake what has
 	 * already been accumulated, so flush that first to preserve
 	 * ordering.
 	 */
 	if (__predict_false(rc != 0))
 		tcp_lro_flush_active(lc);
 	CURVNET_RESTORE();
 
 	return (rc);
 }
 
 /*
  * Queue a received packet in the LRO sort array; the array is flushed
  * as soon as it becomes full.
  *
  * The packet is dropped when the control block has no interface or no
  * sort array, and is passed straight to the interface input routine
  * when LRO is not enabled on the interface.
  */
 void
 tcp_lro_queue_mbuf(struct lro_ctrl *lc, struct mbuf *mb)
 {
 	NET_EPOCH_ASSERT();
 
 	/* Nothing can be queued without an interface and a sort array. */
 	if (__predict_false(lc->ifp == NULL || lc->lro_mbuf_data == NULL ||
 	    lc->lro_mbuf_max == 0)) {
 		m_freem(mb);
 		return;
 	}
 
 	/* LRO disabled on the interface: bypass the queue entirely. */
 	if (__predict_false((lc->ifp->if_capenable & IFCAP_LRO) == 0)) {
 		(*lc->ifp->if_input) (lc->ifp, mb);
 		return;
 	}
 
 	/*
 	 * When accurate LRO timestamps are wanted and the packet has no
 	 * hardware timestamp, stamp it with the arrival time now.
 	 */
 	if ((tcplro_stacks_wanting_mbufq > 0) &&
 	    (tcp_less_accurate_lro_ts == 0)) {
 		if ((mb->m_flags & M_TSTMP) == 0) {
 			binuptime(&lc->lro_last_queue_time);
 			mb->m_pkthdr.rcv_tstmp =
 			    bintime2ns(&lc->lro_last_queue_time);
 			mb->m_flags |= M_TSTMP_LRO;
 		}
 	}
 
 	/* Store the packet in the next free slot of the sort array. */
 	lc->lro_mbuf_data[lc->lro_mbuf_count].mb = mb;
 
 	/*
 	 * Build the sort key: hash type from bit 56 up, flow ID from
 	 * bit 24 up, and the slot index in the low bits.
 	 */
 	lc->lro_mbuf_data[lc->lro_mbuf_count].seq =
 	    ((uint64_t)lc->lro_mbuf_count) |
 	    (((uint64_t)mb->m_pkthdr.flowid) << 24) |
 	    (((uint64_t)M_HASHTYPE_GET(mb)) << 56);
 
 	/* Account for the new packet and flush once the array is full. */
 	lc->lro_mbuf_count++;
 	if (__predict_false(lc->lro_mbuf_count == lc->lro_mbuf_max))
 		tcp_lro_flush_all(lc);
 }
 
 /* end */
diff --git a/sys/netinet/tcp_stacks/bbr.c b/sys/netinet/tcp_stacks/bbr.c
index 31b5c2cc78dc..40d3a40dbe38 100644
--- a/sys/netinet/tcp_stacks/bbr.c
+++ b/sys/netinet/tcp_stacks/bbr.c
@@ -1,14887 +1,14887 @@
 /*-
  * Copyright (c) 2016-2020 Netflix, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  */
 /**
  * Author: Randall Stewart <rrs@netflix.com>
  * This work is based on the ACM Queue paper
  * BBR - Congestion Based Congestion Control
  * and also numerous discussions with Neal, Yuchung and Van.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_ipsec.h"
 #include "opt_tcpdebug.h"
 #include "opt_ratelimit.h"
 #include <sys/param.h>
 #include <sys/arb.h>
 #include <sys/module.h>
 #include <sys/kernel.h>
 #include <sys/libkern.h>
 #ifdef TCP_HHOOK
 #include <sys/hhook.h>
 #endif
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/proc.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sysctl.h>
 #include <sys/systm.h>
 #ifdef STATS
 #include <sys/qmath.h>
 #include <sys/tree.h>
 #include <sys/stats.h> /* Must come after qmath.h and tree.h */
 #endif
 #include <sys/refcount.h>
 #include <sys/queue.h>
 #include <sys/eventhandler.h>
 #include <sys/smp.h>
 #include <sys/kthread.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/tim_filter.h>
 #include <sys/time.h>
 #include <sys/protosw.h>
 #include <vm/uma.h>
 #include <sys/kern_prefetch.h>
 
 #include <net/route.h>
 #include <net/route/nhop.h>
 #include <net/vnet.h>
 
 #define TCPSTATES		/* for logging */
 
 #include <netinet/in.h>
 #include <netinet/in_kdtrace.h>
 #include <netinet/in_pcb.h>
 #include <netinet/ip.h>
 #include <netinet/ip_icmp.h>	/* required for icmp_var.h */
 #include <netinet/icmp_var.h>	/* for ICMP_BANDLIM */
 #include <netinet/ip_var.h>
 #include <netinet/ip6.h>
 #include <netinet6/in6_pcb.h>
 #include <netinet6/ip6_var.h>
 #define	TCPOUTFLAGS
 #include <netinet/tcp.h>
 #include <netinet/tcp_fsm.h>
 #include <netinet/tcp_seq.h>
 #include <netinet/tcp_timer.h>
 #include <netinet/tcp_var.h>
 #include <netinet/tcpip.h>
 #include <netinet/tcp_hpts.h>
 #include <netinet/cc/cc.h>
 #include <netinet/tcp_log_buf.h>
 #include <netinet/tcp_ratelimit.h>
 #include <netinet/tcp_lro.h>
 #ifdef TCPDEBUG
 #include <netinet/tcp_debug.h>
 #endif				/* TCPDEBUG */
 #ifdef TCP_OFFLOAD
 #include <netinet/tcp_offload.h>
 #endif
 #ifdef INET6
 #include <netinet6/tcp6_var.h>
 #endif
 #include <netinet/tcp_fastopen.h>
 
 #include <netipsec/ipsec_support.h>
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/ethernet.h>
 
 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
 #include <netipsec/ipsec.h>
 #include <netipsec/ipsec6.h>
 #endif				/* IPSEC */
 
 #include <netinet/udp.h>
 #include <netinet/udp_var.h>
 #include <machine/in_cksum.h>
 
 #ifdef MAC
 #include <security/mac/mac_framework.h>
 #endif
 
 #include "sack_filter.h"
 #include "tcp_bbr.h"
 #include "rack_bbr_common.h"
 uma_zone_t bbr_zone;
 uma_zone_t bbr_pcb_zone;
 
 struct sysctl_ctx_list bbr_sysctl_ctx;
 struct sysctl_oid *bbr_sysctl_root;
 
 /*
  * Assign (value) to (tv) and then clamp (tv) to [tvmin, tvmax].
  * Both comparisons are performed after casting the operands to u_long,
  * i.e. as unsigned values.  (tv) is evaluated several times, so it must
  * not have side effects.
  */
 #define	TCPT_RANGESET_NOSLOP(tv, value, tvmin, tvmax) do { \
 	(tv) = (value); \
 	if ((u_long)(tv) < (u_long)(tvmin)) \
 		(tv) = (tvmin); \
 	if ((u_long)(tv) > (u_long)(tvmax)) \
 		(tv) = (tvmax); \
 } while(0)
 
 /*#define BBR_INVARIANT 1*/
 
 /*
  * initial window
  */
 static uint32_t bbr_def_init_win = 10;
 static int32_t bbr_persist_min = 250000;	/* 250ms */
 static int32_t bbr_persist_max = 1000000;	/* 1 Second */
 static int32_t bbr_cwnd_may_shrink = 0;
 static int32_t bbr_cwndtarget_rtt_touse = BBR_RTT_PROP;
 static int32_t bbr_num_pktepo_for_del_limit = BBR_NUM_RTTS_FOR_DEL_LIMIT;
 static int32_t bbr_hardware_pacing_limit = 8000;
 static int32_t bbr_quanta = 3;	/* How much extra quanta do we get? */
 static int32_t bbr_no_retran = 0;
 
 static int32_t bbr_error_base_paceout = 10000; /* usec to pace */
 static int32_t bbr_max_net_error_cnt = 10;
 /* Should the following be dynamic too -- loss wise */
 static int32_t bbr_rtt_gain_thresh = 0;
 /* Measurement controls */
 static int32_t bbr_use_google_algo = 1;
 static int32_t bbr_ts_limiting = 1;
 static int32_t bbr_ts_can_raise = 0;
 static int32_t bbr_do_red = 600;
 static int32_t bbr_red_scale = 20000;
 static int32_t bbr_red_mul = 1;
 static int32_t bbr_red_div = 2;
 static int32_t bbr_red_growth_restrict = 1;
 static int32_t  bbr_target_is_bbunit = 0;
 static int32_t bbr_drop_limit = 0;
 /*
  * How much gain do we need to see to
  * stay in startup?
  */
 static int32_t bbr_marks_rxt_sack_passed = 0;
 static int32_t bbr_start_exit = 25;
 static int32_t bbr_low_start_exit = 25;	/* When we are in reduced gain */
 static int32_t bbr_startup_loss_thresh = 2000;	/* 20.00% loss */
 static int32_t bbr_hptsi_max_mul = 1;	/* These two mul/div assure a min pacing */
 static int32_t bbr_hptsi_max_div = 2;	/* time, 0 means turned off. We need this
 					 * if we go back ever to where the pacer
 					 * has priority over timers.
 					 */
 static int32_t bbr_policer_call_from_rack_to = 0;
 static int32_t bbr_policer_detection_enabled = 1;
 static int32_t bbr_min_measurements_req = 1;	/* We need at least 2
 						 * measurements before we are
 						 * "good" note that 2 == 1.
 						 * This is because we use a >
 						 * comparison. This means if
 						 * min_measure was 0, it takes
 						 * num-measures > min(0) and
 						 * you get 1 measurement and
 						 * you are good. Set to 1, you
 						 * have to have two
 						 * measurements (this is done
 						 * to prevent it from being ok
 						 * to have no measurements). */
 static int32_t bbr_no_pacing_until = 4;
 
 static int32_t bbr_min_usec_delta = 20000;	/* 20,000 usecs */
 static int32_t bbr_min_peer_delta = 20;		/* 20 units */
 static int32_t bbr_delta_percent = 150;		/* 15.0 % */
 
 static int32_t bbr_target_cwnd_mult_limit = 8;
 /*
  * bbr_cwnd_min_val is the number of
  * segments we hold to in the RTT probe
  * state typically 4.
  */
 static int32_t bbr_cwnd_min_val = BBR_PROBERTT_NUM_MSS;
 
 static int32_t bbr_cwnd_min_val_hs = BBR_HIGHSPEED_NUM_MSS;
 
 static int32_t bbr_gain_to_target = 1;
 static int32_t bbr_gain_gets_extra_too = 1;
 /*
  * bbr_high_gain is the 2/ln(2) value we need
  * to double the sending rate in startup. This
  * is used for both cwnd and hptsi gain's.
  */
 static int32_t bbr_high_gain = BBR_UNIT * 2885 / 1000 + 1;
 static int32_t bbr_startup_lower = BBR_UNIT * 1500 / 1000 + 1;
 static int32_t bbr_use_lower_gain_in_startup = 1;
 
 /* thresholds for reduction on drain in sub-states/drain */
 static int32_t bbr_drain_rtt = BBR_SRTT;
 static int32_t bbr_drain_floor = 88;
 static int32_t google_allow_early_out = 1;
 static int32_t google_consider_lost = 1;
 static int32_t bbr_drain_drop_mul = 4;
 static int32_t bbr_drain_drop_div = 5;
 static int32_t bbr_rand_ot = 50;
 static int32_t bbr_can_force_probertt = 0;
 static int32_t bbr_can_adjust_probertt = 1;
 static int32_t bbr_probertt_sets_rtt = 0;
 static int32_t bbr_can_use_ts_for_rtt = 1;
 static int32_t bbr_is_ratio = 0;
 static int32_t bbr_sub_drain_app_limit = 1;
 static int32_t bbr_prtt_slam_cwnd = 1;
 static int32_t bbr_sub_drain_slam_cwnd = 1;
 static int32_t bbr_slam_cwnd_in_main_drain = 1;
 static int32_t bbr_filter_len_sec = 6;	/* How long does the rttProp filter
 					 * hold */
 static uint32_t bbr_rtt_probe_limit = (USECS_IN_SECOND * 4);
 /*
  * bbr_drain_gain is the reverse of the high_gain
  * designed to drain back out the standing queue
  * that is formed in startup by causing a larger
  * hptsi gain and thus draining the packets
  * in flight.
  */
 static int32_t bbr_drain_gain = BBR_UNIT * 1000 / 2885;
 static int32_t bbr_rttprobe_gain = 192;
 
 /*
  * The cwnd_gain is the default cwnd gain applied when
  * calculating a target cwnd. Note that the cwnd is
  * a secondary factor in the way BBR works (see the
  * paper and think about it, it will take some time).
  * Basically the hptsi_gain spreads the packets out
  * so you never get more than BDP to the peer even
  * if the cwnd is high. In our implementation that
  * means in non-recovery/retransmission scenarios
  * cwnd will never be reached by the flight-size.
  */
 static int32_t bbr_cwnd_gain = BBR_UNIT * 2;
 static int32_t bbr_tlp_type_to_use = BBR_SRTT;
 static int32_t bbr_delack_time = 100000;	/* 100ms in useconds */
 static int32_t bbr_sack_not_required = 0;	/* set to one to allow non-sack to use bbr */
 static int32_t bbr_initial_bw_bps = 62500;	/* 500kbps in bytes ps */
 static int32_t bbr_ignore_data_after_close = 1;
 static int16_t bbr_hptsi_gain[] = {
 	(BBR_UNIT *5 / 4),
 	(BBR_UNIT * 3 / 4),
 	BBR_UNIT,
 	BBR_UNIT,
 	BBR_UNIT,
 	BBR_UNIT,
 	BBR_UNIT,
 	BBR_UNIT
 };
 int32_t bbr_use_rack_resend_cheat = 1;
 int32_t bbr_sends_full_iwnd = 1;
 
 #define BBR_HPTSI_GAIN_MAX 8
 /*
  * The BBR module incorporates a number of
  * TCP ideas that have been put out into the IETF
  * over the last few years:
  * - Yuchung Cheng's RACK TCP (for which it's named) that
  *    will stop us using the number of dup acks and instead
  *    use time as the gauge of when we retransmit.
  * - Reorder Detection of RFC4737 and the Tail-Loss probe draft
  *    of Dukkipati et.al.
  * - Van Jacobson's et.al BBR.
  *
  * RACK depends on SACK, so if an endpoint arrives that
  * cannot do SACK the state machine below will shuttle the
  * connection back to using the "default" TCP stack that is
  * in FreeBSD.
  *
  * To implement BBR and RACK the original TCP stack was first decomposed
  * into a functional state machine with individual states
  * for each of the possible TCP connection states. The do_segment
  * functions role in life is to mandate the connection supports SACK
  * initially and then assure that the RACK state matches the connection
  * state before calling the states do_segment function. Data processing
  * of inbound segments also now happens in the hpts_do_segment in general
  * with only one exception. This is so we can keep the connection on
  * a single CPU.
  *
  * Each state is simplified due to the fact that the original do_segment
  * has been decomposed and we *know* what state we are in (no
  * switches on the state) and all tests for SACK are gone. This
  * greatly simplifies what each state does.
  *
  * TCP output is also over-written with a new version since it
  * must maintain the new rack scoreboard and has had hptsi
  * integrated as a requirement. Still todo is to eliminate the
  * use of the callout_() system and use the hpts for all
  * timers as well.
  */
 static uint32_t bbr_rtt_probe_time = 200000;	/* 200ms in micro seconds */
 static uint32_t bbr_rtt_probe_cwndtarg = 4;	/* How many mss's outstanding */
 static const int32_t bbr_min_req_free = 2;	/* The min we must have on the
 						 * free list */
 static int32_t bbr_tlp_thresh = 1;
 static int32_t bbr_reorder_thresh = 2;
 static int32_t bbr_reorder_fade = 60000000;	/* 0 - never fade, def
 						 * 60,000,000 - 60 seconds */
 static int32_t bbr_pkt_delay = 1000;
 static int32_t bbr_min_to = 1000;	/* Number of usec's minimum timeout */
 static int32_t bbr_incr_timers = 1;
 
 static int32_t bbr_tlp_min = 10000;	/* 10ms in usecs */
 static int32_t bbr_delayed_ack_time = 200000;	/* 200ms in usecs */
 static int32_t bbr_exit_startup_at_loss = 1;
 
 /*
  * bbr_lt_bw_ratio is 1/8th
  * bbr_lt_bw_diff is  < 4 Kbit/sec
  */
 static uint64_t bbr_lt_bw_diff = 4000 / 8;	/* In bytes per second */
 static uint64_t bbr_lt_bw_ratio = 8;	/* For 1/8th */
 static uint32_t bbr_lt_bw_max_rtts = 48;	/* How many rtt's do we use
 						 * the lt_bw for */
 static uint32_t bbr_lt_intvl_min_rtts = 4;	/* Min num of RTT's to measure
 						 * lt_bw */
 static int32_t bbr_lt_intvl_fp = 0;		/* False positive epoch diff */
 static int32_t bbr_lt_loss_thresh = 196;	/* Lost vs delivered % */
 static int32_t bbr_lt_fd_thresh = 100;		/* false detection % */
 
 static int32_t bbr_verbose_logging = 0;
 /*
  * Currently regular tcp has a rto_min of 30ms
  * the backoff goes 12 times so that ends up
  * being a total of 122.850 seconds before a
  * connection is killed.
  */
 static int32_t bbr_rto_min_ms = 30;	/* 30ms same as main freebsd */
 static int32_t bbr_rto_max_sec = 4;	/* 4 seconds */
 
 /****************************************************/
 /* DEFAULT TSO SIZING  (cpu performance impacting)  */
 /****************************************************/
 /* What amount is our formula using to get TSO size */
 static int32_t bbr_hptsi_per_second = 1000;
 
 /*
  * For hptsi under bbr_cross_over connections what is delay
  * target 7ms (in usec) combined with a seg_max of 2
  * gets us close to identical google behavior in
  * TSO size selection (possibly more 1MSS sends).
  */
 static int32_t bbr_hptsi_segments_delay_tar = 7000;
 
 /* Does pacing delay include overhead's in its time calculations? */
 static int32_t bbr_include_enet_oh = 0;
 static int32_t bbr_include_ip_oh = 1;
 static int32_t bbr_include_tcp_oh = 1;
 static int32_t bbr_google_discount = 10;
 
 /* Do we use (nf mode) pkt-epoch to drive us or rttProp? */
 static int32_t bbr_state_is_pkt_epoch = 0;
 static int32_t bbr_state_drain_2_tar = 1;
 /* What is the max the 0 - bbr_cross_over MBPS TSO target
  * can reach using our delay target. Note that this
  * value becomes the floor for the cross over
  * algorithm.
  */
 static int32_t bbr_hptsi_segments_max = 2;
 static int32_t bbr_hptsi_segments_floor = 1;
 static int32_t bbr_hptsi_utter_max = 0;
 
 /* What is the min the 0 - bbr_cross-over MBPS  TSO target can be */
 static int32_t bbr_hptsi_bytes_min = 1460;
 static int32_t bbr_all_get_min = 0;
 
 /* Cross over point from algo-a to algo-b */
 static uint32_t bbr_cross_over = TWENTY_THREE_MBPS;
 
 /* Do we deal with our restart state? */
 static int32_t bbr_uses_idle_restart = 0;
 static int32_t bbr_idle_restart_threshold = 100000;	/* 100ms in useconds */
 
 /* Do we allow hardware pacing? */
 static int32_t bbr_allow_hdwr_pacing = 0;
 static int32_t bbr_hdwr_pace_adjust = 2;	/* multipler when we calc the tso size */
 static int32_t bbr_hdwr_pace_floor = 1;
 static int32_t bbr_hdwr_pacing_delay_cnt = 10;
 
 /****************************************************/
 static int32_t bbr_resends_use_tso = 0;
 static int32_t bbr_tlp_max_resend = 2;
 static int32_t bbr_sack_block_limit = 128;
 
 #define  BBR_MAX_STAT 19
 counter_u64_t bbr_state_time[BBR_MAX_STAT];
 counter_u64_t bbr_state_lost[BBR_MAX_STAT];
 counter_u64_t bbr_state_resend[BBR_MAX_STAT];
 counter_u64_t bbr_stat_arry[BBR_STAT_SIZE];
 counter_u64_t bbr_opts_arry[BBR_OPTS_SIZE];
 counter_u64_t bbr_out_size[TCP_MSS_ACCT_SIZE];
 counter_u64_t bbr_flows_whdwr_pacing;
 counter_u64_t bbr_flows_nohdwr_pacing;
 
 counter_u64_t bbr_nohdwr_pacing_enobuf;
 counter_u64_t bbr_hdwr_pacing_enobuf;
 
 static inline uint64_t bbr_get_bw(struct tcp_bbr *bbr);
 
 /*
  * Static definitions we need for forward declarations.
  */
 static uint32_t
 bbr_get_pacing_length(struct tcp_bbr *bbr, uint16_t gain,
 		      uint32_t useconds_time, uint64_t bw);
 static uint32_t
 bbr_get_a_state_target(struct tcp_bbr *bbr, uint32_t gain);
 static void
 bbr_set_state(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t win);
 static void
 bbr_set_probebw_gains(struct tcp_bbr *bbr,  uint32_t cts, uint32_t losses);
 static void
 bbr_substate_change(struct tcp_bbr *bbr, uint32_t cts, int line,
 		    int dolog);
 static uint32_t
 bbr_get_target_cwnd(struct tcp_bbr *bbr, uint64_t bw, uint32_t gain);
 static void
 bbr_state_change(struct tcp_bbr *bbr, uint32_t cts, int32_t epoch,
 		 int32_t pkt_epoch, uint32_t losses);
 static uint32_t
 bbr_calc_thresh_rack(struct tcp_bbr *bbr, uint32_t srtt, uint32_t cts,
 		     struct bbr_sendmap *rsm);
 static uint32_t
 bbr_initial_cwnd(struct tcp_bbr *bbr, struct tcpcb *tp);
 static uint32_t
 bbr_calc_thresh_tlp(struct tcpcb *tp, struct tcp_bbr *bbr,
 		    struct bbr_sendmap *rsm, uint32_t srtt, uint32_t cts);
 static void
 bbr_exit_persist(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts,
 		 int32_t line);
 static void
 bbr_set_state_target(struct tcp_bbr *bbr, int line);
 static void
 bbr_enter_probe_rtt(struct tcp_bbr *bbr, uint32_t cts, int32_t line);
 static void
 bbr_log_progress_event(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t tick,
 		       int event, int line);
 static void
 tcp_bbr_tso_size_check(struct tcp_bbr *bbr, uint32_t cts);
 static void
 bbr_setup_red_bw(struct tcp_bbr *bbr, uint32_t cts);
 static void
 bbr_log_rtt_shrinks(struct tcp_bbr *bbr, uint32_t cts, uint32_t applied,
 		    uint32_t rtt, uint32_t line, uint8_t is_start,
 		    uint16_t set);
 static struct bbr_sendmap *
 bbr_find_lowest_rsm(struct tcp_bbr *bbr);
 static __inline uint32_t
 bbr_get_rtt(struct tcp_bbr *bbr, int32_t rtt_type);
 static void
 bbr_log_to_start(struct tcp_bbr *bbr, uint32_t cts, uint32_t to, int32_t slot,
 		 uint8_t which);
 static void
 bbr_log_timer_var(struct tcp_bbr *bbr, int mode, uint32_t cts,
 		  uint32_t time_since_sent, uint32_t srtt,
 		  uint32_t thresh, uint32_t to);
 static void
 bbr_log_hpts_diag(struct tcp_bbr *bbr, uint32_t cts, struct hpts_diag *diag);
 static void
 bbr_log_type_bbrsnd(struct tcp_bbr *bbr, uint32_t len, uint32_t slot,
 		    uint32_t del_by, uint32_t cts, uint32_t sloton,
 		    uint32_t prev_delay);
 static void
 bbr_enter_persist(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts,
 		  int32_t line);
 static void
 bbr_stop_all_timers(struct tcpcb *tp);
 static void
 bbr_exit_probe_rtt(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts);
 static void
 bbr_check_probe_rtt_limits(struct tcp_bbr *bbr, uint32_t cts);
 static void
 bbr_timer_cancel(struct tcp_bbr *bbr, int32_t line, uint32_t cts);
 static void
 bbr_log_pacing_delay_calc(struct tcp_bbr *bbr, uint16_t gain, uint32_t len,
 			  uint32_t cts, uint32_t usecs, uint64_t bw,
 			  uint32_t override, int mod);
 static int
 bbr_ctloutput(struct inpcb *inp, struct sockopt *sopt);
 
 /*
  * Return the value of the connection's rc_bbr_substate field.
  */
 static inline uint8_t
 bbr_state_val(struct tcp_bbr *bbr)
 {
 	uint8_t substate;
 
 	substate = bbr->rc_bbr_substate;
 	return (substate);
 }
 
 /*
  * Compute the minimum congestion window.
  *
  * The segment size used is the smaller of (t_maxseg less the last
  * option length) and the pacing max segment size.  It is multiplied by
  * bbr_cwnd_min_val_hs when the BBR_RTT_PROP rtt is below
  * BBR_HIGH_SPEED, and by bbr_cwnd_min_val otherwise.
  */
 static inline uint32_t
 get_min_cwnd(struct tcp_bbr *bbr)
 {
 	int32_t nsegs;
 	int seg_bytes;
 
 	seg_bytes = min((bbr->rc_tp->t_maxseg - bbr->rc_last_options),
 	    bbr->r_ctl.rc_pace_max_segs);
 	nsegs = (bbr_get_rtt(bbr, BBR_RTT_PROP) < BBR_HIGH_SPEED) ?
 	    bbr_cwnd_min_val_hs : bbr_cwnd_min_val;
 	return (nsegs * seg_bytes);
 }
 
 /*
  * Compute the persist timer value and mark the persist timer in the
  * hpts flags.
  *
  * The value is (srtt + rttvar) scaled by the backoff for the current
  * retransmit shift, clamped to [bbr_persist_min, bbr_persist_max].
  * With no srtt yet, BBR_INITIAL_RTO is used with a zero variance.
  */
 static uint32_t
 bbr_get_persists_timer_val(struct tcpcb *tp, struct tcp_bbr *bbr)
 {
 	uint64_t rtt_us, rttvar_us;
 	uint64_t tmo;
 
 	bbr->r_ctl.rc_hpts_flags |= PACE_TMR_PERSIT;
 	if (tp->t_srtt != 0) {
 		rtt_us = ((uint64_t)TICKS_2_USEC(tp->t_srtt) >> TCP_RTT_SHIFT);
 		rttvar_us = ((uint64_t)TICKS_2_USEC(tp->t_rttvar) >> TCP_RTT_SHIFT);
 	} else {
 		/* No RTT measurement yet. */
 		rtt_us = (uint64_t)BBR_INITIAL_RTO;
 		rttvar_us = 0;
 	}
 	TCPT_RANGESET_NOSLOP(tmo,
 	    ((rtt_us + rttvar_us) * tcp_backoff[tp->t_rxtshift]),
 	    bbr_persist_min, bbr_persist_max);
 	return ((uint32_t)tmo);
 }
 
 static uint32_t
 bbr_timer_start(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts)
 {
 	/*
 	 * Start the FR timer, we do this based on getting the first one in
 	 * the rc_tmap. Note that if its NULL we must stop the timer. in all
 	 * events we need to stop the running timer (if its running) before
 	 * starting the new one.
 	 */
 	uint32_t thresh, exp, to, srtt, time_since_sent, tstmp_touse;
 	int32_t idx;
 	int32_t is_tlp_timer = 0;
 	struct bbr_sendmap *rsm;
 
 	if (bbr->rc_all_timers_stopped) {
 		/* All timers have been stopped none are to run */
 		return (0);
 	}
 	if (bbr->rc_in_persist) {
 		/* We can't start any timer in persists */
 		return (bbr_get_persists_timer_val(tp, bbr));
 	}
 	rsm = TAILQ_FIRST(&bbr->r_ctl.rc_tmap);
 	if ((rsm == NULL) ||
 	    ((tp->t_flags & TF_SACK_PERMIT) == 0) ||
 	    (tp->t_state < TCPS_ESTABLISHED)) {
 		/* Nothing on the send map */
 activate_rxt:
 		if (SEQ_LT(tp->snd_una, tp->snd_max) || sbavail(&(tp->t_inpcb->inp_socket->so_snd))) {
 			uint64_t tov;
 
 			time_since_sent = 0;
 			rsm = TAILQ_FIRST(&bbr->r_ctl.rc_tmap);
 			if (rsm) {
 				idx = rsm->r_rtr_cnt - 1;
 				if (TSTMP_GEQ(rsm->r_tim_lastsent[idx], bbr->r_ctl.rc_tlp_rxt_last_time))
 					tstmp_touse = rsm->r_tim_lastsent[idx];
 				else
 					tstmp_touse = bbr->r_ctl.rc_tlp_rxt_last_time;
 				if (TSTMP_GT(tstmp_touse, cts))
 				    time_since_sent = cts - tstmp_touse;
 			}
 			bbr->r_ctl.rc_hpts_flags |= PACE_TMR_RXT;
 			if (tp->t_srtt == 0)
 				tov = BBR_INITIAL_RTO;
 			else
 				tov = ((uint64_t)(TICKS_2_USEC(tp->t_srtt) +
 				    ((uint64_t)TICKS_2_USEC(tp->t_rttvar) * (uint64_t)4)) >> TCP_RTT_SHIFT);
 			if (tp->t_rxtshift)
 				tov *= tcp_backoff[tp->t_rxtshift];
 			if (tov > time_since_sent)
 				tov -= time_since_sent;
 			else
 				tov = bbr->r_ctl.rc_min_to;
 			TCPT_RANGESET_NOSLOP(to, tov,
 			    (bbr->r_ctl.rc_min_rto_ms * MS_IN_USEC),
 			    (bbr->rc_max_rto_sec * USECS_IN_SECOND));
 			bbr_log_timer_var(bbr, 2, cts, 0, srtt, 0, to);
 			return (to);
 		}
 		return (0);
 	}
 	if (rsm->r_flags & BBR_ACKED) {
 		rsm = bbr_find_lowest_rsm(bbr);
 		if (rsm == NULL) {
 			/* No lowest? */
 			goto activate_rxt;
 		}
 	}
 	/* Convert from ms to usecs */
 	if (rsm->r_flags & BBR_SACK_PASSED) {
 		if ((tp->t_flags & TF_SENTFIN) &&
 		    ((tp->snd_max - tp->snd_una) == 1) &&
 		    (rsm->r_flags & BBR_HAS_FIN)) {
 			/*
 			 * We don't start a bbr rack timer if all we have is
 			 * a FIN outstanding.
 			 */
 			goto activate_rxt;
 		}
 		srtt = bbr_get_rtt(bbr, BBR_RTT_RACK);
 		thresh = bbr_calc_thresh_rack(bbr, srtt, cts, rsm);
 		idx = rsm->r_rtr_cnt - 1;
 		exp = rsm->r_tim_lastsent[idx] + thresh;
 		if (SEQ_GEQ(exp, cts)) {
 			to = exp - cts;
 			if (to < bbr->r_ctl.rc_min_to) {
 				to = bbr->r_ctl.rc_min_to;
 			}
 		} else {
 			to = bbr->r_ctl.rc_min_to;
 		}
 	} else {
 		/* Ok we need to do a TLP not RACK */
 		if (bbr->rc_tlp_in_progress != 0) {
 			/*
 			 * The previous send was a TLP.
 			 */
 			goto activate_rxt;
 		}
 		rsm = TAILQ_LAST_FAST(&bbr->r_ctl.rc_tmap, bbr_sendmap, r_tnext);
 		if (rsm == NULL) {
 			/* We found no rsm to TLP with. */
 			goto activate_rxt;
 		}
 		if (rsm->r_flags & BBR_HAS_FIN) {
 			/* If its a FIN we don't do TLP */
 			rsm = NULL;
 			goto activate_rxt;
 		}
 		time_since_sent = 0;
 		idx = rsm->r_rtr_cnt - 1;
 		if (TSTMP_GEQ(rsm->r_tim_lastsent[idx], bbr->r_ctl.rc_tlp_rxt_last_time))
 			tstmp_touse = rsm->r_tim_lastsent[idx];
 		else
 			tstmp_touse = bbr->r_ctl.rc_tlp_rxt_last_time;
 		if (TSTMP_GT(tstmp_touse, cts))
 		    time_since_sent = cts - tstmp_touse;
 		is_tlp_timer = 1;
 		srtt = bbr_get_rtt(bbr, bbr_tlp_type_to_use);
 		thresh = bbr_calc_thresh_tlp(tp, bbr, rsm, srtt, cts);
 		if (thresh > time_since_sent)
 			to = thresh - time_since_sent;
 		else
 			to = bbr->r_ctl.rc_min_to;
 		if (to > (((uint32_t)bbr->rc_max_rto_sec) * USECS_IN_SECOND)) {
 			/*
 			 * If the TLP time works out to larger than the max
 			 * RTO lets not do TLP.. just RTO.
 			 */
 			goto activate_rxt;
 		}
 		if ((bbr->rc_tlp_rtx_out == 1) &&
 		    (rsm->r_start == bbr->r_ctl.rc_last_tlp_seq)) {
 			/*
 			 * Second retransmit of the same TLP
 			 * lets not.
 			 */
 			bbr->rc_tlp_rtx_out = 0;
 			goto activate_rxt;
 		}
 		if (rsm->r_start != bbr->r_ctl.rc_last_tlp_seq) {
 			/*
 			 * The tail is no longer the last one I did a probe
 			 * on
 			 */
 			bbr->r_ctl.rc_tlp_seg_send_cnt = 0;
 			bbr->r_ctl.rc_last_tlp_seq = rsm->r_start;
 		}
 	}
 	if (is_tlp_timer == 0) {
 		BBR_STAT_INC(bbr_to_arm_rack);
 		bbr->r_ctl.rc_hpts_flags |= PACE_TMR_RACK;
 	} else {
 		bbr_log_timer_var(bbr, 1, cts, time_since_sent, srtt, thresh, to);
 		if (bbr->r_ctl.rc_tlp_seg_send_cnt > bbr_tlp_max_resend) {
 			/*
 			 * We have exceeded how many times we can retran the
 			 * current TLP timer, switch to the RTO timer.
 			 */
 			goto activate_rxt;
 		} else {
 			BBR_STAT_INC(bbr_to_arm_tlp);
 			bbr->r_ctl.rc_hpts_flags |= PACE_TMR_TLP;
 		}
 	}
 	return (to);
 }
 
 /*
  * Return r_ctl.rc_pace_min_segs less rc_last_options, as a signed
  * 32-bit value.
  */
 static inline int32_t
 bbr_minseg(struct tcp_bbr *bbr)
 {
 	int32_t seg = bbr->r_ctl.rc_pace_min_segs - bbr->rc_last_options;
 
 	return (seg);
 }
 
 /*
  * Put this connection on the hpts wheel for its next wakeup.
  *
  * Nothing is done when the inp is already on the wheel (tcp_in_hpts()) or
  * the connection is in TCPS_CLOSED / TCPS_LISTEN.  Otherwise
  * r_ctl.rc_hpts_flags is rebuilt from zero, pending pacing adjustments
  * are folded into 'slot', bbr_timer_start() supplies a timer value, a
  * delayed-ack or keep-alive timer is substituted where applicable, and the
  * inp is inserted with tcp_hpts_insert_diag() using either the pacing
  * slot or the timer value.
  *
  * bbr     - BBR state for the connection.
  * tp      - TCP control block.
  * cts     - current timestamp.
  * frm     - caller identifier; stored in bbr->bbr_timer_src and logged.
  * slot    - pacing delay requested by the caller; when 0 and a previous
  *           delay is recorded, what remains of rc_last_delay_val is used.
  * tot_len - only passed through to bbr_log_type_bbrsnd().
  */
 static void
 bbr_start_hpts_timer(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t cts, int32_t frm, int32_t slot, uint32_t tot_len)
 {
 	struct inpcb *inp;
 	struct hpts_diag diag;
 	uint32_t delayed_ack = 0;
 	uint32_t left = 0;
 	uint32_t hpts_timeout;
 	uint8_t stopped;
 	int32_t delay_calc = 0;
 	uint32_t prev_delay = 0;
 
 	inp = tp->t_inpcb;
 	if (tcp_in_hpts(inp)) {
 		/* A previous call is already set up */
 		return;
 	}
 	if ((tp->t_state == TCPS_CLOSED) ||
 	    (tp->t_state == TCPS_LISTEN)) {
 		return;
 	}
 	/* Remember how much of a stopped timer was still left to run. */
 	stopped = bbr->rc_tmr_stopped;
 	if (stopped && TSTMP_GT(bbr->r_ctl.rc_timer_exp, cts)) {
 		left = bbr->r_ctl.rc_timer_exp - cts;
 	}
 	bbr->r_ctl.rc_hpts_flags = 0;
 	bbr->r_ctl.rc_timer_exp = 0;
 	prev_delay = bbr->r_ctl.rc_last_delay_val;
 	if (bbr->r_ctl.rc_last_delay_val &&
 	    (slot == 0)) {
 		/*
 		 * If a previous pacer delay was in place we
 		 * are not coming from the output side (where
 		 * we calculate a delay, more likely a timer).
 		 */
 		slot = bbr->r_ctl.rc_last_delay_val;
 		if (TSTMP_GT(cts, bbr->rc_pacer_started)) {
 			/* Compensate for time passed  */
 			delay_calc = cts - bbr->rc_pacer_started;
 			if (delay_calc <= slot)
 				slot -= delay_calc;
 		}
 	}
 	/* Do we have early to make up for by pushing out the pacing time? */
 	if (bbr->r_agg_early_set) {
 		bbr_log_pacing_delay_calc(bbr, 0, bbr->r_ctl.rc_agg_early, cts, slot, 0, bbr->r_agg_early_set, 2);
 		slot += bbr->r_ctl.rc_agg_early;
 		bbr->r_ctl.rc_agg_early = 0;
 		bbr->r_agg_early_set = 0;
 	}
 	/* Are we running a total debt that needs to be compensated for? */
 	if (bbr->r_ctl.rc_hptsi_agg_delay) {
 		if (slot > bbr->r_ctl.rc_hptsi_agg_delay) {
 			/* We nuke the delay */
 			slot -= bbr->r_ctl.rc_hptsi_agg_delay;
 			bbr->r_ctl.rc_hptsi_agg_delay = 0;
 		} else {
 			/* We nuke some of the delay, put in a minimal 100usecs  */
 			bbr->r_ctl.rc_hptsi_agg_delay -= slot;
 			bbr->r_ctl.rc_last_delay_val = slot = 100;
 		}
 	}
 	bbr->r_ctl.rc_last_delay_val = slot;
 	/* bbr_timer_start() also ORs the chosen timer into rc_hpts_flags. */
 	hpts_timeout = bbr_timer_start(tp, bbr, cts);
 	if (tp->t_flags & TF_DELACK) {
 		if (bbr->rc_in_persist == 0) {
 			delayed_ack = bbr_delack_time;
 		} else {
 			/*
 			 * We are in persists and have
 			 * gotten a new data element.
 			 */
 			if (hpts_timeout > bbr_delack_time) {
 				/*
 				 * Lets make the persists timer (which acks)
 				 * be the smaller of hpts_timeout and bbr_delack_time.
 				 */
 				hpts_timeout = bbr_delack_time;
 			}
 		}
 	}
 	if (delayed_ack &&
 	    ((hpts_timeout == 0) ||
 	     (delayed_ack < hpts_timeout))) {
 		/* We need a Delayed ack timer */
 		bbr->r_ctl.rc_hpts_flags = PACE_TMR_DELACK;
 		hpts_timeout = delayed_ack;
 	}
 	if (slot) {
 		/* Mark that we have a pacing timer up */
 		BBR_STAT_INC(bbr_paced_segments);
 		bbr->r_ctl.rc_hpts_flags |= PACE_PKT_OUTPUT;
 	}
 	/*
 	 * If no timers are going to run and we will fall off the hptsi
 	 * wheel, we resort to a keep-alive timer if its configured.
 	 */
 	if ((hpts_timeout == 0) &&
 	    (slot == 0)) {
 		if ((V_tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) &&
 		    (tp->t_state <= TCPS_CLOSING)) {
 			/*
 			 * Ok we have no timer (persists, rack, tlp, rxt  or
 			 * del-ack), we don't have segments being paced. So
 			 * all that is left is the keepalive timer.
 			 */
 			if (TCPS_HAVEESTABLISHED(tp->t_state)) {
 				hpts_timeout = TICKS_2_USEC(TP_KEEPIDLE(tp));
 			} else {
 				hpts_timeout = TICKS_2_USEC(TP_KEEPINIT(tp));
 			}
 			bbr->r_ctl.rc_hpts_flags |= PACE_TMR_KEEP;
 		}
 	}
 	if (left && (stopped & (PACE_TMR_KEEP | PACE_TMR_DELACK)) ==
 	    (bbr->r_ctl.rc_hpts_flags & PACE_TMR_MASK)) {
 		/*
 		 * RACK, TLP, persists and RXT timers all are restartable
 		 * based on actions input .. i.e we received a packet (ack
 		 * or sack) and that changes things (rw, or snd_una etc).
 		 * Thus we can restart them with a new value. For
 		 * keep-alive, delayed_ack we keep track of what was left
 		 * and restart the timer with a smaller value.
 		 */
 		if (left < hpts_timeout)
 			hpts_timeout = left;
 	}
 	if (bbr->r_ctl.rc_incr_tmrs && slot &&
 	    (bbr->r_ctl.rc_hpts_flags & (PACE_TMR_TLP|PACE_TMR_RXT))) {
 		/*
 		 * If configured to do so, and the timer is either
 		 * the TLP or RXT timer, we need to increase the timeout
 		 * by the pacing time. Consider the bottleneck at my
 		 * machine as an example, we are sending something
 		 * to start a TLP on. The last packet won't be emitted
 		 * fully until the pacing time (the bottleneck will hold
 		 * the data in place). Once the packet is emitted that
 		 * is when we want to start waiting for the TLP. This
 		 * is most evident with hardware pacing (where the nic
 		 * is holding the packet(s) before emitting). But it
 		 * can also show up in the network so we do it for all
 		 * cases. Technically we would take off one packet from
 		 * this extra delay but this is easier and being more
 		 * conservative is probably better.
 		 */
 		hpts_timeout += slot;
 	}
 	if (hpts_timeout) {
 		/*
 		 * Hack alert for now we can't time-out over 2147 seconds (a
 		 * bit more than 35min)
 		 */
 		if (hpts_timeout > 0x7ffffffe)
 			hpts_timeout = 0x7ffffffe;
 		bbr->r_ctl.rc_timer_exp = cts + hpts_timeout;
 	} else
 		bbr->r_ctl.rc_timer_exp = 0;
 	/*
 	 * Pacing wins the wheel insertion when a slot is set and either
 	 * google mode is on, an output error was seen, or the slot expires
 	 * no later than the timer.  Otherwise the timer value is inserted.
 	 */
 	if ((slot) &&
 	    (bbr->rc_use_google ||
 	     bbr->output_error_seen ||
 	     (slot <= hpts_timeout))  ) {
 		/*
 		 * Tell LRO that it can queue packets while
 		 * we pace.
 		 */
 		bbr->rc_inp->inp_flags2 |= INP_MBUF_QUEUE_READY;
 		if ((bbr->r_ctl.rc_hpts_flags & PACE_TMR_RACK) &&
 		    (bbr->rc_cwnd_limited == 0)) {
 			/*
 			 * If we are not cwnd limited and we
 			 * are running a rack timer we put on
 			 * the do not disturb even for sack.
 			 */
 			inp->inp_flags2 |= INP_DONT_SACK_QUEUE;
 		} else
 			inp->inp_flags2 &= ~INP_DONT_SACK_QUEUE;
 		bbr->rc_pacer_started = cts;
 
 		(void)tcp_hpts_insert_diag(tp->t_inpcb, HPTS_USEC_TO_SLOTS(slot),
 					   __LINE__, &diag);
 		bbr->rc_timer_first = 0;
 		bbr->bbr_timer_src = frm;
 		bbr_log_to_start(bbr, cts, hpts_timeout, slot, 1);
 		bbr_log_hpts_diag(bbr, cts, &diag);
 	} else if (hpts_timeout) {
 		(void)tcp_hpts_insert_diag(tp->t_inpcb, HPTS_USEC_TO_SLOTS(hpts_timeout),
 					   __LINE__, &diag);
 		/*
 		 * We add the flag here as well if the slot is set,
 		 * since hpts will call in to clear the queue first before
 		 * calling the output routine (which does our timers).
 		 * We don't want to set the flag if its just a timer
 		 * else the arrival of data might (that causes us
 		 * to send more) might get delayed. Imagine being
 		 * on a keep-alive timer and a request comes in for
 		 * more data.
 		 */
 		if (slot)
 			bbr->rc_pacer_started = cts;
 		if ((bbr->r_ctl.rc_hpts_flags & PACE_TMR_RACK) &&
 		    (bbr->rc_cwnd_limited == 0)) {
 			/*
 			 * For a rack timer, don't wake us even
 			 * if a sack arrives as long as we are
 			 * not cwnd limited.
 			 */
 			bbr->rc_inp->inp_flags2 |= INP_MBUF_QUEUE_READY;
 			inp->inp_flags2 |= INP_DONT_SACK_QUEUE;
 		} else {
 			/* All other timers wake us up */
 			bbr->rc_inp->inp_flags2 &= ~INP_MBUF_QUEUE_READY;
 			inp->inp_flags2 &= ~INP_DONT_SACK_QUEUE;
 		}
 		bbr->bbr_timer_src = frm;
 		bbr_log_to_start(bbr, cts, hpts_timeout, slot, 0);
 		bbr_log_hpts_diag(bbr, cts, &diag);
 		bbr->rc_timer_first = 1;
 	}
 	bbr->rc_tmr_stopped = 0;
 	bbr_log_type_bbrsnd(bbr, tot_len, slot, delay_calc, cts, frm, prev_delay);
 }
 
 /*
  * Check that the timer recorded in r_ctl.rc_hpts_flags still matches the
  * connection state and replace it when it does not.
  *
  * Each early return below is a case where the running timer is accepted
  * as is.  On a mismatch: if no paced output is pending (PACE_PKT_OUTPUT
  * clear) the inp is removed from the hpts, the timer is cancelled and
  * bbr_start_hpts_timer() is called again with the saved delay; otherwise
  * only the timer type and rc_timer_exp are recomputed.
  *
  * tp  - TCP control block.
  * bbr - BBR state for the connection.
  * cts - current timestamp.
  * sb  - not referenced in this function.
  */
 static void
 bbr_timer_audit(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts, struct sockbuf *sb)
 {
 	/*
 	 * We received an ack, and then did not call send or were bounced
 	 * out due to the hpts was running. Now a timer is up as well, is it
 	 * the right timer?
 	 */
 	struct inpcb *inp;
 	struct bbr_sendmap *rsm;
 	uint32_t hpts_timeout;
 	int tmr_up;
 
 	/* Which timer (if any) is currently marked as running. */
 	tmr_up = bbr->r_ctl.rc_hpts_flags & PACE_TMR_MASK;
 	if (bbr->rc_in_persist && (tmr_up == PACE_TMR_PERSIT))
 		return;
 	rsm = TAILQ_FIRST(&bbr->r_ctl.rc_tmap);
 	if (((rsm == NULL) || (tp->t_state < TCPS_ESTABLISHED)) &&
 	    (tmr_up == PACE_TMR_RXT)) {
 		/* Should be an RXT */
 		return;
 	}
 	inp = bbr->rc_inp;
 	if (rsm == NULL) {
 		/* Nothing outstanding? */
 		if (tp->t_flags & TF_DELACK) {
 			if (tmr_up == PACE_TMR_DELACK)
 				/*
 				 * We are supposed to have delayed ack up
 				 * and we do
 				 */
 				return;
 		} else if (sbavail(&inp->inp_socket->so_snd) &&
 		    (tmr_up == PACE_TMR_RXT)) {
 			/*
 			 * if we hit enobufs then we would expect the
 			 * possibility of nothing outstanding and the RXT up
 			 * (and the hptsi timer).
 			 */
 			return;
 		} else if (((V_tcp_always_keepalive ||
 			    inp->inp_socket->so_options & SO_KEEPALIVE) &&
 			    (tp->t_state <= TCPS_CLOSING)) &&
 			    (tmr_up == PACE_TMR_KEEP) &&
 		    (tp->snd_max == tp->snd_una)) {
 			/* We should have keep alive up and we do */
 			return;
 		}
 	}
 	if (rsm && (rsm->r_flags & BBR_SACK_PASSED)) {
 		if ((tp->t_flags & TF_SENTFIN) &&
 		    ((tp->snd_max - tp->snd_una) == 1) &&
 		    (rsm->r_flags & BBR_HAS_FIN)) {
 			/* needs to be a RXT */
 			if (tmr_up == PACE_TMR_RXT)
 				return;
 			else
 				goto wrong_timer;
 		} else if (tmr_up == PACE_TMR_RACK)
 			return;
 		else
 			goto wrong_timer;
 	} else if (rsm && (tmr_up == PACE_TMR_RACK)) {
 		/* Rack timer has priority if we have data out */
 		return;
 	} else if (SEQ_GT(tp->snd_max, tp->snd_una) &&
 		    ((tmr_up == PACE_TMR_TLP) ||
 	    (tmr_up == PACE_TMR_RXT))) {
 		/*
 		 * Either a TLP or RXT is fine if no sack-passed is in place
 		 * and data is outstanding.
 		 */
 		return;
 	} else if (tmr_up == PACE_TMR_DELACK) {
 		/*
 		 * If the delayed ack was going to go off before the
 		 * rtx/tlp/rack timer were going to expire, then that would
 		 * be the timer in control. Note we don't check the time
 		 * here trusting the code is correct.
 		 */
 		return;
 	}
 	if (SEQ_GT(tp->snd_max, tp->snd_una) &&
 	    ((tmr_up == PACE_TMR_RXT) ||
 	     (tmr_up == PACE_TMR_TLP) ||
 	     (tmr_up == PACE_TMR_RACK))) {
 		/*
 		 * We have outstanding data and
 		 * we *do* have a RACK, TLP or RXT
 		 * timer running. We won't restart
 		 * anything here since thats probably ok we
 		 * will get called with some timer here shortly.
 		 */
 		return;
 	}
 	/*
 	 * Ok the timer originally started is not what we want now. We will
 	 * force the hpts to be stopped if any, and restart with the slot
 	 * set to what was in the saved slot.
 	 */
 wrong_timer:
 	if ((bbr->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0) {
 		if (tcp_in_hpts(inp))
 			tcp_hpts_remove(inp);
 		bbr_timer_cancel(bbr, __LINE__, cts);
 		bbr_start_hpts_timer(bbr, tp, cts, 1, bbr->r_ctl.rc_last_delay_val,
 		    0);
 	} else {
 		/*
 		 * Output is hptsi so we just need to switch the type of
 		 * timer. We don't bother with keep-alive, since when we
 		 * jump through the output, it will start the keep-alive if
 		 * nothing is sent.
 		 *
 		 * We only need a delayed-ack added and or the hpts_timeout.
 		 */
 		hpts_timeout = bbr_timer_start(tp, bbr, cts);
 		if (tp->t_flags & TF_DELACK) {
 			/*
 			 * NOTE(review): these plain assignments replace the
 			 * whole flag word, so PACE_PKT_OUTPUT (known to be set
 			 * in this branch) is dropped as well -- confirm that
 			 * is intended.
 			 */
 			if (hpts_timeout == 0) {
 				hpts_timeout = bbr_delack_time;
 				bbr->r_ctl.rc_hpts_flags = PACE_TMR_DELACK;
 			}
 			else if (hpts_timeout > bbr_delack_time) {
 				hpts_timeout = bbr_delack_time;
 				bbr->r_ctl.rc_hpts_flags = PACE_TMR_DELACK;
 			}
 		}
 		if (hpts_timeout) {
 			/* Same 0x7ffffffe ceiling as bbr_start_hpts_timer(). */
 			if (hpts_timeout > 0x7ffffffe)
 				hpts_timeout = 0x7ffffffe;
 			bbr->r_ctl.rc_timer_exp = cts + hpts_timeout;
 		}
 	}
 }
 
 /*
  * Value reported by sysctl_bbr_clear_lost(); the handler sets it back to
  * 0 after processing a write.
  */
 int32_t bbr_clear_lost = 0;
 
 /*
  * Return how much time lies between 'earlier_time' and the current
  * timestamp 'cts'.
  *
  * Normally cts is at or after earlier_time and the plain difference is
  * returned.  When cts is behind, there are two explanations:
  *  - clock skew / locking order gave us the "earlier" stamp slightly
  *    after cts.  That shows up within 10ms or so, so when cts trails by
  *    no more than 10000 we report 0;
  *  - the 32-bit counter wrapped (it does so every 70 min or so).  Then
  *    the ordinary unsigned subtraction already yields the large elapsed
  *    time we want.
  * A 64-bit timestamp would make this special casing unnecessary.
  */
 static inline uint32_t
 bbr_calc_time(uint32_t cts, uint32_t earlier_time)
 {
 	uint32_t elapsed = cts - earlier_time;
 
 	/* The common case: time moved forward. */
 	if (TSTMP_GEQ(cts, earlier_time))
 		return (elapsed);
 	/* Behind by a small amount is skew, anything more is a wrap. */
 	return (TSTMP_GEQ((cts + 10000), earlier_time) ? 0 : elapsed);
 }
 
 /*
  * Sysctl handler that zeroes groups of BBR counters.
  *
  * The current bbr_clear_lost value is copied out first.  When a new value
  * is supplied it selects what to clear:
  *   1 - bbr_state_lost, bbr_state_time and bbr_state_resend
  *   2 - bbr_opts_arry
  *   3 - bbr_stat_arry
  *   4 - bbr_out_size
  * Any other value clears nothing.  bbr_clear_lost is reset to 0 after a
  * write is processed.  Returns 0 or the error from SYSCTL_OUT/SYSCTL_IN.
  */
 static int
 sysctl_bbr_clear_lost(SYSCTL_HANDLER_ARGS)
 {
 	uint32_t which;
 	int32_t rv;
 
 	rv = SYSCTL_OUT(req, &bbr_clear_lost, sizeof(uint32_t));
 	if (rv || req->newptr == NULL)
 		return (rv);
 
 	rv = SYSCTL_IN(req, &which, sizeof(uint32_t));
 	if (rv)
 		return (rv);
 
 	switch (which) {
 	case 1:
 #ifdef BBR_INVARIANTS
 		printf("Clearing BBR lost counters\n");
 #endif
 		COUNTER_ARRAY_ZERO(bbr_state_lost, BBR_MAX_STAT);
 		COUNTER_ARRAY_ZERO(bbr_state_time, BBR_MAX_STAT);
 		COUNTER_ARRAY_ZERO(bbr_state_resend, BBR_MAX_STAT);
 		break;
 	case 2:
 #ifdef BBR_INVARIANTS
 		printf("Clearing BBR option counters\n");
 #endif
 		COUNTER_ARRAY_ZERO(bbr_opts_arry, BBR_OPTS_SIZE);
 		break;
 	case 3:
 #ifdef BBR_INVARIANTS
 		printf("Clearing BBR stats counters\n");
 #endif
 		COUNTER_ARRAY_ZERO(bbr_stat_arry, BBR_STAT_SIZE);
 		break;
 	case 4:
 #ifdef BBR_INVARIANTS
 		printf("Clearing BBR out-size counters\n");
 #endif
 		COUNTER_ARRAY_ZERO(bbr_out_size, TCP_MSS_ACCT_SIZE);
 		break;
 	default:
 		/* Unknown selector: nothing to clear. */
 		break;
 	}
 	bbr_clear_lost = 0;
 	return (0);
 }
 
 static void
 bbr_init_sysctls(void)
 {
 	struct sysctl_oid *bbr_probertt;
 	struct sysctl_oid *bbr_hptsi;
 	struct sysctl_oid *bbr_measure;
 	struct sysctl_oid *bbr_cwnd;
 	struct sysctl_oid *bbr_timeout;
 	struct sysctl_oid *bbr_states;
 	struct sysctl_oid *bbr_startup;
 	struct sysctl_oid *bbr_policer;
 
 	/* Probe rtt controls */
 	bbr_probertt = SYSCTL_ADD_NODE(&bbr_sysctl_ctx,
 	    SYSCTL_CHILDREN(bbr_sysctl_root),
 	    OID_AUTO,
 	    "probertt",
 	    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
 	    "");
 	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
 	    SYSCTL_CHILDREN(bbr_probertt),
 	    OID_AUTO, "gain", CTLFLAG_RW,
 	    &bbr_rttprobe_gain, 192,
 	    "What is the filter gain drop in probe_rtt (0=disable)?");
 	SYSCTL_ADD_U32(&bbr_sysctl_ctx,
 	    SYSCTL_CHILDREN(bbr_probertt),
 	    OID_AUTO, "cwnd", CTLFLAG_RW,
 	    &bbr_rtt_probe_cwndtarg, 4,
 	    "How many mss's are outstanding during probe-rtt");
 	SYSCTL_ADD_U32(&bbr_sysctl_ctx,
 	    SYSCTL_CHILDREN(bbr_probertt),
 	    OID_AUTO, "int", CTLFLAG_RW,
 	    &bbr_rtt_probe_limit, 4000000,
 	    "If RTT has not shrank in this many micro-seconds enter probe-rtt");
 	SYSCTL_ADD_U32(&bbr_sysctl_ctx,
 	    SYSCTL_CHILDREN(bbr_probertt),
 	    OID_AUTO, "mintime", CTLFLAG_RW,
 	    &bbr_rtt_probe_time, 200000,
 	    "How many microseconds in probe-rtt");
 	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
 	    SYSCTL_CHILDREN(bbr_probertt),
 	    OID_AUTO, "filter_len_sec", CTLFLAG_RW,
 	    &bbr_filter_len_sec, 6,
 	    "How long in seconds does the rttProp filter run?");
 	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
 	    SYSCTL_CHILDREN(bbr_probertt),
 	    OID_AUTO, "drain_rtt", CTLFLAG_RW,
 	    &bbr_drain_rtt, BBR_SRTT,
 	    "What is the drain rtt to use in probeRTT (rtt_prop=0, rtt_rack=1, rtt_pkt=2, rtt_srtt=3?");
 	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
 	    SYSCTL_CHILDREN(bbr_probertt),
 	    OID_AUTO, "can_force", CTLFLAG_RW,
 	    &bbr_can_force_probertt, 0,
 	    "If we keep setting new low rtt's but delay going in probe-rtt can we force in??");
 	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
 	    SYSCTL_CHILDREN(bbr_probertt),
 	    OID_AUTO, "enter_sets_force", CTLFLAG_RW,
 	    &bbr_probertt_sets_rtt, 0,
 	    "In NF mode, do we imitate google_mode and set the rttProp on entry to probe-rtt?");
 	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
 	    SYSCTL_CHILDREN(bbr_probertt),
 	    OID_AUTO, "can_adjust", CTLFLAG_RW,
 	    &bbr_can_adjust_probertt, 1,
 	    "Can we dynamically adjust the probe-rtt limits and times?");
 	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
 	    SYSCTL_CHILDREN(bbr_probertt),
 	    OID_AUTO, "is_ratio", CTLFLAG_RW,
 	    &bbr_is_ratio, 0,
 	    "is the limit to filter a ratio?");
 	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
 	    SYSCTL_CHILDREN(bbr_probertt),
 	    OID_AUTO, "use_cwnd", CTLFLAG_RW,
 	    &bbr_prtt_slam_cwnd, 0,
 	    "Should we set/recover cwnd?");
 	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
 	    SYSCTL_CHILDREN(bbr_probertt),
 	    OID_AUTO, "can_use_ts", CTLFLAG_RW,
 	    &bbr_can_use_ts_for_rtt, 1,
 	    "Can we use the ms timestamp if available for retransmistted rtt calculations?");
 
 	/* Pacing controls */
 	bbr_hptsi = SYSCTL_ADD_NODE(&bbr_sysctl_ctx,
 	    SYSCTL_CHILDREN(bbr_sysctl_root),
 	    OID_AUTO,
 	    "pacing",
 	    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
 	    "");
 	SYSCTL_ADD_U32(&bbr_sysctl_ctx,
 	    SYSCTL_CHILDREN(bbr_hptsi),
 	    OID_AUTO, "hw_pacing", CTLFLAG_RW,
 	    &bbr_allow_hdwr_pacing, 1,
 	    "Do we allow hardware pacing?");
 	SYSCTL_ADD_U32(&bbr_sysctl_ctx,
 	    SYSCTL_CHILDREN(bbr_hptsi),
 	    OID_AUTO, "hw_pacing_limit", CTLFLAG_RW,
 	    &bbr_hardware_pacing_limit, 4000,
 	    "Do we have a limited number of connections for pacing chelsio (0=no limit)?");
 	SYSCTL_ADD_U32(&bbr_sysctl_ctx,
 	    SYSCTL_CHILDREN(bbr_hptsi),
 	    OID_AUTO, "hw_pacing_adj", CTLFLAG_RW,
 	    &bbr_hdwr_pace_adjust, 2,
 	    "Multiplier to calculated tso size?");
 	SYSCTL_ADD_U32(&bbr_sysctl_ctx,
 	    SYSCTL_CHILDREN(bbr_hptsi),
 	    OID_AUTO, "hw_pacing_floor", CTLFLAG_RW,
 	    &bbr_hdwr_pace_floor, 1,
 	    "Do we invoke the hardware pacing floor?");
 	SYSCTL_ADD_U32(&bbr_sysctl_ctx,
 	    SYSCTL_CHILDREN(bbr_hptsi),
 	    OID_AUTO, "hw_pacing_delay_cnt", CTLFLAG_RW,
 	    &bbr_hdwr_pacing_delay_cnt, 10,
 	    "How many packets must be sent after hdwr pacing is enabled");
 	SYSCTL_ADD_U32(&bbr_sysctl_ctx,
 	    SYSCTL_CHILDREN(bbr_hptsi),
 	    OID_AUTO, "bw_cross", CTLFLAG_RW,
 	    &bbr_cross_over, 3000000,
 	    "What is the point where we cross over to linux like TSO size set");
 	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
 	    SYSCTL_CHILDREN(bbr_hptsi),
 	    OID_AUTO, "seg_deltarg", CTLFLAG_RW,
 	    &bbr_hptsi_segments_delay_tar, 7000,
 	    "What is the worse case delay target for hptsi < 48Mbp connections");
 	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
 	    SYSCTL_CHILDREN(bbr_hptsi),
 	    OID_AUTO, "enet_oh", CTLFLAG_RW,
 	    &bbr_include_enet_oh, 0,
 	    "Do we include the ethernet overhead in calculating pacing delay?");
 	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
 	    SYSCTL_CHILDREN(bbr_hptsi),
 	    OID_AUTO, "ip_oh", CTLFLAG_RW,
 	    &bbr_include_ip_oh, 1,
 	    "Do we include the IP overhead in calculating pacing delay?");
 	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
 	    SYSCTL_CHILDREN(bbr_hptsi),
 	    OID_AUTO, "tcp_oh", CTLFLAG_RW,
 	    &bbr_include_tcp_oh, 0,
 	    "Do we include the TCP overhead in calculating pacing delay?");
 	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
 	    SYSCTL_CHILDREN(bbr_hptsi),
 	    OID_AUTO, "google_discount", CTLFLAG_RW,
 	    &bbr_google_discount, 10,
 	    "What is the default google discount percentage wise for pacing (11 = 1.1%%)?");
 	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
 	    SYSCTL_CHILDREN(bbr_hptsi),
 	    OID_AUTO, "all_get_min", CTLFLAG_RW,
 	    &bbr_all_get_min, 0,
 	    "If you are less than a MSS do you just get the min?");
 	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
 	    SYSCTL_CHILDREN(bbr_hptsi),
 	    OID_AUTO, "tso_min", CTLFLAG_RW,
 	    &bbr_hptsi_bytes_min, 1460,
 	    "For 0 -> 24Mbps what is floor number of segments for TSO");
 	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
 	    SYSCTL_CHILDREN(bbr_hptsi),
 	    OID_AUTO, "seg_tso_max", CTLFLAG_RW,
 	    &bbr_hptsi_segments_max, 6,
 	    "For 0 -> 24Mbps what is top number of segments for TSO");
 	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
 	    SYSCTL_CHILDREN(bbr_hptsi),
 	    OID_AUTO, "seg_floor", CTLFLAG_RW,
 	    &bbr_hptsi_segments_floor, 1,
 	    "Minimum TSO size we will fall too in segments");
 	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
 	    SYSCTL_CHILDREN(bbr_hptsi),
 	    OID_AUTO, "utter_max", CTLFLAG_RW,
 	    &bbr_hptsi_utter_max, 0,
 	    "The absolute maximum that any pacing (outside of hardware) can be");
 	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
 	    SYSCTL_CHILDREN(bbr_hptsi),
 	    OID_AUTO, "seg_divisor", CTLFLAG_RW,
 	    &bbr_hptsi_per_second, 100,
 	    "What is the divisor in our hptsi TSO calculation 512Mbps < X > 24Mbps ");
 	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
 	    SYSCTL_CHILDREN(bbr_hptsi),
 	    OID_AUTO, "srtt_mul", CTLFLAG_RW,
 	    &bbr_hptsi_max_mul, 1,
 	    "The multiplier for pace len max");
 	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
 	    SYSCTL_CHILDREN(bbr_hptsi),
 	    OID_AUTO, "srtt_div", CTLFLAG_RW,
 	    &bbr_hptsi_max_div, 2,
 	    "The divisor for pace len max");
 	/* Measurement controls */
 	bbr_measure = SYSCTL_ADD_NODE(&bbr_sysctl_ctx,
 	    SYSCTL_CHILDREN(bbr_sysctl_root),
 	    OID_AUTO,
 	    "measure",
 	    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
 	    "Measurement controls");
 	SYSCTL_ADD_U32(&bbr_sysctl_ctx,
 	    SYSCTL_CHILDREN(bbr_measure),
 	    OID_AUTO, "min_i_bw", CTLFLAG_RW,
 	    &bbr_initial_bw_bps, 62500,
 	    "Minimum initial b/w in bytes per second");
 	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
 	    SYSCTL_CHILDREN(bbr_measure),
 	    OID_AUTO, "no_sack_needed", CTLFLAG_RW,
 	    &bbr_sack_not_required, 0,
 	    "Do we allow bbr to run on connections not supporting SACK?");
 	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
 	    SYSCTL_CHILDREN(bbr_measure),
 	    OID_AUTO, "use_google", CTLFLAG_RW,
 	    &bbr_use_google_algo, 0,
 	    "Use has close to google V1.0 has possible?");
 	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
 	    SYSCTL_CHILDREN(bbr_measure),
 	    OID_AUTO, "ts_limiting", CTLFLAG_RW,
 	    &bbr_ts_limiting, 1,
 	    "Do we attempt to use the peers timestamp to limit b/w caculations?");
 	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
 	    SYSCTL_CHILDREN(bbr_measure),
 	    OID_AUTO, "ts_can_raise", CTLFLAG_RW,
 	    &bbr_ts_can_raise, 0,
 	    "Can we raise the b/w via timestamp b/w calculation?");
 	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
 	    SYSCTL_CHILDREN(bbr_measure),
 	    OID_AUTO, "ts_delta", CTLFLAG_RW,
 	    &bbr_min_usec_delta, 20000,
 	    "How long in usec between ts of our sends in ts validation code?");
 	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
 	    SYSCTL_CHILDREN(bbr_measure),
 	    OID_AUTO, "ts_peer_delta", CTLFLAG_RW,
 	    &bbr_min_peer_delta, 20,
 	    "What min numerical value should be between the peer deltas?");
 	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
 	    SYSCTL_CHILDREN(bbr_measure),
 	    OID_AUTO, "ts_delta_percent", CTLFLAG_RW,
 	    &bbr_delta_percent, 150,
 	    "What percentage (150 = 15.0) do we allow variance for?");
 	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
 	    SYSCTL_CHILDREN(bbr_measure),
 	    OID_AUTO, "min_measure_good_bw", CTLFLAG_RW,
 	    &bbr_min_measurements_req, 1,
 	    "What is the minimum measurement count we need before we switch to our b/w estimate");
 	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
 	    SYSCTL_CHILDREN(bbr_measure),
 	    OID_AUTO, "min_measure_before_pace", CTLFLAG_RW,
 	    &bbr_no_pacing_until, 4,
 	    "How many pkt-epoch's (0 is off) do we need before pacing is on?");
 	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
 	    SYSCTL_CHILDREN(bbr_measure),
 	    OID_AUTO, "quanta", CTLFLAG_RW,
 	    &bbr_quanta, 2,
 	    "Extra quanta to add when calculating the target (ID section 4.2.3.2).");
 	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
 	    SYSCTL_CHILDREN(bbr_measure),
 	    OID_AUTO, "noretran", CTLFLAG_RW,
 	    &bbr_no_retran, 0,
 	    "Should google mode not use retransmission measurements for the b/w estimation?");
 	/* State controls */
 	bbr_states = SYSCTL_ADD_NODE(&bbr_sysctl_ctx,
 	    SYSCTL_CHILDREN(bbr_sysctl_root),
 	    OID_AUTO,
 	    "states",
 	    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
 	    "State controls");
 	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
 	    SYSCTL_CHILDREN(bbr_states),
 	    OID_AUTO, "idle_restart", CTLFLAG_RW,
 	    &bbr_uses_idle_restart, 0,
 	    "Do we use a new special idle_restart state to ramp back up quickly?");
 	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
 	    SYSCTL_CHILDREN(bbr_states),
 	    OID_AUTO, "idle_restart_threshold", CTLFLAG_RW,
 	    &bbr_idle_restart_threshold, 100000,
 	    "How long must we be idle before we restart??");
 	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
 	    SYSCTL_CHILDREN(bbr_states),
 	    OID_AUTO, "use_pkt_epoch", CTLFLAG_RW,
 	    &bbr_state_is_pkt_epoch, 0,
 	    "Do we use a pkt-epoch for substate if 0 rttProp?");
 	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
 	    SYSCTL_CHILDREN(bbr_states),
 	    OID_AUTO, "startup_rtt_gain", CTLFLAG_RW,
 	    &bbr_rtt_gain_thresh, 0,
 	    "What increase in RTT triggers us to stop ignoring no-loss and possibly exit startup?");
 	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
 	    SYSCTL_CHILDREN(bbr_states),
 	    OID_AUTO, "drain_floor", CTLFLAG_RW,
 	    &bbr_drain_floor, 88,
 	    "What is the lowest we can drain (pg) too?");
 	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
 	    SYSCTL_CHILDREN(bbr_states),
 	    OID_AUTO, "drain_2_target", CTLFLAG_RW,
 	    &bbr_state_drain_2_tar, 1,
 	    "Do we drain to target in drain substate?");
 	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
 	    SYSCTL_CHILDREN(bbr_states),
 	    OID_AUTO, "gain_2_target", CTLFLAG_RW,
 	    &bbr_gain_to_target, 1,
 	    "Does probe bw gain to target??");
 	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
 	    SYSCTL_CHILDREN(bbr_states),
 	    OID_AUTO, "gain_extra_time", CTLFLAG_RW,
 	    &bbr_gain_gets_extra_too, 1,
 	    "Does probe bw gain get the extra time too?");
 	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
 	    SYSCTL_CHILDREN(bbr_states),
 	    OID_AUTO, "ld_div", CTLFLAG_RW,
 	    &bbr_drain_drop_div, 5,
 	    "Long drain drop divider?");
 	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
 	    SYSCTL_CHILDREN(bbr_states),
 	    OID_AUTO, "ld_mul", CTLFLAG_RW,
 	    &bbr_drain_drop_mul, 4,
 	    "Long drain drop multiplier?");
 	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
 	    SYSCTL_CHILDREN(bbr_states),
 	    OID_AUTO, "rand_ot_disc", CTLFLAG_RW,
 	    &bbr_rand_ot, 50,
 	    "Random discount of the ot?");
 	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
 	    SYSCTL_CHILDREN(bbr_states),
 	    OID_AUTO, "dr_filter_life", CTLFLAG_RW,
 	    &bbr_num_pktepo_for_del_limit, BBR_NUM_RTTS_FOR_DEL_LIMIT,
 	    "How many packet-epochs does the b/w delivery rate last?");
 	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
 	    SYSCTL_CHILDREN(bbr_states),
 	    OID_AUTO, "subdrain_applimited", CTLFLAG_RW,
 	    &bbr_sub_drain_app_limit, 0,
 	    "Does our sub-state drain invoke app limited if its long?");
 	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
 	    SYSCTL_CHILDREN(bbr_states),
 	    OID_AUTO, "use_cwnd_subdrain", CTLFLAG_RW,
 	    &bbr_sub_drain_slam_cwnd, 0,
 	    "Should we set/recover cwnd for sub-state drain?");
 	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
 	    SYSCTL_CHILDREN(bbr_states),
 	    OID_AUTO, "use_cwnd_maindrain", CTLFLAG_RW,
 	    &bbr_slam_cwnd_in_main_drain, 0,
 	    "Should we set/recover cwnd for main-state drain?");
 	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
 	    SYSCTL_CHILDREN(bbr_states),
 	    OID_AUTO, "google_gets_earlyout", CTLFLAG_RW,
 	    &google_allow_early_out, 1,
 	    "Should we allow google probe-bw/drain to exit early at flight target?");
 	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
 	    SYSCTL_CHILDREN(bbr_states),
 	    OID_AUTO, "google_exit_loss", CTLFLAG_RW,
 	    &google_consider_lost, 1,
 	    "Should we have losses exit gain of probebw in google mode??");
 	/* Startup controls */
 	bbr_startup = SYSCTL_ADD_NODE(&bbr_sysctl_ctx,
 	    SYSCTL_CHILDREN(bbr_sysctl_root),
 	    OID_AUTO,
 	    "startup",
 	    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
 	    "Startup controls");
 	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
 	    SYSCTL_CHILDREN(bbr_startup),
 	    OID_AUTO, "cheat_iwnd", CTLFLAG_RW,
 	    &bbr_sends_full_iwnd, 1,
 	    "Do we not pace but burst out initial windows has our TSO size?");
 	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
 	    SYSCTL_CHILDREN(bbr_startup),
 	    OID_AUTO, "loss_threshold", CTLFLAG_RW,
 	    &bbr_startup_loss_thresh, 2000,
 	    "In startup what is the loss threshold in a pe that will exit us from startup?");
 	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
 	    SYSCTL_CHILDREN(bbr_startup),
 	    OID_AUTO, "use_lowerpg", CTLFLAG_RW,
 	    &bbr_use_lower_gain_in_startup, 1,
 	    "Should we use a lower hptsi gain if we see loss in startup?");
 	SYSCTL_ADD_U32(&bbr_sysctl_ctx,
 	    SYSCTL_CHILDREN(bbr_startup),
 	    OID_AUTO, "gain", CTLFLAG_RW,
 	    &bbr_start_exit, 25,
 	    "What gain percent do we need to see to stay in startup??");
 	SYSCTL_ADD_U32(&bbr_sysctl_ctx,
 	    SYSCTL_CHILDREN(bbr_startup),
 	    OID_AUTO, "low_gain", CTLFLAG_RW,
 	    &bbr_low_start_exit, 15,
 	    "What gain percent do we need to see to stay in the lower gain startup??");
 	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
 	    SYSCTL_CHILDREN(bbr_startup),
 	    OID_AUTO, "loss_exit", CTLFLAG_RW,
 	    &bbr_exit_startup_at_loss, 1,
 	    "Should we exit startup at loss in an epoch if we are not gaining?");
 	/* CWND controls */
 	bbr_cwnd = SYSCTL_ADD_NODE(&bbr_sysctl_ctx,
 	    SYSCTL_CHILDREN(bbr_sysctl_root),
 	    OID_AUTO,
 	    "cwnd",
 	    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
 	    "Cwnd controls");
 	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
 	    SYSCTL_CHILDREN(bbr_cwnd),
 	    OID_AUTO, "tar_rtt", CTLFLAG_RW,
 	    &bbr_cwndtarget_rtt_touse, 0,
 	    "Target cwnd rtt measurement to use (0=rtt_prop, 1=rtt_rack, 2=pkt_rtt, 3=srtt)?");
 	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
 	    SYSCTL_CHILDREN(bbr_cwnd),
 	    OID_AUTO, "may_shrink", CTLFLAG_RW,
 	    &bbr_cwnd_may_shrink, 0,
 	    "Can the cwnd shrink if it would grow to more than the target?");
 	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
 	    SYSCTL_CHILDREN(bbr_cwnd),
 	    OID_AUTO, "max_target_limit", CTLFLAG_RW,
 	    &bbr_target_cwnd_mult_limit, 8,
 	    "Do we limit the cwnd to some multiple of the cwnd target if cwnd can't shrink 0=no?");
 	SYSCTL_ADD_U32(&bbr_sysctl_ctx,
 	    SYSCTL_CHILDREN(bbr_cwnd),
 	    OID_AUTO, "highspeed_min", CTLFLAG_RW,
 	    &bbr_cwnd_min_val_hs, BBR_HIGHSPEED_NUM_MSS,
 	    "What is the high-speed min cwnd (rttProp under 1ms)");
 	SYSCTL_ADD_U32(&bbr_sysctl_ctx,
 	    SYSCTL_CHILDREN(bbr_cwnd),
 	    OID_AUTO, "lowspeed_min", CTLFLAG_RW,
 	    &bbr_cwnd_min_val, BBR_PROBERTT_NUM_MSS,
 	    "What is the min cwnd (rttProp > 1ms)");
 	SYSCTL_ADD_U32(&bbr_sysctl_ctx,
 	    SYSCTL_CHILDREN(bbr_cwnd),
 	    OID_AUTO, "initwin", CTLFLAG_RW,
 	    &bbr_def_init_win, 10,
 	    "What is the BBR initial window, if 0 use tcp version");
 	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
 	    SYSCTL_CHILDREN(bbr_cwnd),
 	    OID_AUTO, "do_loss_red", CTLFLAG_RW,
 	    &bbr_do_red, 600,
 	    "Do we reduce the b/w at exit from recovery based on ratio of prop/srtt (800=80.0, 0=off)?");
 	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
 	    SYSCTL_CHILDREN(bbr_cwnd),
 	    OID_AUTO, "red_scale", CTLFLAG_RW,
 	    &bbr_red_scale, 20000,
 	    "What RTT do we scale with?");
 	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
 	    SYSCTL_CHILDREN(bbr_cwnd),
 	    OID_AUTO, "red_growslow", CTLFLAG_RW,
 	    &bbr_red_growth_restrict, 1,
 	    "Do we restrict cwnd growth for whats in flight?");
 	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
 	    SYSCTL_CHILDREN(bbr_cwnd),
 	    OID_AUTO, "red_div", CTLFLAG_RW,
 	    &bbr_red_div, 2,
 	    "If we reduce whats the divisor?");
 	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
 	    SYSCTL_CHILDREN(bbr_cwnd),
 	    OID_AUTO, "red_mul", CTLFLAG_RW,
 	    &bbr_red_mul, 1,
 	    "If we reduce whats the mulitiplier?");
 	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
 	    SYSCTL_CHILDREN(bbr_cwnd),
 	    OID_AUTO, "target_is_unit", CTLFLAG_RW,
 	    &bbr_target_is_bbunit, 0,
 	    "Is the state target the pacing_gain or BBR_UNIT?");
 	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
 	    SYSCTL_CHILDREN(bbr_cwnd),
 	    OID_AUTO, "drop_limit", CTLFLAG_RW,
 	    &bbr_drop_limit, 0,
 	    "Number of segments limit for drop (0=use min_cwnd w/flight)?");
 
 	/* Timeout controls */
 	bbr_timeout = SYSCTL_ADD_NODE(&bbr_sysctl_ctx,
 	    SYSCTL_CHILDREN(bbr_sysctl_root),
 	    OID_AUTO,
 	    "timeout",
 	    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
 	    "Time out controls");
 	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
 	    SYSCTL_CHILDREN(bbr_timeout),
 	    OID_AUTO, "delack", CTLFLAG_RW,
 	    &bbr_delack_time, 100000,
 	    "BBR's delayed ack time");
 	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
 	    SYSCTL_CHILDREN(bbr_timeout),
 	    OID_AUTO, "tlp_uses", CTLFLAG_RW,
 	    &bbr_tlp_type_to_use, 3,
 	    "RTT that TLP uses in its calculations, 0=rttProp, 1=Rack_rtt, 2=pkt_rtt and 3=srtt");
 	SYSCTL_ADD_U32(&bbr_sysctl_ctx,
 	    SYSCTL_CHILDREN(bbr_timeout),
 	    OID_AUTO, "persmin", CTLFLAG_RW,
 	    &bbr_persist_min, 250000,
 	    "What is the minimum time in microseconds between persists");
 	SYSCTL_ADD_U32(&bbr_sysctl_ctx,
 	    SYSCTL_CHILDREN(bbr_timeout),
 	    OID_AUTO, "persmax", CTLFLAG_RW,
 	    &bbr_persist_max, 1000000,
 	    "What is the largest delay in microseconds between persists");
 	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
 	    SYSCTL_CHILDREN(bbr_timeout),
 	    OID_AUTO, "tlp_minto", CTLFLAG_RW,
 	    &bbr_tlp_min, 10000,
 	    "TLP Min timeout in usecs");
 	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
 	    SYSCTL_CHILDREN(bbr_timeout),
 	    OID_AUTO, "tlp_dack_time", CTLFLAG_RW,
 	    &bbr_delayed_ack_time, 200000,
 	    "TLP delayed ack compensation value");
 	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
 	    SYSCTL_CHILDREN(bbr_sysctl_root),
 	    OID_AUTO, "minrto", CTLFLAG_RW,
 	    &bbr_rto_min_ms, 30,
 	    "Minimum RTO in ms");
 	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
 	    SYSCTL_CHILDREN(bbr_timeout),
 	    OID_AUTO, "maxrto", CTLFLAG_RW,
 	    &bbr_rto_max_sec, 4,
 	    "Maximum RTO in seconds -- should be at least as large as min_rto");
 	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
 	    SYSCTL_CHILDREN(bbr_timeout),
 	    OID_AUTO, "tlp_retry", CTLFLAG_RW,
 	    &bbr_tlp_max_resend, 2,
 	    "How many times does TLP retry a single segment or multiple with no ACK");
 	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
 	    SYSCTL_CHILDREN(bbr_timeout),
 	    OID_AUTO, "minto", CTLFLAG_RW,
 	    &bbr_min_to, 1000,
 	    "Minimum rack timeout in useconds");
 	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
 	    SYSCTL_CHILDREN(bbr_timeout),
 	    OID_AUTO, "pktdelay", CTLFLAG_RW,
 	    &bbr_pkt_delay, 1000,
 	    "Extra RACK time (in useconds) besides reordering thresh");
 	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
 	    SYSCTL_CHILDREN(bbr_timeout),
 	    OID_AUTO, "incr_tmrs", CTLFLAG_RW,
 	    &bbr_incr_timers, 1,
 	    "Increase the RXT/TLP timer by the pacing time used?");
 	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
 	    SYSCTL_CHILDREN(bbr_timeout),
 	    OID_AUTO, "rxtmark_sackpassed", CTLFLAG_RW,
 	    &bbr_marks_rxt_sack_passed, 0,
 	    "Mark sack passed on all those not ack'd when a RXT hits?");
 	/* Policer controls */
 	bbr_policer = SYSCTL_ADD_NODE(&bbr_sysctl_ctx,
 	    SYSCTL_CHILDREN(bbr_sysctl_root),
 	    OID_AUTO,
 	    "policer",
 	    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
 	    "Policer controls");
 	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
 	    SYSCTL_CHILDREN(bbr_policer),
 	    OID_AUTO, "detect_enable", CTLFLAG_RW,
 	    &bbr_policer_detection_enabled, 1,
 	    "Is policer detection enabled??");
 	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
 	    SYSCTL_CHILDREN(bbr_policer),
 	    OID_AUTO, "min_pes", CTLFLAG_RW,
 	    &bbr_lt_intvl_min_rtts, 4,
 	    "Minimum number of PE's?");
 	SYSCTL_ADD_U64(&bbr_sysctl_ctx,
 	    SYSCTL_CHILDREN(bbr_policer),
 	    OID_AUTO, "bwdiff", CTLFLAG_RW,
 	    &bbr_lt_bw_diff, (4000/8),
 	    "Minimal bw diff?");
 	SYSCTL_ADD_U64(&bbr_sysctl_ctx,
 	    SYSCTL_CHILDREN(bbr_policer),
 	    OID_AUTO, "bwratio", CTLFLAG_RW,
 	    &bbr_lt_bw_ratio, 8,
 	    "Minimal bw diff?");
 	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
 	    SYSCTL_CHILDREN(bbr_policer),
 	    OID_AUTO, "from_rack_rxt", CTLFLAG_RW,
 	    &bbr_policer_call_from_rack_to, 0,
 	    "Do we call the policer detection code from a rack-timeout?");
 	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
 	    SYSCTL_CHILDREN(bbr_policer),
 	    OID_AUTO, "false_postive", CTLFLAG_RW,
 	    &bbr_lt_intvl_fp, 0,
 	    "What packet epoch do we do false-positive detection at (0=no)?");
 	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
 	    SYSCTL_CHILDREN(bbr_policer),
 	    OID_AUTO, "loss_thresh", CTLFLAG_RW,
 	    &bbr_lt_loss_thresh, 196,
 	    "Loss threshold 196 = 19.6%?");
 	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
 	    SYSCTL_CHILDREN(bbr_policer),
 	    OID_AUTO, "false_postive_thresh", CTLFLAG_RW,
 	    &bbr_lt_fd_thresh, 100,
 	    "What percentage is the false detection threshold (150=15.0)?");
 	/* All the rest */
 	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
 	    SYSCTL_CHILDREN(bbr_sysctl_root),
 	    OID_AUTO, "cheat_rxt", CTLFLAG_RW,
 	    &bbr_use_rack_resend_cheat, 0,
 	    "Do we burst 1ms between sends on retransmissions (like rack)?");
 	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
 	    SYSCTL_CHILDREN(bbr_sysctl_root),
 	    OID_AUTO, "error_paceout", CTLFLAG_RW,
 	    &bbr_error_base_paceout, 10000,
 	    "When we hit an error what is the min to pace out in usec's?");
 	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
 	    SYSCTL_CHILDREN(bbr_sysctl_root),
 	    OID_AUTO, "kill_paceout", CTLFLAG_RW,
 	    &bbr_max_net_error_cnt, 10,
 	    "When we hit this many errors in a row, kill the session?");
 	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
 	    SYSCTL_CHILDREN(bbr_sysctl_root),
 	    OID_AUTO, "data_after_close", CTLFLAG_RW,
 	    &bbr_ignore_data_after_close, 1,
 	    "Do we hold off sending a RST until all pending data is ack'd");
 	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
 	    SYSCTL_CHILDREN(bbr_sysctl_root),
 	    OID_AUTO, "resend_use_tso", CTLFLAG_RW,
 	    &bbr_resends_use_tso, 0,
 	    "Can resends use TSO?");
 	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
 	    SYSCTL_CHILDREN(bbr_sysctl_root),
 	    OID_AUTO, "sblklimit", CTLFLAG_RW,
 	    &bbr_sack_block_limit, 128,
 	    "When do we start ignoring small sack blocks");
 	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
 	    SYSCTL_CHILDREN(bbr_sysctl_root),
 	    OID_AUTO, "bb_verbose", CTLFLAG_RW,
 	    &bbr_verbose_logging, 0,
 	    "Should BBR black box logging be verbose");
 	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
 	    SYSCTL_CHILDREN(bbr_sysctl_root),
 	    OID_AUTO, "reorder_thresh", CTLFLAG_RW,
 	    &bbr_reorder_thresh, 2,
 	    "What factor for rack will be added when seeing reordering (shift right)");
 	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
 	    SYSCTL_CHILDREN(bbr_sysctl_root),
 	    OID_AUTO, "reorder_fade", CTLFLAG_RW,
 	    &bbr_reorder_fade, 0,
 	    "Does reorder detection fade, if so how many ms (0 means never)");
 	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
 	    SYSCTL_CHILDREN(bbr_sysctl_root),
 	    OID_AUTO, "rtt_tlp_thresh", CTLFLAG_RW,
 	    &bbr_tlp_thresh, 1,
 	    "what divisor for TLP rtt/retran will be added (1=rtt, 2=1/2 rtt etc)");
 	/* Stats and counters */
 	/* The pacing counters for hdwr/software can't be in the array */
 	bbr_nohdwr_pacing_enobuf = counter_u64_alloc(M_WAITOK);
 	bbr_hdwr_pacing_enobuf = counter_u64_alloc(M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64(&bbr_sysctl_ctx,
 	    SYSCTL_CHILDREN(bbr_sysctl_root),
 	    OID_AUTO, "enob_hdwr_pacing", CTLFLAG_RD,
 	    &bbr_hdwr_pacing_enobuf,
 	    "Total number of enobufs for hardware paced flows");
 	SYSCTL_ADD_COUNTER_U64(&bbr_sysctl_ctx,
 	    SYSCTL_CHILDREN(bbr_sysctl_root),
 	    OID_AUTO, "enob_no_hdwr_pacing", CTLFLAG_RD,
 	    &bbr_nohdwr_pacing_enobuf,
 	    "Total number of enobufs for non-hardware paced flows");
 
 	bbr_flows_whdwr_pacing = counter_u64_alloc(M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64(&bbr_sysctl_ctx,
 	    SYSCTL_CHILDREN(bbr_sysctl_root),
 	    OID_AUTO, "hdwr_pacing", CTLFLAG_RD,
 	    &bbr_flows_whdwr_pacing,
 	    "Total number of hardware paced flows");
 	bbr_flows_nohdwr_pacing = counter_u64_alloc(M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64(&bbr_sysctl_ctx,
 	    SYSCTL_CHILDREN(bbr_sysctl_root),
 	    OID_AUTO, "software_pacing", CTLFLAG_RD,
 	    &bbr_flows_nohdwr_pacing,
 	    "Total number of software paced flows");
 	COUNTER_ARRAY_ALLOC(bbr_stat_arry, BBR_STAT_SIZE, M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64_ARRAY(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_sysctl_root),
 	    OID_AUTO, "stats", CTLFLAG_RD,
 	    bbr_stat_arry, BBR_STAT_SIZE, "BBR Stats");
 	COUNTER_ARRAY_ALLOC(bbr_opts_arry, BBR_OPTS_SIZE, M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64_ARRAY(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_sysctl_root),
 	    OID_AUTO, "opts", CTLFLAG_RD,
 	    bbr_opts_arry, BBR_OPTS_SIZE, "BBR Option Stats");
 	COUNTER_ARRAY_ALLOC(bbr_state_lost, BBR_MAX_STAT, M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64_ARRAY(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_sysctl_root),
 	    OID_AUTO, "lost", CTLFLAG_RD,
 	    bbr_state_lost, BBR_MAX_STAT, "Stats of when losses occur");
 	COUNTER_ARRAY_ALLOC(bbr_state_resend, BBR_MAX_STAT, M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64_ARRAY(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_sysctl_root),
 	    OID_AUTO, "stateresend", CTLFLAG_RD,
 	    bbr_state_resend, BBR_MAX_STAT, "Stats of what states resend");
 	COUNTER_ARRAY_ALLOC(bbr_state_time, BBR_MAX_STAT, M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64_ARRAY(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_sysctl_root),
 	    OID_AUTO, "statetime", CTLFLAG_RD,
 	    bbr_state_time, BBR_MAX_STAT, "Stats of time spent in the states");
 	COUNTER_ARRAY_ALLOC(bbr_out_size, TCP_MSS_ACCT_SIZE, M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64_ARRAY(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_sysctl_root),
 	    OID_AUTO, "outsize", CTLFLAG_RD,
 	    bbr_out_size, TCP_MSS_ACCT_SIZE, "Size of output calls");
 	SYSCTL_ADD_PROC(&bbr_sysctl_ctx,
 	    SYSCTL_CHILDREN(bbr_sysctl_root),
 	    OID_AUTO, "clrlost", CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE,
 	    &bbr_clear_lost, 0, sysctl_bbr_clear_lost, "IU", "Clear lost counters");
 }
 
 /*
  * Release every statistics counter used by the BBR sysctl tree: the four
  * stand-alone pacing counters and the six counter arrays.
  */
 static void
 bbr_counter_destroy(void)
 {
 	/* Stand-alone pacing counters. */
 	counter_u64_free(bbr_nohdwr_pacing_enobuf);
 	counter_u64_free(bbr_hdwr_pacing_enobuf);
 	counter_u64_free(bbr_flows_whdwr_pacing);
 	counter_u64_free(bbr_flows_nohdwr_pacing);
 
 	/* Counter arrays; each is freed with the size it is declared with. */
 	COUNTER_ARRAY_FREE(bbr_stat_arry, BBR_STAT_SIZE);
 	COUNTER_ARRAY_FREE(bbr_opts_arry, BBR_OPTS_SIZE);
 	COUNTER_ARRAY_FREE(bbr_out_size, TCP_MSS_ACCT_SIZE);
 	COUNTER_ARRAY_FREE(bbr_state_lost, BBR_MAX_STAT);
 	COUNTER_ARRAY_FREE(bbr_state_time, BBR_MAX_STAT);
 	COUNTER_ARRAY_FREE(bbr_state_resend, BBR_MAX_STAT);
 }
 
 /*
  * Fill in the part of a BBR log record that is common to every event.
  *
  * The record is zeroed first, so any field that neither this function nor
  * the caller sets afterwards is logged as 0.
  *
  * bbr: connection state the snapshot is taken from.
  * l:   record to populate.
  * cts: timestamp stored in l->timeStamp.
  */
 static __inline void
 bbr_fill_in_logging_data(struct tcp_bbr *bbr, struct tcp_log_bbr *l, uint32_t cts)
 {
 	/*
 	 * Note: this clears sizeof(union tcp_log_stackspecific) bytes, not
 	 * sizeof(*l).  The callers visible in this file all pass &log.u_bbr
 	 * of a full union, so the whole union object is what gets zeroed.
 	 */
 	memset(l, 0, sizeof(union tcp_log_stackspecific));
 	l->cur_del_rate = bbr->r_ctl.rc_bbr_cur_del_rate;
 	l->delRate = get_filter_value(&bbr->r_ctl.rc_delrate);
 	l->rttProp = get_filter_value_small(&bbr->r_ctl.rc_rttprop);
 	l->bw_inuse = bbr_get_bw(bbr);
 	/* Flight size is computed with sacked plus lost bytes passed in. */
 	l->inflight = ctf_flight_size(bbr->rc_tp,
 			  (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes));
 	l->applimited = bbr->r_ctl.r_app_limited_until;
 	l->delivered = bbr->r_ctl.rc_delivered;
 	l->timeStamp = cts;
 	l->lost = bbr->r_ctl.rc_lost;
 	l->bbr_state = bbr->rc_bbr_state;
 	l->bbr_substate = bbr_state_val(bbr);
 	l->epoch = bbr->r_ctl.rc_rtt_epoch;
 	l->lt_epoch = bbr->r_ctl.rc_lt_epoch;
 	l->pacing_gain = bbr->r_ctl.rc_bbr_hptsi_gain;
 	l->cwnd_gain = bbr->r_ctl.rc_bbr_cwnd_gain;
 	l->inhpts = tcp_in_hpts(bbr->rc_inp);
 	l->use_lt_bw = bbr->rc_lt_use_bw;
 	/* pkts_out carries rc_flight_at_input here; some callers overwrite it. */
 	l->pkts_out = bbr->r_ctl.rc_flight_at_input;
 	l->pkt_epoch = bbr->r_ctl.rc_pkt_epoch;
 }
 
 /*
  * Emit a BBR_LOG_BW_RED_EV record for this connection.
  *
  * Does nothing when the connection's t_logstate is TCP_LOG_STATE_OFF.
  * flex4 carries rc_pkt_epoch_loss_rate, flex6 rc_bbr_enters_probertt and
  * flex7 the caller-supplied reason; the other flex fields are set to 0.
  */
 static void
 bbr_log_type_bw_reduce(struct tcp_bbr *bbr, int reason)
 {
 	union tcp_log_stackspecific log;
 
 	if (bbr->rc_tp->t_logstate == TCP_LOG_STATE_OFF)
 		return;
 
 	bbr_fill_in_logging_data(bbr, &log.u_bbr, bbr->r_ctl.rc_rcvtime);
 	log.u_bbr.flex1 = 0;
 	log.u_bbr.flex2 = 0;
 	log.u_bbr.flex3 = 0;
 	log.u_bbr.flex4 = bbr->r_ctl.rc_pkt_epoch_loss_rate;
 	log.u_bbr.flex5 = 0;
 	log.u_bbr.flex6 = bbr->r_ctl.rc_bbr_enters_probertt;
 	log.u_bbr.flex7 = reason;
 	log.u_bbr.flex8 = 0;
 	TCP_LOG_EVENTP(bbr->rc_tp, NULL, &bbr->rc_inp->inp_socket->so_rcv,
 	    &bbr->rc_inp->inp_socket->so_snd, BBR_LOG_BW_RED_EV, 0, 0,
 	    &log, false, &bbr->rc_tv);
 }
 
 /*
  * Emit a BBR_LOG_LOWGAIN record carrying seq (flex1), count (flex2) and
  * mode (flex8).  Does nothing when the connection's t_logstate is
  * TCP_LOG_STATE_OFF.
  */
 static void
 bbr_log_type_rwnd_collapse(struct tcp_bbr *bbr, int seq, int mode, uint32_t count)
 {
 	union tcp_log_stackspecific log;
 
 	if (bbr->rc_tp->t_logstate == TCP_LOG_STATE_OFF)
 		return;
 
 	bbr_fill_in_logging_data(bbr, &log.u_bbr, bbr->r_ctl.rc_rcvtime);
 	log.u_bbr.flex1 = seq;
 	log.u_bbr.flex2 = count;
 	log.u_bbr.flex8 = mode;
 	TCP_LOG_EVENTP(bbr->rc_tp, NULL, &bbr->rc_inp->inp_socket->so_rcv,
 	    &bbr->rc_inp->inp_socket->so_snd, BBR_LOG_LOWGAIN, 0, 0,
 	    &log, false, &bbr->rc_tv);
 }
 
 /*
  * Emit a BBR_LOG_JUSTRET record, with tlen passed as the event length.
  *
  * Does nothing when the connection's t_logstate is TCP_LOG_STATE_OFF.
  * p_maxseg is stored in both flex1 and flex7, and rc_in_persist in both
  * flex5 and flex8.  pkts_out is forced to 0 and applimited is overwritten
  * with len.  hpts_calling is accepted but not recorded.
  */
 static void
 bbr_log_type_just_return(struct tcp_bbr *bbr, uint32_t cts, uint32_t tlen, uint8_t hpts_calling,
     uint8_t reason, uint32_t p_maxseg, int len)
 {
 	union tcp_log_stackspecific log;
 
 	if (bbr->rc_tp->t_logstate == TCP_LOG_STATE_OFF)
 		return;
 
 	bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
 	log.u_bbr.flex1 = p_maxseg;
 	log.u_bbr.flex2 = bbr->r_ctl.rc_hpts_flags;
 	log.u_bbr.flex3 = bbr->r_ctl.rc_timer_exp;
 	log.u_bbr.flex4 = reason;
 	log.u_bbr.flex5 = bbr->rc_in_persist;
 	log.u_bbr.flex6 = bbr->r_ctl.rc_last_delay_val;
 	log.u_bbr.flex7 = p_maxseg;
 	log.u_bbr.flex8 = bbr->rc_in_persist;
 	/* Replace two of the common fields for this event type. */
 	log.u_bbr.pkts_out = 0;
 	log.u_bbr.applimited = len;
 	TCP_LOG_EVENTP(bbr->rc_tp, NULL, &bbr->rc_inp->inp_socket->so_rcv,
 	    &bbr->rc_inp->inp_socket->so_snd, BBR_LOG_JUSTRET, 0, tlen,
 	    &log, false, &bbr->rc_tv);
 }
 
 /*
  * Emit a BBR_LOG_ENTREC record carrying seq (flex1), rc_cwnd_on_ent (flex2)
  * and rc_recovery_start (flex3).  Does nothing when the connection's
  * t_logstate is TCP_LOG_STATE_OFF.
  */
 static void
 bbr_log_type_enter_rec(struct tcp_bbr *bbr, uint32_t seq)
 {
 	union tcp_log_stackspecific log;
 
 	if (bbr->rc_tp->t_logstate == TCP_LOG_STATE_OFF)
 		return;
 
 	bbr_fill_in_logging_data(bbr, &log.u_bbr, bbr->r_ctl.rc_rcvtime);
 	log.u_bbr.flex1 = seq;
 	log.u_bbr.flex2 = bbr->r_ctl.rc_cwnd_on_ent;
 	log.u_bbr.flex3 = bbr->r_ctl.rc_recovery_start;
 	TCP_LOG_EVENTP(bbr->rc_tp, NULL, &bbr->rc_inp->inp_socket->so_rcv,
 	    &bbr->rc_inp->inp_socket->so_snd, BBR_LOG_ENTREC, 0, 0,
 	    &log, false, &bbr->rc_tv);
 }
 
 /*
  * Emit a BBR_LOG_MSGSIZE record carrying tso (flex1), maxseg (flex2),
  * mtu (flex3) and csum_flags (flex4).
  *
  * Unlike its neighbours this helper tests and logs against the tp argument
  * rather than bbr->rc_tp.  Does nothing when tp->t_logstate is
  * TCP_LOG_STATE_OFF.  len is accepted but not recorded.
  */
 static void
 bbr_log_msgsize_fail(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t len, uint32_t maxseg, uint32_t mtu, int32_t csum_flags, int32_t tso, uint32_t cts)
 {
 	union tcp_log_stackspecific log;
 
 	if (tp->t_logstate == TCP_LOG_STATE_OFF)
 		return;
 
 	bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
 	log.u_bbr.flex1 = tso;
 	log.u_bbr.flex2 = maxseg;
 	log.u_bbr.flex3 = mtu;
 	log.u_bbr.flex4 = csum_flags;
 	TCP_LOG_EVENTP(tp, NULL, &bbr->rc_inp->inp_socket->so_rcv,
 	    &bbr->rc_inp->inp_socket->so_snd, BBR_LOG_MSGSIZE, 0, 0,
 	    &log, false, &bbr->rc_tv);
 }
 
 /*
  * Emit a TCP_LOG_FLOWEND record for this connection.
  *
  * Does nothing when the connection's t_logstate is TCP_LOG_STATE_OFF.
  * The socket buffers are passed only if the inpcb still has a socket;
  * otherwise both are NULL.  The event is stamped with a freshly obtained
  * time (tcp_get_usecs) rather than bbr->rc_tv.
  */
 static void
 bbr_log_flowend(struct tcp_bbr *bbr)
 {
 	union tcp_log_stackspecific log;
 	struct sockbuf *r, *s;
 	struct timeval tv;
 
 	if (bbr->rc_tp->t_logstate == TCP_LOG_STATE_OFF)
 		return;
 
 	/* Default to no socket buffers, then override if a socket exists. */
 	r = s = NULL;
 	if (bbr->rc_inp->inp_socket) {
 		r = &bbr->rc_inp->inp_socket->so_rcv;
 		s = &bbr->rc_inp->inp_socket->so_snd;
 	}
 	bbr_fill_in_logging_data(bbr, &log.u_bbr, tcp_get_usecs(&tv));
 	TCP_LOG_EVENTP(bbr->rc_tp, NULL, r, s, TCP_LOG_FLOWEND, 0, 0,
 	    &log, false, &tv);
 }
 
 /*
  * Emit a BBR_LOG_PKT_EPOCH record.
  *
  * Does nothing when the connection's t_logstate is TCP_LOG_STATE_OFF.
  * lost and del go to flex1/flex2 and line to flex7; the inflight field of
  * the common data is overwritten with r_measurement_count.
  */
 static void
 bbr_log_pkt_epoch(struct tcp_bbr *bbr, uint32_t cts, uint32_t line,
     uint32_t lost, uint32_t del)
 {
 	union tcp_log_stackspecific log;
 
 	if (bbr->rc_tp->t_logstate == TCP_LOG_STATE_OFF)
 		return;
 
 	bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
 	log.u_bbr.flex1 = lost;
 	log.u_bbr.flex2 = del;
 	log.u_bbr.flex3 = bbr->r_ctl.rc_bbr_lastbtlbw;
 	log.u_bbr.flex4 = bbr->r_ctl.rc_pkt_epoch_rtt;
 	log.u_bbr.flex5 = bbr->r_ctl.rc_bbr_last_startup_epoch;
 	log.u_bbr.flex6 = bbr->r_ctl.rc_lost_at_startup;
 	log.u_bbr.flex7 = line;
 	log.u_bbr.flex8 = 0;
 	/* Reuse the inflight slot for the measurement count. */
 	log.u_bbr.inflight = bbr->r_ctl.r_measurement_count;
 	TCP_LOG_EVENTP(bbr->rc_tp, NULL, &bbr->rc_inp->inp_socket->so_rcv,
 	    &bbr->rc_inp->inp_socket->so_snd, BBR_LOG_PKT_EPOCH, 0, 0,
 	    &log, false, &bbr->rc_tv);
 }
 
 /*
  * Emit a BBR_LOG_TIME_EPOCH record (verbose logging only).
  *
  * Requires bbr_verbose_logging to be non-zero and the connection's
  * t_logstate to differ from TCP_LOG_STATE_OFF.  Records rc_lost (flex1),
  * the send buffer's sb_lowat/sb_hiwat (flex2/flex3) and line (flex7).
  * epoch_time is accepted but not recorded.
  */
 static void
 bbr_log_time_epoch(struct tcp_bbr *bbr, uint32_t cts, uint32_t line, uint32_t epoch_time)
 {
 	union tcp_log_stackspecific log;
 
 	if (!bbr_verbose_logging ||
 	    (bbr->rc_tp->t_logstate == TCP_LOG_STATE_OFF))
 		return;
 
 	bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
 	log.u_bbr.flex1 = bbr->r_ctl.rc_lost;
 	log.u_bbr.flex2 = bbr->rc_inp->inp_socket->so_snd.sb_lowat;
 	log.u_bbr.flex3 = bbr->rc_inp->inp_socket->so_snd.sb_hiwat;
 	log.u_bbr.flex7 = line;
 	TCP_LOG_EVENTP(bbr->rc_tp, NULL, &bbr->rc_inp->inp_socket->so_rcv,
 	    &bbr->rc_inp->inp_socket->so_snd, BBR_LOG_TIME_EPOCH, 0, 0,
 	    &log, false, &bbr->rc_tv);
 }
 
 /*
  * Emit a BBR_LOG_STATE_TARGET record.
  *
  * Does nothing when the connection's t_logstate is TCP_LOG_STATE_OFF.
  * flex1 holds the current rc_target_at_state and flex2 the new_tar argument;
  * line goes to flex3 and meth to flex8.  The remaining flex fields carry
  * the pacing segment limits, bbr_quanta and rc_last_options.
  */
 static void
 bbr_log_set_of_state_target(struct tcp_bbr *bbr, uint32_t new_tar, int line, int meth)
 {
 	union tcp_log_stackspecific log;
 
 	if (bbr->rc_tp->t_logstate == TCP_LOG_STATE_OFF)
 		return;
 
 	bbr_fill_in_logging_data(bbr, &log.u_bbr, bbr->r_ctl.rc_rcvtime);
 	log.u_bbr.flex1 = bbr->r_ctl.rc_target_at_state;
 	log.u_bbr.flex2 = new_tar;
 	log.u_bbr.flex3 = line;
 	log.u_bbr.flex4 = bbr->r_ctl.rc_pace_max_segs;
 	log.u_bbr.flex5 = bbr_quanta;
 	log.u_bbr.flex6 = bbr->r_ctl.rc_pace_min_segs;
 	log.u_bbr.flex7 = bbr->rc_last_options;
 	log.u_bbr.flex8 = meth;
 	TCP_LOG_EVENTP(bbr->rc_tp, NULL, &bbr->rc_inp->inp_socket->so_rcv,
 	    &bbr->rc_inp->inp_socket->so_snd, BBR_LOG_STATE_TARGET, 0, 0,
 	    &log, false, &bbr->rc_tv);
 }
 
 /*
  * Emit a BBR_LOG_STATE record.
  *
  * Does nothing when the connection's t_logstate is TCP_LOG_STATE_OFF.
  * flex4 holds bbr_get_rtt() for BBR_RTT_PKTRTT when bbr_state_is_pkt_epoch
  * is set, otherwise for BBR_RTT_PROP.  flex7 is rc_target_at_state divided
  * by 1000, and the full value is stored in pkts_out.  lt_epoch is
  * overwritten with rc_level_state_extra.
  */
 static void
 bbr_log_type_statechange(struct tcp_bbr *bbr, uint32_t cts, int32_t line)
 {
 	union tcp_log_stackspecific log;
 
 	if (bbr->rc_tp->t_logstate == TCP_LOG_STATE_OFF)
 		return;
 
 	bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
 	log.u_bbr.flex1 = line;
 	log.u_bbr.flex2 = bbr->r_ctl.rc_rtt_shrinks;
 	log.u_bbr.flex3 = bbr->r_ctl.rc_probertt_int;
 	log.u_bbr.flex4 = bbr_get_rtt(bbr,
 	    bbr_state_is_pkt_epoch ? BBR_RTT_PKTRTT : BBR_RTT_PROP);
 	log.u_bbr.flex5 = bbr->r_ctl.rc_bbr_last_startup_epoch;
 	log.u_bbr.flex6 = bbr->r_ctl.rc_lost_at_startup;
 	log.u_bbr.flex7 = (bbr->r_ctl.rc_target_at_state/1000);
 	log.u_bbr.lt_epoch = bbr->r_ctl.rc_level_state_extra;
 	log.u_bbr.pkts_out = bbr->r_ctl.rc_target_at_state;
 	TCP_LOG_EVENTP(bbr->rc_tp, NULL, &bbr->rc_inp->inp_socket->so_rcv,
 	    &bbr->rc_inp->inp_socket->so_snd, BBR_LOG_STATE, 0, 0,
 	    &log, false, &bbr->rc_tv);
 }
 
 /*
  * Emit a BBR_LOG_RTT_SHRINKS record.
  *
  * Does nothing when the connection's t_logstate is TCP_LOG_STATE_OFF.
  * Caller-supplied values: line (flex1), applied (flex4), rtt (flex5),
  * cond (flex7) and reas (flex8).  The other flex fields carry
  * rc_rtt_shrinks, last_in_probertt and rc_target_at_state.
  */
 static void
 bbr_log_rtt_shrinks(struct tcp_bbr *bbr, uint32_t cts, uint32_t applied,
 		    uint32_t rtt, uint32_t line, uint8_t reas, uint16_t cond)
 {
 	union tcp_log_stackspecific log;
 
 	if (bbr->rc_tp->t_logstate == TCP_LOG_STATE_OFF)
 		return;
 
 	bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
 	log.u_bbr.flex1 = line;
 	log.u_bbr.flex2 = bbr->r_ctl.rc_rtt_shrinks;
 	log.u_bbr.flex3 = bbr->r_ctl.last_in_probertt;
 	log.u_bbr.flex4 = applied;
 	log.u_bbr.flex5 = rtt;
 	log.u_bbr.flex6 = bbr->r_ctl.rc_target_at_state;
 	log.u_bbr.flex7 = cond;
 	log.u_bbr.flex8 = reas;
 	TCP_LOG_EVENTP(bbr->rc_tp, NULL, &bbr->rc_inp->inp_socket->so_rcv,
 	    &bbr->rc_inp->inp_socket->so_snd, BBR_LOG_RTT_SHRINKS, 0, 0,
 	    &log, false, &bbr->rc_tv);
 }
 
 /*
  * Emit a BBR_LOG_EXITREC record carrying rc_recovery_start (flex1),
  * rc_cwnd_on_ent (flex2) and rc_target_at_state (flex5).  Does nothing
  * when the connection's t_logstate is TCP_LOG_STATE_OFF.
  */
 static void
 bbr_log_type_exit_rec(struct tcp_bbr *bbr)
 {
 	union tcp_log_stackspecific log;
 
 	if (bbr->rc_tp->t_logstate == TCP_LOG_STATE_OFF)
 		return;
 
 	bbr_fill_in_logging_data(bbr, &log.u_bbr, bbr->r_ctl.rc_rcvtime);
 	log.u_bbr.flex1 = bbr->r_ctl.rc_recovery_start;
 	log.u_bbr.flex2 = bbr->r_ctl.rc_cwnd_on_ent;
 	log.u_bbr.flex5 = bbr->r_ctl.rc_target_at_state;
 	TCP_LOG_EVENTP(bbr->rc_tp, NULL, &bbr->rc_inp->inp_socket->so_rcv,
 	    &bbr->rc_inp->inp_socket->so_snd, BBR_LOG_EXITREC, 0, 0,
 	    &log, false, &bbr->rc_tv);
 }
 
 /*
  * Emit a BBR_LOG_CWND record (verbose logging only).
  *
  * Requires bbr_verbose_logging to be non-zero and the connection's
  * t_logstate to differ from TCP_LOG_STATE_OFF.  All recorded values are
  * the caller's arguments: line (flex1), prev_acked (flex2),
  * bytes_this_ack (flex3), chg (flex4), th_ack (flex5), target (flex6)
  * and meth (flex8).
  */
 static void
 bbr_log_type_cwndupd(struct tcp_bbr *bbr, uint32_t bytes_this_ack, uint32_t chg,
     uint32_t prev_acked, int32_t meth, uint32_t target, uint32_t th_ack, int32_t line)
 {
 	union tcp_log_stackspecific log;
 
 	if (!bbr_verbose_logging ||
 	    (bbr->rc_tp->t_logstate == TCP_LOG_STATE_OFF))
 		return;
 
 	bbr_fill_in_logging_data(bbr, &log.u_bbr, bbr->r_ctl.rc_rcvtime);
 	log.u_bbr.flex1 = line;
 	log.u_bbr.flex2 = prev_acked;
 	log.u_bbr.flex3 = bytes_this_ack;
 	log.u_bbr.flex4 = chg;
 	log.u_bbr.flex5 = th_ack;
 	log.u_bbr.flex6 = target;
 	log.u_bbr.flex8 = meth;
 	TCP_LOG_EVENTP(bbr->rc_tp, NULL, &bbr->rc_inp->inp_socket->so_rcv,
 	    &bbr->rc_inp->inp_socket->so_snd, BBR_LOG_CWND, 0, 0,
 	    &log, false, &bbr->rc_tv);
 }
 
 /*
  * Emit a TCP_LOG_RTT record for the rtt sample (in useconds) that is being
  * applied to the srtt algorithm.
  *
  * Does nothing when the connection's t_logstate is TCP_LOG_STATE_OFF.
  * rtt goes to flex1 and tsin to flex6; pkts_out is overwritten with
  * tcp_tv_to_mssectick() of bbr->rc_tv.
  */
 static void
 bbr_log_rtt_sample(struct tcp_bbr *bbr, uint32_t rtt, uint32_t tsin)
 {
 	union tcp_log_stackspecific log;
 
 	if (bbr->rc_tp->t_logstate == TCP_LOG_STATE_OFF)
 		return;
 
 	bbr_fill_in_logging_data(bbr, &log.u_bbr, bbr->r_ctl.rc_rcvtime);
 	log.u_bbr.flex1 = rtt;
 	log.u_bbr.flex2 = bbr->r_ctl.rc_bbr_state_time;
 	log.u_bbr.flex3 = bbr->r_ctl.rc_ack_hdwr_delay;
 	log.u_bbr.flex4 = bbr->rc_tp->ts_offset;
 	log.u_bbr.flex5 = bbr->r_ctl.rc_target_at_state;
 	log.u_bbr.pkts_out = tcp_tv_to_mssectick(&bbr->rc_tv);
 	log.u_bbr.flex6 = tsin;
 	log.u_bbr.flex7 = 0;
 	log.u_bbr.flex8 = bbr->rc_ack_was_delayed;
 	TCP_LOG_EVENTP(bbr->rc_tp, NULL, &bbr->rc_inp->inp_socket->so_rcv,
 	    &bbr->rc_inp->inp_socket->so_snd, TCP_LOG_RTT, 0, 0,
 	    &log, false, &bbr->rc_tv);
 }
 
 /*
  * Emit a BBR_LOG_PERSIST record (verbose logging only) carrying time_in
  * (flex1), line (flex2) and enter_exit (flex8).
  *
  * Requires bbr_verbose_logging to be non-zero and the connection's
  * t_logstate to differ from TCP_LOG_STATE_OFF.
  */
 static void
 bbr_log_type_pesist(struct tcp_bbr *bbr, uint32_t cts, uint32_t time_in, int32_t line, uint8_t enter_exit)
 {
 	union tcp_log_stackspecific log;
 
 	if (!bbr_verbose_logging ||
 	    (bbr->rc_tp->t_logstate == TCP_LOG_STATE_OFF))
 		return;
 
 	bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
 	log.u_bbr.flex1 = time_in;
 	log.u_bbr.flex2 = line;
 	log.u_bbr.flex8 = enter_exit;
 	TCP_LOG_EVENTP(bbr->rc_tp, NULL, &bbr->rc_inp->inp_socket->so_rcv,
 	    &bbr->rc_inp->inp_socket->so_snd, BBR_LOG_PERSIST, 0, 0,
 	    &log, false, &bbr->rc_tv);
 }
 /*
  * Emit a BBR_LOG_ACKCLEAR record (verbose logging only).
  *
  * Requires bbr_verbose_logging to be non-zero and the connection's
  * t_logstate to differ from TCP_LOG_STATE_OFF.  Records ts_recent_age
  * (flex1), rc_rtt_shrinks (flex2), rc_probertt_int (flex3),
  * rc_went_idle_time (flex4) and rc_target_at_state (flex5).
  */
 static void
 bbr_log_ack_clear(struct tcp_bbr *bbr, uint32_t cts)
 {
 	union tcp_log_stackspecific log;
 
 	if (!bbr_verbose_logging ||
 	    (bbr->rc_tp->t_logstate == TCP_LOG_STATE_OFF))
 		return;
 
 	bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
 	log.u_bbr.flex1 = bbr->rc_tp->ts_recent_age;
 	log.u_bbr.flex2 = bbr->r_ctl.rc_rtt_shrinks;
 	log.u_bbr.flex3 = bbr->r_ctl.rc_probertt_int;
 	log.u_bbr.flex4 = bbr->r_ctl.rc_went_idle_time;
 	log.u_bbr.flex5 = bbr->r_ctl.rc_target_at_state;
 	TCP_LOG_EVENTP(bbr->rc_tp, NULL, &bbr->rc_inp->inp_socket->so_rcv,
 	    &bbr->rc_inp->inp_socket->so_snd, BBR_LOG_ACKCLEAR, 0, 0,
 	    &log, false, &bbr->rc_tv);
 }
 
 /*
  * Emit a TCP_LOG_IN record for an incoming segment, passing th and tlen
  * through to the logging macro.
  *
  * Does nothing when the connection's t_logstate is TCP_LOG_STATE_OFF.
  * When an mbuf is supplied, flex3 holds its m_flags, lt_epoch the
  * M_TSTMP timestamp (0 if that flag is clear), flex5 the M_TSTMP_LRO
  * timestamp (0 if that flag is clear) and pkts_out the current time from
  * tcp_get_usecs().  Without an mbuf those fields are 0.  The to and
  * nxt_pkt arguments are accepted but not recorded.
  */
 static void
 bbr_log_ack_event(struct tcp_bbr *bbr, struct tcphdr *th, struct tcpopt *to, uint32_t tlen,
 		  uint16_t nsegs, uint32_t cts, int32_t nxt_pkt, struct mbuf *m)
 {
 	union tcp_log_stackspecific log;
 	struct timeval tv;
 
 	if (bbr->rc_tp->t_logstate == TCP_LOG_STATE_OFF)
 		return;
 
 	bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
 	log.u_bbr.flex1 = nsegs;
 	log.u_bbr.flex2 = bbr->r_ctl.rc_lost_bytes;
 	if (m == NULL) {
 		log.u_bbr.flex3 = 0;
 		log.u_bbr.flex5 = 0;
 		log.u_bbr.flex6 = 0;
 		log.u_bbr.pkts_out = 0;
 	} else {
 		struct timespec ts;
 
 		log.u_bbr.flex3 = m->m_flags;
 		if ((m->m_flags & M_TSTMP) != 0) {
 			/* Convert the timespec to a timeval first. */
 			mbuf_tstmp2timespec(m, &ts);
 			tv.tv_sec = ts.tv_sec;
 			tv.tv_usec = ts.tv_nsec / 1000;
 			log.u_bbr.lt_epoch = tcp_tv_to_usectick(&tv);
 		} else {
 			log.u_bbr.lt_epoch = 0;
 		}
 		if ((m->m_flags & M_TSTMP_LRO) != 0) {
 			mbuf_tstmp2timeval(m, &tv);
 			log.u_bbr.flex5 = tcp_tv_to_usectick(&tv);
 		} else {
 			/* No arrival timestamp */
 			log.u_bbr.flex5 = 0;
 		}
 
 		log.u_bbr.pkts_out = tcp_get_usecs(&tv);
 	}
 	log.u_bbr.flex4 = bbr->r_ctl.rc_target_at_state;
 	log.u_bbr.flex7 = bbr->r_wanted_output;
 	log.u_bbr.flex8 = bbr->rc_in_persist;
 	TCP_LOG_EVENTP(bbr->rc_tp, th, &bbr->rc_inp->inp_socket->so_rcv,
 	    &bbr->rc_inp->inp_socket->so_snd, TCP_LOG_IN, 0, tlen,
 	    &log, true, &bbr->rc_tv);
 }
 
 /*
  * Emit a BBR_LOG_DOSEG_DONE record.
  *
  * Does nothing when the connection's t_logstate is TCP_LOG_STATE_OFF.
  * did_out goes to flex1 and nxt_pkt to flex2; pkts_out is overwritten
  * with highest_hdwr_delay.  The remaining flex fields carry timer, hpts
  * and persist state read from the connection.
  */
 static void
 bbr_log_doseg_done(struct tcp_bbr *bbr, uint32_t cts, int32_t nxt_pkt, int32_t did_out)
 {
 	union tcp_log_stackspecific log;
 
 	if (bbr->rc_tp->t_logstate == TCP_LOG_STATE_OFF)
 		return;
 
 	bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
 	log.u_bbr.flex1 = did_out;
 	log.u_bbr.flex2 = nxt_pkt;
 	log.u_bbr.flex3 = bbr->r_ctl.rc_last_delay_val;
 	log.u_bbr.flex4 = bbr->r_ctl.rc_hpts_flags;
 	log.u_bbr.flex5 = bbr->r_ctl.rc_timer_exp;
 	log.u_bbr.flex6 = bbr->r_ctl.rc_lost_bytes;
 	log.u_bbr.flex7 = bbr->r_wanted_output;
 	log.u_bbr.flex8 = bbr->rc_in_persist;
 	log.u_bbr.pkts_out = bbr->r_ctl.highest_hdwr_delay;
 	TCP_LOG_EVENTP(bbr->rc_tp, NULL, &bbr->rc_inp->inp_socket->so_rcv,
 	    &bbr->rc_inp->inp_socket->so_snd, BBR_LOG_DOSEG_DONE, 0, 0,
 	    &log, true, &bbr->rc_tv);
 }
 
 /*
  * Emit a BBR_LOG_ENOBUF_JMP record with error number ENOBUFS and len as
  * the event length.  Carries line (flex1), o_len (flex2), segcnt (flex3)
  * and segsiz (flex4).  Does nothing when the connection's t_logstate is
  * TCP_LOG_STATE_OFF.
  */
 static void
 bbr_log_enobuf_jmp(struct tcp_bbr *bbr, uint32_t len, uint32_t cts,
     int32_t line, uint32_t o_len, uint32_t segcnt, uint32_t segsiz)
 {
 	union tcp_log_stackspecific log;
 
 	if (bbr->rc_tp->t_logstate == TCP_LOG_STATE_OFF)
 		return;
 
 	bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
 	log.u_bbr.flex1 = line;
 	log.u_bbr.flex2 = o_len;
 	log.u_bbr.flex3 = segcnt;
 	log.u_bbr.flex4 = segsiz;
 	TCP_LOG_EVENTP(bbr->rc_tp, NULL, &bbr->rc_inp->inp_socket->so_rcv,
 	    &bbr->rc_inp->inp_socket->so_snd, BBR_LOG_ENOBUF_JMP, ENOBUFS,
 	    len, &log, true, &bbr->rc_tv);
 }
 
 /*
  * Emit a BBR_LOG_TO_PROCESS record.
  *
  * Does nothing when the connection's t_logstate is TCP_LOG_STATE_OFF.
  * Caller-supplied values: timers (flex1), ret (flex2), cts (flex5) and
  * hpts_calling (flex8).  flex3, flex4 and flex6 carry rc_timer_exp,
  * rc_hpts_flags and rc_target_at_state.
  */
 static void
 bbr_log_to_processing(struct tcp_bbr *bbr, uint32_t cts, int32_t ret, int32_t timers, uint8_t hpts_calling)
 {
 	union tcp_log_stackspecific log;
 
 	if (bbr->rc_tp->t_logstate == TCP_LOG_STATE_OFF)
 		return;
 
 	bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
 	log.u_bbr.flex1 = timers;
 	log.u_bbr.flex2 = ret;
 	log.u_bbr.flex3 = bbr->r_ctl.rc_timer_exp;
 	log.u_bbr.flex4 = bbr->r_ctl.rc_hpts_flags;
 	log.u_bbr.flex5 = cts;
 	log.u_bbr.flex6 = bbr->r_ctl.rc_target_at_state;
 	log.u_bbr.flex8 = hpts_calling;
 	TCP_LOG_EVENTP(bbr->rc_tp, NULL, &bbr->rc_inp->inp_socket->so_rcv,
 	    &bbr->rc_inp->inp_socket->so_snd, BBR_LOG_TO_PROCESS, 0, 0,
 	    &log, false, &bbr->rc_tv);
 }
 
 /*
  * Emit a BBR_LOG_RTO record for timeout number to_num (flex8).
  *
  * Does nothing when the connection's t_logstate is TCP_LOG_STATE_OFF.
  * rc_resend is widened to 64 bits and split across two 32-bit fields:
  * the upper half in flex4 and the lower half in flex5.  flex6 is t_rxtcur
  * run through TICKS_2_USEC().
  */
 static void
 bbr_log_to_event(struct tcp_bbr *bbr, uint32_t cts, int32_t to_num)
 {
 	union tcp_log_stackspecific log;
 	uint64_t ar;
 
 	if (bbr->rc_tp->t_logstate == TCP_LOG_STATE_OFF)
 		return;
 
 	bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
 	log.u_bbr.flex1 = bbr->bbr_timer_src;
 	log.u_bbr.flex2 = 0;
 	log.u_bbr.flex3 = bbr->r_ctl.rc_hpts_flags;
 	ar = (uint64_t)(bbr->r_ctl.rc_resend);
 	log.u_bbr.flex4 = (uint32_t)((ar >> 32) & 0x00000000ffffffff);
 	log.u_bbr.flex5 = (uint32_t)(ar & 0x00000000ffffffff);
 	log.u_bbr.flex6 = TICKS_2_USEC(bbr->rc_tp->t_rxtcur);
 	log.u_bbr.flex8 = to_num;
 	TCP_LOG_EVENTP(bbr->rc_tp, NULL, &bbr->rc_inp->inp_socket->so_rcv,
 	    &bbr->rc_inp->inp_socket->so_snd, BBR_LOG_RTO, 0, 0,
 	    &log, false, &bbr->rc_tv);
 }
 
 /*
  * Emit a BBR_LOG_REDUCE record.
  *
  * Does nothing when the connection's t_logstate is TCP_LOG_STATE_OFF.
  * The caller's flex1..flex3 are stored in the fields of the same name and
  * reason in flex8; flex4 is 0.  The cur_del_rate field of the common data
  * is overwritten with rc_bbr_lastbtlbw.
  */
 static void
 bbr_log_startup_event(struct tcp_bbr *bbr, uint32_t cts, uint32_t flex1, uint32_t flex2, uint32_t flex3, uint8_t reason)
 {
 	union tcp_log_stackspecific log;
 
 	if (bbr->rc_tp->t_logstate == TCP_LOG_STATE_OFF)
 		return;
 
 	bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
 	log.u_bbr.flex1 = flex1;
 	log.u_bbr.flex2 = flex2;
 	log.u_bbr.flex3 = flex3;
 	log.u_bbr.flex4 = 0;
 	log.u_bbr.flex5 = bbr->r_ctl.rc_target_at_state;
 	log.u_bbr.flex6 = bbr->r_ctl.rc_lost_at_startup;
 	log.u_bbr.flex8 = reason;
 	log.u_bbr.cur_del_rate = bbr->r_ctl.rc_bbr_lastbtlbw;
 	TCP_LOG_EVENTP(bbr->rc_tp, NULL, &bbr->rc_inp->inp_socket->so_rcv,
 	    &bbr->rc_inp->inp_socket->so_snd, BBR_LOG_REDUCE, 0, 0,
 	    &log, false, &bbr->rc_tv);
 }
 
 /*
  * Log a BBR_LOG_HPTSDIAG record that copies the contents of an
  * hpts_diag structure into the log entry.  Only done when verbose
  * logging is enabled and the connection's log state is not off.
  */
 static void
 bbr_log_hpts_diag(struct tcp_bbr *bbr, uint32_t cts, struct hpts_diag *diag)
 {
 	union tcp_log_stackspecific log;
 
 	if (!bbr_verbose_logging ||
 	    (bbr->rc_tp->t_logstate == TCP_LOG_STATE_OFF))
 		return;
 
 	bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
 	/* The flex fields carry the slot information */
 	log.u_bbr.flex1 = diag->p_nxt_slot;
 	log.u_bbr.flex2 = diag->p_cur_slot;
 	log.u_bbr.flex3 = diag->slot_req;
 	log.u_bbr.flex4 = diag->inp_hptsslot;
 	log.u_bbr.flex5 = diag->slot_remaining;
 	log.u_bbr.flex6 = diag->need_new_to;
 	log.u_bbr.flex7 = diag->p_hpts_active;
 	log.u_bbr.flex8 = diag->p_on_min_sleep;
 	/*
 	 * The rest of the diagnostics do not fit in the flex fields, so
 	 * the values bbr_fill_in_logging_data() stored in the fields
 	 * below are overwritten.
 	 */
 	log.u_bbr.epoch = diag->have_slept;
 	log.u_bbr.lt_epoch = diag->yet_to_sleep;
 	log.u_bbr.pkts_out = diag->co_ret;
 	log.u_bbr.applimited = diag->hpts_sleep_time;
 	log.u_bbr.delivered = diag->p_prev_slot;
 	log.u_bbr.inflight = diag->p_runningslot;
 	log.u_bbr.bw_inuse = diag->wheel_slot;
 	log.u_bbr.rttProp = diag->wheel_cts;
 	log.u_bbr.delRate = diag->maxslots;
 	/* p_curtick goes in the upper 32 bits, p_lasttick is or'd in below */
 	log.u_bbr.cur_del_rate = diag->p_curtick;
 	log.u_bbr.cur_del_rate <<= 32;
 	log.u_bbr.cur_del_rate |= diag->p_lasttick;
 	TCP_LOG_EVENTP(bbr->rc_tp, NULL,
 	    &bbr->rc_inp->inp_socket->so_rcv,
 	    &bbr->rc_inp->inp_socket->so_snd,
 	    BBR_LOG_HPTSDIAG, 0,
 	    0, &log, false, &bbr->rc_tv);
 }
 
 /*
  * Log a BBR_LOG_TIMERPREP record with the values that went into a
  * timer calculation: the caller's time_since_sent, srtt, thresh and to
  * plus the tcpcb's t_rttvar and t_srtt.  mode is stored in flex8.
  * Only done when verbose logging is enabled.
  */
 static void
 bbr_log_timer_var(struct tcp_bbr *bbr, int mode, uint32_t cts, uint32_t time_since_sent, uint32_t srtt,
     uint32_t thresh, uint32_t to)
 {
 	union tcp_log_stackspecific log;
 
 	if (!bbr_verbose_logging ||
 	    (bbr->rc_tp->t_logstate == TCP_LOG_STATE_OFF))
 		return;
 
 	bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
 	/* From the tcpcb */
 	log.u_bbr.flex1 = bbr->rc_tp->t_rttvar;
 	log.u_bbr.flex6 = bbr->rc_tp->t_srtt;
 	/* From the caller */
 	log.u_bbr.flex2 = time_since_sent;
 	log.u_bbr.flex3 = srtt;
 	log.u_bbr.flex4 = thresh;
 	log.u_bbr.flex5 = to;
 	log.u_bbr.flex8 = mode;
 	TCP_LOG_EVENTP(bbr->rc_tp, NULL,
 	    &bbr->rc_inp->inp_socket->so_rcv,
 	    &bbr->rc_inp->inp_socket->so_snd,
 	    BBR_LOG_TIMERPREP, 0,
 	    0, &log, false, &bbr->rc_tv);
 }
 
 /*
  * Log a BBR_LOG_HPTSI_CALC record.  The 64 bit bw is split over
  * flex3 (upper half) and flex4 (lower half); flex5 has bit 2 set when
  * override is non-zero.  len is also passed on as the event length.
  */
 static void
 bbr_log_pacing_delay_calc(struct tcp_bbr *bbr, uint16_t gain, uint32_t len,
     uint32_t cts, uint32_t usecs, uint64_t bw, uint32_t override, int mod)
 {
 	union tcp_log_stackspecific log;
 
 	if (bbr->rc_tp->t_logstate == TCP_LOG_STATE_OFF)
 		return;
 
 	bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
 	log.u_bbr.flex1 = usecs;
 	log.u_bbr.flex2 = len;
 	/* High word, then low word of the bandwidth */
 	log.u_bbr.flex3 = (uint32_t)(bw >> 32);
 	log.u_bbr.flex4 = (uint32_t)bw;
 	log.u_bbr.flex5 = override ? (1 << 2) : 0;
 	log.u_bbr.flex6 = override;
 	log.u_bbr.flex7 = gain;
 	log.u_bbr.flex8 = mod;
 	TCP_LOG_EVENTP(bbr->rc_tp, NULL,
 	    &bbr->rc_inp->inp_socket->so_rcv,
 	    &bbr->rc_inp->inp_socket->so_snd,
 	    BBR_LOG_HPTSI_CALC, 0,
 	    len, &log, false, &bbr->rc_tv);
 }
 
 /*
  * Log a BBR_LOG_TIMERSTAR record: the timeout (to) and slot handed in
  * by the caller, the timer source, the hpts flags, the inpcb's hpts
  * slot and flags2 and t_rxtcur converted with TICKS_2_USEC().  which
  * is stored in flex8.
  */
 static void
 bbr_log_to_start(struct tcp_bbr *bbr, uint32_t cts, uint32_t to, int32_t slot, uint8_t which)
 {
 	union tcp_log_stackspecific log;
 
 	if (bbr->rc_tp->t_logstate == TCP_LOG_STATE_OFF)
 		return;
 
 	bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
 	/* Caller supplied */
 	log.u_bbr.flex2 = to;
 	log.u_bbr.flex4 = slot;
 	log.u_bbr.flex8 = which;
 	/* Current timer/pacer state */
 	log.u_bbr.flex1 = bbr->bbr_timer_src;
 	log.u_bbr.flex3 = bbr->r_ctl.rc_hpts_flags;
 	log.u_bbr.flex5 = bbr->rc_inp->inp_hptsslot;
 	log.u_bbr.flex6 = TICKS_2_USEC(bbr->rc_tp->t_rxtcur);
 	/* pkts_out is reused to carry inp_flags2 */
 	log.u_bbr.pkts_out = bbr->rc_inp->inp_flags2;
 	TCP_LOG_EVENTP(bbr->rc_tp, NULL,
 	    &bbr->rc_inp->inp_socket->so_rcv,
 	    &bbr->rc_inp->inp_socket->so_snd,
 	    BBR_LOG_TIMERSTAR, 0,
 	    0, &log, false, &bbr->rc_tv);
 }
 
 /*
  * Log a BBR_LOG_THRESH_CALC record with the chosen threshold and the
  * values around it.  flex4 is the r_tim_lastsent entry for the most
  * recent transmission of rsm (index r_rtr_cnt - 1).  Only done when
  * verbose logging is enabled.
  */
 static void
 bbr_log_thresh_choice(struct tcp_bbr *bbr, uint32_t cts, uint32_t thresh, uint32_t lro, uint32_t srtt, struct bbr_sendmap *rsm, uint8_t frm)
 {
 	union tcp_log_stackspecific log;
 
 	if (!bbr_verbose_logging ||
 	    (bbr->rc_tp->t_logstate == TCP_LOG_STATE_OFF))
 		return;
 
 	bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
 	/* Caller supplied */
 	log.u_bbr.flex1 = thresh;
 	log.u_bbr.flex2 = lro;
 	log.u_bbr.flex6 = srtt;
 	log.u_bbr.flex8 = frm;
 	/* Reordering state and timers */
 	log.u_bbr.flex3 = bbr->r_ctl.rc_reorder_ts;
 	log.u_bbr.flex7 = bbr->r_ctl.rc_reorder_shift;
 	log.u_bbr.flex4 = rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)];
 	log.u_bbr.flex5 = TICKS_2_USEC(bbr->rc_tp->t_rxtcur);
 	TCP_LOG_EVENTP(bbr->rc_tp, NULL,
 	    &bbr->rc_inp->inp_socket->so_rcv,
 	    &bbr->rc_inp->inp_socket->so_snd,
 	    BBR_LOG_THRESH_CALC, 0,
 	    0, &log, false, &bbr->rc_tv);
 }
 
 /*
  * Log a BBR_LOG_TIMERCANC record.  line is the caller's source line
  * (flex1) and hpts_removed is stored in flex8; the remaining fields
  * are copied from the current timer, persist and pacer state.
  */
 static void
 bbr_log_to_cancel(struct tcp_bbr *bbr, int32_t line, uint32_t cts, uint8_t hpts_removed)
 {
 	union tcp_log_stackspecific log;
 
 	if (bbr->rc_tp->t_logstate == TCP_LOG_STATE_OFF)
 		return;
 
 	bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
 	/* Caller supplied */
 	log.u_bbr.flex1 = line;
 	log.u_bbr.flex8 = hpts_removed;
 	/* Our state */
 	log.u_bbr.flex2 = bbr->bbr_timer_src;
 	log.u_bbr.flex3 = bbr->r_ctl.rc_hpts_flags;
 	log.u_bbr.flex4 = bbr->rc_in_persist;
 	log.u_bbr.flex5 = bbr->r_ctl.rc_target_at_state;
 	log.u_bbr.flex6 = TICKS_2_USEC(bbr->rc_tp->t_rxtcur);
 	/* pkts_out is reused to carry rc_pacer_started */
 	log.u_bbr.pkts_out = bbr->rc_pacer_started;
 	TCP_LOG_EVENTP(bbr->rc_tp, NULL,
 	    &bbr->rc_inp->inp_socket->so_rcv,
 	    &bbr->rc_inp->inp_socket->so_snd,
 	    BBR_LOG_TIMERCANC, 0,
 	    0, &log, false, &bbr->rc_tv);
 }
 
 /*
  * Log a BBR_LOG_TSTMP_VAL record.  Both 64 bit deltas are split into
  * a high and a low 32 bit word (peer_delta in flex2/flex3, delta in
  * flex4/flex5).  The record is stamped with rc_rcvtime.
  */
 static void
 bbr_log_tstmp_validation(struct tcp_bbr *bbr, uint64_t peer_delta, uint64_t delta)
 {
 	union tcp_log_stackspecific log;
 
 	if (bbr->rc_tp->t_logstate == TCP_LOG_STATE_OFF)
 		return;
 
 	bbr_fill_in_logging_data(bbr, &log.u_bbr, bbr->r_ctl.rc_rcvtime);
 	log.u_bbr.flex1 = bbr->r_ctl.bbr_peer_tsratio;
 	/* peer_delta: high word then low word */
 	log.u_bbr.flex2 = (uint32_t)(peer_delta >> 32);
 	log.u_bbr.flex3 = (uint32_t)(peer_delta & 0x00000000ffffffff);
 	/* delta: high word then low word */
 	log.u_bbr.flex4 = (uint32_t)(delta >> 32);
 	log.u_bbr.flex5 = (uint32_t)(delta & 0x00000000ffffffff);
 	log.u_bbr.flex7 = bbr->rc_ts_clock_set;
 	log.u_bbr.flex8 = bbr->rc_ts_cant_be_used;
 	TCP_LOG_EVENTP(bbr->rc_tp, NULL,
 	    &bbr->rc_inp->inp_socket->so_rcv,
 	    &bbr->rc_inp->inp_socket->so_snd,
 	    BBR_LOG_TSTMP_VAL, 0,
 	    0, &log, false, &bbr->rc_tv);
 }
 
 /*
  * Log a BBR_LOG_BBRTSO record.  flex7 packs rc_no_pacing (bit 1) and
  * rc_past_init_win (bit 0); flex8 is rc_use_google with 0x80 or'd in
  * when hdwr is non-zero.
  */
 static void
 bbr_log_type_tsosize(struct tcp_bbr *bbr, uint32_t cts, uint32_t tsosz, uint32_t tls, uint32_t old_val, uint32_t maxseg, int hdwr)
 {
 	union tcp_log_stackspecific log;
 
 	if (bbr->rc_tp->t_logstate == TCP_LOG_STATE_OFF)
 		return;
 
 	bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
 	/* Caller supplied */
 	log.u_bbr.flex1 = tsosz;
 	log.u_bbr.flex2 = tls;
 	log.u_bbr.flex5 = old_val;
 	log.u_bbr.flex6 = maxseg;
 	/* Pacing limits */
 	log.u_bbr.flex3 = tcp_min_hptsi_time;
 	log.u_bbr.flex4 = bbr->r_ctl.bbr_hptsi_bytes_min;
 	/* Pack the two state bits */
 	log.u_bbr.flex7 = bbr->rc_no_pacing;
 	log.u_bbr.flex7 <<= 1;
 	log.u_bbr.flex7 |= bbr->rc_past_init_win;
 	log.u_bbr.flex8 = bbr->rc_use_google;
 	if (hdwr)
 		log.u_bbr.flex8 |= 0x80;
 	TCP_LOG_EVENTP(bbr->rc_tp, NULL,
 	    &bbr->rc_inp->inp_socket->so_rcv,
 	    &bbr->rc_inp->inp_socket->so_snd,
 	    BBR_LOG_BBRTSO, 0,
 	    0, &log, false, &bbr->rc_tv);
 }
 
 /*
  * Log a BBR_RSM_CLEARED record describing the sendmap entry rsm
  * (start/end, delivered, retransmit count, dupacks, first send time
  * and flags) together with the caller's line number and flags.
  */
 static void
 bbr_log_type_rsmclear(struct tcp_bbr *bbr, uint32_t cts, struct bbr_sendmap *rsm,
 		      uint32_t flags, uint32_t line)
 {
 	union tcp_log_stackspecific log;
 
 	if (bbr->rc_tp->t_logstate == TCP_LOG_STATE_OFF)
 		return;
 
 	bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
 	log.u_bbr.flex1 = line;
 	/* Snapshot of the sendmap entry */
 	log.u_bbr.flex2 = rsm->r_start;
 	log.u_bbr.flex3 = rsm->r_end;
 	log.u_bbr.flex4 = rsm->r_delivered;
 	log.u_bbr.flex5 = rsm->r_rtr_cnt;
 	log.u_bbr.flex6 = rsm->r_dupack;
 	log.u_bbr.flex7 = rsm->r_tim_lastsent[0];
 	log.u_bbr.flex8 = rsm->r_flags;
 	/* The applimited field is reused to carry the caller's flags */
 	log.u_bbr.applimited = flags;
 	TCP_LOG_EVENTP(bbr->rc_tp, NULL,
 	    &bbr->rc_inp->inp_socket->so_rcv,
 	    &bbr->rc_inp->inp_socket->so_snd,
 	    BBR_RSM_CLEARED, 0,
 	    0, &log, false, &bbr->rc_tv);
 }
 
 /*
  * Log a BBR_LOG_BBRUPD record.  Every flexN argument is stored in the
  * log field of the same name and pkts_out overrides the pkts_out
  * field.  epoch is replaced with rc_ack_hdwr_delay when the ack was
  * delayed and with 0 otherwise.  flex2 is also passed on as the event
  * length.
  */
 static void
 bbr_log_type_bbrupd(struct tcp_bbr *bbr, uint8_t flex8, uint32_t cts,
     uint32_t flex3, uint32_t flex2, uint32_t flex5,
     uint32_t flex6, uint32_t pkts_out, int flex7,
     uint32_t flex4, uint32_t flex1)
 {
 	union tcp_log_stackspecific log;
 
 	if (bbr->rc_tp->t_logstate == TCP_LOG_STATE_OFF)
 		return;
 
 	bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
 	log.u_bbr.flex1 = flex1;
 	log.u_bbr.flex2 = flex2;
 	log.u_bbr.flex3 = flex3;
 	log.u_bbr.flex4 = flex4;
 	log.u_bbr.flex5 = flex5;
 	log.u_bbr.flex6 = flex6;
 	log.u_bbr.flex7 = flex7;
 	log.u_bbr.flex8 = flex8;
 	/* Replace the filled in pkts_out with the caller's value */
 	log.u_bbr.pkts_out = pkts_out;
 	log.u_bbr.epoch = bbr->rc_ack_was_delayed ?
 	    bbr->r_ctl.rc_ack_hdwr_delay : 0;
 	TCP_LOG_EVENTP(bbr->rc_tp, NULL,
 	    &bbr->rc_inp->inp_socket->so_rcv,
 	    &bbr->rc_inp->inp_socket->so_snd,
 	    BBR_LOG_BBRUPD, 0,
 	    flex2, &log, false, &bbr->rc_tv);
 }
 
 /*
  * Log a BBR_LOG_BWSAMP record for the long-term b/w sampler.  reason
  * is the caller's reason code (flex1).  This one is not gated on
  * bbr_verbose_logging.  epoch is replaced with the number of packet
  * epochs since rc_lt_epoch_use when lt bw is in use and since
  * rc_lt_epoch otherwise.
  */
 static void
 bbr_log_type_ltbw(struct tcp_bbr *bbr, uint32_t cts, int32_t reason,
 	uint32_t newbw, uint32_t obw, uint32_t diff,
 	uint32_t tim)
 {
 	union tcp_log_stackspecific log;
 
 	if (bbr->rc_tp->t_logstate == TCP_LOG_STATE_OFF)
 		return;
 
 	bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
 	/* Caller supplied */
 	log.u_bbr.flex1 = reason;
 	log.u_bbr.flex2 = newbw;
 	log.u_bbr.flex3 = obw;
 	log.u_bbr.flex4 = diff;
 	log.u_bbr.pkts_out = tim;
 	/* Current long-term sampling state */
 	log.u_bbr.flex5 = bbr->r_ctl.rc_lt_lost;
 	log.u_bbr.flex6 = bbr->r_ctl.rc_lt_del;
 	log.u_bbr.flex7 = bbr->rc_lt_is_sampling;
 	log.u_bbr.bw_inuse = bbr->r_ctl.rc_lt_bw;
 	if (bbr->rc_lt_use_bw)
 		log.u_bbr.epoch = bbr->r_ctl.rc_pkt_epoch - bbr->r_ctl.rc_lt_epoch_use;
 	else
 		log.u_bbr.epoch = bbr->r_ctl.rc_pkt_epoch - bbr->r_ctl.rc_lt_epoch;
 	TCP_LOG_EVENTP(bbr->rc_tp, NULL,
 	    &bbr->rc_inp->inp_socket->so_rcv,
 	    &bbr->rc_inp->inp_socket->so_snd,
 	    BBR_LOG_BWSAMP, 0,
 	    0, &log, false, &bbr->rc_tv);
 }
 
 /*
  * Log a BBR_LOG_PROGRESS record holding the caller's line, tick and
  * event along with tp's t_maxunacktime and t_acktime.  The record is
  * stamped with rc_rcvtime and only written when verbose logging is
  * enabled.
  */
 static inline void
 bbr_log_progress_event(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t tick, int event, int line)
 {
 	union tcp_log_stackspecific log;
 
 	if (!bbr_verbose_logging ||
 	    (bbr->rc_tp->t_logstate == TCP_LOG_STATE_OFF))
 		return;
 
 	bbr_fill_in_logging_data(bbr, &log.u_bbr, bbr->r_ctl.rc_rcvtime);
 	log.u_bbr.flex1 = line;
 	log.u_bbr.flex2 = tick;
 	log.u_bbr.flex8 = event;
 	log.u_bbr.flex3 = tp->t_maxunacktime;
 	log.u_bbr.flex4 = tp->t_acktime;
 	TCP_LOG_EVENTP(bbr->rc_tp, NULL,
 	    &bbr->rc_inp->inp_socket->so_rcv,
 	    &bbr->rc_inp->inp_socket->so_snd,
 	    BBR_LOG_PROGRESS, 0,
 	    0, &log, false, &bbr->rc_tv);
 }
 
 /*
  * Log a BBR_LOG_HDWR_PACE record.
  *
  * hw_rate is split over flex1 (upper 32 bits) and flex2 (lower 32
  * bits), the ifp pointer value over flex3/flex4 in the same way, and
  * rate is stored in bw_inuse.  flex8 packs skip_gain (bit 2),
  * gain_is_limited (bit 1) and bbr_hdrw_pacing (bit 0).  line and error
  * are the caller's source line and error code.
  */
 static void
 bbr_type_log_hdwr_pacing(struct tcp_bbr *bbr, const struct ifnet *ifp,
 			 uint64_t rate, uint64_t hw_rate, int line, uint32_t cts,
 			 int error)
 {
 	union tcp_log_stackspecific log;
 	uint64_t ifaddr;
 
 	if (bbr->rc_tp->t_logstate == TCP_LOG_STATE_OFF)
 		return;
 
 	bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
 	log.u_bbr.flex1 = ((hw_rate >> 32) & 0x00000000ffffffff);
 	log.u_bbr.flex2 = (hw_rate & 0x00000000ffffffff);
 	/*
 	 * Go through uintptr_t so the pointer is zero-extended (and no
 	 * pointer/integer size mismatch is diagnosed) on platforms where
 	 * a pointer is narrower than 64 bits.
 	 */
 	ifaddr = (uint64_t)(uintptr_t)ifp;
 	log.u_bbr.flex3 = ((ifaddr >> 32) & 0x00000000ffffffff);
 	log.u_bbr.flex4 = (ifaddr & 0x00000000ffffffff);
 	log.u_bbr.bw_inuse = rate;
 	log.u_bbr.flex5 = line;
 	log.u_bbr.flex6 = error;
 	/* Pack the three gain/pacing bits */
 	log.u_bbr.flex8 = bbr->skip_gain;
 	log.u_bbr.flex8 <<= 1;
 	log.u_bbr.flex8 |= bbr->gain_is_limited;
 	log.u_bbr.flex8 <<= 1;
 	log.u_bbr.flex8 |= bbr->bbr_hdrw_pacing;
 	/* pkts_out is reused to carry the current t_maxseg */
 	log.u_bbr.pkts_out = bbr->rc_tp->t_maxseg;
 	TCP_LOG_EVENTP(bbr->rc_tp, NULL,
 	    &bbr->rc_inp->inp_socket->so_rcv,
 	    &bbr->rc_inp->inp_socket->so_snd,
 	    BBR_LOG_HDWR_PACE, 0,
 	    0, &log, false, &bbr->rc_tv);
 }
 
 /*
  * Log a BBR_LOG_BBRSND record with the caller's slot, del_by,
  * prev_delay and line plus the last delay value, the aggregate hptsi
  * delay, the low 16 bits of the hpts flags and the persist state.
  * len is passed on as the event length.
  */
 static void
 bbr_log_type_bbrsnd(struct tcp_bbr *bbr, uint32_t len, uint32_t slot, uint32_t del_by, uint32_t cts, uint32_t line, uint32_t prev_delay)
 {
 	union tcp_log_stackspecific log;
 
 	if (bbr->rc_tp->t_logstate == TCP_LOG_STATE_OFF)
 		return;
 
 	bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
 	/* Caller supplied */
 	log.u_bbr.flex1 = slot;
 	log.u_bbr.flex2 = del_by;
 	log.u_bbr.flex3 = prev_delay;
 	log.u_bbr.flex4 = line;
 	/* Pacer state */
 	log.u_bbr.flex5 = bbr->r_ctl.rc_last_delay_val;
 	log.u_bbr.flex6 = bbr->r_ctl.rc_hptsi_agg_delay;
 	log.u_bbr.flex7 = (bbr->r_ctl.rc_hpts_flags & 0x0000ffff);
 	log.u_bbr.flex8 = bbr->rc_in_persist;
 	TCP_LOG_EVENTP(bbr->rc_tp, NULL,
 	    &bbr->rc_inp->inp_socket->so_rcv,
 	    &bbr->rc_inp->inp_socket->so_snd,
 	    BBR_LOG_BBRSND, 0,
 	    len, &log, false, &bbr->rc_tv);
 }
 
 /*
  * Log a BBR_LOG_BBRRTT record: t, end, seq, match and flags come from
  * the caller, rc_delivered and rc_lowest_rtt from the control block.
  * The tsconv argument is not recorded.
  */
 static void
 bbr_log_type_bbrrttprop(struct tcp_bbr *bbr, uint32_t t, uint32_t end, uint32_t tsconv, uint32_t cts, int32_t match, uint32_t seq, uint8_t flags)
 {
 	union tcp_log_stackspecific log;
 
 	if (bbr->rc_tp->t_logstate == TCP_LOG_STATE_OFF)
 		return;
 
 	bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
 	/* From the control block */
 	log.u_bbr.flex1 = bbr->r_ctl.rc_delivered;
 	log.u_bbr.flex3 = bbr->r_ctl.rc_lowest_rtt;
 	log.u_bbr.flex2 = 0;
 	/* From the caller */
 	log.u_bbr.flex4 = end;
 	log.u_bbr.flex5 = seq;
 	log.u_bbr.flex6 = t;
 	log.u_bbr.flex7 = match;
 	log.u_bbr.flex8 = flags;
 	TCP_LOG_EVENTP(bbr->rc_tp, NULL,
 	    &bbr->rc_inp->inp_socket->so_rcv,
 	    &bbr->rc_inp->inp_socket->so_snd,
 	    BBR_LOG_BBRRTT, 0,
 	    0, &log, false, &bbr->rc_tv);
 }
 
 /*
  * Log a BBR_LOG_EXIT_GAIN record.  entry_method is stored in flex8;
  * the other fields are the target-at-state, t_maxseg less the last
  * option length, the gain epoch, the pacing max/min segment sizes and
  * rc_bbr_state_atflight.
  */
 static void
 bbr_log_exit_gain(struct tcp_bbr *bbr, uint32_t cts, int32_t entry_method)
 {
 	union tcp_log_stackspecific log;
 
 	if (bbr->rc_tp->t_logstate == TCP_LOG_STATE_OFF)
 		return;
 
 	bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
 	log.u_bbr.flex8 = entry_method;
 	log.u_bbr.flex7 = 0;
 	log.u_bbr.flex1 = bbr->r_ctl.rc_target_at_state;
 	log.u_bbr.flex2 = (bbr->rc_tp->t_maxseg - bbr->rc_last_options);
 	log.u_bbr.flex3 = bbr->r_ctl.gain_epoch;
 	log.u_bbr.flex4 = bbr->r_ctl.rc_pace_max_segs;
 	log.u_bbr.flex5 = bbr->r_ctl.rc_pace_min_segs;
 	log.u_bbr.flex6 = bbr->r_ctl.rc_bbr_state_atflight;
 	TCP_LOG_EVENTP(bbr->rc_tp, NULL,
 	    &bbr->rc_inp->inp_socket->so_rcv,
 	    &bbr->rc_inp->inp_socket->so_snd,
 	    BBR_LOG_EXIT_GAIN, 0,
 	    0, &log, false, &bbr->rc_tv);
 }
 
 /*
  * Log a BBR_LOG_SETTINGS_CHG record.  Only settings_desired (flex8)
  * carries information; flex1-flex4 and flex7 are zeroed.  The record
  * is stamped with rc_rcvtime and only written when verbose logging is
  * enabled.
  */
 static void
 bbr_log_settings_change(struct tcp_bbr *bbr, int settings_desired)
 {
 	union tcp_log_stackspecific log;
 
 	if (!bbr_verbose_logging ||
 	    (bbr->rc_tp->t_logstate == TCP_LOG_STATE_OFF))
 		return;
 
 	bbr_fill_in_logging_data(bbr, &log.u_bbr, bbr->r_ctl.rc_rcvtime);
 	/* R-HU */
 	log.u_bbr.flex1 = 0;
 	log.u_bbr.flex2 = 0;
 	log.u_bbr.flex3 = 0;
 	log.u_bbr.flex4 = 0;
 	log.u_bbr.flex7 = 0;
 	log.u_bbr.flex8 = settings_desired;
 	TCP_LOG_EVENTP(bbr->rc_tp, NULL,
 	    &bbr->rc_inp->inp_socket->so_rcv,
 	    &bbr->rc_inp->inp_socket->so_snd,
 	    BBR_LOG_SETTINGS_CHG, 0,
 	    0, &log, false, &bbr->rc_tv);
 }
 
 /*
  * Returns the current value of our delivery rate filter
  * (rc_delrate).
  */
 static inline uint64_t
 bbr_get_full_bw(struct tcp_bbr *bbr)
 {
 	return (get_filter_value(&bbr->r_ctl.rc_delrate));
 }
 
 /*
  * Close out the current packet epoch and start the next one.
  *
  * The loss rate of the finished epoch (lost * 1000 / delivered) is
  * stored in rc_pkt_epoch_loss_rate and, while in recovery, added to
  * recovery_lr.  The epoch counter is bumped, which may end the
  * no-pacing period, the epoch rtt and start time are refreshed, the
  * epoch is logged and finally the lost/delivered baselines are moved
  * up to the current counters.
  */
 static inline void
 bbr_set_pktepoch(struct tcp_bbr *bbr, uint32_t cts, int32_t line)
 {
 	uint64_t loss_rate;
 	uint32_t lost, del;
 
 	/* How much was lost and delivered during the epoch? */
 	lost = 0;
 	if (bbr->r_ctl.rc_lost > bbr->r_ctl.rc_lost_at_pktepoch)
 		lost = bbr->r_ctl.rc_lost - bbr->r_ctl.rc_lost_at_pktepoch;
 	del = bbr->r_ctl.rc_delivered - bbr->r_ctl.rc_pkt_epoch_del;
 
 	if (lost == 0) {
 		loss_rate = 0;
 	} else if (del == 0) {
 		/* Nothing delivered? 100.0% loss */
 		loss_rate = 1000;
 	} else {
 		loss_rate = ((uint64_t)lost * (uint64_t)1000) / (uint64_t)del;
 	}
 	bbr->r_ctl.rc_pkt_epoch_loss_rate = (uint32_t)loss_rate;
 	if (IN_RECOVERY(bbr->rc_tp->t_flags))
 		bbr->r_ctl.recovery_lr += (uint32_t)loss_rate;
 
 	bbr->r_ctl.rc_pkt_epoch++;
 	/* Have we reached the epoch where pacing is to be turned on? */
 	if (bbr->rc_no_pacing &&
 	    (bbr->r_ctl.rc_pkt_epoch >= bbr->no_pacing_until)) {
 		bbr->rc_no_pacing = 0;
 		tcp_bbr_tso_size_check(bbr, cts);
 	}
 	bbr->r_ctl.rc_pkt_epoch_rtt = bbr_calc_time(cts, bbr->r_ctl.rc_pkt_epoch_time);
 	bbr->r_ctl.rc_pkt_epoch_time = cts;
 	/* Log before the baselines below are moved */
 	bbr_log_pkt_epoch(bbr, cts, line, lost, del);
 	bbr->r_ctl.rc_lost_at_pktepoch = bbr->r_ctl.rc_lost;
 	bbr->r_ctl.rc_pkt_epoch_del = bbr->r_ctl.rc_delivered;
 }
 
 /*
  * Advance the RTT epoch counter, log how long the finished epoch
  * lasted and restart the epoch clock at cts.
  */
 static inline void
 bbr_set_epoch(struct tcp_bbr *bbr, uint32_t cts, int32_t line)
 {
 	/* Tick the RTT clock */
 	bbr->r_ctl.rc_rtt_epoch++;
 	bbr_log_time_epoch(bbr, cts, line,
 	    (uint32_t)(cts - bbr->r_ctl.rc_rcv_epoch_start));
 	bbr->r_ctl.rc_rcv_epoch_start = cts;
 }
 
 /*
  * Flag that a packet epoch has ended (rc_is_pkt_epoch_now) when the
  * delivered mark recorded in rsm is at or beyond the delivered mark
  * taken at the start of the current packet epoch.  cts, line and
  * cum_acked are not used.
  */
 static inline void
 bbr_isit_a_pkt_epoch(struct tcp_bbr *bbr, uint32_t cts, struct bbr_sendmap *rsm, int32_t line, int32_t cum_acked)
 {
 	if (!SEQ_GEQ(rsm->r_delivered, bbr->r_ctl.rc_pkt_epoch_del))
 		return;
 	bbr->rc_is_pkt_epoch_now = 1;
 }
 
 /*
  * Returns the bw from either the b/w filter
  * or from the lt_bw (if the connection is being
  * policed).
  *
  * Until enough measurements have been taken (or the initial window
  * has not yet been passed) an estimate built from the initial cwnd
  * and the rttProp is returned instead.  While in STARTUP the result
  * is never below a floor (min_bw) derived the same way.  The value
  * returned is never 0.
  *
  * Note that this may set rc_past_init_win as a side effect.
  */
 static inline uint64_t
 __bbr_get_bw(struct tcp_bbr *bbr)
 {
 	uint64_t bw, min_bw;
 	uint64_t rtt;
 	/* Cleared below when google mode already has one measurement */
 	int gm_measure_cnt = 1;
 
 	/*
 	 * For startup we make, like google, a
 	 * minimum b/w. This is generated from the
 	 * IW and the rttProp. We do fall back to srtt
 	 * if for some reason (initial handshake) we don't
 	 * have a rttProp. We, in the worst case, fall back
 	 * to the configured min_bw (rc_initial_hptsi_bw).
 	 */
 	if (bbr->rc_bbr_state == BBR_STATE_STARTUP) {
 		/* Attempt first to use rttProp */
 		rtt = (uint64_t)get_filter_value_small(&bbr->r_ctl.rc_rttprop);
 		if (rtt && (rtt < 0xffffffff)) {
 			/*
 			 * The srtt fall back below jumps in here with rtt
 			 * already set. min_bw = IW * 1000000 / rtt, but
 			 * never below rc_initial_hptsi_bw.
 			 */
 measure:
 			min_bw = (uint64_t)(bbr_initial_cwnd(bbr, bbr->rc_tp)) *
 				((uint64_t)1000000);
 			min_bw /= rtt;
 			if (min_bw < bbr->r_ctl.rc_initial_hptsi_bw) {
 				min_bw = bbr->r_ctl.rc_initial_hptsi_bw;
 			}
 
 		} else if (bbr->rc_tp->t_srtt != 0) {
 			/* No rttProp, use srtt? */
 			rtt = bbr_get_rtt(bbr, BBR_SRTT);
 			goto measure;
 		} else {
 			min_bw = bbr->r_ctl.rc_initial_hptsi_bw;
 		}
 	} else
 		/* No floor is applied outside of startup */
 		min_bw = 0;
 
 	/* Note once we have delivered more than the initial window */
 	if ((bbr->rc_past_init_win == 0) &&
 	    (bbr->r_ctl.rc_delivered > bbr_initial_cwnd(bbr, bbr->rc_tp)))
 		bbr->rc_past_init_win = 1;
 	if ((bbr->rc_use_google)  && (bbr->r_ctl.r_measurement_count >= 1))
 		gm_measure_cnt = 0;
 	if (gm_measure_cnt &&
 	    ((bbr->r_ctl.r_measurement_count < bbr_min_measurements_req) ||
 	     (bbr->rc_past_init_win == 0))) {
 		/* For google we use our guess rate until we get 1 measurement */
 
 		/*
 		 * Also entered from below when the measured/selected
 		 * bw turns out to be 0.
 		 */
 use_initial_window:
 		rtt = (uint64_t)get_filter_value_small(&bbr->r_ctl.rc_rttprop);
 		if (rtt && (rtt < 0xffffffff)) {
 			/*
 			 * We have an RTT measurement. Use that in
 			 * combination with our initial window to calculate
 			 * a b/w.
 			 */
 			bw = (uint64_t)(bbr_initial_cwnd(bbr, bbr->rc_tp)) *
 				((uint64_t)1000000);
 			bw /= rtt;
 			if (bw < bbr->r_ctl.rc_initial_hptsi_bw) {
 				bw = bbr->r_ctl.rc_initial_hptsi_bw;
 			}
 		} else {
 			/* Drop back to the 40 and punt to a default */
 			bw = bbr->r_ctl.rc_initial_hptsi_bw;
 		}
 		if (bw < 1)
 			/* Probably should panic */
 			bw = 1;
 		/* Hand back the larger of the estimate and the floor */
 		if (bw > min_bw)
 			return (bw);
 		else
 			return (min_bw);
 	}
 	/*
 	 * Pick the source of the b/w: the long-term (policer) b/w, the
 	 * reduced b/w used during recovery (non-google mode only) or the
 	 * delivery rate filter.
 	 */
 	if (bbr->rc_lt_use_bw)
 		bw = bbr->r_ctl.rc_lt_bw;
 	else if (bbr->r_recovery_bw && (bbr->rc_use_google == 0))
 		bw = bbr->r_ctl.red_bw;
 	else
 		bw = get_filter_value(&bbr->r_ctl.rc_delrate);
 	if (bbr->rc_tp->t_peakrate_thr && (bbr->rc_use_google == 0)) {
 		/*
 		 * Enforce user set rate limit, keep in mind that
 		 * t_peakrate_thr is in B/s already
 		 */
 		bw = uqmin((uint64_t)bbr->rc_tp->t_peakrate_thr, bw);
 	}
 	if (bw == 0) {
 		/* We should not be at 0, go to the initial window then  */
 		goto use_initial_window;
 	}
 	/* bw is unsigned and non-zero here, so this can not trigger */
 	if (bw < 1)
 		/* Probably should panic */
 		bw = 1;
 	if (bw < min_bw)
 		bw = min_bw;
 	return (bw);
 }
 
 /*
  * Returns the b/w to use; currently just a wrapper around
  * __bbr_get_bw().
  */
 static inline uint64_t
 bbr_get_bw(struct tcp_bbr *bbr)
 {
 	return (__bbr_get_bw(bbr));
 }
 
 /*
  * Start a new long-term b/w sampling interval by taking a snapshot
  * of the current packet epoch, delivery time and the delivered/lost
  * counters.  cts is not used.
  */
 static inline void
 bbr_reset_lt_bw_interval(struct tcp_bbr *bbr, uint32_t cts)
 {
 	/* Counters */
 	bbr->r_ctl.rc_lt_lost = bbr->r_ctl.rc_lost;
 	bbr->r_ctl.rc_lt_del = bbr->r_ctl.rc_delivered;
 	/* Time and epoch */
 	bbr->r_ctl.rc_lt_time = bbr->r_ctl.rc_del_time;
 	bbr->r_ctl.rc_lt_epoch = bbr->r_ctl.rc_pkt_epoch;
 }
 
 /*
  * Abandon long-term b/w sampling completely: forget the stored lt bw,
  * stop sampling and stop using it, then start a fresh interval.
  */
 static inline void
 bbr_reset_lt_bw_sampling(struct tcp_bbr *bbr, uint32_t cts)
 {
 	bbr->r_ctl.rc_lt_bw = 0;
 	bbr->rc_lt_use_bw = 0;
 	bbr->rc_lt_is_sampling = 0;
 	bbr_reset_lt_bw_interval(bbr, cts);
 }
 
 /*
  * A long-term b/w sample (bw, in bytes per second) has completed.
  *
  * If a previous sample exists and the new one is close to it (within
  * bbr_lt_bw_diff, or within 1/bbr_lt_bw_ratio of the previous sample)
  * we consider the connection policed: the average of the two samples
  * becomes the lt bw, it is put into use and the pacing gain is set to
  * BBR_UNIT.  Otherwise the new sample is stored and a new sampling
  * interval is started.
  */
 static inline void
 bbr_lt_bw_samp_done(struct tcp_bbr *bbr, uint64_t bw, uint32_t cts, uint32_t timin)
 {
 	uint64_t diff;
 	uint32_t saved_bw;
 
 	/* Do we have a previous sample? */
 	if (bbr->r_ctl.rc_lt_bw) {
 		/* Get the absolute diff in bytes per second */
 		diff = (bbr->r_ctl.rc_lt_bw > bw) ?
 		    (bbr->r_ctl.rc_lt_bw - bw) : (bw - bbr->r_ctl.rc_lt_bw);
 		if ((diff <= bbr_lt_bw_diff) ||
 		    (diff <= (bbr->r_ctl.rc_lt_bw / bbr_lt_bw_ratio))) {
 			/* Consider us policed */
 			saved_bw = (uint32_t)bbr->r_ctl.rc_lt_bw;
 			/* average of two */
 			bbr->r_ctl.rc_lt_bw = (bw + bbr->r_ctl.rc_lt_bw) / 2;
 			bbr->r_ctl.rc_bbr_hptsi_gain = BBR_UNIT;
 			bbr->rc_lt_use_bw = 1;
 			/*
 			 * Use pkt based epoch for measuring length of
 			 * policer up
 			 */
 			bbr->r_ctl.rc_lt_epoch_use = bbr->r_ctl.rc_pkt_epoch;
 			/*
 			 * reason 4 is we need to start considering
 			 * ourselves policed
 			 */
 			bbr_log_type_ltbw(bbr, cts, 4, (uint32_t)bw, saved_bw, (uint32_t)diff, timin);
 			return;
 		}
 	}
 	/* First sample, or too far from the last one: remember it */
 	bbr->r_ctl.rc_lt_bw = bw;
 	bbr_reset_lt_bw_interval(bbr, cts);
 	bbr_log_type_ltbw(bbr, cts, 5, 0, (uint32_t)bw, 0, timin);
 }
 
 /*
  * Randomly shrink rc_level_state_extra.  A random value below
  * bbr_rand_ot is drawn; when it is non-zero the extra time is reduced
  * by (extra / random value), when it is zero nothing changes.
  */
 static void
 bbr_randomize_extra_state_time(struct tcp_bbr *bbr)
 {
 	uint32_t ran, cut;
 
 	ran = arc4random_uniform(bbr_rand_ot);
 	if (ran == 0)
 		return;
 	cut = bbr->r_ctl.rc_level_state_extra / ran;
 	bbr->r_ctl.rc_level_state_extra -= cut;
 }
 /*
  * Return randomly the starting state
  * to use in probebw.
  *
  * As a side effect the extra-time bookkeeping is cleared, an RTT
  * epoch is set if at least one rttProp has passed since the last one
  * started, and the loss count at state entry is recorded.
  */
 static uint8_t
 bbr_pick_probebw_substate(struct tcp_bbr *bbr, uint32_t cts)
 {
 	uint32_t ran;
 
 	/* Initialize the offset to 0 */
 	bbr->rc_hit_state_1 = 0;
 	bbr->r_ctl.rc_level_state_extra = 0;
 	bbr->r_ctl.rc_exta_time_gd = 0;
 	/*
 	 * The math works funny here :) the value we return is used to
 	 * set the substate and then the state change is called, which
 	 * increments it by one. So if we returned 1 (DRAIN) we would
 	 * move on to 2 (LEVEL1) when we fully enter the state.
 	 * (BBR_SUBSTATE_COUNT - 1 - ran) with ran in the range
 	 * 0 .. BBR_SUBSTATE_COUNT - 2 is always at least 1, so we never
 	 * return 0 and end up starting in state 1 (DRAIN).
 	 */
 	ran = arc4random_uniform((BBR_SUBSTATE_COUNT-1));
 	/* Set an epoch */
 	if ((cts - bbr->r_ctl.rc_rcv_epoch_start) >= bbr_get_rtt(bbr, BBR_RTT_PROP))
 		bbr_set_epoch(bbr, cts, __LINE__);
 
 	bbr->r_ctl.bbr_lost_at_state = bbr->r_ctl.rc_lost;
 	return ((uint8_t)(BBR_SUBSTATE_COUNT - 1 - ran));
 }
 
 /*
  * Long-term b/w sampling, used to detect a policer.
  *
  * Does nothing unless r_use_policer is set.  There are three phases:
  *  - lt bw in use (rc_lt_use_bw): decide if we stop using it, either
  *    because it has been in use for more than bbr_lt_bw_max_rtts
  *    packet epochs or because false-positive detection fires.
  *  - not yet sampling: begin sampling at the first detected loss.
  *  - sampling: once the interval is long enough, ended by a loss and
  *    lossy enough, compute delivered bytes per second over the
  *    interval and hand it to bbr_lt_bw_samp_done().
  *
  * The reason codes handed to bbr_log_type_ltbw() are noted at each
  * call.
  */
 static void
 bbr_lt_bw_sampling(struct tcp_bbr *bbr, uint32_t cts, int32_t loss_detected)
 {
 	uint32_t diff, d_time;
 	uint64_t del_time, bw, lost, delivered;
 
 	if (bbr->r_use_policer == 0)
 		return;
 	if (bbr->rc_lt_use_bw) {
 		/* We are using lt bw do we stop yet? */
 		diff = bbr->r_ctl.rc_pkt_epoch - bbr->r_ctl.rc_lt_epoch_use;
 		if (diff > bbr_lt_bw_max_rtts) {
 			/* Reset it all */
 			/* (the false-positive check below jumps here too) */
 reset_all:
 			bbr_reset_lt_bw_sampling(bbr, cts);
 			if (bbr->rc_filled_pipe) {
 				/* Pipe was filled, go (back) to PROBE_BW */
 				bbr_set_epoch(bbr, cts, __LINE__);
 				bbr->rc_bbr_substate = bbr_pick_probebw_substate(bbr, cts);
 				bbr_substate_change(bbr, cts, __LINE__, 0);
 				bbr->rc_bbr_state = BBR_STATE_PROBE_BW;
 				bbr_log_type_statechange(bbr, cts, __LINE__);
 			} else {
 				/*
 				 * This should not happen really
 				 * unless we remove the startup/drain
 				 * restrictions above.
 				 */
 				bbr->rc_bbr_state = BBR_STATE_STARTUP;
 				bbr_set_epoch(bbr, cts, __LINE__);
 				bbr->r_ctl.rc_bbr_state_time = cts;
 				bbr->r_ctl.rc_lost_at_startup = bbr->r_ctl.rc_lost;
 				bbr->r_ctl.rc_bbr_hptsi_gain = bbr->r_ctl.rc_startup_pg;
 				bbr->r_ctl.rc_bbr_cwnd_gain = bbr->r_ctl.rc_startup_pg;
 				bbr_set_state_target(bbr, __LINE__);
 				bbr_log_type_statechange(bbr, cts, __LINE__);
 			}
 			/* reason 0 is to stop using lt-bw */
 			bbr_log_type_ltbw(bbr, cts, 0, 0, 0, 0, 0);
 			return;
 		}
 		if (bbr_lt_intvl_fp == 0) {
 			/* Not doing false-positive detection */
 			return;
 		}
 		/* False positive detection */
 		if (diff == bbr_lt_intvl_fp) {
 			/* At bbr_lt_intvl_fp we record the lost */
 			bbr->r_ctl.rc_lt_del = bbr->r_ctl.rc_delivered;
 			bbr->r_ctl.rc_lt_lost = bbr->r_ctl.rc_lost;
 		} else if (diff > (bbr_lt_intvl_min_rtts + bbr_lt_intvl_fp)) {
 			/* Now is our loss rate still high? */
 			lost = bbr->r_ctl.rc_lost - bbr->r_ctl.rc_lt_lost;
 			delivered = bbr->r_ctl.rc_delivered - bbr->r_ctl.rc_lt_del;
 			if ((delivered == 0) ||
 			    (((lost * 1000)/delivered) < bbr_lt_fd_thresh)) {
 				/* No still below our threshold */
 				/* reason 7, keep on using the lt-bw */
 				bbr_log_type_ltbw(bbr, cts, 7, lost, delivered, 0, 0);
 			} else {
 				/* Yikes its still high, it must be a false positive */
 				/* reason 8, then stop using the lt-bw */
 				bbr_log_type_ltbw(bbr, cts, 8, lost, delivered, 0, 0);
 				goto reset_all;
 			}
 		}
 		return;
 	}
 	/*
 	 * Wait for the first loss before sampling, to let the policer
 	 * exhaust its tokens and estimate the steady-state rate allowed by
 	 * the policer. Starting samples earlier includes bursts that
 	 * over-estimate the bw.
 	 */
 	if (bbr->rc_lt_is_sampling == 0) {
 		/* reason 1 is to begin doing the sampling  */
 		if (loss_detected == 0)
 			return;
 		bbr_reset_lt_bw_interval(bbr, cts);
 		bbr->rc_lt_is_sampling = 1;
 		bbr_log_type_ltbw(bbr, cts, 1, 0, 0, 0, 0);
 		return;
 	}
 	/* Now how long have we been delivering in this interval? */
 	if (TSTMP_GEQ(bbr->r_ctl.rc_del_time, bbr->r_ctl.rc_lt_time))
 		d_time = bbr->r_ctl.rc_del_time - bbr->r_ctl.rc_lt_time;
 	else
 		d_time = 0;
 
 	/* To avoid underestimates, reset sampling if we run out of data. */
 	if (bbr->r_ctl.r_app_limited_until) {
 		/* Can not measure in app-limited state */
 		bbr_reset_lt_bw_sampling(bbr, cts);
 		/* reason 2 is to reset sampling due to app limits  */
 		bbr_log_type_ltbw(bbr, cts, 2, 0, 0, 0, d_time);
 		return;
 	}
 	/* Length of the sampling interval in packet epochs */
 	diff = bbr->r_ctl.rc_pkt_epoch - bbr->r_ctl.rc_lt_epoch;
 	if (diff < bbr_lt_intvl_min_rtts) {
 		/*
 		 * need more samples (we don't
 		 * start on a round like linux so
 		 * we need 1 more).
 		 */
 		/* 6 is not_enough time or no-loss */
 		bbr_log_type_ltbw(bbr, cts, 6, 0, 0, 0, d_time);
 		return;
 	}
 	if (diff > (4 * bbr_lt_intvl_min_rtts)) {
 		/*
 		 * For now if we wait too long, reset all sampling. We need
 		 * to do some research here, it's possible that we should
 		 * base this on how much loss has occurred.. something like
 		 * if it's under 10% (or some thresh) reset all otherwise
 		 * don't.  That's for phase II I guess.
 		 */
 		bbr_reset_lt_bw_sampling(bbr, cts);
  		/* reason 3 is to reset sampling due to too long of sampling */
 		bbr_log_type_ltbw(bbr, cts, 3, 0, 0, 0, d_time);
 		return;
 	}
 	/*
 	 * End sampling interval when a packet is lost, so we estimate the
 	 * policer tokens were exhausted. Stopping the sampling before the
 	 * tokens are exhausted under-estimates the policed rate.
 	 */
 	if (loss_detected == 0) {
 		/* 6 is not_enough time or no-loss */
 		bbr_log_type_ltbw(bbr, cts, 6, 0, 0, 0, d_time);
 		return;
 	}
 	/* Calculate packets lost and delivered in sampling interval. */
 	lost = bbr->r_ctl.rc_lost - bbr->r_ctl.rc_lt_lost;
 	delivered = bbr->r_ctl.rc_delivered - bbr->r_ctl.rc_lt_del;
 	/* Loss rate (lost * 1000 / delivered) too low to be a policer? */
 	if ((delivered == 0) ||
 	    (((lost * 1000)/delivered) < bbr_lt_loss_thresh)) {
 		bbr_log_type_ltbw(bbr, cts, 6, lost, delivered, 0, d_time);
 		return;
 	}
 	if (d_time < 1000) {
 		/* Not enough time. wait */
 		/* 6 is not_enough time or no-loss */
 		bbr_log_type_ltbw(bbr, cts, 6, 0, 0, 0, d_time);
 		return;
 	}
 	if (d_time >= (0xffffffff / USECS_IN_MSEC)) {
 		/* Too long */
 		bbr_reset_lt_bw_sampling(bbr, cts);
  		/* reason 3 is to reset sampling due to too long of sampling */
 		bbr_log_type_ltbw(bbr, cts, 3, 0, 0, 0, d_time);
 		return;
 	}
 	/* bw = delivered * USECS_IN_SECOND / d_time, done in 64 bits */
 	del_time = d_time;
 	bw = delivered;
 	bw *= (uint64_t)USECS_IN_SECOND;
 	bw /= del_time;
 	bbr_lt_bw_samp_done(bbr, bw, cts, d_time);
 }
 
 /*
  * Allocate a sendmap from our zone.
  *
  * If the zone can not give us one (we never wait) fall back to the
  * head of the connection's own free list.  Returns NULL when both are
  * exhausted.
  */
 static struct bbr_sendmap *
 bbr_alloc(struct tcp_bbr *bbr)
 {
 	struct bbr_sendmap *rsm;
 
 	BBR_STAT_INC(bbr_to_alloc);
 	rsm = uma_zalloc(bbr_zone, (M_NOWAIT | M_ZERO));
 	if (rsm != NULL) {
 		bbr->r_ctl.rc_num_maps_alloced++;
 		return (rsm);
 	}
 	if (bbr->r_ctl.rc_free_cnt == 0) {
 		/* Nothing in reserve either */
 		BBR_STAT_INC(bbr_to_alloc_failed);
 		return (NULL);
 	}
 	/* Dip into the emergency reserve */
 	BBR_STAT_INC(bbr_to_alloc_emerg);
 	rsm = TAILQ_FIRST(&bbr->r_ctl.rc_free);
 	TAILQ_REMOVE(&bbr->r_ctl.rc_free, rsm, r_next);
 	bbr->r_ctl.rc_free_cnt--;
 	return (rsm);
 }
 
 /*
  * Allocate a sendmap entry unless the connection already holds
  * V_tcp_map_entries_limit of them (a limit of 0 means no limit).
  * Being limited is counted, and counted once per connection as well.
  */
 static struct bbr_sendmap *
 bbr_alloc_full_limit(struct tcp_bbr *bbr)
 {
 	if ((V_tcp_map_entries_limit <= 0) ||
 	    (bbr->r_ctl.rc_num_maps_alloced < V_tcp_map_entries_limit))
 		return (bbr_alloc(bbr));
 
 	BBR_STAT_INC(bbr_alloc_limited);
 	if (!bbr->alloc_limit_reported) {
 		/* First time this connection hit a limit */
 		bbr->alloc_limit_reported = 1;
 		BBR_STAT_INC(bbr_alloc_limited_conns);
 	}
 	return (NULL);
 }
 
 /*
  * Wrapper to allocate a sendmap entry, subject to a specific limit.
  * A limit_type of 0 means no limit is applied.  An entry allocated
  * under a limit is tagged with the limit type and counted in
  * rc_num_split_allocs.
  */
 static struct bbr_sendmap *
 bbr_alloc_limit(struct tcp_bbr *bbr, uint8_t limit_type)
 {
 	struct bbr_sendmap *rsm;
 
 	/* currently there is only one limit type */
 	if (limit_type &&
 	    V_tcp_map_split_limit > 0 &&
 	    bbr->r_ctl.rc_num_split_allocs >= V_tcp_map_split_limit) {
 		BBR_STAT_INC(bbr_split_limited);
 		if (!bbr->alloc_limit_reported) {
 			bbr->alloc_limit_reported = 1;
 			BBR_STAT_INC(bbr_alloc_limited_conns);
 		}
 		return (NULL);
 	}
 
 	rsm = bbr_alloc(bbr);
 	if (rsm == NULL || limit_type == 0)
 		return (rsm);
 	/* mark the entry with the limit type it counts against */
 	rsm->r_limit_type = limit_type;
 	bbr->r_ctl.rc_num_split_allocs++;
 	return (rsm);
 }
 
 /*
  * Release a sendmap entry.
  *
  * The split/small-map counts are adjusted and any cached pointer to
  * the entry in the control block is cleared.  The entry is then
  * either zeroed and kept on the connection's free list (while that
  * list holds fewer than bbr_min_req_free entries) or handed back to
  * the zone.
  */
 static void
 bbr_free(struct tcp_bbr *bbr, struct bbr_sendmap *rsm)
 {
 	/* currently there is only one limit type */
 	if (rsm->r_limit_type)
 		bbr->r_ctl.rc_num_split_allocs--;
 	if (rsm->r_is_smallmap)
 		bbr->r_ctl.rc_num_small_maps_alloced--;
 
 	/* Make sure nobody is left pointing at it */
 	if (bbr->r_ctl.rc_sacklast == rsm)
 		bbr->r_ctl.rc_sacklast = NULL;
 	if (bbr->r_ctl.rc_next == rsm)
 		bbr->r_ctl.rc_next = NULL;
 	if (bbr->r_ctl.rc_resend == rsm)
 		bbr->r_ctl.rc_resend = NULL;
 	if (bbr->r_ctl.rc_tlp_send == rsm)
 		bbr->r_ctl.rc_tlp_send = NULL;
 
 	if (bbr->r_ctl.rc_free_cnt >= bbr_min_req_free) {
 		/* The reserve is full, give it back to the zone */
 		bbr->r_ctl.rc_num_maps_alloced--;
 		uma_zfree(bbr_zone, rsm);
 		return;
 	}
 	/* Top up the reserve with a clean entry */
 	memset(rsm, 0, sizeof(struct bbr_sendmap));
 	TAILQ_INSERT_TAIL(&bbr->r_ctl.rc_free, rsm, r_next);
 	rsm->r_limit_type = 0;
 	bbr->r_ctl.rc_free_cnt++;
 }
 
 /*
  * Returns the BDP.
  *
  * That is the bytes in flight needed given the bw (in bytes per
  * second) and the specified rtt (in useconds). We need to put out
  * the returned value per RTT to match that rate. Gain will normally
  * raise it up from there.
  *
  * This should not overflow as long as the bandwidth is below 1
  * TByte per second (bw < 10**12 = 2**40) and the rtt is smaller
  * than 1000 seconds (rtt < 10**3 * 10**6 = 10**9 = 2**30).
  */
 static uint64_t
 bbr_get_bw_delay_prod(uint64_t rtt, uint64_t bw)
 {
 	return ((rtt * bw) / (uint64_t)USECS_IN_SECOND);
 }
 
 /*
  * Return the initial cwnd.
  *
  * In order of preference: the per-connection setting (rc_init_win
  * segments), V_tcp_initcwnd_segments segments capped at
  * max(2 * maxseg, 14600), the RFC3390 formula, or the RFC5681 table.
  */
 static uint32_t
 bbr_initial_cwnd(struct tcp_bbr *bbr, struct tcpcb *tp)
 {
 	if (bbr->rc_init_win)
 		return (bbr->rc_init_win * tp->t_maxseg);
 
 	if (V_tcp_initcwnd_segments)
 		return (min((V_tcp_initcwnd_segments * tp->t_maxseg),
 		    max(2 * tp->t_maxseg, 14600)));
 
 	if (V_tcp_do_rfc3390)
 		return (min(4 * tp->t_maxseg,
 		    max(2 * tp->t_maxseg, 4380)));
 
 	/* Per RFC5681 Section 3.1 */
 	if (tp->t_maxseg > 2190)
 		return (2 * tp->t_maxseg);
 	if (tp->t_maxseg > 1095)
 		return (3 * tp->t_maxseg);
 	return (4 * tp->t_maxseg);
 }
 
 /*
  * Given a specified gain (in BBR_UNIT fixed point) and a bandwidth,
  * return the target cwnd based on that gain.  No rounding to an mss
  * and no minimum is applied here.
  */
 static uint32_t
 bbr_get_raw_target_cwnd(struct tcp_bbr *bbr, uint32_t gain, uint64_t bw)
 {
 	uint64_t gained, rtt_usecs;
 
 	/* Without an rtt sample or a full-bw estimate use the initial cwnd */
 	if ((get_filter_value_small(&bbr->r_ctl.rc_rttprop) == 0xffffffff) ||
 	    (bbr_get_full_bw(bbr) == 0))
 		return (bbr_initial_cwnd(bbr, bbr->rc_tp));
 
 	/*
 	 * The rtt flavor is selected by bbr_cwndtarget_rtt_touse; the
 	 * BDP of that rtt and bw is the number of bytes needed per RTT.
 	 */
 	rtt_usecs = bbr_get_rtt(bbr, bbr_cwndtarget_rtt_touse);
 	gained = bbr_get_bw_delay_prod(rtt_usecs, bw);
 
 	/* Scale by gain / BBR_UNIT, rounding any remainder up */
 	gained *= (uint64_t)gain;
 	gained += (uint64_t)(BBR_UNIT - 1);
 	return ((uint32_t)(gained / ((uint64_t)BBR_UNIT)));
 }
 
 /*
  * Return the target cwnd (in bytes) for the given bandwidth and gain.
  *
  * The raw gained BDP is rounded up to a multiple of the effective mss,
  * bbr_quanta pacing quanta are added, the google-mode adjustments are
  * applied when enabled, and the result is never below get_min_cwnd().
  */
 static uint32_t
 bbr_get_target_cwnd(struct tcp_bbr *bbr, uint64_t bw, uint32_t gain)
 {
 	uint32_t cwnd, mss;
 
 	mss = min((bbr->rc_tp->t_maxseg - bbr->rc_last_options), bbr->r_ctl.rc_pace_max_segs);
 	/*
 	 * Get the base cwnd with gain rounded to a mss.  Note the callee
 	 * takes (gain, bw) in that order, which is the reverse of this
 	 * function's own (bw, gain) parameters; passing them swapped
 	 * would truncate the 64-bit bandwidth into the 32-bit gain.
 	 */
 	cwnd = roundup(bbr_get_raw_target_cwnd(bbr, gain, bw), mss);
 	/*
 	 * Add in N (2 default since we do not have a
 	 * fq layer to trap packets in) quanta's per the I-D
 	 * section 4.2.3.2 quanta adjust.
 	 */
 	cwnd += (bbr_quanta * bbr->r_ctl.rc_pace_max_segs);
 	if (bbr->rc_use_google) {
 		if ((bbr->rc_bbr_state == BBR_STATE_PROBE_BW) &&
 		    (bbr_state_val(bbr) == BBR_SUB_GAIN)) {
 			/*
 			 * The linux implementation adds an extra
 			 * 2 x mss in the gain cycle, which is not
 			 * documented anywhere except in the code,
 			 * so we add it here as well.
 			 */
 			cwnd += 2 * mss;
 		}
 		if ((cwnd / mss) & 0x1) {
 			/* Round up for odd num mss */
 			cwnd += mss;
 		}
 	}
 	/* Are we below the min cwnd? */
 	if (cwnd < get_min_cwnd(bbr))
 		return (get_min_cwnd(bbr));
 	return (cwnd);
 }
 
 /*
  * Clamp a gain so that it is never zero: a gain of 0 is replaced by 1,
  * every other value is returned unchanged.  The bbr argument is unused.
  */
 static uint16_t
 bbr_gain_adjust(struct tcp_bbr *bbr, uint16_t gain)
 {
 	return ((gain == 0) ? 1 : gain);
 }
 
 /*
  * Return the per-segment header overhead (in bytes) that the
  * connection is configured to account for.  Each of the TCP, IP and
  * ethernet layers is included only when its rc_inc_*_oh flag is set.
  */
 static uint32_t
 bbr_get_header_oh(struct tcp_bbr *bbr)
 {
 	int overhead = 0;
 
 	/* TCP: rc_last_options plus the fixed TCP header */
 	if (bbr->r_ctl.rc_inc_tcp_oh)
 		overhead += (bbr->rc_last_options + sizeof(struct tcphdr));
 
 	/* IP: v6 or v4 header, depending on the connection */
 	if (bbr->r_ctl.rc_inc_ip_oh) {
 #ifdef INET6
 		if (bbr->r_is_v6)
 			overhead += sizeof(struct ip6_hdr);
 		else
 #endif
 		{
 #ifdef INET
 			overhead += sizeof(struct ip);
 #endif
 		}
 	}
 
 	/* Ethernet header */
 	if (bbr->r_ctl.rc_inc_enet_oh)
 		overhead += sizeof(struct ether_header);
 
 	return (overhead);
 }
 
 /*
  * Return how many bytes can be sent in useconds_time microseconds at
  * bandwidth bw scaled by gain (gain is in BBR_UNIT fixed point).
  * A zero time yields 0; otherwise the result is at least 1.
  */
 static uint32_t
 bbr_get_pacing_length(struct tcp_bbr *bbr, uint16_t gain, uint32_t useconds_time, uint64_t bw)
 {
 	uint64_t bytes;
 
 	if (useconds_time == 0)
 		return (0);
 
 	gain = bbr_gain_adjust(bbr, gain);
 	/* bytes = usecs * bw * gain / (usecs-per-second * BBR_UNIT) */
 	bytes = ((uint64_t)useconds_time * bw * gain) /
 	    ((uint64_t)USECS_IN_SECOND * (uint64_t)BBR_UNIT);
 
 	return ((uint32_t)((bytes == 0) ? 1 : bytes));
 }
 
 /*
  * Given a gain and a length return the delay in useconds that
  * should be used to evenly space out packets
  * on the connection (based on the gain factor).
  *
  * A len of 0 returns 0; otherwise the returned delay is at least 1.
  * Unless nolog is set the computation is logged.
  */
 static uint32_t
 bbr_get_pacing_delay(struct tcp_bbr *bbr, uint16_t gain, int32_t len, uint32_t cts, int nolog)
 {
 	uint64_t rate, scaled_len, quot;
 	uint32_t delay, srtt, capped = 0;
 	uint32_t hdr_oh, segs, seg_size;
 
 	if (len == 0)
 		return (0);
 
 	seg_size = bbr->rc_tp->t_maxseg - bbr->rc_last_options;
 	segs = (len + seg_size - 1) / seg_size;
 	if (bbr->rc_use_google == 0) {
 		/* Charge the configured header overhead to every segment */
 		hdr_oh = bbr_get_header_oh(bbr);
 		len += (segs * hdr_oh);
 	}
 	gain = bbr_gain_adjust(bbr, gain);
 	rate = bbr_get_bw(bbr);
 	if (bbr->rc_use_google) {
 		uint64_t discounted;
 
 		/*
 		 * Reduce the b/w by the google discount factor
 		 * (expressed in tenths of a percent, 10 = 1%).
 		 * The discount is skipped if it would leave 0.
 		 */
 		discounted = rate * (uint64_t)(1000 - bbr->r_ctl.bbr_google_discount);
 		discounted /= (uint64_t)1000;
 		if (discounted > 0)
 			rate = discounted;
 	}
 	/* delay = len * usecs-per-second * BBR_UNIT / (gain * bw) */
 	scaled_len = ((uint64_t)len *
 	    (uint64_t)USECS_IN_SECOND *
 	    (uint64_t)BBR_UNIT);
 	quot = scaled_len / ((uint64_t)gain * rate);
 	delay = (uint32_t)((quot == 0) ? 1 : quot);
 	srtt = bbr_get_rtt(bbr, BBR_SRTT);
 	if (bbr_hptsi_max_mul && bbr_hptsi_max_div &&
 	    (bbr->rc_use_google == 0)) {
 		/*
 		 * Outside of google mode the delay is capped at
 		 * srtt * bbr_hptsi_max_mul / bbr_hptsi_max_div,
 		 * otherwise we cannot pace out or send properly.
 		 */
 		if (delay > ((srtt * bbr_hptsi_max_mul) / bbr_hptsi_max_div)) {
 			capped = delay = (srtt * bbr_hptsi_max_mul) / bbr_hptsi_max_div;
 			BBR_STAT_INC(bbr_hpts_min_time);
 		}
 	}
 	if (!nolog)
 		bbr_log_pacing_delay_calc(bbr, gain, len, cts, delay, rate, capped, 1);
 	return (delay);
 }
 
 /*
  * Update tp->snd_cwnd in response to an incoming ack.
  *
  * bytes_this_ack, sack_changed and prev_acked are combined into the
  * number of newly acknowledged bytes (bytes_this_ack + sack_changed -
  * prev_acked, or 0 if that would be negative).  losses is subtracted
  * from the cwnd while in recovery.  line is only passed through to
  * the cwnd-update log.
  *
  * Nothing is changed while in probe-rtt if either no probe-rtt gain
  * is configured or google mode is on.
  */
 static void
 bbr_ack_received(struct tcpcb *tp, struct tcp_bbr *bbr, struct tcphdr *th, uint32_t bytes_this_ack,
 		 uint32_t sack_changed, uint32_t prev_acked, int32_t line, uint32_t losses)
 {
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 	uint64_t bw;
 	uint32_t cwnd, target_cwnd, saved_bytes, maxseg;
 	int32_t meth;
 
 #ifdef STATS
 	if ((tp->t_flags & TF_GPUTINPROG) &&
 	    SEQ_GEQ(th->th_ack, tp->gput_ack)) {
 		/*
 		 * Stretch acks and compressed acks will cause this to
 		 * oscillate but we are doing it the same way as the main
 		 * stack so it will be comparable (though possibly not
 		 * ideal).
 		 */
 		int32_t cgput;
 		int64_t gput, time_stamp;
 
 		gput = (int64_t) (th->th_ack - tp->gput_seq) * 8;
 		time_stamp = max(1, ((bbr->r_ctl.rc_rcvtime - tp->gput_ts) / 1000));
 		cgput = gput / time_stamp;
 		stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_GPUT,
 					 cgput);
 		if (tp->t_stats_gput_prev > 0)
 			stats_voi_update_abs_s32(tp->t_stats,
 						 VOI_TCP_GPUT_ND,
 						 ((gput - tp->t_stats_gput_prev) * 100) /
 						 tp->t_stats_gput_prev);
 		tp->t_flags &= ~TF_GPUTINPROG;
 		tp->t_stats_gput_prev = cgput;
 	}
 #endif
 	if ((bbr->rc_bbr_state == BBR_STATE_PROBE_RTT) &&
 	    ((bbr->r_ctl.bbr_rttprobe_gain_val == 0) || bbr->rc_use_google)) {
 		/* We don't change anything in probe-rtt */
 		return;
 	}
 	maxseg = tp->t_maxseg - bbr->rc_last_options;
 	/* Keep the caller's value for logging; the working copy is adjusted */
 	saved_bytes = bytes_this_ack;
 	bytes_this_ack += sack_changed;
 	if (bytes_this_ack > prev_acked) {
 		bytes_this_ack -= prev_acked;
 		/*
 		 * A byte ack'd gives us a full mss
 		 * to be like linux i.e. they count packets.
 		 */
 		if ((bytes_this_ack < maxseg) && bbr->rc_use_google)
 			bytes_this_ack = maxseg;
 	} else {
 		/* Unlikely */
 		bytes_this_ack = 0;
 	}
 	cwnd = tp->snd_cwnd;
 	/* Target comes from the delivery-rate filter, or the initial cwnd if it is 0 */
 	bw = get_filter_value(&bbr->r_ctl.rc_delrate);
 	if (bw)
 		target_cwnd = bbr_get_target_cwnd(bbr,
 						  bw,
 						  (uint32_t)bbr->r_ctl.rc_bbr_cwnd_gain);
 	else
 		target_cwnd = bbr_initial_cwnd(bbr, bbr->rc_tp);
 	if (IN_RECOVERY(tp->t_flags) &&
 	    (bbr->bbr_prev_in_rec == 0)) {
 		/*
 		 * We are entering recovery and
 		 * thus packet conservation.
 		 */
 		bbr->pkt_conservation = 1;
 		bbr->r_ctl.rc_recovery_start = bbr->r_ctl.rc_rcvtime;
 		cwnd = ctf_flight_size(tp,
 				       (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes)) +
 			bytes_this_ack;
 	}
 	if (IN_RECOVERY(tp->t_flags)) {
 		uint32_t flight;
 
 		bbr->bbr_prev_in_rec = 1;
 		/* Take the losses out of the cwnd, but keep at least one segment */
 		if (cwnd > losses) {
 			cwnd -= losses;
 			if (cwnd < maxseg)
 				cwnd = maxseg;
 		} else
 			cwnd = maxseg;
 		flight = ctf_flight_size(tp,
 					 (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes));
 		bbr_log_type_cwndupd(bbr, flight, 0,
 				     losses, 10, 0, 0, line);
 		if (bbr->pkt_conservation) {
 			uint32_t time_in;
 
 			/* Time spent in recovery so far (0 if the clock looks backwards) */
 			if (TSTMP_GEQ(bbr->r_ctl.rc_rcvtime, bbr->r_ctl.rc_recovery_start))
 				time_in = bbr->r_ctl.rc_rcvtime - bbr->r_ctl.rc_recovery_start;
 			else
 				time_in = 0;
 
 			if (time_in >= bbr_get_rtt(bbr, BBR_RTT_PROP)) {
 				/* Clear packet conservation after an rttProp */
 				bbr->pkt_conservation = 0;
 			} else {
 				/*
 				 * Still conserving: the cwnd is at least
 				 * flight plus what was just acked (and at
 				 * least the min cwnd); we are done here.
 				 */
 				if ((flight + bytes_this_ack) > cwnd)
 					cwnd = flight + bytes_this_ack;
 				if (cwnd < get_min_cwnd(bbr))
 					cwnd = get_min_cwnd(bbr);
 				tp->snd_cwnd = cwnd;
 				bbr_log_type_cwndupd(bbr, saved_bytes, sack_changed,
 						     prev_acked, 1, target_cwnd, th->th_ack, line);
 				return;
 			}
 		}
 	} else
 		bbr->bbr_prev_in_rec = 0;
 	if ((bbr->rc_use_google == 0) && bbr->r_ctl.restrict_growth) {
 		/* While growth is restricted each ack counts for at most one segment */
 		bbr->r_ctl.restrict_growth--;
 		if (bytes_this_ack > maxseg)
 			bytes_this_ack = maxseg;
 	}
 	if (bbr->rc_filled_pipe) {
 		/*
 		 * Here we have exited startup and filled the pipe. We will
 		 * thus allow the cwnd to shrink to the target. We hit here
 		 * mostly.
 		 */
 		uint32_t s_cwnd;
 
 		meth = 2;
 		s_cwnd = min((cwnd + bytes_this_ack), target_cwnd);
 		/* Growth is always taken; shrinking only when permitted */
 		if (s_cwnd > cwnd)
 			cwnd = s_cwnd;
 		else if (bbr_cwnd_may_shrink || bbr->rc_use_google || bbr->rc_no_pacing)
 			cwnd = s_cwnd;
 	} else {
 		/*
 		 * Here we are still in startup, we increase cwnd by what
 		 * has been acked.
 		 */
 		if ((cwnd < target_cwnd) ||
 		    (bbr->rc_past_init_win == 0)) {
 			meth = 3;
 			cwnd += bytes_this_ack;
 		} else {
 			/*
 			 * Method 4 means we are at target so no gain in
 			 * startup and past the initial window.
 			 */
 			meth = 4;
 		}
 	}
 	tp->snd_cwnd = max(cwnd, get_min_cwnd(bbr));
 	bbr_log_type_cwndupd(bbr, saved_bytes, sack_changed, prev_acked, meth, target_cwnd, th->th_ack, line);
 }
 
 /*
  * On a partial ack, request output if what is in flight (excluding
  * sacked and lost bytes) does not exceed the cwnd.
  */
 static void
 tcp_bbr_partialack(struct tcpcb *tp)
 {
 	struct tcp_bbr *bbr = (struct tcp_bbr *)tp->t_fb_ptr;
 
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 	if (ctf_flight_size(tp,
 	    (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes)) > tp->snd_cwnd)
 		return;
 	bbr->r_wanted_output = 1;
 }
 
 /*
  * Leave recovery.
  *
  * Clears the recovery flag and the packet-conservation state, and
  * (outside of probe-rtt) restores snd_cwnd to at least the cwnd that
  * was in effect when recovery was entered.  When not in google mode
  * and bbr_do_red is set, the cwnd (or the saved cwnd, in the states
  * that slam the cwnd) is then reduced based on the observed loss rate.
  * Finally output is requested if the flight fits in the cwnd.
  */
 static void
 bbr_post_recovery(struct tcpcb *tp)
 {
 	struct tcp_bbr *bbr;
 	uint32_t  flight;
 
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 	bbr = (struct tcp_bbr *)tp->t_fb_ptr;
 	/*
 	 * Here we just exit recovery.
 	 */
 	EXIT_RECOVERY(tp->t_flags);
 	/* Lock in our b/w reduction for the specified number of pkt-epochs */
 	bbr->r_recovery_bw = 0;
 	tp->snd_recover = tp->snd_una;
 	tcp_bbr_tso_size_check(bbr, bbr->r_ctl.rc_rcvtime);
 	bbr->pkt_conservation = 0;
 	if (bbr->rc_use_google == 0) {
 		/*
 		 * For non-google mode lets
 		 * go ahead and make sure we clear
 		 * the recovery state so if we
 		 * bounce back in to recovery we
 		 * will do PC.
 		 */
 		bbr->bbr_prev_in_rec = 0;
 	}
 	bbr_log_type_exit_rec(bbr);
 	if (bbr->rc_bbr_state != BBR_STATE_PROBE_RTT) {
 		/* Never leave recovery with less cwnd than we entered with */
 		tp->snd_cwnd = max(tp->snd_cwnd, bbr->r_ctl.rc_cwnd_on_ent);
 		bbr_log_type_cwndupd(bbr, 0, 0, 0, 15, 0, 0, __LINE__);
 	} else {
 		/* For probe-rtt case lets fix up its saved_cwnd */
 		if (bbr->r_ctl.rc_saved_cwnd < bbr->r_ctl.rc_cwnd_on_ent) {
 			bbr->r_ctl.rc_saved_cwnd = bbr->r_ctl.rc_cwnd_on_ent;
 			bbr_log_type_cwndupd(bbr, 0, 0, 0, 16, 0, 0, __LINE__);
 		}
 	}
 	flight = ctf_flight_size(tp,
 		     (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes));
 	if ((bbr->rc_use_google == 0) &&
 	    bbr_do_red) {
 		uint64_t val, lr2use;
 		uint32_t maxseg, newcwnd, acks_inflight, ratio, cwnd;
 		uint32_t *cwnd_p;
 
 		/* ratio = rttProp / srtt in thousandths (1000 when no srtt) */
 		if (bbr_get_rtt(bbr, BBR_SRTT)) {
 			val = ((uint64_t)bbr_get_rtt(bbr, BBR_RTT_PROP) * (uint64_t)1000);
 			val /= bbr_get_rtt(bbr, BBR_SRTT);
 			ratio = (uint32_t)val;
 		} else
 			ratio = 1000;
 
 		bbr_log_type_cwndupd(bbr, bbr_red_mul, bbr_red_div,
 				     bbr->r_ctl.recovery_lr, 21,
 				     ratio,
 				     bbr->r_ctl.rc_red_cwnd_pe,
 				     __LINE__);
 		/* No reduction when the ratio is below the bbr_do_red threshold */
 		if ((ratio < bbr_do_red) || (bbr_do_red == 0))
 			goto done;
 		if (((bbr->rc_bbr_state == BBR_STATE_PROBE_RTT) &&
 		     bbr_prtt_slam_cwnd) ||
 		    (bbr_sub_drain_slam_cwnd &&
 		     (bbr->rc_bbr_state == BBR_STATE_PROBE_BW) &&
 		     bbr->rc_hit_state_1 &&
 		     (bbr_state_val(bbr) == BBR_SUB_DRAIN)) ||
 		    ((bbr->rc_bbr_state == BBR_STATE_DRAIN) &&
 		     bbr_slam_cwnd_in_main_drain)) {
 			/*
 			 * Here we must poke at the saved cwnd
 			 * as well as the cwnd.
 			 */
 			cwnd = bbr->r_ctl.rc_saved_cwnd;
 			cwnd_p = &bbr->r_ctl.rc_saved_cwnd;
 		} else {
  			cwnd = tp->snd_cwnd;
 			cwnd_p = &tp->snd_cwnd;
 		}
 		maxseg = tp->t_maxseg - bbr->rc_last_options;
 		/*
 		 * Add the overall lr with the recovery lr.  The overall
 		 * loss rate is lost / delivered in thousandths.
 		 */
 		if (bbr->r_ctl.rc_lost == 0)
 			lr2use = 0;
 		else if (bbr->r_ctl.rc_delivered == 0)
 			lr2use = 1000;
 		else {
 			lr2use = bbr->r_ctl.rc_lost * 1000;
 			lr2use /= bbr->r_ctl.rc_delivered;
 		}
 		lr2use += bbr->r_ctl.recovery_lr;
 		/* One ack expected for every two segments in flight */
 		acks_inflight = (flight / (maxseg * 2));
 		if (bbr_red_scale) {
 			/* Scale the loss rate by srtt / bbr_red_scale */
 			lr2use *= bbr_get_rtt(bbr, BBR_SRTT);
 			lr2use /= bbr_red_scale;
 			if ((bbr_red_growth_restrict) &&
 			    ((bbr_get_rtt(bbr, BBR_SRTT)/bbr_red_scale) > 1))
 			    bbr->r_ctl.restrict_growth += acks_inflight;
 		}
 		if (lr2use) {
 			/* Reduce the cwnd by the loss rate, but not below one segment */
 			val = (uint64_t)cwnd * lr2use;
 			val /= 1000;
 			if (cwnd > val)
 				newcwnd = roundup((cwnd - val), maxseg);
 			else
 				newcwnd = maxseg;
 		} else {
 			/*
 			 * No loss rate: scale by bbr_red_mul / bbr_red_div.
 			 * NOTE(review): bbr_red_div is not checked for 0
 			 * here -- confirm it is validated where it is set.
 			 */
 			val = (uint64_t)cwnd * (uint64_t)bbr_red_mul;
 			val /= (uint64_t)bbr_red_div;
 			newcwnd = roundup((uint32_t)val, maxseg);
 		}
 		/* with standard delayed acks how many acks can I expect? */
 		if (bbr_drop_limit == 0) {
 			/*
 			 * Anticipate how much we will
 			 * raise the cwnd based on the acks.
 			 */
 			if ((newcwnd + (acks_inflight * maxseg)) < get_min_cwnd(bbr)) {
 				/*
 				 * We do enforce the min (with the acks).
 				 * NOTE(review): the test above counts
 				 * acks_inflight * maxseg bytes, but only
 				 * acks_inflight (a count, not bytes) is
 				 * subtracted below -- confirm whether the
 				 * "* maxseg" was intended here as well.
 				 */
 				newcwnd = (get_min_cwnd(bbr) - acks_inflight);
 			}
 		} else {
 			/*
 			 * A strict drop limit of N is in place
 			 */
 			if (newcwnd < (bbr_drop_limit * maxseg)) {
 				newcwnd = bbr_drop_limit * maxseg;
 			}
 		}
 		/* For the next N acks do we restrict the growth */
 		*cwnd_p = newcwnd;
 		/* Even when the saved cwnd was the target, snd_cwnd may not exceed it */
 		if (tp->snd_cwnd > newcwnd)
 			tp->snd_cwnd = newcwnd;
 		bbr_log_type_cwndupd(bbr, bbr_red_mul, bbr_red_div, val, 22,
 				     (uint32_t)lr2use,
 				     bbr_get_rtt(bbr, BBR_SRTT), __LINE__);
 		bbr->r_ctl.rc_red_cwnd_pe = bbr->r_ctl.rc_pkt_epoch;
 	}
 done:
 	bbr->r_ctl.recovery_lr = 0;
 	if (flight <= tp->snd_cwnd) {
 		bbr->r_wanted_output = 1;
 	}
 	tcp_bbr_tso_size_check(bbr, bbr->r_ctl.rc_rcvtime);
 }
 
 /*
  * Set up the reduced bandwidth (red_bw): start from the delivery-rate
  * filter value, cap it at the current delivery rate, but never let it
  * drop below half of the filter value.  The TSO size is re-checked
  * afterwards.
  */
 static void
 bbr_setup_red_bw(struct tcp_bbr *bbr, uint32_t cts)
 {
 	bbr->r_ctl.red_bw = get_filter_value(&bbr->r_ctl.rc_delrate);
 	/* Upper bound: the current delivery rate */
 	if (bbr->r_ctl.rc_bbr_cur_del_rate < bbr->r_ctl.red_bw) {
 		bbr->r_ctl.red_bw = bbr->r_ctl.rc_bbr_cur_del_rate;
 	}
 	/* Lower bound: limit the drop in b/w to 1/2 our current filter */
 	if ((get_filter_value(&bbr->r_ctl.rc_delrate) / 2) > bbr->r_ctl.red_bw) {
 		bbr->r_ctl.red_bw = get_filter_value(&bbr->r_ctl.rc_delrate) / 2;
 	}
 	tcp_bbr_tso_size_check(bbr, cts);
 }
 
 /*
  * Handle a congestion signal of the given type.
  *
  * CC_NDUPACK: if not already in recovery, enter it -- start a new
  * packet epoch, remember the cwnd on entry, restart timers if needed
  * and (outside of google mode) set the cwnd to the flight size plus
  * one segment, gated to the min cwnd.  rsm is dereferenced on this
  * path and so must not be NULL.
  *
  * CC_RTO_ERR: the RTO was unnecessary; restore the previous cwnd,
  * ssthresh and recovery point (unless in probe-rtt).
  *
  * Any other type is ignored.  th is not used by the code in this
  * function.
  */
 static void
 bbr_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type, struct bbr_sendmap *rsm)
 {
 	struct tcp_bbr *bbr;
 
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 #ifdef STATS
 	stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_CSIG, type);
 #endif
 	bbr = (struct tcp_bbr *)tp->t_fb_ptr;
 	switch (type) {
 	case CC_NDUPACK:
 		if (!IN_RECOVERY(tp->t_flags)) {
 			tp->snd_recover = tp->snd_max;
 			/* Start a new epoch */
 			bbr_set_pktepoch(bbr, bbr->r_ctl.rc_rcvtime, __LINE__);
 			if (bbr->rc_lt_is_sampling || bbr->rc_lt_use_bw) {
 				/*
 				 * Move forward the lt epoch
 				 * so it won't count the truncated
 				 * epoch.
 				 */
 				bbr->r_ctl.rc_lt_epoch++;
 			}
 			if (bbr->rc_bbr_state == BBR_STATE_STARTUP) {
 				/*
 				 * Just like the policer detection code
 				 * if we are in startup we must push
 				 * forward the last startup epoch
 				 * to hide the truncated PE.
 				 */
 				bbr->r_ctl.rc_bbr_last_startup_epoch++;
 			}
 			/* Saved so the cwnd can be restored when recovery ends */
 			bbr->r_ctl.rc_cwnd_on_ent = tp->snd_cwnd;
 			ENTER_RECOVERY(tp->t_flags);
 			bbr->rc_tlp_rtx_out = 0;
 			bbr->r_ctl.recovery_lr = bbr->r_ctl.rc_pkt_epoch_loss_rate;
 			tcp_bbr_tso_size_check(bbr, bbr->r_ctl.rc_rcvtime);
 			if (tcp_in_hpts(bbr->rc_inp) &&
 			    ((bbr->r_ctl.rc_hpts_flags & PACE_TMR_RACK) == 0)) {
 				/*
 				 * When we enter recovery, we need to restart
 				 * any timers. This may mean we gain an agg
 				 * early, which will be made up for at the last
 				 * rxt out.
 				 */
 				bbr->rc_timer_first = 1;
 				bbr_timer_cancel(bbr, __LINE__, bbr->r_ctl.rc_rcvtime);
 			}
 			/*
 			 * Outside of google mode (and unless we are in
 			 * probe-rtt with no probe-rtt gain configured) the
 			 * cwnd becomes the current flight size plus one
 			 * segment, gated to the min cwnd.
 			 */
 			if ((bbr->rc_use_google == 0) &&
 			    (bbr->r_ctl.bbr_rttprobe_gain_val ||
 			     (bbr->rc_bbr_state != BBR_STATE_PROBE_RTT))) {
 				tp->snd_cwnd = ctf_flight_size(tp,
 					           (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes)) +
 					(tp->t_maxseg - bbr->rc_last_options);
 				if (tp->snd_cwnd < get_min_cwnd(bbr)) {
 					/* We always gate to min cwnd */
 					tp->snd_cwnd = get_min_cwnd(bbr);
 				}
 				bbr_log_type_cwndupd(bbr, 0, 0, 0, 14, 0, 0, __LINE__);
 			}
 			/* rsm is dereferenced here: callers must pass a valid rsm */
 			bbr_log_type_enter_rec(bbr, rsm->r_start);
 		}
 		break;
 	case CC_RTO_ERR:
 		KMOD_TCPSTAT_INC(tcps_sndrexmitbad);
 		/* RTO was unnecessary, so reset everything. */
 		bbr_reset_lt_bw_sampling(bbr, bbr->r_ctl.rc_rcvtime);
 		if (bbr->rc_bbr_state != BBR_STATE_PROBE_RTT) {
 			tp->snd_cwnd = tp->snd_cwnd_prev;
 			tp->snd_ssthresh = tp->snd_ssthresh_prev;
 			tp->snd_recover = tp->snd_recover_prev;
 			/* The restored cwnd is never below the cwnd on recovery entry */
 			tp->snd_cwnd = max(tp->snd_cwnd, bbr->r_ctl.rc_cwnd_on_ent);
 			bbr_log_type_cwndupd(bbr, 0, 0, 0, 13, 0, 0, __LINE__);
 		}
 		tp->t_badrxtwin = 0;
 		break;
 	}
 }
 
 /*
  * Indicate whether this ack should be delayed.  We can delay the ack if
  * all of the following conditions are met:
  *	- Our last ack wasn't a 0-sized window. We never want to delay
  *	  the ack that opens up a 0-sized window.
  *	- There is no delayed ack already pending (TF_DELACK is clear).
  *	- The segments received so far plus nsegs stay below the
  *	  connection's delayed-ack threshold (tp->t_delayed_ack).
  *	- Delayed acks are enabled or this is a half-synchronized T/TCP
  *	  connection.
  *
  * Every macro argument is parenthesized in the expansion so that
  * arbitrary expressions (e.g. "&obj" or "a + b") can be passed safely.
  * Note that tp is evaluated more than once.
  */
 #define DELAY_ACK(tp, bbr, nsegs)					\
 	((((tp)->t_flags & TF_RXWIN0SENT) == 0) &&			\
 	 (((tp)->t_flags & TF_DELACK) == 0) &&				\
 	 (((bbr)->bbr_segs_rcvd + (nsegs)) < (tp)->t_delayed_ack) &&	\
 	 ((tp)->t_delayed_ack || ((tp)->t_flags & TF_NEEDSYN)))
 
 /*
  * Return the lowest RSM in the map of
  * packets still in flight that is not acked.
  * This should normally find on the first one
  * since we remove packets from the send
  * map after they are marked ACKED.
  *
  * Returns NULL if every entry is acked or the list is empty.
  */
 static struct bbr_sendmap *
 bbr_find_lowest_rsm(struct tcp_bbr *bbr)
 {
 	struct bbr_sendmap *cur;
 
 	/*
 	 * The tmap is kept in transmit-time order, so the first entry
 	 * without BBR_ACKED is the one sent the longest time ago that
 	 * is still outstanding.
 	 */
 	TAILQ_FOREACH(cur, &bbr->r_ctl.rc_tmap, r_tnext) {
 		if ((cur->r_flags & BBR_ACKED) == 0)
 			return (cur);
 	}
 	return (NULL);
 }
 
 /*
  * Starting at rsm and walking the sequence-ordered map backwards,
  * return the first entry that is neither acked nor carrying a FIN,
  * or NULL if there is none.
  */
 static struct bbr_sendmap *
 bbr_find_high_nonack(struct tcp_bbr *bbr, struct bbr_sendmap *rsm)
 {
 	struct bbr_sendmap *cand = rsm;
 
 	/*
 	 * In theory the caller's rsm would be the last segment; we get
 	 * here when that turned out not to be the answer.
 	 */
 	TAILQ_FOREACH_REVERSE_FROM(cand, &bbr->r_ctl.rc_map, bbr_head, r_next) {
 		if ((cand->r_flags & (BBR_ACKED | BBR_HAS_FIN)) == 0)
 			return (cand);
 	}
 	return (NULL);
 }
 
 /*
  * Returns to the caller the number of microseconds that
  * the packet can be outstanding before we think we
  * should have had an ack returned.
  */
 static uint32_t
 bbr_calc_thresh_rack(struct tcp_bbr *bbr, uint32_t srtt, uint32_t cts, struct bbr_sendmap *rsm)
 {
 	/*
 	 * lro is non-zero when the connection is considered subject
 	 * to reordering.  It is derived from rc_reorder_ts, the time
 	 * reordering was last seen (0 means never):
 	 *
 	 * If reorder-fade is configured, reordering is only honored
 	 * while less than that much time has passed since it was last
 	 * seen; once it has faded rc_reorder_ts is cleared as well.
 	 *
 	 * If reorder-fade is 0, then once reordering has been seen the
 	 * connection is always considered subject to it.
 	 *
 	 * In the end, if lro is non-zero we add extra time for
 	 * reordering to the threshold.
 	 */
 	int32_t lro = 0;
 	uint32_t thresh, t_rxtcur;
 
 	if (srtt == 0)
 		srtt = 1;
 	if (bbr->r_ctl.rc_reorder_ts) {
 		if (bbr->r_ctl.rc_reorder_fade == 0) {
 			/* Reordering does not fade */
 			lro = 1;
 		} else {
 			/* A "negative" elapsed time leaves lro at 0 */
 			if (SEQ_GEQ(cts, bbr->r_ctl.rc_reorder_ts)) {
 				lro = cts - bbr->r_ctl.rc_reorder_ts;
 				/*
 				 * No time has passed since the last
 				 * reorder, still mark it as reordering.
 				 */
 				if (lro == 0)
 					lro = 1;
 			}
 			if (lro > bbr->r_ctl.rc_reorder_fade) {
 				/* Faded: forget that reordering was seen */
 				bbr->r_ctl.rc_reorder_ts = 0;
 				lro = 0;
 			}
 		}
 	}
 	thresh = srtt + bbr->r_ctl.rc_pkt_delay;
 	if (lro == 0) {
 		thresh += 1000;
 	} else if (bbr->r_ctl.rc_reorder_shift) {
 		thresh += (srtt >> bbr->r_ctl.rc_reorder_shift);
 	} else {
 		/* No shift configured, you get 1/4 rtt */
 		thresh += (srtt >> 2);
 	}
 	/* We don't let the rack timeout be above a RTO */
 	if ((bbr->rc_tp)->t_srtt == 0)
 		t_rxtcur = BBR_INITIAL_RTO;
 	else
 		t_rxtcur = TICKS_2_USEC(bbr->rc_tp->t_rxtcur);
 	if (thresh > t_rxtcur)
 		thresh = t_rxtcur;
 	/* And we don't want it above the RTO max either */
 	if (thresh > (((uint32_t)bbr->rc_max_rto_sec) * USECS_IN_SECOND))
 		thresh = (((uint32_t)bbr->rc_max_rto_sec) * USECS_IN_SECOND);
 	bbr_log_thresh_choice(bbr, cts, thresh, lro, srtt, rsm, BBR_TO_FRM_RACK);
 	return (thresh);
 }
 
 /*
  * Return to the caller the amount of time in microseconds
  * that should be used for the TLP timer from the last
  * send time of this packet.
  *
  * The result is capped by the current RTO and by the configured
  * RTO maximum, and is never below bbr_tlp_min.
  */
 static uint32_t
 bbr_calc_thresh_tlp(struct tcpcb *tp, struct tcp_bbr *bbr,
     struct bbr_sendmap *rsm, uint32_t srtt,
     uint32_t cts)
 {
 	uint32_t thresh, len, maxseg, t_rxtcur;
 	struct bbr_sendmap *prsm;
 
 	if (srtt == 0)
 		srtt = 1;
 	/* Base: srtt plus a configured fraction of it, else 2 * srtt */
 	if (bbr->rc_tlp_threshold)
 		thresh = srtt + (srtt / bbr->rc_tlp_threshold);
 	else
 		thresh = (srtt * 2);
 	maxseg = tp->t_maxseg - bbr->rc_last_options;
 	len = rsm->r_end - rsm->r_start;
 
 	/* 2.1 behavior */
 	/* Get the previous sent packet, if any  */
 	prsm = TAILQ_PREV(rsm, bbr_head, r_tnext);
 	if (prsm && (len <= maxseg)) {
 		/*
 		 * Two packets outstanding, thresh should be (2*srtt) +
 		 * possible inter-packet delay (if any).
 		 */
 		uint32_t inter_gap = 0;
 		int idx, nidx;
 
 		/*
 		 * idx is the last-send slot of rsm and nidx that of prsm;
 		 * each map must only be indexed with its own slot.
 		 */
 		idx = rsm->r_rtr_cnt - 1;
 		nidx = prsm->r_rtr_cnt - 1;
 		if (TSTMP_GEQ(rsm->r_tim_lastsent[idx], prsm->r_tim_lastsent[nidx])) {
 			/* Yes it was sent later (or at the same time) */
 			inter_gap = rsm->r_tim_lastsent[idx] - prsm->r_tim_lastsent[nidx];
 		}
 		thresh += inter_gap;
 	} else if (len <= maxseg) {
 		/*
 		 * Possibly compensate for delayed-ack.
 		 */
 		uint32_t alt_thresh;
 
 		alt_thresh = srtt + (srtt / 2) + bbr_delayed_ack_time;
 		if (alt_thresh > thresh)
 			thresh = alt_thresh;
 	}
 	/* Not above the current  RTO */
 	if (tp->t_srtt == 0)
 		t_rxtcur = BBR_INITIAL_RTO;
 	else
 		t_rxtcur = TICKS_2_USEC(tp->t_rxtcur);
 
 	/* The threshold is logged before any of the caps below are applied */
 	bbr_log_thresh_choice(bbr, cts, thresh, t_rxtcur, srtt, rsm, BBR_TO_FRM_TLP);
 	/* Not above an RTO */
 	if (thresh > t_rxtcur) {
 		thresh = t_rxtcur;
 	}
 	/* Not above a RTO max */
 	if (thresh > (((uint32_t)bbr->rc_max_rto_sec) * USECS_IN_SECOND)) {
 		thresh = (((uint32_t)bbr->rc_max_rto_sec) * USECS_IN_SECOND);
 	}
 	/* And now apply the user TLP min */
 	if (thresh < bbr_tlp_min) {
 		thresh = bbr_tlp_min;
 	}
 	return (thresh);
 }
 
 /*
  * Return the requested flavor of RTT (in microseconds).
  *
  * rtt_type selects between the filtered rttProp (BBR_RTT_PROP), the
  * last packet-epoch rtt (BBR_RTT_PKTRTT), the last measured rtt used
  * by rack (BBR_RTT_RACK) and the smoothed rtt (BBR_SRTT).
  */
 static __inline uint32_t
 bbr_get_rtt(struct tcp_bbr *bbr, int32_t rtt_type)
 {
 	uint32_t f_rtt;
 	uint32_t rack_rtt;
 
 	f_rtt = get_filter_value_small(&bbr->r_ctl.rc_rttprop);
 	if (f_rtt == 0xffffffff) {
 		/*
 		 * The filter holds no rtt at all: fall back to the
 		 * smoothed rtt, or the initial RTO if that is unset too.
 		 */
 		if (bbr->rc_tp->t_srtt == 0)
 			f_rtt = BBR_INITIAL_RTO;
 		else
 			f_rtt = (TICKS_2_USEC(bbr->rc_tp->t_srtt) >> TCP_RTT_SHIFT);
 		/*
 		 * Since we don't know how good the rtt is apply a
 		 * delayed-ack min
 		 */
 		if (f_rtt < bbr_delayed_ack_time)
 			f_rtt = bbr_delayed_ack_time;
 	}
 
 	if (rtt_type == BBR_RTT_PROP)
 		return (f_rtt);
 
 	if (rtt_type == BBR_RTT_PKTRTT) {
 		/* Use the filter value until there is a pkt rtt */
 		if (bbr->r_ctl.rc_pkt_epoch_rtt)
 			return (bbr->r_ctl.rc_pkt_epoch_rtt);
 		return (f_rtt);
 	}
 
 	if (rtt_type == BBR_RTT_RACK) {
 		rack_rtt = bbr->r_ctl.rc_last_rtt;
 		/* We need to add in any internal delay for our timer */
 		if (bbr->rc_ack_was_delayed)
 			rack_rtt += bbr->r_ctl.rc_ack_hdwr_delay;
 		return (rack_rtt);
 	}
 
 	if (rtt_type == BBR_SRTT)
 		return (TICKS_2_USEC(bbr->rc_tp->t_srtt) >> TCP_RTT_SHIFT);
 
 	/* TSNH: unknown type, hand back the filter value */
 #ifdef BBR_INVARIANTS
 	panic("Unknown rtt request type %d", rtt_type);
 #endif
 	return (f_rtt);
 }
 
 /*
  * Return 1 if rsm has been outstanding, since its last transmission,
  * for at least the rack threshold; otherwise return 0.
  */
 static int
 bbr_is_lost(struct tcp_bbr *bbr, struct bbr_sendmap *rsm, uint32_t cts)
 {
 	uint32_t rack_rtt, limit;
 
 	rack_rtt = bbr_get_rtt(bbr, BBR_RTT_RACK);
 	limit = bbr_calc_thresh_rack(bbr, rack_rtt, cts, rsm);
 	/* Time since the most recent (re)transmission of this map */
 	return (((cts - rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]) >= limit) ? 1 : 0);
 }
 
 /*
  * Return a sendmap if we need to retransmit something.
  *
  * The oldest outstanding transmission is examined: if it is past the
  * time by which an ack should have arrived, and it has seen enough
  * dup-acks or has been passed by a sack, it is marked lost, a
  * CC_NDUPACK congestion signal is raised and the map is returned.
  * Otherwise NULL is returned.
  */
 static struct bbr_sendmap *
 bbr_check_recovery_mode(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts)
 {
 	struct bbr_sendmap *oldest;
 	int32_t last;
 
 	/* Nothing outstanding that we know of */
 	if (TAILQ_EMPTY(&bbr->r_ctl.rc_map))
 		return (NULL);
 	/*
 	 * Nothing in the transmit map, or fin restricted: we don't
 	 * find anything once a fin is sent.
 	 */
 	oldest = TAILQ_FIRST(&bbr->r_ctl.rc_tmap);
 	if ((oldest == NULL) || (tp->t_flags & TF_SENTFIN))
 		return (NULL);
 	if (oldest->r_flags & BBR_ACKED) {
 		/*
 		 * The first one is acked (this really should not happen
 		 * since maps are removed from the tmap once acked), so
 		 * go look for the first one that is not.
 		 */
 		oldest = bbr_find_lowest_rsm(bbr);
 		if (oldest == NULL)
 			return (NULL);
 	}
 	last = oldest->r_rtr_cnt - 1;
 	/* Send timestamp is the same or less? can't be ready */
 	if (SEQ_LEQ(cts, oldest->r_tim_lastsent[last]))
 		return (NULL);
 	/* Not yet past the rack threshold */
 	if (!bbr_is_lost(bbr, oldest, cts))
 		return (NULL);
 	/* Past the threshold, but no dup-ack or sack evidence */
 	if ((oldest->r_dupack < DUP_ACK_THRESHOLD) &&
 	    ((oldest->r_flags & BBR_SACK_PASSED) == 0))
 		return (NULL);
 
 	/* Account for the loss only once per map */
 	if ((oldest->r_flags & BBR_MARKED_LOST) == 0) {
 		oldest->r_flags |= BBR_MARKED_LOST;
 		bbr->r_ctl.rc_lost += oldest->r_end - oldest->r_start;
 		bbr->r_ctl.rc_lost_bytes += oldest->r_end - oldest->r_start;
 	}
 	bbr_cong_signal(tp, NULL, CC_NDUPACK, oldest);
 #ifdef BBR_INVARIANTS
 	if ((oldest->r_end - oldest->r_start) == 0)
 		panic("tp:%p bbr:%p rsm:%p length is 0?", tp, bbr, oldest);
 #endif
 	return (oldest);
 }
 
 /*
  * RACK Timer, here we simply do logging and house keeping.
  * the normal bbr_output_wtime() function will call the
  * appropriate thing to check if we need to do a RACK retransmit.
  * We return 1, saying don't proceed with bbr_output_wtime only
  * when all timers have been stopped (destroyed PCB?).
  */
 static int
 bbr_timeout_rack(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts)
 {
 	uint32_t lost_before;
 	int new_loss;
 
 	if (bbr->rc_all_timers_stopped)
 		return (1);
 	/* Its not time yet */
 	if (TSTMP_LT(cts, bbr->r_ctl.rc_timer_exp))
 		return (0);
 
 	BBR_STAT_INC(bbr_to_tot);
 	/* Snapshot so we can tell below whether this timeout found a loss */
 	lost_before = bbr->r_ctl.rc_lost;
 	if ((bbr->r_state != 0) && (bbr->r_state != tp->t_state))
 		bbr_set_state(tp, bbr, 0);
 	bbr_log_to_event(bbr, cts, BBR_TO_FRM_RACK);
 	/*
 	 * This timer is just an internal trigger to send out data.
 	 * Unless a resend is already pending, check here whether a
 	 * retransmission is needed (which enters recovery if so).
 	 */
 	if (bbr->r_ctl.rc_resend == NULL)
 		bbr->r_ctl.rc_resend = bbr_check_recovery_mode(tp, bbr, cts);
 	if (bbr_policer_call_from_rack_to) {
 		new_loss = (bbr->r_ctl.rc_lost > lost_before);
 		bbr_lt_bw_sampling(bbr, cts, new_loss);
 	}
 	bbr->r_ctl.rc_hpts_flags &= ~PACE_TMR_RACK;
 	return (0);
 }
 
 /*
  * Split rsm at sequence number start: nrsm is filled in to describe
  * [start, rsm->r_end) with rsm's send history copied over, and rsm is
  * shortened to end at start.  Afterwards both pieces are checked
  * against the small-map size and counted if they qualify.
  *
  * nrsm is only filled in here; this function does not insert it into
  * any list.
  */
 static __inline void
 bbr_clone_rsm(struct tcp_bbr *bbr, struct bbr_sendmap *nrsm, struct bbr_sendmap *rsm, uint32_t start)
 {
 	int idx;
 
 	nrsm->r_start = start;
 	nrsm->r_end = rsm->r_end;
 	nrsm->r_rtr_cnt = rsm->r_rtr_cnt;
 	nrsm-> r_rtt_not_allowed = rsm->r_rtt_not_allowed;
 	nrsm->r_flags = rsm->r_flags;
 	/* We don't transfer forward the SYN flag */
 	nrsm->r_flags &= ~BBR_HAS_SYN;
 	/*
 	 * The FIN flag (if any) was copied forward to nrsm above, so
 	 * clear it on rsm; not that this should happen.
 	 */
 	rsm->r_flags &= ~BBR_HAS_FIN;
 	nrsm->r_dupack = rsm->r_dupack;
 	/* Retransmitted bytes are not carried over to the new piece */
 	nrsm->r_rtr_bytes = 0;
 	nrsm->r_is_gain = rsm->r_is_gain;
 	nrsm->r_is_drain = rsm->r_is_drain;
 	nrsm->r_delivered = rsm->r_delivered;
 	nrsm->r_ts_valid = rsm->r_ts_valid;
 	nrsm->r_del_ack_ts = rsm->r_del_ack_ts;
 	nrsm->r_del_time = rsm->r_del_time;
 	nrsm->r_app_limited = rsm->r_app_limited;
 	nrsm->r_first_sent_time = rsm->r_first_sent_time;
 	nrsm->r_flight_at_send = rsm->r_flight_at_send;
 	/* We split a piece the lower section looses any just_ret flag. */
 	nrsm->r_bbr_state = rsm->r_bbr_state;
 	/* Copy the send time of every (re)transmission */
 	for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) {
 		nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx];
 	}
 	/* rsm now ends where nrsm begins */
 	rsm->r_end = nrsm->r_start;
 	/* idx is reused: a map is "small" at 1/8 of the effective mss or less */
 	idx = min((bbr->rc_tp->t_maxseg - bbr->rc_last_options), bbr->r_ctl.rc_pace_max_segs);
 	idx /= 8;
 	/* Check if we got too small */
 	if ((rsm->r_is_smallmap == 0) &&
 	    ((rsm->r_end - rsm->r_start) <= idx)) {
 		bbr->r_ctl.rc_num_small_maps_alloced++;
 		rsm->r_is_smallmap = 1;
 	}
 	/*
 	 * Check the new one as well.  Unlike for rsm, the previous value
 	 * of nrsm->r_is_smallmap is not consulted here.
 	 */
 	if ((nrsm->r_end - nrsm->r_start) <= idx) {
 		bbr->r_ctl.rc_num_small_maps_alloced++;
 		nrsm->r_is_smallmap = 1;
 	}
 }
 
 /*
  * Given a sack block [start, end) and a current position at, return 1
  * if the map entry on either side of at is already ACKED and either
  * touches or overlaps the sack block, i.e. the block could be merged
  * into that neighbor.  Return 0 otherwise.
  */
 static int
 bbr_sack_mergable(struct bbr_sendmap *at,
 		  uint32_t start, uint32_t end)
 {
 	struct bbr_sendmap *nbr;
 
 	/*
 	 * Left neighbor:
 	 *
 	 * map blk   |------|
 	 * sack blk         |------|
 	 * <or>
 	 * map blk   |------|
 	 * sack blk      |------|
 	 */
 	nbr = TAILQ_PREV(at, bbr_head, r_next);
 	if ((nbr != NULL) && (nbr->r_flags & BBR_ACKED)) {
 		if (nbr->r_end == start)
 			return (1);
 		if (SEQ_LT(start, nbr->r_end) && SEQ_GT(end, nbr->r_end))
 			return (1);
 	}
 	/*
 	 * Right neighbor:
 	 *
 	 * map blk          |---------|
 	 * sack blk    |----|
 	 * <or>
 	 * map blk          |---------|
 	 * sack blk    |-------|
 	 */
 	nbr = TAILQ_NEXT(at, r_next);
 	if ((nbr != NULL) && (nbr->r_flags & BBR_ACKED)) {
 		if (nbr->r_start == end)
 			return (1);
 		if (SEQ_LT(start, nbr->r_start) && SEQ_GT(end, nbr->r_start))
 			return (1);
 	}
 	return (0);
 }
 
 /*
  * Merge two ack'd RSM's.  l_rsm is on the left (lower seq values)
  * and r_rsm is on the right (higher seq values).  The right one is
  * folded into the left one, removed from the map(s) and freed; the
  * left one is returned.
  */
 static struct bbr_sendmap *
 bbr_merge_rsm(struct tcp_bbr *bbr,
 	      struct bbr_sendmap *l_rsm,
 	      struct bbr_sendmap *r_rsm)
 {
 	/*
 	 * The simplest way to merge is to move the right one into
 	 * the left; no attempt is made to find the oldest (or last
 	 * oldest retransmitted) of the two.
 	 */
 	l_rsm->r_end = r_rsm->r_end;
 	/* Keep the larger dup-ack count and the sum of the rtr bytes */
 	if (r_rsm->r_dupack > l_rsm->r_dupack)
 		l_rsm->r_dupack = r_rsm->r_dupack;
 	l_rsm->r_rtr_bytes += r_rsm->r_rtr_bytes;
 	if (r_rsm->r_app_limited)
 		l_rsm->r_app_limited = r_rsm->r_app_limited;
 
 	/* Flags that survive the merge */
 	if (r_rsm->r_flags & BBR_HAS_FIN)
 		l_rsm->r_flags |= BBR_HAS_FIN;
 	if (r_rsm->r_flags & BBR_TLP)
 		l_rsm->r_flags |= BBR_TLP;
 	if (r_rsm->r_flags & BBR_RWND_COLLAPSED)
 		l_rsm->r_flags |= BBR_RWND_COLLAPSED;
 
 	/* Neither of the next two cases should really happen */
 	if (r_rsm->r_in_tmap)
 		TAILQ_REMOVE(&bbr->r_ctl.rc_tmap, r_rsm, r_tnext);
 	if (r_rsm->r_flags & BBR_MARKED_LOST)
 		bbr->r_ctl.rc_lost_bytes -= r_rsm->r_end - r_rsm->r_start;
 
 	TAILQ_REMOVE(&bbr->r_ctl.rc_map, r_rsm, r_next);
 	if ((l_rsm->r_limit_type != 0) && (r_rsm->r_limit_type == 0)) {
 		/* Transfer the split limit to the map we free */
 		r_rsm->r_limit_type = l_rsm->r_limit_type;
 		l_rsm->r_limit_type = 0;
 	}
 	bbr_free(bbr, r_rsm);
 	return (l_rsm);
 }
 
 /*
  * TLP Timer, here we simply setup what segment we want to
  * have the TLP expire on, the normal bbr_output_wtime() will then
  * send it out.
  *
  * Return values:
  *   1           - all timers have been stopped; the caller must not
  *                 proceed with bbr_output_wtime().
  *   -ETIMEDOUT  - the progress timeout check failed (tcp_drop()).
  *   0           - every other case (not yet time, in persist, probe
  *                 armed, or probe abandoned in favour of the RTO).
  */
 static int
 bbr_timeout_tlp(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts)
 {
 	/*
 	 * Tail Loss Probe.
 	 */
 	struct bbr_sendmap *rsm = NULL;
 	struct socket *so;
 	uint32_t amm;
 	uint32_t out, avail;
 	uint32_t maxseg;
 	int collapsed_win = 0;
 
 	if (bbr->rc_all_timers_stopped) {
 		return (1);
 	}
 	if (TSTMP_LT(cts, bbr->r_ctl.rc_timer_exp)) {
 		/* Its not time yet */
 		return (0);
 	}
 	if (ctf_progress_timeout_check(tp, true)) {
 		/* NOTE(review): passes `tick`, not `ticks` -- confirm intended. */
 		bbr_log_progress_event(bbr, tp, tick, PROGRESS_DROP, __LINE__);
 		return (-ETIMEDOUT);	/* tcp_drop() */
 	}
 	/* Did we somehow get into persists? */
 	if (bbr->rc_in_persist) {
 		return (0);
 	}
 	if (bbr->r_state && (bbr->r_state != tp->t_state))
 		bbr_set_state(tp, bbr, 0);
 	BBR_STAT_INC(bbr_tlp_tot);
 	maxseg = tp->t_maxseg - bbr->rc_last_options;
 	/*
 	 * A TLP timer has expired. We have been idle for 2 rtts. So we now
 	 * need to figure out how to force a full MSS segment out.
 	 */
 	so = tp->t_inpcb->inp_socket;
 	avail = sbavail(&so->so_snd);
 	out = ctf_outstanding(tp);
 	if (out > tp->snd_wnd) {
 		/* special case, we need a retransmission */
 		collapsed_win = 1;
 		goto need_retran;
 	}
 	if (avail > out) {
 		/* New data is available */
 		amm = avail - out;
 		if (amm > maxseg) {
 			amm = maxseg;
 		} else if ((amm < maxseg) && ((tp->t_flags & TF_NODELAY) == 0)) {
 			/* not enough to fill a MTU and no-delay is off */
 			goto need_retran;
 		}
 		/* Set the send-new override */
 		if ((out + amm) <= tp->snd_wnd) {
 			bbr->rc_tlp_new_data = 1;
 		} else {
 			goto need_retran;
 		}
 		bbr->r_ctl.rc_tlp_seg_send_cnt = 0;
 		bbr->r_ctl.rc_last_tlp_seq = tp->snd_max;
 		bbr->r_ctl.rc_tlp_send = NULL;
 		/* cap any slots */
 		BBR_STAT_INC(bbr_tlp_newdata);
 		goto send;
 	}
 need_retran:
 	/*
 	 * Ok we need to arrange the last un-acked segment to be re-sent, or
 	 * optionally the first un-acked segment.
 	 */
 	if (collapsed_win == 0) {
 		rsm = TAILQ_LAST_FAST(&bbr->r_ctl.rc_map, bbr_sendmap, r_next);
 		/*
 		 * Only go hunting for an earlier entry when the last one
 		 * is already acked or carries the FIN.  The test must look
 		 * at the entry's flags; testing the bare flag constants is
 		 * always true.
 		 */
 		if (rsm && (rsm->r_flags & (BBR_ACKED | BBR_HAS_FIN))) {
 			rsm = bbr_find_high_nonack(bbr, rsm);
 		}
 		if (rsm == NULL) {
 			goto restore;
 		}
 	} else {
 		/*
 		 * We must find the last segment
 		 * that was acceptable by the client.
 		 */
 		TAILQ_FOREACH_REVERSE(rsm, &bbr->r_ctl.rc_map, bbr_head, r_next) {
 			if ((rsm->r_flags & BBR_RWND_COLLAPSED) == 0) {
 				/* Found one */
 				break;
 			}
 		}
 		if (rsm == NULL) {
 			/* None? if so send the first */
 			rsm = TAILQ_FIRST(&bbr->r_ctl.rc_map);
 			if (rsm == NULL)
 				goto restore;
 		}
 	}
 	if ((rsm->r_end - rsm->r_start) > maxseg) {
 		/*
 		 * We need to split the last segment in two so that only
 		 * the trailing maxseg bytes are probed.
 		 */
 		struct bbr_sendmap *nrsm;
 
 		nrsm = bbr_alloc_full_limit(bbr);
 		if (nrsm == NULL) {
 			/*
 			 * We can't get memory to split, we can either just
 			 * not split it. Or retransmit the whole piece, lets
 			 * do the large send (BTLP :-) ).
 			 */
 			goto go_for_it;
 		}
 		bbr_clone_rsm(bbr, nrsm, rsm, (rsm->r_end - maxseg));
 		TAILQ_INSERT_AFTER(&bbr->r_ctl.rc_map, rsm, nrsm, r_next);
 		if (rsm->r_in_tmap) {
 			TAILQ_INSERT_AFTER(&bbr->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
 			nrsm->r_in_tmap = 1;
 		}
 		/* The front piece no longer ends the stream. */
 		rsm->r_flags &= (~BBR_HAS_FIN);
 		rsm = nrsm;
 	}
 go_for_it:
 	bbr->r_ctl.rc_tlp_send = rsm;
 	bbr->rc_tlp_rtx_out = 1;
 	if (rsm->r_start == bbr->r_ctl.rc_last_tlp_seq) {
 		/* Probing the same sequence again: count it. */
 		bbr->r_ctl.rc_tlp_seg_send_cnt++;
 		tp->t_rxtshift++;
 	} else {
 		bbr->r_ctl.rc_last_tlp_seq = rsm->r_start;
 		bbr->r_ctl.rc_tlp_seg_send_cnt = 1;
 	}
 send:
 	/* rsm is NULL here when we arrived via the new-data path. */
 	if (bbr->r_ctl.rc_tlp_seg_send_cnt > bbr_tlp_max_resend) {
 		/*
 		 * Can't [re]/transmit a segment we have retransmitted the
 		 * max times. We need the retransmit timer to take over.
 		 */
 restore:
 		bbr->rc_tlp_new_data = 0;
 		bbr->r_ctl.rc_tlp_send = NULL;
 		if (rsm)
 			rsm->r_flags &= ~BBR_TLP;
 		BBR_STAT_INC(bbr_tlp_retran_fail);
 		return (0);
 	} else if (rsm) {
 		rsm->r_flags |= BBR_TLP;
 	}
 	if (rsm && (rsm->r_start == bbr->r_ctl.rc_last_tlp_seq) &&
 	    (bbr->r_ctl.rc_tlp_seg_send_cnt > bbr_tlp_max_resend)) {
 		/*
 		 * We have retransmitted too many times for TLP. Switch to
 		 * the regular RTO timer.  (The count was already checked
 		 * just above, so this is a belt-and-braces test.)
 		 */
 		goto restore;
 	}
 	bbr_log_to_event(bbr, cts, BBR_TO_FRM_TLP);
 	bbr->r_ctl.rc_hpts_flags &= ~PACE_TMR_TLP;
 	return (0);
 }
 
 /*
  * Delayed-ack timer.  Nothing is sent from here: the pending
  * TF_DELACK is converted into TF_ACKNOW so that the output routine
  * emits the ack.
  *
  * Returns 1 (don't proceed) only when all timers are stopped,
  * otherwise 0.
  */
 static int
 bbr_timeout_delack(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts)
 {
 	if (bbr->rc_all_timers_stopped)
 		return (1);
 
 	bbr_log_to_event(bbr, cts, BBR_TO_FRM_DELACK);
 	/* Trade the delayed-ack request for an immediate one. */
 	tp->t_flags = (tp->t_flags & ~TF_DELACK) | TF_ACKNOW;
 	KMOD_TCPSTAT_INC(tcps_delack);
 	bbr->r_ctl.rc_hpts_flags &= ~PACE_TMR_DELACK;
 	return (0);
 }
 
 /*
  * Persist timer: send a KEEP-ALIVE like probe to the peer, no data
  * is sent.
  *
  * Return values:
  *   1           - all timers are stopped, or the probe path ran and
  *                 the hpts timer was restarted.
  *   0           - we are not in persist, or we just left persist
  *                 because nothing is queued and nothing is outstanding.
  *   -ETIMEDOUT  - the connection should be dropped (tcp_drop()).
  */
 static int
 bbr_timeout_persist(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts)
 {
 	struct tcptemp *probe;
 
 	if (bbr->rc_all_timers_stopped)
 		return (1);
 	if (bbr->rc_in_persist == 0)
 		return (0);
 	KASSERT(tp->t_inpcb != NULL,
 	    ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
 	/*
 	 * Persistence timer into zero window. Force a byte to be output,
 	 * if possible.
 	 */
 	bbr_log_to_event(bbr, cts, BBR_TO_FRM_PERSIST);
 	bbr->r_ctl.rc_hpts_flags &= ~PACE_TMR_PERSIT;
 	KMOD_TCPSTAT_INC(tcps_persisttimeo);
 	/* Has the user specified progress time been exceeded? */
 	if (ctf_progress_timeout_check(tp, true)) {
 		bbr_log_progress_event(bbr, tp, tick, PROGRESS_DROP, __LINE__);
 		return (-ETIMEDOUT);	/* tcp_drop() */
 	}
 	/*
 	 * Hack: if the peer is dead/unreachable, we do not time out if the
 	 * window is closed.  After a full backoff, drop the connection if
 	 * the idle time (no responses to probes) reaches the maximum
 	 * backoff that we would use if retransmitting.
 	 */
 	if ((tp->t_rxtshift == TCP_MAXRXTSHIFT) &&
 	    ((ticks - tp->t_rcvtime >= tcp_maxpersistidle) ||
 	     (ticks - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff))) {
 		KMOD_TCPSTAT_INC(tcps_persistdrop);
 		tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX);
 		return (-ETIMEDOUT);	/* tcp_drop() */
 	}
 	/* Nothing to send and nothing in flight: leave persist mode. */
 	if ((sbavail(&bbr->rc_inp->inp_socket->so_snd) == 0) &&
 	    (tp->snd_una == tp->snd_max)) {
 		bbr_exit_persist(tp, bbr, cts, __LINE__);
 		return (0);
 	}
 	/*
 	 * If the user has closed the socket then drop a persisting
 	 * connection after a much reduced timeout.
 	 */
 	if ((tp->t_state > TCPS_CLOSE_WAIT) &&
 	    ((ticks - tp->t_rcvtime) >= TCPTV_PERSMAX)) {
 		KMOD_TCPSTAT_INC(tcps_persistdrop);
 		tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX);
 		return (-ETIMEDOUT);	/* tcp_drop() */
 	}
 	probe = tcpip_maketemplate(bbr->rc_inp);
 	if (probe != NULL) {
 		tcp_respond(tp, probe->tt_ipgen,
 			    &probe->tt_t, (struct mbuf *)NULL,
 			    tp->rcv_nxt, tp->snd_una - 1, 0);
 		/* The probe carries an ack, so a delayed ack is moot. */
 		if (tp->t_flags & TF_DELACK)
 			tp->t_flags &= ~TF_DELACK;
 		free(probe, M_TEMP);
 	}
 	if (tp->t_rxtshift < TCP_MAXRXTSHIFT)
 		tp->t_rxtshift++;
 	bbr_start_hpts_timer(bbr, tp, cts, 3, 0, 0);
 	return (1);
 }
 
 /*
  * Keepalive timer.  If a keepalive goes off, we had no other timers
  * happening.
  *
  * Returns 1 when all timers are stopped or after the keepalive path
  * has run and restarted the hpts timer, and -ETIMEDOUT when the
  * connection should be dropped (tcp_drop()).
  */
 static int
 bbr_timeout_keepalive(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts)
 {
 	struct tcptemp *probe;
 	struct inpcb *inp;
 	int keep_enabled;
 
 	if (bbr->rc_all_timers_stopped)
 		return (1);
 
 	bbr->r_ctl.rc_hpts_flags &= ~PACE_TMR_KEEP;
 	inp = tp->t_inpcb;
 	bbr_log_to_event(bbr, cts, BBR_TO_FRM_KEEP);
 	/*
 	 * Keep-alive timer went off; send something or drop connection if
 	 * idle for too long.
 	 */
 	KMOD_TCPSTAT_INC(tcps_keeptimeo);
 	if (tp->t_state < TCPS_ESTABLISHED)
 		goto dropit;
 	keep_enabled = (V_tcp_always_keepalive ||
 	    (inp->inp_socket->so_options & SO_KEEPALIVE));
 	if (keep_enabled && (tp->t_state <= TCPS_CLOSING)) {
 		if (ticks - tp->t_rcvtime >= TP_KEEPIDLE(tp) + TP_MAXIDLE(tp))
 			goto dropit;
 		/*
 		 * Send a packet designed to force a response if the peer is
 		 * up and reachable: either an ACK if the connection is
 		 * still alive, or an RST if the peer has closed the
 		 * connection due to timeout or reboot. Using sequence
 		 * number tp->snd_una-1 causes the transmitted zero-length
 		 * segment to lie outside the receive window; by the
 		 * protocol spec, this requires the correspondent TCP to
 		 * respond.
 		 */
 		KMOD_TCPSTAT_INC(tcps_keepprobe);
 		probe = tcpip_maketemplate(inp);
 		if (probe != NULL) {
 			tcp_respond(tp, probe->tt_ipgen,
 			    &probe->tt_t, (struct mbuf *)NULL,
 			    tp->rcv_nxt, tp->snd_una - 1, 0);
 			free(probe, M_TEMP);
 		}
 	}
 	bbr_start_hpts_timer(bbr, tp, cts, 4, 0, 0);
 	return (1);
 
 dropit:
 	KMOD_TCPSTAT_INC(tcps_keepdrops);
 	tcp_log_end_status(tp, TCP_EI_STATUS_KEEP_MAX);
 	return (-ETIMEDOUT);	/* tcp_drop() */
 }
 
 /*
  * Retransmit helper function, clear up all the ack
  * flags and take care of important book keeping.
  *
  * Walks the whole send map: every SACK'd (BBR_ACKED) entry is un-acked
  * and put back on the transmit list if it was not on it, every other
  * entry is marked lost (stopping early in the TCP FO special case).
  * Afterwards the resend pointer is set to the first map entry and the
  * TLP, aggregation and sack accounting state is reset.
  */
 static void
 bbr_remxt_tmr(struct tcpcb *tp)
 {
 	/*
 	 * The retransmit timer went off, all sack'd blocks must be
 	 * un-acked.
 	 */
 	struct bbr_sendmap *rsm, *trsm = NULL;
 	struct tcp_bbr *bbr;
 	uint32_t cts, lost;
 
 	bbr = (struct tcp_bbr *)tp->t_fb_ptr;
 	cts = tcp_get_usecs(&bbr->rc_tv);
 	/* Snapshot rc_lost so we can tell below whether this pass added any. */
 	lost = bbr->r_ctl.rc_lost;
 	if (bbr->r_state && (bbr->r_state != tp->t_state))
 		bbr_set_state(tp, bbr, 0);
 
 	TAILQ_FOREACH(rsm, &bbr->r_ctl.rc_map, r_next) {
 		if (rsm->r_flags & BBR_ACKED) {
 			uint32_t old_flags;
 
 			rsm->r_dupack = 0;
 			if (rsm->r_in_tmap == 0) {
 				/*
 				 * We must re-add it back to the tlist, right
 				 * after the previous map entry (or at the
 				 * head when there is none).
 				 * NOTE(review): this assumes trsm is itself
 				 * on rc_tmap -- confirm.
 				 */
 				if (trsm == NULL) {
 					TAILQ_INSERT_HEAD(&bbr->r_ctl.rc_tmap, rsm, r_tnext);
 				} else {
 					TAILQ_INSERT_AFTER(&bbr->r_ctl.rc_tmap, trsm, rsm, r_tnext);
 				}
 				rsm->r_in_tmap = 1;
 			}
 			/* Keep the pre-clear flags for the log record. */
 			old_flags = rsm->r_flags;
 			rsm->r_flags |= BBR_RXT_CLEARED;
 			rsm->r_flags &= ~(BBR_ACKED | BBR_SACK_PASSED | BBR_WAS_SACKPASS);
 			bbr_log_type_rsmclear(bbr, cts, rsm, old_flags, __LINE__);
 		} else {
 			if ((tp->t_state < TCPS_ESTABLISHED) &&
 			    (rsm->r_start == tp->snd_una)) {
 				/*
 				 * Special case for TCP FO. Where
 				 * we sent more data beyond the snd_max.
 				 * We don't mark that as lost and stop here.
 				 */
 				break;
 			}
 			/* Only count the bytes as lost once per entry. */
 			if ((rsm->r_flags & BBR_MARKED_LOST) == 0) {
 				bbr->r_ctl.rc_lost += rsm->r_end - rsm->r_start;
 				bbr->r_ctl.rc_lost_bytes += rsm->r_end - rsm->r_start;
 			}
 			if (bbr_marks_rxt_sack_passed) {
 				/*
 				 * With this option, we will rack out
 				 * in 1ms increments the rest of the packets.
 				 */
 				rsm->r_flags |= BBR_SACK_PASSED | BBR_MARKED_LOST;
 				rsm->r_flags &= ~BBR_WAS_SACKPASS;
 			} else {
 				/*
 				 * With this option we only mark them lost
 				 * and remove all sack'd markings. We will run
 				 * another RXT or a TLP. This will cause
 				 * us to eventually send more based on what
 				 * ack's come in.
 				 */
 				rsm->r_flags |= BBR_MARKED_LOST;
 				rsm->r_flags &= ~BBR_WAS_SACKPASS;
 				rsm->r_flags &= ~BBR_SACK_PASSED;
 			}
 		}
 		/* Remember this entry as the insertion point for the next one. */
 		trsm = rsm;
 	}
 	/* Resending starts from the front of the map. */
 	bbr->r_ctl.rc_resend = TAILQ_FIRST(&bbr->r_ctl.rc_map);
 	/* Clear the count (we just un-acked them) */
 	bbr_log_to_event(bbr, cts, BBR_TO_FRM_TMR);
 	bbr->rc_tlp_new_data = 0;
 	bbr->r_ctl.rc_tlp_seg_send_cnt = 0;
 	/* zap the behindness on a rxt */
 	bbr->r_ctl.rc_hptsi_agg_delay = 0;
 	bbr->r_agg_early_set = 0;
 	bbr->r_ctl.rc_agg_early = 0;
 	bbr->rc_tlp_rtx_out = 0;
 	bbr->r_ctl.rc_sacked = 0;
 	bbr->r_ctl.rc_sacklast = NULL;
 	bbr->r_timer_override = 1;
 	/* Last argument is non-zero when this pass increased rc_lost. */
 	bbr_lt_bw_sampling(bbr, cts, (bbr->r_ctl.rc_lost > lost));
 }
 
 /*
  * Re-transmit timeout!  Sets up to retransmit the lowest seq number
  * outstanding.
  *
  * Return values:
  *   1                 - all timers are stopped.
  *   0                 - nothing was outstanding, or the retransmit
  *                       state was set up successfully.
  *   negative errno    - the connection should be dropped (tcp_drop()):
  *                       -ETIMEDOUT on a failed progress check, or
  *                       -t_softerror / -ETIMEDOUT once t_rxtshift
  *                       exceeds TCP_MAXRXTSHIFT.
  */
 static int
 bbr_timeout_rxt(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts)
 {
 	int32_t rexmt;
 	int32_t retval = 0;
 	bool isipv6;
 
 	bbr->r_ctl.rc_hpts_flags &= ~PACE_TMR_RXT;
 	if (bbr->rc_all_timers_stopped) {
 		return (1);
 	}
 	if (TCPS_HAVEESTABLISHED(tp->t_state) &&
 	    (tp->snd_una == tp->snd_max)) {
 		/* Nothing outstanding .. nothing to do */
 		return (0);
 	}
 	/*
 	 * Retransmission timer went off.  Message has not been acked within
 	 * retransmit interval.  Back off to a longer retransmit interval
 	 * and retransmit one segment.
 	 */
 	if (ctf_progress_timeout_check(tp, true)) {
 		/* NOTE(review): passes `tick`, not `ticks` -- confirm intended. */
 		bbr_log_progress_event(bbr, tp, tick, PROGRESS_DROP, __LINE__);
 		return (-ETIMEDOUT);	/* tcp_drop() */
 	}
 	/* Un-ack / mark-lost the whole send map and pick rc_resend. */
 	bbr_remxt_tmr(tp);
 	if ((bbr->r_ctl.rc_resend == NULL) ||
 	    ((bbr->r_ctl.rc_resend->r_flags & BBR_RWND_COLLAPSED) == 0)) {
 		/*
 		 * If the rwnd collapsed on
 		 * the one we are retransmitting
 		 * it does not count against the
 		 * rxt count.
 		 */
 		tp->t_rxtshift++;
 	}
 	if (tp->t_rxtshift > TCP_MAXRXTSHIFT) {
 		/* Too many backoffs: give up on the connection. */
 		tp->t_rxtshift = TCP_MAXRXTSHIFT;
 		KMOD_TCPSTAT_INC(tcps_timeoutdrop);
 		tcp_log_end_status(tp, TCP_EI_STATUS_RETRAN);
 		/* XXXGL: previously t_softerror was casted to uint16_t */
 		MPASS(tp->t_softerror >= 0);
 		retval = tp->t_softerror ? -tp->t_softerror : -ETIMEDOUT;
 		return (retval);	/* tcp_drop() */
 	}
 	if (tp->t_state == TCPS_SYN_SENT) {
 		/*
 		 * If the SYN was retransmitted, indicate CWND to be limited
 		 * to 1 segment in cc_conn_init().
 		 */
 		tp->snd_cwnd = 1;
 	} else if (tp->t_rxtshift == 1) {
 		/*
 		 * first retransmit; record ssthresh and cwnd so they can be
 		 * recovered if this turns out to be a "bad" retransmit. A
 		 * retransmit is considered "bad" if an ACK for this segment
 		 * is received within RTT/2 interval; the assumption here is
 		 * that the ACK was already in flight.  See "On Estimating
 		 * End-to-End Network Path Properties" by Allman and Paxson
 		 * for more details.
 		 */
 		/*
 		 * NOTE(review): snd_cwnd is reduced here, before
 		 * snd_cwnd_prev is saved below, so the saved value is the
 		 * already-reduced one; the same assignment is repeated
 		 * after the if/else.  Confirm this ordering is intended.
 		 */
 		tp->snd_cwnd = tp->t_maxseg - bbr->rc_last_options;
 		if (!IN_RECOVERY(tp->t_flags)) {
 			tp->snd_cwnd_prev = tp->snd_cwnd;
 			tp->snd_ssthresh_prev = tp->snd_ssthresh;
 			tp->snd_recover_prev = tp->snd_recover;
 			tp->t_badrxtwin = ticks + (tp->t_srtt >> (TCP_RTT_SHIFT + 1));
 			tp->t_flags |= TF_PREVVALID;
 		} else {
 			tp->t_flags &= ~TF_PREVVALID;
 		}
 		tp->snd_cwnd = tp->t_maxseg - bbr->rc_last_options;
 	} else {
 		tp->snd_cwnd = tp->t_maxseg - bbr->rc_last_options;
 		tp->t_flags &= ~TF_PREVVALID;
 	}
 	KMOD_TCPSTAT_INC(tcps_rexmttimeo);
 	/* Compute the backed-off RTO and clamp it to the configured range. */
 	if ((tp->t_state == TCPS_SYN_SENT) ||
 	    (tp->t_state == TCPS_SYN_RECEIVED))
 		rexmt = USEC_2_TICKS(BBR_INITIAL_RTO) * tcp_backoff[tp->t_rxtshift];
 	else
 		rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift];
 	TCPT_RANGESET(tp->t_rxtcur, rexmt,
 	    MSEC_2_TICKS(bbr->r_ctl.rc_min_rto_ms),
 	    MSEC_2_TICKS(((uint32_t)bbr->rc_max_rto_sec) * 1000));
 	/*
 	 * We enter the path for PLMTUD if connection is established or, if
 	 * connection is FIN_WAIT_1 status, reason for the last is that if
 	 * amount of data we send is very small, we could send it in couple
 	 * of packets and process straight to FIN. In that case we won't
 	 * catch ESTABLISHED state.
 	 */
 #ifdef INET6
 	isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) ? true : false;
 #else
 	isipv6 = false;
 #endif
 	/*
 	 * V_tcp_pmtud_blackhole_detect: 1 enables detection for every
 	 * connection, 2 only when !isipv6, 3 only when isipv6.
 	 */
 	if (((V_tcp_pmtud_blackhole_detect == 1) ||
 	    (V_tcp_pmtud_blackhole_detect == 2 && !isipv6) ||
 	    (V_tcp_pmtud_blackhole_detect == 3 && isipv6)) &&
 	    ((tp->t_state == TCPS_ESTABLISHED) ||
 	    (tp->t_state == TCPS_FIN_WAIT_1))) {
 		/*
 		 * Idea here is that at each stage of mtu probe (usually,
 		 * 1448 -> 1188 -> 524) should be given 2 chances to recover
 		 * before further clamping down. 'tp->t_rxtshift % 2 == 0'
 		 * should take care of that.
 		 */
 		if (((tp->t_flags2 & (TF2_PLPMTU_PMTUD | TF2_PLPMTU_MAXSEGSNT)) ==
 		    (TF2_PLPMTU_PMTUD | TF2_PLPMTU_MAXSEGSNT)) &&
 		    (tp->t_rxtshift >= 2 && tp->t_rxtshift < 6 &&
 		    tp->t_rxtshift % 2 == 0)) {
 			/*
 			 * Enter Path MTU Black-hole Detection mechanism: -
 			 * Disable Path MTU Discovery (IP "DF" bit). -
 			 * Reduce MTU to lower value than what we negotiated
 			 * with peer.
 			 */
 			if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) == 0) {
 				/*
 				 * Record that we may have found a black
 				 * hole.
 				 */
 				tp->t_flags2 |= TF2_PLPMTU_BLACKHOLE;
 				/* Keep track of previous MSS. */
 				tp->t_pmtud_saved_maxseg = tp->t_maxseg;
 			}
 			/*
 			 * Reduce the MSS to blackhole value or to the
 			 * default in an attempt to retransmit.
 			 */
 #ifdef INET6
 			/*
 			 * Note: isipv6 is re-derived here from r_is_v6
 			 * rather than from inp_vflag as above.
 			 */
 			isipv6 = bbr->r_is_v6;
 			if (isipv6 &&
 			    tp->t_maxseg > V_tcp_v6pmtud_blackhole_mss) {
 				/* Use the sysctl tuneable blackhole MSS. */
 				tp->t_maxseg = V_tcp_v6pmtud_blackhole_mss;
 				KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_activated);
 			} else if (isipv6) {
 				/* Use the default MSS. */
 				tp->t_maxseg = V_tcp_v6mssdflt;
 				/*
 				 * Disable Path MTU Discovery when we switch
 				 * to minmss.
 				 */
 				tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
 				KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss);
 			}
 #endif
 #if defined(INET6) && defined(INET)
 			else
 #endif
 #ifdef INET
 			if (tp->t_maxseg > V_tcp_pmtud_blackhole_mss) {
 				/* Use the sysctl tuneable blackhole MSS. */
 				tp->t_maxseg = V_tcp_pmtud_blackhole_mss;
 				KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_activated);
 			} else {
 				/* Use the default MSS. */
 				tp->t_maxseg = V_tcp_mssdflt;
 				/*
 				 * Disable Path MTU Discovery when we switch
 				 * to minmss.
 				 */
 				tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
 				KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss);
 			}
 #endif
 		} else {
 			/*
 			 * If further retransmissions are still unsuccessful
 			 * with a lowered MTU, maybe this isn't a blackhole
 			 * and we restore the previous MSS and blackhole
 			 * detection flags. The limit '6' is determined by
 			 * giving each probe stage (1448, 1188, 524) 2
 			 * chances to recover.
 			 */
 			if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) &&
 			    (tp->t_rxtshift >= 6)) {
 				tp->t_flags2 |= TF2_PLPMTU_PMTUD;
 				tp->t_flags2 &= ~TF2_PLPMTU_BLACKHOLE;
 				tp->t_maxseg = tp->t_pmtud_saved_maxseg;
 				KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_failed);
 			}
 		}
 	}
 	/*
 	 * Disable RFC1323 and SACK if we haven't got any response to our
 	 * third SYN to work-around some broken terminal servers (most of
 	 * which have hopefully been retired) that have bad VJ header
 	 * compression code which trashes TCP segments containing
 	 * unknown-to-them TCP options.
 	 */
 	if (tcp_rexmit_drop_options && (tp->t_state == TCPS_SYN_SENT) &&
 	    (tp->t_rxtshift == 3))
 		tp->t_flags &= ~(TF_REQ_SCALE | TF_REQ_TSTMP | TF_SACK_PERMIT);
 	/*
 	 * If we backed off this far, our srtt estimate is probably bogus.
 	 * Clobber it so we'll take the next rtt measurement as our srtt;
 	 * move the current srtt into rttvar to keep the current retransmit
 	 * times until then.
 	 */
 	if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) {
 #ifdef INET6
 		if (bbr->r_is_v6)
 			in6_losing(tp->t_inpcb);
 		else
 #endif
 			in_losing(tp->t_inpcb);
 		tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT);
 		tp->t_srtt = 0;
 	}
 	/* Restart sack filtering and recovery tracking from here. */
 	sack_filter_clear(&bbr->r_ctl.bbr_sf, tp->snd_una);
 	tp->snd_recover = tp->snd_max;
 	tp->t_flags |= TF_ACKNOW;
 	tp->t_rtttime = 0;
 
 	return (retval);
 }
 
 /*
  * Run whichever pacer timer is pending in rc_hpts_flags.
  *
  * If the timer has not expired yet, nothing is run: 0 is returned when
  * packet output is pending or when we were not called from the hpts,
  * otherwise we re-insert ourselves into the hpts for the remaining
  * time and return 1.  On expiry the timer bits are cleared and the
  * first matching handler (delack, persist, rack, tlp, rxt, keepalive,
  * in that priority order) is invoked; its return value is passed back.
  */
 static int
 bbr_process_timers(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts, uint8_t hpts_calling)
 {
 	int32_t rv = 0;
 	int32_t pending = (bbr->r_ctl.rc_hpts_flags & PACE_TMR_MASK);
 
 	if (pending == 0)
 		return (0);
 	if (tp->t_state == TCPS_LISTEN) {
 		/* no timers on listen sockets */
 		return ((bbr->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) ? 0 : 1);
 	}
 	if (TSTMP_LT(cts, bbr->r_ctl.rc_timer_exp)) {
 		uint32_t remaining;
 
 		/* The negative codes below are only used for logging. */
 		if (bbr->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) {
 			rv = -1;
 			bbr_log_to_processing(bbr, cts, rv, 0, hpts_calling);
 			return (0);
 		}
 		if (hpts_calling == 0) {
 			rv = -2;
 			bbr_log_to_processing(bbr, cts, rv, 0, hpts_calling);
 			return (0);
 		}
 		/*
 		 * The timer fired early and no paced output is pending:
 		 * a false alarm, so go back to sleep for what is left.
 		 */
 		remaining = bbr->r_ctl.rc_timer_exp - cts;
 		rv = -3;
 		bbr_log_to_processing(bbr, cts, rv, remaining, hpts_calling);
 		tcp_hpts_insert(tp->t_inpcb, HPTS_USEC_TO_SLOTS(remaining));
 		return (1);
 	}
 	bbr->rc_tmr_stopped = 0;
 	bbr->r_ctl.rc_hpts_flags &= ~PACE_TMR_MASK;
 	if (pending & PACE_TMR_DELACK) {
 		rv = bbr_timeout_delack(tp, bbr, cts);
 	} else if (pending & PACE_TMR_PERSIT) {
 		rv = bbr_timeout_persist(tp, bbr, cts);
 	} else if (pending & PACE_TMR_RACK) {
 		bbr->r_ctl.rc_tlp_rxt_last_time = cts;
 		rv = bbr_timeout_rack(tp, bbr, cts);
 	} else if (pending & PACE_TMR_TLP) {
 		bbr->r_ctl.rc_tlp_rxt_last_time = cts;
 		rv = bbr_timeout_tlp(tp, bbr, cts);
 	} else if (pending & PACE_TMR_RXT) {
 		bbr->r_ctl.rc_tlp_rxt_last_time = cts;
 		rv = bbr_timeout_rxt(tp, bbr, cts);
 	} else if (pending & PACE_TMR_KEEP) {
 		rv = bbr_timeout_keepalive(tp, bbr, cts);
 	}
 	bbr_log_to_processing(bbr, cts, rv, pending, hpts_calling);
 	return (rv);
 }
 
 /*
  * Cancel any pending pacer timer.  The timer bits that were pending are
  * remembered in rc_tmr_stopped before being cleared from rc_hpts_flags.
  * Does nothing when no timer bit is set.
  */
 static void
 bbr_timer_cancel(struct tcp_bbr *bbr, int32_t line, uint32_t cts)
 {
 	uint8_t removed = 0;
 
 	if ((bbr->r_ctl.rc_hpts_flags & PACE_TMR_MASK) == 0)
 		return;
 
 	if (tcp_in_hpts(bbr->rc_inp) &&
 	    (bbr->rc_timer_first == 1)) {
 		/*
 		 * The timer was scheduled ahead of the paced output, so
 		 * cancelling it also means pulling ourselves off the hpts.
 		 */
 		removed = 1;
 		tcp_hpts_remove(bbr->rc_inp);
 		if (bbr->r_ctl.rc_last_delay_val) {
 			/* Charge the time already waited against the delay. */
 			uint32_t elapsed;
 
 			elapsed = TSTMP_GT(cts, bbr->rc_pacer_started) ?
 			    (cts - bbr->rc_pacer_started) : 0;
 			if (bbr->r_ctl.rc_last_delay_val > elapsed)
 				bbr->r_ctl.rc_last_delay_val -= elapsed;
 			else
 				bbr->r_ctl.rc_last_delay_val = 0;
 			bbr->rc_pacer_started = cts;
 		}
 	}
 	bbr->rc_timer_first = 0;
 	bbr_log_to_cancel(bbr, line, cts, removed);
 	bbr->rc_tmr_stopped = bbr->r_ctl.rc_hpts_flags & PACE_TMR_MASK;
 	bbr->r_ctl.rc_hpts_flags &= ~(PACE_TMR_MASK);
 }
 
 /*
  * Timer-stop hook.  Regardless of timer_type (which is ignored) this
  * flags every BBR timer as stopped.
  */
 static void
 bbr_timer_stop(struct tcpcb *tp, uint32_t timer_type)
 {
 	struct tcp_bbr *bbr = (struct tcp_bbr *)tp->t_fb_ptr;
 
 	bbr->rc_all_timers_stopped = 1;
 }
 
 /*
  * Stop-all hook.  There is nothing to do here; the result is always 0
  * and 'tp' is not examined.
  */
 static int
 bbr_stopall(struct tcpcb *tp)
 {
 	(void)tp;
 	return (0);
 }
 
 /*
  * Timer-activate hook.  Intentionally a no-op: none of the arguments
  * are used.
  */
 static void
 bbr_timer_activate(struct tcpcb *tp, uint32_t timer_type, uint32_t delta)
 {
 	(void)tp;
 	(void)timer_type;
 	(void)delta;
 }
 
 /*
  * Timer-active query hook.  As written this always answers 0 (not
  * active) without looking at 'tp' or 'timer_type'.
  */
 static int
 bbr_timer_active(struct tcpcb *tp, uint32_t timer_type)
 {
 	(void)tp;
 	(void)timer_type;
 	return (0);
 }
 
 /*
  * Return the most recent send time recorded on the first entry of the
  * transmit list.  When the list is empty, or its first entry is the
  * entry being updated (u_rsm), 'cts' is returned instead.
  */
 static uint32_t
 bbr_get_earliest_send_outstanding(struct tcp_bbr *bbr, struct bbr_sendmap *u_rsm, uint32_t cts)
 {
 	struct bbr_sendmap *first;
 
 	first = TAILQ_FIRST(&bbr->r_ctl.rc_tmap);
 	if ((first != NULL) && (first != u_rsm))
 		return (first->r_tim_lastsent[(first->r_rtr_cnt - 1)]);
 	return (cts);
 }
 
 /*
  * Record a (re)transmission of 'rsm' at time 'cts'.
  *
  * Bumps the entry's transmit count (capped at BBR_NUM_OF_RETRANS),
  * clears the collapsed / lost / rxt-cleared / acked markings, stamps
  * the send time and delivery snapshot, and moves the entry to the tail
  * of the transmit list.  Also points rc_next at the following map
  * entry.  'pacing_time' is stored as the entry's pacing delay.
  */
 static void
 bbr_update_rsm(struct tcpcb *tp, struct tcp_bbr *bbr,
      struct bbr_sendmap *rsm, uint32_t cts, uint32_t pacing_time)
 {
 	int32_t idx;
 
 	rsm->r_rtr_cnt++;
 	rsm->r_dupack = 0;
 	if (rsm->r_rtr_cnt > BBR_NUM_OF_RETRANS) {
 		/* Saturate the count and remember that we overflowed. */
 		rsm->r_rtr_cnt = BBR_NUM_OF_RETRANS;
 		rsm->r_flags |= BBR_OVERMAX;
 	}
 	if (rsm->r_flags & BBR_RWND_COLLAPSED) {
 		/* Take off the collapsed flag at rxt */
 		rsm->r_flags &= ~BBR_RWND_COLLAPSED;
 	}
 	if (rsm->r_flags & BBR_MARKED_LOST) {
 		/* We have retransmitted, its no longer lost */
 		rsm->r_flags &= ~BBR_MARKED_LOST;
 		bbr->r_ctl.rc_lost_bytes -= rsm->r_end - rsm->r_start;
 	}
 	if (rsm->r_flags & BBR_RXT_CLEARED) {
 		/*
 		 * We hit a RXT timer on it and
 		 * we cleared the "acked" flag.
 		 * We now have it going back into
 		 * flight, we can remove the cleared
 		 * flag and possibly do accounting on
 		 * this piece.
 		 */
 		rsm->r_flags &= ~BBR_RXT_CLEARED;
 	}
 	/* Retransmissions that are not TLP probes count as hole repairs. */
 	if ((rsm->r_rtr_cnt > 1) && ((rsm->r_flags & BBR_TLP) == 0)) {
 		bbr->r_ctl.rc_holes_rxt += (rsm->r_end - rsm->r_start);
 		rsm->r_rtr_bytes += (rsm->r_end - rsm->r_start);
 	}
 	/* Stamp this transmission in the slot for the current count. */
 	idx = rsm->r_rtr_cnt - 1;
 	rsm->r_tim_lastsent[idx] = cts;
 	rsm->r_pacing_delay = pacing_time;
 	rsm->r_delivered = bbr->r_ctl.rc_delivered;
 	rsm->r_ts_valid = bbr->rc_ts_valid;
 	if (bbr->rc_ts_valid)
 		rsm->r_del_ack_ts = bbr->r_ctl.last_inbound_ts;
 	if (bbr->r_ctl.r_app_limited_until)
 		rsm->r_app_limited = 1;
 	else
 		rsm->r_app_limited = 0;
 	/*
 	 * NOTE(review): 8 presumably stands for "not in PROBE_BW" --
 	 * confirm against the users of r_bbr_state.
 	 */
 	if (bbr->rc_bbr_state == BBR_STATE_PROBE_BW)
 		rsm->r_bbr_state = bbr_state_val(bbr);
 	else
 		rsm->r_bbr_state = 8;
 	if (rsm->r_flags & BBR_ACKED) {
 		/* Probably MTU discovery messing with us */
 		uint32_t old_flags;
 
 		old_flags = rsm->r_flags;
 		rsm->r_flags &= ~BBR_ACKED;
 		bbr_log_type_rsmclear(bbr, cts, rsm, old_flags, __LINE__);
 		/* It is back in flight, so it no longer counts as sacked. */
 		bbr->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start);
 		if (bbr->r_ctl.rc_sacked == 0)
 			bbr->r_ctl.rc_sacklast = NULL;
 	}
 	/* (Re)queue at the tail of the transmit list. */
 	if (rsm->r_in_tmap) {
 		TAILQ_REMOVE(&bbr->r_ctl.rc_tmap, rsm, r_tnext);
 	}
 	TAILQ_INSERT_TAIL(&bbr->r_ctl.rc_tmap, rsm, r_tnext);
 	rsm->r_in_tmap = 1;
 	if (rsm->r_flags & BBR_SACK_PASSED) {
 		/* We have retransmitted due to the SACK pass */
 		rsm->r_flags &= ~BBR_SACK_PASSED;
 		rsm->r_flags |= BBR_WAS_SACKPASS;
 	}
 	rsm->r_first_sent_time = bbr_get_earliest_send_outstanding(bbr, rsm, cts);
 	/* Uses rc_sacked / rc_lost_bytes as adjusted above. */
 	rsm->r_flight_at_send = ctf_flight_size(bbr->rc_tp,
 						(bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes));
 	bbr->r_ctl.rc_next = TAILQ_NEXT(rsm, r_next);
 	/* Tag the entry by where the pacing gain sits relative to BBR_UNIT. */
 	if (bbr->r_ctl.rc_bbr_hptsi_gain > BBR_UNIT) {
 		rsm->r_is_gain = 1;
 		rsm->r_is_drain = 0;
 	} else if (bbr->r_ctl.rc_bbr_hptsi_gain < BBR_UNIT) {
 		rsm->r_is_drain = 1;
 		rsm->r_is_gain = 0;
 	} else {
 		rsm->r_is_drain = 0;
 		rsm->r_is_gain = 0;
 	}
 	rsm->r_del_time = bbr->r_ctl.rc_del_time; /* TEMP GOOGLE CODE */
 }
 
 /*
  * Account for a (re)transmission of *lenp bytes that began at
  * rsm->r_start.
  *
  * Returns 0 when the send is fully accounted for (and sets *lenp to 0),
  * or the sequence at which we stopped (rsm->r_end) when the send ran
  * past this entry, in which case *lenp is set to the number of bytes
  * still to be accounted for.
  */
 
 static uint32_t
 bbr_update_entry(struct tcpcb *tp, struct tcp_bbr *bbr,
     struct bbr_sendmap *rsm, uint32_t cts, int32_t *lenp, uint32_t pacing_time)
 {
 	struct bbr_sendmap *tail;
 	uint32_t sent_end;
 	int32_t sent;
 
 	sent = *lenp;
 	sent_end = rsm->r_start + sent;
 	if (SEQ_GEQ(sent_end, rsm->r_end)) {
 		/*
 		 * The whole entry was covered, possibly with the send
 		 * slopping over into the next entry.
 		 */
 		bbr_update_rsm(tp, bbr, rsm, cts, pacing_time);
 		if (sent_end != rsm->r_end) {
 			int32_t covered;
 
 			/* Hangs over the end, hand back what is left. */
 			covered = rsm->r_end - rsm->r_start;
 			*lenp = (sent - covered);
 			return (rsm->r_end);
 		}
 		*lenp = 0;
 		return (0);
 	}
 	/*
 	 * Only the front part of the entry was sent, so it has to be
 	 * split into the sent piece and the unsent piece.
 	 */
 	tail = bbr_alloc_full_limit(bbr);
 	if (tail == NULL) {
 		/* No memory for the split: nothing is recorded. */
 		*lenp = 0;
 		return (0);
 	}
 	/*
 	 * The original entry becomes the piece that was sent and 'tail'
 	 * the remainder that was not.  E.g. for a chunk 1..11 (10 bytes)
 	 * of which 5 bytes were sent, the original shrinks to 1..6 and
 	 * the new piece is 6..11.
 	 */
 	bbr_clone_rsm(bbr, tail, rsm, sent_end);
 	TAILQ_INSERT_AFTER(&bbr->r_ctl.rc_map, rsm, tail, r_next);
 	tail->r_dupack = 0;
 	if (rsm->r_in_tmap) {
 		TAILQ_INSERT_AFTER(&bbr->r_ctl.rc_tmap, rsm, tail, r_tnext);
 		tail->r_in_tmap = 1;
 	}
 	/* The front piece no longer ends the stream. */
 	rsm->r_flags &= (~BBR_HAS_FIN);
 	bbr_update_rsm(tp, bbr, rsm, cts, pacing_time);
 	*lenp = 0;
 	return (0);
 }
 
 static uint64_t
 bbr_get_hardware_rate(struct tcp_bbr *bbr)
 {
 	uint64_t bw;
 
 	bw = bbr_get_bw(bbr);
 	bw *= (uint64_t)bbr_hptsi_gain[BBR_SUB_GAIN];
 	bw /= (uint64_t)BBR_UNIT;
 	return(bw);
 }
 
 /*
  * Called when the hardware could not give us a full gain's worth of
  * rate.  If even the filtered delivery rate is not below 'act_rate',
  * gain is skipped and the filter is reduced by the excess; otherwise
  * we merely flag the gain as limited.  'rate_wanted' is not used.
  */
 static void
 bbr_setup_less_of_rate(struct tcp_bbr *bbr, uint32_t cts,
 		       uint64_t act_rate, uint64_t rate_wanted)
 {
 	uint64_t excess;
 
 	if (get_filter_value(&bbr->r_ctl.rc_delrate) < act_rate) {
 		/* The real rate fits; only the gain has to be lowered. */
 		bbr->skip_gain = 0;
 		bbr->gain_is_limited = 1;
 		return;
 	}
 	/* We can't even get the real rate. */
 	bbr->skip_gain = 1;
 	bbr->gain_is_limited = 0;
 	excess = get_filter_value(&bbr->r_ctl.rc_delrate) - act_rate;
 	if (excess)
 		filter_reduce_by(&bbr->r_ctl.rc_delrate, excess, cts);
 }
 
 /*
  * Re-evaluate the hardware pacing rate for this connection.
  *
  * Does nothing when no hardware rate entry (crte) is held.  If the
  * route or its interface has gone away, or the rate change yields no
  * entry, hardware pacing is turned off and the flow counters are
  * adjusted.  Otherwise, when a different rate entry is handed back, it
  * is adopted and the gain-limiting flags are updated according to
  * whether the granted rate covers the one requested.
  */
 static void
 bbr_update_hardware_pacing_rate(struct tcp_bbr *bbr, uint32_t cts)
 {
 	const struct tcp_hwrate_limit_table *nrte;
 	/*
 	 * The requested rate is a 64-bit quantity (see
 	 * bbr_get_hardware_rate()); holding it in an int would truncate
 	 * large rates before they are requested and compared.
 	 */
 	uint64_t rate;
 	int error = 0;
 
 	if (bbr->r_ctl.crte == NULL)
 		return;
 	if ((bbr->rc_inp->inp_route.ro_nh == NULL) ||
 	    (bbr->rc_inp->inp_route.ro_nh->nh_ifp == NULL)) {
 		/* Lost our routes? */
 		/* Clear the way for a re-attempt */
 		bbr->bbr_attempt_hdwr_pace = 0;
 lost_rate:
 		/* Fall back to software-only pacing. */
 		bbr->gain_is_limited = 0;
 		bbr->skip_gain = 0;
 		bbr->bbr_hdrw_pacing = 0;
 		counter_u64_add(bbr_flows_whdwr_pacing, -1);
 		counter_u64_add(bbr_flows_nohdwr_pacing, 1);
 		tcp_bbr_tso_size_check(bbr, cts);
 		return;
 	}
 	rate = bbr_get_hardware_rate(bbr);
 	nrte = tcp_chg_pacing_rate(bbr->r_ctl.crte,
 				   bbr->rc_tp,
 				   bbr->rc_inp->inp_route.ro_nh->nh_ifp,
 				   rate,
 				   (RS_PACING_GEQ|RS_PACING_SUB_OK),
 				   &error, NULL);
 	if (nrte == NULL) {
 		goto lost_rate;
 	}
 	if (nrte != bbr->r_ctl.crte) {
 		bbr->r_ctl.crte = nrte;
 		if (error == 0)  {
 			BBR_STAT_INC(bbr_hdwr_rl_mod_ok);
 			if (bbr->r_ctl.crte->rate < rate) {
 				/* We have a problem */
 				bbr_setup_less_of_rate(bbr, cts,
 						       bbr->r_ctl.crte->rate, rate);
 			} else {
 				/* We are good */
 				bbr->gain_is_limited = 0;
 				bbr->skip_gain = 0;
 			}
 		} else {
 			/* A failure should release the tag */
 			BBR_STAT_INC(bbr_hdwr_rl_mod_fail);
 			bbr->gain_is_limited = 0;
 			bbr->skip_gain = 0;
 			bbr->bbr_hdrw_pacing = 0;
 		}
 		/*
 		 * crte was just set to the non-NULL nrte, so it can be
 		 * dereferenced directly for both the interface and rate.
 		 */
 		bbr_type_log_hdwr_pacing(bbr,
 					 bbr->r_ctl.crte->ptbl->rs_ifp,
 					 rate,
 					 bbr->r_ctl.crte->rate,
 					 __LINE__,
 					 cts,
 					 error);
 	}
 }
 
 /*
  * Factor hardware pacing into the TSO size (r_ctl.rc_pace_max_segs).
  *
  * Returns without change unless hardware pacing is on, a rate entry is
  * held, the connection is not in recovery, hw_pacing_set is set and
  * bbr_hdwr_pace_adjust is non-zero.  Otherwise the software pacing
  * delay for the current TSO size is compared with the time the
  * hardware takes to emit the same number of segments, and a candidate
  * size is derived from that.  The result is the larger of the candidate
  * and the current size, capped at PACE_MAX_IP_BYTES - maxseg.
  *
  * bbr - the connection's BBR state.
  * cts - current timestamp, used for the delay computation and logging.
  */
 static void
 bbr_adjust_for_hw_pacing(struct tcp_bbr *bbr, uint32_t cts)
 {
 	const struct tcp_hwrate_limit_table *hw_rate;
 	uint32_t sw_delay, hw_delay, gap;
 	uint32_t mss, cur_segs, burst, tso;
 
 	if (bbr->bbr_hdrw_pacing == 0)
 		return;
 	if (IN_RECOVERY(bbr->rc_tp->t_flags))
 		return;
 	if (bbr->r_ctl.crte == NULL)
 		return;
 	/* Not yet by the hdwr pacing count delay */
 	if (bbr->hw_pacing_set == 0)
 		return;
 	/* Adjustment disabled */
 	if (bbr_hdwr_pace_adjust == 0)
 		return;
 
 	hw_rate = bbr->r_ctl.crte;
 	if (bbr->rc_tp->t_maxseg > bbr->rc_last_options) {
 		mss = bbr->rc_tp->t_maxseg - bbr->rc_last_options;
 	} else {
 		mss = BBR_MIN_SEG - bbr->rc_last_options;
 	}
 	/*
 	 * Time between TSO sized sends without hardware help, versus the
 	 * time the hardware spends on the same number of segments.
 	 */
 	sw_delay = bbr_get_pacing_delay(bbr, BBR_UNIT,
 	    bbr->r_ctl.rc_pace_max_segs, cts, 1);
 	cur_segs = bbr->r_ctl.rc_pace_max_segs / mss;
 	hw_delay = cur_segs;
 	hw_delay *= hw_rate->time_between;
 	gap = (sw_delay > hw_delay) ? (sw_delay - hw_delay) : 0;
 	bbr_log_type_tsosize(bbr, cts, gap, sw_delay, hw_delay,
 	    cur_segs, 1);
 
 	if (gap == 0) {
 		/*
 		 * The hardware rate is not faster than our own pacing.
 		 * Simply scale the computed TSO size by the adjustment
 		 * multiplier.
 		 *
 		 * NOTE(review): burst is in bytes here while it is
 		 * compared with rs_min_seg, which the other branch treats
 		 * as a segment count -- confirm the floor is intended to
 		 * work this way.
 		 */
 		burst = bbr->r_ctl.rc_pace_max_segs * bbr_hdwr_pace_adjust;
 		if (bbr_hdwr_pace_floor &&
 		    (burst < bbr->r_ctl.crte->ptbl->rs_min_seg))
 			burst = bbr->r_ctl.crte->ptbl->rs_min_seg;
 	} else if (gap < (max(hw_rate->time_between,
 	    bbr->r_ctl.bbr_hptsi_segments_delay_tar))) {
 		/*
 		 * Small gap: work out how many segments the hardware
 		 * emits in our software delay (rounding up), take at least
 		 * the current segment count, and scale by the adjustment
 		 * multiplier so the software pacer waits no more than
 		 * about one extra pacing delay.
 		 */
 		burst = max(((sw_delay + hw_rate->time_between) /
 		    hw_rate->time_between), cur_segs);
 		burst *= bbr_hdwr_pace_adjust;
 		if (bbr_hdwr_pace_floor &&
 		    (burst < bbr->r_ctl.crte->ptbl->rs_min_seg)) {
 			/* Always hand down at least rs_min_seg segments. */
 			burst = bbr->r_ctl.crte->ptbl->rs_min_seg;
 		}
 		burst *= mss;
 	} else {
 		/*
 		 * The gap is large: the hardware would drain the burst
 		 * quickly and then leave a long idle period.  Keep the
 		 * TSO size as if hardware pacing were not in use.
 		 */
 		burst = bbr->r_ctl.rc_pace_max_segs;
 	}
 
 	/* Take the larger size, then clamp to the IP limit. */
 	tso = bbr->r_ctl.rc_pace_max_segs;
 	if (burst > bbr->r_ctl.rc_pace_max_segs)
 		tso = burst;
 	if (tso >= (PACE_MAX_IP_BYTES-mss))
 		tso = PACE_MAX_IP_BYTES - mss;
 
 	if (tso != bbr->r_ctl.rc_pace_max_segs) {
 		bbr_log_type_tsosize(bbr, cts, tso, 0, bbr->r_ctl.rc_pace_max_segs, mss, 0);
 		bbr->r_ctl.rc_pace_max_segs = tso;
 	}
 }
 
 /*
  * Recompute the TSO size (r_ctl.rc_pace_max_segs) for the connection
  * from the current bandwidth estimate, then let
  * bbr_adjust_for_hw_pacing() factor in hardware pacing.
  *
  * bbr - the connection's BBR state.
  * cts - current timestamp, used for logging and the hardware adjust.
  */
 static void
 tcp_bbr_tso_size_check(struct tcp_bbr *bbr, uint32_t cts)
 {
 	uint64_t bw;
 	uint32_t old_tso, new_tso;
 	uint32_t maxseg, bytes;
 	/*
 	 * Google/linux uses the following algorithm to determine
 	 * the TSO size based on the b/w of the link (from Neal Cardwell email 9/27/18):
 	 *
 	 *  bytes = bw_in_bytes_per_second / 1000
 	 *  bytes = min(bytes, 64k)
 	 *  tso_segs = bytes / MSS
 	 *  if (bw < 1.2Mbs)
 	 *      min_tso_segs = 1
 	 *  else
 	 *	min_tso_segs = 2
 	 * tso_segs = max(tso_segs, min_tso_segs)
 	 *
 	 * * Note apply a device specific limit (we apply this in the
 	 *   tcp_m_copym).
 	 * Note that before the initial measurement is made google bursts out
 	 * a full iwnd just like new-reno/cubic.
 	 *
 	 * We do not use this algorithm. Instead we
 	 * use a two phased approach:
 	 *
 	 *  if ( bw <= per-tcb-cross-over)
 	 *     goal_tso =  calculate how much with this bw we
 	 *                 can send in goal-time seconds.
 	 *     if (goal_tso > mss)
 	 *         seg = goal_tso / mss
 	 *         tso = seg * mss
 	 *     else
 	 *         tso = mss
 	 *     if (tso > per-tcb-max)
 	 *         tso = per-tcb-max
 	 *  else if ( bw > 512Mbps)
 	 *     tso = max-tso (64k/mss)
 	 *  else
 	 *     goal_tso = bw / per-tcb-divisor
 	 *     seg = (goal_tso + mss-1)/mss
 	 *     tso = seg * mss
 	 *
 	 * if (tso < per-tcb-floor)
 	 *    tso = per-tcb-floor
 	 * if (tso > per-tcb-utter_max)
 	 *    tso = per-tcb-utter_max
 	 *
 	 * Note the default per-tcb-divisor is 1000 (same as google).
 	 * the goal cross over is 30Mbps however. To recreate googles
 	 * algorithm you need to set:
 	 *
 	 * cross-over = 23,168,000 bps
 	 * goal-time = 18000
 	 * per-tcb-max = 2
 	 * per-tcb-divisor = 1000
 	 * per-tcb-floor = 1
 	 *
 	 * This will get you "google bbr" behavior with respect to tso size.
 	 *
 	 * Note we do not set any TSO size until we are past the initial
 	 * window. Before that we generally use either a single MSS
 	 * or we use the full IW size (so we burst an IW at a time)
 	 */
 
 	/* Payload bytes per segment, guarded against unsigned underflow. */
 	if (bbr->rc_tp->t_maxseg > bbr->rc_last_options) {
 		maxseg = bbr->rc_tp->t_maxseg - bbr->rc_last_options;
 	} else {
 		maxseg = BBR_MIN_SEG - bbr->rc_last_options;
 	}
 	old_tso = bbr->r_ctl.rc_pace_max_segs;
 	if (bbr->rc_past_init_win == 0) {
 		/*
 		 * Not enough data has been acknowledged to make a
 		 * judgement. Set up the initial TSO based on if we
 		 * are sending a full IW at once or not.
 		 *
 		 * The guarded maxseg is used here rather than a second
 		 * raw t_maxseg - rc_last_options subtraction, which would
 		 * wrap to a huge value if the options ever exceeded
 		 * t_maxseg.
 		 */
 		if (bbr->rc_use_google)
 			bbr->r_ctl.rc_pace_max_segs = maxseg * 2;
 		else if (bbr->bbr_init_win_cheat)
 			bbr->r_ctl.rc_pace_max_segs = bbr_initial_cwnd(bbr, bbr->rc_tp);
 		else
 			bbr->r_ctl.rc_pace_max_segs = maxseg;
 		if (bbr->r_ctl.rc_pace_min_segs != bbr->rc_tp->t_maxseg)
 			bbr->r_ctl.rc_pace_min_segs = bbr->rc_tp->t_maxseg;
 		if (bbr->r_ctl.rc_pace_max_segs == 0) {
 			bbr->r_ctl.rc_pace_max_segs = maxseg;
 		}
 		bbr_log_type_tsosize(bbr, cts, bbr->r_ctl.rc_pace_max_segs, 0, old_tso, maxseg, 0);
 		bbr_adjust_for_hw_pacing(bbr, cts);
 		return;
 	}
 	/*
 	 * Now lets set the TSO goal based on our delivery rate in
 	 * bytes per second. Note we only do this if
 	 * we have acked at least the initial cwnd worth of data.
 	 */
 	bw = bbr_get_bw(bbr);
 	if (IN_RECOVERY(bbr->rc_tp->t_flags) &&
 	     (bbr->rc_use_google == 0)) {
 		/* We clamp to one MSS in recovery */
 		new_tso = maxseg;
 	} else if (bbr->rc_use_google) {
 		int min_tso_segs;
 
 		/* Google considers the gain too */
 		if (bbr->r_ctl.rc_bbr_hptsi_gain != BBR_UNIT) {
 			bw *= bbr->r_ctl.rc_bbr_hptsi_gain;
 			bw /= BBR_UNIT;
 		}
 		bytes = bw / 1024;
 		if (bytes > (64 * 1024))
 			bytes = 64 * 1024;
 		new_tso = bytes / maxseg;
 		if (bw < ONE_POINT_TWO_MEG)
 			min_tso_segs = 1;
 		else
 			min_tso_segs = 2;
 		if (new_tso < min_tso_segs)
 			new_tso = min_tso_segs;
 		new_tso *= maxseg;
 	} else if (bbr->rc_no_pacing) {
 		new_tso = (PACE_MAX_IP_BYTES / maxseg) * maxseg;
 	} else if (bw <= bbr->r_ctl.bbr_cross_over) {
 		/*
 		 * Calculate the worse case b/w TSO if we are inserting no
 		 * more than a delay_target number of TSO's.
 		 */
 		uint32_t tso_len, min_tso;
 
 		tso_len = bbr_get_pacing_length(bbr, BBR_UNIT, bbr->r_ctl.bbr_hptsi_segments_delay_tar, bw);
 		if (tso_len > maxseg) {
 			new_tso = tso_len / maxseg;
 			if (new_tso > bbr->r_ctl.bbr_hptsi_segments_max)
 				new_tso = bbr->r_ctl.bbr_hptsi_segments_max;
 			new_tso *= maxseg;
 		} else {
 			/*
 			 * less than a full sized frame yikes.. long rtt or
 			 * low bw?
 			 */
 			min_tso = bbr_minseg(bbr);
 			if ((tso_len > min_tso) && (bbr_all_get_min == 0))
 				new_tso = rounddown(tso_len, min_tso);
 			else
 				new_tso = min_tso;
 		}
 	} else if (bw > FIVETWELVE_MBPS) {
 		/*
 		 * This guy is so fast b/w wise that we can TSO as large as
 		 * possible of segments that the NIC will allow.
 		 */
 		new_tso = rounddown(PACE_MAX_IP_BYTES, maxseg);
 	} else {
 		/*
 		 * This formula is based on attempting to send a segment or
 		 * more every bbr_hptsi_per_second. The default is 1000
 		 * which means you are targeting what you can send every 1ms
 		 * based on the peers bw.
 		 *
 		 * If the number drops to say 500, then you are looking more
 		 * at 2ms and you will raise how much we send in a single
 		 * TSO thus saving CPU (less bbr_output_wtime() calls). The
 		 * trade off of course is you will send more at once and
 		 * thus tend to clump up the sends into larger "bursts"
 		 * building a queue.
 		 */
 		bw /= bbr->r_ctl.bbr_hptsi_per_second;
 		new_tso = roundup(bw, (uint64_t)maxseg);
 		/*
 		 * Gate the floor to match what our lower than 48Mbps
 		 * algorithm does. The ceiling (bbr_hptsi_segments_max) thus
 		 * becomes the floor for this calculation.
 		 */
 		if (new_tso < (bbr->r_ctl.bbr_hptsi_segments_max * maxseg))
 			new_tso = (bbr->r_ctl.bbr_hptsi_segments_max * maxseg);
 	}
 	/* Apply the configured floor, the IP limit and the utter maximum. */
 	if (bbr->r_ctl.bbr_hptsi_segments_floor && (new_tso < (maxseg * bbr->r_ctl.bbr_hptsi_segments_floor)))
 		new_tso = maxseg * bbr->r_ctl.bbr_hptsi_segments_floor;
 	if (new_tso > PACE_MAX_IP_BYTES)
 		new_tso = rounddown(PACE_MAX_IP_BYTES, maxseg);
 	/* Enforce an utter maximum. */
 	if (bbr->r_ctl.bbr_utter_max && (new_tso > (bbr->r_ctl.bbr_utter_max * maxseg))) {
 		new_tso = bbr->r_ctl.bbr_utter_max * maxseg;
 	}
 	if (old_tso != new_tso) {
 		/* Only log changes */
 		bbr_log_type_tsosize(bbr, cts, new_tso, 0, old_tso, maxseg, 0);
 		bbr->r_ctl.rc_pace_max_segs = new_tso;
 	}
 	/* Let hardware pacing (if any) adjust the result. */
 	bbr_adjust_for_hw_pacing(bbr, cts);
 }
 
 /*
  * Record a transmitted segment in the connection's send map.
  *
  * New data (seq_out equal to snd_max) gets a freshly allocated
  * bbr_sendmap entry appended to both r_ctl.rc_map and r_ctl.rc_tmap.
  * A retransmission is matched against an existing entry -- via hintrsm,
  * the cached r_ctl.rc_next, or a walk of rc_map -- and that entry is
  * updated, being split first when the send starts inside it.
  *
  * Nothing is recorded when err is non-zero, when the segment carries
  * RST (in which case *abandon is set to 1), when the segment lies
  * entirely at or below snd_una, or when the resulting length is zero.
  *
  * bbr      - the connection's BBR state.
  * tp       - the TCP control block; its inpcb write lock is asserted.
  * len      - payload length; one is added for a SYN at iss and for a
  *            FIN when hintrsm is NULL.
  * seq_out  - starting sequence number of the send.
  * th_flags - TCP header flags of the segment.
  * err      - result of the send; non-zero means do not record.
  * cts      - current timestamp, stored as the send time.
  * abandon  - out: set to 1 when a RST was sent.
  * hintrsm  - optional entry the caller believes is being resent.
  *
  * The to, mb, delay_calc and sb parameters are not referenced in this
  * function.
  */
 static void
 bbr_log_output(struct tcp_bbr *bbr, struct tcpcb *tp, struct tcpopt *to, int32_t len,
     uint32_t seq_out, uint16_t th_flags, int32_t err, uint32_t cts,
     struct mbuf *mb, int32_t * abandon, struct bbr_sendmap *hintrsm, uint32_t delay_calc,
     struct sockbuf *sb)
 {
 
 	struct bbr_sendmap *rsm, *nrsm;
 	register uint32_t snd_max, snd_una;
 	uint32_t pacing_time;
 	/*
 	 * Add to the RACK log of packets in flight or retransmitted. If
 	 * there is a TS option we will use the TS echoed, if not we will
 	 * grab a TS.
 	 *
 	 * Retransmissions will increment the count and move the ts to its
 	 * proper place. Note that if options do not include TS's then we
 	 * won't be able to effectively use the ACK for an RTT on a retran.
 	 *
 	 * Notes about r_start and r_end. Lets consider a send starting at
 	 * sequence 1 for 10 bytes. In such an example the r_start would be
 	 * 1 (starting sequence) but the r_end would be r_start+len i.e. 11.
 	 * This means that r_end is actually the first sequence for the next
 	 * slot (11).
 	 *
 	 */
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 	if (err) {
 		/*
 		 * We don't log errors -- we could but snd_max does not
 		 * advance in this case either.
 		 */
 		return;
 	}
 	if (th_flags & TH_RST) {
 		/*
 		 * We don't log resets and we return immediately from
 		 * sending
 		 */
 		*abandon = 1;
 		return;
 	}
 	snd_una = tp->snd_una;
 	if (th_flags & (TH_SYN | TH_FIN) && (hintrsm == NULL)) {
 		/*
 		 * The call to bbr_log_output is made before bumping
 		 * snd_max. This means we can record one extra byte on a SYN
 		 * or FIN if seq_out is adding more on and a FIN is present
 		 * (and we are not resending).
 		 */
 		if ((th_flags & TH_SYN) && (tp->iss == seq_out))
 			len++;
 		if (th_flags & TH_FIN)
 			len++;
 	}
 	if (SEQ_LEQ((seq_out + len), snd_una)) {
 		/* Are sending an old segment to induce an ack (keep-alive)? */
 		return;
 	}
 	if (SEQ_LT(seq_out, snd_una)) {
 		/* huh? should we panic? */
 		/* Trim the part below snd_una so only un-acked data is logged. */
 		uint32_t end;
 
 		end = seq_out + len;
 		seq_out = snd_una;
 		len = end - seq_out;
 	}
 	snd_max = tp->snd_max;
 	if (len == 0) {
 		/* We don't log zero window probes */
 		return;
 	}
 	pacing_time = bbr_get_pacing_delay(bbr, bbr->r_ctl.rc_bbr_hptsi_gain, len, cts, 1);
 	/* First question is it a retransmission? */
 	if (seq_out == snd_max) {
 		/*
 		 * New data.  The "again" label is also reached from the end
 		 * of the function when a retransmission ran on into new
 		 * data.
 		 */
 again:
 		rsm = bbr_alloc(bbr);
 		if (rsm == NULL) {
 			/* No entry available; the send goes unrecorded. */
 			return;
 		}
 		rsm->r_flags = 0;
 		if (th_flags & TH_SYN)
 			rsm->r_flags |= BBR_HAS_SYN;
 		if (th_flags & TH_FIN)
 			rsm->r_flags |= BBR_HAS_FIN;
 		rsm->r_tim_lastsent[0] = cts;
 		rsm->r_rtr_cnt = 1;
 		rsm->r_rtr_bytes = 0;
 		rsm->r_start = seq_out;
 		rsm->r_end = rsm->r_start + len;
 		rsm->r_dupack = 0;
 		/* Snapshot the delivery state at send time. */
 		rsm->r_delivered = bbr->r_ctl.rc_delivered;
 		rsm->r_pacing_delay = pacing_time;
 		rsm->r_ts_valid = bbr->rc_ts_valid;
 		if (bbr->rc_ts_valid)
 			rsm->r_del_ack_ts = bbr->r_ctl.last_inbound_ts;
 		rsm->r_del_time = bbr->r_ctl.rc_del_time;
 		if (bbr->r_ctl.r_app_limited_until)
 			rsm->r_app_limited = 1;
 		else
 			rsm->r_app_limited = 0;
 		rsm->r_first_sent_time = bbr_get_earliest_send_outstanding(bbr, rsm, cts);
 		rsm->r_flight_at_send = ctf_flight_size(bbr->rc_tp,
 						(bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes));
 		/*
 		 * Here we must also add in this rsm since snd_max
 		 * is updated after we return from a new send.
 		 */
 		rsm->r_flight_at_send += len;
 		TAILQ_INSERT_TAIL(&bbr->r_ctl.rc_map, rsm, r_next);
 		TAILQ_INSERT_TAIL(&bbr->r_ctl.rc_tmap, rsm, r_tnext);
 		rsm->r_in_tmap = 1;
 		if (bbr->rc_bbr_state == BBR_STATE_PROBE_BW)
 			rsm->r_bbr_state = bbr_state_val(bbr);
 		else
 			rsm->r_bbr_state = 8;
 		/* Note whether the pacing gain was above, below or at unity. */
 		if (bbr->r_ctl.rc_bbr_hptsi_gain > BBR_UNIT) {
 			rsm->r_is_gain = 1;
 			rsm->r_is_drain = 0;
 		} else if (bbr->r_ctl.rc_bbr_hptsi_gain < BBR_UNIT) {
 			rsm->r_is_drain = 1;
 			rsm->r_is_gain = 0;
 		} else {
 			rsm->r_is_drain = 0;
 			rsm->r_is_gain = 0;
 		}
 		return;
 	}
 	/*
 	 * If we reach here its a retransmission and we need to find it.
 	 */
 more:
 	if (hintrsm && (hintrsm->r_start == seq_out)) {
 		/* The caller's hint is consumed so it is tried only once. */
 		rsm = hintrsm;
 		hintrsm = NULL;
 	} else if (bbr->r_ctl.rc_next) {
 		/* We have a hint from a previous run */
 		rsm = bbr->r_ctl.rc_next;
 	} else {
 		/* No hints sorry */
 		rsm = NULL;
 	}
 	if ((rsm) && (rsm->r_start == seq_out)) {
 		/*
 		 * We used rc_next or hintrsm  to retransmit, hopefully the
 		 * likely case.
 		 */
 		seq_out = bbr_update_entry(tp, bbr, rsm, cts, &len, pacing_time);
 		if (len == 0) {
 			return;
 		} else {
 			/* More bytes remain; look for the next entry. */
 			goto more;
 		}
 	}
 	/* Ok it was not the last pointer go through it the hard way. */
 	TAILQ_FOREACH(rsm, &bbr->r_ctl.rc_map, r_next) {
 		if (rsm->r_start == seq_out) {
 			seq_out = bbr_update_entry(tp, bbr, rsm, cts, &len, pacing_time);
 			bbr->r_ctl.rc_next = TAILQ_NEXT(rsm, r_next);
 			if (len == 0) {
 				return;
 			} else {
 				continue;
 			}
 		}
 		if (SEQ_GEQ(seq_out, rsm->r_start) && SEQ_LT(seq_out, rsm->r_end)) {
 			/* Transmitted within this piece */
 			/*
 			 * Ok we must split off the front and then let the
 			 * update do the rest
 			 */
 			nrsm = bbr_alloc_full_limit(bbr);
 			if (nrsm == NULL) {
 				/* Cannot split; update the whole entry instead. */
 				bbr_update_rsm(tp, bbr, rsm, cts, pacing_time);
 				return;
 			}
 			/*
 			 * copy rsm to nrsm and then trim the front of rsm
 			 * to not include this part.
 			 */
 			bbr_clone_rsm(bbr, nrsm, rsm, seq_out);
 			TAILQ_INSERT_AFTER(&bbr->r_ctl.rc_map, rsm, nrsm, r_next);
 			if (rsm->r_in_tmap) {
 				TAILQ_INSERT_AFTER(&bbr->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
 				nrsm->r_in_tmap = 1;
 			}
 			/* The front piece no longer ends the stream. */
 			rsm->r_flags &= (~BBR_HAS_FIN);
 			seq_out = bbr_update_entry(tp, bbr, nrsm, cts, &len, pacing_time);
 			if (len == 0) {
 				return;
 			}
 		}
 	}
 	/*
 	 * Hmm not found in map did they retransmit both old and on into the
 	 * new?
 	 */
 	if (seq_out == tp->snd_max) {
 		goto again;
 	} else if (SEQ_LT(seq_out, tp->snd_max)) {
 		/* Below snd_max yet unmapped: only diagnosed under BBR_INVARIANTS. */
 #ifdef BBR_INVARIANTS
 		printf("seq_out:%u len:%d snd_una:%u snd_max:%u -- but rsm not found?\n",
 		    seq_out, len, tp->snd_una, tp->snd_max);
 		printf("Starting Dump of all rack entries\n");
 		TAILQ_FOREACH(rsm, &bbr->r_ctl.rc_map, r_next) {
 			printf("rsm:%p start:%u end:%u\n",
 			    rsm, rsm->r_start, rsm->r_end);
 		}
 		printf("Dump complete\n");
 		panic("seq_out not found rack:%p tp:%p",
 		    bbr, tp);
 #endif
 	} else {
 #ifdef BBR_INVARIANTS
 		/*
 		 * Hmm beyond sndmax? (only if we are using the new rtt-pack
 		 * flag)
 		 */
 		panic("seq_out:%u(%d) is beyond snd_max:%u tp:%p",
 		    seq_out, len, tp->snd_max, tp);
 #endif
 	}
 }
 
 /*
  * Collapse the timeout back now that the cum-ack has moved: zero the
  * control block's t_rxtshift and t_softerror.  The bbr and rtt
  * arguments are not used.
  */
 static void
 bbr_collapse_rtt(struct tcpcb *tp, struct tcp_bbr *bbr, int32_t rtt)
 {
 	tp->t_softerror = 0;
 	tp->t_rxtshift = 0;
 }
 
 /*
  * Stage an RTT sample in the BBR control block and mark it valid.
  *
  * rtt_usecs     - stored as r_ctl.cur_rtt.
  * rsm_send_time - stored as r_ctl.cur_rtt_send_time, but only when
  *                 non-zero; a zero leaves the previous value alone.
  * tsin          - stored as r_ctl.ts_in.
  *
  * The r_start argument is not used.
  */
 static void
 tcp_bbr_xmit_timer(struct tcp_bbr *bbr, uint32_t rtt_usecs, uint32_t rsm_send_time, uint32_t r_start, uint32_t tsin)
 {
 	if (rsm_send_time != 0) {
 		bbr->r_ctl.cur_rtt_send_time = rsm_send_time;
 	}
 	bbr->r_ctl.ts_in = tsin;
 	bbr->r_ctl.cur_rtt = rtt_usecs;
 	bbr->rtt_valid = 1;
 }
 
 static void
 bbr_make_timestamp_determination(struct tcp_bbr *bbr)
 {
 	/**
 	 * We have in our bbr control:
 	 * 1) The timestamp we started observing cum-acks (bbr->r_ctl.bbr_ts_check_tstmp).
 	 * 2) Our timestamp indicating when we sent that packet (bbr->r_ctl.rsm->bbr_ts_check_our_cts).
 	 * 3) The current timestamp that just came in (bbr->r_ctl.last_inbound_ts)
 	 * 4) The time that the packet that generated that ack was sent (bbr->r_ctl.cur_rtt_send_time)
 	 *
 	 * Now we can calculate the time between the sends by doing:
 	 *
 	 * delta = bbr->r_ctl.cur_rtt_send_time - bbr->r_ctl.bbr_ts_check_our_cts
 	 *
 	 * And the peer's time between receiving them by doing:
 	 *
 	 * peer_delta = bbr->r_ctl.last_inbound_ts - bbr->r_ctl.bbr_ts_check_tstmp
 	 *
 	 * We want to figure out if the timestamp values are in msec, 10msec or usec.
 	 * We also may find that we can't use the timestamps if say we see
 	 * that the peer_delta indicates that though we may have taken 10ms to
 	 * pace out the data, it only saw 1ms between the two packets. This would
 	 * indicate that somewhere on the path is a batching entity that is giving
 	 * out time-slices of the actual b/w. This would mean we could not use
 	 * reliably the peers timestamps.
 	 *
 	 * We expect delta > peer_delta initially. Until we figure out the
 	 * timestamp difference which we will store in bbr->r_ctl.bbr_peer_tsratio.
 	 * If we place 1000 there then its a ms vs our usec. If we place 10000 there
 	 * then its 10ms vs our usec. If the peer is running a usec clock we would
 	 * put a 1 there. If the value is faster then ours, we will disable the
 	 * use of timestamps (though we could revist this later if we find it to be not
 	 * just an isolated one or two flows)).
 	 *
 	 * To detect the batching middle boxes we will come up with our compensation and
 	 * if with it in place, we find the peer is drastically off (by some margin) in
 	 * the smaller direction, then we will assume the worst case and disable use of timestamps.
 	 *
 	 */
 	uint64_t delta, peer_delta, delta_up;
 
 	delta = bbr->r_ctl.cur_rtt_send_time - bbr->r_ctl.bbr_ts_check_our_cts;
 	if (delta < bbr_min_usec_delta) {
 		/*
 		 * Have not seen a min amount of time
 		 * between our send times so we can
 		 * make a determination of the timestamp
 		 * yet.
 		 */
 		return;
 	}
 	peer_delta = bbr->r_ctl.last_inbound_ts - bbr->r_ctl.bbr_ts_check_tstmp;
 	if (peer_delta < bbr_min_peer_delta) {
 		/*
 		 * We may have enough in the form of
 		 * our delta but the peers number
 		 * has not changed that much. It could
 		 * be its clock ratio is such that
 		 * we need more data (10ms tick) or
 		 * there may be other compression scenarios
 		 * going on. In any event we need the
 		 * spread to be larger.
 		 */
 		return;
 	}
 	/* Ok lets first see which way our delta is going */
 	if (peer_delta > delta) {
 		/* Very unlikely, the peer without
 		 * compensation shows that it saw
 		 * the two sends arrive further apart
 		 * then we saw then in micro-seconds.
 		 */
 		if (peer_delta < (delta + ((delta * (uint64_t)1000)/ (uint64_t)bbr_delta_percent))) {
 			/* well it looks like the peer is a micro-second clock. */
 			bbr->rc_ts_clock_set = 1;
 			bbr->r_ctl.bbr_peer_tsratio = 1;
 		} else {
 			bbr->rc_ts_cant_be_used = 1;
 			bbr->rc_ts_clock_set = 1;
 		}
 		return;
 	}
 	/* Ok we know that the peer_delta is smaller than our send distance */
 	bbr->rc_ts_clock_set = 1;
 	/* First question is it within the percentage that they are using usec time? */
 	delta_up = (peer_delta * 1000) / (uint64_t)bbr_delta_percent;
 	if ((peer_delta + delta_up) >= delta) {
 		/* Its a usec clock */
 		bbr->r_ctl.bbr_peer_tsratio = 1;
 		bbr_log_tstmp_validation(bbr, peer_delta, delta);
 		return;
 	}
 	/* Ok if not usec, what about 10usec (though unlikely)? */
 	delta_up = (peer_delta * 1000 * 10) / (uint64_t)bbr_delta_percent;
 	if (((peer_delta * 10) + delta_up) >= delta) {
 		bbr->r_ctl.bbr_peer_tsratio = 10;
 		bbr_log_tstmp_validation(bbr, peer_delta, delta);
 		return;
 	}
 	/* And what about 100usec (though again unlikely)? */
 	delta_up = (peer_delta * 1000 * 100) / (uint64_t)bbr_delta_percent;
 	if (((peer_delta * 100) + delta_up) >= delta) {
 		bbr->r_ctl.bbr_peer_tsratio = 100;
 		bbr_log_tstmp_validation(bbr, peer_delta, delta);
 		return;
 	}
 	/* And how about 1 msec (the most likely one)? */
 	delta_up = (peer_delta * 1000 * 1000) / (uint64_t)bbr_delta_percent;
 	if (((peer_delta * 1000) + delta_up) >= delta) {
 		bbr->r_ctl.bbr_peer_tsratio = 1000;
 		bbr_log_tstmp_validation(bbr, peer_delta, delta);
 		return;
 	}
 	/* Ok if not msec could it be 10 msec? */
 	delta_up = (peer_delta * 1000 * 10000) / (uint64_t)bbr_delta_percent;
 	if (((peer_delta * 10000) + delta_up) >= delta) {
 		bbr->r_ctl.bbr_peer_tsratio = 10000;
 		return;
 	}
 	/* If we fall down here the clock tick so slowly we can't use it */
 	bbr->rc_ts_cant_be_used = 1;
 	bbr->r_ctl.bbr_peer_tsratio = 0;
 	bbr_log_tstmp_validation(bbr, peer_delta, delta);
 }
 
 /*
  * Commit the RTT sample staged by tcp_bbr_xmit_timer(): optionally
  * force-feed the rttProp filter, track the lowest RTT, run peer
  * timestamp-granularity detection, then fold the sample into
  * srtt/rttvar and recompute the retransmit timeout.
  *
  * Returns immediately when no valid sample is pending.
  */
 static void
 tcp_bbr_xmit_timer_commit(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t cts)
 {
 	int32_t adj;
 	int32_t sample_ticks;
 	uint32_t sample, ts_echo, srtt_usec;
 
 	if (bbr->rtt_valid == 0) {
 		/* No valid sample */
 		return;
 	}
 	sample = bbr->r_ctl.cur_rtt;
 	ts_echo = bbr->r_ctl.ts_in;
 
 	if (bbr->rc_prtt_set_ts) {
 		/*
 		 * An entry into PROBE_RTT asked us to force the rttProp
 		 * filter to this sample, so the filter expiry and the
 		 * PROBE_RTT entry stay in sync.  (Google keeps a single
 		 * value and simply overwrites it on entering probe-rtt.)
 		 */
 		uint32_t filt_rtt;
 
 		bbr->rc_prtt_set_ts = 0;
 		filt_rtt = get_filter_value_small(&bbr->r_ctl.rc_rttprop);
 		if (sample <= filt_rtt)
 			apply_filter_min_small(&bbr->r_ctl.rc_rttprop, sample, cts);
 		else
 			filter_increase_by_small(&bbr->r_ctl.rc_rttprop, (sample - filt_rtt), cts);
 	}
 	if (bbr->rc_ack_was_delayed)
 		sample += bbr->r_ctl.rc_ack_hdwr_delay;
 
 	if (sample < bbr->r_ctl.rc_lowest_rtt)
 		bbr->r_ctl.rc_lowest_rtt = sample;
 	bbr_log_rtt_sample(bbr, sample, ts_echo);
 	if (bbr->r_init_rtt) {
 		/*
 		 * The initial rtt is not trusted; drop it so this sample
 		 * becomes the first real measurement.
 		 */
 		bbr->r_init_rtt = 0;
 		tp->t_srtt = 0;
 	}
 	if ((bbr->rc_ts_clock_set == 0) && bbr->rc_ts_valid) {
 		/*
 		 * The unit of the peer's timestamp clock is still
 		 * unknown.  It takes consecutive cum-acks to work it out.
 		 */
 		if (!bbr->rc_ack_is_cumack) {
 			/* Not consecutive: throw away the reference point. */
 			bbr->rc_ts_data_set = 0;
 		} else if (bbr->rc_ts_data_set) {
 			/* We have a reference point, attempt a decision. */
 			bbr_make_timestamp_determination(bbr);
 		} else {
 			/* Record the reference point. */
 			bbr->rc_ts_data_set = 1;
 			bbr->r_ctl.bbr_ts_check_tstmp = bbr->r_ctl.last_inbound_ts;
 			bbr->r_ctl.bbr_ts_check_our_cts = bbr->r_ctl.cur_rtt_send_time;
 		}
 	}
 	/* Convert to ticks, rounding up, with a minimum of one tick. */
 	sample_ticks = USEC_2_TICKS((sample + (USECS_IN_MSEC - 1)));
 	if (sample_ticks == 0)
 		sample_ticks = 1;
 	if (tp->t_srtt == 0) {
 		/*
 		 * No rtt measurement yet - use the unsmoothed rtt. Set the
 		 * variance to half the rtt (so our first retransmit happens
 		 * at 3*rtt).
 		 */
 		tp->t_srtt = sample_ticks << TCP_RTT_SHIFT;
 		tp->t_rttvar = sample_ticks << (TCP_RTTVAR_SHIFT - 1);
 		tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
 	} else {
 		/*
 		 * srtt is kept in fixed point, scaled by
 		 * (1 << TCP_RTT_SHIFT).  The sample is moved to origin 0
 		 * and the scaled difference from srtt is added in, which
 		 * is the usual exponential smoothing done in fixed point.
 		 */
 		adj = ((sample_ticks - 1) << TCP_DELTA_SHIFT)
 		    - (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT));
 
 		tp->t_srtt += adj;
 		if (tp->t_srtt <= 0)
 			tp->t_srtt = 1;
 
 		/*
 		 * rttvar is a smoothed mean difference, also fixed point
 		 * (scaled by 1 << TCP_RTTVAR_SHIFT).  Fold the magnitude
 		 * of the adjustment into it the same way.
 		 */
 		if (adj < 0)
 			adj = -adj;
 		adj -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT);
 		tp->t_rttvar += adj;
 		if (tp->t_rttvar <= 0)
 			tp->t_rttvar = 1;
 		if (tp->t_rttbest > tp->t_srtt + tp->t_rttvar)
 			tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
 	}
 	KMOD_TCPSTAT_INC(tcps_rttupdated);
 	tp->t_rttupdated++;
 #ifdef STATS
 	stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT, imax(0, sample_ticks));
 #endif
 	/*
 	 * Set the retransmit timer from the smoothed values, bounded
 	 * below by the larger of the configured minimum RTO and the
 	 * sample plus two ticks, and above by the configured maximum
 	 * RTO.
 	 */
 	TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp),
 	    max(MSEC_2_TICKS(bbr->r_ctl.rc_min_rto_ms), sample_ticks + 2),
 	    MSEC_2_TICKS(((uint32_t)bbr->rc_max_rto_sec) * 1000));
 
 	/*
 	 * We received an ack for a packet that wasn't retransmitted; it is
 	 * probably safe to discard any error indications we've received
 	 * recently.  This isn't quite right, but close enough for now (a
 	 * route might have failed after we sent a segment, and the return
 	 * path might not be symmetrical).
 	 */
 	tp->t_softerror = 0;
 	/* Track the smallest smoothed rtt (in usec) seen in this state. */
 	srtt_usec = (TICKS_2_USEC(bbr->rc_tp->t_srtt) >> TCP_RTT_SHIFT);
 	if (bbr->r_ctl.bbr_smallest_srtt_this_state > srtt_usec)
 		bbr->r_ctl.bbr_smallest_srtt_this_state = srtt_usec;
 }
 
 /*
  * Note that the RTT shrank at time cts, then either force an entry
  * into PROBE_RTT (when forcing is enabled and more than
  * r_ctl.rc_probertt_int has elapsed since we were last there) or run
  * the regular probe-rtt limit check.  The line argument is not used.
  */
 static void
 bbr_set_reduced_rtt(struct tcp_bbr *bbr, uint32_t cts, uint32_t line)
 {
 	bbr->r_ctl.rc_rtt_shrinks = cts;
 	if (!bbr_can_force_probertt ||
 	    !(TSTMP_GT(cts, bbr->r_ctl.last_in_probertt)) ||
 	    ((cts - bbr->r_ctl.last_in_probertt) <= bbr->r_ctl.rc_probertt_int)) {
 		bbr_check_probe_rtt_limits(bbr, cts);
 		return;
 	}
 	/* It has been too long since we were last in probe-rtt. */
 	bbr_enter_probe_rtt(bbr, cts, __LINE__);
 }
 
 /*
  * Apply the current delivery-rate measurement
  * (r_ctl.rc_bbr_cur_del_rate) to the delivery-rate filter and, when
  * the filtered value changed, refresh what depends on it: the hardware
  * pacing rate (if in use), the state target, the TSO size and, when
  * r_recovery_bw is set, the reduced bandwidth.
  */
 static void
 tcp_bbr_commit_bw(struct tcp_bbr *bbr, uint32_t cts)
 {
 	uint64_t prev_bw;
 
 	if (bbr->r_ctl.rc_bbr_cur_del_rate == 0) {
 		/* We never apply a zero measurement */
 		bbr_log_type_bbrupd(bbr, 20, cts, 0, 0,
 		    0, 0, 0, 0, 0, 0);
 		return;
 	}
 	/* Saturating counter of applied measurements. */
 	if (bbr->r_ctl.r_measurement_count < 0xffffffff)
 		bbr->r_ctl.r_measurement_count++;
 	prev_bw = get_filter_value(&bbr->r_ctl.rc_delrate);
 	apply_filter_max(&bbr->r_ctl.rc_delrate, bbr->r_ctl.rc_bbr_cur_del_rate, bbr->r_ctl.rc_pkt_epoch);
 	bbr_log_type_bbrupd(bbr, 21, cts, (uint32_t)prev_bw,
 	    (uint32_t)get_filter_value(&bbr->r_ctl.rc_delrate),
 	    0, 0, 0, 0, 0, 0);
 	if (prev_bw == 0) {
 		/* Going from no estimate to some estimate: size the TSO. */
 		if (get_filter_value(&bbr->r_ctl.rc_delrate))
 			tcp_bbr_tso_size_check(bbr, cts);
 		return;
 	}
 	if (prev_bw == get_filter_value(&bbr->r_ctl.rc_delrate)) {
 		/* The filtered value did not move. */
 		return;
 	}
 	if (bbr->bbr_hdrw_pacing) {
 		/* Possibly apply a new rate to the hardware. */
 		bbr_update_hardware_pacing_rate(bbr, cts);
 	}
 	bbr_set_state_target(bbr, __LINE__);
 	tcp_bbr_tso_size_check(bbr, cts);
 	if (bbr->r_recovery_bw)  {
 		bbr_setup_red_bw(bbr, cts);
 		bbr_log_type_bw_reduce(bbr, BBR_RED_BW_USELRBW);
 	}
 }
 
 /*
  * Take a delivery-rate measurement from an acked send-map entry.
  *
  * The rate is bytes delivered since the entry was sent, over the
  * delivery time elapsed, in bytes per second.  It may be replaced by a
  * rate derived from the peer's timestamps, and is capped by the rate
  * we were sending at.  The result is stored in
  * r_ctl.rc_bbr_cur_del_rate and committed unless the entry was app
  * limited and the rate does not exceed the current filter value.
  *
  * Nothing is done while in persist or when the computed rate is zero.
  *
  * bbr - the connection's BBR state.
  * rsm - the send-map entry being measured.
  * rtt - the RTT for this ack; timestamps are not used below 1000.
  * cts - current timestamp, used for logging and the commit.
  */
 static void
 bbr_nf_measurement(struct tcp_bbr *bbr, struct bbr_sendmap *rsm, uint32_t rtt, uint32_t cts)
 {
 	uint64_t tim, bw, ts_diff, ts_bw;
 	uint32_t delivered;
 
 	if (bbr->rc_in_persist != 0) {
 		/* We measure only when not in persist */
 		return;
 	}
 	/* Elapsed delivery time; fall back to 1 if it did not advance. */
 	tim = TSTMP_GT(bbr->r_ctl.rc_del_time, rsm->r_del_time) ?
 	    (uint64_t)(bbr->r_ctl.rc_del_time - rsm->r_del_time) : 1;
 	/* Translate to a Bytes Per Second */
 	delivered = (bbr->r_ctl.rc_delivered - rsm->r_delivered);
 	bw = ((uint64_t)delivered * (uint64_t)USECS_IN_SECOND) / tim;
 	if (bw == 0) {
 		/* We must have a calculatable amount */
 		return;
 	}
 	/*
 	 * Consider the peer's timestamps, when both ends have valid ones,
 	 * the peer clock ratio is known and usable, and the limit is on.
 	 */
 	if (rsm->r_ts_valid &&
 	    bbr->rc_ts_valid &&
 	    bbr->rc_ts_clock_set &&
 	    (bbr->rc_ts_cant_be_used == 0) &&
 	    bbr->rc_use_ts_limit) {
 		ts_diff = max((bbr->r_ctl.last_inbound_ts - rsm->r_del_ack_ts), 1);
 		ts_diff *= bbr->r_ctl.bbr_peer_tsratio;
 		if ((delivered == 0) ||
 		    (rtt < 1000)) {
 			/* Can't use the ts */
 			bbr_log_type_bbrupd(bbr, 61, cts,
 			    ts_diff,
 			    bbr->r_ctl.last_inbound_ts,
 			    rsm->r_del_ack_ts, 0,
 			    0, 0, 0, delivered);
 		} else {
 			ts_bw = ((uint64_t)delivered * (uint64_t)USECS_IN_SECOND) / ts_diff;
 			bbr_log_type_bbrupd(bbr, 62, cts,
 			    (ts_bw >> 32),
 			    (ts_bw & 0xffffffff), 0, 0,
 			    0, 0, ts_diff, delivered);
 			if ((bbr->ts_can_raise) &&
 			    (ts_bw > bw)) {
 				/* Timestamps say faster and raising is allowed. */
 				bbr_log_type_bbrupd(bbr, 8, cts,
 				    delivered,
 				    ts_diff,
 				    (bw >> 32),
 				    (bw & 0x00000000ffffffff),
 				    0, 0, 0, 0);
 				bw = ts_bw;
 			} else if (ts_bw && (ts_bw < bw)) {
 				/* Timestamps say slower: take the lower rate. */
 				bbr_log_type_bbrupd(bbr, 7, cts,
 				    delivered,
 				    ts_diff,
 				    (bw >> 32),
 				    (bw & 0x00000000ffffffff),
 				    0, 0, 0, 0);
 				bw = ts_bw;
 			}
 		}
 	}
 	if (rsm->r_first_sent_time &&
 	    TSTMP_GT(rsm->r_tim_lastsent[(rsm->r_rtr_cnt -1)],rsm->r_first_sent_time)) {
 		uint64_t send_bw, send_tim;
 
 		/*
 		 * Work out the rate we were sending at: what was in
 		 * flight at this send (this send included) over the time
 		 * from the earliest outstanding send to this one, plus
 		 * this send's pacing delay.  We cannot have delivered
 		 * faster than that.
 		 */
 		send_tim = rsm->r_tim_lastsent[(rsm->r_rtr_cnt -1)] - rsm->r_first_sent_time;
 		send_tim += rsm->r_pacing_delay;
 		send_bw = ((uint64_t)(rsm->r_flight_at_send) *
 		    (uint64_t)USECS_IN_SECOND) / send_tim;
 		if (send_bw < bw) {
 			bbr_log_type_bbrupd(bbr, 6, cts,
 			    delivered,
 			    (uint32_t)send_tim,
 			    (bw >> 32),
 			    (uint32_t)bw,
 			    rsm->r_first_sent_time, 0, (send_bw >> 32),
 			    (uint32_t)send_bw);
 			bw = send_bw;
 		}
 	}
 	/* Publish the measurement. */
 	bbr->r_ctl.rc_bbr_cur_del_rate = bw;
 	if ((rsm->r_app_limited != 0) &&
 	    (bw <= get_filter_value(&bbr->r_ctl.rc_delrate))) {
 		/* App limited and no better than what we have: skip it. */
 		return;
 	}
 	tcp_bbr_commit_bw(bbr, cts);
 	bbr_log_type_bbrupd(bbr, 10, cts, (uint32_t)tim, delivered,
 	    0, 0, 0, 0,  bbr->r_ctl.rc_del_time,  rsm->r_del_time);
 }
 
 /*
  * Google-mode delivery rate measurement for an acknowledged rsm.
  *
  * The rate is the amount delivered since rsm was sent divided by the
  * delivery-time delta, capped by the rate we were sending at.  The
  * result is always stored in rc_bbr_cur_del_rate; it is committed via
  * tcp_bbr_commit_bw() only when the sample is not suppressed and either
  * rsm was not app-limited or the rate exceeds the current filter value.
  * Nothing is done while the connection is in persist.  The rtt argument
  * is not referenced here.
  */
 static void
 bbr_google_measurement(struct tcp_bbr *bbr, struct bbr_sendmap *rsm, uint32_t rtt, uint32_t cts)
 {
 	uint64_t elapsed, rate;
 	uint32_t acked_bytes;
 	int skip_commit;
 
 	/* We measure (and log) only when not in persist */
 	if (bbr->rc_in_persist != 0)
 		return;
 
 	/* Time between the delivery stamp on rsm and the current one */
 	elapsed = TSTMP_GT(bbr->r_ctl.rc_del_time, rsm->r_del_time) ?
 	    (uint64_t)(bbr->r_ctl.rc_del_time - rsm->r_del_time) : 1;
 
 	/* Translate what was delivered in that time to bytes per second */
 	acked_bytes = (bbr->r_ctl.rc_delivered - rsm->r_delivered);
 	rate = (uint64_t)acked_bytes;
 	rate *= (uint64_t)USECS_IN_SECOND;
 	rate /= elapsed;
 
 	skip_commit = 0;
 	if (elapsed < bbr->r_ctl.rc_lowest_rtt) {
 		/* Interval is shorter than the lowest rtt, hold the sample */
 		bbr_log_type_bbrupd(bbr, 99, cts, (uint32_t)elapsed, acked_bytes,
 				    elapsed, bbr->r_ctl.rc_lowest_rtt, 0, 0, 0, 0);
 
 		skip_commit = 1;
 	}
 	/*
 	 * Record the raw rate right away so the trace viewer
 	 * can show it even if the send-rate gate lowers it.
 	 */
 	bbr->r_ctl.rc_bbr_cur_del_rate = rate;
 
 	/* Gate by the sending rate */
 	if (rsm->r_first_sent_time &&
 	    TSTMP_GT(rsm->r_tim_lastsent[(rsm->r_rtr_cnt -1)],rsm->r_first_sent_time)) {
 		uint64_t send_rate, send_span;
 
 		/*
 		 * The send rate is what was in flight when this
 		 * went out, over the time from the first
 		 * outstanding send up to this send plus this
 		 * send's pacing delay.
 		 */
 		send_span = rsm->r_tim_lastsent[(rsm->r_rtr_cnt -1)] - rsm->r_first_sent_time;
 		send_span += rsm->r_pacing_delay;
 		send_rate = (uint64_t)(rsm->r_flight_at_send);
 		send_rate *= (uint64_t)USECS_IN_SECOND;
 		send_rate /= send_span;
 		if (send_rate < rate) {
 			bbr_log_type_bbrupd(bbr, 6, cts,
 					    acked_bytes,
 					    (uint32_t)send_span,
 					    (rate >> 32),
 					    (uint32_t)rate,
 					    rsm->r_first_sent_time, 0, (send_rate >> 32),
 					    (uint32_t)send_rate);
 			rate = send_rate;
 		}
 		/* The send interval overrides the earlier hold decision */
 		skip_commit = ((send_span > elapsed) &&
 		    (send_span < bbr->r_ctl.rc_lowest_rtt));
 		if (skip_commit) {
 			bbr_log_type_bbrupd(bbr, 99, cts, (uint32_t)elapsed, acked_bytes,
 					    (uint32_t)send_span, bbr->r_ctl.rc_lowest_rtt, 0, 0, 0, 0);
 		}
 	}
 	bbr->r_ctl.rc_bbr_cur_del_rate = rate;
 	if (skip_commit)
 		return;
 	if ((rsm->r_app_limited == 0) ||
 	    (rate > get_filter_value(&bbr->r_ctl.rc_delrate))) {
 		tcp_bbr_commit_bw(bbr, cts);
 		bbr_log_type_bbrupd(bbr, 10, cts, (uint32_t)elapsed, acked_bytes,
 				    0, 0, 0, 0, bbr->r_ctl.rc_del_time,  rsm->r_del_time);
 	}
 }
 
 /*
  * Account for the bytes of rsm that were just acknowledged and, when the
  * RTT sample is usable, feed it to the RTT and bandwidth estimators.
  *
  *  bbr           - connection state; rc_delivered and rc_del_time are
  *                  always updated.
  *  rsm           - sendmap entry being acknowledged.  Must not be NULL,
  *                  it is dereferenced unconditionally.
  *  rtt           - RTT sample; 0 marks a retransmission, in which case
  *                  only the delivery accounting above is done.
  *  cts           - current time; stored as the delivery time and handed
  *                  to the filters and logging.
  *  tsin          - passed through to tcp_bbr_xmit_timer().
  *  uts           - only used for logging.
  *  match         - BBR_RTT_BY_* value telling how the sample was matched.
  *  rsm_send_time - passed through to tcp_bbr_xmit_timer().
  *  ack_type      - BBR_CUM_ACKED sets rc_ack_is_cumack, anything else
  *                  clears it.
  *  to            - not referenced here.
  */
 static void
 bbr_update_bbr_info(struct tcp_bbr *bbr, struct bbr_sendmap *rsm, uint32_t rtt, uint32_t cts, uint32_t tsin,
     uint32_t uts, int32_t match, uint32_t rsm_send_time, int32_t ack_type, struct tcpopt *to)
 {
 	uint64_t old_rttprop;
 	int sure_match;
 
 	/* Update our delivery time and amount */
 	bbr->r_ctl.rc_delivered += (rsm->r_end - rsm->r_start);
 	bbr->r_ctl.rc_del_time = cts;
 	if (rtt == 0) {
 		/*
 		 * 0 means its a retransmit, for now we don't use these for
 		 * the rest of BBR.
 		 */
 		return;
 	}
 	/* Exact and timestamp matches are the only "sure" samples */
 	sure_match = ((match == BBR_RTT_BY_EXACTMATCH) ||
 	    (match == BBR_RTT_BY_TIMESTAMP));
 	if ((bbr->rc_use_google == 0) && !sure_match) {
 		/*
 		 * We get a lot of rtt updates, lets not pay attention to
 		 * any that are not an exact match. That way we don't have
 		 * to worry about timestamps and the whole nonsense of
 		 * unsure if its a retransmission etc (if we ever had the
 		 * timestamp fixed to always have the last thing sent this
 		 * would not be an issue).
 		 */
 		return;
 	}
 	if ((bbr_no_retran && bbr->rc_use_google) && !sure_match) {
 		/*
 		 * We only do measurements in google mode
 		 * with bbr_no_retran on for sure things.
 		 */
 		return;
 	}
 	/* Only update srtt if we know by exact match */
 	tcp_bbr_xmit_timer(bbr, rtt, rsm_send_time, rsm->r_start, tsin);
 	bbr->rc_ack_is_cumack = (ack_type == BBR_CUM_ACKED) ? 1 : 0;
 	old_rttprop = bbr_get_rtt(bbr, BBR_RTT_PROP);
 	/*
 	 * Note the following code differs to the original
 	 * BBR spec. It calls for <= not <. However after a
 	 * long discussion in email with Neal, he acknowledged
 	 * that it should be < than so that we will have flows
 	 * going into probe-rtt (we were seeing cases where that
 	 * did not happen and caused ugly things to occur). We
 	 * have added this agreed upon fix to our code base.
 	 */
 	if (rtt < old_rttprop) {
 		/* Update when we last saw a rtt drop */
 		bbr_log_rtt_shrinks(bbr, cts, 0, rtt, __LINE__, BBR_RTTS_NEWRTT, 0);
 		bbr_set_reduced_rtt(bbr, cts, __LINE__);
 	}
 	/*
 	 * rsm was already dereferenced above, so it can not be NULL
 	 * here; log its end directly.
 	 */
 	bbr_log_type_bbrrttprop(bbr, rtt, rsm->r_end, uts, cts,
 	    match, rsm->r_start, rsm->r_flags);
 	apply_filter_min_small(&bbr->r_ctl.rc_rttprop, rtt, cts);
 	if (old_rttprop != bbr_get_rtt(bbr, BBR_RTT_PROP)) {
 		/*
 		 * The RTT-prop moved, reset the target (may be a
 		 * nop for some states).
 		 */
 		bbr_set_state_target(bbr, __LINE__);
 		if (bbr->rc_bbr_state == BBR_STATE_PROBE_RTT) {
 			bbr_log_rtt_shrinks(bbr, cts, 0, 0,
 					    __LINE__, BBR_RTTS_NEW_TARGET, 0);
 		} else if (old_rttprop < bbr_get_rtt(bbr, BBR_RTT_PROP)) {
 			/* It went up */
 			bbr_check_probe_rtt_limits(bbr, cts);
 		}
 	}
 	if ((bbr->rc_use_google == 0) &&
 	    (match == BBR_RTT_BY_TIMESTAMP)) {
 		/*
 		 * We don't do b/w update with
 		 * these since they are not really
 		 * reliable.
 		 */
 		return;
 	}
 	if (bbr->r_ctl.r_app_limited_until &&
 	    (bbr->r_ctl.rc_delivered >= bbr->r_ctl.r_app_limited_until)) {
 		/* We are no longer app-limited */
 		bbr->r_ctl.r_app_limited_until = 0;
 	}
 	/* Hand the sample to the measurement routine for the active mode */
 	if (bbr->rc_use_google) {
 		bbr_google_measurement(bbr, rsm, rtt, cts);
 	} else {
 		bbr_nf_measurement(bbr, rsm, rtt, cts);
 	}
 }
 
 /*
  * The main stack keeps timestamps in milliseconds while
  * bbr works in microseconds. Take a main stack timestamp
  * and hand back the bbr (microsecond) equivalent.
  */
 static uint32_t
 bbr_ts_convert(uint32_t cts)
 {
 	/* Split into whole seconds and the left over milliseconds */
 	uint32_t whole_secs = cts / MS_IN_USEC;
 	uint32_t left_msecs = cts % MS_IN_USEC;
 
 	return ((whole_secs * USECS_IN_SECOND) + (left_msecs * MS_IN_USEC));
 }
 
 /*
  * Work out an RTT sample for rsm from the ACK/SACK that covers it and
  * pass it, together with how it was matched (BBR_RTT_BY_*), to
  * bbr_update_bbr_info().
  *
  * Nothing is done (0 is returned) when rsm is already flagged
  * BBR_ACKED, BBR_WAS_RENEGED or BBR_RXT_CLEARED, or when
  * r_rtt_not_allowed is set.
  *
  * Return 0 if we did not update the RTT time, return
  * 1 if we did. Note that the paths that match an earlier
  * transmission (or that can not tell which transmission was
  * acked) return 0 even though bbr_update_bbr_info() was called.
  *
  * The tp and th_ack arguments are not referenced in this function.
  */
 static int
 bbr_update_rtt(struct tcpcb *tp, struct tcp_bbr *bbr,
     struct bbr_sendmap *rsm, struct tcpopt *to, uint32_t cts, int32_t ack_type, uint32_t th_ack)
 {
 	int32_t i;
 	uint32_t t, uts = 0;
 
 	if ((rsm->r_flags & BBR_ACKED) ||
 	    (rsm->r_flags & BBR_WAS_RENEGED) ||
 	    (rsm->r_flags & BBR_RXT_CLEARED)) {
 		/* Already done */
 		return (0);
 	}
 	if (rsm->r_rtt_not_allowed) {
 		/* Not allowed */
 		return (0);
 	}
 	if (rsm->r_rtr_cnt == 1) {
 		/*
 		 * Only one transmit. Hopefully the normal case.
 		 * The sample is now minus the send time, floored at 1.
 		 */
 		if (TSTMP_GT(cts, rsm->r_tim_lastsent[0]))
 			t = cts - rsm->r_tim_lastsent[0];
 		else
 			t = 1;
 		if ((int)t <= 0)
 			t = 1;
 		bbr->r_ctl.rc_last_rtt = t;
 		bbr_update_bbr_info(bbr, rsm, t, cts, to->to_tsecr, 0,
 				    BBR_RTT_BY_EXACTMATCH, rsm->r_tim_lastsent[0], ack_type, to);
 		return (1);
 	}
 	/*
 	 * More than one transmission. In google mode, when allowed, a
 	 * cum-ack carrying a timestamp echo gives the RTT directly:
 	 * take the difference in ms ticks and convert it to usecs.
 	 */
 	if ((bbr_can_use_ts_for_rtt == 1) &&
 	    (bbr->rc_use_google == 1) &&
 	    (ack_type == BBR_CUM_ACKED) &&
 	    (to->to_flags & TOF_TS) &&
 	    (to->to_tsecr != 0)) {
 		t = tcp_tv_to_mssectick(&bbr->rc_tv) - to->to_tsecr;
 		if (t < 1)
 			t = 1;
 		t *= MS_IN_USEC;
 		bbr_update_bbr_info(bbr, rsm, t, cts, to->to_tsecr, 0,
 				    BBR_RTT_BY_TIMESTAMP,
 				    rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)],
 				    ack_type, to);
 		return (1);
 	}
 	/* The echoed timestamp converted to usecs, for matching/logging */
 	uts = bbr_ts_convert(to->to_tsecr);
 	if ((to->to_flags & TOF_TS) &&
 	    (to->to_tsecr != 0) &&
 	    (ack_type == BBR_CUM_ACKED) &&
 	    ((rsm->r_flags & BBR_OVERMAX) == 0)) {
 		/*
 		 * Now which timestamp does it match? In this block the ACK
 		 * may be coming from a previous transmission.
 		 */
 		uint32_t fudge;
 
 		fudge = BBR_TIMER_FUDGE;
 		for (i = 0; i < rsm->r_rtr_cnt; i++) {
 			/* Is the echo within +/- fudge of this send time? */
 			if ((SEQ_GEQ(uts, (rsm->r_tim_lastsent[i] - fudge))) &&
 			    (SEQ_LEQ(uts, (rsm->r_tim_lastsent[i] + fudge)))) {
 				if (TSTMP_GT(cts, rsm->r_tim_lastsent[i]))
 					t = cts - rsm->r_tim_lastsent[i];
 				else
 					t = 1;
 				if ((int)t <= 0)
 					t = 1;
 				bbr->r_ctl.rc_last_rtt = t;
 				bbr_update_bbr_info(bbr, rsm, t, cts, to->to_tsecr, uts, BBR_RTT_BY_TSMATCHING,
 						    rsm->r_tim_lastsent[i], ack_type, to);
 				if ((i + 1) < rsm->r_rtr_cnt) {
 					/* Likely: it matched an earlier transmission */
 					return (0);
 				} else if (rsm->r_flags & BBR_TLP) {
 					bbr->rc_tlp_rtx_out = 0;
 				}
 				return (1);
 			}
 		}
 		/* Fall through if we can't find a matching timestamp */
 	}
 	/*
 	 * Ok its a SACK block that we retransmitted. or a windows
 	 * machine without timestamps. We can tell nothing from the
 	 * time-stamp since its not there or the time the peer last
 	 * received a segment that moved forward its cum-ack point.
 	 *
 	 * Lets look at the last retransmit and see what we can tell
 	 * (with BBR for space we only keep 2 note we have to keep
 	 * at least 2 so the map can not be condensed more).
 	 */
 	i = rsm->r_rtr_cnt - 1;
 	if (TSTMP_GT(cts, rsm->r_tim_lastsent[i]))
 		t = cts - rsm->r_tim_lastsent[i];
 	else
 		goto not_sure;
 	if (t < bbr->r_ctl.rc_lowest_rtt) {
 		/*
 		 * We retransmitted and the ack came back in less
 		 * than the smallest rtt we have observed in the
 		 * windowed rtt. We most likely did an improper
 		 * retransmit as outlined in 4.2 Step 3 point 2 in
 		 * the rack-draft.
 		 *
 		 * Use the prior transmission to update all the
 		 * information as long as there is only one prior
 		 * transmission.
 		 */
 		if ((rsm->r_flags & BBR_OVERMAX) == 0) {
 #ifdef BBR_INVARIANTS
 			if (rsm->r_rtr_cnt == 1)
 				panic("rsm:%p bbr:%p rsm has overmax and only 1 retranmit flags:%x?", rsm, bbr, rsm->r_flags);
 #endif
 			/* Measure against the transmission before the last */
 			i = rsm->r_rtr_cnt - 2;
 			if (TSTMP_GT(cts, rsm->r_tim_lastsent[i]))
 				t = cts - rsm->r_tim_lastsent[i];
 			else
 				t = 1;
 			bbr_update_bbr_info(bbr, rsm, t, cts, to->to_tsecr, uts, BBR_RTT_BY_EARLIER_RET,
 					    rsm->r_tim_lastsent[i], ack_type, to);
 			return (0);
 		} else {
 			/*
 			 * Too many prior transmissions, just
 			 * updated BBR delivered (an rtt of 0 is
 			 * passed). This is also the target when the
 			 * last send time is not before cts.
 			 */
 not_sure:
 			bbr_update_bbr_info(bbr, rsm, 0, cts, to->to_tsecr, uts,
 					    BBR_RTT_BY_SOME_RETRAN, 0, ack_type, to);
 		}
 	} else {
 		/*
 		 * We retransmitted it and the retransmit did the
 		 * job.
 		 */
 		if (rsm->r_flags & BBR_TLP)
 			bbr->rc_tlp_rtx_out = 0;
 		if ((rsm->r_flags & BBR_OVERMAX) == 0)
 			bbr_update_bbr_info(bbr, rsm, t, cts, to->to_tsecr, uts,
 					    BBR_RTT_BY_THIS_RETRAN, 0, ack_type, to);
 		else
 			bbr_update_bbr_info(bbr, rsm, 0, cts, to->to_tsecr, uts,
 					    BBR_RTT_BY_SOME_RETRAN, 0, ack_type, to);
 		return (1);
 	}
 	return (0);
 }
 
 /*
  * Walk rc_tmap backwards from rsm and set BBR_SACK_PASSED on
  * every un-acked entry that was sent before it. Entries that
  * bbr_is_lost() reports as lost are also marked and counted.
  */
 static void
 bbr_log_sack_passed(struct tcpcb *tp,
     struct tcp_bbr *bbr, struct bbr_sendmap *rsm)
 {
 	struct bbr_sendmap *prior;
 
 	prior = rsm;
 	TAILQ_FOREACH_REVERSE_FROM(prior, &bbr->r_ctl.rc_tmap,
 	    bbr_head, r_tnext) {
 		/*
 		 * The segment being acked itself, and anything
 		 * already ack'd, is left alone.
 		 */
 		if ((prior == rsm) || (prior->r_flags & BBR_ACKED))
 			continue;
 		/*
 		 * Hitting an entry that already carries the mark
 		 * means an earlier pass covered everything from
 		 * here on down, so we can stop.
 		 */
 		if (prior->r_flags & BBR_SACK_PASSED)
 			break;
 		BBR_STAT_INC(bbr_sack_passed);
 		prior->r_flags |= BBR_SACK_PASSED;
 		if ((prior->r_flags & BBR_MARKED_LOST) == 0) {
 			/* Not yet counted as lost, see if it is now */
 			if (bbr_is_lost(bbr, prior, bbr->r_ctl.rc_rcvtime)) {
 				bbr->r_ctl.rc_lost += prior->r_end - prior->r_start;
 				bbr->r_ctl.rc_lost_bytes += prior->r_end - prior->r_start;
 				prior->r_flags |= BBR_MARKED_LOST;
 			}
 		}
 		prior->r_flags &= ~BBR_WAS_SACKPASS;
 	}
 }
 
 /*
  * Apply one SACK block to the send map.
  *
  * The map entry holding sack->start is located (searching from *prsm
  * when it is set), entries are split where the block starts or ends
  * inside them, and every entry covered by the block is marked
  * BBR_ACKED and taken off rc_tmap. Adjacent acked entries are merged
  * at the end. If a split entry can not be allocated the block is
  * handed to sack_filter_reject() and processing stops.
  *
  * On return *prsm holds the entry processing stopped on (may be NULL)
  * and rc_sacklast holds the entry after it.
  *
  * Returns the number of bytes that were
  * newly ack'd by sack blocks.
  */
 static uint32_t
 bbr_proc_sack_blk(struct tcpcb *tp, struct tcp_bbr *bbr, struct sackblk *sack,
     struct tcpopt *to, struct bbr_sendmap **prsm, uint32_t cts)
 {
 	int32_t times = 0;
 	uint32_t start, end, changed = 0;
 	struct bbr_sendmap *rsm, *nrsm;
 	int32_t used_ref = 1;
 	uint8_t went_back = 0, went_fwd = 0;
 
 	start = sack->start;
 	end = sack->end;
 	rsm = *prsm;
 	/* used_ref records (for the stats) if a starting hint was given */
 	if (rsm == NULL)
 		used_ref = 0;
 
 	/* Do we locate the block behind where we last were? */
 	if (rsm && SEQ_LT(start, rsm->r_start)) {
 		went_back = 1;
 		TAILQ_FOREACH_REVERSE_FROM(rsm, &bbr->r_ctl.rc_map, bbr_head, r_next) {
 			if (SEQ_GEQ(start, rsm->r_start) &&
 			    SEQ_LT(start, rsm->r_end)) {
 				goto do_rest_ofb;
 			}
 		}
 	}
 start_at_beginning:
 	went_fwd = 1;
 	/*
 	 * Ok lets locate the block where this guy is fwd from rsm (if its
 	 * set)
 	 */
 	TAILQ_FOREACH_FROM(rsm, &bbr->r_ctl.rc_map, r_next) {
 		if (SEQ_GEQ(start, rsm->r_start) &&
 		    SEQ_LT(start, rsm->r_end)) {
 			break;
 		}
 	}
 do_rest_ofb:
 	if (rsm == NULL) {
 		/*
 		 * This happens when we get duplicate sack blocks with the
 		 * same end. For example SACK 4: 100 SACK 3: 100 The sort
 		 * will not change their location so we would just start at
 		 * the end of the first one and get lost.
 		 */
 		if (tp->t_flags & TF_SENTFIN) {
 			/*
 			 * Check to see if we have not logged the FIN that
 			 * went out.
 			 */
 			nrsm = TAILQ_LAST_FAST(&bbr->r_ctl.rc_map, bbr_sendmap, r_next);
 			if (nrsm && (nrsm->r_end + 1) == tp->snd_max) {
 				/*
 				 * Ok we did not get the FIN logged.
 				 * Grow the last entry by one to cover it.
 				 */
 				nrsm->r_end++;
 				rsm = nrsm;
 				goto do_rest_ofb;
 			}
 		}
 		/* Only one restart from the head of the map is tried */
 		if (times == 1) {
 #ifdef BBR_INVARIANTS
 			panic("tp:%p bbr:%p sack:%p to:%p prsm:%p",
 			    tp, bbr, sack, to, prsm);
 #else
 			goto out;
 #endif
 		}
 		times++;
 		BBR_STAT_INC(bbr_sack_proc_restart);
 		rsm = NULL;
 		goto start_at_beginning;
 	}
 	/* Ok we have an ACK for some piece of rsm */
 	if (rsm->r_start != start) {
 		/*
 		 * Need to split this in two pieces the before and after.
 		 */
 		if (bbr_sack_mergable(rsm, start, end))
 			nrsm = bbr_alloc_full_limit(bbr);
 		else
 			nrsm = bbr_alloc_limit(bbr, BBR_LIMIT_TYPE_SPLIT);
 		if (nrsm == NULL) {
 			/* We could not allocate ignore the sack */
 			struct sackblk blk;
 
 			blk.start = start;
 			blk.end = end;
 			sack_filter_reject(&bbr->r_ctl.bbr_sf, &blk);
 			goto out;
 		}
 		bbr_clone_rsm(bbr, nrsm, rsm, start);
 		TAILQ_INSERT_AFTER(&bbr->r_ctl.rc_map, rsm, nrsm, r_next);
 		if (rsm->r_in_tmap) {
 			TAILQ_INSERT_AFTER(&bbr->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
 			nrsm->r_in_tmap = 1;
 		}
 		rsm->r_flags &= (~BBR_HAS_FIN);
 		/* Continue with the new piece, the one the block starts on */
 		rsm = nrsm;
 	}
 	if (SEQ_GEQ(end, rsm->r_end)) {
 		/*
 		 * The end of this block is either beyond this guy or right
 		 * at this guy.
 		 */
 		if ((rsm->r_flags & BBR_ACKED) == 0) {
 			bbr_update_rtt(tp, bbr, rsm, to, cts, BBR_SACKED, 0);
 			changed += (rsm->r_end - rsm->r_start);
 			bbr->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start);
 			bbr_log_sack_passed(tp, bbr, rsm);
 			if (rsm->r_flags & BBR_MARKED_LOST) {
 				bbr->r_ctl.rc_lost_bytes -= rsm->r_end - rsm->r_start;
 			}
 			/* Is Reordering occurring? */
 			if (rsm->r_flags & BBR_SACK_PASSED) {
 				BBR_STAT_INC(bbr_reorder_seen);
 				bbr->r_ctl.rc_reorder_ts = cts;
 				if (rsm->r_flags & BBR_MARKED_LOST) {
 					bbr->r_ctl.rc_lost -= rsm->r_end - rsm->r_start;
 					if (SEQ_GT(bbr->r_ctl.rc_lt_lost, bbr->r_ctl.rc_lost))
 						/* LT sampling also needs adjustment */
 						bbr->r_ctl.rc_lt_lost = bbr->r_ctl.rc_lost;
 				}
 			}
 			rsm->r_flags |= BBR_ACKED;
 			rsm->r_flags &= ~(BBR_TLP|BBR_WAS_RENEGED|BBR_RXT_CLEARED|BBR_MARKED_LOST);
 			if (rsm->r_in_tmap) {
 				TAILQ_REMOVE(&bbr->r_ctl.rc_tmap, rsm, r_tnext);
 				rsm->r_in_tmap = 0;
 			}
 		}
 		bbr_isit_a_pkt_epoch(bbr, cts, rsm, __LINE__, BBR_SACKED);
 		if (end == rsm->r_end) {
 			/* This block only - done */
 			goto out;
 		}
 		/* There is more not covered by this rsm move on */
 		start = rsm->r_end;
 		nrsm = TAILQ_NEXT(rsm, r_next);
 		rsm = nrsm;
 		times = 0;
 		goto do_rest_ofb;
 	}
 	if (rsm->r_flags & BBR_ACKED) {
 		/* Been here done that */
 		goto out;
 	}
 	/* Ok we need to split off this one at the tail */
 	if (bbr_sack_mergable(rsm, start, end))
 		nrsm = bbr_alloc_full_limit(bbr);
 	else
 		nrsm = bbr_alloc_limit(bbr, BBR_LIMIT_TYPE_SPLIT);
 	if (nrsm == NULL) {
 		/* failed XXXrrs what can we do but lose the sack info? */
 		struct sackblk blk;
 
 		blk.start = start;
 		blk.end = end;
 		sack_filter_reject(&bbr->r_ctl.bbr_sf, &blk);
 		goto out;
 	}
 	/* Clone it */
 	bbr_clone_rsm(bbr, nrsm, rsm, end);
 	/* The sack block does not cover this guy fully */
 	rsm->r_flags &= (~BBR_HAS_FIN);
 	TAILQ_INSERT_AFTER(&bbr->r_ctl.rc_map, rsm, nrsm, r_next);
 	if (rsm->r_in_tmap) {
 		TAILQ_INSERT_AFTER(&bbr->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
 		nrsm->r_in_tmap = 1;
 	}
 	nrsm->r_dupack = 0;
 	/* rsm is now the front piece, the part this block acks */
 	bbr_update_rtt(tp, bbr, rsm, to, cts, BBR_SACKED, 0);
 	bbr_isit_a_pkt_epoch(bbr, cts, rsm, __LINE__, BBR_SACKED);
 	changed += (rsm->r_end - rsm->r_start);
 	bbr->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start);
 	bbr_log_sack_passed(tp, bbr, rsm);
 	/* Is Reordering occurring? */
 	if (rsm->r_flags & BBR_MARKED_LOST) {
 		bbr->r_ctl.rc_lost_bytes -= rsm->r_end - rsm->r_start;
 	}
 	if (rsm->r_flags & BBR_SACK_PASSED) {
 		BBR_STAT_INC(bbr_reorder_seen);
 		bbr->r_ctl.rc_reorder_ts = cts;
 		if (rsm->r_flags & BBR_MARKED_LOST) {
 			bbr->r_ctl.rc_lost -= rsm->r_end - rsm->r_start;
 			if (SEQ_GT(bbr->r_ctl.rc_lt_lost, bbr->r_ctl.rc_lost))
 				/* LT sampling also needs adjustment */
 				bbr->r_ctl.rc_lt_lost = bbr->r_ctl.rc_lost;
 		}
 	}
 	rsm->r_flags &= ~(BBR_TLP|BBR_WAS_RENEGED|BBR_RXT_CLEARED|BBR_MARKED_LOST);
 	rsm->r_flags |= BBR_ACKED;
 	if (rsm->r_in_tmap) {
 		TAILQ_REMOVE(&bbr->r_ctl.rc_tmap, rsm, r_tnext);
 		rsm->r_in_tmap = 0;
 	}
 out:
 	if (rsm && (rsm->r_flags & BBR_ACKED)) {
 		/*
 		 * Now can we merge this newly acked
 		 * block with either the previous or
 		 * next block?
 		 */
 		nrsm = TAILQ_NEXT(rsm, r_next);
 		if (nrsm &&
 		    (nrsm->r_flags & BBR_ACKED)) {
 			/* yep this and next can be merged */
 			rsm = bbr_merge_rsm(bbr, rsm, nrsm);
 		}
 		/* Now what about the previous? */
 		nrsm = TAILQ_PREV(rsm, bbr_head, r_next);
 		if (nrsm &&
 		    (nrsm->r_flags & BBR_ACKED)) {
 			/* yep the previous and this can be merged */
 			rsm = bbr_merge_rsm(bbr, nrsm, rsm);
 		}
 	}
 	/* Statistics on how the search for the block went */
 	if (used_ref == 0) {
 		BBR_STAT_INC(bbr_sack_proc_all);
 	} else {
 		BBR_STAT_INC(bbr_sack_proc_short);
 	}
 	if (went_fwd && went_back) {
 		BBR_STAT_INC(bbr_sack_search_both);
 	} else if (went_fwd) {
 		BBR_STAT_INC(bbr_sack_search_fwd);
 	} else if (went_back) {
 		BBR_STAT_INC(bbr_sack_search_back);
 	}
 	/* Save off where the next seq is */
 	if (rsm)
 		bbr->r_ctl.rc_sacklast = TAILQ_NEXT(rsm, r_next);
 	else
 		bbr->r_ctl.rc_sacklast = NULL;
 	*prsm = rsm;
 	return (changed);
 }
 
 /*
  * The peer has reneged on data it previously sacked. Starting at rsm,
  * walk rc_map forward over the run of entries flagged BBR_ACKED and
  * undo the sack marking on each one:
  *  - its bytes are removed from rc_sacked (and from the lost counters
  *    when it was BBR_MARKED_LOST),
  *  - the acked/sack-passed/lost flags are cleared and BBR_WAS_RENEGED
  *    and BBR_RXT_CLEARED are set,
  *  - it is linked back into rc_tmap, the first at the head and each
  *    following one right after the previous.
  * Finally the sack filter is cleared using th_ack.
  */
 static inline void
 bbr_peer_reneges(struct tcp_bbr *bbr, struct bbr_sendmap *rsm, tcp_seq th_ack)
 {
 	struct bbr_sendmap *tmap;
 
 	BBR_STAT_INC(bbr_reneges_seen);
 	/* tmap tracks the last entry we put back into rc_tmap */
 	tmap = NULL;
 	while (rsm && (rsm->r_flags & BBR_ACKED)) {
 		/* Its no longer sacked, mark it so */
 		uint32_t oflags;
 
 		bbr->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start);
 #ifdef BBR_INVARIANTS
 		if (rsm->r_in_tmap) {
 			panic("bbr:%p rsm:%p flags:0x%x in tmap?",
 			    bbr, rsm, rsm->r_flags);
 		}
 #endif
 		/* Keep the flags as they were for the log entry below */
 		oflags = rsm->r_flags;
 		if (rsm->r_flags & BBR_MARKED_LOST) {
 			bbr->r_ctl.rc_lost -= rsm->r_end - rsm->r_start;
 			bbr->r_ctl.rc_lost_bytes -= rsm->r_end - rsm->r_start;
 			if (SEQ_GT(bbr->r_ctl.rc_lt_lost, bbr->r_ctl.rc_lost))
 				/* LT sampling also needs adjustment */
 				bbr->r_ctl.rc_lt_lost = bbr->r_ctl.rc_lost;
 		}
 		rsm->r_flags &= ~(BBR_ACKED | BBR_SACK_PASSED | BBR_WAS_SACKPASS | BBR_MARKED_LOST);
 		rsm->r_flags |= (BBR_WAS_RENEGED | BBR_RXT_CLEARED);
 		bbr_log_type_rsmclear(bbr, bbr->r_ctl.rc_rcvtime, rsm, oflags, __LINE__);
 		/* Rebuild it into our tmap */
 		if (tmap == NULL) {
 			TAILQ_INSERT_HEAD(&bbr->r_ctl.rc_tmap, rsm, r_tnext);
 		} else {
 			TAILQ_INSERT_AFTER(&bbr->r_ctl.rc_tmap, tmap, rsm, r_tnext);
 		}
 		tmap = rsm;
 		tmap->r_in_tmap = 1;
 		/*
 		 * XXXrrs Delivered? Should we do anything here?
 		 *
 		 * Of course we don't on a rxt timeout so maybe its ok that
 		 * we don't?
 		 *
 		 * For now lets not.
 		 */
 		rsm = TAILQ_NEXT(rsm, r_next);
 	}
 	/*
 	 * Now lets possibly clear the sack filter so we start recognizing
 	 * sacks that cover this area.
 	 */
 	sack_filter_clear(&bbr->r_ctl.bbr_sf, th_ack);
 }
 
 /*
  * Retire the SYN from the head of the send map. If the first
  * entry carries BBR_HAS_SYN and spans at most one sequence
  * number it is unlinked, (when "to" is supplied) used for an
  * RTT update, and freed. A larger entry just has the flag
  * cleared and its start advanced by one.
  */
 static void
 bbr_log_syn(struct tcpcb *tp, struct tcpopt *to)
 {
 	struct tcp_bbr *bbr = (struct tcp_bbr *)tp->t_fb_ptr;
 	uint32_t cts = bbr->r_ctl.rc_rcvtime;
 	struct bbr_sendmap *syn_rsm = TAILQ_FIRST(&bbr->r_ctl.rc_map);
 
 	/* Nothing to do unless the head of the map holds the SYN */
 	if ((syn_rsm == NULL) || ((syn_rsm->r_flags & BBR_HAS_SYN) == 0))
 		return;
 
 	if ((syn_rsm->r_end - syn_rsm->r_start) > 1) {
 		/* There is more (Fast open)? strip out SYN. */
 		syn_rsm->r_flags &= ~BBR_HAS_SYN;
 		syn_rsm->r_start++;
 		return;
 	}
 
 	/* Log out the SYN completely */
 	bbr->r_ctl.rc_holes_rxt -= syn_rsm->r_rtr_bytes;
 	syn_rsm->r_rtr_bytes = 0;
 	TAILQ_REMOVE(&bbr->r_ctl.rc_map, syn_rsm, r_next);
 	if (syn_rsm->r_in_tmap) {
 		TAILQ_REMOVE(&bbr->r_ctl.rc_tmap, syn_rsm, r_tnext);
 		syn_rsm->r_in_tmap = 0;
 	}
 	/* scoot along the marker if it pointed at the SYN */
 	if (bbr->r_ctl.rc_next == syn_rsm)
 		bbr->r_ctl.rc_next = TAILQ_FIRST(&bbr->r_ctl.rc_map);
 	/* The RTT update reads the entry, so it must come before the free */
 	if (to != NULL)
 		bbr_update_rtt(tp, bbr, syn_rsm, to, cts, BBR_CUM_ACKED, 0);
 	bbr_free(bbr, syn_rsm);
 }
 
 /*
  * Returns the number of bytes that were
  * acknowledged by SACK blocks.
  */
 
 static uint32_t
 bbr_log_ack(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th,
     uint32_t *prev_acked)
 {
 	uint32_t changed, last_seq, entered_recovery = 0;
 	struct tcp_bbr *bbr;
 	struct bbr_sendmap *rsm;
 	struct sackblk sack, sack_blocks[TCP_MAX_SACK + 1];
 	register uint32_t th_ack;
 	int32_t i, j, k, new_sb, num_sack_blks = 0;
 	uint32_t cts, acked, ack_point, sack_changed = 0;
 	uint32_t p_maxseg, maxseg, p_acked = 0;
 
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 	if (tcp_get_flags(th) & TH_RST) {
 		/* We don't log resets */
 		return (0);
 	}
 	bbr = (struct tcp_bbr *)tp->t_fb_ptr;
 	cts = bbr->r_ctl.rc_rcvtime;
 
 	rsm = TAILQ_FIRST(&bbr->r_ctl.rc_map);
 	changed = 0;
 	maxseg = tp->t_maxseg - bbr->rc_last_options;
 	p_maxseg = min(bbr->r_ctl.rc_pace_max_segs, maxseg);
 	th_ack = th->th_ack;
 	if (SEQ_GT(th_ack, tp->snd_una)) {
 		acked = th_ack - tp->snd_una;
 		bbr_log_progress_event(bbr, tp, ticks, PROGRESS_UPDATE, __LINE__);
 		bbr->rc_tp->t_acktime = ticks;
 	} else
 		acked = 0;
 	if (SEQ_LEQ(th_ack, tp->snd_una)) {
 		/* Only sent here for sack processing */
 		goto proc_sack;
 	}
 	if (rsm && SEQ_GT(th_ack, rsm->r_start)) {
 		changed = th_ack - rsm->r_start;
 	} else if ((rsm == NULL) && ((th_ack - 1) == tp->iss)) {
 		/*
 		 * For the SYN incoming case we will not have called
 		 * tcp_output for the sending of the SYN, so there will be
 		 * no map. All other cases should probably be a panic.
 		 */
 		if ((to->to_flags & TOF_TS) && (to->to_tsecr != 0)) {
 			/*
 			 * We have a timestamp that can be used to generate
 			 * an initial RTT.
 			 */
 			uint32_t ts, now, rtt;
 
 			ts = bbr_ts_convert(to->to_tsecr);
 			now = bbr_ts_convert(tcp_tv_to_mssectick(&bbr->rc_tv));
 			rtt = now - ts;
 			if (rtt < 1)
 				rtt = 1;
 			bbr_log_type_bbrrttprop(bbr, rtt,
 						tp->iss, 0, cts,
 						BBR_RTT_BY_TIMESTAMP, tp->iss, 0);
 			apply_filter_min_small(&bbr->r_ctl.rc_rttprop, rtt, cts);
 			changed = 1;
 			bbr->r_wanted_output = 1;
 			goto out;
 		}
 		goto proc_sack;
 	} else if (rsm == NULL) {
 		goto out;
 	}
 	if (changed) {
 		/*
 		 * The ACK point is advancing to th_ack, we must drop off
 		 * the packets in the rack log and calculate any eligble
 		 * RTT's.
 		 */
 		bbr->r_wanted_output = 1;
 more:
 		if (rsm == NULL) {
 			if (tp->t_flags & TF_SENTFIN) {
 				/* if we send a FIN we will not hav a map */
 				goto proc_sack;
 			}
 #ifdef BBR_INVARIANTS
 			panic("No rack map tp:%p for th:%p state:%d bbr:%p snd_una:%u snd_max:%u chg:%d\n",
 			    tp,
 			    th, tp->t_state, bbr,
 			    tp->snd_una, tp->snd_max, changed);
 #endif
 			goto proc_sack;
 		}
 	}
 	if (SEQ_LT(th_ack, rsm->r_start)) {
 		/* Huh map is missing this */
 #ifdef BBR_INVARIANTS
 		printf("Rack map starts at r_start:%u for th_ack:%u huh? ts:%d rs:%d bbr:%p\n",
 		    rsm->r_start,
 		    th_ack, tp->t_state,
 		    bbr->r_state, bbr);
 		panic("th-ack is bad bbr:%p tp:%p", bbr, tp);
 #endif
 		goto proc_sack;
 	} else if (th_ack == rsm->r_start) {
 		/* None here to ack */
 		goto proc_sack;
 	}
 	/*
 	 * Clear the dup ack counter, it will
 	 * either be freed or if there is some
 	 * remaining we need to start it at zero.
 	 */
 	rsm->r_dupack = 0;
 	/* Now do we consume the whole thing? */
 	if (SEQ_GEQ(th_ack, rsm->r_end)) {
 		/* Its all consumed. */
 		uint32_t left;
 
 		if (rsm->r_flags & BBR_ACKED) {
 			/*
 			 * It was acked on the scoreboard -- remove it from
 			 * total
 			 */
 			p_acked += (rsm->r_end - rsm->r_start);
 			bbr->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start);
 			if (bbr->r_ctl.rc_sacked == 0)
 				bbr->r_ctl.rc_sacklast = NULL;
 		} else {
 			bbr_update_rtt(tp, bbr, rsm, to, cts, BBR_CUM_ACKED, th_ack);
 			if (rsm->r_flags & BBR_MARKED_LOST) {
 				bbr->r_ctl.rc_lost_bytes -= rsm->r_end - rsm->r_start;
 			}
 			if (rsm->r_flags & BBR_SACK_PASSED) {
 				/*
 				 * There are acked segments ACKED on the
 				 * scoreboard further up. We are seeing
 				 * reordering.
 				 */
 				BBR_STAT_INC(bbr_reorder_seen);
 				bbr->r_ctl.rc_reorder_ts = cts;
 				if (rsm->r_flags & BBR_MARKED_LOST) {
 					bbr->r_ctl.rc_lost -= rsm->r_end - rsm->r_start;
 					if (SEQ_GT(bbr->r_ctl.rc_lt_lost, bbr->r_ctl.rc_lost))
 						/* LT sampling also needs adjustment */
 						bbr->r_ctl.rc_lt_lost = bbr->r_ctl.rc_lost;
 				}
 			}
 			rsm->r_flags &= ~BBR_MARKED_LOST;
 		}
 		bbr->r_ctl.rc_holes_rxt -= rsm->r_rtr_bytes;
 		rsm->r_rtr_bytes = 0;
 		TAILQ_REMOVE(&bbr->r_ctl.rc_map, rsm, r_next);
 		if (rsm->r_in_tmap) {
 			TAILQ_REMOVE(&bbr->r_ctl.rc_tmap, rsm, r_tnext);
 			rsm->r_in_tmap = 0;
 		}
 		if (bbr->r_ctl.rc_next == rsm) {
 			/* scoot along the marker */
 			bbr->r_ctl.rc_next = TAILQ_FIRST(&bbr->r_ctl.rc_map);
 		}
 		bbr_isit_a_pkt_epoch(bbr, cts, rsm, __LINE__, BBR_CUM_ACKED);
 		/* Adjust the packet counts */
 		left = th_ack - rsm->r_end;
 		/* Free back to zone */
 		bbr_free(bbr, rsm);
 		if (left) {
 			rsm = TAILQ_FIRST(&bbr->r_ctl.rc_map);
 			goto more;
 		}
 		goto proc_sack;
 	}
 	if (rsm->r_flags & BBR_ACKED) {
 		/*
 		 * It was acked on the scoreboard -- remove it from total
 		 * for the part being cum-acked.
 		 */
 		p_acked += (rsm->r_end - rsm->r_start);
 		bbr->r_ctl.rc_sacked -= (th_ack - rsm->r_start);
 		if (bbr->r_ctl.rc_sacked == 0)
 			bbr->r_ctl.rc_sacklast = NULL;
 	} else {
 		/*
 		 * It was acked up to th_ack point for the first time
 		 */
 		struct bbr_sendmap lrsm;
 
 		memcpy(&lrsm, rsm, sizeof(struct bbr_sendmap));
 		lrsm.r_end = th_ack;
 		bbr_update_rtt(tp, bbr, &lrsm, to, cts, BBR_CUM_ACKED, th_ack);
 	}
 	if ((rsm->r_flags & BBR_MARKED_LOST) &&
 	    ((rsm->r_flags & BBR_ACKED) == 0)) {
 		/*
 		 * It was marked lost and partly ack'd now
 		 * for the first time. We lower the rc_lost_bytes
 		 * and still leave it MARKED.
 		 */
 		bbr->r_ctl.rc_lost_bytes -= th_ack - rsm->r_start;
 	}
 	bbr_isit_a_pkt_epoch(bbr, cts, rsm, __LINE__, BBR_CUM_ACKED);
 	bbr->r_ctl.rc_holes_rxt -= rsm->r_rtr_bytes;
 	rsm->r_rtr_bytes = 0;
 	/* adjust packet count */
 	rsm->r_start = th_ack;
 proc_sack:
 	/* Check for reneging */
 	rsm = TAILQ_FIRST(&bbr->r_ctl.rc_map);
 	if (rsm && (rsm->r_flags & BBR_ACKED) && (th_ack == rsm->r_start)) {
 		/*
 		 * The peer has moved snd_una up to the edge of this send,
 		 * i.e. one that it had previously acked. The only way that
 		 * can be true if the peer threw away data (space issues)
 		 * that it had previously sacked (else it would have given
 		 * us snd_una up to (rsm->r_end). We need to undo the acked
 		 * markings here.
 		 *
 		 * Note we have to look to make sure th_ack is our
 		 * rsm->r_start in case we get an old ack where th_ack is
 		 * behind snd_una.
 		 */
 		bbr_peer_reneges(bbr, rsm, th->th_ack);
 	}
 	if ((to->to_flags & TOF_SACK) == 0) {
 		/* We are done nothing left to log */
 		goto out;
 	}
 	rsm = TAILQ_LAST_FAST(&bbr->r_ctl.rc_map, bbr_sendmap, r_next);
 	if (rsm) {
 		last_seq = rsm->r_end;
 	} else {
 		last_seq = tp->snd_max;
 	}
 	/* Sack block processing */
 	if (SEQ_GT(th_ack, tp->snd_una))
 		ack_point = th_ack;
 	else
 		ack_point = tp->snd_una;
 	for (i = 0; i < to->to_nsacks; i++) {
 		bcopy((to->to_sacks + i * TCPOLEN_SACK),
 		    &sack, sizeof(sack));
 		sack.start = ntohl(sack.start);
 		sack.end = ntohl(sack.end);
 		if (SEQ_GT(sack.end, sack.start) &&
 		    SEQ_GT(sack.start, ack_point) &&
 		    SEQ_LT(sack.start, tp->snd_max) &&
 		    SEQ_GT(sack.end, ack_point) &&
 		    SEQ_LEQ(sack.end, tp->snd_max)) {
 			if ((bbr->r_ctl.rc_num_small_maps_alloced > bbr_sack_block_limit) &&
 			    (SEQ_LT(sack.end, last_seq)) &&
 			    ((sack.end - sack.start) < (p_maxseg / 8))) {
 				/*
 				 * Not the last piece and its smaller than
 				 * 1/8th of a p_maxseg. We ignore this.
 				 */
 				BBR_STAT_INC(bbr_runt_sacks);
 				continue;
 			}
 			sack_blocks[num_sack_blks] = sack;
 			num_sack_blks++;
 		} else if (SEQ_LEQ(sack.start, th_ack) &&
 		    SEQ_LEQ(sack.end, th_ack)) {
 			/*
 			 * Its a D-SACK block.
 			 */
 			tcp_record_dsack(tp, sack.start, sack.end, 0);
 		}
 	}
 	if (num_sack_blks == 0)
 		goto out;
 	/*
 	 * Sort the SACK blocks so we can update the rack scoreboard with
 	 * just one pass.
 	 */
 	new_sb = sack_filter_blks(&bbr->r_ctl.bbr_sf, sack_blocks,
 				  num_sack_blks, th->th_ack);
 	ctf_log_sack_filter(bbr->rc_tp, new_sb, sack_blocks);
 	BBR_STAT_ADD(bbr_sack_blocks, num_sack_blks);
 	BBR_STAT_ADD(bbr_sack_blocks_skip, (num_sack_blks - new_sb));
 	num_sack_blks = new_sb;
 	if (num_sack_blks < 2) {
 		goto do_sack_work;
 	}
 	/* Sort the sacks */
 	for (i = 0; i < num_sack_blks; i++) {
 		for (j = i + 1; j < num_sack_blks; j++) {
 			if (SEQ_GT(sack_blocks[i].end, sack_blocks[j].end)) {
 				sack = sack_blocks[i];
 				sack_blocks[i] = sack_blocks[j];
 				sack_blocks[j] = sack;
 			}
 		}
 	}
 	/*
 	 * Now are any of the sack block ends the same (yes some
 	 * implememtations send these)?
 	 */
 again:
 	if (num_sack_blks > 1) {
 		for (i = 0; i < num_sack_blks; i++) {
 			for (j = i + 1; j < num_sack_blks; j++) {
 				if (sack_blocks[i].end == sack_blocks[j].end) {
 					/*
 					 * Ok these two have the same end we
 					 * want the smallest end and then
 					 * throw away the larger and start
 					 * again.
 					 */
 					if (SEQ_LT(sack_blocks[j].start, sack_blocks[i].start)) {
 						/*
 						 * The second block covers
 						 * more area use that
 						 */
 						sack_blocks[i].start = sack_blocks[j].start;
 					}
 					/*
 					 * Now collapse out the dup-sack and
 					 * lower the count
 					 */
 					for (k = (j + 1); k < num_sack_blks; k++) {
 						sack_blocks[j].start = sack_blocks[k].start;
 						sack_blocks[j].end = sack_blocks[k].end;
 						j++;
 					}
 					num_sack_blks--;
 					goto again;
 				}
 			}
 		}
 	}
 do_sack_work:
 	rsm = bbr->r_ctl.rc_sacklast;
 	for (i = 0; i < num_sack_blks; i++) {
 		acked = bbr_proc_sack_blk(tp, bbr, &sack_blocks[i], to, &rsm, cts);
 		if (acked) {
 			bbr->r_wanted_output = 1;
 			changed += acked;
 			sack_changed += acked;
 		}
 	}
 out:
 	*prev_acked = p_acked;
 	if ((sack_changed) && (!IN_RECOVERY(tp->t_flags))) {
 		/*
 		 * Ok we have a high probability that we need to go in to
 		 * recovery since we have data sack'd
 		 */
 		struct bbr_sendmap *rsm;
 
 		rsm = bbr_check_recovery_mode(tp, bbr, cts);
 		if (rsm) {
 			/* Enter recovery */
 			entered_recovery = 1;
 			bbr->r_wanted_output = 1;
 			/*
 			 * When we enter recovery we need to assure we send
 			 * one packet.
 			 */
 			if (bbr->r_ctl.rc_resend == NULL) {
 				bbr->r_ctl.rc_resend = rsm;
 			}
 		}
 	}
 	if (IN_RECOVERY(tp->t_flags) && (entered_recovery == 0)) {
 		/*
 		 * See if we need to rack-retransmit anything if so set it
 		 * up as the thing to resend assuming something else is not
 		 * already in that position.
 		 */
 		if (bbr->r_ctl.rc_resend == NULL) {
 			bbr->r_ctl.rc_resend = bbr_check_recovery_mode(tp, bbr, cts);
 		}
 	}
 	/*
 	 * We return the amount that changed via sack, this is used by the
 	 * ack-received code to augment what was changed between th_ack <->
 	 * snd_una.
 	 */
 	return (sack_changed);
 }
 
 static void
 bbr_strike_dupack(struct tcp_bbr *bbr)
 {
 	struct bbr_sendmap *rsm;
 
 	rsm = TAILQ_FIRST(&bbr->r_ctl.rc_tmap);
 	if (rsm && (rsm->r_dupack < 0xff)) {
 		rsm->r_dupack++;
 		if (rsm->r_dupack >= DUP_ACK_THRESHOLD)
 			bbr->r_wanted_output = 1;
 	}
 }
 
 /*
  * Process the acknowledgment (cumulative ack plus any SACK blocks)
  * carried by an inbound segment.
  *
  * Return value of 1, we do not need to call bbr_process_data().
  * return value of 0, bbr_process_data can be called.
  * For ret_val if its 0 the TCB is locked and valid, if its non-zero
  * its unlocked and probably unsafe to touch the TCB.
  *
  * When ofia is non-NULL and we run to the bottom of the function it is
  * set to 1 if nothing is left outstanding, the send buffer is empty
  * and TF_SENTFIN is set; otherwise it is set to 0.  None of the early
  * returns write it.
  */
 static int
 bbr_process_ack(struct mbuf *m, struct tcphdr *th, struct socket *so,
     struct tcpcb *tp, struct tcpopt *to,
     uint32_t tiwin, int32_t tlen,
     int32_t * ofia, int32_t thflags, int32_t * ret_val)
 {
 	int32_t ourfinisacked = 0;
 	int32_t acked_amount;
 	uint16_t nsegs;
 	int32_t acked;
 	uint32_t lost, sack_changed = 0;
 	struct mbuf *mfree;
 	struct tcp_bbr *bbr;
 	uint32_t prev_acked = 0;
 
 	bbr = (struct tcp_bbr *)tp->t_fb_ptr;
 	/* Snapshot the loss count so we can tell below if this ack moved it. */
 	lost = bbr->r_ctl.rc_lost;
 	nsegs = max(1, m->m_pkthdr.lro_nsegs);
 	if (SEQ_GT(th->th_ack, tp->snd_max)) {
 		/* The ack is for something beyond snd_max. */
 		ctf_do_dropafterack(m, tp, th, thflags, tlen, ret_val);
 		bbr->r_wanted_output = 1;
 		return (1);
 	}
 	if (SEQ_GEQ(th->th_ack, tp->snd_una) || to->to_nsacks) {
 		/* Process the ack */
 		if (bbr->rc_in_persist)
 			tp->t_rxtshift = 0;
 		/* Same ack point and same window: count it as a duplicate ack. */
 		if ((th->th_ack == tp->snd_una) && (tiwin == tp->snd_wnd))
 			bbr_strike_dupack(bbr);
 		sack_changed = bbr_log_ack(tp, to, th, &prev_acked);
 	}
 	/* The last argument is non-zero if rc_lost grew while logging the ack. */
 	bbr_lt_bw_sampling(bbr, bbr->r_ctl.rc_rcvtime, (bbr->r_ctl.rc_lost > lost));
 	if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) {
 		/*
 		 * Old ack, behind the last one rcv'd or a duplicate ack
 		 * with SACK info.
 		 */
 		if (th->th_ack == tp->snd_una) {
 			bbr_ack_received(tp, bbr, th, 0, sack_changed, prev_acked, __LINE__, 0);
 			if (bbr->r_state == TCPS_SYN_SENT) {
 				/*
 				 * Special case on where we sent SYN. When
 				 * the SYN-ACK is processed in syn_sent
 				 * state it bumps the snd_una. This causes
 				 * us to hit here even though we did ack 1
 				 * byte.
 				 *
 				 * Go through the nothing left case so we
 				 * send data.
 				 */
 				goto nothing_left;
 			}
 		}
 		return (0);
 	}
 	/*
 	 * If we reach this point, ACK is not a duplicate, i.e., it ACKs
 	 * something we sent.
 	 */
 	if (tp->t_flags & TF_NEEDSYN) {
 		/*
 		 * T/TCP: Connection was half-synchronized, and our SYN has
 		 * been ACK'd (so connection is now fully synchronized).  Go
 		 * to non-starred state, increment snd_una for ACK of SYN,
 		 * and check if we can do window scaling.
 		 */
 		tp->t_flags &= ~TF_NEEDSYN;
 		tp->snd_una++;
 		/* Do window scaling? */
 		if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) ==
 		    (TF_RCVD_SCALE | TF_REQ_SCALE)) {
 			tp->rcv_scale = tp->request_r_scale;
 			/* Send window already scaled. */
 		}
 	}
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 
 	acked = BYTES_THIS_ACK(tp, th);
 	KMOD_TCPSTAT_ADD(tcps_rcvackpack, (int)nsegs);
 	KMOD_TCPSTAT_ADD(tcps_rcvackbyte, acked);
 
 	/*
 	 * If we just performed our first retransmit, and the ACK arrives
 	 * within our recovery window, then it was a mistake to do the
 	 * retransmit in the first place.  Recover our original cwnd and
 	 * ssthresh, and proceed to transmit where we left off.
 	 */
 	if (tp->t_flags & TF_PREVVALID) {
 		tp->t_flags &= ~TF_PREVVALID;
 		if (tp->t_rxtshift == 1 &&
 		    (int)(ticks - tp->t_badrxtwin) < 0)
 			bbr_cong_signal(tp, th, CC_RTO_ERR, NULL);
 	}
 	/*
 	 * Release the acked bytes from the send buffer.  We never cut
 	 * more than the buffer actually holds, and the send window is
 	 * reduced by the same amount.
 	 */
 	SOCKBUF_LOCK(&so->so_snd);
 	acked_amount = min(acked, (int)sbavail(&so->so_snd));
 	tp->snd_wnd -= acked_amount;
 	mfree = sbcut_locked(&so->so_snd, acked_amount);
 	/* NB: sowwakeup_locked() does an implicit unlock. */
 	sowwakeup_locked(so);
 	/* Free the cut chain only after the sockbuf lock has been dropped. */
 	m_freem(mfree);
 	if (SEQ_GT(th->th_ack, tp->snd_una)) {
 		bbr_collapse_rtt(tp, bbr, TCP_REXMTVAL(tp));
 	}
 	tp->snd_una = th->th_ack;
 	/* The last argument is how much rc_lost grew while handling this ack. */
 	bbr_ack_received(tp, bbr, th, acked, sack_changed, prev_acked, __LINE__, (bbr->r_ctl.rc_lost - lost));
 	if (IN_RECOVERY(tp->t_flags)) {
 		/*
 		 * An ack below both snd_recover and snd_max is treated as
 		 * a partial ack, anything else ends recovery.
 		 */
 		if (SEQ_LT(th->th_ack, tp->snd_recover) &&
 		    (SEQ_LT(th->th_ack, tp->snd_max))) {
 			tcp_bbr_partialack(tp);
 		} else {
 			bbr_post_recovery(tp);
 		}
 	}
 	/* Do not let snd_recover fall behind snd_una. */
 	if (SEQ_GT(tp->snd_una, tp->snd_recover)) {
 		tp->snd_recover = tp->snd_una;
 	}
 	/* Do not let snd_nxt fall behind snd_max. */
 	if (SEQ_LT(tp->snd_nxt, tp->snd_max)) {
 		tp->snd_nxt = tp->snd_max;
 	}
 	if (tp->snd_una == tp->snd_max) {
 		/* Nothing left outstanding */
 		/* Also entered by goto from the SYN_SENT special case above. */
 nothing_left:
 		bbr_log_progress_event(bbr, tp, ticks, PROGRESS_CLEAR, __LINE__);
 		if (sbavail(&tp->t_inpcb->inp_socket->so_snd) == 0)
 			bbr->rc_tp->t_acktime = 0;
 		if ((sbused(&so->so_snd) == 0) &&
 		    (tp->t_flags & TF_SENTFIN)) {
 			ourfinisacked = 1;
 		}
 		bbr_timer_cancel(bbr, __LINE__, bbr->r_ctl.rc_rcvtime);
 		if (bbr->rc_in_persist == 0) {
 			bbr->r_ctl.rc_went_idle_time = bbr->r_ctl.rc_rcvtime;
 		}
 		sack_filter_clear(&bbr->r_ctl.bbr_sf, tp->snd_una);
 		bbr_log_ack_clear(bbr, bbr->r_ctl.rc_rcvtime);
 		/*
 		 * We invalidate the last ack here since we
 		 * don't want to transfer forward the time
 		 * for our sum's calculations.
 		 */
 		if ((tp->t_state >= TCPS_FIN_WAIT_1) &&
 		    (sbavail(&so->so_snd) == 0) &&
 		    (tp->t_flags2 & TF2_DROP_AF_DATA)) {
 			/*
 			 * The socket was gone and the peer sent data, time
 			 * to reset him.
 			 */
 			*ret_val = 1;
 			tcp_log_end_status(tp, TCP_EI_STATUS_DATA_A_CLOSE);
 			/* tcp_close will kill the inp pre-log the Reset */
 			tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST);
 			/* From here on tp is whatever tcp_close() handed back. */
 			tp = tcp_close(tp);
 			ctf_do_dropwithreset(m, tp, th, BANDLIM_UNLIMITED, tlen);
 			BBR_STAT_INC(bbr_dropped_af_data);
 			return (1);
 		}
 		/* Set need output so persist might get set */
 		bbr->r_wanted_output = 1;
 	}
 	if (ofia)
 		*ofia = ourfinisacked;
 	return (0);
 }
 
 /*
  * Put the connection into persist mode.  This does nothing when we
  * are already persisting.  Otherwise bbr_timer_cancel() is called,
  * t_rxtshift and the last delay value are zeroed, cts is recorded as
  * the time we went idle and the transition is logged with the
  * caller's line number.  The time spent so far in the current BBR
  * state is added to the bbr_state_time counters before the state
  * clock is reset to cts.
  */
 static void
 bbr_enter_persist(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts, int32_t line)
 {
 	uint32_t elapsed;
 	int32_t slot;
 
 	if (bbr->rc_in_persist != 0) {
 		/* Already persisting, nothing to do. */
 		return;
 	}
 	bbr_timer_cancel(bbr, __LINE__, cts);
 	bbr->r_ctl.rc_last_delay_val = 0;
 	tp->t_rxtshift = 0;
 	bbr->rc_in_persist = 1;
 	bbr->r_ctl.rc_went_idle_time = cts;
 	/* We should be capped when rw went to 0 but just in case */
 	bbr_log_type_pesist(bbr, cts, 0, line, 1);
 	/* Time freezes for the state, so do the accounting now */
 	if (SEQ_GT(cts, bbr->r_ctl.rc_bbr_state_time)) {
 		elapsed = cts - bbr->r_ctl.rc_bbr_state_time;
 		if (bbr->rc_bbr_state == BBR_STATE_PROBE_BW) {
 			/* Probe-BW time is counted at the sub-state value plus 5. */
 			slot = bbr_state_val(bbr);
 			slot += 5;
 		} else {
 			slot = bbr->rc_bbr_state;
 		}
 		counter_u64_add(bbr_state_time[slot], elapsed);
 	}
 	bbr->r_ctl.rc_bbr_state_time = cts;
 }
 
 /*
  * Decide how to resume after an idle period of idle_time.  Below
  * bbr_idle_restart_threshold nothing is changed.  At or above it we
  * either move to BBR_STATE_IDLE_EXIT (when rc_use_idle_restart is
  * set) or, if we are in probe-bw, ask for a sub-state change.
  */
 static void
 bbr_restart_after_idle(struct tcp_bbr *bbr, uint32_t cts, uint32_t idle_time)
 {
 	if (idle_time < bbr_idle_restart_threshold) {
 		/*
 		 * Not idle long enough, keep walking the state
 		 * transitions we were already going through.
 		 */
 		return;
 	}
 	if (!bbr->rc_use_idle_restart) {
 		if (bbr->rc_bbr_state == BBR_STATE_PROBE_BW)
 			bbr_substate_change(bbr, cts, __LINE__, 1);
 		return;
 	}
 	bbr->rc_bbr_state = BBR_STATE_IDLE_EXIT;
 	/*
 	 * The state target is computed with both gains at BBR_UNIT,
 	 * so that we increase at a dramatic rate but stop once the
 	 * pipe is full again for our current b/w estimate.
 	 */
 	bbr->r_ctl.rc_bbr_hptsi_gain = BBR_UNIT;
 	bbr->r_ctl.rc_bbr_cwnd_gain = BBR_UNIT;
 	bbr_set_state_target(bbr, __LINE__);
 	/* With the target fixed, switch both gains to the startup gain. */
 	bbr->r_ctl.rc_bbr_hptsi_gain = bbr->r_ctl.rc_startup_pg;
 	bbr->r_ctl.rc_bbr_cwnd_gain = bbr->r_ctl.rc_startup_pg;
 	bbr_log_type_statechange(bbr, cts, __LINE__);
 }
 
 /*
  * Take the connection out of persist mode; a no-op when we are not
  * persisting.  The idle time (as computed by bbr_calc_time() from
  * rc_went_idle_time) is logged together with the caller's line
  * number, and it decides whether the persist period is recorded as
  * an RTT probe and whether a restart-after-idle is attempted.
  */
 static void
 bbr_exit_persist(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts, int32_t line)
 {
 	uint32_t time_idle;
 	int probing;
 
 	if (!bbr->rc_in_persist) {
 		/* Not persisting, nothing to undo. */
 		return;
 	}
 	time_idle = bbr_calc_time(cts, bbr->r_ctl.rc_went_idle_time);
 	bbr->rc_in_persist = 0;
 	bbr->rc_hit_state_1 = 0;
 	bbr->r_ctl.rc_del_time = cts;
 	if (tcp_in_hpts(bbr->rc_inp)) {
 		/*
 		 * We are still on the hpts; take ourselves off and
 		 * forget the timer flags and delay bookkeeping.
 		 */
 		tcp_hpts_remove(bbr->rc_inp);
 		bbr->rc_timer_first = 0;
 		bbr->r_ctl.rc_hpts_flags = 0;
 		bbr->r_ctl.rc_last_delay_val = 0;
 		bbr->r_ctl.rc_hptsi_agg_delay = 0;
 		bbr->r_agg_early_set = 0;
 		bbr->r_ctl.rc_agg_early = 0;
 	}
 	bbr_log_type_pesist(bbr, cts, time_idle, line, 0);
 	if (time_idle >= bbr_rtt_probe_time) {
 		/*
 		 * We dropped the outstanding data to nothing and sat
 		 * for at least bbr_rtt_probe_time, so this counts as an
 		 * RTT_PROBE session.
 		 */
 		bbr_log_rtt_shrinks(bbr, cts, 0, 0, __LINE__, BBR_RTTS_PERSIST, 0);
 		bbr->r_ctl.rc_rtt_shrinks = cts;
 		bbr->r_ctl.last_in_probertt = bbr->r_ctl.rc_rtt_shrinks;
 	}
 	tp->t_rxtshift = 0;
 	/* Force a time based epoch */
 	bbr_set_epoch(bbr, cts, __LINE__);
 	/*
 	 * Re-base the lost count so nothing lost while we were stuck
 	 * in persist is counted against the state we resume.
 	 */
 	bbr->r_ctl.bbr_lost_at_state = bbr->r_ctl.rc_lost;
 	/* Time un-freezes for the state */
 	bbr->r_ctl.rc_bbr_state_time = cts;
 	/*
 	 * Going back to probe-bw or probe_rtt may call for a fast
 	 * restart.
 	 */
 	probing = (bbr->rc_bbr_state == BBR_STATE_PROBE_BW) ||
 	    (bbr->rc_bbr_state == BBR_STATE_PROBE_RTT);
 	if (probing)
 		bbr_restart_after_idle(bbr, cts, time_idle);
 }
 
 /*
  * The peer's window no longer covers everything we have outstanding.
  * Find the send map entry containing snd_una + snd_wnd and mark it
  * (or the piece of it past that point, when we are allowed to split)
  * and every later entry with BBR_RWND_COLLAPSED.  rc_has_collapsed
  * is set when at least one entry ends up marked and cleared otherwise.
  */
 static void
 bbr_collapsed_window(struct tcp_bbr *bbr)
 {
 	/*
 	 * Now we must walk the
 	 * send map and divide the
 	 * ones left stranded. These
 	 * guys can't cause us to abort
 	 * the connection and are really
 	 * "unsent". However if a buggy
 	 * client actually did keep some
 	 * of the data i.e. collapsed the win
 	 * and refused to ack and then opened
 	 * the win and acked that data. We would
 	 * get into an ack war, so the simpler
 	 * method of just pretending we
 	 * did not send those segments
 	 * won't work.
 	 */
 	struct bbr_sendmap *rsm, *nrsm;
 	tcp_seq max_seq;
 	uint32_t maxseg;
 	int can_split = 0;
 	int fnd = 0;
 
 	maxseg = bbr->rc_tp->t_maxseg - bbr->rc_last_options;
 	/* max_seq is the right edge of the window the peer now offers. */
 	max_seq = bbr->rc_tp->snd_una + bbr->rc_tp->snd_wnd;
 	bbr_log_type_rwnd_collapse(bbr, max_seq, 1, 0);
 	TAILQ_FOREACH(rsm, &bbr->r_ctl.rc_map, r_next) {
 		/* Find the first seq past or at maxseq */
 		/* Entries visited on the way have any old collapsed mark removed. */
 		if (rsm->r_flags & BBR_RWND_COLLAPSED)
 			rsm->r_flags &= ~BBR_RWND_COLLAPSED;
 		if (SEQ_GEQ(max_seq, rsm->r_start) &&
 		    SEQ_GEQ(rsm->r_end, max_seq)) {
 			fnd = 1;
 			break;
 		}
 	}
 	bbr->rc_has_collapsed = 0;
 	if (!fnd) {
 		/* Nothing to do strange */
 		return;
 	}
 	/*
 	 * Now can we split?
 	 *
 	 * We don't want to split if splitting
 	 * would generate too many small segments
 	 * lest we let an attacker fragment our
 	 * send_map and leave us out of memory.
 	 */
 	if ((max_seq != rsm->r_start) &&
 	    (max_seq != rsm->r_end)){
 		/* can we split? */
 		int res1, res2;
 
 		/* res1/res2 are the sizes of the two pieces a split would leave. */
 		res1 = max_seq - rsm->r_start;
 		res2 = rsm->r_end - max_seq;
 		if ((res1 >= (maxseg/8)) &&
 		    (res2 >= (maxseg/8))) {
 			/* No small pieces here */
 			can_split = 1;
 		} else if (bbr->r_ctl.rc_num_small_maps_alloced < bbr_sack_block_limit) {
 			/* We are under the limit */
 			can_split = 1;
 		}
 	}
 	/* Ok do we need to split this rsm? */
 	if (max_seq == rsm->r_start) {
 		/* It's this guy no split required */
 		nrsm = rsm;
 	} else if (max_seq == rsm->r_end) {
 		/* It's the next one no split required. */
 		nrsm = TAILQ_NEXT(rsm, r_next);
 		if (nrsm == NULL) {
 			/* Huh? */
 			return;
 		}
 	} else if (can_split && SEQ_LT(max_seq, rsm->r_end)) {
 		/* yep we need to split it */
 		nrsm = bbr_alloc_limit(bbr, BBR_LIMIT_TYPE_SPLIT);
 		if (nrsm == NULL) {
 			/* failed XXXrrs what can we do mark the whole? */
 			nrsm = rsm;
 			goto no_split;
 		}
 		/* Clone it */
 		bbr_log_type_rwnd_collapse(bbr, max_seq, 3, 0);
 		bbr_clone_rsm(bbr, nrsm, rsm, max_seq);
 		/* The new piece goes right behind rsm in the map ... */
 		TAILQ_INSERT_AFTER(&bbr->r_ctl.rc_map, rsm, nrsm, r_next);
 		if (rsm->r_in_tmap) {
 			/* ... and in the transmit map too, if rsm is on it. */
 			TAILQ_INSERT_AFTER(&bbr->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
 			nrsm->r_in_tmap = 1;
 		}
 	} else {
 		/*
 		 * Split not allowed just start here just
 		 * use this guy.
 		 */
 		nrsm = rsm;
 	}
 no_split:
 	BBR_STAT_INC(bbr_collapsed_win);
 	/* reuse fnd as a count */
 	fnd = 0;
 	/* Mark nrsm and everything after it in the map as collapsed. */
 	TAILQ_FOREACH_FROM(nrsm, &bbr->r_ctl.rc_map, r_next) {
 		nrsm->r_flags |= BBR_RWND_COLLAPSED;
 		fnd++;
 		bbr->rc_has_collapsed = 1;
 	}
 	bbr_log_type_rwnd_collapse(bbr, max_seq, 4, fnd);
 }
 
 /*
  * Remove the BBR_RWND_COLLAPSED mark from the run of entries at the
  * tail of the send map.  The walk goes backwards from the last entry
  * and stops at the first one that is not marked.  The number of
  * entries cleared is logged and rc_has_collapsed is reset.
  */
 static void
 bbr_un_collapse_window(struct tcp_bbr *bbr)
 {
 	struct bbr_sendmap *cur;
 	int nr_cleared;
 
 	nr_cleared = 0;
 	TAILQ_FOREACH_REVERSE(cur, &bbr->r_ctl.rc_map, bbr_head, r_next) {
 		if ((cur->r_flags & BBR_RWND_COLLAPSED) == 0) {
 			/* First unmarked entry ends the walk. */
 			break;
 		}
 		cur->r_flags &= ~BBR_RWND_COLLAPSED;
 		nr_cleared++;
 	}
 	bbr_log_type_rwnd_collapse(bbr,
 	    (bbr->rc_tp->snd_una + bbr->rc_tp->snd_wnd), 0, nr_cleared);
 	bbr->rc_has_collapsed = 0;
 }
 
 /*
  * Handle the send-window update, the persist mode transitions and the
  * data/FIN part of an inbound segment.
  *
  * Return value of 1, the TCB is unlocked and most
  * likely gone, return value of 0, the TCB is still
  * locked.
  *
  * The mbuf is consumed on every path: it is either freed here,
  * appended to the receive socket buffer, or handed to tcp_reass().
  * nxt_pkt is not referenced in this function.
  */
 static int
 bbr_process_data(struct mbuf *m, struct tcphdr *th, struct socket *so,
     struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen,
     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt)
 {
 	/*
 	 * Update window information. Don't look at window if no ACK: TAC's
 	 * send garbage on first SYN.
 	 */
 	uint16_t nsegs;
 	int32_t tfo_syn;
 	struct tcp_bbr *bbr;
 
 	bbr = (struct tcp_bbr *)tp->t_fb_ptr;
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 	nsegs = max(1, m->m_pkthdr.lro_nsegs);
 	/*
 	 * Accept the advertised window when the segment is newer than the
 	 * one that last updated it (by seq, then by ack), or when seq and
 	 * ack are the same and the window grew.
 	 */
 	if ((thflags & TH_ACK) &&
 	    (SEQ_LT(tp->snd_wl1, th->th_seq) ||
 	    (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) ||
 	    (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) {
 		/* keep track of pure window updates */
 		if (tlen == 0 &&
 		    tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd)
 			KMOD_TCPSTAT_INC(tcps_rcvwinupd);
 		tp->snd_wnd = tiwin;
 		tp->snd_wl1 = th->th_seq;
 		tp->snd_wl2 = th->th_ack;
 		if (tp->snd_wnd > tp->max_sndwnd)
 			tp->max_sndwnd = tp->snd_wnd;
 		bbr->r_wanted_output = 1;
 	} else if (thflags & TH_ACK) {
 		/* Same ack as the last window update but a smaller window: take it. */
 		if ((tp->snd_wl2 == th->th_ack) && (tiwin < tp->snd_wnd)) {
 			tp->snd_wnd = tiwin;
 			tp->snd_wl1 = th->th_seq;
 			tp->snd_wl2 = th->th_ack;
 		}
 	}
 	if (tp->snd_wnd < ctf_outstanding(tp))
 		/* The peer collapsed its window on us */
 		bbr_collapsed_window(bbr);
  	else if (bbr->rc_has_collapsed)
 		bbr_un_collapse_window(bbr);
 	/* Was persist timer active and now we have window space? */
 	if ((bbr->rc_in_persist != 0) &&
 	    (tp->snd_wnd >= min((bbr->r_ctl.rc_high_rwnd/2),
 				bbr_minseg(bbr)))) {
 		/*
 		 * Make the rate persist at end of persist mode if idle long
 		 * enough
 		 */
 		bbr_exit_persist(tp, bbr, bbr->r_ctl.rc_rcvtime, __LINE__);
 
 		/* Make sure we output to start the timer */
 		bbr->r_wanted_output = 1;
 	}
 	/* Do we need to enter persist? */
 	if ((bbr->rc_in_persist == 0) &&
 	    (tp->snd_wnd < min((bbr->r_ctl.rc_high_rwnd/2), bbr_minseg(bbr))) &&
 	    TCPS_HAVEESTABLISHED(tp->t_state) &&
 	    (tp->snd_max == tp->snd_una) &&
 	    sbavail(&tp->t_inpcb->inp_socket->so_snd) &&
 	    (sbavail(&tp->t_inpcb->inp_socket->so_snd) > tp->snd_wnd)) {
 		/* No send window.. we must enter persist */
 		bbr_enter_persist(tp, bbr, bbr->r_ctl.rc_rcvtime, __LINE__);
 	}
 	if (tp->t_flags2 & TF2_DROP_AF_DATA) {
 		/* Inbound data is being dropped on this connection; discard it. */
 		m_freem(m);
 		return (0);
 	}
 	/*
 	 * We don't support urgent data but
 	 * drag along the up just to make sure
 	 * if there is a stack switch no one
 	 * is surprised.
 	 */
 	tp->rcv_up = tp->rcv_nxt;
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 
 	/*
 	 * Process the segment text, merging it into the TCP sequencing
 	 * queue, and arranging for acknowledgment of receipt if necessary.
 	 * This process logically involves adjusting tp->rcv_wnd as data is
 	 * presented to the user (this happens in tcp_usrreq.c, case
 	 * PRU_RCVD).  If a FIN has already been received on this connection
 	 * then we just ignore the text.
 	 */
 	tfo_syn = ((tp->t_state == TCPS_SYN_RECEIVED) &&
 		   IS_FASTOPEN(tp->t_flags));
 	if ((tlen || (thflags & TH_FIN) || (tfo_syn && tlen > 0)) &&
 	    TCPS_HAVERCVDFIN(tp->t_state) == 0) {
 		/*
 		 * Remember where this segment started, where rcv_nxt was
 		 * and how long the segment was before anything below
 		 * changes them; the SACK bookkeeping further down needs
 		 * the original values.
 		 */
 		tcp_seq save_start = th->th_seq;
 		tcp_seq save_rnxt  = tp->rcv_nxt;
 		int     save_tlen  = tlen;
 
 		m_adj(m, drop_hdrlen);	/* delayed header drop */
 		/*
 		 * Insert segment which includes th into TCP reassembly
 		 * queue with control block tp.  Set thflags to whether
 		 * reassembly now includes a segment with FIN.  This handles
 		 * the common case inline (segment is the next to be
 		 * received on an established connection, and the queue is
 		 * empty), avoiding linkage into and removal from the queue
 		 * and repetition of various conversions. Set DELACK for
 		 * segments received in order, but ack immediately when
 		 * segments are out of order (so fast retransmit can work).
 		 */
 		if (th->th_seq == tp->rcv_nxt &&
 		    SEGQ_EMPTY(tp) &&
 		    (TCPS_HAVEESTABLISHED(tp->t_state) ||
 		    tfo_syn)) {
 #ifdef NETFLIX_SB_LIMITS
 			u_int mcnt, appended;
 
 			/*
 			 * mcnt and appended are only assigned, and only
 			 * read, when the socket buffer has a shared limit
 			 * (sb_shlim) attached.
 			 */
 			if (so->so_rcv.sb_shlim) {
 				mcnt = m_memcnt(m);
 				appended = 0;
 				if (counter_fo_get(so->so_rcv.sb_shlim, mcnt,
 				    CFO_NOSLEEP, NULL) == false) {
 					counter_u64_add(tcp_sb_shlim_fails, 1);
 					m_freem(m);
 					return (0);
 				}
 			}
 
 #endif
 			if (DELAY_ACK(tp, bbr, nsegs) || tfo_syn) {
 				bbr->bbr_segs_rcvd += max(1, nsegs);
 				tp->t_flags |= TF_DELACK;
 				bbr_timer_cancel(bbr, __LINE__, bbr->r_ctl.rc_rcvtime);
 			} else {
 				bbr->r_wanted_output = 1;
 				tp->t_flags |= TF_ACKNOW;
 			}
 			tp->rcv_nxt += tlen;
 			/*
 			 * First data received: stamp t_fbyte_in with ticks,
 			 * using 1 when ticks is 0 so that 0 keeps meaning
 			 * "not stamped yet".
 			 */
 			if (tlen &&
 			    ((tp->t_flags2 & TF2_FBYTES_COMPLETE) == 0) &&
 			    (tp->t_fbyte_in == 0)) {
 				tp->t_fbyte_in = ticks;
 				if (tp->t_fbyte_in == 0)
 					tp->t_fbyte_in = 1;
 				if (tp->t_fbyte_out && tp->t_fbyte_in)
 					tp->t_flags2 |= TF2_FBYTES_COMPLETE;
 			}
 			/* From here on thflags carries only the FIN bit. */
 			thflags = tcp_get_flags(th) & TH_FIN;
 			KMOD_TCPSTAT_ADD(tcps_rcvpack, (int)nsegs);
 			KMOD_TCPSTAT_ADD(tcps_rcvbyte, tlen);
 			SOCKBUF_LOCK(&so->so_rcv);
 			if (so->so_rcv.sb_state & SBS_CANTRCVMORE)
 				m_freem(m);
 			else
 #ifdef NETFLIX_SB_LIMITS
 				appended =
 #endif
 					sbappendstream_locked(&so->so_rcv, m, 0);
 			/* NB: sorwakeup_locked() does an implicit unlock. */
 			sorwakeup_locked(so);
 #ifdef NETFLIX_SB_LIMITS
 			/* Hand back whatever part of the reservation was not appended. */
 			if (so->so_rcv.sb_shlim && appended != mcnt)
 				counter_fo_release(so->so_rcv.sb_shlim,
 				    mcnt - appended);
 #endif
 
 		} else {
 			/*
 			 * XXX: Due to the header drop above "th" is
 			 * theoretically invalid by now.  Fortunately
 			 * m_adj() doesn't actually frees any mbufs when
 			 * trimming from the head.
 			 */
 			tcp_seq temp = save_start;
 
 			/* tcp_reass() is given &tlen and may change it. */
 			thflags = tcp_reass(tp, th, &temp, &tlen, m);
 			tp->t_flags |= TF_ACKNOW;
 			if (tp->t_flags & TF_WAKESOR) {
 				tp->t_flags &= ~TF_WAKESOR;
 				/* NB: sorwakeup_locked() does an implicit unlock. */
 				sorwakeup_locked(so);
 			}
 		}
 		/*
 		 * Bring the SACK/D-SACK list we report to the peer up to
 		 * date, based on the saved original segment and what tlen
 		 * and rcv_nxt look like after the processing above.
 		 */
 		if ((tp->t_flags & TF_SACK_PERMIT) &&
 		    (save_tlen > 0) &&
 		    TCPS_HAVEESTABLISHED(tp->t_state)) {
 			if ((tlen == 0) && (SEQ_LT(save_start, save_rnxt))) {
 				/*
 				 * DSACK actually handled in the fastpath
 				 * above.
 				 */
 				tcp_update_sack_list(tp, save_start,
 				    save_start + save_tlen);
 			} else if ((tlen > 0) && SEQ_GT(tp->rcv_nxt, save_rnxt)) {
 				if ((tp->rcv_numsacks >= 1) &&
 				    (tp->sackblks[0].end == save_start)) {
 					/*
 					 * Partial overlap, recorded at todrop
 					 * above.
 					 */
 					tcp_update_sack_list(tp,
 					    tp->sackblks[0].start,
 					    tp->sackblks[0].end);
 				} else {
 					tcp_update_dsack_list(tp, save_start,
 					    save_start + save_tlen);
 				}
 			} else if (tlen >= save_tlen) {
 				/* Update of sackblks. */
 				tcp_update_dsack_list(tp, save_start,
 				    save_start + save_tlen);
 			} else if (tlen > 0) {
 				tcp_update_dsack_list(tp, save_start,
 				    save_start + tlen);
 			}
 		}
 	} else {
 		/* No text or FIN to take, or a FIN was already received. */
 		m_freem(m);
 		thflags &= ~TH_FIN;
 	}
 
 	/*
 	 * If FIN is received ACK the FIN and let the user know that the
 	 * connection is closing.
 	 */
 	if (thflags & TH_FIN) {
 		if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
 			/* The socket upcall is handled by socantrcvmore. */
 			socantrcvmore(so);
 			/*
 			 * If connection is half-synchronized (ie NEEDSYN
 			 * flag on) then delay ACK, so it may be piggybacked
 			 * when SYN is sent. Otherwise, since we received a
 			 * FIN then no more input can be expected, send ACK
 			 * now.
 			 */
 			if (tp->t_flags & TF_NEEDSYN) {
 				tp->t_flags |= TF_DELACK;
 				bbr_timer_cancel(bbr,
 				    __LINE__, bbr->r_ctl.rc_rcvtime);
 			} else {
 				tp->t_flags |= TF_ACKNOW;
 			}
 			/* The FIN takes one sequence number. */
 			tp->rcv_nxt++;
 		}
 		switch (tp->t_state) {
 			/*
 			 * In SYN_RECEIVED and ESTABLISHED STATES enter the
 			 * CLOSE_WAIT state.
 			 */
 		case TCPS_SYN_RECEIVED:
 			tp->t_starttime = ticks;
 			/* FALLTHROUGH */
 		case TCPS_ESTABLISHED:
 			tcp_state_change(tp, TCPS_CLOSE_WAIT);
 			break;
 
 			/*
 			 * If still in FIN_WAIT_1 STATE FIN has not been
 			 * acked so enter the CLOSING state.
 			 */
 		case TCPS_FIN_WAIT_1:
 			tcp_state_change(tp, TCPS_CLOSING);
 			break;
 
 			/*
 			 * In FIN_WAIT_2 state enter the TIME_WAIT state,
 			 * starting the time-wait timer, turning off the
 			 * other standard timers.
 			 */
 		case TCPS_FIN_WAIT_2:
 			bbr->rc_timer_first = 1;
 			bbr_timer_cancel(bbr,
 			    __LINE__, bbr->r_ctl.rc_rcvtime);
 			INP_WLOCK_ASSERT(tp->t_inpcb);
 			tcp_twstart(tp);
 			/* The only path that returns 1; see the header comment. */
 			return (1);
 		}
 	}
 	/*
 	 * Return any desired output.
 	 */
 	if ((tp->t_flags & TF_ACKNOW) ||
 	    (sbavail(&so->so_snd) > ctf_outstanding(tp))) {
 		bbr->r_wanted_output = 1;
 	}
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 	return (0);
 }
 
 /*
  * Here nothing is really faster, its just that we
  * have broken out the fast-data path also just like
  * the fast-ack. Return 1 if we processed the packet
  * return 0 if you need to take the "slow-path".
  *
  * Every "return (0)" happens before the mbuf or the connection state
  * has been modified.  Once past those checks the mbuf is always
  * consumed (appended to the receive buffer or freed) and 1 is
  * returned.  nxt_pkt is not referenced in this function.
  */
 static int
 bbr_do_fastnewdata(struct mbuf *m, struct tcphdr *th, struct socket *so,
     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
     uint32_t tiwin, int32_t nxt_pkt)
 {
 	uint16_t nsegs;
 	int32_t newsize = 0;	/* automatic sockbuf scaling */
 	struct tcp_bbr *bbr;
 #ifdef NETFLIX_SB_LIMITS
 	u_int mcnt, appended;
 #endif
 #ifdef TCPDEBUG
 	/*
 	 * The size of tcp_saveipgen must be the size of the max ip header,
 	 * now IPv6.
 	 *
 	 * NOTE(review): nothing visible in this function fills in
 	 * tcp_saveipgen or tcp_savetcp before they are handed to
 	 * tcp_trace() below, and ostate is always 0 -- confirm intent.
 	 */
 	u_char tcp_saveipgen[IP6_HDR_LEN];
 	struct tcphdr tcp_savetcp;
 	short ostate = 0;
 
 #endif
 	/* On the hpts and we would have called output */
 	bbr = (struct tcp_bbr *)tp->t_fb_ptr;
 
 	/*
 	 * Qualify the segment for the fast path.  We fall back to the
 	 * slow path (return 0) when a retransmission is pending, when a
 	 * non-zero advertised window differs from snd_wnd, when we still
 	 * owe a SYN or FIN, when the timestamp is older than ts_recent,
 	 * when the segment does not ack exactly snd_una, or when it does
 	 * not fit in the receive buffer.
 	 */
 	if (bbr->r_ctl.rc_resend != NULL) {
 		return (0);
 	}
 	if (tiwin && tiwin != tp->snd_wnd) {
 		return (0);
 	}
 	if (__predict_false((tp->t_flags & (TF_NEEDSYN | TF_NEEDFIN)))) {
 		return (0);
 	}
 	if (__predict_false((to->to_flags & TOF_TS) &&
 	    (TSTMP_LT(to->to_tsval, tp->ts_recent)))) {
 		return (0);
 	}
 	if (__predict_false((th->th_ack != tp->snd_una))) {
 		return (0);
 	}
 	if (__predict_false(tlen > sbspace(&so->so_rcv))) {
 		return (0);
 	}
 	/*
 	 * If last ACK falls within this segment's sequence numbers, record
 	 * the timestamp. NOTE that the test is modified according to the
 	 * latest proposal of the tcplw@cray.com list (Braden 1993/04/26).
 	 */
 	if ((to->to_flags & TOF_TS) != 0 &&
 	    SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
 		tp->ts_recent_age = tcp_tv_to_mssectick(&bbr->rc_tv);
 		tp->ts_recent = to->to_tsval;
 	}
 	/*
 	 * This is a pure, in-sequence data packet with nothing on the
 	 * reassembly queue and we have enough buffer space to take it.
 	 */
 	nsegs = max(1, m->m_pkthdr.lro_nsegs);
 
 #ifdef NETFLIX_SB_LIMITS
 	/*
 	 * mcnt and appended are only assigned, and only read, when the
 	 * socket buffer has a shared limit (sb_shlim) attached.  If the
 	 * reservation fails the packet is dropped but still reported as
 	 * processed.
 	 */
 	if (so->so_rcv.sb_shlim) {
 		mcnt = m_memcnt(m);
 		appended = 0;
 		if (counter_fo_get(so->so_rcv.sb_shlim, mcnt,
 		    CFO_NOSLEEP, NULL) == false) {
 			counter_u64_add(tcp_sb_shlim_fails, 1);
 			m_freem(m);
 			return (1);
 		}
 	}
 #endif
 	/* Clean receiver SACK report if present */
 	if (tp->rcv_numsacks)
 		tcp_clean_sackreport(tp);
 	KMOD_TCPSTAT_INC(tcps_preddat);
 	tp->rcv_nxt += tlen;
 	/*
 	 * First data received: stamp t_fbyte_in with ticks, using 1 when
 	 * ticks is 0 so that 0 keeps meaning "not stamped yet".
 	 */
 	if (tlen &&
 	    ((tp->t_flags2 & TF2_FBYTES_COMPLETE) == 0) &&
 	    (tp->t_fbyte_in == 0)) {
 		tp->t_fbyte_in = ticks;
 		if (tp->t_fbyte_in == 0)
 			tp->t_fbyte_in = 1;
 		if (tp->t_fbyte_out && tp->t_fbyte_in)
 			tp->t_flags2 |= TF2_FBYTES_COMPLETE;
 	}
 	/*
 	 * Pull snd_wl1 up to prevent seq wrap relative to th_seq.
 	 */
 	tp->snd_wl1 = th->th_seq;
 	/*
 	 * Pull rcv_up up to prevent seq wrap relative to rcv_nxt.
 	 */
 	tp->rcv_up = tp->rcv_nxt;
 	KMOD_TCPSTAT_ADD(tcps_rcvpack, (int)nsegs);
 	KMOD_TCPSTAT_ADD(tcps_rcvbyte, tlen);
 #ifdef TCPDEBUG
 	if (so->so_options & SO_DEBUG)
 		tcp_trace(TA_INPUT, ostate, tp,
 		    (void *)tcp_saveipgen, &tcp_savetcp, 0);
 #endif
 	/* A non-zero newsize is a new receive buffer size to try below. */
 	newsize = tcp_autorcvbuf(m, th, so, tp, tlen);
 
 	/* Add data to socket buffer. */
 	SOCKBUF_LOCK(&so->so_rcv);
 	if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
 		m_freem(m);
 	} else {
 		/*
 		 * Set new socket buffer size. Give up when limit is
 		 * reached.
 		 */
 		if (newsize)
 			if (!sbreserve_locked(so, SO_RCV, newsize, NULL))
 				so->so_rcv.sb_flags &= ~SB_AUTOSIZE;
 		m_adj(m, drop_hdrlen);	/* delayed header drop */
 
 #ifdef NETFLIX_SB_LIMITS
 		appended =
 #endif
 			sbappendstream_locked(&so->so_rcv, m, 0);
 		ctf_calc_rwin(so, tp);
 	}
 	/* NB: sorwakeup_locked() does an implicit unlock. */
 	sorwakeup_locked(so);
 #ifdef NETFLIX_SB_LIMITS
 	/* Hand back whatever part of the reservation was not appended. */
 	if (so->so_rcv.sb_shlim && mcnt != appended)
 		counter_fo_release(so->so_rcv.sb_shlim, mcnt - appended);
 #endif
 	/* Either delay the ack or ask for one to be sent right away. */
 	if (DELAY_ACK(tp, bbr, nsegs)) {
 		bbr->bbr_segs_rcvd += max(1, nsegs);
 		tp->t_flags |= TF_DELACK;
 		bbr_timer_cancel(bbr, __LINE__, bbr->r_ctl.rc_rcvtime);
 	} else {
 		bbr->r_wanted_output = 1;
 		tp->t_flags |= TF_ACKNOW;
 	}
 	return (1);
 }
 
 /*
  * This subfunction is used to try to highly optimize the
  * fast path. We again allow window updates that are
  * in sequence to remain in the fast-path. We also add
  * in the __predict's to attempt to help the compiler.
  * Note that if we return a 0, then we can *not* process
  * it and the caller should push the packet into the
  * slow-path. If we return 1, then all is well and
  * the packet is fully processed.
  */
 static int
 bbr_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so,
     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
     uint32_t tiwin, int32_t nxt_pkt, uint8_t iptos)
 {
 	int32_t acked;
 	uint16_t nsegs;
 	uint32_t sack_changed;
 #ifdef TCPDEBUG
 	/*
 	 * The size of tcp_saveipgen must be the size of the max ip header,
 	 * now IPv6.
 	 */
 	u_char tcp_saveipgen[IP6_HDR_LEN];
 	struct tcphdr tcp_savetcp;
 	short ostate = 0;
 
 #endif
 	uint32_t prev_acked = 0;
 	struct tcp_bbr *bbr;
 
 	if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) {
 		/* Old ack, behind (or duplicate to) the last one rcv'd */
 		return (0);
 	}
 	if (__predict_false(SEQ_GT(th->th_ack, tp->snd_max))) {
 		/* Above what we have sent? */
 		return (0);
 	}
 	if (__predict_false(tiwin == 0)) {
 		/* zero window */
 		return (0);
 	}
 	if (__predict_false(tp->t_flags & (TF_NEEDSYN | TF_NEEDFIN))) {
 		/* We need a SYN or a FIN, unlikely.. */
 		return (0);
 	}
 	if ((to->to_flags & TOF_TS) && __predict_false(TSTMP_LT(to->to_tsval, tp->ts_recent))) {
 		/* Timestamp is behind .. old ack with seq wrap? */
 		return (0);
 	}
 	if (__predict_false(IN_RECOVERY(tp->t_flags))) {
 		/* Still recovering */
 		return (0);
 	}
 	bbr = (struct tcp_bbr *)tp->t_fb_ptr;
 	if (__predict_false(bbr->r_ctl.rc_resend != NULL)) {
 		/* We are retransmitting */
 		return (0);
 	}
 	if (__predict_false(bbr->rc_in_persist != 0)) {
 		/* In persist mode */
 		return (0);
 	}
 	if (bbr->r_ctl.rc_sacked) {
 		/* We have sack holes on our scoreboard */
 		return (0);
 	}
 	/* Ok if we reach here, we can process a fast-ack */
 	nsegs = max(1, m->m_pkthdr.lro_nsegs);
 	sack_changed = bbr_log_ack(tp, to, th, &prev_acked);
 	/*
 	 * We never detect loss in fast ack [we can't
 	 * have a sack and can't be in recovery so
 	 * we always pass 0 (nothing detected)].
 	 */
 	bbr_lt_bw_sampling(bbr, bbr->r_ctl.rc_rcvtime, 0);
 	/* Did the window get updated? */
 	if (tiwin != tp->snd_wnd) {
 		tp->snd_wnd = tiwin;
 		tp->snd_wl1 = th->th_seq;
 		if (tp->snd_wnd > tp->max_sndwnd)
 			tp->max_sndwnd = tp->snd_wnd;
 	}
 	/* Do we need to exit persists? */
 	if ((bbr->rc_in_persist != 0) &&
 	    (tp->snd_wnd >= min((bbr->r_ctl.rc_high_rwnd/2),
 			       bbr_minseg(bbr)))) {
 		bbr_exit_persist(tp, bbr, bbr->r_ctl.rc_rcvtime, __LINE__);
 		bbr->r_wanted_output = 1;
 	}
 	/* Do we need to enter persists? */
 	if ((bbr->rc_in_persist == 0) &&
 	    (tp->snd_wnd < min((bbr->r_ctl.rc_high_rwnd/2), bbr_minseg(bbr))) &&
 	    TCPS_HAVEESTABLISHED(tp->t_state) &&
 	    (tp->snd_max == tp->snd_una) &&
 	    sbavail(&tp->t_inpcb->inp_socket->so_snd) &&
 	    (sbavail(&tp->t_inpcb->inp_socket->so_snd) > tp->snd_wnd)) {
 		/* No send window.. we must enter persist */
 		bbr_enter_persist(tp, bbr, bbr->r_ctl.rc_rcvtime, __LINE__);
 	}
 	/*
 	 * If last ACK falls within this segment's sequence numbers, record
 	 * the timestamp. NOTE that the test is modified according to the
 	 * latest proposal of the tcplw@cray.com list (Braden 1993/04/26).
 	 */
 	if ((to->to_flags & TOF_TS) != 0 &&
 	    SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
 		tp->ts_recent_age = bbr->r_ctl.rc_rcvtime;
 		tp->ts_recent = to->to_tsval;
 	}
 	/*
 	 * This is a pure ack for outstanding data.
 	 */
 	KMOD_TCPSTAT_INC(tcps_predack);
 
 	/*
 	 * "bad retransmit" recovery.
 	 */
 	if (tp->t_flags & TF_PREVVALID) {
 		tp->t_flags &= ~TF_PREVVALID;
 		if (tp->t_rxtshift == 1 &&
 		    (int)(ticks - tp->t_badrxtwin) < 0)
 			bbr_cong_signal(tp, th, CC_RTO_ERR, NULL);
 	}
 	/*
 	 * Recalculate the transmit timer / rtt.
 	 *
 	 * Some boxes send broken timestamp replies during the SYN+ACK
 	 * phase, ignore timestamps of 0 or we could calculate a huge RTT
 	 * and blow up the retransmit timer.
 	 */
 	acked = BYTES_THIS_ACK(tp, th);
 
 #ifdef TCP_HHOOK
 	/* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */
 	hhook_run_tcp_est_in(tp, th, to);
 #endif
 
 	KMOD_TCPSTAT_ADD(tcps_rcvackpack, (int)nsegs);
 	KMOD_TCPSTAT_ADD(tcps_rcvackbyte, acked);
 	sbdrop(&so->so_snd, acked);
 
 	if (SEQ_GT(th->th_ack, tp->snd_una))
 		bbr_collapse_rtt(tp, bbr, TCP_REXMTVAL(tp));
 	tp->snd_una = th->th_ack;
 	if (tp->snd_wnd < ctf_outstanding(tp))
 		/* The peer collapsed its window on us */
 		bbr_collapsed_window(bbr);
 	else if (bbr->rc_has_collapsed)
 		bbr_un_collapse_window(bbr);
 
 	if (SEQ_GT(tp->snd_una, tp->snd_recover)) {
 		tp->snd_recover = tp->snd_una;
 	}
 	bbr_ack_received(tp, bbr, th, acked, sack_changed, prev_acked, __LINE__, 0);
 	/*
 	 * Pull snd_wl2 up to prevent seq wrap relative to th_ack.
 	 */
 	tp->snd_wl2 = th->th_ack;
 	m_freem(m);
 	/*
 	 * If all outstanding data are acked, stop retransmit timer,
 	 * otherwise restart timer using current (possibly backed-off)
 	 * value. If process is waiting for space, wakeup/selwakeup/signal.
 	 * If data are ready to send, let tcp_output decide between more
 	 * output or persist.
 	 */
 #ifdef TCPDEBUG
 	if (so->so_options & SO_DEBUG)
 		tcp_trace(TA_INPUT, ostate, tp,
 		    (void *)tcp_saveipgen,
 		    &tcp_savetcp, 0);
 #endif
 	/* Wake up the socket if we have room to write more */
 	sowwakeup(so);
 	if (tp->snd_una == tp->snd_max) {
 		/* Nothing left outstanding */
 		bbr_log_progress_event(bbr, tp, ticks, PROGRESS_CLEAR, __LINE__);
 		if (sbavail(&tp->t_inpcb->inp_socket->so_snd) == 0)
 			bbr->rc_tp->t_acktime = 0;
 		bbr_timer_cancel(bbr, __LINE__, bbr->r_ctl.rc_rcvtime);
 		if (bbr->rc_in_persist == 0) {
 			bbr->r_ctl.rc_went_idle_time = bbr->r_ctl.rc_rcvtime;
 		}
 		sack_filter_clear(&bbr->r_ctl.bbr_sf, tp->snd_una);
 		bbr_log_ack_clear(bbr, bbr->r_ctl.rc_rcvtime);
 		/*
 		 * We invalidate the last ack here since we
 		 * don't want to transfer forward the time
 		 * for our sum's calculations.
 		 */
 		bbr->r_wanted_output = 1;
 	}
 	if (sbavail(&so->so_snd)) {
 		bbr->r_wanted_output = 1;
 	}
 	return (1);
 }
 
 /*
  * Process a segment received while the connection is in SYN_SENT.
  *
  * A segment whose ACK does not cover our SYN is answered with a reset,
  * ACK+RST drops the connection with ECONNREFUSED, and a bare RST or a
  * segment without SYN is dropped.  An acceptable SYN+ACK moves the
  * connection to ESTABLISHED (or FIN_WAIT_1 when TF_NEEDFIN is set); a
  * SYN without ACK (simultaneous open) moves it to SYN_RECEIVED.  Any
  * payload is then trimmed to the receive window and the segment is
  * handed to bbr_process_ack() (ACK case only) and bbr_process_data().
  *
  * thflags holds the TCP header flags of the segment and tlen its data
  * length; drop_hdrlen, tiwin and nxt_pkt are passed through to the
  * helpers.  iptos is not referenced by this handler.
  *
  * Return value of 1, the TCB is unlocked and most
  * likely gone, return value of 0, the TCB is still
  * locked.
  */
 static int
 bbr_do_syn_sent(struct mbuf *m, struct tcphdr *th, struct socket *so,
     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
 {
 	int32_t todrop;
 	int32_t ourfinisacked = 0;
 	struct tcp_bbr *bbr;
 	int32_t ret_val = 0;
 
 	bbr = (struct tcp_bbr *)tp->t_fb_ptr;
 	ctf_calc_rwin(so, tp);
 	/*
 	 * If the state is SYN_SENT: if seg contains an ACK, but not for our
 	 * SYN, drop the input. if seg contains a RST, then drop the
 	 * connection. if seg does not contain SYN, then drop it. Otherwise
 	 * this is an acceptable SYN segment initialize tp->rcv_nxt and
 	 * tp->irs if seg contains ack then advance tp->snd_una. BBR does
 	 * not support ECN so we will not say we are capable. if SYN has
 	 * been acked change to ESTABLISHED else SYN_RCVD state arrange for
 	 * segment to be acked (eventually) continue processing rest of
 	 * data/controls, beginning with URG
 	 */
 	if ((thflags & TH_ACK) &&
 	    (SEQ_LEQ(th->th_ack, tp->iss) ||
 	    SEQ_GT(th->th_ack, tp->snd_max))) {
 		/* The ACK is outside (iss, snd_max]: answer with a reset. */
 		tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
 		ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
 		return (1);
 	}
 	if ((thflags & (TH_ACK | TH_RST)) == (TH_ACK | TH_RST)) {
 		/* Acceptable ACK together with RST: connection refused. */
 		TCP_PROBE5(connect__refused, NULL, tp,
 		    mtod(m, const char *), tp, th);
 		tp = tcp_drop(tp, ECONNREFUSED);
 		ctf_do_drop(m, tp);
 		return (1);
 	}
 	if (thflags & TH_RST) {
 		/* RST without ACK: drop the segment. */
 		ctf_do_drop(m, tp);
 		return (1);
 	}
 	if (!(thflags & TH_SYN)) {
 		/* No SYN: nothing usable in this state, drop the segment. */
 		ctf_do_drop(m, tp);
 		return (1);
 	}
 	/* Acceptable SYN: record the peer's initial sequence number. */
 	tp->irs = th->th_seq;
 	tcp_rcvseqinit(tp);
 	if (thflags & TH_ACK) {
 		/* Set when part of the TFO SYN data still has to be resent. */
 		int tfo_partial = 0;
 
 		KMOD_TCPSTAT_INC(tcps_connects);
 		soisconnected(so);
 #ifdef MAC
 		mac_socketpeer_set_from_mbuf(m, so);
 #endif
 		/* Do window scaling on this connection? */
 		if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) ==
 		    (TF_RCVD_SCALE | TF_REQ_SCALE)) {
 			tp->rcv_scale = tp->request_r_scale;
 		}
 		tp->rcv_adv += min(tp->rcv_wnd,
 		    TCP_MAXWIN << tp->rcv_scale);
 		/*
 		 * If not all the data that was sent in the TFO SYN
 		 * has been acked, resend the remainder right away.
 		 */
 		if (IS_FASTOPEN(tp->t_flags) &&
 		    (tp->snd_una != tp->snd_max)) {
 			tp->snd_nxt = th->th_ack;
 			tfo_partial = 1;
 		}
 		/*
 		 * If there's data, delay ACK; if there's also a FIN ACKNOW
 		 * will be turned on later.
 		 */
 		if (DELAY_ACK(tp, bbr, 1) && tlen != 0 && !tfo_partial) {
 			bbr->bbr_segs_rcvd += 1;
 			tp->t_flags |= TF_DELACK;
 			bbr_timer_cancel(bbr, __LINE__, bbr->r_ctl.rc_rcvtime);
 		} else {
 			bbr->r_wanted_output = 1;
 			tp->t_flags |= TF_ACKNOW;
 		}
 		if (SEQ_GT(th->th_ack, tp->iss)) {
 			/*
 			 * The SYN is acked
 			 * handle it specially.
 			 */
 			bbr_log_syn(tp, to);
 		}
 		if (SEQ_GT(th->th_ack, tp->snd_una)) {
 			/*
 			 * We advance snd_una for the
 			 * fast open case. If th_ack is
 			 * acknowledging data beyond
 			 * snd_una we can't just call
 			 * ack-processing since the
 			 * data stream in our send-map
 			 * will start at snd_una + 1 (one
 			 * beyond the SYN). If its just
 			 * equal we don't need to do that
 			 * and there is no send_map.
 			 */
 			tp->snd_una++;
 		}
 		/*
 		 * Received <SYN,ACK> in SYN_SENT[*] state. Transitions:
 		 * SYN_SENT  --> ESTABLISHED SYN_SENT* --> FIN_WAIT_1
 		 */
 		tp->t_starttime = ticks;
 		if (tp->t_flags & TF_NEEDFIN) {
 			tcp_state_change(tp, TCPS_FIN_WAIT_1);
 			tp->t_flags &= ~TF_NEEDFIN;
 			thflags &= ~TH_SYN;
 		} else {
 			tcp_state_change(tp, TCPS_ESTABLISHED);
 			TCP_PROBE5(connect__established, NULL, tp,
 			    mtod(m, const char *), tp, th);
 			cc_conn_init(tp);
 		}
 	} else {
 		/*
 		 * Received initial SYN in SYN-SENT[*] state => simultaneous
 		 * open.  If segment contains CC option and there is a
 		 * cached CC, apply TAO test. If it succeeds, connection is *
 		 * half-synchronized. Otherwise, do 3-way handshake:
 		 * SYN-SENT -> SYN-RECEIVED SYN-SENT* -> SYN-RECEIVED* If
 		 * there was no CC option, clear cached CC value.
 		 */
 		tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN | TF_SONOTCONN);
 		tcp_state_change(tp, TCPS_SYN_RECEIVED);
 	}
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 	/*
 	 * Advance th->th_seq to correspond to first data byte. If data,
 	 * trim to stay within window, dropping FIN if necessary.
 	 */
 	th->th_seq++;
 	if (tlen > tp->rcv_wnd) {
 		/* todrop is the number of bytes beyond the receive window. */
 		todrop = tlen - tp->rcv_wnd;
 		m_adj(m, -todrop);
 		tlen = tp->rcv_wnd;
 		thflags &= ~TH_FIN;
 		KMOD_TCPSTAT_INC(tcps_rcvpackafterwin);
 		KMOD_TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop);
 	}
 	tp->snd_wl1 = th->th_seq - 1;
 	tp->rcv_up = th->th_seq;
 	/*
 	 * Client side of transaction: already sent SYN and data. If the
 	 * remote host used T/TCP to validate the SYN, our data will be
 	 * ACK'd; if so, enter normal data segment processing in the middle
 	 * of step 5, ack processing. Otherwise, goto step 6.
 	 */
 	if (thflags & TH_ACK) {
 		if ((to->to_flags & TOF_TS) != 0) {
 			uint32_t t, rtt;
 
 			/*
 			 * Take an RTT sample from the echoed timestamp:
 			 * the difference between our current timestamp
 			 * and to_tsecr, rounded up to at least 1 and
 			 * scaled by MS_IN_USEC.
 			 */
 			t = tcp_tv_to_mssectick(&bbr->rc_tv);
 			if (TSTMP_GEQ(t, to->to_tsecr)) {
 				rtt = t - to->to_tsecr;
 				if (rtt == 0) {
 					rtt = 1;
 				}
 				rtt *= MS_IN_USEC;
 				tcp_bbr_xmit_timer(bbr, rtt, 0, 0, 0);
 				apply_filter_min_small(&bbr->r_ctl.rc_rttprop,
 						       rtt, bbr->r_ctl.rc_rcvtime);
 			}
 		}
 		/* A non-zero return means ret_val is the handler's result. */
 		if (bbr_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val))
 			return (ret_val);
 		/* We may have changed to FIN_WAIT_1 above */
 		if (tp->t_state == TCPS_FIN_WAIT_1) {
 			/*
 			 * In FIN_WAIT_1 STATE in addition to the processing
 			 * for the ESTABLISHED state if our FIN is now
 			 * acknowledged then enter FIN_WAIT_2.
 			 */
 			if (ourfinisacked) {
 				/*
 				 * If we can't receive any more data, then
 				 * closing user can proceed. Starting the
 				 * timer is contrary to the specification,
 				 * but if we don't get a FIN we'll hang
 				 * forever.
 				 *
 				 * XXXjl: we should release the tp also, and
 				 * use a compressed state.
 				 */
 				if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
 					soisdisconnected(so);
 					tcp_timer_activate(tp, TT_2MSL,
 					    (tcp_fast_finwait2_recycle ?
 					    tcp_finwait2_timeout :
 					    TP_MAXIDLE(tp)));
 				}
 				tcp_state_change(tp, TCPS_FIN_WAIT_2);
 			}
 		}
 	}
 	return (bbr_process_data(m, th, so, tp, drop_hdrlen, tlen,
 	    tiwin, thflags, nxt_pkt));
 }
 
 /*
  * Process a segment received while the connection is in SYN_RECEIVED.
  *
  * The segment is validated first (ACK range, TFO-specific flag
  * combinations, RST handling, PAWS, sequence number against irs and
  * ctf_drop_checks()).  A segment without ACK is passed straight to
  * bbr_process_data().  A segment with an acceptable ACK completes the
  * handshake: the connection moves to ESTABLISHED (or FIN_WAIT_1 when
  * TF_NEEDFIN is set), an RTT sample is taken from the timestamp echo
  * if present, and the segment continues through bbr_process_ack() and
  * bbr_process_data().
  *
  * thflags holds the TCP header flags of the segment and tlen its data
  * length; tiwin is stored into tp->snd_wnd.  iptos is not referenced
  * by this handler.
  *
  * Return value of 1, the TCB is unlocked and most
  * likely gone, return value of 0, the TCB is still
  * locked.
  */
 static int
 bbr_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so,
 		struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
 		uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
 {
 	int32_t ourfinisacked = 0;
 	int32_t ret_val;
 	struct tcp_bbr *bbr;
 
 	bbr = (struct tcp_bbr *)tp->t_fb_ptr;
 	ctf_calc_rwin(so, tp);
 	/* An ACK outside (snd_una, snd_max] is answered with a reset. */
 	if ((thflags & TH_ACK) &&
 	    (SEQ_LEQ(th->th_ack, tp->snd_una) ||
 	     SEQ_GT(th->th_ack, tp->snd_max))) {
 		tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
 		ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
 		return (1);
 	}
 	if (IS_FASTOPEN(tp->t_flags)) {
 		/*
 		 * When a TFO connection is in SYN_RECEIVED, the only valid
 		 * packets are the initial SYN, a retransmit/copy of the
 		 * initial SYN (possibly with a subset of the original
 		 * data), a valid ACK, a FIN, or a RST.
 		 */
 		if ((thflags & (TH_SYN | TH_ACK)) == (TH_SYN | TH_ACK)) {
 			tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
 			ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
 			return (1);
 		} else if (thflags & TH_SYN) {
 			/* non-initial SYN is ignored */
 			if ((bbr->r_ctl.rc_hpts_flags & PACE_TMR_RXT) ||
 			    (bbr->r_ctl.rc_hpts_flags & PACE_TMR_TLP) ||
 			    (bbr->r_ctl.rc_hpts_flags & PACE_TMR_RACK)) {
 				ctf_do_drop(m, NULL);
 				return (0);
 			}
 		} else if (!(thflags & (TH_ACK | TH_FIN | TH_RST))) {
 			/* None of ACK, FIN or RST is set: drop the segment. */
 			ctf_do_drop(m, NULL);
 			return (0);
 		}
 	}
 	/* RST, or FIN when t_fin_is_rst is set, goes to the RST handler. */
 	if ((thflags & TH_RST) ||
 	    (tp->t_fin_is_rst && (thflags & TH_FIN)))
 		return (ctf_process_rst(m, th, so, tp));
 	/*
 	 * RFC 1323 PAWS: If we have a timestamp reply on this segment and
 	 * it's less than ts_recent, drop it.
 	 */
 	if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
 	    TSTMP_LT(to->to_tsval, tp->ts_recent)) {
 		if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val))
 			return (ret_val);
 	}
 	/*
 	 * In the SYN-RECEIVED state, validate that the packet belongs to
 	 * this connection before trimming the data to fit the receive
 	 * window.  Check the sequence number versus IRS since we know the
 	 * sequence numbers haven't wrapped.  This is a partial fix for the
 	 * "LAND" DoS attack.
 	 */
 	if (SEQ_LT(th->th_seq, tp->irs)) {
 		tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
 		ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
 		return (1);
 	}
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 	/* May adjust tlen, thflags and drop_hdrlen through the pointers. */
 	if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
 		return (ret_val);
 	}
 	/*
 	 * If last ACK falls within this segment's sequence numbers, record
 	 * its timestamp. NOTE: 1) That the test incorporates suggestions
 	 * from the latest proposal of the tcplw@cray.com list (Braden
 	 * 1993/04/26). 2) That updating only on newer timestamps interferes
 	 * with our earlier PAWS tests, so this check should be solely
 	 * predicated on the sequence space of this segment. 3) That we
 	 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
 	 * + SEG.Len  instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
 	 * SEG.Len, This modified check allows us to overcome RFC1323's
 	 * limitations as described in Stevens TCP/IP Illustrated Vol. 2
 	 * p.869. In such cases, we can still calculate the RTT correctly
 	 * when RCV.NXT == Last.ACK.Sent.
 	 */
 	if ((to->to_flags & TOF_TS) != 0 &&
 	    SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
 	    SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
 		    ((thflags & (TH_SYN | TH_FIN)) != 0))) {
 		tp->ts_recent_age = tcp_tv_to_mssectick(&bbr->rc_tv);
 		tp->ts_recent = to->to_tsval;
 	}
 	tp->snd_wnd = tiwin;
 	/*
 	 * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN flag
 	 * is on (half-synchronized state), then queue data for later
 	 * processing; else drop segment and return.
 	 */
 	if ((thflags & TH_ACK) == 0) {
 		if (IS_FASTOPEN(tp->t_flags)) {
 			cc_conn_init(tp);
 		}
 		return (bbr_process_data(m, th, so, tp, drop_hdrlen, tlen,
 					 tiwin, thflags, nxt_pkt));
 	}
 	/* From here on the segment carries an acceptable ACK. */
 	KMOD_TCPSTAT_INC(tcps_connects);
 	if (tp->t_flags & TF_SONOTCONN) {
 		tp->t_flags &= ~TF_SONOTCONN;
 		soisconnected(so);
 	}
 	/* Do window scaling? */
 	if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) ==
 	    (TF_RCVD_SCALE | TF_REQ_SCALE)) {
 		tp->rcv_scale = tp->request_r_scale;
 	}
 	/*
 	 * ok for the first time in lets see if we can use the ts to figure
 	 * out what the initial RTT was.
 	 */
 	if ((to->to_flags & TOF_TS) != 0) {
 		uint32_t t, rtt;
 
 		t = tcp_tv_to_mssectick(&bbr->rc_tv);
 		if (TSTMP_GEQ(t, to->to_tsecr)) {
 			rtt = t - to->to_tsecr;
 			/* Never feed a zero sample into the estimator. */
 			if (rtt == 0) {
 				rtt = 1;
 			}
 			rtt *= MS_IN_USEC;
 			tcp_bbr_xmit_timer(bbr, rtt, 0, 0, 0);
 			apply_filter_min_small(&bbr->r_ctl.rc_rttprop, rtt, bbr->r_ctl.rc_rcvtime);
 		}
 	}
 	/* Drop off any SYN in the send map (probably not there)  */
 	/*
 	 * NOTE(review): TH_ACK is always set here, because the no-ACK case
 	 * returned above; the test below is redundant.
 	 */
 	if (thflags & TH_ACK)
 		bbr_log_syn(tp, to);
 	if (IS_FASTOPEN(tp->t_flags) && tp->t_tfo_pending) {
 		tcp_fastopen_decrement_counter(tp->t_tfo_pending);
 		tp->t_tfo_pending = NULL;
 	}
 	/*
 	 * Make transitions: SYN-RECEIVED  -> ESTABLISHED SYN-RECEIVED* ->
 	 * FIN-WAIT-1
 	 */
 	tp->t_starttime = ticks;
 	if (tp->t_flags & TF_NEEDFIN) {
 		tcp_state_change(tp, TCPS_FIN_WAIT_1);
 		tp->t_flags &= ~TF_NEEDFIN;
 	} else {
 		tcp_state_change(tp, TCPS_ESTABLISHED);
 		TCP_PROBE5(accept__established, NULL, tp,
 			   mtod(m, const char *), tp, th);
 		/*
 		 * TFO connections call cc_conn_init() during SYN
 		 * processing.  Calling it again here for such connections
 		 * is not harmless as it would undo the snd_cwnd reduction
 		 * that occurs when a TFO SYN|ACK is retransmitted.
 		 */
 		if (!IS_FASTOPEN(tp->t_flags))
 			cc_conn_init(tp);
 	}
 	/*
 	 * Account for the ACK of our SYN prior to
 	 * regular ACK processing below, except for
 	 * simultaneous SYN, which is handled later.
 	 */
 	if (SEQ_GT(th->th_ack, tp->snd_una) && !(tp->t_flags & TF_NEEDSYN))
 		tp->snd_una++;
 	/*
 	 * If segment contains data or ACK, will call tcp_reass() later; if
 	 * not, do so now to pass queued data to user.
 	 */
 	if (tlen == 0 && (thflags & TH_FIN) == 0) {
 		(void)tcp_reass(tp, (struct tcphdr *)0, NULL, 0,
 			(struct mbuf *)0);
 		if (tp->t_flags & TF_WAKESOR) {
 			tp->t_flags &= ~TF_WAKESOR;
 			/* NB: sorwakeup_locked() does an implicit unlock. */
 			sorwakeup_locked(so);
 		}
 	}
 	tp->snd_wl1 = th->th_seq - 1;
 	/* A non-zero return means ret_val is the handler's result. */
 	if (bbr_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) {
 		return (ret_val);
 	}
 	if (tp->t_state == TCPS_FIN_WAIT_1) {
 		/* We could have went to FIN_WAIT_1 (or EST) above */
 		/*
 		 * In FIN_WAIT_1 STATE in addition to the processing for the
 		 * ESTABLISHED state if our FIN is now acknowledged then
 		 * enter FIN_WAIT_2.
 		 */
 		if (ourfinisacked) {
 			/*
 			 * If we can't receive any more data, then closing
 			 * user can proceed. Starting the timer is contrary
 			 * to the specification, but if we don't get a FIN
 			 * we'll hang forever.
 			 *
 			 * XXXjl: we should release the tp also, and use a
 			 * compressed state.
 			 */
 			if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
 				soisdisconnected(so);
 				tcp_timer_activate(tp, TT_2MSL,
 						   (tcp_fast_finwait2_recycle ?
 						    tcp_finwait2_timeout :
 						    TP_MAXIDLE(tp)));
 			}
 			tcp_state_change(tp, TCPS_FIN_WAIT_2);
 		}
 	}
 	return (bbr_process_data(m, th, so, tp, drop_hdrlen, tlen,
 				 tiwin, thflags, nxt_pkt));
 }
 
 /*
  * Process a segment received while the connection is in ESTABLISHED.
  *
  * The header-prediction fast paths are tried first: bbr_fastack() for
  * a segment without data and bbr_do_fastnewdata() for one with data.
  * If either consumes the segment this handler returns 0.  Otherwise
  * the segment goes through the regular sequence: RST handling,
  * RFC 5961 challenge ACK for a SYN, PAWS, ctf_drop_checks(), timestamp
  * recording, ACK processing, the progress-timeout check and finally
  * bbr_process_data().
  *
  * thflags holds the TCP header flags of the segment and tlen its data
  * length; drop_hdrlen, tiwin, nxt_pkt and iptos are passed through to
  * the helpers.
  *
  * Return value of 1, the TCB is unlocked and most
  * likely gone, return value of 0, the TCB is still
  * locked.
  */
 static int
 bbr_do_established(struct mbuf *m, struct tcphdr *th, struct socket *so,
     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
 {
 	struct tcp_bbr *bbr;
 	int32_t ret_val;
 
 	/*
 	 * Header prediction: check for the two common cases of a
 	 * uni-directional data xfer.  If the packet has no control flags,
 	 * is in-sequence, the window didn't change and we're not
 	 * retransmitting, it's a candidate.  If the length is zero and the
 	 * ack moved forward, we're the sender side of the xfer.  Just free
 	 * the data acked & wake any higher level process that was blocked
 	 * waiting for space.  If the length is non-zero and the ack didn't
 	 * move, we're the receiver side.  If we're getting packets in-order
 	 * (the reassembly queue is empty), add the data to the socket
 	 * buffer and note that we need a delayed ack. Make sure that the
 	 * hidden state-flags are also off. Since we check for
 	 * TCPS_ESTABLISHED first, it can only be TH_NEEDSYN.
 	 */
 	bbr = (struct tcp_bbr *)tp->t_fb_ptr;
 	if (bbr->r_ctl.rc_delivered < (4 * tp->t_maxseg)) {
 		/*
 		 * If we have delivered under 4 segments increase the initial
 		 * window if raised by the peer. We use this to determine
 		 * dynamic and static rwnd's at the end of a connection.
 		 */
 		bbr->r_ctl.rc_init_rwnd = max(tiwin, tp->snd_wnd);
 	}
 	/*
 	 * Fast-path candidate: no SACK option, ACK is the only one of
 	 * SYN/FIN/RST/URG/ACK set, empty reassembly queue and the segment
 	 * starts exactly at rcv_nxt.
 	 */
 	if (__predict_true(((to->to_flags & TOF_SACK) == 0)) &&
 	    __predict_true((thflags & (TH_SYN | TH_FIN | TH_RST | TH_URG | TH_ACK)) == TH_ACK) &&
 	    __predict_true(SEGQ_EMPTY(tp)) &&
 	    __predict_true(th->th_seq == tp->rcv_nxt)) {
 		if (tlen == 0) {
 			if (bbr_fastack(m, th, so, tp, to, drop_hdrlen, tlen,
 			    tiwin, nxt_pkt, iptos)) {
 				return (0);
 			}
 		} else {
 			if (bbr_do_fastnewdata(m, th, so, tp, to, drop_hdrlen, tlen,
 			    tiwin, nxt_pkt)) {
 				return (0);
 			}
 		}
 	}
 	/* The fast paths declined the segment: take the slow path. */
 	ctf_calc_rwin(so, tp);
 
 	/* RST, or FIN when t_fin_is_rst is set, goes to the RST handler. */
 	if ((thflags & TH_RST) ||
 	    (tp->t_fin_is_rst && (thflags & TH_FIN)))
 		return (ctf_process_rst(m, th, so, tp));
 	/*
 	 * RFC5961 Section 4.2 Send challenge ACK for any SYN in
 	 * synchronized state.
 	 */
 	if (thflags & TH_SYN) {
 		ctf_challenge_ack(m, th, tp, &ret_val);
 		return (ret_val);
 	}
 	/*
 	 * RFC 1323 PAWS: If we have a timestamp reply on this segment and
 	 * it's less than ts_recent, drop it.
 	 */
 	if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
 	    TSTMP_LT(to->to_tsval, tp->ts_recent)) {
 		if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val))
 			return (ret_val);
 	}
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 	/* May adjust tlen, thflags and drop_hdrlen through the pointers. */
 	if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
 		return (ret_val);
 	}
 	/*
 	 * If last ACK falls within this segment's sequence numbers, record
 	 * its timestamp. NOTE: 1) That the test incorporates suggestions
 	 * from the latest proposal of the tcplw@cray.com list (Braden
 	 * 1993/04/26). 2) That updating only on newer timestamps interferes
 	 * with our earlier PAWS tests, so this check should be solely
 	 * predicated on the sequence space of this segment. 3) That we
 	 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
 	 * + SEG.Len  instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
 	 * SEG.Len, This modified check allows us to overcome RFC1323's
 	 * limitations as described in Stevens TCP/IP Illustrated Vol. 2
 	 * p.869. In such cases, we can still calculate the RTT correctly
 	 * when RCV.NXT == Last.ACK.Sent.
 	 */
 	if ((to->to_flags & TOF_TS) != 0 &&
 	    SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
 	    SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
 	    ((thflags & (TH_SYN | TH_FIN)) != 0))) {
 		tp->ts_recent_age = tcp_tv_to_mssectick(&bbr->rc_tv);
 		tp->ts_recent = to->to_tsval;
 	}
 	/*
 	 * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN flag
 	 * is on (half-synchronized state), then queue data for later
 	 * processing; else drop segment and return.
 	 */
 	if ((thflags & TH_ACK) == 0) {
 		if (tp->t_flags & TF_NEEDSYN) {
 			return (bbr_process_data(m, th, so, tp, drop_hdrlen, tlen,
 			    tiwin, thflags, nxt_pkt));
 		} else if (tp->t_flags & TF_ACKNOW) {
 			ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
 			bbr->r_wanted_output = 1;
 			return (ret_val);
 		} else {
 			ctf_do_drop(m, NULL);
 			return (0);
 		}
 	}
 	/*
 	 * Ack processing.
 	 */
 	if (bbr_process_ack(m, th, so, tp, to, tiwin, tlen, NULL, thflags, &ret_val)) {
 		return (ret_val);
 	}
 	/* With data still in the send buffer, enforce the progress timeout. */
 	if (sbavail(&so->so_snd)) {
 		if (ctf_progress_timeout_check(tp, true)) {
 			/*
 			 * Log the drop with the current tick counter
 			 * (ticks).  The similarly named global "tick" is
 			 * the length of a clock tick, not a timestamp.
 			 */
 			bbr_log_progress_event(bbr, tp, ticks, PROGRESS_DROP, __LINE__);
 			ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
 			return (1);
 		}
 	}
 	/* State changes only happen in bbr_process_data() */
 	return (bbr_process_data(m, th, so, tp, drop_hdrlen, tlen,
 	    tiwin, thflags, nxt_pkt));
 }
 
 /*
  * Process a segment received while the connection is in CLOSE_WAIT.
  *
  * The segment goes through RST handling, RFC 5961 challenge ACK for a
  * SYN, PAWS, ctf_drop_checks(), timestamp recording, ACK processing,
  * the progress-timeout check and finally bbr_process_data().
  *
  * thflags holds the TCP header flags of the segment and tlen its data
  * length; drop_hdrlen, tiwin and nxt_pkt are passed through to the
  * helpers.  iptos is not referenced by this handler.
  *
  * Return value of 1, the TCB is unlocked and most
  * likely gone, return value of 0, the TCB is still
  * locked.
  */
 static int
 bbr_do_close_wait(struct mbuf *m, struct tcphdr *th, struct socket *so,
     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
 {
 	struct tcp_bbr *bbr;
 	int32_t ret_val;
 
 	bbr = (struct tcp_bbr *)tp->t_fb_ptr;
 	ctf_calc_rwin(so, tp);
 	/* RST, or FIN when t_fin_is_rst is set, goes to the RST handler. */
 	if ((thflags & TH_RST) ||
 	    (tp->t_fin_is_rst && (thflags & TH_FIN)))
 		return (ctf_process_rst(m, th, so, tp));
 	/*
 	 * RFC5961 Section 4.2 Send challenge ACK for any SYN in
 	 * synchronized state.
 	 */
 	if (thflags & TH_SYN) {
 		ctf_challenge_ack(m, th, tp, &ret_val);
 		return (ret_val);
 	}
 	/*
 	 * RFC 1323 PAWS: If we have a timestamp reply on this segment and
 	 * it's less than ts_recent, drop it.
 	 */
 	if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
 	    TSTMP_LT(to->to_tsval, tp->ts_recent)) {
 		if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val))
 			return (ret_val);
 	}
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 	/* May adjust tlen, thflags and drop_hdrlen through the pointers. */
 	if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
 		return (ret_val);
 	}
 	/*
 	 * If last ACK falls within this segment's sequence numbers, record
 	 * its timestamp. NOTE: 1) That the test incorporates suggestions
 	 * from the latest proposal of the tcplw@cray.com list (Braden
 	 * 1993/04/26). 2) That updating only on newer timestamps interferes
 	 * with our earlier PAWS tests, so this check should be solely
 	 * predicated on the sequence space of this segment. 3) That we
 	 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
 	 * + SEG.Len  instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
 	 * SEG.Len, This modified check allows us to overcome RFC1323's
 	 * limitations as described in Stevens TCP/IP Illustrated Vol. 2
 	 * p.869. In such cases, we can still calculate the RTT correctly
 	 * when RCV.NXT == Last.ACK.Sent.
 	 */
 	if ((to->to_flags & TOF_TS) != 0 &&
 	    SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
 	    SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
 	    ((thflags & (TH_SYN | TH_FIN)) != 0))) {
 		tp->ts_recent_age = tcp_tv_to_mssectick(&bbr->rc_tv);
 		tp->ts_recent = to->to_tsval;
 	}
 	/*
 	 * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN flag
 	 * is on (half-synchronized state), then queue data for later
 	 * processing; else drop segment and return.
 	 */
 	if ((thflags & TH_ACK) == 0) {
 		if (tp->t_flags & TF_NEEDSYN) {
 			return (bbr_process_data(m, th, so, tp, drop_hdrlen, tlen,
 			    tiwin, thflags, nxt_pkt));
 		} else if (tp->t_flags & TF_ACKNOW) {
 			ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
 			bbr->r_wanted_output = 1;
 			return (ret_val);
 		} else {
 			ctf_do_drop(m, NULL);
 			return (0);
 		}
 	}
 	/*
 	 * Ack processing.
 	 */
 	if (bbr_process_ack(m, th, so, tp, to, tiwin, tlen, NULL, thflags, &ret_val)) {
 		return (ret_val);
 	}
 	/* With data still in the send buffer, enforce the progress timeout. */
 	if (sbavail(&so->so_snd)) {
 		if (ctf_progress_timeout_check(tp, true)) {
 			/*
 			 * Log the drop with the current tick counter
 			 * (ticks).  The similarly named global "tick" is
 			 * the length of a clock tick, not a timestamp.
 			 */
 			bbr_log_progress_event(bbr, tp, ticks, PROGRESS_DROP, __LINE__);
 			ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
 			return (1);
 		}
 	}
 	return (bbr_process_data(m, th, so, tp, drop_hdrlen, tlen,
 	    tiwin, thflags, nxt_pkt));
 }
 
 /*
  * Decide what to do with data that arrives after the user has closed
  * the socket.
  *
  * When data-after-close is allowed (rc_allow_data_af_clo != 0) and the
  * send buffer still holds bytes, the incoming data is skipped over:
  * rcv_nxt is advanced past it, TF2_DROP_AF_DATA is set, output is
  * requested, *tlen is zeroed and 0 is returned.  In every other case
  * the connection is closed, the segment is dropped with a reset and 1
  * is returned.
  */
 static int
 bbr_check_data_after_close(struct mbuf *m, struct tcp_bbr *bbr,
     struct tcpcb *tp, int32_t * tlen, struct tcphdr *th, struct socket *so)
 {
 	int32_t seg_len;
 
 	if (bbr->rc_allow_data_af_clo != 0 && sbavail(&so->so_snd) != 0) {
 		/* Ok we allow data that is ignored and a followup reset */
 		seg_len = *tlen;
 		tp->rcv_nxt = th->th_seq + seg_len;
 		tp->t_flags2 |= TF2_DROP_AF_DATA;
 		bbr->r_wanted_output = 1;
 		*tlen = 0;
 		return (0);
 	}
 
 	/* Either the feature is off or nothing is left to send: close now. */
 	tcp_log_end_status(tp, TCP_EI_STATUS_DATA_A_CLOSE);
 	/* tcp_close will kill the inp pre-log the Reset */
 	tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST);
 	tp = tcp_close(tp);
 	KMOD_TCPSTAT_INC(tcps_rcvafterclose);
 	ctf_do_dropwithreset(m, tp, th, BANDLIM_UNLIMITED, (*tlen));
 	return (1);
 }
 
 /*
  * Process a segment received while the connection is in FIN_WAIT_1.
  *
  * The segment goes through RST handling, RFC 5961 challenge ACK for a
  * SYN, PAWS, ctf_drop_checks(), the data-after-close check, timestamp
  * recording and ACK processing.  When the ACK covers our FIN the
  * connection moves to FIN_WAIT_2.  The progress-timeout check and
  * bbr_process_data() follow.
  *
  * thflags holds the TCP header flags of the segment and tlen its data
  * length; drop_hdrlen, tiwin and nxt_pkt are passed through to the
  * helpers.  iptos is not referenced by this handler.
  *
  * Return value of 1, the TCB is unlocked and most
  * likely gone, return value of 0, the TCB is still
  * locked.
  */
 static int
 bbr_do_fin_wait_1(struct mbuf *m, struct tcphdr *th, struct socket *so,
     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
 {
 	int32_t ourfinisacked = 0;
 	int32_t ret_val;
 	struct tcp_bbr *bbr;
 
 	bbr = (struct tcp_bbr *)tp->t_fb_ptr;
 	ctf_calc_rwin(so, tp);
 	/* RST, or FIN when t_fin_is_rst is set, goes to the RST handler. */
 	if ((thflags & TH_RST) ||
 	    (tp->t_fin_is_rst && (thflags & TH_FIN)))
 		return (ctf_process_rst(m, th, so, tp));
 	/*
 	 * RFC5961 Section 4.2 Send challenge ACK for any SYN in
 	 * synchronized state.
 	 */
 	if (thflags & TH_SYN) {
 		ctf_challenge_ack(m, th, tp, &ret_val);
 		return (ret_val);
 	}
 	/*
 	 * RFC 1323 PAWS: If we have a timestamp reply on this segment and
 	 * it's less than ts_recent, drop it.
 	 */
 	if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
 	    TSTMP_LT(to->to_tsval, tp->ts_recent)) {
 		if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val))
 			return (ret_val);
 	}
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 	/* May adjust tlen, thflags and drop_hdrlen through the pointers. */
 	if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
 		return (ret_val);
 	}
 	/*
 	 * If new data are received on a connection after the user processes
 	 * are gone, then RST the other end.
 	 * We call a new function now so we might continue and setup
 	 * to reset at all data being ack'd.
 	 */
 	if ((tp->t_flags & TF_CLOSED) && tlen &&
 	    bbr_check_data_after_close(m, bbr, tp, &tlen, th, so))
 		return (1);
 	/*
 	 * If last ACK falls within this segment's sequence numbers, record
 	 * its timestamp. NOTE: 1) That the test incorporates suggestions
 	 * from the latest proposal of the tcplw@cray.com list (Braden
 	 * 1993/04/26). 2) That updating only on newer timestamps interferes
 	 * with our earlier PAWS tests, so this check should be solely
 	 * predicated on the sequence space of this segment. 3) That we
 	 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
 	 * + SEG.Len  instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
 	 * SEG.Len, This modified check allows us to overcome RFC1323's
 	 * limitations as described in Stevens TCP/IP Illustrated Vol. 2
 	 * p.869. In such cases, we can still calculate the RTT correctly
 	 * when RCV.NXT == Last.ACK.Sent.
 	 */
 	if ((to->to_flags & TOF_TS) != 0 &&
 	    SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
 	    SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
 	    ((thflags & (TH_SYN | TH_FIN)) != 0))) {
 		tp->ts_recent_age = tcp_tv_to_mssectick(&bbr->rc_tv);
 		tp->ts_recent = to->to_tsval;
 	}
 	/*
 	 * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN flag
 	 * is on (half-synchronized state), then queue data for later
 	 * processing; else drop segment and return.
 	 */
 	if ((thflags & TH_ACK) == 0) {
 		if (tp->t_flags & TF_NEEDSYN) {
 			return (bbr_process_data(m, th, so, tp, drop_hdrlen, tlen,
 			    tiwin, thflags, nxt_pkt));
 		} else if (tp->t_flags & TF_ACKNOW) {
 			ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
 			bbr->r_wanted_output = 1;
 			return (ret_val);
 		} else {
 			ctf_do_drop(m, NULL);
 			return (0);
 		}
 	}
 	/*
 	 * Ack processing.
 	 */
 	if (bbr_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) {
 		return (ret_val);
 	}
 	if (ourfinisacked) {
 		/*
 		 * If we can't receive any more data, then closing user can
 		 * proceed. Starting the timer is contrary to the
 		 * specification, but if we don't get a FIN we'll hang
 		 * forever.
 		 *
 		 * XXXjl: we should release the tp also, and use a
 		 * compressed state.
 		 */
 		if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
 			soisdisconnected(so);
 			tcp_timer_activate(tp, TT_2MSL,
 			    (tcp_fast_finwait2_recycle ?
 			    tcp_finwait2_timeout :
 			    TP_MAXIDLE(tp)));
 		}
 		tcp_state_change(tp, TCPS_FIN_WAIT_2);
 	}
 	/* With data still in the send buffer, enforce the progress timeout. */
 	if (sbavail(&so->so_snd)) {
 		if (ctf_progress_timeout_check(tp, true)) {
 			/*
 			 * Log the drop with the current tick counter
 			 * (ticks).  The similarly named global "tick" is
 			 * the length of a clock tick, not a timestamp.
 			 */
 			bbr_log_progress_event(bbr, tp, ticks, PROGRESS_DROP, __LINE__);
 			ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
 			return (1);
 		}
 	}
 	return (bbr_process_data(m, th, so, tp, drop_hdrlen, tlen,
 	    tiwin, thflags, nxt_pkt));
 }
 
 /*
  * Process a segment received while the connection is in CLOSING.
  *
  * The segment goes through RST handling, RFC 5961 challenge ACK for a
  * SYN, PAWS, ctf_drop_checks(), the data-after-close check, timestamp
  * recording and ACK processing.  When the ACK covers our FIN,
  * tcp_twstart() is called, the mbuf is freed and 1 is returned.
  * Otherwise the progress-timeout check and bbr_process_data() follow.
  *
  * thflags holds the TCP header flags of the segment and tlen its data
  * length; drop_hdrlen, tiwin and nxt_pkt are passed through to the
  * helpers.  iptos is not referenced by this handler.
  *
  * Return value of 1, the TCB is unlocked and most
  * likely gone, return value of 0, the TCB is still
  * locked.
  */
 static int
 bbr_do_closing(struct mbuf *m, struct tcphdr *th, struct socket *so,
     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
 {
 	int32_t ourfinisacked = 0;
 	int32_t ret_val;
 	struct tcp_bbr *bbr;
 
 	bbr = (struct tcp_bbr *)tp->t_fb_ptr;
 	ctf_calc_rwin(so, tp);
 	/* RST, or FIN when t_fin_is_rst is set, goes to the RST handler. */
 	if ((thflags & TH_RST) ||
 	    (tp->t_fin_is_rst && (thflags & TH_FIN)))
 		return (ctf_process_rst(m, th, so, tp));
 	/*
 	 * RFC5961 Section 4.2 Send challenge ACK for any SYN in
 	 * synchronized state.
 	 */
 	if (thflags & TH_SYN) {
 		ctf_challenge_ack(m, th, tp, &ret_val);
 		return (ret_val);
 	}
 	/*
 	 * RFC 1323 PAWS: If we have a timestamp reply on this segment and
 	 * it's less than ts_recent, drop it.
 	 */
 	if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
 	    TSTMP_LT(to->to_tsval, tp->ts_recent)) {
 		if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val))
 			return (ret_val);
 	}
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 	/* May adjust tlen, thflags and drop_hdrlen through the pointers. */
 	if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
 		return (ret_val);
 	}
 	/*
 	 * If new data are received on a connection after the user processes
 	 * are gone, then RST the other end.
 	 * We call a new function now so we might continue and setup
 	 * to reset at all data being ack'd.
 	 */
 	if ((tp->t_flags & TF_CLOSED) && tlen &&
 	    bbr_check_data_after_close(m, bbr, tp, &tlen, th, so))
 		return (1);
 	/*
 	 * If last ACK falls within this segment's sequence numbers, record
 	 * its timestamp. NOTE: 1) That the test incorporates suggestions
 	 * from the latest proposal of the tcplw@cray.com list (Braden
 	 * 1993/04/26). 2) That updating only on newer timestamps interferes
 	 * with our earlier PAWS tests, so this check should be solely
 	 * predicated on the sequence space of this segment. 3) That we
 	 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
 	 * + SEG.Len  instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
 	 * SEG.Len, This modified check allows us to overcome RFC1323's
 	 * limitations as described in Stevens TCP/IP Illustrated Vol. 2
 	 * p.869. In such cases, we can still calculate the RTT correctly
 	 * when RCV.NXT == Last.ACK.Sent.
 	 */
 	if ((to->to_flags & TOF_TS) != 0 &&
 	    SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
 	    SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
 	    ((thflags & (TH_SYN | TH_FIN)) != 0))) {
 		tp->ts_recent_age = tcp_tv_to_mssectick(&bbr->rc_tv);
 		tp->ts_recent = to->to_tsval;
 	}
 	/*
 	 * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN flag
 	 * is on (half-synchronized state), then queue data for later
 	 * processing; else drop segment and return.
 	 */
 	if ((thflags & TH_ACK) == 0) {
 		if (tp->t_flags & TF_NEEDSYN) {
 			return (bbr_process_data(m, th, so, tp, drop_hdrlen, tlen,
 			    tiwin, thflags, nxt_pkt));
 		} else if (tp->t_flags & TF_ACKNOW) {
 			ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
 			bbr->r_wanted_output = 1;
 			return (ret_val);
 		} else {
 			ctf_do_drop(m, NULL);
 			return (0);
 		}
 	}
 	/*
 	 * Ack processing.
 	 */
 	if (bbr_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) {
 		return (ret_val);
 	}
 	/* Our FIN is acknowledged: start time-wait and consume the mbuf. */
 	if (ourfinisacked) {
 		tcp_twstart(tp);
 		m_freem(m);
 		return (1);
 	}
 	/* With data still in the send buffer, enforce the progress timeout. */
 	if (sbavail(&so->so_snd)) {
 		if (ctf_progress_timeout_check(tp, true)) {
 			/*
 			 * Log the drop with the current tick counter
 			 * (ticks).  The similarly named global "tick" is
 			 * the length of a clock tick, not a timestamp.
 			 */
 			bbr_log_progress_event(bbr, tp, ticks, PROGRESS_DROP, __LINE__);
 			ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
 			return (1);
 		}
 	}
 	return (bbr_process_data(m, th, so, tp, drop_hdrlen, tlen,
 	    tiwin, thflags, nxt_pkt));
 }
 
 /*
  * Segment handler for the LAST_ACK state.
  *
  * Returns 1 when the TCB has been unlocked (and is most likely
  * gone); returns 0 when the TCB is still locked.
  */
 static int
 bbr_do_lastack(struct mbuf *m, struct tcphdr *th, struct socket *so,
     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
 {
 	struct tcp_bbr *bbr = (struct tcp_bbr *)tp->t_fb_ptr;
 	int32_t fin_acked = 0;
 	int32_t rv;
 
 	ctf_calc_rwin(so, tp);
 
 	/* A RST, or a FIN on a connection that treats FIN as RST. */
 	if ((thflags & TH_RST) != 0 ||
 	    (tp->t_fin_is_rst && (thflags & TH_FIN) != 0))
 		return (ctf_process_rst(m, th, so, tp));
 
 	/*
 	 * RFC5961 Section 4.2: a SYN arriving in a synchronized state
 	 * is answered with a challenge ACK.
 	 */
 	if ((thflags & TH_SYN) != 0) {
 		ctf_challenge_ack(m, th, tp, &rv);
 		return (rv);
 	}
 
 	/*
 	 * RFC 1323 PAWS: a segment whose timestamp is older than
 	 * ts_recent goes through the timestamp check, which may tell
 	 * us to stop processing it.
 	 */
 	if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
 	    TSTMP_LT(to->to_tsval, tp->ts_recent) &&
 	    ctf_ts_check(m, th, tp, tlen, thflags, &rv))
 		return (rv);
 
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 	if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &rv))
 		return (rv);
 
 	/*
 	 * Data showing up after the user side has closed is handed to
 	 * bbr_check_data_after_close(); a non-zero answer from it ends
 	 * processing of this segment.
 	 */
 	if ((tp->t_flags & TF_CLOSED) != 0 && tlen != 0 &&
 	    bbr_check_data_after_close(m, bbr, tp, &tlen, th, so))
 		return (1);
 
 	/*
 	 * Record the peer's timestamp when the last ACK we sent lies
 	 * inside this segment.  The upper bound is deliberately
 	 * Last.ACK.Sent <= SEG.SEQ + SEG.LEN (not the strict '<' of
 	 * RFC1323), see Stevens TCP/IP Illustrated Vol. 2 p.869, so the
 	 * RTT can still be computed when RCV.NXT == Last.ACK.Sent.
 	 * The test depends only on sequence space so that it does not
 	 * interfere with the PAWS check above.
 	 */
 	if ((to->to_flags & TOF_TS) != 0 &&
 	    SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
 	    SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
 	    ((thflags & (TH_SYN | TH_FIN)) != 0))) {
 		tp->ts_recent = to->to_tsval;
 		tp->ts_recent_age = tcp_tv_to_mssectick(&bbr->rc_tv);
 	}
 
 	/*
 	 * No ACK bit: in the half-synchronized (NEEDSYN) case the data
 	 * is still processed; otherwise the segment is dropped, with an
 	 * ACK going out first when one is already owed.
 	 */
 	if ((thflags & TH_ACK) == 0) {
 		if ((tp->t_flags & TF_NEEDSYN) != 0)
 			return (bbr_process_data(m, th, so, tp, drop_hdrlen,
 			    tlen, tiwin, thflags, nxt_pkt));
 		if ((tp->t_flags & TF_ACKNOW) == 0) {
 			ctf_do_drop(m, NULL);
 			return (0);
 		}
 		ctf_do_dropafterack(m, tp, th, thflags, tlen, &rv);
 		bbr->r_wanted_output = 1;
 		return (rv);
 	}
 
 	/* TCPS_LAST_ACK ACK processing. */
 	if (bbr_process_ack(m, th, so, tp, to, tiwin, tlen, &fin_acked,
 	    thflags, &rv))
 		return (rv);
 
 	/* Our FIN has been acknowledged: the connection is closed. */
 	if (fin_acked) {
 		tp = tcp_close(tp);
 		ctf_do_drop(m, tp);
 		return (1);
 	}
 
 	/* Unsent data and no forward progress: reset the connection. */
 	if (sbavail(&so->so_snd) && ctf_progress_timeout_check(tp, true)) {
 		bbr_log_progress_event(bbr, tp, tick, PROGRESS_DROP, __LINE__);
 		ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
 		return (1);
 	}
 
 	return (bbr_process_data(m, th, so, tp, drop_hdrlen, tlen,
 	    tiwin, thflags, nxt_pkt));
 }
 
 /*
  * Segment handler for the FIN_WAIT_2 state.
  *
  * Returns 1 when the TCB has been unlocked (and is most likely
  * gone); returns 0 when the TCB is still locked.
  */
 static int
 bbr_do_fin_wait_2(struct mbuf *m, struct tcphdr *th, struct socket *so,
     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
 {
 	struct tcp_bbr *bbr = (struct tcp_bbr *)tp->t_fb_ptr;
 	int32_t fin_acked = 0;
 	int32_t rv;
 
 	ctf_calc_rwin(so, tp);
 
 	/* A RST, or a FIN on a connection that treats FIN as RST. */
 	if ((thflags & TH_RST) != 0 ||
 	    (tp->t_fin_is_rst && (thflags & TH_FIN) != 0))
 		return (ctf_process_rst(m, th, so, tp));
 
 	/*
 	 * RFC5961 Section 4.2: a SYN arriving in a synchronized state
 	 * is answered with a challenge ACK.
 	 */
 	if ((thflags & TH_SYN) != 0) {
 		ctf_challenge_ack(m, th, tp, &rv);
 		return (rv);
 	}
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 
 	/*
 	 * RFC 1323 PAWS: a segment whose timestamp is older than
 	 * ts_recent goes through the timestamp check, which may tell
 	 * us to stop processing it.
 	 */
 	if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
 	    TSTMP_LT(to->to_tsval, tp->ts_recent) &&
 	    ctf_ts_check(m, th, tp, tlen, thflags, &rv))
 		return (rv);
 
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 	if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &rv))
 		return (rv);
 
 	/*
 	 * Data showing up after the user side has closed is handed to
 	 * bbr_check_data_after_close(), which decides whether the peer
 	 * is reset; a non-zero answer from it ends processing of this
 	 * segment.
 	 */
 	if ((tp->t_flags & TF_CLOSED) != 0 && tlen != 0 &&
 	    bbr_check_data_after_close(m, bbr, tp, &tlen, th, so))
 		return (1);
 
 	/*
 	 * Record the peer's timestamp when the last ACK we sent lies
 	 * inside this segment.  The upper bound is deliberately
 	 * Last.ACK.Sent <= SEG.SEQ + SEG.LEN (not the strict '<' of
 	 * RFC1323), see Stevens TCP/IP Illustrated Vol. 2 p.869, so the
 	 * RTT can still be computed when RCV.NXT == Last.ACK.Sent.
 	 * The test depends only on sequence space so that it does not
 	 * interfere with the PAWS check above.
 	 */
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 	if ((to->to_flags & TOF_TS) != 0 &&
 	    SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
 	    SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
 	    ((thflags & (TH_SYN | TH_FIN)) != 0))) {
 		tp->ts_recent = to->to_tsval;
 		tp->ts_recent_age = tcp_tv_to_mssectick(&bbr->rc_tv);
 	}
 
 	/*
 	 * No ACK bit: in the half-synchronized (NEEDSYN) case the data
 	 * is still processed; otherwise the segment is dropped, with an
 	 * ACK going out first when one is already owed.
 	 */
 	if ((thflags & TH_ACK) == 0) {
 		if ((tp->t_flags & TF_NEEDSYN) != 0)
 			return (bbr_process_data(m, th, so, tp, drop_hdrlen,
 			    tlen, tiwin, thflags, nxt_pkt));
 		if ((tp->t_flags & TF_ACKNOW) == 0) {
 			ctf_do_drop(m, NULL);
 			return (0);
 		}
 		ctf_do_dropafterack(m, tp, th, thflags, tlen, &rv);
 		bbr->r_wanted_output = 1;
 		return (rv);
 	}
 
 	/*
 	 * ACK processing.  The "our FIN was acked" result is collected
 	 * but not acted on in this state.
 	 */
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 	if (bbr_process_ack(m, th, so, tp, to, tiwin, tlen, &fin_acked,
 	    thflags, &rv))
 		return (rv);
 
 	/* Unsent data and no forward progress: reset the connection. */
 	if (sbavail(&so->so_snd) && ctf_progress_timeout_check(tp, true)) {
 		bbr_log_progress_event(bbr, tp, tick, PROGRESS_DROP, __LINE__);
 		ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
 		return (1);
 	}
 
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 	return (bbr_process_data(m, th, so, tp, drop_hdrlen, tlen,
 	    tiwin, thflags, nxt_pkt));
 }
 
 /*
  * Suspend the persist, retransmit, keep-alive and delayed-ACK timers
  * of a connection.  If the persist timer is active on entry, that is
  * remembered by setting rc_in_persist in the BBR control block.
  */
 static void
 bbr_stop_all_timers(struct tcpcb *tp)
 {
 	if (tcp_timer_active(tp, TT_PERSIST)) {
 		struct tcp_bbr *ctl = (struct tcp_bbr *)tp->t_fb_ptr;
 
 		/* We enter in persists, set the flag appropriately */
 		ctl->rc_in_persist = 1;
 	}
 
 	/* Assure no timers are running. */
 	tcp_timer_suspend(tp, TT_PERSIST);
 	tcp_timer_suspend(tp, TT_REXMT);
 	tcp_timer_suspend(tp, TT_KEEP);
 	tcp_timer_suspend(tp, TT_DELACK);
 }
 
 /*
  * Switch a connection to the "google" flavour of BBR: pacing is
  * always on, the discount and policer settings come from the global
  * tunables, the non-google extras (rack cheat, timer increase,
  * TCP/IP/ethernet overhead accounting) are turned off, and both
  * filters are reset to the fixed google windows before the TSO size
  * is re-evaluated.
  */
 static void
 bbr_google_mode_on(struct tcp_bbr *bbr)
 {
 	/* Non-google extras are all disabled. */
 	bbr->r_ctl.rc_inc_enet_oh = 0;
 	bbr->r_ctl.rc_inc_ip_oh = 0;
 	bbr->r_ctl.rc_inc_tcp_oh = 0;
 	bbr->r_ctl.rc_incr_tmrs = 0;
 	bbr->bbr_use_rack_cheat = 0;
 
 	/* Google mode proper. */
 	bbr->rc_no_pacing = 0;
 	bbr->rc_use_google = 1;
 	bbr->r_use_policer = bbr_policer_detection_enabled;
 	bbr->r_ctl.bbr_google_discount = bbr_google_discount;
 	bbr->r_ctl.rc_probertt_int = (USECS_IN_SECOND * 10);
 
 	/* Restart the delivery-rate and rtt-prop filters. */
 	reset_time(&bbr->r_ctl.rc_delrate, BBR_NUM_RTTS_FOR_GOOG_DEL_LIMIT);
 	reset_time_small(&bbr->r_ctl.rc_rttprop, (11 * USECS_IN_SECOND));
 
 	tcp_bbr_tso_size_check(bbr, tcp_get_usecs(&bbr->rc_tv));
 }
 
 /*
  * Switch a connection away from the "google" flavour of BBR: the
  * google discount and policer detection are cleared, and every
  * per-connection option is reloaded from its global tunable.  Both
  * filters are reset to the tunable-driven windows before the TSO
  * size is re-evaluated.
  */
 static void
 bbr_google_mode_off(struct tcp_bbr *bbr)
 {
 	/* Google-only settings. */
 	bbr->rc_use_google = 0;
 	bbr->r_use_policer = 0;
 	bbr->r_ctl.bbr_google_discount = 0;
 
 	/* Pacing is held off while a "no pacing until" value is set. */
 	bbr->no_pacing_until = bbr_no_pacing_until;
 	bbr->rc_no_pacing = bbr->no_pacing_until ? 1 : 0;
 
 	/* Each option simply mirrors whether its tunable is non-zero. */
 	bbr->bbr_use_rack_cheat = bbr_use_rack_resend_cheat ? 1 : 0;
 	bbr->r_ctl.rc_incr_tmrs = bbr_incr_timers ? 1 : 0;
 	bbr->r_ctl.rc_inc_tcp_oh = bbr_include_tcp_oh ? 1 : 0;
 	bbr->r_ctl.rc_inc_ip_oh = bbr_include_ip_oh ? 1 : 0;
 	bbr->r_ctl.rc_inc_enet_oh = bbr_include_enet_oh ? 1 : 0;
 
 	bbr->r_ctl.rc_probertt_int = bbr_rtt_probe_limit;
 
 	/* Restart the delivery-rate and rtt-prop filters. */
 	reset_time(&bbr->r_ctl.rc_delrate, bbr_num_pktepo_for_del_limit);
 	reset_time_small(&bbr->r_ctl.rc_rttprop,
 	    (bbr_filter_len_sec * USECS_IN_SECOND));
 
 	tcp_bbr_tso_size_check(bbr, tcp_get_usecs(&bbr->rc_tv));
 }
 /*
  * Allocate and initialize the BBR control block for a connection and
  * hang it off tp->t_fb_ptr.
  *
  * The control block is seeded from the global tunables.  If the
  * connection already has data outstanding (snd_una != snd_max) a
  * single send-map entry covering that range is created.  Finally all
  * stack timers are suspended and the hpts timer is started.
  *
  * Return 0 on success, non-zero on failure
  * which indicates the error (usually no memory).  On failure
  * tp->t_fb_ptr is NULL and everything this function applied to the
  * inpcb and to the flow counters has been backed out again.
  */
 static int
 bbr_init(struct tcpcb *tp)
 {
 	struct tcp_bbr *bbr = NULL;
 	struct inpcb *inp;
 	uint32_t cts;
 
 	tp->t_fb_ptr = uma_zalloc(bbr_pcb_zone, (M_NOWAIT | M_ZERO));
 	if (tp->t_fb_ptr == NULL) {
 		/*
 		 * We need to allocate memory but can't.  The INP and
 		 * INP_INFO locks are held and are recursive here (this
 		 * happens during setup), so a scheme that drops the
 		 * locks to wait for memory fails :(
 		 */
 		return (ENOMEM);
 	}
 	bbr = (struct tcp_bbr *)tp->t_fb_ptr;
 	bbr->rtt_valid = 0;
 	inp = tp->t_inpcb;
 	inp->inp_flags2 |= INP_CANNOT_DO_ECN;
 	inp->inp_flags2 |= INP_SUPPORTS_MBUFQ;
 	TAILQ_INIT(&bbr->r_ctl.rc_map);
 	TAILQ_INIT(&bbr->r_ctl.rc_free);
 	TAILQ_INIT(&bbr->r_ctl.rc_tmap);
 	bbr->rc_tp = tp;
 	/* inp was dereferenced just above, so it is known to be valid. */
 	bbr->rc_inp = inp;
 	cts = tcp_get_usecs(&bbr->rc_tv);
 	tp->t_acktime = 0;
 	bbr->rc_allow_data_af_clo = bbr_ignore_data_after_close;
 	bbr->r_ctl.rc_reorder_fade = bbr_reorder_fade;
 	bbr->rc_tlp_threshold = bbr_tlp_thresh;
 	bbr->r_ctl.rc_reorder_shift = bbr_reorder_thresh;
 	bbr->r_ctl.rc_pkt_delay = bbr_pkt_delay;
 	bbr->r_ctl.rc_min_to = bbr_min_to;
 	bbr->rc_bbr_state = BBR_STATE_STARTUP;
 	bbr->r_ctl.bbr_lost_at_state = 0;
 	bbr->r_ctl.rc_lost_at_startup = 0;
 	bbr->rc_all_timers_stopped = 0;
 	bbr->r_ctl.rc_bbr_lastbtlbw = 0;
 	bbr->r_ctl.rc_pkt_epoch_del = 0;
 	bbr->r_ctl.rc_pkt_epoch = 0;
 	bbr->r_ctl.rc_lowest_rtt = 0xffffffff;
 	bbr->r_ctl.rc_bbr_hptsi_gain = bbr_high_gain;
 	bbr->r_ctl.rc_bbr_cwnd_gain = bbr_high_gain;
 	/* Every "time of last ..." marker starts out at the current time. */
 	bbr->r_ctl.rc_went_idle_time = cts;
 	bbr->rc_pacer_started = cts;
 	bbr->r_ctl.rc_pkt_epoch_time = cts;
 	bbr->r_ctl.rc_rcvtime = cts;
 	bbr->r_ctl.rc_bbr_state_time = cts;
 	bbr->r_ctl.rc_del_time = cts;
 	bbr->r_ctl.rc_tlp_rxt_last_time = cts;
 	bbr->r_ctl.last_in_probertt = cts;
 	bbr->skip_gain = 0;
 	bbr->gain_is_limited = 0;
 	bbr->no_pacing_until = bbr_no_pacing_until;
 	if (bbr->no_pacing_until)
 		bbr->rc_no_pacing = 1;
 	if (bbr_use_google_algo) {
 		/* Google mode always paces. */
 		bbr->rc_no_pacing = 0;
 		bbr->rc_use_google = 1;
 		bbr->r_ctl.bbr_google_discount = bbr_google_discount;
 		bbr->r_use_policer = bbr_policer_detection_enabled;
 	} else {
 		bbr->rc_use_google = 0;
 		bbr->r_ctl.bbr_google_discount = 0;
 		bbr->r_use_policer = 0;
 	}
 	if (bbr_ts_limiting)
 		bbr->rc_use_ts_limit = 1;
 	else
 		bbr->rc_use_ts_limit = 0;
 	if (bbr_ts_can_raise)
 		bbr->ts_can_raise = 1;
 	else
 		bbr->ts_can_raise = 0;
 	/*
 	 * Map the delayed-ACK sysctl onto t_delayed_ack: 0 stays 0,
 	 * values 2..99 are used as given, and 1 or anything >= 100
 	 * becomes 2.
 	 */
 	if (V_tcp_delack_enabled == 1)
 		tp->t_delayed_ack = 2;
 	else if (V_tcp_delack_enabled == 0)
 		tp->t_delayed_ack = 0;
 	else if (V_tcp_delack_enabled < 100)
 		tp->t_delayed_ack = V_tcp_delack_enabled;
 	else
 		tp->t_delayed_ack = 2;
 	if (bbr->rc_use_google == 0)
 		bbr->r_ctl.rc_probertt_int = bbr_rtt_probe_limit;
 	else
 		bbr->r_ctl.rc_probertt_int = (USECS_IN_SECOND * 10);
 	bbr->r_ctl.rc_min_rto_ms = bbr_rto_min_ms;
 	bbr->rc_max_rto_sec = bbr_rto_max_sec;
 	bbr->rc_init_win = bbr_def_init_win;
 	if (tp->t_flags & TF_REQ_TSTMP)
 		bbr->rc_last_options = TCP_TS_OVERHEAD;
 	bbr->r_ctl.rc_pace_max_segs = tp->t_maxseg - bbr->rc_last_options;
 	bbr->r_ctl.rc_high_rwnd = tp->snd_wnd;
 	bbr->r_init_rtt = 1;
 
 	/*
 	 * The flow is counted as "no hardware pacing" from here on;
 	 * bbr_fini() takes it out of the counters again.
 	 */
 	counter_u64_add(bbr_flows_nohdwr_pacing, 1);
 	if (bbr_allow_hdwr_pacing)
 		bbr->bbr_hdw_pace_ena = 1;
 	else
 		bbr->bbr_hdw_pace_ena = 0;
 	if (bbr_sends_full_iwnd)
 		bbr->bbr_init_win_cheat = 1;
 	else
 		bbr->bbr_init_win_cheat = 0;
 	bbr->r_ctl.bbr_utter_max = bbr_hptsi_utter_max;
 	bbr->r_ctl.rc_drain_pg = bbr_drain_gain;
 	bbr->r_ctl.rc_startup_pg = bbr_high_gain;
 	bbr->rc_loss_exit = bbr_exit_startup_at_loss;
 	bbr->r_ctl.bbr_rttprobe_gain_val = bbr_rttprobe_gain;
 	bbr->r_ctl.bbr_hptsi_per_second = bbr_hptsi_per_second;
 	bbr->r_ctl.bbr_hptsi_segments_delay_tar = bbr_hptsi_segments_delay_tar;
 	bbr->r_ctl.bbr_hptsi_segments_max = bbr_hptsi_segments_max;
 	bbr->r_ctl.bbr_hptsi_segments_floor = bbr_hptsi_segments_floor;
 	bbr->r_ctl.bbr_hptsi_bytes_min = bbr_hptsi_bytes_min;
 	bbr->r_ctl.bbr_cross_over = bbr_cross_over;
 	bbr->r_ctl.rc_rtt_shrinks = cts;
 	/* Max filter for the delivery rate, min filter for rtt-prop. */
 	if (bbr->rc_use_google) {
 		setup_time_filter(&bbr->r_ctl.rc_delrate,
 				  FILTER_TYPE_MAX,
 				  BBR_NUM_RTTS_FOR_GOOG_DEL_LIMIT);
 		setup_time_filter_small(&bbr->r_ctl.rc_rttprop,
 					FILTER_TYPE_MIN, (11 * USECS_IN_SECOND));
 	} else {
 		setup_time_filter(&bbr->r_ctl.rc_delrate,
 				  FILTER_TYPE_MAX,
 				  bbr_num_pktepo_for_del_limit);
 		setup_time_filter_small(&bbr->r_ctl.rc_rttprop,
 					FILTER_TYPE_MIN, (bbr_filter_len_sec * USECS_IN_SECOND));
 	}
 	bbr_log_rtt_shrinks(bbr, cts, 0, 0, __LINE__, BBR_RTTS_INIT, 0);
 	if (bbr_uses_idle_restart)
 		bbr->rc_use_idle_restart = 1;
 	else
 		bbr->rc_use_idle_restart = 0;
 	bbr->r_ctl.rc_bbr_cur_del_rate = 0;
 	bbr->r_ctl.rc_initial_hptsi_bw = bbr_initial_bw_bps;
 	if (bbr_resends_use_tso)
 		bbr->rc_resends_use_tso = 1;
 #ifdef NETFLIX_PEAKRATE
 	tp->t_peakrate_thr = tp->t_maxpeakrate;
 #endif
 	if (tp->snd_una != tp->snd_max) {
 		/* Create a send map for the current outstanding data */
 		struct bbr_sendmap *rsm;
 
 		rsm = bbr_alloc(bbr);
 		if (rsm == NULL) {
 			/*
 			 * t_fb_ptr is cleared below, so bbr_fini() will
 			 * not undo anything for this attempt.  Back out
 			 * here what was applied above: the flow counter
 			 * and the inpcb flags.
 			 */
 			counter_u64_add(bbr_flows_nohdwr_pacing, -1);
 			inp->inp_flags2 &= ~INP_CANNOT_DO_ECN;
 			inp->inp_flags2 &= ~INP_SUPPORTS_MBUFQ;
 			uma_zfree(bbr_pcb_zone, tp->t_fb_ptr);
 			tp->t_fb_ptr = NULL;
 			return (ENOMEM);
 		}
 		/* This entry was not sent by us, so no RTT may be taken from it. */
 		rsm->r_rtt_not_allowed = 1;
 		rsm->r_tim_lastsent[0] = cts;
 		rsm->r_rtr_cnt = 1;
 		rsm->r_rtr_bytes = 0;
 		rsm->r_start = tp->snd_una;
 		rsm->r_end = tp->snd_max;
 		rsm->r_dupack = 0;
 		rsm->r_delivered = bbr->r_ctl.rc_delivered;
 		rsm->r_ts_valid = 0;
 		rsm->r_del_ack_ts = tp->ts_recent;
 		rsm->r_del_time = cts;
 		if (bbr->r_ctl.r_app_limited_until)
 			rsm->r_app_limited = 1;
 		else
 			rsm->r_app_limited = 0;
 		TAILQ_INSERT_TAIL(&bbr->r_ctl.rc_map, rsm, r_next);
 		TAILQ_INSERT_TAIL(&bbr->r_ctl.rc_tmap, rsm, r_tnext);
 		rsm->r_in_tmap = 1;
 		if (bbr->rc_bbr_state == BBR_STATE_PROBE_BW)
 			rsm->r_bbr_state = bbr_state_val(bbr);
 		else
 			rsm->r_bbr_state = 8;
 	}
 	/* These extras only apply when not in google mode. */
 	if (bbr_use_rack_resend_cheat && (bbr->rc_use_google == 0))
 		bbr->bbr_use_rack_cheat = 1;
 	if (bbr_incr_timers && (bbr->rc_use_google == 0))
 		bbr->r_ctl.rc_incr_tmrs = 1;
 	if (bbr_include_tcp_oh && (bbr->rc_use_google == 0))
 		bbr->r_ctl.rc_inc_tcp_oh = 1;
 	if (bbr_include_ip_oh && (bbr->rc_use_google == 0))
 		bbr->r_ctl.rc_inc_ip_oh = 1;
 	if (bbr_include_enet_oh && (bbr->rc_use_google == 0))
 		bbr->r_ctl.rc_inc_enet_oh = 1;
 
 	bbr_log_type_statechange(bbr, cts, __LINE__);
 	if (TCPS_HAVEESTABLISHED(tp->t_state) &&
 	    (tp->t_srtt)) {
 		uint32_t rtt;
 
 		/* Seed rtt-prop from the srtt the connection already has. */
 		rtt = (TICKS_2_USEC(tp->t_srtt) >> TCP_RTT_SHIFT);
 		apply_filter_min_small(&bbr->r_ctl.rc_rttprop, rtt, cts);
 	}
 	/* announce the settings and state */
 	bbr_log_settings_change(bbr, BBR_RECOVERY_LOWRTT);
 	tcp_bbr_tso_size_check(bbr, cts);
 	/*
 	 * Now call the generic function to start a timer. This will place
 	 * the TCB on the hptsi wheel if a timer is needed with appropriate
 	 * flags.
 	 */
 	bbr_stop_all_timers(tp);
 	bbr_start_hpts_timer(bbr, tp, cts, 5, 0, 0);
 	return (0);
 }
 
 /*
  * Decide whether BBR can take over a connection.
  *
  * Returns 0 when the connection can be accepted, EAGAIN when the
  * handshake has not completed yet (try again once the connection is
  * up), and EINVAL when BBR can never handle it: a FIN has already
  * been sent, or SACK was not negotiated and is required.
  */
 static int
 bbr_handoff_ok(struct tcpcb *tp)
 {
 	switch (tp->t_state) {
 	case TCPS_CLOSED:
 	case TCPS_LISTEN:
 		/* Sure no problem though it may not stick */
 		return (0);
 	case TCPS_SYN_SENT:
 	case TCPS_SYN_RECEIVED:
 		/* Cannot tell until the connection reaches ESTAB or beyond. */
 		return (EAGAIN);
 	default:
 		break;
 	}
 
 	if ((tp->t_flags & TF_SENTFIN) != 0)
 		return (EINVAL);
 
 	/* Without SACK (and with SACK required) BBR cannot run. */
 	if ((tp->t_flags & TF_SACK_PERMIT) == 0 && !bbr_sack_not_required)
 		return (EINVAL);
 
 	return (0);
 }
 
 /*
  * Tear down the BBR control block attached to a connection.
  *
  * If tp->t_fb_ptr is set: the flow end is logged, the inpcb flags
  * BBR applied are cleared, the flow is taken out of the hardware /
  * non-hardware pacing counters, the hardware pacing rate (if any) is
  * released, all send-map entries on the map and free lists are
  * returned to their zone, the rwnd statistics are updated and the
  * control block itself is freed.  In all cases snd_nxt is set to
  * snd_max on the way out.
  *
  * tcb_is_purged is not used by this implementation.
  */
 static void
 bbr_fini(struct tcpcb *tp, int32_t tcb_is_purged)
 {
 	if (tp->t_fb_ptr) {
 		uint32_t calc;
 		struct tcp_bbr *bbr;
 		struct bbr_sendmap *rsm;
 
 		bbr = (struct tcp_bbr *)tp->t_fb_ptr;
 		bbr_log_flowend(bbr);
 		bbr->rc_tp = NULL;
 		if (tp->t_inpcb) {
 			/* Backout any flags2 we applied */
 			tp->t_inpcb->inp_flags2 &= ~INP_CANNOT_DO_ECN;
 			tp->t_inpcb->inp_flags2 &= ~INP_SUPPORTS_MBUFQ;
 			tp->t_inpcb->inp_flags2 &= ~INP_MBUF_QUEUE_READY;
 		}
 		if (bbr->bbr_hdrw_pacing)
 			counter_u64_add(bbr_flows_whdwr_pacing, -1);
 		else
 			counter_u64_add(bbr_flows_nohdwr_pacing, -1);
 		/*
 		 * Release the pacing rate exactly once and forget the
 		 * pointer; releasing the same entry a second time would
 		 * give back a reference this flow does not hold.
 		 */
 		if (bbr->r_ctl.crte != NULL) {
 			tcp_rel_pacing_rate(bbr->r_ctl.crte, tp);
 			bbr->r_ctl.crte = NULL;
 		}
 		/* Hand every in-use send-map entry back to the zone. */
 		while ((rsm = TAILQ_FIRST(&bbr->r_ctl.rc_map)) != NULL) {
 			TAILQ_REMOVE(&bbr->r_ctl.rc_map, rsm, r_next);
 			uma_zfree(bbr_zone, rsm);
 		}
 		/* Likewise for the entries cached on the free list. */
 		while ((rsm = TAILQ_FIRST(&bbr->r_ctl.rc_free)) != NULL) {
 			TAILQ_REMOVE(&bbr->r_ctl.rc_free, rsm, r_next);
 			uma_zfree(bbr_zone, rsm);
 		}
 		/*
 		 * Count the flow as "dynamic rwnd" when the highest
 		 * window seen exceeds the initial one by more than 10%,
 		 * otherwise as "static rwnd".
 		 */
 		calc = bbr->r_ctl.rc_high_rwnd - bbr->r_ctl.rc_init_rwnd;
 		if (calc > (bbr->r_ctl.rc_init_rwnd / 10))
 			BBR_STAT_INC(bbr_dynamic_rwnd);
 		else
 			BBR_STAT_INC(bbr_static_rwnd);
 		bbr->r_ctl.rc_free_cnt = 0;
 		uma_zfree(bbr_pcb_zone, tp->t_fb_ptr);
 		tp->t_fb_ptr = NULL;
 	}
 	/* Make sure snd_nxt is correctly set */
 	tp->snd_nxt = tp->snd_max;
 }
 
 /*
  * Mirror the connection's TCP state into the BBR control block and
  * select the matching per-state segment handler.  On entry to
  * ESTABLISHED the initial receive window is also recorded as the
  * larger of win and the current send window.  LISTEN, CLOSED,
  * TIME_WAIT and any unknown state leave the control block untouched.
  */
 static void
 bbr_set_state(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t win)
 {
 	const int state = tp->t_state;
 
 	switch (state) {
 	case TCPS_SYN_SENT:
 		bbr->r_substate = bbr_do_syn_sent;
 		break;
 	case TCPS_SYN_RECEIVED:
 		bbr->r_substate = bbr_do_syn_recv;
 		break;
 	case TCPS_ESTABLISHED:
 		bbr->r_ctl.rc_init_rwnd = max(win, bbr->rc_tp->snd_wnd);
 		bbr->r_substate = bbr_do_established;
 		break;
 	case TCPS_CLOSE_WAIT:
 		bbr->r_substate = bbr_do_close_wait;
 		break;
 	case TCPS_FIN_WAIT_1:
 		bbr->r_substate = bbr_do_fin_wait_1;
 		break;
 	case TCPS_CLOSING:
 		bbr->r_substate = bbr_do_closing;
 		break;
 	case TCPS_LAST_ACK:
 		bbr->r_substate = bbr_do_lastack;
 		break;
 	case TCPS_FIN_WAIT_2:
 		bbr->r_substate = bbr_do_fin_wait_2;
 		break;
 	default:
 		/* No handler for this state: nothing is changed. */
 		return;
 	}
 	/* Only reached for the states handled above. */
 	bbr->r_state = state;
 }
 
 /*
  * Advance the connection to the next sub-state (rc_bbr_substate is
  * incremented and wraps to 0 at BBR_SUBSTATE_COUNT) and set up
  * everything that depends on it: the pacing gain, the "at flight"
  * timestamp, the state target, and the time accounting for the
  * sub-state being left.
  *
  * bbr   - the BBR control block.
  * cts   - current timestamp; it is compared against and stored into
  *         the rc_bbr_state_time / gain_epoch fields (presumably
  *         microseconds, as produced by tcp_get_usecs() -- confirm
  *         against callers).
  * line  - caller's line number, only passed on to the state-change log.
  * dolog - non-zero to emit the state-change log entry.
  */
 static void
 bbr_substate_change(struct tcp_bbr *bbr, uint32_t cts, int32_t line, int dolog)
 {
 	/*
 	 * Now what state are we going into now? Is there adjustments
 	 * needed?
 	 */
 	int32_t old_state;
 
 	old_state = bbr_state_val(bbr);
 	if (bbr_state_val(bbr) == BBR_SUB_LEVEL1) {
 		/* Save the lowest srtt we saw in our end of the sub-state */
 		bbr->rc_hit_state_1 = 0;
 		if (bbr->r_ctl.bbr_smallest_srtt_this_state != 0xffffffff)
 			bbr->r_ctl.bbr_smallest_srtt_state2 = bbr->r_ctl.bbr_smallest_srtt_this_state;
 	}
 	bbr->rc_bbr_substate++;
 	if (bbr->rc_bbr_substate >= BBR_SUBSTATE_COUNT) {
 		/* Cycle back to first state-> gain */
 		bbr->rc_bbr_substate = 0;
 	}
 	if (bbr_state_val(bbr) == BBR_SUB_GAIN) {
 		/*
 		 * We enter the gain(5/4) cycle (possibly less if
 		 * shallow buffer detection is enabled)
 		 */
 		if (bbr->skip_gain) {
 			/*
 			 * Hardware pacing has set our rate to
 			 * the max and limited our b/w just
 			 * do level i.e. no gain.
 			 */
 			bbr->r_ctl.rc_bbr_hptsi_gain = bbr_hptsi_gain[BBR_SUB_LEVEL1];
 		} else if (bbr->gain_is_limited &&
 			   bbr->bbr_hdrw_pacing &&
 			   bbr->r_ctl.crte) {
 			/*
 			 * We can't gain above the hardware pacing
 			 * rate which is less than our rate + the gain
 			 * calculate the gain needed to reach the hardware
 			 * pacing rate..
 			 */
 			uint64_t bw, rate, gain_calc;
 
 			bw = bbr_get_bw(bbr);
 			rate = bbr->r_ctl.crte->rate;
 			/*
 			 * Only scale the gain down when the hardware rate
 			 * sits between our b/w and our b/w times the
 			 * normal gain; the result never drops below
 			 * BBR_UNIT.
 			 */
 			if ((rate > bw) &&
 			    (((bw *  (uint64_t)bbr_hptsi_gain[BBR_SUB_GAIN]) / (uint64_t)BBR_UNIT) > rate)) {
 				gain_calc = (rate * BBR_UNIT) / bw;
 				if (gain_calc < BBR_UNIT)
 					gain_calc = BBR_UNIT;
 				bbr->r_ctl.rc_bbr_hptsi_gain = (uint16_t)gain_calc;
 			} else {
 				bbr->r_ctl.rc_bbr_hptsi_gain = bbr_hptsi_gain[BBR_SUB_GAIN];
 			}
 		} else
 			bbr->r_ctl.rc_bbr_hptsi_gain = bbr_hptsi_gain[BBR_SUB_GAIN];
 		/*
 		 * A zero "at flight" time means the target has not been
 		 * reached yet (see bbr_set_probebw_gains()).
 		 */
 		if ((bbr->rc_use_google == 0) && (bbr_gain_to_target == 0)) {
 			bbr->r_ctl.rc_bbr_state_atflight = cts;
 		} else
 			bbr->r_ctl.rc_bbr_state_atflight = 0;
 	} else if (bbr_state_val(bbr) == BBR_SUB_DRAIN) {
 		bbr->rc_hit_state_1 = 1;
 		bbr->r_ctl.rc_exta_time_gd = 0;
 		/* Remember how much was in flight when the drain began. */
 		bbr->r_ctl.flightsize_at_drain = ctf_flight_size(bbr->rc_tp,
 						     (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes));
 		if (bbr_state_drain_2_tar) {
 			bbr->r_ctl.rc_bbr_state_atflight = 0;
 		} else
 			bbr->r_ctl.rc_bbr_state_atflight = cts;
 		bbr->r_ctl.rc_bbr_hptsi_gain = bbr_hptsi_gain[BBR_SUB_DRAIN];
 	} else {
 		/* All other cycles hit here 2-7 */
 		if ((old_state == BBR_SUB_DRAIN) && bbr->rc_hit_state_1) {
 			/* Leaving drain: put back the cwnd saved on entry to it. */
 			if (bbr_sub_drain_slam_cwnd &&
 			    (bbr->rc_use_google == 0) &&
 			    (bbr->rc_tp->snd_cwnd < bbr->r_ctl.rc_saved_cwnd)) {
 				bbr->rc_tp->snd_cwnd = bbr->r_ctl.rc_saved_cwnd;
 				bbr_log_type_cwndupd(bbr, 0, 0, 0, 12, 0, 0, __LINE__);
 			}
 			/*
 			 * Accumulate how much longer than one rtt-prop
 			 * the drain took; if it did not run over, the
 			 * accumulated extra time is cleared.
 			 */
 			if ((cts - bbr->r_ctl.rc_bbr_state_time) > bbr_get_rtt(bbr, BBR_RTT_PROP))
 				bbr->r_ctl.rc_exta_time_gd += ((cts - bbr->r_ctl.rc_bbr_state_time) -
 							       bbr_get_rtt(bbr, BBR_RTT_PROP));
 			else
 				bbr->r_ctl.rc_exta_time_gd = 0;
 			if (bbr->r_ctl.rc_exta_time_gd) {
 				bbr->r_ctl.rc_level_state_extra = bbr->r_ctl.rc_exta_time_gd;
 				/* Now chop up the time for each state (div by 7) */
 				bbr->r_ctl.rc_level_state_extra /= 7;
 				if (bbr_rand_ot && bbr->r_ctl.rc_level_state_extra) {
 					/* Add a randomization */
 					bbr_randomize_extra_state_time(bbr);
 				}
 			}
 		}
 		/* Never store 0 here: 0 means "target not reached yet". */
 		bbr->r_ctl.rc_bbr_state_atflight = max(1, cts);
 		bbr->r_ctl.rc_bbr_hptsi_gain = bbr_hptsi_gain[bbr_state_val(bbr)];
 	}
 	if (bbr->rc_use_google) {
 		/* Google mode always has a non-zero "at flight" time. */
 		bbr->r_ctl.rc_bbr_state_atflight = max(1, cts);
 	}
 	bbr->r_ctl.bbr_lost_at_state = bbr->r_ctl.rc_lost;
 	bbr->r_ctl.rc_bbr_cwnd_gain = bbr_cwnd_gain;
 	if (dolog)
 		bbr_log_type_statechange(bbr, cts, line);
 
 	/* Charge the time spent in the state we are leaving. */
 	if (SEQ_GT(cts, bbr->r_ctl.rc_bbr_state_time)) {
 		uint32_t time_in;
 
 		time_in = cts - bbr->r_ctl.rc_bbr_state_time;
 		if (bbr->rc_bbr_state == BBR_STATE_PROBE_BW) {
 			/*
 			 * NOTE(review): the "+ 5" presumably skips the
 			 * slots used by the main BBR states so that the
 			 * probe-bw sub-states follow them -- confirm
 			 * against the bbr_state_time[] layout.
 			 */
 			counter_u64_add(bbr_state_time[(old_state + 5)], time_in);
 		} else {
 			counter_u64_add(bbr_state_time[bbr->rc_bbr_state], time_in);
 		}
 	}
 	bbr->r_ctl.bbr_smallest_srtt_this_state = 0xffffffff;
 	bbr_set_state_target(bbr, __LINE__);
 	if (bbr_sub_drain_slam_cwnd &&
 	    (bbr->rc_use_google == 0) &&
 	    (bbr_state_val(bbr) == BBR_SUB_DRAIN)) {
 		/* Slam down the cwnd */
 		bbr->r_ctl.rc_saved_cwnd = bbr->rc_tp->snd_cwnd;
 		bbr->rc_tp->snd_cwnd = bbr->r_ctl.rc_target_at_state;
 		if (bbr_sub_drain_app_limit) {
 			/* Go app limited if we are on a long drain */
 			bbr->r_ctl.r_app_limited_until = (bbr->r_ctl.rc_delivered +
 							  ctf_flight_size(bbr->rc_tp,
 							      (bbr->r_ctl.rc_sacked +
 							       bbr->r_ctl.rc_lost_bytes)));
 		}
 		bbr_log_type_cwndupd(bbr, 0, 0, 0, 12, 0, 0, __LINE__);
 	}
 	if (bbr->rc_lt_use_bw) {
 		/* In policed mode we clamp pacing_gain to BBR_UNIT */
 		bbr->r_ctl.rc_bbr_hptsi_gain = BBR_UNIT;
 	}
 	/* Google changes TSO size every cycle */
 	if (bbr->rc_use_google)
 		tcp_bbr_tso_size_check(bbr, cts);
 	/* The new sub-state starts now. */
 	bbr->r_ctl.gain_epoch = cts;
 	bbr->r_ctl.rc_bbr_state_time = cts;
 	bbr->r_ctl.substate_pe = bbr->r_ctl.rc_pkt_epoch;
 }
 
 /*
  * Google-mode decision on whether to leave the current probe-bw
  * sub-state.
  *
  * A drain may end early once the flight size seen at input is at or
  * below the state target (when google_allow_early_out is 1).
  * Otherwise at least one rtt-prop must have passed since the state
  * was entered, and the gain sub-state additionally stays put until
  * the target flight is reached, unless losses were seen and
  * google_consider_lost is set.
  */
 static void
 bbr_set_probebw_google_gains(struct tcp_bbr *bbr, uint32_t cts, uint32_t losses)
 {
 	int drained_early;
 
 	/* Have we reached our target flight size, possibly early? */
 	drained_early = (bbr_state_val(bbr) == BBR_SUB_DRAIN) &&
 	    (google_allow_early_out == 1) &&
 	    (bbr->r_ctl.rc_flight_at_input <= bbr->r_ctl.rc_target_at_state);
 
 	if (!drained_early) {
 		if (TSTMP_LT(cts, bbr->r_ctl.rc_bbr_state_time))
 			return;
 		/* A full rttProp must pass before states may change. */
 		if ((cts - bbr->r_ctl.rc_bbr_state_time) <
 		    bbr_get_rtt(bbr, BBR_RTT_PROP))
 			return;
 		/*
 		 * Enough time has passed.  In the gain cycle we still
 		 * stay until the target is reached (gain-to-target),
 		 * unless a loss that we are told to consider ends it.
 		 */
 		if ((bbr_state_val(bbr) == BBR_SUB_GAIN) &&
 		    !(google_consider_lost && losses) &&
 		    (bbr->r_ctl.rc_target_at_state >
 		    bbr->r_ctl.rc_flight_at_input))
 			return;
 	}
 	/* For gain we must reach our target, all others last 1 rttProp */
 	bbr_substate_change(bbr, cts, __LINE__, 1);
 }
 
 /*
  * Decide whether the connection should leave its current probe-bw
  * sub-state, and do the per-call bookkeeping while it stays.
  *
  * Google mode is delegated to bbr_set_probebw_google_gains().
  * Otherwise the function works in two phases keyed on
  * rc_bbr_state_atflight:
  *  - while it is 0 the state target has not been reached; the drain
  *    and non-drain cases each check for the target and, once it is
  *    hit, stamp rc_bbr_state_atflight.  This phase returns without
  *    changing state, except that a loss outside of drain jumps
  *    straight to the timing checks below;
  *  - once it is non-zero, the sub-state is changed after a full
  *    cycle time (plus any extra level-state time) has elapsed since
  *    rc_bbr_state_time.
  *
  * bbr    - the BBR control block.
  * cts    - current timestamp (0 is remapped to 1).
  * losses - only consumed by the google-mode path.
  */
 static void
 bbr_set_probebw_gains(struct tcp_bbr *bbr, uint32_t cts, uint32_t losses)
 {
 	uint32_t flight, bbr_cur_cycle_time;
 
 	if (bbr->rc_use_google) {
 		bbr_set_probebw_google_gains(bbr, cts, losses);
 		return;
 	}
 	if (cts == 0) {
 		/*
 		 * Never allow cts to be 0; we
 		 * do this so we can judge if
 		 * we have set a timestamp.
 		 */
 		cts = 1;
 	}
 	/* Length of one cycle: packet-epoch rtt or rtt-prop, per tunable. */
 	if (bbr_state_is_pkt_epoch)
 		bbr_cur_cycle_time = bbr_get_rtt(bbr, BBR_RTT_PKTRTT);
 	else
 		bbr_cur_cycle_time = bbr_get_rtt(bbr, BBR_RTT_PROP);
 
 	if (bbr->r_ctl.rc_bbr_state_atflight == 0) {
 		if (bbr_state_val(bbr) == BBR_SUB_DRAIN) {
 			flight = ctf_flight_size(bbr->rc_tp,
 				     (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes));
 			if (bbr_sub_drain_slam_cwnd && bbr->rc_hit_state_1) {
 				/* Keep it slam down */
 				if (bbr->rc_tp->snd_cwnd > bbr->r_ctl.rc_target_at_state) {
 					bbr->rc_tp->snd_cwnd = bbr->r_ctl.rc_target_at_state;
 					bbr_log_type_cwndupd(bbr, 0, 0, 0, 12, 0, 0, __LINE__);
 				}
 				if (bbr_sub_drain_app_limit) {
 					/* Go app limited if we are on a long drain */
 					bbr->r_ctl.r_app_limited_until = (bbr->r_ctl.rc_delivered + flight);
 				}
 			}
 			if (TSTMP_GT(cts, bbr->r_ctl.gain_epoch) &&
 			    (((cts - bbr->r_ctl.gain_epoch) > bbr_get_rtt(bbr, BBR_RTT_PROP)) ||
 			     (flight >= bbr->r_ctl.flightsize_at_drain))) {
 				/*
 				 * Still here after the same time as
 				 * the gain. We need to drain harder
 				 * for the next srtt. Reduce the gain by
 				 * a set amount; the reduced gain is not
 				 * allowed below the drain floor (at
 				 * least 1).
 				 */
 				bbr->r_ctl.flightsize_at_drain = flight;
 				if (bbr_drain_drop_mul &&
 				    bbr_drain_drop_div &&
 				    (bbr_drain_drop_mul < bbr_drain_drop_div)) {
 					/* Use your specific drop value (def 4/5 = 20%) */
 					bbr->r_ctl.rc_bbr_hptsi_gain *= bbr_drain_drop_mul;
 					bbr->r_ctl.rc_bbr_hptsi_gain /= bbr_drain_drop_div;
 				} else {
 					/* You get drop of 20% */
 					bbr->r_ctl.rc_bbr_hptsi_gain *= 4;
 					bbr->r_ctl.rc_bbr_hptsi_gain /= 5;
 				}
 				if (bbr->r_ctl.rc_bbr_hptsi_gain <= bbr_drain_floor) {
 					/* Reduce our gain again to the bottom  */
 					bbr->r_ctl.rc_bbr_hptsi_gain = max(bbr_drain_floor, 1);
 				}
 				bbr_log_exit_gain(bbr, cts, 4);
 				/*
 				 * Extend out so we wait another
 				 * epoch before dropping again.
 				 */
 				bbr->r_ctl.gain_epoch = cts;
 			}
 			if (flight <= bbr->r_ctl.rc_target_at_state) {
 				/* Drained to target: restore the saved cwnd. */
 				if (bbr_sub_drain_slam_cwnd &&
 				    (bbr->rc_use_google == 0) &&
 				    (bbr->rc_tp->snd_cwnd < bbr->r_ctl.rc_saved_cwnd)) {
 					bbr->rc_tp->snd_cwnd = bbr->r_ctl.rc_saved_cwnd;
 					bbr_log_type_cwndupd(bbr, 0, 0, 0, 12, 0, 0, __LINE__);
 				}
 				bbr->r_ctl.rc_bbr_state_atflight = max(cts, 1);
 				bbr_log_exit_gain(bbr, cts, 3);
 			}
 		} else {
 			/* It's a gain */
 			if (bbr->r_ctl.rc_lost > bbr->r_ctl.bbr_lost_at_state) {
 				/* Loss since entering the state ends the wait. */
 				bbr->r_ctl.rc_bbr_state_atflight = max(cts, 1);
 				goto change_state;
 			}
 			/*
 			 * Target reached, or the outstanding data is
 			 * within one segment of the peer's window.
 			 */
 			if ((ctf_outstanding(bbr->rc_tp) >= bbr->r_ctl.rc_target_at_state) ||
 			    ((ctf_outstanding(bbr->rc_tp) +  bbr->rc_tp->t_maxseg - 1) >=
 			     bbr->rc_tp->snd_wnd)) {
 				bbr->r_ctl.rc_bbr_state_atflight = max(cts, 1);
 				bbr_log_exit_gain(bbr, cts, 2);
 			}
 		}
 		/*
 		 * We fall through and always return; one of two things
 		 * has occurred:
 		 * 1) We are still not at target
 		 *    <or>
 		 * 2) We reached the target and set rc_bbr_state_atflight
 		 *    which means we no longer hit this block
 		 *    next time we are called.
 		 */
 		return;
 	}
 change_state:
 	if (TSTMP_LT(cts, bbr->r_ctl.rc_bbr_state_time))
 		return;
 	if ((cts - bbr->r_ctl.rc_bbr_state_time) < bbr_cur_cycle_time) {
 		/* Less than a full time-period has passed */
 		return;
 	}
 	/* Sub-states after drain also wait out the extra level time. */
 	if (bbr->r_ctl.rc_level_state_extra &&
 	    (bbr_state_val(bbr) > BBR_SUB_DRAIN) &&
 	    ((cts - bbr->r_ctl.rc_bbr_state_time) <
 	     (bbr_cur_cycle_time + bbr->r_ctl.rc_level_state_extra))) {
 		/* Less than a full time-period + extra has passed */
 		return;
 	}
 	/* Optionally the gain sub-state gets the same extra time. */
 	if (bbr_gain_gets_extra_too &&
 	    bbr->r_ctl.rc_level_state_extra &&
 	    (bbr_state_val(bbr) == BBR_SUB_GAIN) &&
 	    ((cts - bbr->r_ctl.rc_bbr_state_time) <
 	     (bbr_cur_cycle_time + bbr->r_ctl.rc_level_state_extra))) {
 		/* Less than a full time-period + extra has passed */
 		return;
 	}
 	bbr_substate_change(bbr, cts, __LINE__, 1);
 }
 
 static uint32_t
 bbr_get_a_state_target(struct tcp_bbr *bbr, uint32_t gain)
 {
 	uint32_t mss, tar;
 
 	if (bbr->rc_use_google) {
 		/* Google just uses the cwnd target */
 		tar = bbr_get_target_cwnd(bbr, bbr_get_bw(bbr), gain);
 	} else {
 		mss = min((bbr->rc_tp->t_maxseg - bbr->rc_last_options),
 			  bbr->r_ctl.rc_pace_max_segs);
 		/* Get the base cwnd with gain rounded to a mss */
 		tar = roundup(bbr_get_raw_target_cwnd(bbr, bbr_get_bw(bbr),
 						      gain), mss);
 		/* Make sure it is within our min */
 		if (tar < get_min_cwnd(bbr))
 			return (get_min_cwnd(bbr));
 	}
 	return (tar);
 }
 
 static void
 bbr_set_state_target(struct tcp_bbr *bbr, int line)
 {
 	uint32_t tar, meth;
 
 	if ((bbr->rc_bbr_state == BBR_STATE_PROBE_RTT) &&
 	    ((bbr->r_ctl.bbr_rttprobe_gain_val == 0) || bbr->rc_use_google)) {
 		/* Special case using old probe-rtt method */
 		tar = bbr_rtt_probe_cwndtarg * (bbr->rc_tp->t_maxseg - bbr->rc_last_options);
 		meth = 1;
 	} else {
 		/* Non-probe-rtt case and reduced probe-rtt  */
 		if ((bbr->rc_bbr_state == BBR_STATE_PROBE_BW) &&
 		    (bbr->r_ctl.rc_bbr_hptsi_gain > BBR_UNIT)) {
 			/* For gain cycle we use the hptsi gain */
 			tar = bbr_get_a_state_target(bbr, bbr->r_ctl.rc_bbr_hptsi_gain);
 			meth = 2;
 		} else if ((bbr_target_is_bbunit) || bbr->rc_use_google) {
 			/*
 			 * If configured, or for google all other states
 			 * get BBR_UNIT.
 			 */
 			tar = bbr_get_a_state_target(bbr, BBR_UNIT);
 			meth = 3;
 		} else {
 			/*
 			 * Or we set a target based on the pacing gain
 			 * for non-google mode and default (non-configured).
 			 * Note we don't set a target goal below drain (192).
 			 */
 			if (bbr->r_ctl.rc_bbr_hptsi_gain < bbr_hptsi_gain[BBR_SUB_DRAIN])  {
 				tar = bbr_get_a_state_target(bbr, bbr_hptsi_gain[BBR_SUB_DRAIN]);
 				meth = 4;
 			} else {
 				tar = bbr_get_a_state_target(bbr, bbr->r_ctl.rc_bbr_hptsi_gain);
 				meth = 5;
 			}
 		}
 	}
 	bbr_log_set_of_state_target(bbr, tar, line, meth);
 	bbr->r_ctl.rc_target_at_state = tar;
 }
 
 static void
 bbr_enter_probe_rtt(struct tcp_bbr *bbr, uint32_t cts, int32_t line)
 {
 	/* Change to probe_rtt */
 	uint32_t time_in;
 
 	bbr->r_ctl.bbr_lost_at_state = bbr->r_ctl.rc_lost;
 	bbr->r_ctl.flightsize_at_drain = ctf_flight_size(bbr->rc_tp,
 					     (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes));
 	bbr->r_ctl.r_app_limited_until = (bbr->r_ctl.flightsize_at_drain
 					  + bbr->r_ctl.rc_delivered);
 	/* Setup so we force feed the filter */
 	if (bbr->rc_use_google || bbr_probertt_sets_rtt)
 		bbr->rc_prtt_set_ts = 1;
 	if (SEQ_GT(cts, bbr->r_ctl.rc_bbr_state_time)) {
 		time_in = cts - bbr->r_ctl.rc_bbr_state_time;
 		counter_u64_add(bbr_state_time[bbr->rc_bbr_state], time_in);
 	}
 	bbr_log_rtt_shrinks(bbr, cts, 0, 0, __LINE__, BBR_RTTS_ENTERPROBE, 0);
 	bbr->r_ctl.rc_rtt_shrinks = cts;
 	bbr->r_ctl.last_in_probertt = cts;
 	bbr->r_ctl.rc_probertt_srttchktim = cts;
 	bbr->r_ctl.rc_bbr_state_time = cts;
 	bbr->rc_bbr_state = BBR_STATE_PROBE_RTT;
 	/* We need to force the filter to update */
 
 	if ((bbr_sub_drain_slam_cwnd) &&
 	    bbr->rc_hit_state_1 &&
 	    (bbr->rc_use_google == 0) &&
 	    (bbr_state_val(bbr) == BBR_SUB_DRAIN)) {
 		if (bbr->rc_tp->snd_cwnd > bbr->r_ctl.rc_saved_cwnd)
 			bbr->r_ctl.rc_saved_cwnd = bbr->rc_tp->snd_cwnd;
 	} else
 		bbr->r_ctl.rc_saved_cwnd = bbr->rc_tp->snd_cwnd;
 	/* Update the lost */
 	bbr->r_ctl.rc_lost_at_startup = bbr->r_ctl.rc_lost;
 	if ((bbr->r_ctl.bbr_rttprobe_gain_val == 0) || bbr->rc_use_google){
 		/* Set to the non-configurable default of 4 (PROBE_RTT_MIN)  */
 		bbr->rc_tp->snd_cwnd = bbr_rtt_probe_cwndtarg * (bbr->rc_tp->t_maxseg - bbr->rc_last_options);
 		bbr_log_type_cwndupd(bbr, 0, 0, 0, 12, 0, 0, __LINE__);
 		bbr->r_ctl.rc_bbr_hptsi_gain = BBR_UNIT;
 		bbr->r_ctl.rc_bbr_cwnd_gain = BBR_UNIT;
 		bbr_log_set_of_state_target(bbr, bbr->rc_tp->snd_cwnd, __LINE__, 6);
 		bbr->r_ctl.rc_target_at_state = bbr->rc_tp->snd_cwnd;
 	} else {
 		/*
 		 * We bring it down slowly by using a hptsi gain that is
 		 * probably 75%. This will slowly float down our outstanding
 		 * without tampering with the cwnd.
 		 */
 		bbr->r_ctl.rc_bbr_hptsi_gain = bbr->r_ctl.bbr_rttprobe_gain_val;
 		bbr->r_ctl.rc_bbr_cwnd_gain = BBR_UNIT;
 		bbr_set_state_target(bbr, __LINE__);
 		if (bbr_prtt_slam_cwnd &&
 		    (bbr->rc_tp->snd_cwnd > bbr->r_ctl.rc_target_at_state)) {
 			bbr->rc_tp->snd_cwnd = bbr->r_ctl.rc_target_at_state;
 			bbr_log_type_cwndupd(bbr, 0, 0, 0, 12, 0, 0, __LINE__);
 		}
 	}
 	if (ctf_flight_size(bbr->rc_tp,
 		(bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes)) <=
 	    bbr->r_ctl.rc_target_at_state) {
 		/* We are at target */
 		bbr->r_ctl.rc_bbr_enters_probertt = cts;
 	} else {
 		/* We need to come down to reach target before our time begins */
 		bbr->r_ctl.rc_bbr_enters_probertt = 0;
 	}
 	bbr->r_ctl.rc_pe_of_prtt = bbr->r_ctl.rc_pkt_epoch;
 	BBR_STAT_INC(bbr_enter_probertt);
 	bbr_log_exit_gain(bbr, cts, 0);
 	bbr_log_type_statechange(bbr, cts, line);
 }
 
 static void
 bbr_check_probe_rtt_limits(struct tcp_bbr *bbr, uint32_t cts)
 {
 	/*
 	 * Sanity check on probe-rtt intervals.
 	 * In crazy situations where we are competing
 	 * against new-reno flows with huge buffers
 	 * our rtt-prop interval could come to dominate
 	 * things if we can't get through a full set
 	 * of cycles, we need to adjust it.
 	 */
 	if (bbr_can_adjust_probertt &&
 	    (bbr->rc_use_google == 0)) {
 		uint16_t val = 0;
 		uint32_t cur_rttp, fval, newval, baseval;
 
 		/* Are we to small and go into probe-rtt to often? */
 		baseval = (bbr_get_rtt(bbr, BBR_RTT_PROP) * (BBR_SUBSTATE_COUNT + 1));
 		cur_rttp = roundup(baseval, USECS_IN_SECOND);
 		fval = bbr_filter_len_sec * USECS_IN_SECOND;
 		if (bbr_is_ratio == 0) {
 			if (fval > bbr_rtt_probe_limit)
 				newval = cur_rttp + (fval - bbr_rtt_probe_limit);
 			else
 				newval = cur_rttp;
 		} else {
 			int mul;
 
 			mul = fval / bbr_rtt_probe_limit;
 			newval = cur_rttp * mul;
 		}
 		if (cur_rttp > 	bbr->r_ctl.rc_probertt_int) {
 			bbr->r_ctl.rc_probertt_int = cur_rttp;
 			reset_time_small(&bbr->r_ctl.rc_rttprop, newval);
 			val = 1;
 		} else {
 			/*
 			 * No adjustments were made
 			 * do we need to shrink it?
 			 */
 			if (bbr->r_ctl.rc_probertt_int > bbr_rtt_probe_limit) {
 				if (cur_rttp <= bbr_rtt_probe_limit) {
 					/*
 					 * Things have calmed down lets
 					 * shrink all the way to default
 					 */
 					bbr->r_ctl.rc_probertt_int = bbr_rtt_probe_limit;
 					reset_time_small(&bbr->r_ctl.rc_rttprop,
 							 (bbr_filter_len_sec * USECS_IN_SECOND));
 					cur_rttp = bbr_rtt_probe_limit;
 					newval = (bbr_filter_len_sec * USECS_IN_SECOND);
 					val = 2;
 				} else {
 					/*
 					 * Well does some adjustment make sense?
 					 */
 					if (cur_rttp < bbr->r_ctl.rc_probertt_int) {
 						/* We can reduce interval time some */
 						bbr->r_ctl.rc_probertt_int = cur_rttp;
 						reset_time_small(&bbr->r_ctl.rc_rttprop, newval);
 						val = 3;
 					}
 				}
 			}
 		}
 		if (val)
 			bbr_log_rtt_shrinks(bbr, cts, cur_rttp, newval, __LINE__, BBR_RTTS_RESETS_VALUES, val);
 	}
 }
 
 static void
 bbr_exit_probe_rtt(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts)
 {
 	/* Exit probe-rtt */
 
 	if (tp->snd_cwnd < bbr->r_ctl.rc_saved_cwnd) {
 		tp->snd_cwnd = bbr->r_ctl.rc_saved_cwnd;
 		bbr_log_type_cwndupd(bbr, 0, 0, 0, 12, 0, 0, __LINE__);
 	}
 	bbr_log_exit_gain(bbr, cts, 1);
 	bbr->rc_hit_state_1 = 0;
 	bbr->r_ctl.rc_rtt_shrinks = cts;
 	bbr->r_ctl.last_in_probertt = cts;
 	bbr_log_rtt_shrinks(bbr, cts, 0, 0, __LINE__, BBR_RTTS_RTTPROBE, 0);
 	bbr->r_ctl.bbr_lost_at_state = bbr->r_ctl.rc_lost;
 	bbr->r_ctl.r_app_limited_until = (ctf_flight_size(tp,
 					      (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes)) +
 					  bbr->r_ctl.rc_delivered);
 	if (SEQ_GT(cts, bbr->r_ctl.rc_bbr_state_time)) {
 		uint32_t time_in;
 
 		time_in = cts - bbr->r_ctl.rc_bbr_state_time;
 		counter_u64_add(bbr_state_time[bbr->rc_bbr_state], time_in);
 	}
 	if (bbr->rc_filled_pipe) {
 		/* Switch to probe_bw */
 		bbr->rc_bbr_state = BBR_STATE_PROBE_BW;
 		bbr->rc_bbr_substate = bbr_pick_probebw_substate(bbr, cts);
 		bbr->r_ctl.rc_bbr_cwnd_gain = bbr_cwnd_gain;
 		bbr_substate_change(bbr, cts, __LINE__, 0);
 		bbr_log_type_statechange(bbr, cts, __LINE__);
 	} else {
 		/* Back to startup */
 		bbr->rc_bbr_state = BBR_STATE_STARTUP;
 		bbr->r_ctl.rc_bbr_state_time = cts;
 		/*
 		 * We don't want to give a complete free 3
 		 * measurements until we exit, so we use
 		 * the number of pe's we were in probe-rtt
 		 * to add to the startup_epoch. That way
 		 * we will still retain the old state.
 		 */
 		bbr->r_ctl.rc_bbr_last_startup_epoch += (bbr->r_ctl.rc_pkt_epoch - bbr->r_ctl.rc_pe_of_prtt);
 		bbr->r_ctl.rc_lost_at_startup = bbr->r_ctl.rc_lost;
 		/* Make sure to use the lower pg when shifting back in */
 		if (bbr->r_ctl.rc_lost &&
 		    bbr_use_lower_gain_in_startup &&
 		    (bbr->rc_use_google == 0))
 			bbr->r_ctl.rc_bbr_hptsi_gain = bbr_startup_lower;
 		else
 			bbr->r_ctl.rc_bbr_hptsi_gain = bbr->r_ctl.rc_startup_pg;
 		bbr->r_ctl.rc_bbr_cwnd_gain = bbr->r_ctl.rc_startup_pg;
 		/* Probably not needed but set it anyway */
 		bbr_set_state_target(bbr, __LINE__);
 		bbr_log_type_statechange(bbr, cts, __LINE__);
 		bbr_log_startup_event(bbr, cts, bbr->r_ctl.rc_bbr_last_startup_epoch,
 		    bbr->r_ctl.rc_lost_at_startup, bbr_start_exit, 0);
 	}
 	bbr_check_probe_rtt_limits(bbr, cts);
 }
 
 static int32_t inline
 bbr_should_enter_probe_rtt(struct tcp_bbr *bbr, uint32_t cts)
 {
 	if ((bbr->rc_past_init_win == 1) &&
 	    (bbr->rc_in_persist == 0) &&
 	    (bbr_calc_time(cts, bbr->r_ctl.rc_rtt_shrinks) >= bbr->r_ctl.rc_probertt_int)) {
 		return (1);
 	}
 	if (bbr_can_force_probertt &&
 	    (bbr->rc_in_persist == 0) &&
 	    (TSTMP_GT(cts, bbr->r_ctl.last_in_probertt)) &&
 	    ((cts - bbr->r_ctl.last_in_probertt) > bbr->r_ctl.rc_probertt_int)) {
 		return (1);
 	}
 	return (0);
 }
 
 static int32_t
 bbr_google_startup(struct tcp_bbr *bbr, uint32_t cts, int32_t  pkt_epoch)
 {
 	uint64_t btlbw, gain;
 	if (pkt_epoch == 0) {
 		/*
 		 * Need to be on a pkt-epoch to continue.
 		 */
 		return (0);
 	}
 	btlbw = bbr_get_full_bw(bbr);
 	gain = ((bbr->r_ctl.rc_bbr_lastbtlbw *
 		 (uint64_t)bbr_start_exit) / (uint64_t)100) + bbr->r_ctl.rc_bbr_lastbtlbw;
 	if (btlbw >= gain) {
 		bbr->r_ctl.rc_bbr_last_startup_epoch = bbr->r_ctl.rc_pkt_epoch;
 		bbr_log_startup_event(bbr, cts, bbr->r_ctl.rc_bbr_last_startup_epoch,
 				      bbr->r_ctl.rc_lost_at_startup, bbr_start_exit, 3);
 		bbr->r_ctl.rc_bbr_lastbtlbw = btlbw;
 	}
 	if ((bbr->r_ctl.rc_pkt_epoch - bbr->r_ctl.rc_bbr_last_startup_epoch) >= BBR_STARTUP_EPOCHS)
 		return (1);
 	bbr_log_startup_event(bbr, cts, bbr->r_ctl.rc_bbr_last_startup_epoch,
 			      bbr->r_ctl.rc_lost_at_startup, bbr_start_exit, 8);
 	return(0);
 }
 
 static int32_t inline
 bbr_state_startup(struct tcp_bbr *bbr, uint32_t cts, int32_t epoch, int32_t pkt_epoch)
 {
 	/* Have we gained 25% in the last 3 packet based epoch's? */
 	uint64_t btlbw, gain;
 	int do_exit;
 	int delta, rtt_gain;
 
 	if ((bbr->rc_tp->snd_una == bbr->rc_tp->snd_max) &&
 	    (bbr_calc_time(cts, bbr->r_ctl.rc_went_idle_time) >= bbr_rtt_probe_time)) {
 		/*
 		 * This qualifies as a RTT_PROBE session since we drop the
 		 * data outstanding to nothing and waited more than
 		 * bbr_rtt_probe_time.
 		 */
 		bbr_log_rtt_shrinks(bbr, cts, 0, 0, __LINE__, BBR_RTTS_WASIDLE, 0);
 		bbr_set_reduced_rtt(bbr, cts, __LINE__);
 	}
 	if (bbr_should_enter_probe_rtt(bbr, cts)) {
 		bbr_enter_probe_rtt(bbr, cts, __LINE__);
 		return (0);
 	}
 	if (bbr->rc_use_google)
 		return (bbr_google_startup(bbr, cts,  pkt_epoch));
 
 	if ((bbr->r_ctl.rc_lost > bbr->r_ctl.rc_lost_at_startup) &&
 	    (bbr_use_lower_gain_in_startup)) {
 		/* Drop to a lower gain 1.5 x since we saw loss */
 		bbr->r_ctl.rc_bbr_hptsi_gain = bbr_startup_lower;
 	}
 	if (pkt_epoch == 0) {
 		/*
 		 * Need to be on a pkt-epoch to continue.
 		 */
 		return (0);
 	}
 	if (bbr_rtt_gain_thresh) {
 		/*
 		 * Do we allow a flow to stay
 		 * in startup with no loss and no
 		 * gain in rtt over a set threshold?
 		 */
 		if (bbr->r_ctl.rc_pkt_epoch_rtt &&
 		    bbr->r_ctl.startup_last_srtt &&
 		    (bbr->r_ctl.rc_pkt_epoch_rtt > bbr->r_ctl.startup_last_srtt)) {
 			delta = bbr->r_ctl.rc_pkt_epoch_rtt - bbr->r_ctl.startup_last_srtt;
 			rtt_gain = (delta * 100) / bbr->r_ctl.startup_last_srtt;
 		} else
 			rtt_gain = 0;
 		if ((bbr->r_ctl.startup_last_srtt == 0)  ||
 		    (bbr->r_ctl.rc_pkt_epoch_rtt < bbr->r_ctl.startup_last_srtt))
 			/* First time or new lower value */
 			bbr->r_ctl.startup_last_srtt = bbr->r_ctl.rc_pkt_epoch_rtt;
 
 		if ((bbr->r_ctl.rc_lost == 0) &&
 		    (rtt_gain < bbr_rtt_gain_thresh)) {
 			/*
 			 * No loss, and we are under
 			 * our gain threhold for
 			 * increasing RTT.
 			 */
 			if (bbr->r_ctl.rc_bbr_last_startup_epoch < bbr->r_ctl.rc_pkt_epoch)
 				bbr->r_ctl.rc_bbr_last_startup_epoch++;
 			bbr_log_startup_event(bbr, cts, rtt_gain,
 					      delta, bbr->r_ctl.startup_last_srtt, 10);
 			return (0);
 		}
 	}
 	if ((bbr->r_ctl.r_measurement_count == bbr->r_ctl.last_startup_measure) &&
 	    (bbr->r_ctl.rc_lost_at_startup == bbr->r_ctl.rc_lost) &&
 	    (!IN_RECOVERY(bbr->rc_tp->t_flags))) {
 		/*
 		 * We only assess if we have a new measurement when
 		 * we have no loss and are not in recovery.
 		 * Drag up by one our last_startup epoch so we will hold
 		 * the number of non-gain we have already accumulated.
 		 */
 		if (bbr->r_ctl.rc_bbr_last_startup_epoch < bbr->r_ctl.rc_pkt_epoch)
 			bbr->r_ctl.rc_bbr_last_startup_epoch++;
 		bbr_log_startup_event(bbr, cts, bbr->r_ctl.rc_bbr_last_startup_epoch,
 				      bbr->r_ctl.rc_lost_at_startup, bbr_start_exit, 9);
 		return (0);
 	}
 	/* Case where we reduced the lost (bad retransmit) */
 	if (bbr->r_ctl.rc_lost_at_startup > bbr->r_ctl.rc_lost)
 		bbr->r_ctl.rc_lost_at_startup = bbr->r_ctl.rc_lost;
 	bbr->r_ctl.last_startup_measure = bbr->r_ctl.r_measurement_count;
 	btlbw = bbr_get_full_bw(bbr);
 	if (bbr->r_ctl.rc_bbr_hptsi_gain == bbr_startup_lower)
 		gain = ((bbr->r_ctl.rc_bbr_lastbtlbw *
 			 (uint64_t)bbr_low_start_exit) / (uint64_t)100) + bbr->r_ctl.rc_bbr_lastbtlbw;
 	else
 		gain = ((bbr->r_ctl.rc_bbr_lastbtlbw *
 			 (uint64_t)bbr_start_exit) / (uint64_t)100) + bbr->r_ctl.rc_bbr_lastbtlbw;
 	do_exit = 0;
 	if (btlbw > bbr->r_ctl.rc_bbr_lastbtlbw)
 		bbr->r_ctl.rc_bbr_lastbtlbw = btlbw;
 	if (btlbw >= gain) {
 		bbr->r_ctl.rc_bbr_last_startup_epoch = bbr->r_ctl.rc_pkt_epoch;
 		/* Update the lost so we won't exit in next set of tests */
 		bbr->r_ctl.rc_lost_at_startup = bbr->r_ctl.rc_lost;
 		bbr_log_startup_event(bbr, cts, bbr->r_ctl.rc_bbr_last_startup_epoch,
 				      bbr->r_ctl.rc_lost_at_startup, bbr_start_exit, 3);
 	}
 	if ((bbr->rc_loss_exit &&
 	     (bbr->r_ctl.rc_lost > bbr->r_ctl.rc_lost_at_startup) &&
 	     (bbr->r_ctl.rc_pkt_epoch_loss_rate > bbr_startup_loss_thresh)) &&
 	    ((bbr->r_ctl.rc_pkt_epoch - bbr->r_ctl.rc_bbr_last_startup_epoch) >= BBR_STARTUP_EPOCHS)) {
 		/*
 		 * If we had no gain,  we had loss and that loss was above
 		 * our threshould, the rwnd is not constrained, and we have
 		 * had at least 3 packet epochs exit. Note that this is
 		 * switched off by sysctl. Google does not do this by the
 		 * way.
 		 */
 		if ((ctf_flight_size(bbr->rc_tp,
 			 (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes)) +
 		     (2 * max(bbr->r_ctl.rc_pace_max_segs, bbr->rc_tp->t_maxseg))) <= bbr->rc_tp->snd_wnd) {
 			do_exit = 1;
 			bbr_log_startup_event(bbr, cts, bbr->r_ctl.rc_bbr_last_startup_epoch,
 					      bbr->r_ctl.rc_lost_at_startup, bbr_start_exit, 4);
 		} else {
 			/* Just record an updated loss value */
 			bbr->r_ctl.rc_lost_at_startup = bbr->r_ctl.rc_lost;
 			bbr_log_startup_event(bbr, cts, bbr->r_ctl.rc_bbr_last_startup_epoch,
 					      bbr->r_ctl.rc_lost_at_startup, bbr_start_exit, 5);
 		}
 	} else
 		bbr->r_ctl.rc_lost_at_startup = bbr->r_ctl.rc_lost;
 	if (((bbr->r_ctl.rc_pkt_epoch - bbr->r_ctl.rc_bbr_last_startup_epoch) >= BBR_STARTUP_EPOCHS) ||
 	    do_exit) {
 		/* Return 1 to exit the startup state. */
 		return (1);
 	}
 	/* Stay in startup */
 	bbr_log_startup_event(bbr, cts, bbr->r_ctl.rc_bbr_last_startup_epoch,
 			      bbr->r_ctl.rc_lost_at_startup, bbr_start_exit, 8);
 	return (0);
 }
 
 static void
 bbr_state_change(struct tcp_bbr *bbr, uint32_t cts, int32_t epoch, int32_t pkt_epoch, uint32_t losses)
 {
 	/*
 	 * A tick occurred in the rtt epoch do we need to do anything?
 	 */
 #ifdef BBR_INVARIANTS
 	if ((bbr->rc_bbr_state != BBR_STATE_STARTUP) &&
 	    (bbr->rc_bbr_state != BBR_STATE_DRAIN) &&
 	    (bbr->rc_bbr_state != BBR_STATE_PROBE_RTT) &&
 	    (bbr->rc_bbr_state != BBR_STATE_IDLE_EXIT) &&
 	    (bbr->rc_bbr_state != BBR_STATE_PROBE_BW)) {
 		/* Debug code? */
 		panic("Unknown BBR state %d?\n", bbr->rc_bbr_state);
 	}
 #endif
 	if (bbr->rc_bbr_state == BBR_STATE_STARTUP) {
 		/* Do we exit the startup state? */
 		if (bbr_state_startup(bbr, cts, epoch, pkt_epoch)) {
 			uint32_t time_in;
 
 			bbr_log_startup_event(bbr, cts, bbr->r_ctl.rc_bbr_last_startup_epoch,
 					      bbr->r_ctl.rc_lost_at_startup, bbr_start_exit, 6);
 			bbr->rc_filled_pipe = 1;
 			bbr->r_ctl.bbr_lost_at_state = bbr->r_ctl.rc_lost;
 			if (SEQ_GT(cts, bbr->r_ctl.rc_bbr_state_time)) {
 				time_in = cts - bbr->r_ctl.rc_bbr_state_time;
 				counter_u64_add(bbr_state_time[bbr->rc_bbr_state], time_in);
 			} else
 				time_in = 0;
 			if (bbr->rc_no_pacing)
 				bbr->rc_no_pacing = 0;
 			bbr->r_ctl.rc_bbr_state_time = cts;
 			bbr->r_ctl.rc_bbr_hptsi_gain = bbr->r_ctl.rc_drain_pg;
 			bbr->rc_bbr_state = BBR_STATE_DRAIN;
 			bbr_set_state_target(bbr, __LINE__);
 			if ((bbr->rc_use_google == 0) &&
 			    bbr_slam_cwnd_in_main_drain) {
 				/* Here we don't have to worry about probe-rtt */
 				bbr->r_ctl.rc_saved_cwnd = bbr->rc_tp->snd_cwnd;
 				bbr->rc_tp->snd_cwnd = bbr->r_ctl.rc_target_at_state;
 				bbr_log_type_cwndupd(bbr, 0, 0, 0, 12, 0, 0, __LINE__);
 			}
 			bbr->r_ctl.rc_bbr_cwnd_gain = bbr_high_gain;
 			bbr_log_type_statechange(bbr, cts, __LINE__);
 			if (ctf_flight_size(bbr->rc_tp,
 			        (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes)) <=
 			    bbr->r_ctl.rc_target_at_state) {
 				/*
 				 * Switch to probe_bw if we are already
 				 * there
 				 */
 				bbr->rc_bbr_substate = bbr_pick_probebw_substate(bbr, cts);
 				bbr_substate_change(bbr, cts, __LINE__, 0);
 				bbr->rc_bbr_state = BBR_STATE_PROBE_BW;
 				bbr_log_type_statechange(bbr, cts, __LINE__);
 			}
 		}
 	} else if (bbr->rc_bbr_state == BBR_STATE_IDLE_EXIT) {
 		uint32_t inflight;
 		struct tcpcb *tp;
 
 		tp = bbr->rc_tp;
 		inflight = ctf_flight_size(tp,
 			      (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes));
 		if (inflight >= bbr->r_ctl.rc_target_at_state) {
 			/* We have reached a flight of the cwnd target */
 			bbr->rc_bbr_state = BBR_STATE_PROBE_BW;
 			bbr->r_ctl.rc_bbr_hptsi_gain = BBR_UNIT;
 			bbr->r_ctl.rc_bbr_cwnd_gain = BBR_UNIT;
 			bbr_set_state_target(bbr, __LINE__);
 			/*
 			 * Rig it so we don't do anything crazy and
 			 * start fresh with a new randomization.
 			 */
 			bbr->r_ctl.bbr_smallest_srtt_this_state = 0xffffffff;
 			bbr->rc_bbr_substate = BBR_SUB_LEVEL6;
 			bbr_substate_change(bbr, cts, __LINE__, 1);
 		}
 	} else if (bbr->rc_bbr_state == BBR_STATE_DRAIN) {
 		/* Has in-flight reached the bdp (or less)? */
 		uint32_t inflight;
 		struct tcpcb *tp;
 
 		tp = bbr->rc_tp;
 		inflight = ctf_flight_size(tp,
 			      (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes));
 		if ((bbr->rc_use_google == 0) &&
 		    bbr_slam_cwnd_in_main_drain &&
 		    (bbr->rc_tp->snd_cwnd > bbr->r_ctl.rc_target_at_state)) {
 			/*
 			 * Here we don't have to worry about probe-rtt
 			 * re-slam it, but keep it slammed down.
 			 */
 			bbr->rc_tp->snd_cwnd = bbr->r_ctl.rc_target_at_state;
 			bbr_log_type_cwndupd(bbr, 0, 0, 0, 12, 0, 0, __LINE__);
 		}
 		if (inflight <= bbr->r_ctl.rc_target_at_state) {
 			/* We have drained */
 			bbr->rc_bbr_state = BBR_STATE_PROBE_BW;
 			bbr->r_ctl.bbr_lost_at_state = bbr->r_ctl.rc_lost;
 			if (SEQ_GT(cts, bbr->r_ctl.rc_bbr_state_time)) {
 				uint32_t time_in;
 
 				time_in = cts - bbr->r_ctl.rc_bbr_state_time;
 				counter_u64_add(bbr_state_time[bbr->rc_bbr_state], time_in);
 			}
 			if ((bbr->rc_use_google == 0) &&
 			    bbr_slam_cwnd_in_main_drain &&
 			    (tp->snd_cwnd < bbr->r_ctl.rc_saved_cwnd)) {
 				/* Restore the cwnd */
 				tp->snd_cwnd = bbr->r_ctl.rc_saved_cwnd;
 				bbr_log_type_cwndupd(bbr, 0, 0, 0, 12, 0, 0, __LINE__);
 			}
 			/* Setup probe-rtt has being done now RRS-HERE */
 			bbr->r_ctl.rc_rtt_shrinks = cts;
 			bbr->r_ctl.last_in_probertt = cts;
 			bbr_log_rtt_shrinks(bbr, cts, 0, 0, __LINE__, BBR_RTTS_LEAVE_DRAIN, 0);
 			/* Randomly pick a sub-state */
 			bbr->rc_bbr_substate = bbr_pick_probebw_substate(bbr, cts);
 			bbr_substate_change(bbr, cts, __LINE__, 0);
 			bbr_log_type_statechange(bbr, cts, __LINE__);
 		}
 	} else if (bbr->rc_bbr_state == BBR_STATE_PROBE_RTT) {
 		uint32_t flight;
 
 		flight = ctf_flight_size(bbr->rc_tp,
 			     (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes));
 		bbr->r_ctl.r_app_limited_until = (flight + bbr->r_ctl.rc_delivered);
 		if (((bbr->r_ctl.bbr_rttprobe_gain_val == 0) || bbr->rc_use_google) &&
 		    (bbr->rc_tp->snd_cwnd > bbr->r_ctl.rc_target_at_state)) {
 			/*
 			 * We must keep cwnd at the desired MSS.
 			 */
 			bbr->rc_tp->snd_cwnd = bbr_rtt_probe_cwndtarg * (bbr->rc_tp->t_maxseg - bbr->rc_last_options);
 			bbr_log_type_cwndupd(bbr, 0, 0, 0, 12, 0, 0, __LINE__);
 		} else if ((bbr_prtt_slam_cwnd) &&
 			   (bbr->rc_tp->snd_cwnd > bbr->r_ctl.rc_target_at_state)) {
 			/* Re-slam it */
 			bbr->rc_tp->snd_cwnd = bbr->r_ctl.rc_target_at_state;
 			bbr_log_type_cwndupd(bbr, 0, 0, 0, 12, 0, 0, __LINE__);
 		}
 		if (bbr->r_ctl.rc_bbr_enters_probertt == 0) {
 			/* Has outstanding reached our target? */
 			if (flight <= bbr->r_ctl.rc_target_at_state) {
 				bbr_log_rtt_shrinks(bbr, cts, 0, 0, __LINE__, BBR_RTTS_REACHTAR, 0);
 				bbr->r_ctl.rc_bbr_enters_probertt = cts;
 				/* If time is exactly 0, be 1usec off */
 				if (bbr->r_ctl.rc_bbr_enters_probertt == 0)
 					bbr->r_ctl.rc_bbr_enters_probertt = 1;
 				if (bbr->rc_use_google == 0) {
 					/*
 					 * Restore any lowering that as occurred to
 					 * reach here
 					 */
 					if (bbr->r_ctl.bbr_rttprobe_gain_val)
 						bbr->r_ctl.rc_bbr_hptsi_gain = bbr->r_ctl.bbr_rttprobe_gain_val;
 					else
 						bbr->r_ctl.rc_bbr_hptsi_gain = BBR_UNIT;
 				}
 			}
 			if ((bbr->r_ctl.rc_bbr_enters_probertt == 0) &&
 			    (bbr->rc_use_google == 0) &&
 			    bbr->r_ctl.bbr_rttprobe_gain_val &&
 			    (((cts - bbr->r_ctl.rc_probertt_srttchktim) > bbr_get_rtt(bbr, bbr_drain_rtt)) ||
 			     (flight >= bbr->r_ctl.flightsize_at_drain))) {
 				/*
 				 * We have doddled with our current hptsi
 				 * gain an srtt and have still not made it
 				 * to target, or we have increased our flight.
 				 * Lets reduce the gain by xx%
 				 * flooring the reduce at DRAIN (based on
 				 * mul/div)
 				 */
 				int red;
 
 				bbr->r_ctl.flightsize_at_drain = flight;
 				bbr->r_ctl.rc_probertt_srttchktim = cts;
 				red = max((bbr->r_ctl.bbr_rttprobe_gain_val / 10), 1);
 				if ((bbr->r_ctl.rc_bbr_hptsi_gain - red) > max(bbr_drain_floor, 1)) {
 					/* Reduce our gain again */
 					bbr->r_ctl.rc_bbr_hptsi_gain -= red;
 					bbr_log_rtt_shrinks(bbr, cts, 0, 0, __LINE__, BBR_RTTS_SHRINK_PG, 0);
 				} else if (bbr->r_ctl.rc_bbr_hptsi_gain > max(bbr_drain_floor, 1)) {
 					/* one more chance before we give up */
 					bbr->r_ctl.rc_bbr_hptsi_gain = max(bbr_drain_floor, 1);
 					bbr_log_rtt_shrinks(bbr, cts, 0, 0, __LINE__, BBR_RTTS_SHRINK_PG_FINAL, 0);
 				} else {
 					/* At the very bottom */
 					bbr->r_ctl.rc_bbr_hptsi_gain = max((bbr_drain_floor-1), 1);
 				}
 			}
 		}
 		if (bbr->r_ctl.rc_bbr_enters_probertt &&
 		    (TSTMP_GT(cts, bbr->r_ctl.rc_bbr_enters_probertt)) &&
 		    ((cts - bbr->r_ctl.rc_bbr_enters_probertt) >= bbr_rtt_probe_time)) {
 			/* Time to exit probe RTT normally */
 			bbr_exit_probe_rtt(bbr->rc_tp, bbr, cts);
 		}
 	} else if (bbr->rc_bbr_state == BBR_STATE_PROBE_BW) {
 		if ((bbr->rc_tp->snd_una == bbr->rc_tp->snd_max) &&
 		    (bbr_calc_time(cts, bbr->r_ctl.rc_went_idle_time) >= bbr_rtt_probe_time)) {
 			/*
 			 * This qualifies as a RTT_PROBE session since we
 			 * drop the data outstanding to nothing and waited
 			 * more than bbr_rtt_probe_time.
 			 */
 			bbr_log_rtt_shrinks(bbr, cts, 0, 0, __LINE__, BBR_RTTS_WASIDLE, 0);
 			bbr_set_reduced_rtt(bbr, cts, __LINE__);
 		}
 		if (bbr_should_enter_probe_rtt(bbr, cts)) {
 			bbr_enter_probe_rtt(bbr, cts, __LINE__);
 		} else {
 			bbr_set_probebw_gains(bbr, cts, losses);
 		}
 	}
 }
 
 static void
 bbr_check_bbr_for_state(struct tcp_bbr *bbr, uint32_t cts, int32_t line, uint32_t losses)
 {
 	int32_t epoch = 0;
 
 	if ((cts - bbr->r_ctl.rc_rcv_epoch_start) >= bbr_get_rtt(bbr, BBR_RTT_PROP)) {
 		bbr_set_epoch(bbr, cts, line);
 		/* At each epoch doe lt bw sampling */
 		epoch = 1;
 	}
 	bbr_state_change(bbr, cts, epoch, bbr->rc_is_pkt_epoch_now, losses);
 }
 
 static int
 bbr_do_segment_nounlock(struct mbuf *m, struct tcphdr *th, struct socket *so,
     struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, uint8_t iptos,
     int32_t nxt_pkt, struct timeval *tv)
 {
 	int32_t thflags, retval;
 	uint32_t cts, lcts;
 	uint32_t tiwin;
 	struct tcpopt to;
 	struct tcp_bbr *bbr;
 	struct bbr_sendmap *rsm;
 	struct timeval ltv;
 	int32_t did_out = 0;
 	uint16_t nsegs;
 	int32_t prev_state;
 	uint32_t lost;
 
 	nsegs = max(1, m->m_pkthdr.lro_nsegs);
 	bbr = (struct tcp_bbr *)tp->t_fb_ptr;
 	/* add in our stats */
 	kern_prefetch(bbr, &prev_state);
 	prev_state = 0;
 	thflags = tcp_get_flags(th);
 	/*
 	 * If this is either a state-changing packet or current state isn't
 	 * established, we require a write lock on tcbinfo.  Otherwise, we
 	 * allow the tcbinfo to be in either alocked or unlocked, as the
 	 * caller may have unnecessarily acquired a write lock due to a
 	 * race.
 	 */
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 	KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN",
 	    __func__));
 
 	tp->t_rcvtime = ticks;
 	/*
 	 * Unscale the window into a 32-bit value. For the SYN_SENT state
 	 * the scale is zero.
 	 */
 	tiwin = th->th_win << tp->snd_scale;
 #ifdef STATS
 	stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_FRWIN, tiwin);
 #endif
 
 	if (m->m_flags & M_TSTMP) {
 		/* Prefer the hardware timestamp if present */
 		struct timespec ts;
 
 		mbuf_tstmp2timespec(m, &ts);
 		bbr->rc_tv.tv_sec = ts.tv_sec;
 		bbr->rc_tv.tv_usec = ts.tv_nsec / 1000;
 		bbr->r_ctl.rc_rcvtime = cts = tcp_tv_to_usectick(&bbr->rc_tv);
 	} else if (m->m_flags & M_TSTMP_LRO) {
 		/* Next the arrival timestamp */
 		struct timespec ts;
 
 		mbuf_tstmp2timespec(m, &ts);
 		bbr->rc_tv.tv_sec = ts.tv_sec;
 		bbr->rc_tv.tv_usec = ts.tv_nsec / 1000;
 		bbr->r_ctl.rc_rcvtime = cts = tcp_tv_to_usectick(&bbr->rc_tv);
 	} else {
 		/*
 		 * Ok just get the current time.
 		 */
 		bbr->r_ctl.rc_rcvtime = lcts = cts = tcp_get_usecs(&bbr->rc_tv);
 	}
 	/*
 	 * Parse options on any incoming segment.
 	 */
 	tcp_dooptions(&to, (u_char *)(th + 1),
 	    (th->th_off << 2) - sizeof(struct tcphdr),
 	    (thflags & TH_SYN) ? TO_SYN : 0);
 
 	/*
 	 * If timestamps were negotiated during SYN/ACK and a
 	 * segment without a timestamp is received, silently drop
 	 * the segment, unless it is a RST segment or missing timestamps are
 	 * tolerated.
 	 * See section 3.2 of RFC 7323.
 	 */
 	if ((tp->t_flags & TF_RCVD_TSTMP) && !(to.to_flags & TOF_TS) &&
 	    ((thflags & TH_RST) == 0) && (V_tcp_tolerate_missing_ts == 0)) {
 		retval = 0;
 		m_freem(m);
 		goto done_with_input;
 	}
 	/*
 	 * If echoed timestamp is later than the current time, fall back to
 	 * non RFC1323 RTT calculation.  Normalize timestamp if syncookies
 	 * were used when this connection was established.
 	 */
 	if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) {
 		to.to_tsecr -= tp->ts_offset;
 		if (TSTMP_GT(to.to_tsecr, tcp_tv_to_mssectick(&bbr->rc_tv)))
 			to.to_tsecr = 0;
 	}
 	/*
 	 * If its the first time in we need to take care of options and
 	 * verify we can do SACK for rack!
 	 */
 	if (bbr->r_state == 0) {
 		/*
 		 * Process options only when we get SYN/ACK back. The SYN
 		 * case for incoming connections is handled in tcp_syncache.
 		 * According to RFC1323 the window field in a SYN (i.e., a
 		 * <SYN> or <SYN,ACK>) segment itself is never scaled. XXX
 		 * this is traditional behavior, may need to be cleaned up.
 		 */
 		if (bbr->rc_inp == NULL) {
 			bbr->rc_inp = tp->t_inpcb;
 		}
 		/*
 		 * We need to init rc_inp here since its not init'd when
 		 * bbr_init is called
 		 */
 		if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) {
 			if ((to.to_flags & TOF_SCALE) &&
 			    (tp->t_flags & TF_REQ_SCALE)) {
 				tp->t_flags |= TF_RCVD_SCALE;
 				tp->snd_scale = to.to_wscale;
 			} else
 				tp->t_flags &= ~TF_REQ_SCALE;
 			/*
 			 * Initial send window.  It will be updated with the
 			 * next incoming segment to the scaled value.
 			 */
 			tp->snd_wnd = th->th_win;
 			if ((to.to_flags & TOF_TS) &&
 			    (tp->t_flags & TF_REQ_TSTMP)) {
 				tp->t_flags |= TF_RCVD_TSTMP;
 				tp->ts_recent = to.to_tsval;
 				tp->ts_recent_age = tcp_tv_to_mssectick(&bbr->rc_tv);
 			} else
 			    tp->t_flags &= ~TF_REQ_TSTMP;
 			if (to.to_flags & TOF_MSS)
 				tcp_mss(tp, to.to_mss);
 			if ((tp->t_flags & TF_SACK_PERMIT) &&
 			    (to.to_flags & TOF_SACKPERM) == 0)
 				tp->t_flags &= ~TF_SACK_PERMIT;
 			if (IS_FASTOPEN(tp->t_flags)) {
 				if (to.to_flags & TOF_FASTOPEN) {
 					uint16_t mss;
 
 					if (to.to_flags & TOF_MSS)
 						mss = to.to_mss;
 					else
 						if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0)
 							mss = TCP6_MSS;
 						else
 							mss = TCP_MSS;
 					tcp_fastopen_update_cache(tp, mss,
 					    to.to_tfo_len, to.to_tfo_cookie);
 				} else
 					tcp_fastopen_disable_path(tp);
 			}
 		}
 		/*
 		 * At this point we are at the initial call. Here we decide
 		 * if we are doing RACK or not. We do this by seeing if
 		 * TF_SACK_PERMIT is set, if not rack is *not* possible and
 		 * we switch to the default code.
 		 */
 		if ((tp->t_flags & TF_SACK_PERMIT) == 0) {
 			/* Bail */
 			tcp_switch_back_to_default(tp);
 			(*tp->t_fb->tfb_tcp_do_segment) (m, th, so, tp, drop_hdrlen,
 			    tlen, iptos);
 			return (1);
 		}
 		/* Set the flag */
 		bbr->r_is_v6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0;
 		tcp_set_hpts(tp->t_inpcb);
 		sack_filter_clear(&bbr->r_ctl.bbr_sf, th->th_ack);
 	}
 	if (thflags & TH_ACK) {
 		/* Track ack types */
 		if (to.to_flags & TOF_SACK)
 			BBR_STAT_INC(bbr_acks_with_sacks);
 		else
 			BBR_STAT_INC(bbr_plain_acks);
 	}
 	/*
 	 * This is the one exception case where we set the rack state
 	 * always. All other times (timers etc) we must have a rack-state
 	 * set (so we assure we have done the checks above for SACK).
 	 */
 	if (thflags & TH_FIN)
 		tcp_log_end_status(tp, TCP_EI_STATUS_CLIENT_FIN);
 	if (bbr->r_state != tp->t_state)
 		bbr_set_state(tp, bbr, tiwin);
 
 	if (SEQ_GT(th->th_ack, tp->snd_una) && (rsm = TAILQ_FIRST(&bbr->r_ctl.rc_map)) != NULL)
 		kern_prefetch(rsm, &prev_state);
 	prev_state = bbr->r_state;
 	bbr->rc_ack_was_delayed = 0;
 	lost = bbr->r_ctl.rc_lost;
 	bbr->rc_is_pkt_epoch_now = 0;
 	if (m->m_flags & (M_TSTMP|M_TSTMP_LRO)) {
 		/* Get the real time into lcts and figure the real delay */
 		lcts = tcp_get_usecs(&ltv);
 		if (TSTMP_GT(lcts, cts)) {
 			bbr->r_ctl.rc_ack_hdwr_delay = lcts - cts;
 			bbr->rc_ack_was_delayed = 1;
 			if (TSTMP_GT(bbr->r_ctl.rc_ack_hdwr_delay,
 				     bbr->r_ctl.highest_hdwr_delay))
 				bbr->r_ctl.highest_hdwr_delay = bbr->r_ctl.rc_ack_hdwr_delay;
 		} else {
 			bbr->r_ctl.rc_ack_hdwr_delay = 0;
 			bbr->rc_ack_was_delayed = 0;
 		}
 	} else {
 		bbr->r_ctl.rc_ack_hdwr_delay = 0;
 		bbr->rc_ack_was_delayed = 0;
 	}
 	bbr_log_ack_event(bbr, th, &to, tlen, nsegs, cts, nxt_pkt, m);
 	if ((thflags & TH_SYN) && (thflags & TH_FIN) && V_drop_synfin) {
 		retval = 0;
 		m_freem(m);
 		goto done_with_input;
 	}
 	/*
 	 * If a segment with the ACK-bit set arrives in the SYN-SENT state
 	 * check SEQ.ACK first as described on page 66 of RFC 793, section 3.9.
 	 */
 	if ((tp->t_state == TCPS_SYN_SENT) && (thflags & TH_ACK) &&
 	    (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) {
 		tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
 		ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
 		return (1);
 	}
 	if (tiwin > bbr->r_ctl.rc_high_rwnd)
 		bbr->r_ctl.rc_high_rwnd = tiwin;
 #ifdef BBR_INVARIANTS
 	if ((tp->t_inpcb->inp_flags & INP_DROPPED) ||
 	    (tp->t_inpcb->inp_flags2 & INP_FREED)) {
 		panic("tp:%p bbr:%p given a dropped inp:%p",
 		    tp, bbr, tp->t_inpcb);
 	}
 #endif
 	bbr->r_ctl.rc_flight_at_input = ctf_flight_size(tp,
 					    (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes));
 	bbr->rtt_valid = 0;
 	if (to.to_flags & TOF_TS) {
 		bbr->rc_ts_valid = 1;
 		bbr->r_ctl.last_inbound_ts = to.to_tsval;
 	} else {
 		bbr->rc_ts_valid = 0;
 		bbr->r_ctl.last_inbound_ts = 0;
 	}
 	retval = (*bbr->r_substate) (m, th, so,
 	    tp, &to, drop_hdrlen,
 	    tlen, tiwin, thflags, nxt_pkt, iptos);
 #ifdef BBR_INVARIANTS
 	if ((retval == 0) &&
 	    (tp->t_inpcb == NULL)) {
 		panic("retval:%d tp:%p t_inpcb:NULL state:%d",
 		    retval, tp, prev_state);
 	}
 #endif
 	if (nxt_pkt == 0)
 		BBR_STAT_INC(bbr_rlock_left_ret0);
 	else
 		BBR_STAT_INC(bbr_rlock_left_ret1);
 	if (retval == 0) {
 		/*
 		 * If retval is 1 the tcb is unlocked and most likely the tp
 		 * is gone.
 		 */
 		INP_WLOCK_ASSERT(tp->t_inpcb);
 		tcp_bbr_xmit_timer_commit(bbr, tp, cts);
 		if (bbr->rc_is_pkt_epoch_now)
 			bbr_set_pktepoch(bbr, cts, __LINE__);
 		bbr_check_bbr_for_state(bbr, cts, __LINE__, (bbr->r_ctl.rc_lost - lost));
 		if (nxt_pkt == 0) {
 			if (bbr->r_wanted_output != 0) {
 				bbr->rc_output_starts_timer = 0;
 				did_out = 1;
 				if (tcp_output(tp) < 0)
 					return (1);
 			} else
 				bbr_start_hpts_timer(bbr, tp, cts, 6, 0, 0);
 		}
 		if ((nxt_pkt == 0) &&
 		    ((bbr->r_ctl.rc_hpts_flags & PACE_TMR_MASK) == 0) &&
 		    (SEQ_GT(tp->snd_max, tp->snd_una) ||
 		     (tp->t_flags & TF_DELACK) ||
 		     ((V_tcp_always_keepalive || bbr->rc_inp->inp_socket->so_options & SO_KEEPALIVE) &&
 		      (tp->t_state <= TCPS_CLOSING)))) {
 			/*
 			 * We could not send (probably in the hpts but
 			 * stopped the timer)?
 			 */
 			if ((tp->snd_max == tp->snd_una) &&
 			    ((tp->t_flags & TF_DELACK) == 0) &&
 			    (tcp_in_hpts(bbr->rc_inp)) &&
 			    (bbr->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) {
 				/*
 				 * keep alive not needed if we are hptsi
 				 * output yet
 				 */
 				;
 			} else {
 				if (tcp_in_hpts(bbr->rc_inp)) {
 					tcp_hpts_remove(bbr->rc_inp);
 					if ((bbr->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) &&
 					    (TSTMP_GT(lcts, bbr->rc_pacer_started))) {
 						uint32_t del;
 
 						del = lcts - bbr->rc_pacer_started;
 						if (bbr->r_ctl.rc_last_delay_val > del) {
 							BBR_STAT_INC(bbr_force_timer_start);
 							bbr->r_ctl.rc_last_delay_val -= del;
 							bbr->rc_pacer_started = lcts;
 						} else {
 							/* We are late */
 							bbr->r_ctl.rc_last_delay_val = 0;
 							BBR_STAT_INC(bbr_force_output);
 							if (tcp_output(tp) < 0)
 								return (1);
 						}
 					}
 				}
 				bbr_start_hpts_timer(bbr, tp, cts, 8, bbr->r_ctl.rc_last_delay_val,
 				    0);
 			}
 		} else if ((bbr->rc_output_starts_timer == 0) && (nxt_pkt == 0)) {
 			/* Do we have the correct timer running? */
 			bbr_timer_audit(tp, bbr, lcts, &so->so_snd);
 		}
 		/* Do we have a new state */
 		if (bbr->r_state != tp->t_state)
 			bbr_set_state(tp, bbr, tiwin);
 done_with_input:
 		bbr_log_doseg_done(bbr, cts, nxt_pkt, did_out);
 		if (did_out)
 			bbr->r_wanted_output = 0;
 #ifdef BBR_INVARIANTS
 		if (tp->t_inpcb == NULL) {
 			panic("OP:%d retval:%d tp:%p t_inpcb:NULL state:%d",
 			    did_out,
 			    retval, tp, prev_state);
 		}
 #endif
 	}
 	return (retval);
 }
 
 static void
 bbr_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
     struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, uint8_t iptos)
 {
 	struct timeval tv;
 	int retval;
 
 	/* First lets see if we have old packets */
 	if (tp->t_in_pkt) {
 		if (ctf_do_queued_segments(so, tp, 1)) {
 			m_freem(m);
 			return;
 		}
 	}
 	if (m->m_flags & M_TSTMP_LRO) {
 		mbuf_tstmp2timeval(m, &tv);
 	} else {
 		/* Should not be should we kassert instead? */
 		tcp_get_usecs(&tv);
 	}
 	retval = bbr_do_segment_nounlock(m, th, so, tp,
 					 drop_hdrlen, tlen, iptos, 0, &tv);
 	if (retval == 0) {
 		INP_WUNLOCK(tp->t_inpcb);
 	}
 }
 
 /*
  * Return how much data can be sent without violating the
  * cwnd or rwnd.
  */
 
 static inline uint32_t
 bbr_what_can_we_send(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t sendwin,
     uint32_t avail, int32_t sb_offset, uint32_t cts)
 {
 	uint32_t in_flight, room;
 
 	/* The peer's receive window is already full: nothing may go out. */
 	if (ctf_outstanding(tp) >= tp->snd_wnd)
 		return (0);
 
 	in_flight = ctf_flight_size(tp,
 	    (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes));
 	/*
 	 * Everything sendwin allows is already in flight.  (Had the rwnd
 	 * been the limiter, the check above would have caught it.)
 	 */
 	if (in_flight >= sendwin)
 		return (0);
 
 	room = sendwin - in_flight;
 	/* Clamp so that outstanding data never exceeds the rwnd. */
 	if ((room + ctf_outstanding(tp)) > tp->snd_wnd)
 		room = tp->snd_wnd - ctf_outstanding(tp);
 	/* Clamp to what the socket buffer holds past sb_offset. */
 	if ((room + sb_offset) > avail)
 		room = avail - sb_offset;
 	return (room);
 }
 
 /*
  * Record a failed send of len bytes.  The counters are only maintained
  * when NETFLIX_STATS is compiled in; otherwise this is a no-op.
  */
 static inline void
 bbr_do_error_accounting(struct tcpcb *tp, struct tcp_bbr *bbr, struct bbr_sendmap *rsm, int32_t len, int32_t error)
 {
 #ifdef NETFLIX_STATS
 	/* Bytes and packets that failed to go out. */
 	KMOD_TCPSTAT_ADD(tcps_sndbyte_error, len);
 	KMOD_TCPSTAT_INC(tcps_sndpack_error);
 #endif
 }
 
 static inline void
 bbr_do_send_accounting(struct tcpcb *tp, struct tcp_bbr *bbr, struct bbr_sendmap *rsm, int32_t len, int32_t error)
 {
 	if (error) {
 		bbr_do_error_accounting(tp, bbr, rsm, len, error);
 		return;
 	}
 	if (rsm) {
 		if (rsm->r_flags & BBR_TLP) {
 			/*
 			 * TLP should not count in retran count, but in its
 			 * own bin
 			 */
 #ifdef NETFLIX_STATS
 			KMOD_TCPSTAT_INC(tcps_tlpresends);
 			KMOD_TCPSTAT_ADD(tcps_tlpresend_bytes, len);
 #endif
 		} else {
 			/* Retransmit */
 			tp->t_sndrexmitpack++;
 			KMOD_TCPSTAT_INC(tcps_sndrexmitpack);
 			KMOD_TCPSTAT_ADD(tcps_sndrexmitbyte, len);
 #ifdef STATS
 			stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RETXPB,
 			    len);
 #endif
 		}
 		/*
 		 * Logs in 0 - 8, 8 is all non probe_bw states 0-7 is
 		 * sub-state
 		 */
 		counter_u64_add(bbr_state_lost[rsm->r_bbr_state], len);
 		if (bbr->rc_bbr_state != BBR_STATE_PROBE_BW) {
 			/* Non probe_bw log in 1, 2, or 4. */
 			counter_u64_add(bbr_state_resend[bbr->rc_bbr_state], len);
 		} else {
 			/*
 			 * Log our probe state 3, and log also 5-13 to show
 			 * us the recovery sub-state for the send. This
 			 * means that 3 == (5+6+7+8+9+10+11+12+13)
 			 */
 			counter_u64_add(bbr_state_resend[BBR_STATE_PROBE_BW], len);
 			counter_u64_add(bbr_state_resend[(bbr_state_val(bbr) + 5)], len);
 		}
 		/* Place in both 16's the totals of retransmitted */
 		counter_u64_add(bbr_state_lost[16], len);
 		counter_u64_add(bbr_state_resend[16], len);
 		/* Place in 17's the total sent */
 		counter_u64_add(bbr_state_resend[17], len);
 		counter_u64_add(bbr_state_lost[17], len);
 
 	} else {
 		/* New sends */
 		KMOD_TCPSTAT_INC(tcps_sndpack);
 		KMOD_TCPSTAT_ADD(tcps_sndbyte, len);
 		/* Place in 17's the total sent */
 		counter_u64_add(bbr_state_resend[17], len);
 		counter_u64_add(bbr_state_lost[17], len);
 #ifdef STATS
 		stats_voi_update_abs_u64(tp->t_stats, VOI_TCP_TXPB,
 		    len);
 #endif
 	}
 }
 
 /*
  * Clamp tp->snd_cwnd to bbr_target_cwnd_mult_limit times the sum of the
  * target cwnd (taken from the current b/w estimate) and the outstanding
  * data.  Nothing is done unless the pipe has been filled, the multiplier
  * is non-zero and rc_use_google is clear.
  */
 static void
 bbr_cwnd_limiting(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t in_level)
 {
 	uint32_t cwnd_cap;
 
 	if (!bbr->rc_filled_pipe || !bbr_target_cwnd_mult_limit ||
 	    (bbr->rc_use_google != 0))
 		return;
 
 	cwnd_cap = bbr_get_target_cwnd(bbr, bbr_get_bw(bbr), BBR_UNIT);
 	cwnd_cap += ctf_outstanding(tp);
 	cwnd_cap *= bbr_target_cwnd_mult_limit;
 	if (tp->snd_cwnd > cwnd_cap)
 		tp->snd_cwnd = cwnd_cap;
 	bbr_log_type_cwndupd(bbr, 0, 0, 0, 10, 0, 0, __LINE__);
 }
 
 /*
  * Decide whether a window update should be sent.  Returns 1 when the
  * receive window could be opened by a worthwhile amount and 0 otherwise.
  */
 static int
 bbr_window_update_needed(struct tcpcb *tp, struct socket *so, uint32_t recwin, int32_t maxseg)
 {
 	/*
 	 * "grow" is how far the advertised window could be opened, and
 	 * "prev_win" is the part of the last advertisement that is still
 	 * unused (zero if rcv_adv is not ahead of rcv_nxt).
 	 */
 	int32_t grow = recwin;
 	int32_t prev_win = 0;
 
 	if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt)) {
 		prev_win = (tp->rcv_adv - tp->rcv_nxt);
 		/* If recwin does not exceed it the window cannot grow. */
 		if (grow > prev_win)
 			grow -= prev_win;
 		else
 			grow = 0;
 	}
 
 	/*
 	 * Once scaled, a window that is no larger than the old one is
 	 * not worth forcing an update for.
 	 */
 	if (prev_win >> tp->rcv_scale >= (grow + prev_win) >> tp->rcv_scale)
 		return (0);
 
 	/* At least two segments, plus any one of the buffer-size tests. */
 	if (grow >= (2 * maxseg) &&
 	    (grow >= (so->so_rcv.sb_hiwat / 4) ||
 	    recwin <= (so->so_rcv.sb_hiwat / 8) ||
 	    so->so_rcv.sb_hiwat <= 8 * maxseg)) {
 		return (1);
 	}
 	/* Otherwise only when the increase is half the buffer or more. */
 	return (2 * grow >= (int32_t) so->so_rcv.sb_hiwat);
 }
 
 /*
  * Return 0 on success and an errno on failure to send.
  * Note that a 0 return may not mean we sent anything
  * if the TCB was on the hpts. A non-zero return
  * does indicate the error we got from ip[6]_output.
  */
 static int
 bbr_output_wtime(struct tcpcb *tp, const struct timeval *tv)
 {
 	struct socket *so;
 	int32_t len;
 	uint32_t cts;
 	uint32_t recwin, sendwin;
 	int32_t sb_offset;
 	int32_t flags, abandon, error = 0;
 	struct tcp_log_buffer *lgb = NULL;
 	struct mbuf *m;
 	struct mbuf *mb;
 	uint32_t if_hw_tsomaxsegcount = 0;
 	uint32_t if_hw_tsomaxsegsize = 0;
 	uint32_t if_hw_tsomax = 0;
 	struct ip *ip = NULL;
 #ifdef TCPDEBUG
 	struct ipovly *ipov = NULL;
 #endif
 	struct tcp_bbr *bbr;
 	struct tcphdr *th;
 	struct udphdr *udp = NULL;
 	u_char opt[TCP_MAXOLEN];
 	unsigned ipoptlen, optlen, hdrlen;
 	unsigned ulen;
 	uint32_t bbr_seq;
 	uint32_t delay_calc=0;
 	uint8_t doing_tlp = 0;
 	uint8_t local_options;
 #ifdef BBR_INVARIANTS
 	uint8_t doing_retran_from = 0;
 	uint8_t picked_up_retran = 0;
 #endif
 	uint8_t wanted_cookie = 0;
 	uint8_t more_to_rxt=0;
 	int32_t prefetch_so_done = 0;
 	int32_t prefetch_rsm = 0;
 	uint32_t tot_len = 0;
 	uint32_t maxseg, pace_max_segs, p_maxseg;
 	int32_t csum_flags = 0;
  	int32_t hw_tls;
 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
 	unsigned ipsec_optlen = 0;
 
 #endif
 	volatile int32_t sack_rxmit;
 	struct bbr_sendmap *rsm = NULL;
 	int32_t tso, mtu;
 	struct tcpopt to;
 	int32_t slot = 0;
 	struct inpcb *inp;
 	struct sockbuf *sb;
 	uint32_t hpts_calling;
 #ifdef INET6
 	struct ip6_hdr *ip6 = NULL;
 	int32_t isipv6;
 #endif
 	uint8_t app_limited = BBR_JR_SENT_DATA;
 	uint8_t filled_all = 0;
 	bbr = (struct tcp_bbr *)tp->t_fb_ptr;
 	/* We take a cache hit here */
 	memcpy(&bbr->rc_tv, tv, sizeof(struct timeval));
 	cts = tcp_tv_to_usectick(&bbr->rc_tv);
 	inp = bbr->rc_inp;
 	so = inp->inp_socket;
 	sb = &so->so_snd;
  	if (sb->sb_flags & SB_TLS_IFNET)
  		hw_tls = 1;
  	else
  		hw_tls = 0;
 	kern_prefetch(sb, &maxseg);
 	maxseg = tp->t_maxseg - bbr->rc_last_options;
 	if (bbr_minseg(bbr) < maxseg) {
 		tcp_bbr_tso_size_check(bbr, cts);
 	}
 	/* Remove any flags that indicate we are pacing on the inp  */
 	pace_max_segs = bbr->r_ctl.rc_pace_max_segs;
 	p_maxseg = min(maxseg, pace_max_segs);
 	INP_WLOCK_ASSERT(inp);
 #ifdef TCP_OFFLOAD
 	if (tp->t_flags & TF_TOE)
 		return (tcp_offload_output(tp));
 #endif
 
 #ifdef INET6
 	if (bbr->r_state) {
 		/* Use the cache line loaded if possible */
 		isipv6 = bbr->r_is_v6;
 	} else {
 		isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
 	}
 #endif
 	if (((bbr->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0) &&
 	    tcp_in_hpts(inp)) {
 		/*
 		 * We are on the hpts for some timer but not hptsi output.
 		 * Possibly remove from the hpts so we can send/recv etc.
 		 */
 		if ((tp->t_flags & TF_ACKNOW) == 0) {
 			/*
 			 * No immediate demand right now to send an ack, but
 			 * the user may have read, making room for new data
 			 * (a window update). If so we may want to cancel
 			 * whatever timer is running (KEEP/DEL-ACK?) and
 			 * continue to send out a window update. Or we may
 			 * have gotten more data into the socket buffer to
 			 * send.
 			 */
 			recwin = lmin(lmax(sbspace(&so->so_rcv), 0),
 				      (long)TCP_MAXWIN << tp->rcv_scale);
 			if ((bbr_window_update_needed(tp, so, recwin, maxseg) == 0) &&
 			    ((tcp_outflags[tp->t_state] & TH_RST) == 0) &&
 			    ((sbavail(sb) + ((tcp_outflags[tp->t_state] & TH_FIN) ? 1 : 0)) <=
 			    (tp->snd_max - tp->snd_una))) {
 				/*
 				 * Nothing new to send and no window update
 				 * is needed to send. Lets just return and
 				 * let the timer-run off.
 				 */
 				return (0);
 			}
 		}
 		tcp_hpts_remove(inp);
 		bbr_timer_cancel(bbr, __LINE__, cts);
 	}
 	if (bbr->r_ctl.rc_last_delay_val) {
 		/* Calculate a rough delay for early escape to sending  */
 		if (SEQ_GT(cts, bbr->rc_pacer_started))
 			delay_calc = cts - bbr->rc_pacer_started;
 		if (delay_calc >= bbr->r_ctl.rc_last_delay_val)
 			delay_calc -= bbr->r_ctl.rc_last_delay_val;
 		else
 			delay_calc = 0;
 	}
 	/* Mark that we have called bbr_output(). */
 	if ((bbr->r_timer_override) ||
 	    (tp->t_state < TCPS_ESTABLISHED)) {
 		/* Timeouts or early states are exempt */
 		if (tcp_in_hpts(inp))
 			tcp_hpts_remove(inp);
 	} else if (tcp_in_hpts(inp)) {
 		if ((bbr->r_ctl.rc_last_delay_val) &&
 		    (bbr->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) &&
 		    delay_calc) {
 			/*
 			 * We were being paced for output and the delay has
 			 * already exceeded when we were supposed to be
 			 * called, lets go ahead and pull out of the hpts
 			 * and call output.
 			 */
 			counter_u64_add(bbr_out_size[TCP_MSS_ACCT_LATE], 1);
 			bbr->r_ctl.rc_last_delay_val = 0;
 			tcp_hpts_remove(inp);
 		} else if (tp->t_state == TCPS_CLOSED) {
 			bbr->r_ctl.rc_last_delay_val = 0;
 			tcp_hpts_remove(inp);
 		} else {
 			/*
 			 * On the hpts, you shall not pass! even if ACKNOW
 			 * is on, we will when the hpts fires, unless of
 			 * course we are overdue.
 			 */
 			counter_u64_add(bbr_out_size[TCP_MSS_ACCT_INPACE], 1);
 			return (0);
 		}
 	}
 	bbr->rc_cwnd_limited = 0;
 	if (bbr->r_ctl.rc_last_delay_val) {
 		/* recalculate the real delay and deal with over/under  */
 		if (SEQ_GT(cts, bbr->rc_pacer_started))
 			delay_calc = cts - bbr->rc_pacer_started;
 		else
 			delay_calc = 0;
 		if (delay_calc >= bbr->r_ctl.rc_last_delay_val)
 			/* Setup the delay which will be added in */
 			delay_calc -= bbr->r_ctl.rc_last_delay_val;
 		else {
 			/*
 			 * We are early setup to adjust
 			 * our slot time.
 			 */
 			uint64_t merged_val;
 
 			bbr->r_ctl.rc_agg_early += (bbr->r_ctl.rc_last_delay_val - delay_calc);
 			bbr->r_agg_early_set = 1;
 			if (bbr->r_ctl.rc_hptsi_agg_delay) {
 				if (bbr->r_ctl.rc_hptsi_agg_delay >= bbr->r_ctl.rc_agg_early) {
 					/* Nope our previous late cancels out the early */
 					bbr->r_ctl.rc_hptsi_agg_delay -= bbr->r_ctl.rc_agg_early;
 					bbr->r_agg_early_set = 0;
 					bbr->r_ctl.rc_agg_early = 0;
 				} else {
 					bbr->r_ctl.rc_agg_early -= bbr->r_ctl.rc_hptsi_agg_delay;
 					bbr->r_ctl.rc_hptsi_agg_delay = 0;
 				}
 			}
 			merged_val = bbr->rc_pacer_started;
 			merged_val <<= 32;
 			merged_val |= bbr->r_ctl.rc_last_delay_val;
 			bbr_log_pacing_delay_calc(bbr, inp->inp_hpts_calls,
 						 bbr->r_ctl.rc_agg_early, cts, delay_calc, merged_val,
 						 bbr->r_agg_early_set, 3);
 			bbr->r_ctl.rc_last_delay_val = 0;
 			BBR_STAT_INC(bbr_early);
 			delay_calc = 0;
 		}
 	} else {
 		/* We were not delayed due to hptsi */
 		if (bbr->r_agg_early_set)
 			bbr->r_ctl.rc_agg_early = 0;
 		bbr->r_agg_early_set = 0;
 		delay_calc = 0;
 	}
 	if (delay_calc) {
 		/*
 		 * We had a hptsi delay which means we are falling behind on
 		 * sending at the expected rate. Calculate an extra amount
 		 * of data we can send, if any, to put us back on track.
 		 */
 		if ((bbr->r_ctl.rc_hptsi_agg_delay + delay_calc) < bbr->r_ctl.rc_hptsi_agg_delay)
 			bbr->r_ctl.rc_hptsi_agg_delay = 0xffffffff;
 		else
 			bbr->r_ctl.rc_hptsi_agg_delay += delay_calc;
 	}
 	sendwin = min(tp->snd_wnd, tp->snd_cwnd);
 	if ((tp->snd_una == tp->snd_max) &&
 	    (bbr->rc_bbr_state != BBR_STATE_IDLE_EXIT) &&
 	    (sbavail(sb))) {
 		/*
 		 * Ok we have been idle with nothing outstanding
 		 * we possibly need to start fresh with either a new
 		 * suite of states or a fast-ramp up.
 		 */
 		bbr_restart_after_idle(bbr,
 				       cts, bbr_calc_time(cts, bbr->r_ctl.rc_went_idle_time));
 	}
 	/*
 	 * Now was there a hptsi delay where we are behind? We only count
 	 * being behind if: a) We are not in recovery. b) There was a delay.
 	 * <and> c) We had room to send something.
 	 *
 	 */
 	hpts_calling = inp->inp_hpts_calls;
 	inp->inp_hpts_calls = 0;
 	if (bbr->r_ctl.rc_hpts_flags & PACE_TMR_MASK) {
 		int retval;
 
 		retval = bbr_process_timers(tp, bbr, cts, hpts_calling);
 		if (retval != 0) {
 			counter_u64_add(bbr_out_size[TCP_MSS_ACCT_ATIMER], 1);
 			/*
 			 * If timers want tcp_drop(), then pass error out,
 			 * otherwise suppress it.
 			 */
 			return (retval < 0 ? retval : 0);
 		}
 	}
 	bbr->rc_inp->inp_flags2 &= ~INP_MBUF_QUEUE_READY;
 	if (hpts_calling &&
 	    (bbr->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) {
 		bbr->r_ctl.rc_last_delay_val = 0;
 	}
 	bbr->r_timer_override = 0;
 	bbr->r_wanted_output = 0;
 	/*
 	 * For TFO connections in SYN_RECEIVED, only allow the initial
 	 * SYN|ACK and those sent by the retransmit timer.
 	 */
 	if (IS_FASTOPEN(tp->t_flags) &&
 	    ((tp->t_state == TCPS_SYN_RECEIVED) ||
 	     (tp->t_state == TCPS_SYN_SENT)) &&
 	    SEQ_GT(tp->snd_max, tp->snd_una) &&	/* initial SYN or SYN|ACK sent */
 	    (tp->t_rxtshift == 0)) {	/* not a retransmit */
 		len = 0;
 		goto just_return_nolock;
 	}
 	/*
 	 * Before sending anything check for a state update. For hpts
 	 * calling without input this is important. If its input calling
 	 * then this was already done.
 	 */
 	if (bbr->rc_use_google == 0)
 		bbr_check_bbr_for_state(bbr, cts, __LINE__, 0);
 again:
 	/*
 	 * If we've recently taken a timeout, snd_max will be greater than
 	 * snd_nxt. BBR in general does not pay much attention to snd_nxt,
 	 * but for historic reasons the persist timer still uses it. This
 	 * means we have to look at it. All retransmissions that are not
 	 * persists use the rsm that needs to be sent, so snd_nxt is ignored.
 	 * At the end of this routine we always pull snd_nxt up to snd_max.
 	 */
 	doing_tlp = 0;
 #ifdef BBR_INVARIANTS
 	doing_retran_from = picked_up_retran = 0;
 #endif
 	error = 0;
 	tso = 0;
 	slot = 0;
 	mtu = 0;
 	sendwin = min(tp->snd_wnd, tp->snd_cwnd);
 	sb_offset = tp->snd_max - tp->snd_una;
 	flags = tcp_outflags[tp->t_state];
 	sack_rxmit = 0;
 	len = 0;
 	rsm = NULL;
 	if (flags & TH_RST) {
 		SOCKBUF_LOCK(sb);
 		goto send;
 	}
 recheck_resend:
 	while (bbr->r_ctl.rc_free_cnt < bbr_min_req_free) {
 		/* We need to always have one in reserve */
 		rsm = bbr_alloc(bbr);
 		if (rsm == NULL) {
 			error = ENOMEM;
 			/* Lie to get on the hpts */
 			tot_len = tp->t_maxseg;
 			if (hpts_calling)
 				/* Retry in a ms */
 				slot = 1001;
 			goto just_return_nolock;
 		}
 		TAILQ_INSERT_TAIL(&bbr->r_ctl.rc_free, rsm, r_next);
 		bbr->r_ctl.rc_free_cnt++;
 		rsm = NULL;
 	}
 	/* What do we send, a resend? */
 	if (bbr->r_ctl.rc_resend == NULL) {
 		/* Check for rack timeout */
 		bbr->r_ctl.rc_resend = bbr_check_recovery_mode(tp, bbr, cts);
 		if (bbr->r_ctl.rc_resend) {
 #ifdef BBR_INVARIANTS
 			picked_up_retran = 1;
 #endif
 			bbr_cong_signal(tp, NULL, CC_NDUPACK, bbr->r_ctl.rc_resend);
 		}
 	}
 	if (bbr->r_ctl.rc_resend) {
 		rsm = bbr->r_ctl.rc_resend;
 #ifdef BBR_INVARIANTS
 		doing_retran_from = 1;
 #endif
 		/* Remove any TLP flags its a RACK or T-O */
 		rsm->r_flags &= ~BBR_TLP;
 		bbr->r_ctl.rc_resend = NULL;
 		if (SEQ_LT(rsm->r_start, tp->snd_una)) {
 #ifdef BBR_INVARIANTS
 			panic("Huh, tp:%p bbr:%p rsm:%p start:%u < snd_una:%u\n",
 			    tp, bbr, rsm, rsm->r_start, tp->snd_una);
 			goto recheck_resend;
 #else
 			/* TSNH */
 			rsm = NULL;
 			goto recheck_resend;
 #endif
 		}
 		if (rsm->r_flags & BBR_HAS_SYN) {
 			/* Only retransmit a SYN by itself */
 			len = 0;
 			if ((flags & TH_SYN) == 0) {
 				/* Huh something is wrong */
 				rsm->r_start++;
 				if (rsm->r_start == rsm->r_end) {
 					/* Clean it up, somehow we missed the ack? */
 					bbr_log_syn(tp, NULL);
 				} else {
 					/* TFO with data? */
 					rsm->r_flags &= ~BBR_HAS_SYN;
 					len = rsm->r_end - rsm->r_start;
 				}
 			} else {
 				/* Retransmitting SYN */
 				rsm = NULL;
 				SOCKBUF_LOCK(sb);
 				goto send;
 			}
 		} else
 			len = rsm->r_end - rsm->r_start;
 		if ((bbr->rc_resends_use_tso == 0) &&
 		    (len > maxseg)) {
 			len = maxseg;
 			more_to_rxt = 1;
 		}
 		sb_offset = rsm->r_start - tp->snd_una;
 		if (len > 0) {
 			sack_rxmit = 1;
 			KMOD_TCPSTAT_INC(tcps_sack_rexmits);
 			KMOD_TCPSTAT_ADD(tcps_sack_rexmit_bytes,
 			    min(len, maxseg));
 		} else {
 			/* I dont think this can happen */
 			rsm = NULL;
 			goto recheck_resend;
 		}
 		BBR_STAT_INC(bbr_resends_set);
 	} else if (bbr->r_ctl.rc_tlp_send) {
 		/*
 		 * Tail loss probe
 		 */
 		doing_tlp = 1;
 		rsm = bbr->r_ctl.rc_tlp_send;
 		bbr->r_ctl.rc_tlp_send = NULL;
 		sack_rxmit = 1;
 		len = rsm->r_end - rsm->r_start;
 		if ((bbr->rc_resends_use_tso == 0) && (len > maxseg))
 			len = maxseg;
 
 		if (SEQ_GT(tp->snd_una, rsm->r_start)) {
 #ifdef BBR_INVARIANTS
 			panic("tp:%p bbc:%p snd_una:%u rsm:%p r_start:%u",
 			    tp, bbr, tp->snd_una, rsm, rsm->r_start);
 #else
 			/* TSNH */
 			rsm = NULL;
 			goto recheck_resend;
 #endif
 		}
 		sb_offset = rsm->r_start - tp->snd_una;
 		BBR_STAT_INC(bbr_tlp_set);
 	}
 	/*
 	 * Enforce a connection sendmap count limit if set
 	 * as long as we are not retransmiting.
 	 */
 	if ((rsm == NULL) &&
 	    (V_tcp_map_entries_limit > 0) &&
 	    (bbr->r_ctl.rc_num_maps_alloced >= V_tcp_map_entries_limit)) {
 		BBR_STAT_INC(bbr_alloc_limited);
 		if (!bbr->alloc_limit_reported) {
 			bbr->alloc_limit_reported = 1;
 			BBR_STAT_INC(bbr_alloc_limited_conns);
 		}
 		goto just_return_nolock;
 	}
 #ifdef BBR_INVARIANTS
 	if (rsm && SEQ_LT(rsm->r_start, tp->snd_una)) {
 		panic("tp:%p bbr:%p rsm:%p sb_offset:%u len:%u",
 		    tp, bbr, rsm, sb_offset, len);
 	}
 #endif
 	/*
 	 * Get standard flags, and add SYN or FIN if requested by 'hidden'
 	 * state flags.
 	 */
 	if (tp->t_flags & TF_NEEDFIN && (rsm == NULL))
 		flags |= TH_FIN;
 	if (tp->t_flags & TF_NEEDSYN)
 		flags |= TH_SYN;
 
 	if (rsm && (rsm->r_flags & BBR_HAS_FIN)) {
 		/* we are retransmitting the fin */
 		len--;
 		if (len) {
 			/*
 			 * When retransmitting data do *not* include the
 			 * FIN. This could happen from a TLP probe if we
 			 * allowed data with a FIN.
 			 */
 			flags &= ~TH_FIN;
 		}
 	} else if (rsm) {
 		if (flags & TH_FIN)
 			flags &= ~TH_FIN;
 	}
 	if ((sack_rxmit == 0) && (prefetch_rsm == 0)) {
 		void *end_rsm;
 
 		end_rsm = TAILQ_LAST_FAST(&bbr->r_ctl.rc_tmap, bbr_sendmap, r_tnext);
 		if (end_rsm)
 			kern_prefetch(end_rsm, &prefetch_rsm);
 		prefetch_rsm = 1;
 	}
 	SOCKBUF_LOCK(sb);
 	/*
 	 * If snd_nxt == snd_max and we have transmitted a FIN, the
 	 * sb_offset will be > 0 even if so_snd.sb_cc is 0, resulting in a
 	 * negative length.  This can also occur when TCP opens up its
 	 * congestion window while receiving additional duplicate acks after
 	 * fast-retransmit because TCP will reset snd_nxt to snd_max after
 	 * the fast-retransmit.
 	 *
 	 * In the normal retransmit-FIN-only case, however, snd_nxt will be
 	 * set to snd_una, the sb_offset will be 0, and the length may wind
 	 * up 0.
 	 *
 	 * If sack_rxmit is true we are retransmitting from the scoreboard
 	 * in which case len is already set.
 	 */
 	if (sack_rxmit == 0) {
 		uint32_t avail;
 
 		avail = sbavail(sb);
 		if (SEQ_GT(tp->snd_max, tp->snd_una))
 			sb_offset = tp->snd_max - tp->snd_una;
 		else
 			sb_offset = 0;
 		if (bbr->rc_tlp_new_data) {
 			/* TLP is forcing out new data */
 			uint32_t tlplen;
 
 			doing_tlp = 1;
 			tlplen = maxseg;
 
 			if (tlplen > (uint32_t)(avail - sb_offset)) {
 				tlplen = (uint32_t)(avail - sb_offset);
 			}
 			if (tlplen > tp->snd_wnd) {
 				len = tp->snd_wnd;
 			} else {
 				len = tlplen;
 			}
 			bbr->rc_tlp_new_data = 0;
 		} else {
 			len = bbr_what_can_we_send(tp, bbr, sendwin, avail, sb_offset, cts);
 			if ((len < p_maxseg) &&
 			    (bbr->rc_in_persist == 0) &&
 			    (ctf_outstanding(tp) >= (2 * p_maxseg)) &&
 			    ((avail - sb_offset) >= p_maxseg)) {
 				/*
 				 * We are not completing whats in the socket
 				 * buffer (i.e. there is at least a segment
 				 * waiting to send) and we have 2 or more
 				 * segments outstanding. There is no sense
 				 * of sending a little piece. Lets defer and
 				 * and wait until we can send a whole
 				 * segment.
 				 */
 				len = 0;
 			}
 			if (bbr->rc_in_persist) {
 				/*
 				 * We are in persists, figure out if
 				 * a retransmit is available (maybe the previous
 				 * persists we sent) or if we have to send new
 				 * data.
 				 */
 				rsm = TAILQ_FIRST(&bbr->r_ctl.rc_map);
 				if (rsm) {
 					len = rsm->r_end - rsm->r_start;
 					if (rsm->r_flags & BBR_HAS_FIN)
 						len--;
 					if ((bbr->rc_resends_use_tso == 0) && (len > maxseg))
 						len = maxseg;
 					if (len > 1)
 						BBR_STAT_INC(bbr_persist_reneg);
 					/*
 					 * XXXrrs we could force the len to
 					 * 1 byte here to cause the chunk to
 					 * split apart.. but that would then
 					 * mean we always retransmit it as
 					 * one byte even after the window
 					 * opens.
 					 */
 					sack_rxmit = 1;
 					sb_offset = rsm->r_start - tp->snd_una;
 				} else {
 					/*
 					 * First time through in persists or peer
 					 * acked our one byte. Though we do have
 					 * to have something in the sb.
 					 */
 					len = 1;
 					sb_offset = 0;
 					if (avail == 0)
 					    len = 0;
 				}
 			}
 		}
 	}
 	if (prefetch_so_done == 0) {
 		kern_prefetch(so, &prefetch_so_done);
 		prefetch_so_done = 1;
 	}
 	/*
 	 * Lop off SYN bit if it has already been sent.  However, if this is
 	 * SYN-SENT state and if segment contains data and if we don't know
 	 * that foreign host supports TAO, suppress sending segment.
 	 */
 	if ((flags & TH_SYN) && (rsm == NULL) &&
 	    SEQ_GT(tp->snd_max, tp->snd_una)) {
 		if (tp->t_state != TCPS_SYN_RECEIVED)
 			flags &= ~TH_SYN;
 		/*
 		 * When sending additional segments following a TFO SYN|ACK,
 		 * do not include the SYN bit.
 		 */
 		if (IS_FASTOPEN(tp->t_flags) &&
 		    (tp->t_state == TCPS_SYN_RECEIVED))
 			flags &= ~TH_SYN;
 		sb_offset--, len++;
 		if (sbavail(sb) == 0)
 			len = 0;
 	} else if ((flags & TH_SYN) && rsm) {
 		/*
 		 * Subtract one from the len for the SYN being
 		 * retransmitted.
 		 */
 		len--;
 	}
 	/*
 	 * Be careful not to send data and/or FIN on SYN segments. This
 	 * measure is needed to prevent interoperability problems with not
 	 * fully conformant TCP implementations.
 	 */
 	if ((flags & TH_SYN) && (tp->t_flags & TF_NOOPT)) {
 		len = 0;
 		flags &= ~TH_FIN;
 	}
 	/*
 	 * On TFO sockets, ensure no data is sent in the following cases:
 	 *
 	 *  - When retransmitting SYN|ACK on a passively-created socket
 	 *  - When retransmitting SYN on an actively created socket
 	 *  - When sending a zero-length cookie (cookie request) on an
 	 *    actively created socket
 	 *  - When the socket is in the CLOSED state (RST is being sent)
 	 */
 	if (IS_FASTOPEN(tp->t_flags) &&
 	    (((flags & TH_SYN) && (tp->t_rxtshift > 0)) ||
 	     ((tp->t_state == TCPS_SYN_SENT) &&
 	      (tp->t_tfo_client_cookie_len == 0)) ||
 	     (flags & TH_RST))) {
 		len = 0;
 		sack_rxmit = 0;
 		rsm = NULL;
 	}
 	/* Without fast-open there should never be data sent on a SYN */
 	if ((flags & TH_SYN) && (!IS_FASTOPEN(tp->t_flags)))
 		len = 0;
 	if (len <= 0) {
 		/*
 		 * If FIN has been sent but not acked, but we haven't been
 		 * called to retransmit, len will be < 0.  Otherwise, window
 		 * shrank after we sent into it.  If window shrank to 0,
 		 * cancel pending retransmit, pull snd_nxt back to (closed)
 		 * window, and set the persist timer if it isn't already
 		 * going.  If the window didn't close completely, just wait
 		 * for an ACK.
 		 *
 		 * We also do a general check here to ensure that we will
 		 * set the persist timer when we have data to send, but a
 		 * 0-byte window. This makes sure the persist timer is set
 		 * even if the packet hits one of the "goto send" lines
 		 * below.
 		 */
 		len = 0;
 		if ((tp->snd_wnd == 0) &&
 		    (TCPS_HAVEESTABLISHED(tp->t_state)) &&
 		    (tp->snd_una == tp->snd_max) &&
 		    (sb_offset < (int)sbavail(sb))) {
 			/*
 			 * Not enough room in the rwnd to send
 			 * a paced segment out.
 			 */
 			bbr_enter_persist(tp, bbr, cts, __LINE__);
 		}
 	} else if ((rsm == NULL) &&
 		   (doing_tlp == 0) &&
 		   (len < bbr->r_ctl.rc_pace_max_segs)) {
 		/*
 		 * We are not sending a full segment for
 		 * some reason. Should we not send anything (think
 		 * sws or persists)?
 		 */
 		if ((tp->snd_wnd < min((bbr->r_ctl.rc_high_rwnd/2), bbr_minseg(bbr))) &&
 		    (TCPS_HAVEESTABLISHED(tp->t_state)) &&
 		    (len < (int)(sbavail(sb) - sb_offset))) {
 			/*
 			 * Here the rwnd is less than
 			 * the pacing size, this is not a retransmit,
 			 * we are established and
 			 * the send is not the last in the socket buffer
 			 * lets not send, and possibly enter persists.
 			 */
 			len = 0;
 			if (tp->snd_max == tp->snd_una)
 				bbr_enter_persist(tp, bbr, cts, __LINE__);
 		} else if ((tp->snd_cwnd >= bbr->r_ctl.rc_pace_max_segs) &&
 			   (ctf_flight_size(tp, (bbr->r_ctl.rc_sacked +
 						 bbr->r_ctl.rc_lost_bytes)) > (2 * maxseg)) &&
 			   (len < (int)(sbavail(sb) - sb_offset)) &&
 			   (len < bbr_minseg(bbr))) {
 			/*
 			 * Here we are not retransmitting, and
 			 * the cwnd is not so small that we could
 			 * not send at least a min size (rxt timer
 			 * not having gone off), We have 2 segments or
 			 * more already in flight, its not the tail end
 			 * of the socket buffer  and the cwnd is blocking
 			 * us from sending out minimum pacing segment size.
 			 * Lets not send anything.
 			 */
 			bbr->rc_cwnd_limited = 1;
 			len = 0;
 		} else if (((tp->snd_wnd - ctf_outstanding(tp)) <
 			    min((bbr->r_ctl.rc_high_rwnd/2), bbr_minseg(bbr))) &&
 			   (ctf_flight_size(tp, (bbr->r_ctl.rc_sacked +
 						 bbr->r_ctl.rc_lost_bytes)) > (2 * maxseg)) &&
 			   (len < (int)(sbavail(sb) - sb_offset)) &&
 			   (TCPS_HAVEESTABLISHED(tp->t_state))) {
 			/*
 			 * Here we have a send window but we have
 			 * filled it up and we can't send another pacing segment.
 			 * We also have in flight more than 2 segments
 			 * and we are not completing the sb i.e. we allow
 			 * the last bytes of the sb to go out even if
 			 * its not a full pacing segment.
 			 */
 			len = 0;
 		}
 	}
 	/* len will be >= 0 after this point. */
 	KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__));
 	tcp_sndbuf_autoscale(tp, so, sendwin);
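 	/*
 	 * Persist check: zero out a small new-data send while at least a
 	 * threshold's worth of data still remains in the socket buffer.
 	 */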
 	if (bbr->rc_in_persist &&
 	    len &&
 	    (rsm == NULL) &&
 	    (len < min((bbr->r_ctl.rc_high_rwnd/2), bbr->r_ctl.rc_pace_max_segs))) {
 		/*
 		 * We are in persist, not doing a retransmit and don't have enough space
 		 * yet to send a full TSO. So is it at the end of the sb
 		 * if so we need to send else nuke to 0 and don't send.
 		 */
 		int sbleft;
 		if (sbavail(sb) > sb_offset)
 			sbleft = sbavail(sb) - sb_offset;
 		else
 			sbleft = 0;
 		if (sbleft >= min((bbr->r_ctl.rc_high_rwnd/2), bbr->r_ctl.rc_pace_max_segs)) {
 			/* not at end of sb lets not send */
 			len = 0;
 		}
 	}
 	/*
 	 * Decide if we can use TCP Segmentation Offloading (if supported by
 	 * hardware).
 	 *
 	 * TSO may only be used if we are in a pure bulk sending state.  The
 	 * presence of TCP-MD5, SACK retransmits, SACK advertizements and IP
 	 * options prevent using TSO.  With TSO the TCP header is the same
 	 * (except for the sequence number) for all generated packets.  This
 	 * makes it impossible to transmit any options which vary per
 	 * generated segment or packet.
 	 *
 	 * IPv4 handling has a clear separation of ip options and ip header
 	 * flags while IPv6 combines both in in6p_outputopts. ip6_optlen()
 	 * does the right thing below to provide length of just ip options
 	 * and thus checking for ipoptlen is enough to decide if ip options
 	 * are present.
 	 */
 #ifdef INET6
 	if (isipv6)
 		ipoptlen = ip6_optlen(inp);
 	else
 #endif
 	if (inp->inp_options)
 		ipoptlen = inp->inp_options->m_len -
 		    offsetof(struct ipoption, ipopt_list);
 	else
 		ipoptlen = 0;
 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
 	/*
 	 * Pre-calculate here as we save another lookup into the darknesses
 	 * of IPsec that way and can actually decide if TSO is ok.
 	 */
 #ifdef INET6
 	if (isipv6 && IPSEC_ENABLED(ipv6))
 		ipsec_optlen = IPSEC_HDRSIZE(ipv6, inp);
 #ifdef INET
 	else
 #endif
 #endif				/* INET6 */
 #ifdef INET
 	if (IPSEC_ENABLED(ipv4))
 		ipsec_optlen = IPSEC_HDRSIZE(ipv4, inp);
 #endif				/* INET */
 #endif				/* IPSEC */
 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
 	ipoptlen += ipsec_optlen;
 #endif
 	if ((tp->t_flags & TF_TSO) && V_tcp_do_tso &&
 	    (len > maxseg) &&
 	    (tp->t_port == 0) &&
 	    ((tp->t_flags & TF_SIGNATURE) == 0) &&
 	    tp->rcv_numsacks == 0 &&
 	    ipoptlen == 0)
 		tso = 1;
 
 	recwin = lmin(lmax(sbspace(&so->so_rcv), 0),
 	    (long)TCP_MAXWIN << tp->rcv_scale);
 	/*
 	 * Sender silly window avoidance.   We transmit under the following
 	 * conditions when len is non-zero:
 	 *
 	 * - We have a full segment (or more with TSO)
 	 * - This is the last buffer in a write()/send() and we are
 	 *   either idle or running NODELAY
 	 * - we've timed out (e.g. persist timer)
 	 * - we have more than 1/2 the maximum send window's worth of
 	 *   data (receiver may have limited the window size)
 	 * - we need to retransmit
 	 */
 	if (rsm)
 		goto send;
 	if (len) {
 		if (sack_rxmit)
 			goto send;
 		if (len >= p_maxseg)
 			goto send;
 		/*
 		 * NOTE! on localhost connections an 'ack' from the remote
 		 * end may occur synchronously with the output and cause us
 		 * to flush a buffer queued with moretocome.  XXX
 		 *
 		 */
 		if (((tp->t_flags & TF_MORETOCOME) == 0) &&	/* normal case */
 		    ((tp->t_flags & TF_NODELAY) ||
 		    ((uint32_t)len + (uint32_t)sb_offset) >= sbavail(&so->so_snd)) &&
 		    (tp->t_flags & TF_NOPUSH) == 0) {
 			goto send;
 		}
 		if ((tp->snd_una == tp->snd_max) && len) {	/* Nothing outstanding */
 			goto send;
 		}
 		if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0) {
 			goto send;
 		}
 	}
 	/*
 	 * Sending of standalone window updates.
 	 *
 	 * Window updates are important when we close our window due to a
 	 * full socket buffer and are opening it again after the application
 	 * reads data from it.  Once the window has opened again and the
 	 * remote end starts to send again the ACK clock takes over and
 	 * provides the most current window information.
 	 *
 	 * We must avoid the silly window syndrome whereas every read from
 	 * the receive buffer, no matter how small, causes a window update
 	 * to be sent.  We also should avoid sending a flurry of window
 	 * updates when the socket buffer had queued a lot of data and the
 	 * application is doing small reads.
 	 *
 	 * Prevent a flurry of pointless window updates by only sending an
 	 * update when we can increase the advertized window by more than
 	 * 1/4th of the socket buffer capacity.  When the buffer is getting
 	 * full or is very small be more aggressive and send an update
 	 * whenever we can increase by two mss sized segments. In all other
 	 * situations the ACK's to new incoming data will carry further
 	 * window increases.
 	 *
 	 * Don't send an independent window update if a delayed ACK is
 	 * pending (it will get piggy-backed on it) or the remote side
 	 * already has done a half-close and won't send more data.  Skip
 	 * this if the connection is in T/TCP half-open state.
 	 */
 	if (recwin > 0 && !(tp->t_flags & TF_NEEDSYN) &&
 	    !(tp->t_flags & TF_DELACK) &&
 	    !TCPS_HAVERCVDFIN(tp->t_state)) {
 		/* Check to see if we should do a window update */
 		if (bbr_window_update_needed(tp, so, recwin, maxseg))
 			goto send;
 	}
 	/*
 	 * Send if we owe the peer an ACK, RST, SYN.  ACKNOW
 	 * is also a catch-all for the retransmit timer timeout case.
 	 */
 	if (tp->t_flags & TF_ACKNOW) {
 		goto send;
 	}
 	if (flags & TH_RST) {
 		/* Always send a RST if one is due */
 		goto send;
 	}
 	if ((flags & TH_SYN) && (tp->t_flags & TF_NEEDSYN) == 0) {
 		goto send;
 	}
 	/*
 	 * If our state indicates that FIN should be sent and we have not
 	 * yet done so, then we need to send.
 	 */
 	if (flags & TH_FIN &&
 	    ((tp->t_flags & TF_SENTFIN) == 0)) {
 		goto send;
 	}
 	/*
 	 * No reason to send a segment, just return.
 	 */
 just_return:
 	SOCKBUF_UNLOCK(sb);
 just_return_nolock:
 	if (tot_len)
 		slot = bbr_get_pacing_delay(bbr, bbr->r_ctl.rc_bbr_hptsi_gain, tot_len, cts, 0);
 	if (bbr->rc_no_pacing)
 		slot = 0;
 	if (tot_len == 0) {
 		if ((ctf_outstanding(tp) + min((bbr->r_ctl.rc_high_rwnd/2), bbr_minseg(bbr))) >=
 		    tp->snd_wnd) {
 			BBR_STAT_INC(bbr_rwnd_limited);
 			app_limited = BBR_JR_RWND_LIMITED;
 			bbr_cwnd_limiting(tp, bbr, ctf_outstanding(tp));
 			if ((bbr->rc_in_persist == 0) &&
 			    TCPS_HAVEESTABLISHED(tp->t_state) &&
 			    (tp->snd_max == tp->snd_una) &&
 			    sbavail(&tp->t_inpcb->inp_socket->so_snd)) {
 				/* No send window.. we must enter persist */
 				bbr_enter_persist(tp, bbr, bbr->r_ctl.rc_rcvtime, __LINE__);
 			}
 		} else if (ctf_outstanding(tp) >= sbavail(sb)) {
 			BBR_STAT_INC(bbr_app_limited);
 			app_limited = BBR_JR_APP_LIMITED;
 			bbr_cwnd_limiting(tp, bbr, ctf_outstanding(tp));
 		} else if ((ctf_flight_size(tp, (bbr->r_ctl.rc_sacked +
 						 bbr->r_ctl.rc_lost_bytes)) + p_maxseg) >= tp->snd_cwnd) {
 			BBR_STAT_INC(bbr_cwnd_limited);
  			app_limited = BBR_JR_CWND_LIMITED;
 			bbr_cwnd_limiting(tp, bbr, ctf_flight_size(tp, (bbr->r_ctl.rc_sacked +
 									bbr->r_ctl.rc_lost_bytes)));
 			bbr->rc_cwnd_limited = 1;
 		} else {
 			BBR_STAT_INC(bbr_app_limited);
 			app_limited = BBR_JR_APP_LIMITED;
 			bbr_cwnd_limiting(tp, bbr, ctf_outstanding(tp));
 		}
 		bbr->r_ctl.rc_hptsi_agg_delay = 0;
 		bbr->r_agg_early_set = 0;
 		bbr->r_ctl.rc_agg_early = 0;
 		bbr->r_ctl.rc_last_delay_val = 0;
 	} else if (bbr->rc_use_google == 0)
 		bbr_check_bbr_for_state(bbr, cts, __LINE__, 0);
 	/* Are we app limited? */
 	if ((app_limited == BBR_JR_APP_LIMITED) ||
 	    (app_limited == BBR_JR_RWND_LIMITED)) {
 		/**
 		 * We are application limited.
 		 */
 		bbr->r_ctl.r_app_limited_until = (ctf_flight_size(tp, (bbr->r_ctl.rc_sacked +
 								       bbr->r_ctl.rc_lost_bytes)) + bbr->r_ctl.rc_delivered);
 	}
 	if (tot_len == 0)
 		counter_u64_add(bbr_out_size[TCP_MSS_ACCT_JUSTRET], 1);
 	/* Don't update the time if we did not send */
 	bbr->r_ctl.rc_last_delay_val = 0;
 	bbr->rc_output_starts_timer = 1;
 	bbr_start_hpts_timer(bbr, tp, cts, 9, slot, tot_len);
 	bbr_log_type_just_return(bbr, cts, tot_len, hpts_calling, app_limited, p_maxseg, len);
 	if (SEQ_LT(tp->snd_nxt, tp->snd_max)) {
 		/* Make sure snd_nxt is dragged up */
 		tp->snd_nxt = tp->snd_max;
 	}
 	return (error);
 
 send:
 	if (doing_tlp == 0) {
 		/*
 		 * Data not a TLP, and its not the rxt firing. If it is the
 		 * rxt firing, we want to leave the tlp_in_progress flag on
 		 * so we don't send another TLP. It has to be a rack timer
 		 * or normal send (response to acked data) to clear the tlp
 		 * in progress flag.
 		 */
 		bbr->rc_tlp_in_progress = 0;
 		bbr->rc_tlp_rtx_out = 0;
 	} else {
 		/*
 		 * Its a TLP.
 		 */
 		bbr->rc_tlp_in_progress = 1;
 	}
 	bbr_timer_cancel(bbr, __LINE__, cts);
 	if (rsm == NULL) {
 		if (sbused(sb) > 0) {
 			/*
 			 * This is sub-optimal. We only send a stand alone
 			 * FIN on its own segment.
 			 */
 			if (flags & TH_FIN) {
 				flags &= ~TH_FIN;
 				if ((len == 0) && ((tp->t_flags & TF_ACKNOW) == 0)) {
 					/* Lets not send this */
 					slot = 0;
 					goto just_return;
 				}
 			}
 		}
 	} else {
 		/*
 		 * We do *not* send a FIN on a retransmit if it has data.
 		 * The if clause here where len > 0 should never come true.
 		 */
 		if ((len > 0) &&
 		    (((rsm->r_flags & BBR_HAS_FIN) == 0) &&
 		    (flags & TH_FIN))) {
 			flags &= ~TH_FIN;
 			len--;
 		}
 	}
 	SOCKBUF_LOCK_ASSERT(sb);
 	if (len > 0) {
 		if ((tp->snd_una == tp->snd_max) &&
 		    (bbr_calc_time(cts, bbr->r_ctl.rc_went_idle_time) >= bbr_rtt_probe_time)) {
 			/*
 			 * This qualifies as a RTT_PROBE session since we
 			 * drop the data outstanding to nothing and waited
 			 * more than bbr_rtt_probe_time.
 			 */
 			bbr_log_rtt_shrinks(bbr, cts, 0, 0, __LINE__, BBR_RTTS_WASIDLE, 0);
 			bbr_set_reduced_rtt(bbr, cts, __LINE__);
 		}
 		if (len >= maxseg)
 			tp->t_flags2 |= TF2_PLPMTU_MAXSEGSNT;
 		else
 			tp->t_flags2 &= ~TF2_PLPMTU_MAXSEGSNT;
 	}
 	/*
 	 * Before ESTABLISHED, force sending of initial options unless TCP
 	 * set not to do any options. NOTE: we assume that the IP/TCP header
 	 * plus TCP options always fit in a single mbuf, leaving room for a
 	 * maximum link header, i.e. max_linkhdr + sizeof (struct tcpiphdr)
 	 * + optlen <= MCLBYTES
 	 */
 	optlen = 0;
 #ifdef INET6
 	if (isipv6)
 		hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
 	else
 #endif
 		hdrlen = sizeof(struct tcpiphdr);
 
 	/*
 	 * Compute options for segment. We only have to care about SYN and
 	 * established connection segments.  Options for SYN-ACK segments
 	 * are handled in TCP syncache.
 	 */
 	to.to_flags = 0;
 	local_options = 0;
 	if ((tp->t_flags & TF_NOOPT) == 0) {
 		/* Maximum segment size. */
 		if (flags & TH_SYN) {
 			to.to_mss = tcp_mssopt(&inp->inp_inc);
 			if (tp->t_port)
 				to.to_mss -= V_tcp_udp_tunneling_overhead;
 			to.to_flags |= TOF_MSS;
 			/*
 			 * On SYN or SYN|ACK transmits on TFO connections,
 			 * only include the TFO option if it is not a
 			 * retransmit, as the presence of the TFO option may
 			 * have caused the original SYN or SYN|ACK to have
 			 * been dropped by a middlebox.
 			 */
 			if (IS_FASTOPEN(tp->t_flags) &&
 			    (tp->t_rxtshift == 0)) {
 				if (tp->t_state == TCPS_SYN_RECEIVED) {
 					to.to_tfo_len = TCP_FASTOPEN_COOKIE_LEN;
 					to.to_tfo_cookie =
 					    (u_int8_t *)&tp->t_tfo_cookie.server;
 					to.to_flags |= TOF_FASTOPEN;
 					wanted_cookie = 1;
 				} else if (tp->t_state == TCPS_SYN_SENT) {
 					to.to_tfo_len =
 					    tp->t_tfo_client_cookie_len;
 					to.to_tfo_cookie =
 					    tp->t_tfo_cookie.client;
 					to.to_flags |= TOF_FASTOPEN;
 					wanted_cookie = 1;
 				}
 			}
 		}
 		/* Window scaling. */
 		if ((flags & TH_SYN) && (tp->t_flags & TF_REQ_SCALE)) {
 			to.to_wscale = tp->request_r_scale;
 			to.to_flags |= TOF_SCALE;
 		}
 		/* Timestamps. */
 		if ((tp->t_flags & TF_RCVD_TSTMP) ||
 		    ((flags & TH_SYN) && (tp->t_flags & TF_REQ_TSTMP))) {
 			to.to_tsval = 	tcp_tv_to_mssectick(&bbr->rc_tv) + tp->ts_offset;
 			to.to_tsecr = tp->ts_recent;
 			to.to_flags |= TOF_TS;
 			local_options += TCPOLEN_TIMESTAMP + 2;
 		}
 		/* Set receive buffer autosizing timestamp. */
 		if (tp->rfbuf_ts == 0 &&
 		    (so->so_rcv.sb_flags & SB_AUTOSIZE))
 			tp->rfbuf_ts = 	tcp_tv_to_mssectick(&bbr->rc_tv);
 		/* Selective ACK's. */
 		if (flags & TH_SYN)
 			to.to_flags |= TOF_SACKPERM;
 		else if (TCPS_HAVEESTABLISHED(tp->t_state) &&
 		    tp->rcv_numsacks > 0) {
 			to.to_flags |= TOF_SACK;
 			to.to_nsacks = tp->rcv_numsacks;
 			to.to_sacks = (u_char *)tp->sackblks;
 		}
 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
 		/* TCP-MD5 (RFC2385). */
 		if (tp->t_flags & TF_SIGNATURE)
 			to.to_flags |= TOF_SIGNATURE;
 #endif				/* TCP_SIGNATURE */
 
 		/* Processing the options. */
 		hdrlen += (optlen = tcp_addoptions(&to, opt));
 		/*
 		 * If we wanted a TFO option to be added, but it was unable
 		 * to fit, ensure no data is sent.
 		 */
 		if (IS_FASTOPEN(tp->t_flags) && wanted_cookie &&
 		    !(to.to_flags & TOF_FASTOPEN))
 			len = 0;
 	}
 	if (tp->t_port) {
 		if (V_tcp_udp_tunneling_port == 0) {
 			/* The port was removed?? */
 			SOCKBUF_UNLOCK(&so->so_snd);
 			return (EHOSTUNREACH);
 		}
 		hdrlen += sizeof(struct udphdr);
 	}
 #ifdef INET6
 	if (isipv6)
 		ipoptlen = ip6_optlen(tp->t_inpcb);
 	else
 #endif
 	if (tp->t_inpcb->inp_options)
 		ipoptlen = tp->t_inpcb->inp_options->m_len -
 		    offsetof(struct ipoption, ipopt_list);
 	else
 		ipoptlen = 0;
 	ipoptlen = 0;
 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
 	ipoptlen += ipsec_optlen;
 #endif
 	if (bbr->rc_last_options != local_options) {
 		/*
 		 * Cache the options length this generally does not change
 		 * on a connection. We use this to calculate TSO.
 		 */
 		bbr->rc_last_options = local_options;
 	}
 	maxseg = tp->t_maxseg - (ipoptlen + optlen);
 	p_maxseg = min(maxseg, pace_max_segs);
 	/*
 	 * Adjust data length if insertion of options will bump the packet
 	 * length beyond the t_maxseg length. Clear the FIN bit because we
 	 * cut off the tail of the segment.
 	 */
 	if (len > maxseg) {
 		if (len != 0 && (flags & TH_FIN)) {
 			flags &= ~TH_FIN;
 		}
 		if (tso) {
 			uint32_t moff;
 			int32_t max_len;
 
 			/* extract TSO information */
 			if_hw_tsomax = tp->t_tsomax;
 			if_hw_tsomaxsegcount = tp->t_tsomaxsegcount;
 			if_hw_tsomaxsegsize = tp->t_tsomaxsegsize;
 			KASSERT(ipoptlen == 0,
 			    ("%s: TSO can't do IP options", __func__));
 
 			/*
 			 * Check if we should limit by maximum payload
 			 * length:
 			 */
 			if (if_hw_tsomax != 0) {
 				/* compute maximum TSO length */
 				max_len = (if_hw_tsomax - hdrlen -
 				    max_linkhdr);
 				if (max_len <= 0) {
 					len = 0;
 				} else if (len > max_len) {
 					len = max_len;
 				}
 			}
 			/*
 			 * Prevent the last segment from being fractional
 			 * unless the send sockbuf can be emptied:
 			 */
 			if ((sb_offset + len) < sbavail(sb)) {
 				moff = len % (uint32_t)maxseg;
 				if (moff != 0) {
 					len -= moff;
 				}
 			}
 			/*
 			 * In case there are too many small fragments don't
 			 * use TSO:
 			 */
 			if (len <= maxseg) {
 				len = maxseg;
 				tso = 0;
 			}
 		} else {
 			/* Not doing TSO */
 			if (optlen + ipoptlen >= tp->t_maxseg) {
 				/*
 				 * Since we don't have enough space to put
 				 * the IP header chain and the TCP header in
 				 * one packet as required by RFC 7112, don't
 				 * send it. Also ensure that at least one
 				 * byte of the payload can be put into the
 				 * TCP segment.
 				 */
 				SOCKBUF_UNLOCK(&so->so_snd);
 				error = EMSGSIZE;
 				sack_rxmit = 0;
 				goto out;
 			}
 			len = maxseg;
 		}
 	} else {
 		/* Not doing TSO */
 		if_hw_tsomaxsegcount = 0;
 		tso = 0;
 	}
 	KASSERT(len + hdrlen + ipoptlen <= IP_MAXPACKET,
 	    ("%s: len > IP_MAXPACKET", __func__));
 #ifdef DIAGNOSTIC
 #ifdef INET6
 	if (max_linkhdr + hdrlen > MCLBYTES)
 #else
 	if (max_linkhdr + hdrlen > MHLEN)
 #endif
 		panic("tcphdr too big");
 #endif
 	/*
 	 * This KASSERT is here to catch edge cases at a well defined place.
 	 * Before, those had triggered (random) panic conditions further
 	 * down.
 	 */
 #ifdef BBR_INVARIANTS
 	if (sack_rxmit) {
 		if (SEQ_LT(rsm->r_start, tp->snd_una)) {
 			panic("RSM:%p TP:%p bbr:%p start:%u is < snd_una:%u",
 			    rsm, tp, bbr, rsm->r_start, tp->snd_una);
 		}
 	}
 #endif
 	KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__));
 	if ((len == 0) &&
 	    (flags & TH_FIN) &&
 	    (sbused(sb))) {
 		/*
 		 * We have outstanding data, don't send a FIN by itself!
 		 */
 		slot = 0;
 		goto just_return;
 	}
 	/*
 	 * Grab a header mbuf, attaching a copy of data to be transmitted,
 	 * and initialize the header from the template for sends on this
 	 * connection.
 	 */
 	if (len) {
 		uint32_t moff;
 
 		/*
 		 * We place a limit on sending with hptsi.
 		 */
 		if ((rsm == NULL) && len > pace_max_segs)
 			len = pace_max_segs;
 		if (len <= maxseg)
 			tso = 0;
 #ifdef INET6
 		if (MHLEN < hdrlen + max_linkhdr)
 			m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
 		else
 #endif
 			m = m_gethdr(M_NOWAIT, MT_DATA);
 
 		if (m == NULL) {
 			BBR_STAT_INC(bbr_failed_mbuf_aloc);
 			bbr_log_enobuf_jmp(bbr, len, cts, __LINE__, len, 0, 0);
 			SOCKBUF_UNLOCK(sb);
 			error = ENOBUFS;
 			sack_rxmit = 0;
 			goto out;
 		}
 		m->m_data += max_linkhdr;
 		m->m_len = hdrlen;
 		/*
 		 * Start the m_copy functions from the closest mbuf to the
 		 * sb_offset in the socket buffer chain.
 		 */
 		if ((sb_offset > sbavail(sb)) || ((len + sb_offset) > sbavail(sb))) {
 #ifdef BBR_INVARIANTS
 			if ((len + sb_offset) > (sbavail(sb) + ((flags & (TH_FIN | TH_SYN)) ? 1 : 0)))
 				panic("tp:%p bbr:%p len:%u sb_offset:%u sbavail:%u rsm:%p %u:%u:%u",
 				    tp, bbr, len, sb_offset, sbavail(sb), rsm,
 				    doing_retran_from,
 				    picked_up_retran,
 				    doing_tlp);
 
 #endif
 			/*
 			 * In this messed up situation we have two choices,
 			 * a) pretend the send worked, and just start timers
 			 * and what not (not good since that may lead us
 			 * back here a lot). <or> b) Send the lowest segment
 			 * in the map. <or> c) Drop the connection. Lets do
 			 * <b> which if it continues to happen will lead to
 			 * <c> via timeouts.
 			 */
 			BBR_STAT_INC(bbr_offset_recovery);
 			rsm = TAILQ_FIRST(&bbr->r_ctl.rc_map);
 			sb_offset = 0;
 			if (rsm == NULL) {
 				sack_rxmit = 0;
 				len = sbavail(sb);
 			} else {
 				sack_rxmit = 1;
 				if (rsm->r_start != tp->snd_una) {
 					/*
 					 * Things are really messed up, <c>
 					 * is the only thing to do.
 					 */
 					BBR_STAT_INC(bbr_offset_drop);
 					SOCKBUF_UNLOCK(sb);
 					(void)m_free(m);
 					return (-EFAULT); /* tcp_drop() */
 				}
 				len = rsm->r_end - rsm->r_start;
 			}
 			if (len > sbavail(sb))
 				len = sbavail(sb);
 			if (len > maxseg)
 				len = maxseg;
 		}
 		mb = sbsndptr_noadv(sb, sb_offset, &moff);
 		if (len <= MHLEN - hdrlen - max_linkhdr && !hw_tls) {
 			m_copydata(mb, moff, (int)len,
 			    mtod(m, caddr_t)+hdrlen);
 			if (rsm == NULL)
 				sbsndptr_adv(sb, mb, len);
 			m->m_len += len;
 		} else {
 			struct sockbuf *msb;
 
 			if (rsm)
 				msb = NULL;
 			else
 				msb = sb;
 #ifdef BBR_INVARIANTS
 			if ((len + moff) > (sbavail(sb) + ((flags & (TH_FIN | TH_SYN)) ? 1 : 0))) {
 				if (rsm) {
 					panic("tp:%p bbr:%p len:%u moff:%u sbavail:%u rsm:%p snd_una:%u rsm_start:%u flg:%x %u:%u:%u sr:%d ",
 					    tp, bbr, len, moff,
 					    sbavail(sb), rsm,
 					    tp->snd_una, rsm->r_flags, rsm->r_start,
 					    doing_retran_from,
 					    picked_up_retran,
 					    doing_tlp, sack_rxmit);
 				} else {
 					panic("tp:%p bbr:%p len:%u moff:%u sbavail:%u sb_offset:%u snd_una:%u",
 					    tp, bbr, len, moff, sbavail(sb), sb_offset, tp->snd_una);
 				}
 			}
 #endif
 			m->m_next = tcp_m_copym(
 				mb, moff, &len,
 				if_hw_tsomaxsegcount,
 				if_hw_tsomaxsegsize, msb,
 				((rsm == NULL) ? hw_tls : 0)
 #ifdef NETFLIX_COPY_ARGS
 				, &filled_all
 #endif
 				);
 			if (len <= maxseg) {
 				/*
 				 * Must have run out of mbufs for the copy;
 				 * it was shortened and no longer needs TSO.
 				 * Let's not put on sendalot since we are low
 				 * on mbufs.
 				 */
 				tso = 0;
 			}
 			if (m->m_next == NULL) {
 				SOCKBUF_UNLOCK(sb);
 				(void)m_free(m);
 				error = ENOBUFS;
 				sack_rxmit = 0;
 				goto out;
 			}
 		}
 #ifdef BBR_INVARIANTS
 		if (tso && len < maxseg) {
 			panic("tp:%p tso on, but len:%d < maxseg:%d",
 			    tp, len, maxseg);
 		}
 		if (tso && if_hw_tsomaxsegcount) {
 			int32_t seg_cnt = 0;
 			struct mbuf *foo;
 
 			foo = m;
 			while (foo) {
 				seg_cnt++;
 				foo = foo->m_next;
 			}
 			if (seg_cnt > if_hw_tsomaxsegcount) {
 				panic("seg_cnt:%d > max:%d", seg_cnt, if_hw_tsomaxsegcount);
 			}
 		}
 #endif
 		/*
 		 * If we're sending everything we've got, set PUSH. (This
 		 * will keep happy those implementations which only give
 		 * data to the user when a buffer fills or a PUSH comes in.)
 		 */
 		if (sb_offset + len == sbused(sb) &&
 		    sbused(sb) &&
 		    !(flags & TH_SYN)) {
 			flags |= TH_PUSH;
 		}
 		SOCKBUF_UNLOCK(sb);
 	} else {
 		SOCKBUF_UNLOCK(sb);
 		if (tp->t_flags & TF_ACKNOW)
 			KMOD_TCPSTAT_INC(tcps_sndacks);
 		else if (flags & (TH_SYN | TH_FIN | TH_RST))
 			KMOD_TCPSTAT_INC(tcps_sndctrl);
 		else
 			KMOD_TCPSTAT_INC(tcps_sndwinup);
 
 		m = m_gethdr(M_NOWAIT, MT_DATA);
 		if (m == NULL) {
 			BBR_STAT_INC(bbr_failed_mbuf_aloc);
 			bbr_log_enobuf_jmp(bbr, len, cts, __LINE__, len, 0, 0);
 			error = ENOBUFS;
 			/* Fudge the send time since we could not send */
 			sack_rxmit = 0;
 			goto out;
 		}
 #ifdef INET6
 		if (isipv6 && (MHLEN < hdrlen + max_linkhdr) &&
 		    MHLEN >= hdrlen) {
 			M_ALIGN(m, hdrlen);
 		} else
 #endif
 			m->m_data += max_linkhdr;
 		m->m_len = hdrlen;
 	}
 	SOCKBUF_UNLOCK_ASSERT(sb);
 	m->m_pkthdr.rcvif = (struct ifnet *)0;
 #ifdef MAC
 	mac_inpcb_create_mbuf(inp, m);
 #endif
 #ifdef INET6
 	if (isipv6) {
 		ip6 = mtod(m, struct ip6_hdr *);
 		if (tp->t_port) {
 			udp = (struct udphdr *)((caddr_t)ip6 + sizeof(struct ip6_hdr));
 			udp->uh_sport = htons(V_tcp_udp_tunneling_port);
 			udp->uh_dport = tp->t_port;
 			ulen = hdrlen + len - sizeof(struct ip6_hdr);
 			udp->uh_ulen = htons(ulen);
 			th = (struct tcphdr *)(udp + 1);
 		} else {
 			th = (struct tcphdr *)(ip6 + 1);
 		}
 		tcpip_fillheaders(inp, tp->t_port, ip6, th);
 	} else
 #endif				/* INET6 */
 	{
 		ip = mtod(m, struct ip *);
 #ifdef TCPDEBUG
 		ipov = (struct ipovly *)ip;
 #endif
 		if (tp->t_port) {
 			udp = (struct udphdr *)((caddr_t)ip + sizeof(struct ip));
 			udp->uh_sport = htons(V_tcp_udp_tunneling_port);
 			udp->uh_dport = tp->t_port;
 			ulen = hdrlen + len - sizeof(struct ip);
 			udp->uh_ulen = htons(ulen);
 			th = (struct tcphdr *)(udp + 1);
 		} else {
 			th = (struct tcphdr *)(ip + 1);
 		}
 		tcpip_fillheaders(inp, tp->t_port, ip, th);
 	}
 	/*
 	 * If we are doing retransmissions, then snd_nxt will not reflect
 	 * the first unsent octet.  For ACK only packets, we do not want the
 	 * sequence number of the retransmitted packet, we want the sequence
 	 * number of the next unsent octet.  So, if there is no data (and no
 	 * SYN or FIN), use snd_max instead of snd_nxt when filling in
 	 * ti_seq.  But if we are in persist state, snd_max might reflect
 	 * one byte beyond the right edge of the window, so use snd_nxt in
 	 * that case, since we know we aren't doing a retransmission.
 	 * (retransmit and persist are mutually exclusive...)
 	 */
 	if (sack_rxmit == 0) {
 		if (len && ((flags & (TH_FIN | TH_SYN | TH_RST)) == 0)) {
 			/* New data (including new persists) */
 			th->th_seq = htonl(tp->snd_max);
 			bbr_seq = tp->snd_max;
 		} else if (flags & TH_SYN) {
 			/* Syn's always send from iss */
 			th->th_seq = htonl(tp->iss);
 			bbr_seq = tp->iss;
 		} else if (flags & TH_FIN) {
 			if (flags & TH_FIN && tp->t_flags & TF_SENTFIN) {
 				/*
 				 * If we sent the fin already its 1 minus
 				 * snd_max
 				 */
 				th->th_seq = (htonl(tp->snd_max - 1));
 				bbr_seq = (tp->snd_max - 1);
 			} else {
 				/* First time FIN use snd_max */
 				th->th_seq = htonl(tp->snd_max);
 				bbr_seq = tp->snd_max;
 			}
 		} else {
 			/*
 			 * len == 0 and not persist we use snd_max, sending
 			 * an ack unless we have sent the fin then its 1
 			 * minus.
 			 */
 			/*
 			 * XXXRRS Question if we are in persists and we have
 			 * nothing outstanding to send and we have not sent
 			 * a FIN, we will send an ACK. In such a case it
 			 * might be better to send (tp->snd_una - 1) which
 			 * would force the peer to ack.
 			 */
 			if (tp->t_flags & TF_SENTFIN) {
 				th->th_seq = htonl(tp->snd_max - 1);
 				bbr_seq = (tp->snd_max - 1);
 			} else {
 				th->th_seq = htonl(tp->snd_max);
 				bbr_seq = tp->snd_max;
 			}
 		}
 	} else {
 		/* All retransmits use the rsm to guide the send */
 		th->th_seq = htonl(rsm->r_start);
 		bbr_seq = rsm->r_start;
 	}
 	th->th_ack = htonl(tp->rcv_nxt);
 	if (optlen) {
 		bcopy(opt, th + 1, optlen);
 		th->th_off = (sizeof(struct tcphdr) + optlen) >> 2;
 	}
 	tcp_set_flags(th, flags);
 	/*
 	 * Calculate receive window.  Don't shrink window, but avoid silly
 	 * window syndrome.
 	 */
 	if ((flags & TH_RST) || ((recwin < (so->so_rcv.sb_hiwat / 4) &&
 				  recwin < maxseg)))
 		recwin = 0;
 	if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt) &&
 	    recwin < (tp->rcv_adv - tp->rcv_nxt))
 		recwin = (tp->rcv_adv - tp->rcv_nxt);
 	if (recwin > TCP_MAXWIN << tp->rcv_scale)
 		recwin = TCP_MAXWIN << tp->rcv_scale;
 
 	/*
 	 * According to RFC1323 the window field in a SYN (i.e., a <SYN> or
 	 * <SYN,ACK>) segment itself is never scaled.  The <SYN,ACK> case is
 	 * handled in syncache.
 	 */
 	if (flags & TH_SYN)
 		th->th_win = htons((u_short)
 		    (min(sbspace(&so->so_rcv), TCP_MAXWIN)));
 	else {
 		/* Avoid shrinking window with window scaling. */
 		recwin = roundup2(recwin, 1 << tp->rcv_scale);
 		th->th_win = htons((u_short)(recwin >> tp->rcv_scale));
 	}
 	/*
 	 * Adjust the RXWIN0SENT flag - indicate that we have advertised a 0
 	 * window.  This may cause the remote transmitter to stall.  This
 	 * flag tells soreceive() to disable delayed acknowledgements when
 	 * draining the buffer.  This can occur if the receiver is
 	 * attempting to read more data than can be buffered prior to
 	 * transmitting on the connection.
 	 */
 	if (th->th_win == 0) {
 		tp->t_sndzerowin++;
 		tp->t_flags |= TF_RXWIN0SENT;
 	} else
 		tp->t_flags &= ~TF_RXWIN0SENT;
 	/*
 	 * We don't support urgent data, but drag along
 	 * the pointer in case of a stack switch.
 	 */
 	tp->snd_up = tp->snd_una;
 
 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
 	if (to.to_flags & TOF_SIGNATURE) {
 		/*
 		 * Calculate MD5 signature and put it into the place
 		 * determined before. NOTE: since TCP options buffer doesn't
 		 * point into mbuf's data, calculate offset and use it.
 		 */
 		if (!TCPMD5_ENABLED() || TCPMD5_OUTPUT(m, th,
 		    (u_char *)(th + 1) + (to.to_signature - opt)) != 0) {
 			/*
 			 * Do not send segment if the calculation of MD5
 			 * digest has failed.
 			 */
 			goto out;
 		}
 	}
 #endif
 
 	/*
 	 * Put TCP length in extended header, and then checksum extended
 	 * header and data.
 	 */
 	m->m_pkthdr.len = hdrlen + len;	/* in6_cksum() need this */
 #ifdef INET6
 	if (isipv6) {
 		/*
 		 * ip6_plen does not need to be filled in now; it will be
 		 * filled in by ip6_output.
 		 */
 		if (tp->t_port) {
 			m->m_pkthdr.csum_flags = CSUM_UDP_IPV6;
 			m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
 			udp->uh_sum = in6_cksum_pseudo(ip6, ulen, IPPROTO_UDP, 0);
 			th->th_sum = htons(0);
 			UDPSTAT_INC(udps_opackets);
 		} else {
 			csum_flags = m->m_pkthdr.csum_flags = CSUM_TCP_IPV6;
 			m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
 			th->th_sum = in6_cksum_pseudo(ip6, sizeof(struct tcphdr) +
 			    optlen + len, IPPROTO_TCP, 0);
 		}
 	}
 #endif
 #if defined(INET6) && defined(INET)
 	else
 #endif
 #ifdef INET
 	{
 		if (tp->t_port) {
 			m->m_pkthdr.csum_flags = CSUM_UDP;
 			m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
 			udp->uh_sum = in_pseudo(ip->ip_src.s_addr,
 			    ip->ip_dst.s_addr, htons(ulen + IPPROTO_UDP));
 			th->th_sum = htons(0);
 			UDPSTAT_INC(udps_opackets);
 		} else {
 			csum_flags = m->m_pkthdr.csum_flags = CSUM_TCP;
 			m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
 			th->th_sum = in_pseudo(ip->ip_src.s_addr,
 			    ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) +
 			    IPPROTO_TCP + len + optlen));
 		}
 		/* IP version must be set here for ipv4/ipv6 checking later */
 		KASSERT(ip->ip_v == IPVERSION,
 		    ("%s: IP version incorrect: %d", __func__, ip->ip_v));
 	}
 #endif
 
 	/*
 	 * Enable TSO and specify the size of the segments. The TCP pseudo
 	 * header checksum is always provided. XXX: Fixme: This is currently
 	 * not the case for IPv6.
 	 */
 	if (tso) {
 		KASSERT(len > maxseg,
 		    ("%s: len:%d <= tso_segsz:%d", __func__, len, maxseg));
 		m->m_pkthdr.csum_flags |= CSUM_TSO;
 		csum_flags |= CSUM_TSO;
 		m->m_pkthdr.tso_segsz = maxseg;
 	}
 	KASSERT(len + hdrlen == m_length(m, NULL),
 	    ("%s: mbuf chain different than expected: %d + %u != %u",
 	    __func__, len, hdrlen, m_length(m, NULL)));
 
 #ifdef TCP_HHOOK
 	/* Run HHOOK_TC_ESTABLISHED_OUT helper hooks. */
 	hhook_run_tcp_est_out(tp, th, &to, len, tso);
 #endif
 #ifdef TCPDEBUG
 	/*
 	 * Trace.
 	 */
 	if (so->so_options & SO_DEBUG) {
 		u_short save = 0;
 
 #ifdef INET6
 		if (!isipv6)
 #endif
 		{
 			save = ipov->ih_len;
 			ipov->ih_len = htons(m->m_pkthdr.len	/* - hdrlen +
 			      * (th->th_off << 2) */ );
 		}
 		tcp_trace(TA_OUTPUT, tp->t_state, tp, mtod(m, void *), th, 0);
 #ifdef INET6
 		if (!isipv6)
 #endif
 			ipov->ih_len = save;
 	}
 #endif				/* TCPDEBUG */
 
 	/* Log to the black box */
 	if (tp->t_logstate != TCP_LOG_STATE_OFF) {
 		union tcp_log_stackspecific log;
 
 		bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
 		/* Record info on type of transmission */
 		log.u_bbr.flex1 = bbr->r_ctl.rc_hptsi_agg_delay;
 		log.u_bbr.flex2 = (bbr->r_recovery_bw << 3);
 		log.u_bbr.flex3 = maxseg;
 		log.u_bbr.flex4 = delay_calc;
 		/* Encode filled_all into the upper flex5 bit */
 		log.u_bbr.flex5 = bbr->rc_past_init_win;
 		log.u_bbr.flex5 <<= 1;
 		log.u_bbr.flex5 |= bbr->rc_no_pacing;
 		log.u_bbr.flex5 <<= 29;
 		if (filled_all)
 			log.u_bbr.flex5 |= 0x80000000;
 		log.u_bbr.flex5 |= tp->t_maxseg;
 		log.u_bbr.flex6 = bbr->r_ctl.rc_pace_max_segs;
 		log.u_bbr.flex7 = (bbr->rc_bbr_state << 8) | bbr_state_val(bbr);
 		/* lets poke in the low and the high here for debugging */
 		log.u_bbr.pkts_out = bbr->rc_tp->t_maxseg;
 		if (rsm || sack_rxmit) {
 			if (doing_tlp)
 				log.u_bbr.flex8 = 2;
 			else
 				log.u_bbr.flex8 = 1;
 		} else {
 			log.u_bbr.flex8 = 0;
 		}
 		lgb = tcp_log_event_(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_OUT, ERRNO_UNK,
 		    len, &log, false, NULL, NULL, 0, tv);
 	} else {
 		lgb = NULL;
 	}
 	/*
 	 * Fill in IP length and desired time to live and send to IP level.
 	 * There should be a better way to handle ttl and tos; we could keep
 	 * them in the template, but need a way to checksum without them.
 	 */
 	/*
 	 * m->m_pkthdr.len should have been set before cksum calcuration,
 	 * because in6_cksum() need it.
 	 */
 #ifdef INET6
 	if (isipv6) {
 		/*
 		 * we separately set hoplimit for every segment, since the
 		 * user might want to change the value via setsockopt. Also,
 		 * desired default hop limit might be changed via Neighbor
 		 * Discovery.
 		 */
 		ip6->ip6_hlim = in6_selecthlim(inp, NULL);
 
 		/*
 		 * Set the packet size here for the benefit of DTrace
 		 * probes. ip6_output() will set it properly; it's supposed
 		 * to include the option header lengths as well.
 		 */
 		ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(*ip6));
 
 		if (V_path_mtu_discovery && maxseg > V_tcp_minmss)
 			tp->t_flags2 |= TF2_PLPMTU_PMTUD;
 		else
 			tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
 
 		if (tp->t_state == TCPS_SYN_SENT)
 			TCP_PROBE5(connect__request, NULL, tp, ip6, tp, th);
 
 		TCP_PROBE5(send, NULL, tp, ip6, tp, th);
 		/* TODO: IPv6 IP6TOS_ECT bit on */
 		error = ip6_output(m, inp->in6p_outputopts,
 		    &inp->inp_route6,
 		    ((rsm || sack_rxmit) ? IP_NO_SND_TAG_RL : 0),
 		    NULL, NULL, inp);
 
 		if (error == EMSGSIZE && inp->inp_route6.ro_nh != NULL)
 			mtu = inp->inp_route6.ro_nh->nh_mtu;
 	}
 #endif				/* INET6 */
 #if defined(INET) && defined(INET6)
 	else
 #endif
 #ifdef INET
 	{
 		ip->ip_len = htons(m->m_pkthdr.len);
 #ifdef INET6
 		if (isipv6)
 			ip->ip_ttl = in6_selecthlim(inp, NULL);
 #endif				/* INET6 */
 		/*
 		 * If we do path MTU discovery, then we set DF on every
 		 * packet. This might not be the best thing to do according
 		 * to RFC3390 Section 2. However the tcp hostcache migitates
 		 * the problem so it affects only the first tcp connection
 		 * with a host.
 		 *
 		 * NB: Don't set DF on small MTU/MSS to have a safe
 		 * fallback.
 		 */
 		if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) {
 			tp->t_flags2 |= TF2_PLPMTU_PMTUD;
 			if (tp->t_port == 0 || len < V_tcp_minmss) {
 				ip->ip_off |= htons(IP_DF);
 			}
 		} else {
 			tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
 		}
 
 		if (tp->t_state == TCPS_SYN_SENT)
 			TCP_PROBE5(connect__request, NULL, tp, ip, tp, th);
 
 		TCP_PROBE5(send, NULL, tp, ip, tp, th);
 
 		error = ip_output(m, inp->inp_options, &inp->inp_route,
 		    ((rsm || sack_rxmit) ? IP_NO_SND_TAG_RL : 0), 0,
 		    inp);
 		if (error == EMSGSIZE && inp->inp_route.ro_nh != NULL)
 			mtu = inp->inp_route.ro_nh->nh_mtu;
 	}
 #endif				/* INET */
 out:
 
 	if (lgb) {
 		lgb->tlb_errno = error;
 		lgb = NULL;
 	}
 	/*
 	 * In transmit state, time the transmission and arrange for the
 	 * retransmit.  In persist state, just set snd_max.
 	 */
 	if (error == 0) {
 		tcp_account_for_send(tp, len, (rsm != NULL), doing_tlp, hw_tls);
 		if (TCPS_HAVEESTABLISHED(tp->t_state) &&
 		    (tp->t_flags & TF_SACK_PERMIT) &&
 		    tp->rcv_numsacks > 0)
 			tcp_clean_dsack_blocks(tp);
 		/* We sent an ack clear the bbr_segs_rcvd count */
 		bbr->output_error_seen = 0;
 		bbr->oerror_cnt = 0;
 		bbr->bbr_segs_rcvd = 0;
 		if (len == 0)
 			counter_u64_add(bbr_out_size[TCP_MSS_ACCT_SNDACK], 1);
 		/* Do accounting for new sends */
 		if ((len > 0) && (rsm == NULL)) {
 			int idx;
 			if (tp->snd_una == tp->snd_max) {
 				/*
 				 * Special case to match google, when
 				 * nothing is in flight the delivered
 				 * time does get updated to the current
 				 * time (see tcp_rate_bsd.c).
 				 */
 				bbr->r_ctl.rc_del_time = cts;
 			}
 			if (len >= maxseg) {
 				idx = (len / maxseg) + 3;
 				if (idx >= TCP_MSS_ACCT_ATIMER)
 					counter_u64_add(bbr_out_size[(TCP_MSS_ACCT_ATIMER - 1)], 1);
 				else
 					counter_u64_add(bbr_out_size[idx], 1);
 			} else {
 				/* smaller than a MSS */
 				idx = len / (bbr_hptsi_bytes_min - bbr->rc_last_options);
 				if (idx >= TCP_MSS_SMALL_MAX_SIZE_DIV)
 					idx = (TCP_MSS_SMALL_MAX_SIZE_DIV - 1);
 				counter_u64_add(bbr_out_size[(idx + TCP_MSS_SMALL_SIZE_OFF)], 1);
 			}
 		}
 	}
 	abandon = 0;
 	/*
 	 * We must do the send accounting before we log the output,
 	 * otherwise the state of the rsm could change and we account to the
 	 * wrong bucket.
 	 */
 	if (len > 0) {
 		bbr_do_send_accounting(tp, bbr, rsm, len, error);
 		if (error == 0) {
 			if (tp->snd_una == tp->snd_max)
 				bbr->r_ctl.rc_tlp_rxt_last_time = cts;
 		}
 	}
 	bbr_log_output(bbr, tp, &to, len, bbr_seq, (uint8_t) flags, error,
 	    cts, mb, &abandon, rsm, 0, sb);
 	if (abandon) {
 		/*
 		 * If bbr_log_output destroys the TCB or sees a TH_RST being
 		 * sent we should hit this condition.
 		 */
 		return (0);
 	}
 	if (bbr->rc_in_persist == 0) {
 		/*
 		 * Advance snd_nxt over sequence space of this segment.
 		 */
 		if (error)
 			/* We don't log or do anything with errors */
 			goto skip_upd;
 
 		if (tp->snd_una == tp->snd_max &&
 		    (len || (flags & (TH_SYN | TH_FIN)))) {
 			/*
 			 * Update the time we just added data since none was
 			 * outstanding.
 			 */
 			bbr_log_progress_event(bbr, tp, ticks, PROGRESS_START, __LINE__);
 			bbr->rc_tp->t_acktime  = ticks;
 		}
 		if (flags & (TH_SYN | TH_FIN) && (rsm == NULL)) {
 			if (flags & TH_SYN) {
 				/*
 				 * Smack the snd_max to iss + 1
 				 * if its a FO we will add len below.
 				 */
 				tp->snd_max = tp->iss + 1;
 			}
 			if ((flags & TH_FIN) && ((tp->t_flags & TF_SENTFIN) == 0)) {
 				tp->snd_max++;
 				tp->t_flags |= TF_SENTFIN;
 			}
 		}
 		if (sack_rxmit == 0)
 			tp->snd_max += len;
 skip_upd:
 		if ((error == 0) && len)
 			tot_len += len;
 	} else {
 		/* Persists case */
 		int32_t xlen = len;
 
 		if (error)
 			goto nomore;
 
 		if (flags & TH_SYN)
 			++xlen;
 		if ((flags & TH_FIN) && ((tp->t_flags & TF_SENTFIN) == 0)) {
 			++xlen;
 			tp->t_flags |= TF_SENTFIN;
 		}
 		if (xlen && (tp->snd_una == tp->snd_max)) {
 			/*
 			 * Update the time we just added data since none was
 			 * outstanding.
 			 */
 			bbr_log_progress_event(bbr, tp, ticks, PROGRESS_START, __LINE__);
 			bbr->rc_tp->t_acktime = ticks;
 		}
 		if (sack_rxmit == 0)
 			tp->snd_max += xlen;
 		tot_len += (len + optlen + ipoptlen);
 	}
 nomore:
 	if (error) {
 		/*
 		 * Failures do not advance the seq counter above. For the
 		 * case of ENOBUFS we will fall out and become ack-clocked.
 		 * capping the cwnd at the current flight.
 		 * Everything else will just have to retransmit with the timer
 		 * (no pacer).
 		 */
 		SOCKBUF_UNLOCK_ASSERT(sb);
 		BBR_STAT_INC(bbr_saw_oerr);
 		/* Clear all delay/early tracks */
 		bbr->r_ctl.rc_hptsi_agg_delay = 0;
 		bbr->r_ctl.rc_agg_early = 0;
 		bbr->r_agg_early_set = 0;
 		bbr->output_error_seen = 1;
 		if (bbr->oerror_cnt < 0xf)
 			bbr->oerror_cnt++;
 		if (bbr_max_net_error_cnt && (bbr->oerror_cnt >= bbr_max_net_error_cnt)) {
 			/* drop the session */
 			return (-ENETDOWN);
 		}
 		switch (error) {
 		case ENOBUFS:
 			/*
 			 * Make this guy have to get ack's to send
 			 * more but lets make sure we don't
 			 * slam him below a T-O (1MSS).
 			 */
 			if (bbr->rc_bbr_state != BBR_STATE_PROBE_RTT) {
 				tp->snd_cwnd = ctf_flight_size(tp, (bbr->r_ctl.rc_sacked +
 								    bbr->r_ctl.rc_lost_bytes)) - maxseg;
 				if (tp->snd_cwnd < maxseg)
 					tp->snd_cwnd = maxseg;
 			}
 			slot = (bbr_error_base_paceout + 1) << bbr->oerror_cnt;
 			BBR_STAT_INC(bbr_saw_enobuf);
 			if (bbr->bbr_hdrw_pacing)
 				counter_u64_add(bbr_hdwr_pacing_enobuf, 1);
 			else
 				counter_u64_add(bbr_nohdwr_pacing_enobuf, 1);
 			/*
 			 * Here even in the enobuf's case we want to do our
 			 * state update. The reason being we may have been
 			 * called by the input function. If so we have had
 			 * things change.
 			 */
 			error = 0;
 			goto enobufs;
 		case EMSGSIZE:
 			/*
 			 * For some reason the interface we used initially
 			 * to send segments changed to another or lowered
 			 * its MTU. If TSO was active we either got an
 			 * interface without TSO capabilits or TSO was
 			 * turned off. If we obtained mtu from ip_output()
 			 * then update it and try again.
 			 */
 			/* Turn on tracing (or try to) */
 			{
 				int old_maxseg;
 
 				old_maxseg = tp->t_maxseg;
 				BBR_STAT_INC(bbr_saw_emsgsiz);
 				bbr_log_msgsize_fail(bbr, tp, len, maxseg, mtu, csum_flags, tso, cts);
 				if (mtu != 0)
 					tcp_mss_update(tp, -1, mtu, NULL, NULL);
 				if (old_maxseg <= tp->t_maxseg) {
 					/* Huh it did not shrink? */
 					tp->t_maxseg = old_maxseg - 40;
 					bbr_log_msgsize_fail(bbr, tp, len, maxseg, mtu, 0, tso, cts);
 				}
 				/*
 				 * Nuke all other things that can interfere
 				 * with slot
 				 */
 				if ((tot_len + len) && (len >= tp->t_maxseg)) {
 					slot = bbr_get_pacing_delay(bbr,
 					    bbr->r_ctl.rc_bbr_hptsi_gain,
 					    (tot_len + len), cts, 0);
 					if (slot < bbr_error_base_paceout)
 						slot = (bbr_error_base_paceout + 2) << bbr->oerror_cnt;
 				} else
 					slot = (bbr_error_base_paceout + 2) << bbr->oerror_cnt;
 				bbr->rc_output_starts_timer = 1;
 				bbr_start_hpts_timer(bbr, tp, cts, 10, slot,
 				    tot_len);
 				return (error);
 			}
 		case EPERM:
 			tp->t_softerror = error;
 			/* Fall through */
 		case EHOSTDOWN:
 		case EHOSTUNREACH:
 		case ENETDOWN:
 		case ENETUNREACH:
 			if (TCPS_HAVERCVDSYN(tp->t_state)) {
 				tp->t_softerror = error;
 			}
 			/* FALLTHROUGH */
 		default:
 			slot = (bbr_error_base_paceout + 3) << bbr->oerror_cnt;
 			bbr->rc_output_starts_timer = 1;
 			bbr_start_hpts_timer(bbr, tp, cts, 11, slot, 0);
 			return (error);
 		}
 #ifdef STATS
 	} else if (((tp->t_flags & TF_GPUTINPROG) == 0) &&
 		    len &&
 		    (rsm == NULL) &&
 	    (bbr->rc_in_persist == 0)) {
 		tp->gput_seq = bbr_seq;
 		tp->gput_ack = bbr_seq +
 		    min(sbavail(&so->so_snd) - sb_offset, sendwin);
 		tp->gput_ts = cts;
 		tp->t_flags |= TF_GPUTINPROG;
 #endif
 	}
 	KMOD_TCPSTAT_INC(tcps_sndtotal);
 	if ((bbr->bbr_hdw_pace_ena) &&
 	    (bbr->bbr_attempt_hdwr_pace == 0) &&
 	    (bbr->rc_past_init_win) &&
 	    (bbr->rc_bbr_state != BBR_STATE_STARTUP) &&
 	    (get_filter_value(&bbr->r_ctl.rc_delrate)) &&
 	    (inp->inp_route.ro_nh &&
 	     inp->inp_route.ro_nh->nh_ifp)) {
 		/*
 		 * We are past the initial window and
 		 * have at least one measurement so we
 		 * could use hardware pacing if its available.
 		 * We have an interface and we have not attempted
 		 * to setup hardware pacing, lets try to now.
 		 */
 		uint64_t rate_wanted;
 		int err = 0;
 
 		rate_wanted = bbr_get_hardware_rate(bbr);
 		bbr->bbr_attempt_hdwr_pace = 1;
 		bbr->r_ctl.crte = tcp_set_pacing_rate(bbr->rc_tp,
 						      inp->inp_route.ro_nh->nh_ifp,
 						      rate_wanted,
 						      (RS_PACING_GEQ|RS_PACING_SUB_OK),
 						      &err, NULL);
 		if (bbr->r_ctl.crte) {
 			bbr_type_log_hdwr_pacing(bbr,
 						 bbr->r_ctl.crte->ptbl->rs_ifp,
 						 rate_wanted,
 						 bbr->r_ctl.crte->rate,
 						 __LINE__, cts, err);
 			BBR_STAT_INC(bbr_hdwr_rl_add_ok);
 			counter_u64_add(bbr_flows_nohdwr_pacing, -1);
 			counter_u64_add(bbr_flows_whdwr_pacing, 1);
 			bbr->bbr_hdrw_pacing = 1;
 			/* Now what is our gain status? */
 			if (bbr->r_ctl.crte->rate < rate_wanted) {
 				/* We have a problem */
 				bbr_setup_less_of_rate(bbr, cts,
 						       bbr->r_ctl.crte->rate, rate_wanted);
 			} else {
 				/* We are good */
 				bbr->gain_is_limited = 0;
 				bbr->skip_gain = 0;
 			}
 			tcp_bbr_tso_size_check(bbr, cts);
 		} else {
 			bbr_type_log_hdwr_pacing(bbr,
 						 inp->inp_route.ro_nh->nh_ifp,
 						 rate_wanted,
 						 0,
 						 __LINE__, cts, err);
 			BBR_STAT_INC(bbr_hdwr_rl_add_fail);
 		}
 	}
 	if (bbr->bbr_hdrw_pacing) {
 		/*
 		 * Worry about cases where the route
 		 * changes or something happened that we
 		 * lost our hardware pacing possibly during
 		 * the last ip_output call.
 		 */
 		if (inp->inp_snd_tag == NULL) {
 			/* A change during ip output disabled hw pacing? */
 			bbr->bbr_hdrw_pacing = 0;
 		} else if ((inp->inp_route.ro_nh == NULL) ||
 		    (inp->inp_route.ro_nh->nh_ifp != inp->inp_snd_tag->ifp)) {
 			/*
 			 * We had an interface or route change,
 			 * detach from the current hdwr pacing
 			 * and setup to re-attempt next go
 			 * round.
 			 */
 			bbr->bbr_hdrw_pacing = 0;
 			bbr->bbr_attempt_hdwr_pace = 0;
 			tcp_rel_pacing_rate(bbr->r_ctl.crte, bbr->rc_tp);
 			tcp_bbr_tso_size_check(bbr, cts);
 		}
 	}
 	/*
 	 * Data sent (as far as we can tell). If this advertises a larger
 	 * window than any other segment, then remember the size of the
 	 * advertised window. Any pending ACK has now been sent.
 	 */
 	if (SEQ_GT(tp->rcv_nxt + recwin, tp->rcv_adv))
 		tp->rcv_adv = tp->rcv_nxt + recwin;
 
 	tp->last_ack_sent = tp->rcv_nxt;
 	if ((error == 0) &&
 	    (bbr->r_ctl.rc_pace_max_segs > tp->t_maxseg) &&
 	    (doing_tlp == 0) &&
 	    (tso == 0) &&
 	    (len > 0) &&
 	    ((flags & TH_RST) == 0) &&
 	    ((flags & TH_SYN) == 0) &&
 	    (IN_RECOVERY(tp->t_flags) == 0) &&
 	    (bbr->rc_in_persist == 0) &&
 	    (tot_len < bbr->r_ctl.rc_pace_max_segs)) {
 		/*
 		 * For non-tso we need to goto again until we have sent out
 		 * enough data to match what we are hptsi out every hptsi
 		 * interval.
 		 */
 		if (SEQ_LT(tp->snd_nxt, tp->snd_max)) {
 			/* Make sure snd_nxt is drug up */
 			tp->snd_nxt = tp->snd_max;
 		}
 		if (rsm != NULL) {
 			rsm = NULL;
 			goto skip_again;
 		}
 		rsm = NULL;
 		sack_rxmit = 0;
 		tp->t_flags &= ~(TF_ACKNOW | TF_DELACK);
 		goto again;
 	}
 skip_again:
 	if ((error == 0) && (flags & TH_FIN))
 		tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_FIN);
 	if ((error == 0) && (flags & TH_RST))
 		tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST);
 	if (((flags & (TH_RST | TH_SYN | TH_FIN)) == 0) && tot_len) {
 		/*
 		 * Calculate/Re-Calculate the hptsi slot in usecs based on
 		 * what we have sent so far
 		 */
 		slot = bbr_get_pacing_delay(bbr, bbr->r_ctl.rc_bbr_hptsi_gain, tot_len, cts, 0);
 		if (bbr->rc_no_pacing)
 			slot = 0;
 	}
 	tp->t_flags &= ~(TF_ACKNOW | TF_DELACK);
 enobufs:
 	if (bbr->rc_use_google == 0)
 		bbr_check_bbr_for_state(bbr, cts, __LINE__, 0);
 	bbr_cwnd_limiting(tp, bbr, ctf_flight_size(tp, (bbr->r_ctl.rc_sacked +
 							bbr->r_ctl.rc_lost_bytes)));
 	bbr->rc_output_starts_timer = 1;
 	if (bbr->bbr_use_rack_cheat &&
 	    (more_to_rxt ||
 	     ((bbr->r_ctl.rc_resend = bbr_check_recovery_mode(tp, bbr, cts)) != NULL))) {
 		/* Rack cheats and shotguns out all rxt's 1ms apart */
 		if (slot > 1000)
 			slot = 1000;
 	}
 	if (bbr->bbr_hdrw_pacing && (bbr->hw_pacing_set == 0)) {
 		/*
 		 * We don't change the tso size until some number of sends
 		 * to give the hardware commands time to get down
 		 * to the interface.
 		 */
 		bbr->r_ctl.bbr_hdwr_cnt_noset_snt++;
 		if (bbr->r_ctl.bbr_hdwr_cnt_noset_snt >= bbr_hdwr_pacing_delay_cnt) {
 			bbr->hw_pacing_set = 1;
 			tcp_bbr_tso_size_check(bbr, cts);
 		}
 	}
 	bbr_start_hpts_timer(bbr, tp, cts, 12, slot, tot_len);
 	if (SEQ_LT(tp->snd_nxt, tp->snd_max)) {
 		/* Make sure snd_nxt is drug up */
 		tp->snd_nxt = tp->snd_max;
 	}
 	return (error);
 
 }
 
 /*
  * See bbr_output_wtime() for return values.
  */
 static int
 bbr_output(struct tcpcb *tp)
 {
 	int32_t ret;
 	struct timeval tv;
 
 	NET_EPOCH_ASSERT();
 
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 	(void)tcp_get_usecs(&tv);
 	ret = bbr_output_wtime(tp, &tv);
 	return (ret);
 }
 
 /*
  * MTU-change handler (installed as tfb_tcp_mtu_chg).
  *
  * Clears the SACK filter, then walks the send map and flags every
  * un-acked entry that is longer than the current payload limit
  * (t_maxseg minus rc_last_options) as BBR_SACK_PASSED.  Entries that
  * bbr_is_lost() reports as lost, and that are not yet marked, are
  * accounted as lost.  The first flagged entry, if any, is stored in
  * rc_resend.
  */
 static void
 bbr_mtu_chg(struct tcpcb *tp)
 {
 	struct tcp_bbr *bbr = (struct tcp_bbr *)tp->t_fb_ptr;
 	struct bbr_sendmap *rsm;
 	struct bbr_sendmap *first_oversize = NULL;
 	uint32_t seg_limit = tp->t_maxseg - bbr->rc_last_options;
 
 	sack_filter_clear(&bbr->r_ctl.bbr_sf, tp->snd_una);
 	TAILQ_FOREACH(rsm, &bbr->r_ctl.rc_map, r_next) {
 		/* Entries already acked (possibly by SACK) are left alone. */
 		if (rsm->r_flags & BBR_ACKED)
 			continue;
 		/* Entries that still fit in one segment need no action. */
 		if ((rsm->r_end - rsm->r_start) <= seg_limit)
 			continue;
 
 		/* Too large for the new size: flag it as sack-passed. */
 		rsm->r_flags |= BBR_SACK_PASSED;
 		if ((rsm->r_flags & BBR_MARKED_LOST) == 0 &&
 		    bbr_is_lost(bbr, rsm, bbr->r_ctl.rc_rcvtime)) {
 			bbr->r_ctl.rc_lost_bytes += rsm->r_end - rsm->r_start;
 			bbr->r_ctl.rc_lost += rsm->r_end - rsm->r_start;
 			rsm->r_flags |= BBR_MARKED_LOST;
 		}
 		if (first_oversize == NULL)
 			first_oversize = rsm;
 	}
 	if (first_oversize != NULL)
 		bbr->r_ctl.rc_resend = first_oversize;
 }
 
 /*
  * Check the PRUS_* flags of a request (installed as tfb_pru_options).
  * A request with PRUS_OOB set is refused with EOPNOTSUPP; anything else
  * returns 0.  The tcpcb argument is not used.
  */
 static int
 bbr_pru_options(struct tcpcb *tp, int flags)
 {
 	return ((flags & PRUS_OOB) != 0 ? EOPNOTSUPP : 0);
 }
 
 /*
  * Function block listing this stack's entry points.  tcp_addbbr()
  * registers it under the names in bbr_stack_names, and bbr_set_sockopt()
  * compares tp->t_fb against its address to confirm that a connection is
  * still running this stack.
  */
 struct tcp_function_block __tcp_bbr = {
 	.tfb_tcp_block_name = __XSTRING(STACKNAME),
 	.tfb_tcp_output = bbr_output,
 	.tfb_do_queued_segments = ctf_do_queued_segments,
 	.tfb_do_segment_nounlock = bbr_do_segment_nounlock,
 	.tfb_tcp_do_segment = bbr_do_segment,
 	.tfb_tcp_ctloutput = bbr_ctloutput,
 	.tfb_tcp_fb_init = bbr_init,
 	.tfb_tcp_fb_fini = bbr_fini,
 	.tfb_tcp_timer_stop_all = bbr_stopall,
 	.tfb_tcp_timer_activate = bbr_timer_activate,
 	.tfb_tcp_timer_active = bbr_timer_active,
 	.tfb_tcp_timer_stop = bbr_timer_stop,
 	.tfb_tcp_rexmit_tmr = bbr_remxt_tmr,
 	.tfb_tcp_handoff_ok = bbr_handoff_ok,
 	.tfb_tcp_mtu_chg = bbr_mtu_chg,
 	.tfb_pru_options = bbr_pru_options,
 	.tfb_flags = TCP_FUNC_OUTPUT_CANDROP,
 };
 
 /*
  * bbr_ctloutput() must drop the inpcb lock before performing copyin on
  * socket option arguments.  When it re-acquires the lock after the copy, it
  * has to revalidate that the connection is still valid for the socket
  * option.
  *
  * Handles SOPT_SET requests.  Every option handled here takes a single
  * int32_t value.  The inpcb write lock is released before the copyin and
  * re-taken afterwards; each return path below either unlocks the inpcb
  * itself or hands the request to tcp_default_ctloutput().
  *
  * Returns 0 on success or an errno value: ECONNRESET when the inpcb was
  * dropped while unlocked, ENOPROTOOPT when the connection no longer uses
  * this stack, EINVAL/EBUSY for rejected values, or whatever sooptcopyin()
  * or tcp_default_ctloutput() return.
  */
 static int
 bbr_set_sockopt(struct inpcb *inp, struct sockopt *sopt)
 {
 	struct epoch_tracker et;
 	struct tcpcb *tp;
 	struct tcp_bbr *bbr;
 	int32_t error = 0, optval;
 
 	/* IP-level options are not ours; let the default handler deal with them. */
 	switch (sopt->sopt_level) {
 	case IPPROTO_IPV6:
 	case IPPROTO_IP:
 		return (tcp_default_ctloutput(inp, sopt));
 	}
 
 	/*
 	 * First pass: filter on the option name only.  Options listed here
 	 * continue to the copyin below; everything else is passed to the
 	 * default handler while the lock is still held.
 	 */
 	switch (sopt->sopt_name) {
 	case TCP_RACK_PACE_MAX_SEG:
 	case TCP_RACK_MIN_TO:
 	case TCP_RACK_REORD_THRESH:
 	case TCP_RACK_REORD_FADE:
 	case TCP_RACK_TLP_THRESH:
 	case TCP_RACK_PKT_DELAY:
 	case TCP_BBR_ALGORITHM:
 	case TCP_BBR_TSLIMITS:
 	case TCP_BBR_IWINTSO:
 	case TCP_BBR_RECFORCE:
 	case TCP_BBR_STARTUP_PG:
 	case TCP_BBR_DRAIN_PG:
 	case TCP_BBR_RWND_IS_APP:
 	case TCP_BBR_PROBE_RTT_INT:
 	case TCP_BBR_PROBE_RTT_GAIN:
 	case TCP_BBR_PROBE_RTT_LEN:
 	case TCP_BBR_STARTUP_LOSS_EXIT:
 	case TCP_BBR_USEDEL_RATE:
 	case TCP_BBR_MIN_RTO:
 	case TCP_BBR_MAX_RTO:
 	case TCP_BBR_PACE_PER_SEC:
 	case TCP_DELACK:
 	case TCP_BBR_PACE_DEL_TAR:
 	case TCP_BBR_SEND_IWND_IN_TSO:
 	case TCP_BBR_EXTRA_STATE:
 	case TCP_BBR_UTTER_MAX_TSO:
 	case TCP_BBR_MIN_TOPACEOUT:
 	case TCP_BBR_FLOOR_MIN_TSO:
 	case TCP_BBR_TSTMP_RAISES:
 	case TCP_BBR_POLICER_DETECT:
 	case TCP_BBR_USE_RACK_CHEAT:
 	case TCP_DATA_AFTER_CLOSE:
 	case TCP_BBR_HDWR_PACE:
 	case TCP_BBR_PACE_SEG_MAX:
 	case TCP_BBR_PACE_SEG_MIN:
 	case TCP_BBR_PACE_CROSS:
 	case TCP_BBR_PACE_OH:
 #ifdef NETFLIX_PEAKRATE
 	case TCP_MAXPEAKRATE:
 #endif
 	case TCP_BBR_TMR_PACE_OH:
 	case TCP_BBR_RACK_RTT_USE:
 	case TCP_BBR_RETRAN_WTSO:
 		break;
 	default:
 		return (tcp_default_ctloutput(inp, sopt));
 		break;
 	}
 	/* The copyin is done with the inpcb unlocked. */
 	INP_WUNLOCK(inp);
 	error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval));
 	if (error)
 		return (error);
 	INP_WLOCK(inp);
-	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
+	if (inp->inp_flags & INP_DROPPED) {
 		INP_WUNLOCK(inp);
 		return (ECONNRESET);
 	}
 	tp = intotcpcb(inp);
 	/* The stack may have been switched while the lock was dropped. */
 	if (tp->t_fb != &__tcp_bbr) {
 		INP_WUNLOCK(inp);
 		return (ENOPROTOOPT);
 	}
 	bbr = (struct tcp_bbr *)tp->t_fb_ptr;
 	/*
 	 * Second pass: apply the value.
 	 *
 	 * NOTE(review): TCP_RACK_PACE_MAX_SEG, TCP_BBR_RECFORCE,
 	 * TCP_BBR_RWND_IS_APP and TCP_BBR_RACK_RTT_USE pass the filter above
 	 * but have no case here, so they reach the default arm and are
 	 * handed to tcp_default_ctloutput() after the copyin -- confirm this
 	 * is intended.
 	 */
 	switch (sopt->sopt_name) {
 	case TCP_BBR_PACE_PER_SEC:
 		BBR_OPTS_INC(tcp_bbr_pace_per_sec);
 		bbr->r_ctl.bbr_hptsi_per_second = optval;
 		break;
 	case TCP_BBR_PACE_DEL_TAR:
 		BBR_OPTS_INC(tcp_bbr_pace_del_tar);
 		bbr->r_ctl.bbr_hptsi_segments_delay_tar = optval;
 		break;
 	case TCP_BBR_PACE_SEG_MAX:
 		BBR_OPTS_INC(tcp_bbr_pace_seg_max);
 		bbr->r_ctl.bbr_hptsi_segments_max = optval;
 		break;
 	case TCP_BBR_PACE_SEG_MIN:
 		BBR_OPTS_INC(tcp_bbr_pace_seg_min);
 		bbr->r_ctl.bbr_hptsi_bytes_min = optval;
 		break;
 	case TCP_BBR_PACE_CROSS:
 		BBR_OPTS_INC(tcp_bbr_pace_cross);
 		bbr->r_ctl.bbr_cross_over = optval;
 		break;
 	case TCP_BBR_ALGORITHM:
 		BBR_OPTS_INC(tcp_bbr_algorithm);
 		if (optval && (bbr->rc_use_google == 0)) {
 			/* Turn on the google mode */
 			bbr_google_mode_on(bbr);
 			if ((optval > 3) && (optval < 500)) {
 				/*
 				 * Must be at least greater than .3%
 				 * and must be less than 50.0%.
 				 */
 				bbr->r_ctl.bbr_google_discount = optval;
 			}
 		} else if ((optval == 0) && (bbr->rc_use_google == 1)) {
 			/* Turn off the google mode */
 			bbr_google_mode_off(bbr);
 		}
 		break;
 	case TCP_BBR_TSLIMITS:
 		BBR_OPTS_INC(tcp_bbr_tslimits);
 		if (optval == 1)
 			bbr->rc_use_ts_limit = 1;
 		else if (optval == 0)
 			bbr->rc_use_ts_limit = 0;
 		else
 			error = EINVAL;
 		break;
 
 	case TCP_BBR_IWINTSO:
 		BBR_OPTS_INC(tcp_bbr_iwintso);
 		if ((optval >= 0) && (optval < 128)) {
 			uint32_t twin;
 
 			/*
 			 * rc_init_win is updated even when EBUSY is returned
 			 * below; only the cwnd bump is conditional.
 			 */
 			bbr->rc_init_win = optval;
 			twin = bbr_initial_cwnd(bbr, tp);
 			if ((bbr->rc_past_init_win == 0) && (twin > tp->snd_cwnd))
 				tp->snd_cwnd = twin;
 			else
 				error = EBUSY;
 		} else
 			error = EINVAL;
 		break;
 	case TCP_BBR_STARTUP_PG:
 		BBR_OPTS_INC(tcp_bbr_startup_pg);
 		if ((optval > 0) && (optval < BBR_MAX_GAIN_VALUE)) {
 			bbr->r_ctl.rc_startup_pg = optval;
 			if (bbr->rc_bbr_state == BBR_STATE_STARTUP) {
 				bbr->r_ctl.rc_bbr_hptsi_gain = optval;
 			}
 		} else
 			error = EINVAL;
 		break;
 	case TCP_BBR_DRAIN_PG:
 		BBR_OPTS_INC(tcp_bbr_drain_pg);
 		if ((optval > 0) && (optval < BBR_MAX_GAIN_VALUE))
 			bbr->r_ctl.rc_drain_pg = optval;
 		else
 			error = EINVAL;
 		break;
 	case TCP_BBR_PROBE_RTT_LEN:
 		BBR_OPTS_INC(tcp_bbr_probertt_len);
 		/*
 		 * NOTE(review): optval is a signed int32_t and only the upper
 		 * bound is checked here, so negative values are accepted --
 		 * confirm whether that is intended.
 		 */
 		if (optval <= 1)
 			reset_time_small(&bbr->r_ctl.rc_rttprop, (optval * USECS_IN_SECOND));
 		else
 			error = EINVAL;
 		break;
 	case TCP_BBR_PROBE_RTT_GAIN:
 		BBR_OPTS_INC(tcp_bbr_probertt_gain);
 		/* NOTE(review): upper bound only; negative optval passes. */
 		if (optval <= BBR_UNIT)
 			bbr->r_ctl.bbr_rttprobe_gain_val = optval;
 		else
 			error = EINVAL;
 		break;
 	case TCP_BBR_PROBE_RTT_INT:
 		BBR_OPTS_INC(tcp_bbr_probe_rtt_int);
 		if (optval > 1000)
 			bbr->r_ctl.rc_probertt_int = optval;
 		else
 			error = EINVAL;
 		break;
 	case TCP_BBR_MIN_TOPACEOUT:
 		BBR_OPTS_INC(tcp_bbr_topaceout);
 		if (optval == 0) {
 			bbr->no_pacing_until = 0;
 			bbr->rc_no_pacing = 0;
 		} else if (optval <= 0x00ff) {
 			bbr->no_pacing_until = optval;
 			if ((bbr->r_ctl.rc_pkt_epoch < bbr->no_pacing_until) &&
 			    (bbr->rc_bbr_state == BBR_STATE_STARTUP)){
 				/* Turn on no pacing */
 				bbr->rc_no_pacing = 1;
 			}
 		} else
 			error = EINVAL;
 		break;
 	case TCP_BBR_STARTUP_LOSS_EXIT:
 		BBR_OPTS_INC(tcp_bbr_startup_loss_exit);
 		bbr->rc_loss_exit = optval;
 		break;
 	case TCP_BBR_USEDEL_RATE:
 		/* Accepted by the filter above but always rejected here. */
 		error = EINVAL;
 		break;
 	case TCP_BBR_MIN_RTO:
 		BBR_OPTS_INC(tcp_bbr_min_rto);
 		bbr->r_ctl.rc_min_rto_ms = optval;
 		break;
 	case TCP_BBR_MAX_RTO:
 		BBR_OPTS_INC(tcp_bbr_max_rto);
 		bbr->rc_max_rto_sec = optval;
 		break;
 	case TCP_RACK_MIN_TO:
 		/* Minimum time between rack t-o's in ms */
 		BBR_OPTS_INC(tcp_rack_min_to);
 		bbr->r_ctl.rc_min_to = optval;
 		break;
 	case TCP_RACK_REORD_THRESH:
 		/* RACK reorder threshold (shift amount) */
 		BBR_OPTS_INC(tcp_rack_reord_thresh);
 		if ((optval > 0) && (optval < 31))
 			bbr->r_ctl.rc_reorder_shift = optval;
 		else
 			error = EINVAL;
 		break;
 	case TCP_RACK_REORD_FADE:
 		/* Does reordering fade after ms time */
 		BBR_OPTS_INC(tcp_rack_reord_fade);
 		bbr->r_ctl.rc_reorder_fade = optval;
 		break;
 	case TCP_RACK_TLP_THRESH:
 		/* RACK TLP threshold i.e. srtt+(srtt/N) */
 		BBR_OPTS_INC(tcp_rack_tlp_thresh);
 		if (optval)
 			bbr->rc_tlp_threshold = optval;
 		else
 			error = EINVAL;
 		break;
 	case TCP_BBR_USE_RACK_CHEAT:
 		BBR_OPTS_INC(tcp_use_rackcheat);
 		if (bbr->rc_use_google) {
 			error = EINVAL;
 			break;
 		}
 		BBR_OPTS_INC(tcp_rack_cheat);
 		if (optval)
 			bbr->bbr_use_rack_cheat = 1;
 		else
 			bbr->bbr_use_rack_cheat = 0;
 		break;
 	case TCP_BBR_FLOOR_MIN_TSO:
 		/*
 		 * NOTE(review): this bumps the same tcp_utter_max_tso counter
 		 * as TCP_BBR_UTTER_MAX_TSO below -- possibly a copy/paste
 		 * slip; confirm which counter is meant.
 		 */
 		BBR_OPTS_INC(tcp_utter_max_tso);
 		if ((optval >= 0) && (optval < 40))
 			bbr->r_ctl.bbr_hptsi_segments_floor = optval;
 		else
 			error = EINVAL;
 		break;
 	case TCP_BBR_UTTER_MAX_TSO:
 		BBR_OPTS_INC(tcp_utter_max_tso);
 		if ((optval >= 0) && (optval < 0xffff))
 			bbr->r_ctl.bbr_utter_max = optval;
 		else
 			error = EINVAL;
 		break;
 
 	case TCP_BBR_EXTRA_STATE:
 		BBR_OPTS_INC(tcp_extra_state);
 		if (optval)
 			bbr->rc_use_idle_restart = 1;
 		else
 			bbr->rc_use_idle_restart = 0;
 		break;
 	case TCP_BBR_SEND_IWND_IN_TSO:
 		BBR_OPTS_INC(tcp_iwnd_tso);
 		if (optval) {
 			bbr->bbr_init_win_cheat = 1;
 			if (bbr->rc_past_init_win == 0) {
 				uint32_t cts;
 				cts = tcp_get_usecs(&bbr->rc_tv);
 				tcp_bbr_tso_size_check(bbr, cts);
 			}
 		} else
 			bbr->bbr_init_win_cheat = 0;
 		break;
 	case TCP_BBR_HDWR_PACE:
 		BBR_OPTS_INC(tcp_hdwr_pacing);
 		if (optval){
 			/* Enable and allow a fresh attempt at setting it up. */
 			bbr->bbr_hdw_pace_ena = 1;
 			bbr->bbr_attempt_hdwr_pace = 0;
 		} else {
 			bbr->bbr_hdw_pace_ena = 0;
 #ifdef RATELIMIT
 			if (bbr->r_ctl.crte != NULL) {
 				tcp_rel_pacing_rate(bbr->r_ctl.crte, tp);
 				bbr->r_ctl.crte = NULL;
 			}
 #endif
 		}
 		break;
 
 	case TCP_DELACK:
 		BBR_OPTS_INC(tcp_delack);
 		/* NOTE(review): upper bound only; negative optval passes. */
 		if (optval < 100) {
 			if (optval == 0) /* off */
 				tp->t_delayed_ack = 0;
 			else if (optval == 1) /* on which is 2 */
 				tp->t_delayed_ack = 2;
 			else /* higher than 2 and less than 100 */
 				tp->t_delayed_ack = optval;
 			/* A pending delayed ACK is converted to ACK-now and sent. */
 			if (tp->t_flags & TF_DELACK) {
 				tp->t_flags &= ~TF_DELACK;
 				tp->t_flags |= TF_ACKNOW;
 				NET_EPOCH_ENTER(et);
 				bbr_output(tp);
 				NET_EPOCH_EXIT(et);
 			}
 		} else
 			error = EINVAL;
 		break;
 	case TCP_RACK_PKT_DELAY:
 		/* RACK added ms i.e. rack-rtt + reord + N */
 		BBR_OPTS_INC(tcp_rack_pkt_delay);
 		bbr->r_ctl.rc_pkt_delay = optval;
 		break;
 #ifdef NETFLIX_PEAKRATE
 	case TCP_MAXPEAKRATE:
 		BBR_OPTS_INC(tcp_maxpeak);
 		error = tcp_set_maxpeakrate(tp, optval);
 		if (!error)
 			tp->t_peakrate_thr = tp->t_maxpeakrate;
 		break;
 #endif
 	case TCP_BBR_RETRAN_WTSO:
 		BBR_OPTS_INC(tcp_retran_wtso);
 		if (optval)
 			bbr->rc_resends_use_tso = 1;
 		else
 			bbr->rc_resends_use_tso = 0;
 		break;
 	case TCP_DATA_AFTER_CLOSE:
 		BBR_OPTS_INC(tcp_data_ac);
 		if (optval)
 			bbr->rc_allow_data_af_clo = 1;
 		else
 			bbr->rc_allow_data_af_clo = 0;
 		break;
 	case TCP_BBR_POLICER_DETECT:
 		BBR_OPTS_INC(tcp_policer_det);
 		if (bbr->rc_use_google == 0)
 			error = EINVAL;
 		else if (optval)
 			bbr->r_use_policer = 1;
 		else
 			bbr->r_use_policer = 0;
 		break;
 
 	case TCP_BBR_TSTMP_RAISES:
 		BBR_OPTS_INC(tcp_ts_raises);
 		if (optval)
 			bbr->ts_can_raise = 1;
 		else
 			bbr->ts_can_raise = 0;
 		break;
 	case TCP_BBR_TMR_PACE_OH:
 		BBR_OPTS_INC(tcp_pacing_oh_tmr);
 		if (bbr->rc_use_google) {
 			error = EINVAL;
 		} else {
 			if (optval)
 				bbr->r_ctl.rc_incr_tmrs = 1;
 			else
 				bbr->r_ctl.rc_incr_tmrs = 0;
 		}
 		break;
 	case TCP_BBR_PACE_OH:
 		BBR_OPTS_INC(tcp_pacing_oh);
 		if (bbr->rc_use_google) {
 			error = EINVAL;
 		} else {
 			/* optval is a bitmask of the BBR_INCL_*_OH flags. */
 			if (optval > (BBR_INCL_TCP_OH|
 				      BBR_INCL_IP_OH|
 				      BBR_INCL_ENET_OH)) {
 				error = EINVAL;
 				break;
 			}
 			if (optval & BBR_INCL_TCP_OH)
 				bbr->r_ctl.rc_inc_tcp_oh = 1;
 			else
 				bbr->r_ctl.rc_inc_tcp_oh = 0;
 			if (optval & BBR_INCL_IP_OH)
 				bbr->r_ctl.rc_inc_ip_oh = 1;
 			else
 				bbr->r_ctl.rc_inc_ip_oh = 0;
 			if (optval & BBR_INCL_ENET_OH)
 				bbr->r_ctl.rc_inc_enet_oh = 1;
 			else
 				bbr->r_ctl.rc_inc_enet_oh = 0;
 		}
 		break;
 	default:
 		return (tcp_default_ctloutput(inp, sopt));
 		break;
 	}
 #ifdef NETFLIX_STATS
 	tcp_log_socket_option(tp, sopt->sopt_name, optval, error);
 #endif
 	INP_WUNLOCK(inp);
 	return (error);
 }
 
 /*
  * Handle SOPT_GET requests for the options this stack owns.
  *
  * Every option handled here is a boolean or an int, so the value is
  * gathered into optval under the inpcb lock, the lock is released, and
  * the value is then copied out.  Unknown options are passed to
  * tcp_default_ctloutput() with the lock still held.
  *
  * Returns 0 on success or an errno value: EINVAL when the tcpcb has no
  * BBR state attached or the option cannot be read
  * (TCP_BBR_USEDEL_RATE), otherwise the result of sooptcopyout() or
  * tcp_default_ctloutput().  The paths that return directly from this
  * function release the inpcb write lock first.
  */
 static int
 bbr_get_sockopt(struct inpcb *inp, struct sockopt *sopt)
 {
 	struct tcpcb *tp;
 	struct tcp_bbr *bbr;
 	int32_t optval = 0;
 
 	tp = intotcpcb(inp);
 	bbr = (struct tcp_bbr *)tp->t_fb_ptr;
 	if (bbr == NULL) {
 		INP_WUNLOCK(inp);
 		return (EINVAL);
 	}
 	/*
 	 * Because all our options are either boolean or an int, we can just
 	 * pull everything into optval and then unlock and copy. If we ever
 	 * add an option that is not an int, then this will have quite an
 	 * impact on this routine.
 	 */
 	switch (sopt->sopt_name) {
 	case TCP_BBR_PACE_PER_SEC:
 		optval = bbr->r_ctl.bbr_hptsi_per_second;
 		break;
 	case TCP_BBR_PACE_DEL_TAR:
 		optval = bbr->r_ctl.bbr_hptsi_segments_delay_tar;
 		break;
 	case TCP_BBR_PACE_SEG_MAX:
 		optval = bbr->r_ctl.bbr_hptsi_segments_max;
 		break;
 	case TCP_BBR_MIN_TOPACEOUT:
 		optval = bbr->no_pacing_until;
 		break;
 	case TCP_BBR_PACE_SEG_MIN:
 		optval = bbr->r_ctl.bbr_hptsi_bytes_min;
 		break;
 	case TCP_BBR_PACE_CROSS:
 		optval = bbr->r_ctl.bbr_cross_over;
 		break;
 	case TCP_BBR_ALGORITHM:
 		optval = bbr->rc_use_google;
 		break;
 	case TCP_BBR_TSLIMITS:
 		optval = bbr->rc_use_ts_limit;
 		break;
 	case TCP_BBR_IWINTSO:
 		optval = bbr->rc_init_win;
 		break;
 	case TCP_BBR_STARTUP_PG:
 		optval = bbr->r_ctl.rc_startup_pg;
 		break;
 	case TCP_BBR_DRAIN_PG:
 		optval = bbr->r_ctl.rc_drain_pg;
 		break;
 	case TCP_BBR_PROBE_RTT_INT:
 		optval = bbr->r_ctl.rc_probertt_int;
 		break;
 	case TCP_BBR_PROBE_RTT_LEN:
 		optval = (bbr->r_ctl.rc_rttprop.cur_time_limit / USECS_IN_SECOND);
 		break;
 	case TCP_BBR_PROBE_RTT_GAIN:
 		optval = bbr->r_ctl.bbr_rttprobe_gain_val;
 		break;
 	case TCP_BBR_STARTUP_LOSS_EXIT:
 		optval = bbr->rc_loss_exit;
 		break;
 	case TCP_BBR_USEDEL_RATE:
 		/*
 		 * This option has no readable value.  Fail here instead of
 		 * falling through to the copyout, which would hand an
 		 * unset optval to userland and replace EINVAL with the
 		 * copyout result.
 		 */
 		INP_WUNLOCK(inp);
 		return (EINVAL);
 	case TCP_BBR_MIN_RTO:
 		optval = bbr->r_ctl.rc_min_rto_ms;
 		break;
 	case TCP_BBR_MAX_RTO:
 		optval = bbr->rc_max_rto_sec;
 		break;
 	case TCP_RACK_PACE_MAX_SEG:
 		/* Max segments in a pace */
 		optval = bbr->r_ctl.rc_pace_max_segs;
 		break;
 	case TCP_RACK_MIN_TO:
 		/* Minimum time between rack t-o's in ms */
 		optval = bbr->r_ctl.rc_min_to;
 		break;
 	case TCP_RACK_REORD_THRESH:
 		/* RACK reorder threshold (shift amount) */
 		optval = bbr->r_ctl.rc_reorder_shift;
 		break;
 	case TCP_RACK_REORD_FADE:
 		/* Does reordering fade after ms time */
 		optval = bbr->r_ctl.rc_reorder_fade;
 		break;
 	case TCP_BBR_USE_RACK_CHEAT:
 		/* Do we use the rack cheat for rxt */
 		optval = bbr->bbr_use_rack_cheat;
 		break;
 	case TCP_BBR_FLOOR_MIN_TSO:
 		optval = bbr->r_ctl.bbr_hptsi_segments_floor;
 		break;
 	case TCP_BBR_UTTER_MAX_TSO:
 		optval = bbr->r_ctl.bbr_utter_max;
 		break;
 	case TCP_BBR_SEND_IWND_IN_TSO:
 		/* Do we send TSO size segments initially */
 		optval = bbr->bbr_init_win_cheat;
 		break;
 	case TCP_BBR_EXTRA_STATE:
 		optval = bbr->rc_use_idle_restart;
 		break;
 	case TCP_RACK_TLP_THRESH:
 		/* RACK TLP threshold i.e. srtt+(srtt/N) */
 		optval = bbr->rc_tlp_threshold;
 		break;
 	case TCP_RACK_PKT_DELAY:
 		/* RACK added ms i.e. rack-rtt + reord + N */
 		optval = bbr->r_ctl.rc_pkt_delay;
 		break;
 	case TCP_BBR_RETRAN_WTSO:
 		optval = bbr->rc_resends_use_tso;
 		break;
 	case TCP_DATA_AFTER_CLOSE:
 		optval = bbr->rc_allow_data_af_clo;
 		break;
 	case TCP_DELACK:
 		optval = tp->t_delayed_ack;
 		break;
 	case TCP_BBR_HDWR_PACE:
 		optval = bbr->bbr_hdw_pace_ena;
 		break;
 	case TCP_BBR_POLICER_DETECT:
 		optval = bbr->r_use_policer;
 		break;
 	case TCP_BBR_TSTMP_RAISES:
 		optval = bbr->ts_can_raise;
 		break;
 	case TCP_BBR_TMR_PACE_OH:
 		optval = bbr->r_ctl.rc_incr_tmrs;
 		break;
 	case TCP_BBR_PACE_OH:
 		/* Rebuild the BBR_INCL_*_OH bitmask from the three flags. */
 		optval = 0;
 		if (bbr->r_ctl.rc_inc_tcp_oh)
 			optval |= BBR_INCL_TCP_OH;
 		if (bbr->r_ctl.rc_inc_ip_oh)
 			optval |= BBR_INCL_IP_OH;
 		if (bbr->r_ctl.rc_inc_enet_oh)
 			optval |= BBR_INCL_ENET_OH;
 		break;
 	default:
 		/* Not one of ours; the lock is still held for the callee. */
 		return (tcp_default_ctloutput(inp, sopt));
 	}
 	/* The copyout is done with the inpcb unlocked. */
 	INP_WUNLOCK(inp);
 	return (sooptcopyout(sopt, &optval, sizeof(optval)));
 }
 
 /*
  * Socket-option entry point (installed as tfb_tcp_ctloutput): dispatch
  * on the direction of the request.  Returns 0 on success or an errno
  * value on failure; a direction other than SOPT_SET or SOPT_GET panics.
  */
 static int
 bbr_ctloutput(struct inpcb *inp, struct sockopt *sopt)
 {
 	switch (sopt->sopt_dir) {
 	case SOPT_SET:
 		return (bbr_set_sockopt(inp, sopt));
 	case SOPT_GET:
 		return (bbr_get_sockopt(inp, sopt));
 	default:
 		panic("%s: sopt_dir $%d", __func__, sopt->sopt_dir);
 	}
 }
 
 /*
  * Names under which tcp_addbbr() registers __tcp_bbr: always STACKNAME,
  * plus STACKALIAS when the build defines one.
  */
 static const char *bbr_stack_names[] = {
 	__XSTRING(STACKNAME),
 #ifdef STACKALIAS
 	__XSTRING(STACKALIAS),
 #endif
 };
 
 /*
  * Set by tcp_addbbr() once MOD_LOAD has completed successfully and
  * cleared again by MOD_UNLOAD; guards the teardown so it only runs for
  * state that was actually set up.
  */
 static bool bbr_mod_inited = false;
 
 /*
  * Module event handler for the BBR TCP stack.
  *
  * MOD_LOAD creates the two UMA zones and the sysctl tree, registers
  * __tcp_bbr under bbr_stack_names and, only once all of that has
  * succeeded, registers with LRO and sets bbr_mod_inited.  MOD_QUIESCE and
  * MOD_UNLOAD deregister the stack; MOD_UNLOAD additionally tears down
  * whatever a successful MOD_LOAD set up.  Any other event returns
  * EOPNOTSUPP.
  *
  * Returns 0 on success or an errno value.
  */
 static int
 tcp_addbbr(module_t mod, int32_t type, void *data)
 {
 	int32_t err = 0;
 	int num_stacks;
 
 	switch (type) {
 	case MOD_LOAD:
 		printf("Attempting to load " __XSTRING(MODNAME) "\n");
 		bbr_zone = uma_zcreate(__XSTRING(MODNAME) "_map",
 		    sizeof(struct bbr_sendmap),
 		    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
 		bbr_pcb_zone = uma_zcreate(__XSTRING(MODNAME) "_pcb",
 		    sizeof(struct tcp_bbr),
 		    NULL, NULL, NULL, NULL, UMA_ALIGN_CACHE, 0);
 		sysctl_ctx_init(&bbr_sysctl_ctx);
 		bbr_sysctl_root = SYSCTL_ADD_NODE(&bbr_sysctl_ctx,
 		    SYSCTL_STATIC_CHILDREN(_net_inet_tcp),
 		    OID_AUTO,
 #ifdef STACKALIAS
 		    __XSTRING(STACKALIAS),
 #else
 		    __XSTRING(STACKNAME),
 #endif
 		    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
 		    "");
 		if (bbr_sysctl_root == NULL) {
 			printf("Failed to add sysctl node\n");
 			err = EFAULT;
 			goto free_uma;
 		}
 		bbr_init_sysctls();
 		num_stacks = nitems(bbr_stack_names);
 		err = register_tcp_functions_as_names(&__tcp_bbr, M_WAITOK,
 		    bbr_stack_names, &num_stacks);
 		if (err) {
 			printf("Failed to register %s stack name for "
 			    "%s module\n", bbr_stack_names[num_stacks],
 			    __XSTRING(MODNAME));
 			sysctl_ctx_free(&bbr_sysctl_ctx);
 	/*
 	 * Shared failure exit; also entered by the goto above when the
 	 * sysctl root could not be created.
 	 *
 	 * NOTE(review): on that earlier path bbr_init_sysctls() has not
 	 * run yet -- confirm bbr_counter_destroy() tolerates being called
 	 * in that state.
 	 */
 	free_uma:
 			uma_zdestroy(bbr_zone);
 			uma_zdestroy(bbr_pcb_zone);
 			bbr_counter_destroy();
 			printf("Failed to register " __XSTRING(MODNAME)
 			    " module err:%d\n", err);
 			return (err);
 		}
 		/* Everything succeeded: only now take the LRO registration. */
 		tcp_lro_reg_mbufq();
 		bbr_mod_inited = true;
 		printf(__XSTRING(MODNAME) " is now available\n");
 		break;
 	case MOD_QUIESCE:
 		err = deregister_tcp_functions(&__tcp_bbr, true, false);
 		break;
 	case MOD_UNLOAD:
 		err = deregister_tcp_functions(&__tcp_bbr, false, true);
 		if (err == EBUSY)
 			break;
 		if (bbr_mod_inited) {
 			uma_zdestroy(bbr_zone);
 			uma_zdestroy(bbr_pcb_zone);
 			sysctl_ctx_free(&bbr_sysctl_ctx);
 			bbr_counter_destroy();
 			/*
 			 * tcp_lro_reg_mbufq() is only called after a fully
 			 * successful load, so the matching deregistration
 			 * must be skipped when the load never completed.
 			 */
 			tcp_lro_dereg_mbufq();
 			printf(__XSTRING(MODNAME)
 			    " is now no longer available\n");
 			bbr_mod_inited = false;
 		}
 		/* Any deregistration error other than EBUSY is ignored. */
 		err = 0;
 		break;
 	default:
 		return (EOPNOTSUPP);
 	}
 	return (err);
 }
 
 /* Module description: routes module events to tcp_addbbr(). */
 static moduledata_t tcp_bbr = {
 	.name = __XSTRING(MODNAME),
 	    .evhand = tcp_addbbr,
 	    .priv = 0
 };
 
 /* Declare the module and its version-1 dependency on tcphpts. */
 MODULE_VERSION(MODNAME, 1);
 DECLARE_MODULE(MODNAME, tcp_bbr, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY);
 MODULE_DEPEND(MODNAME, tcphpts, 1, 1, 1);
diff --git a/sys/netinet/tcp_stacks/rack.c b/sys/netinet/tcp_stacks/rack.c
index 740ec73a17df..74503bc8a1b2 100644
--- a/sys/netinet/tcp_stacks/rack.c
+++ b/sys/netinet/tcp_stacks/rack.c
@@ -1,20955 +1,20955 @@
 /*-
  * Copyright (c) 2016-2020 Netflix, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_ipsec.h"
 #include "opt_tcpdebug.h"
 #include "opt_ratelimit.h"
 #include "opt_kern_tls.h"
 #include <sys/param.h>
 #include <sys/arb.h>
 #include <sys/module.h>
 #include <sys/kernel.h>
 #ifdef TCP_HHOOK
 #include <sys/hhook.h>
 #endif
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/mbuf.h>
 #include <sys/proc.h>		/* for proc0 declaration */
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sysctl.h>
 #include <sys/systm.h>
 #ifdef STATS
 #include <sys/qmath.h>
 #include <sys/tree.h>
 #include <sys/stats.h> /* Must come after qmath.h and tree.h */
 #else
 #include <sys/tree.h>
 #endif
 #include <sys/refcount.h>
 #include <sys/queue.h>
 #include <sys/tim_filter.h>
 #include <sys/smp.h>
 #include <sys/kthread.h>
 #include <sys/kern_prefetch.h>
 #include <sys/protosw.h>
 #ifdef TCP_ACCOUNTING
 #include <sys/sched.h>
 #include <machine/cpu.h>
 #endif
 #include <vm/uma.h>
 
 #include <net/route.h>
 #include <net/route/nhop.h>
 #include <net/vnet.h>
 
 #define TCPSTATES		/* for logging */
 
 #include <netinet/in.h>
 #include <netinet/in_kdtrace.h>
 #include <netinet/in_pcb.h>
 #include <netinet/ip.h>
 #include <netinet/ip_icmp.h>	/* required for icmp_var.h */
 #include <netinet/icmp_var.h>	/* for ICMP_BANDLIM */
 #include <netinet/ip_var.h>
 #include <netinet/ip6.h>
 #include <netinet6/in6_pcb.h>
 #include <netinet6/ip6_var.h>
 #include <netinet/tcp.h>
 #define	TCPOUTFLAGS
 #include <netinet/tcp_fsm.h>
 #include <netinet/tcp_log_buf.h>
 #include <netinet/tcp_seq.h>
 #include <netinet/tcp_timer.h>
 #include <netinet/tcp_var.h>
 #include <netinet/tcp_syncache.h>
 #include <netinet/tcp_hpts.h>
 #include <netinet/tcp_ratelimit.h>
 #include <netinet/tcp_accounting.h>
 #include <netinet/tcpip.h>
 #include <netinet/cc/cc.h>
 #include <netinet/cc/cc_newreno.h>
 #include <netinet/tcp_fastopen.h>
 #include <netinet/tcp_lro.h>
 #ifdef NETFLIX_SHARED_CWND
 #include <netinet/tcp_shared_cwnd.h>
 #endif
 #ifdef TCPDEBUG
 #include <netinet/tcp_debug.h>
 #endif				/* TCPDEBUG */
 #ifdef TCP_OFFLOAD
 #include <netinet/tcp_offload.h>
 #endif
 #ifdef INET6
 #include <netinet6/tcp6_var.h>
 #endif
 #include <netinet/tcp_ecn.h>
 
 #include <netipsec/ipsec_support.h>
 
 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
 #include <netipsec/ipsec.h>
 #include <netipsec/ipsec6.h>
 #endif				/* IPSEC */
 
 #include <netinet/udp.h>
 #include <netinet/udp_var.h>
 #include <machine/in_cksum.h>
 
 #ifdef MAC
 #include <security/mac/mac_framework.h>
 #endif
 #include "sack_filter.h"
 #include "tcp_rack.h"
 #include "rack_bbr_common.h"
 
 /*
  * UMA zones owned by this stack.  What each zone allocates is set up
  * outside this part of the file.
  */
 uma_zone_t rack_zone;
 uma_zone_t rack_pcb_zone;
 
 /* Fallback: convert a tick count to sbintime_t by scaling with tick_sbt. */
 #ifndef TICKS2SBT
 #define	TICKS2SBT(__t)	(tick_sbt * ((sbintime_t)(__t)))
 #endif
 
 /*
  * NewReno's beta tunables are only declared here (they are defined
  * elsewhere); the V_ macros are the per-vnet accessors used by
  * rack_set_cc_pacing() when the connection has no private newreno state.
  */
 VNET_DECLARE(uint32_t, newreno_beta);
 VNET_DECLARE(uint32_t, newreno_beta_ecn);
 #define V_newreno_beta VNET(newreno_beta)
 #define V_newreno_beta_ecn VNET(newreno_beta_ecn)
 
 
 /* malloc(9) types for fast send blocks and deferred options. */
 MALLOC_DEFINE(M_TCPFSB, "tcp_fsb", "TCP fast send block");
 MALLOC_DEFINE(M_TCPDO, "tcp_do", "TCP deferred options");
 
 /* Context and root node that rack_init_sysctls() hangs its OIDs from. */
 struct sysctl_ctx_list rack_sysctl_ctx;
 struct sysctl_oid *rack_sysctl_root;
 
 /* NOTE(review): presumably ack-type codes (cumulative vs. SACK) -- confirm at the users. */
 #define CUM_ACKED 1
 #define SACKED 2
 
 /*
  * The RACK module incorporates a number of
  * TCP ideas that have been put out into the IETF
  * over the last few years:
  * - Matt Mathis's Rate Halving which slowly drops
  *    the congestion window so that the ack clock can
  *    be maintained during a recovery.
  * - Yuchung Cheng's RACK TCP (for which it is named) that
  *    will stop us using the number of dup acks and instead
  *    use time as the gauge of when we retransmit.
  * - Reorder Detection of RFC4737 and the Tail-Loss probe draft
  *    of Dukkipati et.al.
  * RACK depends on SACK, so if an endpoint arrives that
  * cannot do SACK the state machine below will shuttle the
  * connection back to using the "default" TCP stack that is
  * in FreeBSD.
  *
  * To implement RACK the original TCP stack was first decomposed
  * into a functional state machine with individual states
  * for each of the possible TCP connection states. The do_segment
  * function's role in life is to mandate the connection supports SACK
  * initially and then assure that the RACK state matches the connection
  * state before calling the state's do_segment function. Each
  * state is simplified due to the fact that the original do_segment
  * has been decomposed and we *know* what state we are in (no
  * switches on the state) and all tests for SACK are gone. This
  * greatly simplifies what each state does.
  *
  * TCP output is also over-written with a new version since it
  * must maintain the new rack scoreboard.
  *
  */
 /*
  * Default values for the stack's tunables.  Several of them are exported
  * read/write by rack_init_sysctls(); times are in microseconds unless a
  * comment says otherwise.
  */
 static int32_t rack_tlp_thresh = 1;
 static int32_t rack_tlp_limit = 2;	/* No more than 2 TLPs w-out new data */
 static int32_t rack_tlp_use_greater = 1;
 static int32_t rack_reorder_thresh = 2;
 static int32_t rack_reorder_fade = 60000000;	/* 0 - never fade, def 60,000,000
 						 * - 60 seconds */
 static uint8_t rack_req_measurements = 1;
 /* Attack threshold detections */
 static uint32_t rack_highest_sack_thresh_seen = 0;
 static uint32_t rack_highest_move_thresh_seen = 0;
 static int32_t rack_enable_hw_pacing = 0; /* Due to CCSP keep it off by default */
 static int32_t rack_hw_pace_extra_slots = 2;	/* 2 extra MSS time betweens */
 static int32_t rack_hw_rate_caps = 1; /* 1; */
 static int32_t rack_hw_rate_min = 0; /* 1500000;*/
 static int32_t rack_hw_rate_to_low = 0; /* 1200000; */
 static int32_t rack_hw_up_only = 1;
 static int32_t rack_stats_gets_ms_rtt = 1;
 static int32_t rack_prr_addbackmax = 2;
 static int32_t rack_do_hystart = 0;
 static int32_t rack_apply_rtt_with_reduced_conf = 0;
 
 static int32_t rack_pkt_delay = 1000;
 static int32_t rack_send_a_lot_in_prr = 1;
 static int32_t rack_min_to = 1000;	/* Minimum timeout in microseconds */
 static int32_t rack_verbose_logging = 0;
 static int32_t rack_ignore_data_after_close = 1;
 static int32_t rack_enable_shared_cwnd = 1;
 static int32_t rack_use_cmp_acks = 1;
 static int32_t rack_use_fsb = 1;
 static int32_t rack_use_rfo = 1;
 static int32_t rack_use_rsm_rfo = 1;
 static int32_t rack_max_abc_post_recovery = 2;
 static int32_t rack_client_low_buf = 0;
 static int32_t rack_dsack_std_based = 0x3;	/* bit field bit 1 sets rc_rack_tmr_std_based and bit 2 sets rc_rack_use_dsack */
 #ifdef TCP_ACCOUNTING
 static int32_t rack_tcp_accounting = 0;
 #endif
 static int32_t rack_limits_scwnd = 1;
 static int32_t rack_enable_mqueue_for_nonpaced = 0;
 static int32_t rack_disable_prr = 0;
 static int32_t use_rack_rr = 1;
 static int32_t rack_non_rxt_use_cr = 0; /* does a non-rxt in recovery use the configured rate (ss/ca)? */
 static int32_t rack_persist_min = 250000;	/* 250 ms in usecs */
 static int32_t rack_persist_max = 2000000;	/* 2 seconds in usecs */
 static int32_t rack_sack_not_required = 1;	/* set to one to allow non-sack to use rack */
 static int32_t rack_default_init_window = 0;	/* Use system default */
 static int32_t rack_limit_time_with_srtt = 0;
 static int32_t rack_autosndbuf_inc = 20;	/* In percentage form */
 static int32_t rack_enobuf_hw_boost_mult = 2;	/* How many times the hw rate we boost slot using time_between */
 static int32_t rack_enobuf_hw_max = 12000;	/* 12 ms in usecs */
 static int32_t rack_enobuf_hw_min = 10000;	/* 10 ms in usecs */
 static int32_t rack_hw_rwnd_factor = 2;		/* How many max_segs the rwnd must be before we hold off sending */
 
 /*
  * Currently regular tcp has a rto_min of 30ms
  * the backoff goes 12 times so that ends up
  * being a total of 122.850 seconds before a
  * connection is killed.
  */
 /* Measurement, timer and TLP defaults; timer values are in microseconds. */
 static uint32_t rack_def_data_window = 20;
 static uint32_t rack_goal_bdp = 2;
 static uint32_t rack_min_srtts = 1;
 static uint32_t rack_min_measure_usec = 0;
 static int32_t rack_tlp_min = 10000;	/* 10ms */
 static int32_t rack_rto_min = 30000;	/* 30,000 usec same as main freebsd */
 static int32_t rack_rto_max = 4000000;	/* 4 seconds in usec's */
 static const int32_t rack_free_cache = 2;
 static int32_t rack_hptsi_segments = 40;
 /* Exported as the "rate_sample_method" sysctl (0=high, 1=low). */
 static int32_t rack_rate_sample_method = USE_RTT_LOW;
 static int32_t rack_pace_every_seg = 0;
 static int32_t rack_delayed_ack_time = 40000;	/* 40ms in usecs */
 static int32_t rack_slot_reduction = 4;
 static int32_t rack_wma_divisor = 8;		/* For WMA calculation */
 static int32_t rack_cwnd_block_ends_measure = 0;
 static int32_t rack_rwnd_block_ends_measure = 0;
 static int32_t rack_def_profile = 0;
 
 static int32_t rack_lower_cwnd_at_tlp = 0;
 static int32_t rack_limited_retran = 0;
 static int32_t rack_always_send_oldest = 0;
 static int32_t rack_tlp_threshold_use = TLP_USE_TWO_ONE;
 
 /* Pacing rate as a percentage of the measured goodput, per phase. */
 static uint16_t rack_per_of_gp_ss = 250;	/* 250 % slow-start */
 static uint16_t rack_per_of_gp_ca = 200;	/* 200 % congestion-avoidance */
 static uint16_t rack_per_of_gp_rec = 200;	/* 200 % of bw */
 
 /* Probertt */
 static uint16_t rack_per_of_gp_probertt = 60;	/* 60% of bw */
 static uint16_t rack_per_of_gp_lowthresh = 40;	/* 40% is bottom */
 static uint16_t rack_per_of_gp_probertt_reduce = 10; /* 10% reduction */
 static uint16_t rack_atexit_prtt_hbp = 130;	/* Clamp to 130% on exit prtt if highly buffered path */
 static uint16_t rack_atexit_prtt = 130;	/* Clamp to 130% on exit prtt if non highly buffered path */
 
 static uint32_t rack_max_drain_wait = 2;	/* How many gp srtt's before we give up draining */
 static uint32_t rack_must_drain = 1;		/* How many GP srtt's we *must* wait */
 static uint32_t rack_probertt_use_min_rtt_entry = 1;	/* Use the min to calculate the goal else gp_srtt */
 static uint32_t rack_probertt_use_min_rtt_exit = 0;
 static uint32_t rack_probe_rtt_sets_cwnd = 0;
 static uint32_t rack_probe_rtt_safety_val = 2000000;	/* No more than 2 sec in probe-rtt */
 static uint32_t rack_time_between_probertt = 9600000;	/* 9.6 sec in usecs */
 static uint32_t rack_probertt_gpsrtt_cnt_mul = 0;	/* How many srtt periods does probe-rtt last top fraction */
 static uint32_t rack_probertt_gpsrtt_cnt_div = 0;	/* How many srtt periods does probe-rtt last bottom fraction */
 static uint32_t rack_min_probertt_hold = 40000;		/* Equal to delayed ack time */
 static uint32_t rack_probertt_filter_life = 10000000;
 static uint32_t rack_probertt_lower_within = 10;
 static uint32_t rack_min_rtt_movement = 250000;	/* Must move at least 250ms (in microseconds)  to count as a lowering */
 static int32_t rack_pace_one_seg = 0;		/* Shall we pace for less than 1.4Meg 1MSS at a time */
 static int32_t rack_probertt_clear_is = 1;
 static int32_t rack_max_drain_hbp = 1;		/* Extra drain times gpsrtt for highly buffered paths */
 static int32_t rack_hbp_thresh = 3;		/* what is the divisor max_rtt/min_rtt to decided a hbp */
 
 /* Part of pacing */
 static int32_t rack_max_per_above = 30;		/* When we go to increment stop if above 100+this% */
 
 /* Timely information */
 /* Combine these two gives the range of 'no change' to bw */
 /* ie the up/down provide the upper and lower bound */
 static int32_t rack_gp_per_bw_mul_up = 2;	/* 2% */
 static int32_t rack_gp_per_bw_mul_down = 4;	/* 4% */
 static int32_t rack_gp_rtt_maxmul = 3;		/* 3 x maxmin */
 static int32_t rack_gp_rtt_minmul = 1;		/* minrtt + (minrtt/mindiv) is lower rtt */
 static int32_t rack_gp_rtt_mindiv = 4;		/* minrtt + (minrtt * minmul/mindiv) is lower rtt */
 static int32_t rack_gp_decrease_per = 20;	/* 20% decrease in multiplier */
 static int32_t rack_gp_increase_per = 2;	/* 2% increase in multiplier */
 static int32_t rack_per_lower_bound = 50;	/* Don't allow to drop below this multiplier */
 static int32_t rack_per_upper_bound_ss = 0;	/* Don't allow SS to grow above this */
 static int32_t rack_per_upper_bound_ca = 0;	/* Don't allow CA to grow above this */
 static int32_t rack_do_dyn_mul = 0;		/* Are the rack gp multipliers dynamic */
 static int32_t rack_gp_no_rec_chg = 1;		/* Prohibit recovery from reducing its multiplier */
 static int32_t rack_timely_dec_clear = 6;	/* Do we clear decrement count at a value (6)? */
 static int32_t rack_timely_max_push_rise = 3;	/* NOTE(review): comment used to say "One round of pushing" but the default is 3 -- confirm */
 static int32_t rack_timely_max_push_drop = 3;	/* Three rounds of pushing */
 static int32_t rack_timely_min_segs = 4;	/* 4 segment minimum */
 static int32_t rack_use_max_for_nobackoff = 0;
 static int32_t rack_timely_int_timely_only = 0;	/* do interim timely's only use the timely algo (no b/w changes)? */
 static int32_t rack_timely_no_stopping = 0;
 static int32_t rack_down_raise_thresh = 100;
 static int32_t rack_req_segs = 1;
 static uint64_t rack_bw_rate_cap = 0;
 /*
  * Trace point controls read by rack_trace_point(): which trace point is
  * armed (0xffffffff is compared against as a special value there), the
  * t_logstate to switch a connection to, and how many connections may
  * still be switched.
  */
 static uint32_t rack_trace_point_config = 0;
 static uint32_t rack_trace_point_bb_mode = 4;
 static int32_t rack_trace_point_count = 0;
 
 
 /* Weird delayed ack mode */
 static int32_t rack_use_imac_dack = 0;
 /*
  * Rack specific counters.
  *
  * Writing 1 to the handler sysctl_rack_clear() zeroes most of these.
  * NOTE(review): rack_hot_alloc, rack_input_idle_reduces, rack_out_size[]
  * and rack_opts_arry[] are not reset there -- confirm that is intended.
  */
 counter_u64_t rack_saw_enobuf;
 counter_u64_t rack_saw_enobuf_hw;
 counter_u64_t rack_saw_enetunreach;
 counter_u64_t rack_persists_sends;
 counter_u64_t rack_persists_acks;
 counter_u64_t rack_persists_loss;
 counter_u64_t rack_persists_lost_ends;
 #ifdef INVARIANTS
 counter_u64_t rack_adjust_map_bw;
 #endif
 /* Tail loss probe counters */
 counter_u64_t rack_tlp_tot;
 counter_u64_t rack_tlp_newdata;
 counter_u64_t rack_tlp_retran;
 counter_u64_t rack_tlp_retran_bytes;
 counter_u64_t rack_to_tot;
 counter_u64_t rack_hot_alloc;
 counter_u64_t rack_to_alloc;
 counter_u64_t rack_to_alloc_hard;
 counter_u64_t rack_to_alloc_emerg;
 counter_u64_t rack_to_alloc_limited;
 counter_u64_t rack_alloc_limited_conns;
 counter_u64_t rack_split_limited;
 
 counter_u64_t rack_multi_single_eq;
 counter_u64_t rack_proc_non_comp_ack;
 
 counter_u64_t rack_fto_send;
 counter_u64_t rack_fto_rsm_send;
 counter_u64_t rack_nfto_resend;
 counter_u64_t rack_non_fto_send;
 counter_u64_t rack_extended_rfo;
 
 counter_u64_t rack_sack_proc_all;
 counter_u64_t rack_sack_proc_short;
 counter_u64_t rack_sack_proc_restart;
 counter_u64_t rack_sack_attacks_detected;
 counter_u64_t rack_sack_attacks_reversed;
 counter_u64_t rack_sack_used_next_merge;
 counter_u64_t rack_sack_splits;
 counter_u64_t rack_sack_used_prev_merge;
 counter_u64_t rack_sack_skipped_acked;
 counter_u64_t rack_ack_total;
 counter_u64_t rack_express_sack;
 counter_u64_t rack_sack_total;
 counter_u64_t rack_move_none;
 counter_u64_t rack_move_some;
 
 counter_u64_t rack_input_idle_reduces;
 counter_u64_t rack_collapsed_win;
 counter_u64_t rack_collapsed_win_seen;
 counter_u64_t rack_collapsed_win_rxt;
 counter_u64_t rack_collapsed_win_rxt_bytes;
 counter_u64_t rack_try_scwnd;
 counter_u64_t rack_hw_pace_init_fail;
 counter_u64_t rack_hw_pace_lost;
 
 /* Counter arrays sized by TCP_MSS_ACCT_SIZE and RACK_OPTS_SIZE. */
 counter_u64_t rack_out_size[TCP_MSS_ACCT_SIZE];
 counter_u64_t rack_opts_arry[RACK_OPTS_SIZE];
 
 
 /* Retransmit value for tp: t_srtt + 4 * t_rttvar, but never below rack_rto_min. */
 #define	RACK_REXMTVAL(tp) max(rack_rto_min, ((tp)->t_srtt + ((tp)->t_rttvar << 2)))
 
 /*
  * Set (tv) to (value) + (slop), then raise it to (tvmin) if it is below
  * that and cap it at (tvmax) if it is above that.  The bounds checks are
  * done as u_long.
  *
  * Every argument, including slop, is parenthesized in the expansion so
  * that operator expressions (e.g. "a ? b : c") bind as the caller
  * expects.  (tv) is evaluated several times and must therefore be a
  * side-effect-free lvalue.
  */
 #define	RACK_TCPT_RANGESET(tv, value, tvmin, tvmax, slop) do {	\
 	(tv) = (value) + (slop);	 \
 	if ((u_long)(tv) < (u_long)(tvmin)) \
 		(tv) = (tvmin); \
 	if ((u_long)(tv) > (u_long)(tvmax)) \
 		(tv) = (tvmax); \
 } while (0)
 
 /*
  * Forward declarations for the functions of this stack, so that they can
  * be referenced before their definitions further down in the file.
  */
 static void
 rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick,  int event, int line);
 
 static int
 rack_process_ack(struct mbuf *m, struct tcphdr *th,
     struct socket *so, struct tcpcb *tp, struct tcpopt *to,
     uint32_t tiwin, int32_t tlen, int32_t * ofia, int32_t thflags, int32_t * ret_val);
 static int
 rack_process_data(struct mbuf *m, struct tcphdr *th,
     struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen,
     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
 static void
 rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack,
    uint32_t th_ack, uint16_t nsegs, uint16_t type, int32_t recovery);
 static struct rack_sendmap *rack_alloc(struct tcp_rack *rack);
 static struct rack_sendmap *rack_alloc_limit(struct tcp_rack *rack,
     uint8_t limit_type);
 static struct rack_sendmap *
 rack_check_recovery_mode(struct tcpcb *tp,
     uint32_t tsused);
 static void
 rack_cong_signal(struct tcpcb *tp,
 		 uint32_t type, uint32_t ack, int );
 static void rack_counter_destroy(void);
 static int
 rack_ctloutput(struct inpcb *inp, struct sockopt *sopt);
 static int32_t rack_ctor(void *mem, int32_t size, void *arg, int32_t how);
 static void
 rack_set_pace_segments(struct tcpcb *tp, struct tcp_rack *rack, uint32_t line, uint64_t *fill_override);
 static void
 rack_do_segment(struct mbuf *m, struct tcphdr *th,
     struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen,
     uint8_t iptos);
 static void rack_dtor(void *mem, int32_t size, void *arg);
 static void
 rack_log_alt_to_to_cancel(struct tcp_rack *rack,
     uint32_t flex1, uint32_t flex2,
     uint32_t flex3, uint32_t flex4,
     uint32_t flex5, uint32_t flex6,
     uint16_t flex7, uint8_t mod);
 
 static void
 rack_log_pacing_delay_calc(struct tcp_rack *rack, uint32_t len, uint32_t slot,
    uint64_t bw_est, uint64_t bw, uint64_t len_time, int method, int line,
    struct rack_sendmap *rsm, uint8_t quality);
 static struct rack_sendmap *
 rack_find_high_nonack(struct tcp_rack *rack,
     struct rack_sendmap *rsm);
 static struct rack_sendmap *rack_find_lowest_rsm(struct tcp_rack *rack);
 static void rack_free(struct tcp_rack *rack, struct rack_sendmap *rsm);
 static void rack_fini(struct tcpcb *tp, int32_t tcb_is_purged);
 static int rack_get_sockopt(struct inpcb *inp, struct sockopt *sopt);
 static void
 rack_do_goodput_measurement(struct tcpcb *tp, struct tcp_rack *rack,
 			    tcp_seq th_ack, int line, uint8_t quality);
 static uint32_t
 rack_get_pacing_len(struct tcp_rack *rack, uint64_t bw, uint32_t mss);
 static int32_t rack_handoff_ok(struct tcpcb *tp);
 static int32_t rack_init(struct tcpcb *tp);
 static void rack_init_sysctls(void);
 static void
 rack_log_ack(struct tcpcb *tp, struct tcpopt *to,
     struct tcphdr *th, int entered_rec, int dup_ack_struck);
 static void
 rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len,
     uint32_t seq_out, uint16_t th_flags, int32_t err, uint64_t ts,
     struct rack_sendmap *hintrsm, uint16_t add_flags, struct mbuf *s_mb, uint32_t s_moff, int hw_tls);
 
 static void
 rack_log_sack_passed(struct tcpcb *tp, struct tcp_rack *rack,
     struct rack_sendmap *rsm);
 static void rack_log_to_event(struct tcp_rack *rack, int32_t to_num, struct rack_sendmap *rsm);
 static int32_t rack_output(struct tcpcb *tp);
 
 static uint32_t
 rack_proc_sack_blk(struct tcpcb *tp, struct tcp_rack *rack,
     struct sackblk *sack, struct tcpopt *to, struct rack_sendmap **prsm,
     uint32_t cts, int *moved_two);
 static void rack_post_recovery(struct tcpcb *tp, uint32_t th_seq);
 static void rack_remxt_tmr(struct tcpcb *tp);
 static int rack_set_sockopt(struct inpcb *inp, struct sockopt *sopt);
 static void rack_set_state(struct tcpcb *tp, struct tcp_rack *rack);
 static int32_t rack_stopall(struct tcpcb *tp);
 static void
 rack_timer_activate(struct tcpcb *tp, uint32_t timer_type,
     uint32_t delta);
 static int32_t rack_timer_active(struct tcpcb *tp, uint32_t timer_type);
 static void rack_timer_cancel(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int line);
 static void rack_timer_stop(struct tcpcb *tp, uint32_t timer_type);
 static uint32_t
 rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack,
     struct rack_sendmap *rsm, uint64_t ts, int32_t * lenp, uint16_t add_flag);
 static void
 rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack,
     struct rack_sendmap *rsm, uint64_t ts, uint16_t add_flag);
 static int
 rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack,
     struct rack_sendmap *rsm, struct tcpopt *to, uint32_t cts, int32_t ack_type, tcp_seq th_ack);
 static int32_t tcp_addrack(module_t mod, int32_t type, void *data);
 /* Per-TCP-state segment handlers; all but rack_do_fastnewdata() share one signature. */
 static int
 rack_do_close_wait(struct mbuf *m, struct tcphdr *th,
     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
     int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
 static int
 rack_do_closing(struct mbuf *m, struct tcphdr *th,
     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
     int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
 static int
 rack_do_established(struct mbuf *m, struct tcphdr *th,
     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
     int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
 static int
 rack_do_fastnewdata(struct mbuf *m, struct tcphdr *th,
     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
     int32_t tlen, uint32_t tiwin, int32_t nxt_pkt, uint8_t iptos);
 static int
 rack_do_fin_wait_1(struct mbuf *m, struct tcphdr *th,
     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
     int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
 static int
 rack_do_fin_wait_2(struct mbuf *m, struct tcphdr *th,
     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
     int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
 static int
 rack_do_lastack(struct mbuf *m, struct tcphdr *th,
     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
     int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
 static int
 rack_do_syn_recv(struct mbuf *m, struct tcphdr *th,
     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
     int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
 static int
 rack_do_syn_sent(struct mbuf *m, struct tcphdr *th,
     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
     int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
 /* Note: unlike the declarations around it, this one is not static. */
 struct rack_sendmap *
 tcp_rack_output(struct tcpcb *tp, struct tcp_rack *rack,
     uint32_t tsused);
 static void tcp_rack_xmit_timer(struct tcp_rack *rack, int32_t rtt,
     uint32_t len, uint32_t us_tim, int confidence, struct rack_sendmap *rsm, uint16_t rtrcnt);
 static void
      tcp_rack_partialack(struct tcpcb *tp);
 static int
 rack_set_profile(struct tcp_rack *rack, int prof);
 static void
 rack_apply_deferred_options(struct tcp_rack *rack);
 
 /* Value reported when sysctl_rack_clear() is read; that handler resets it to 0 after a write. */
 int32_t rack_clear_counter=0;
 
 /*
  * Possibly switch this connection's t_logstate on when trace point `num`
  * is hit.
  *
  * The switch happens only when all of the following hold:
  *  - rack_trace_point_config equals `num`, or equals 0xffffffff;
  *  - rack_trace_point_bb_mode is non-zero;
  *  - rack_trace_point_count is still positive;
  *  - the connection's t_logstate is currently 0.
  *
  * The count is decremented atomically; a caller that finds the count
  * already used up leaves the connection alone and resets the count to 0.
  *
  * The "all points" test must be a comparison: an assignment there would
  * overwrite the configured trace point and make the test always true.
  */
 static inline void
 rack_trace_point(struct tcp_rack *rack, int num)
 {
 	if (((rack_trace_point_config == num)  ||
 	     (rack_trace_point_config == 0xffffffff)) &&
 	    (rack_trace_point_bb_mode != 0) &&
 	    (rack_trace_point_count > 0) &&
 	    (rack->rc_tp->t_logstate == 0)) {
 		int res;
 
 		res = atomic_fetchadd_int(&rack_trace_point_count, -1);
 		if (res > 0) {
 			rack->rc_tp->t_logstate = rack_trace_point_bb_mode;
 		} else {
 			/* Lost a race, make sure the count is zero now. */
 			rack_trace_point_count = 0;
 		}
 	}
 }
 
 static void
 rack_set_cc_pacing(struct tcp_rack *rack)
 {
 	struct sockopt sopt;
 	struct cc_newreno_opts opt;
 	struct newreno old, *ptr;
 	struct tcpcb *tp;
 	int error;
 
 	if (rack->rc_pacing_cc_set)
 		return;
 
 	tp = rack->rc_tp;
 	if (tp->cc_algo == NULL) {
 		/* Tcb is leaving */
 		return;
 	}
 	rack->rc_pacing_cc_set = 1;
 	if (strcmp(tp->cc_algo->name, CCALGONAME_NEWRENO) != 0) {
 		/* Not new-reno we can't play games with beta! */
 		goto out;
 	}
 	ptr = ((struct newreno *)tp->ccv->cc_data);
 	if (CC_ALGO(tp)->ctl_output == NULL)  {
 		/* Huh, why does new_reno no longer have a set function? */
 		goto out;
 	}
 	if (ptr == NULL) {
 		/* Just the default values */
 		old.beta = V_newreno_beta_ecn;
 		old.beta_ecn = V_newreno_beta_ecn;
 		old.newreno_flags = 0;
 	} else {
 		old.beta = ptr->beta;
 		old.beta_ecn = ptr->beta_ecn;
 		old.newreno_flags = ptr->newreno_flags;
 	}
 	sopt.sopt_valsize = sizeof(struct cc_newreno_opts);
 	sopt.sopt_dir = SOPT_SET;
 	opt.name = CC_NEWRENO_BETA;
 	opt.val = rack->r_ctl.rc_saved_beta.beta;
 	error = CC_ALGO(tp)->ctl_output(tp->ccv, &sopt, &opt);
 	if (error)  {
 		goto out;
 	}
 	/*
 	 * Hack alert we need to set in our newreno_flags
 	 * so that Abe behavior is also applied.
 	 */
 	((struct newreno *)tp->ccv->cc_data)->newreno_flags |= CC_NEWRENO_BETA_ECN_ENABLED;
 	opt.name = CC_NEWRENO_BETA_ECN;
 	opt.val = rack->r_ctl.rc_saved_beta.beta_ecn;
 	error = CC_ALGO(tp)->ctl_output(tp->ccv, &sopt, &opt);
 	if (error) {
 		goto out;
 	}
 	/* Save off the original values for restoral */
 	memcpy(&rack->r_ctl.rc_saved_beta, &old, sizeof(struct newreno));
 out:
 	if (rack_verbose_logging && (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) {
 		union tcp_log_stackspecific log;
 		struct timeval tv;
 
 		ptr = ((struct newreno *)tp->ccv->cc_data);
 		memset(&log.u_bbr, 0, sizeof(log.u_bbr));
 		log.u_bbr.timeStamp = tcp_get_usecs(&tv);
 		if (ptr) {
 			log.u_bbr.flex1 = ptr->beta;
 			log.u_bbr.flex2 = ptr->beta_ecn;
 			log.u_bbr.flex3 = ptr->newreno_flags;
 		}
 		log.u_bbr.flex4 = rack->r_ctl.rc_saved_beta.beta;
 		log.u_bbr.flex5 = rack->r_ctl.rc_saved_beta.beta_ecn;
 		log.u_bbr.flex6 = rack->r_ctl.rc_saved_beta.newreno_flags;
 		log.u_bbr.flex7 = rack->gp_ready;
 		log.u_bbr.flex7 <<= 1;
 		log.u_bbr.flex7 |= rack->use_fixed_rate;
 		log.u_bbr.flex7 <<= 1;
 		log.u_bbr.flex7 |= rack->rc_pacing_cc_set;
 		log.u_bbr.pkts_out = rack->r_ctl.rc_prr_sndcnt;
 		log.u_bbr.flex8 = 3;
 		tcp_log_event_(tp, NULL, NULL, NULL, BBR_LOG_CWND, error,
 			       0, &log, false, NULL, NULL, 0, &tv);
 	}
 }
 
 static void
 rack_undo_cc_pacing(struct tcp_rack *rack)
 {
 	struct newreno old, *ptr;
 	struct tcpcb *tp;
 
 	if (rack->rc_pacing_cc_set == 0)
 		return;
 	tp = rack->rc_tp;
 	rack->rc_pacing_cc_set = 0;
 	if (tp->cc_algo == NULL)
 		/* Tcb is leaving */
 		return;
 	if (strcmp(tp->cc_algo->name, CCALGONAME_NEWRENO) != 0) {
 		/* Not new-reno nothing to do! */
 		return;
 	}
 	ptr = ((struct newreno *)tp->ccv->cc_data);
 	if (ptr == NULL) {
 		/*
 		 * This happens at rack_fini() if the
 		 * cc module gets freed on us. In that
 		 * case we loose our "new" settings but
 		 * thats ok, since the tcb is going away anyway.
 		 */
 		return;
 	}
 	/* Grab out our set values */
 	memcpy(&old, ptr, sizeof(struct newreno));
 	/* Copy back in the original values */
 	memcpy(ptr, &rack->r_ctl.rc_saved_beta, sizeof(struct newreno));
 	/* Now save back the values we had set in (for when pacing is restored) */
 	memcpy(&rack->r_ctl.rc_saved_beta, &old, sizeof(struct newreno));
 	if (rack_verbose_logging && (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) {
 		union tcp_log_stackspecific log;
 		struct timeval tv;
 
 		ptr = ((struct newreno *)tp->ccv->cc_data);
 		memset(&log.u_bbr, 0, sizeof(log.u_bbr));
 		log.u_bbr.timeStamp = tcp_get_usecs(&tv);
 		log.u_bbr.flex1 = ptr->beta;
 		log.u_bbr.flex2 = ptr->beta_ecn;
 		log.u_bbr.flex3 = ptr->newreno_flags;
 		log.u_bbr.flex4 = rack->r_ctl.rc_saved_beta.beta;
 		log.u_bbr.flex5 = rack->r_ctl.rc_saved_beta.beta_ecn;
 		log.u_bbr.flex6 = rack->r_ctl.rc_saved_beta.newreno_flags;
 		log.u_bbr.flex7 = rack->gp_ready;
 		log.u_bbr.flex7 <<= 1;
 		log.u_bbr.flex7 |= rack->use_fixed_rate;
 		log.u_bbr.flex7 <<= 1;
 		log.u_bbr.flex7 |= rack->rc_pacing_cc_set;
 		log.u_bbr.pkts_out = rack->r_ctl.rc_prr_sndcnt;
 		log.u_bbr.flex8 = 4;
 		tcp_log_event_(tp, NULL, NULL, NULL, BBR_LOG_CWND, 0,
 			       0, &log, false, NULL, NULL, 0, &tv);
 	}
 }
 
 #ifdef NETFLIX_PEAKRATE
 /*
  * Recompute tp->t_peakrate_thr: the larger of two segments (2 * t_maxseg)
  * and t_maxpeakrate * t_srtt / HPTS_USEC_IN_SEC, saturated at UINT32_MAX.
  */
 static inline void
 rack_update_peakrate_thr(struct tcpcb *tp)
 {
 	uint64_t rate_window, thresh;
 
 	/* Keep in mind that t_maxpeakrate is in B/s. */
 	rate_window = ((uint64_t)tp->t_maxpeakrate * (uint64_t)(tp->t_srtt)) /
 	    (uint64_t)HPTS_USEC_IN_SEC;
 	thresh = uqmax((tp->t_maxseg * 2), rate_window);
 	tp->t_peakrate_thr = (uint32_t)uqmin(thresh, UINT32_MAX);
 }
 #endif
 
 static int
 sysctl_rack_clear(SYSCTL_HANDLER_ARGS)
 {
 	uint32_t stat;
 	int32_t error;
 
 	error = SYSCTL_OUT(req, &rack_clear_counter, sizeof(uint32_t));
 	if (error || req->newptr == NULL)
 		return error;
 
 	error = SYSCTL_IN(req, &stat, sizeof(uint32_t));
 	if (error)
 		return (error);
 	if (stat == 1) {
 #ifdef INVARIANTS
 		printf("Clearing RACK counters\n");
 #endif
 		counter_u64_zero(rack_tlp_tot);
 		counter_u64_zero(rack_tlp_newdata);
 		counter_u64_zero(rack_tlp_retran);
 		counter_u64_zero(rack_tlp_retran_bytes);
 		counter_u64_zero(rack_to_tot);
 		counter_u64_zero(rack_saw_enobuf);
 		counter_u64_zero(rack_saw_enobuf_hw);
 		counter_u64_zero(rack_saw_enetunreach);
 		counter_u64_zero(rack_persists_sends);
 		counter_u64_zero(rack_persists_acks);
 		counter_u64_zero(rack_persists_loss);
 		counter_u64_zero(rack_persists_lost_ends);
 #ifdef INVARIANTS
 		counter_u64_zero(rack_adjust_map_bw);
 #endif
 		counter_u64_zero(rack_to_alloc_hard);
 		counter_u64_zero(rack_to_alloc_emerg);
 		counter_u64_zero(rack_sack_proc_all);
 		counter_u64_zero(rack_fto_send);
 		counter_u64_zero(rack_fto_rsm_send);
 		counter_u64_zero(rack_extended_rfo);
 		counter_u64_zero(rack_hw_pace_init_fail);
 		counter_u64_zero(rack_hw_pace_lost);
 		counter_u64_zero(rack_non_fto_send);
 		counter_u64_zero(rack_nfto_resend);
 		counter_u64_zero(rack_sack_proc_short);
 		counter_u64_zero(rack_sack_proc_restart);
 		counter_u64_zero(rack_to_alloc);
 		counter_u64_zero(rack_to_alloc_limited);
 		counter_u64_zero(rack_alloc_limited_conns);
 		counter_u64_zero(rack_split_limited);
 		counter_u64_zero(rack_multi_single_eq);
 		counter_u64_zero(rack_proc_non_comp_ack);
 		counter_u64_zero(rack_sack_attacks_detected);
 		counter_u64_zero(rack_sack_attacks_reversed);
 		counter_u64_zero(rack_sack_used_next_merge);
 		counter_u64_zero(rack_sack_used_prev_merge);
 		counter_u64_zero(rack_sack_splits);
 		counter_u64_zero(rack_sack_skipped_acked);
 		counter_u64_zero(rack_ack_total);
 		counter_u64_zero(rack_express_sack);
 		counter_u64_zero(rack_sack_total);
 		counter_u64_zero(rack_move_none);
 		counter_u64_zero(rack_move_some);
 		counter_u64_zero(rack_try_scwnd);
 		counter_u64_zero(rack_collapsed_win);
 		counter_u64_zero(rack_collapsed_win_rxt);
 		counter_u64_zero(rack_collapsed_win_seen);
 		counter_u64_zero(rack_collapsed_win_rxt_bytes);
 	}
 	rack_clear_counter = 0;
 	return (0);
 }
 
 static void
 rack_init_sysctls(void)
 {
 	struct sysctl_oid *rack_counters;
 	struct sysctl_oid *rack_attack;
 	struct sysctl_oid *rack_pacing;
 	struct sysctl_oid *rack_timely;
 	struct sysctl_oid *rack_timers;
 	struct sysctl_oid *rack_tlp;
 	struct sysctl_oid *rack_misc;
 	struct sysctl_oid *rack_features;
 	struct sysctl_oid *rack_measure;
 	struct sysctl_oid *rack_probertt;
 	struct sysctl_oid *rack_hw_pacing;
 	struct sysctl_oid *rack_tracepoint;
 
 	rack_attack = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO,
 	    "sack_attack",
 	    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
 	    "Rack Sack Attack Counters and Controls");
 	rack_counters = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO,
 	    "stats",
 	    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
 	    "Rack Counters");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "rate_sample_method", CTLFLAG_RW,
 	    &rack_rate_sample_method , USE_RTT_LOW,
 	    "What method should we use for rate sampling 0=high, 1=low ");
 	/* Probe rtt related controls */
 	rack_probertt = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO,
 	    "probertt",
 	    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
 	    "ProbeRTT related Controls");
 	SYSCTL_ADD_U16(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_probertt),
 	    OID_AUTO, "exit_per_hpb", CTLFLAG_RW,
 	    &rack_atexit_prtt_hbp, 130,
 	    "What percentage above goodput do we clamp CA/SS to at exit on high-BDP path 110%");
 	SYSCTL_ADD_U16(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_probertt),
 	    OID_AUTO, "exit_per_nonhpb", CTLFLAG_RW,
 	    &rack_atexit_prtt, 130,
 	    "What percentage above goodput do we clamp CA/SS to at exit on a non high-BDP path 100%");
 	SYSCTL_ADD_U16(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_probertt),
 	    OID_AUTO, "gp_per_mul", CTLFLAG_RW,
 	    &rack_per_of_gp_probertt, 60,
 	    "What percentage of goodput do we pace at in probertt");
 	SYSCTL_ADD_U16(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_probertt),
 	    OID_AUTO, "gp_per_reduce", CTLFLAG_RW,
 	    &rack_per_of_gp_probertt_reduce, 10,
 	    "What percentage of goodput do we reduce every gp_srtt");
 	SYSCTL_ADD_U16(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_probertt),
 	    OID_AUTO, "gp_per_low", CTLFLAG_RW,
 	    &rack_per_of_gp_lowthresh, 40,
 	    "What percentage of goodput do we allow the multiplier to fall to");
 	SYSCTL_ADD_U32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_probertt),
 	    OID_AUTO, "time_between", CTLFLAG_RW,
 	    & rack_time_between_probertt, 96000000,
 	    "How many useconds between the lowest rtt falling must past before we enter probertt");
 	SYSCTL_ADD_U32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_probertt),
 	    OID_AUTO, "safety", CTLFLAG_RW,
 	    &rack_probe_rtt_safety_val, 2000000,
 	    "If not zero, provides a maximum usecond that you can stay in probertt (2sec = 2000000)");
 	SYSCTL_ADD_U32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_probertt),
 	    OID_AUTO, "sets_cwnd", CTLFLAG_RW,
 	    &rack_probe_rtt_sets_cwnd, 0,
 	    "Do we set the cwnd too (if always_lower is on)");
 	SYSCTL_ADD_U32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_probertt),
 	    OID_AUTO, "maxdrainsrtts", CTLFLAG_RW,
 	    &rack_max_drain_wait, 2,
 	    "Maximum number of gp_srtt's to hold in drain waiting for flight to reach goal");
 	SYSCTL_ADD_U32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_probertt),
 	    OID_AUTO, "mustdrainsrtts", CTLFLAG_RW,
 	    &rack_must_drain, 1,
 	    "We must drain this many gp_srtt's waiting for flight to reach goal");
 	SYSCTL_ADD_U32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_probertt),
 	    OID_AUTO, "goal_use_min_entry", CTLFLAG_RW,
 	    &rack_probertt_use_min_rtt_entry, 1,
 	    "Should we use the min-rtt to calculate the goal rtt (else gp_srtt) at entry");
 	SYSCTL_ADD_U32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_probertt),
 	    OID_AUTO, "goal_use_min_exit", CTLFLAG_RW,
 	    &rack_probertt_use_min_rtt_exit, 0,
 	    "How to set cwnd at exit, 0 - dynamic, 1 - use min-rtt, 2 - use curgprtt, 3 - entry gp-rtt");
 	SYSCTL_ADD_U32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_probertt),
 	    OID_AUTO, "length_div", CTLFLAG_RW,
 	    &rack_probertt_gpsrtt_cnt_div, 0,
 	    "How many recent goodput srtt periods plus hold tim does probertt last (bottom of fraction)");
 	SYSCTL_ADD_U32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_probertt),
 	    OID_AUTO, "length_mul", CTLFLAG_RW,
 	    &rack_probertt_gpsrtt_cnt_mul, 0,
 	    "How many recent goodput srtt periods plus hold tim does probertt last (top of fraction)");
 	SYSCTL_ADD_U32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_probertt),
 	    OID_AUTO, "holdtim_at_target", CTLFLAG_RW,
 	    &rack_min_probertt_hold, 200000,
 	    "What is the minimum time we hold probertt at target");
 	SYSCTL_ADD_U32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_probertt),
 	    OID_AUTO, "filter_life", CTLFLAG_RW,
 	    &rack_probertt_filter_life, 10000000,
 	    "What is the time for the filters life in useconds");
 	SYSCTL_ADD_U32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_probertt),
 	    OID_AUTO, "lower_within", CTLFLAG_RW,
 	    &rack_probertt_lower_within, 10,
 	    "If the rtt goes lower within this percentage of the time, go into probe-rtt");
 	SYSCTL_ADD_U32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_probertt),
 	    OID_AUTO, "must_move", CTLFLAG_RW,
 	    &rack_min_rtt_movement, 250,
 	    "How much is the minimum movement in rtt to count as a drop for probertt purposes");
 	SYSCTL_ADD_U32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_probertt),
 	    OID_AUTO, "clear_is_cnts", CTLFLAG_RW,
 	    &rack_probertt_clear_is, 1,
 	    "Do we clear I/S counts on exiting probe-rtt");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_probertt),
 	    OID_AUTO, "hbp_extra_drain", CTLFLAG_RW,
 	    &rack_max_drain_hbp, 1,
 	    "How many extra drain gpsrtt's do we get in highly buffered paths");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_probertt),
 	    OID_AUTO, "hbp_threshold", CTLFLAG_RW,
 	    &rack_hbp_thresh, 3,
 	    "We are highly buffered if min_rtt_seen / max_rtt_seen > this-threshold");
 
 	rack_tracepoint = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO,
 	    "tp",
 	    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
 	    "Rack tracepoint facility");
 	SYSCTL_ADD_U32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_tracepoint),
 	    OID_AUTO, "number", CTLFLAG_RW,
 	    &rack_trace_point_config, 0,
 	    "What is the trace point number to activate (0=none, 0xffffffff = all)?");
 	SYSCTL_ADD_U32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_tracepoint),
 	    OID_AUTO, "bbmode", CTLFLAG_RW,
 	    &rack_trace_point_bb_mode, 4,
 	    "What is BB logging mode that is activated?");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_tracepoint),
 	    OID_AUTO, "count", CTLFLAG_RW,
 	    &rack_trace_point_count, 0,
 	    "How many connections will have BB logging turned on that hit the tracepoint?");
 	/* Pacing related sysctls */
 	rack_pacing = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO,
 	    "pacing",
 	    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
 	    "Pacing related Controls");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_pacing),
 	    OID_AUTO, "max_pace_over", CTLFLAG_RW,
 	    &rack_max_per_above, 30,
 	    "What is the maximum allowable percentage that we can pace above (so 30 = 130% of our goal)");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_pacing),
 	    OID_AUTO, "pace_to_one", CTLFLAG_RW,
 	    &rack_pace_one_seg, 0,
 	    "Do we allow low b/w pacing of 1MSS instead of two");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_pacing),
 	    OID_AUTO, "limit_wsrtt", CTLFLAG_RW,
 	    &rack_limit_time_with_srtt, 0,
 	    "Do we limit pacing time based on srtt");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_pacing),
 	    OID_AUTO, "init_win", CTLFLAG_RW,
 	    &rack_default_init_window, 0,
 	    "Do we have a rack initial window 0 = system default");
 	SYSCTL_ADD_U16(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_pacing),
 	    OID_AUTO, "gp_per_ss", CTLFLAG_RW,
 	    &rack_per_of_gp_ss, 250,
 	    "If non zero, what percentage of goodput to pace at in slow start");
 	SYSCTL_ADD_U16(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_pacing),
 	    OID_AUTO, "gp_per_ca", CTLFLAG_RW,
 	    &rack_per_of_gp_ca, 150,
 	    "If non zero, what percentage of goodput to pace at in congestion avoidance");
 	SYSCTL_ADD_U16(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_pacing),
 	    OID_AUTO, "gp_per_rec", CTLFLAG_RW,
 	    &rack_per_of_gp_rec, 200,
 	    "If non zero, what percentage of goodput to pace at in recovery");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_pacing),
 	    OID_AUTO, "pace_max_seg", CTLFLAG_RW,
 	    &rack_hptsi_segments, 40,
 	    "What size is the max for TSO segments in pacing and burst mitigation");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_pacing),
 	    OID_AUTO, "burst_reduces", CTLFLAG_RW,
 	    &rack_slot_reduction, 4,
 	    "When doing only burst mitigation what is the reduce divisor");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "use_pacing", CTLFLAG_RW,
 	    &rack_pace_every_seg, 0,
 	    "If set we use pacing, if clear we use only the original burst mitigation");
 	SYSCTL_ADD_U64(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_pacing),
 	    OID_AUTO, "rate_cap", CTLFLAG_RW,
 	    &rack_bw_rate_cap, 0,
 	    "If set we apply this value to the absolute rate cap used by pacing");
 	SYSCTL_ADD_U8(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "req_measure_cnt", CTLFLAG_RW,
 	    &rack_req_measurements, 1,
 	    "If doing dynamic pacing, how many measurements must be in before we start pacing?");
 	/* Hardware pacing */
 	rack_hw_pacing = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO,
 	    "hdwr_pacing",
 	    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
 	    "Pacing related Controls");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_hw_pacing),
 	    OID_AUTO, "rwnd_factor", CTLFLAG_RW,
 	    &rack_hw_rwnd_factor, 2,
 	    "How many times does snd_wnd need to be bigger than pace_max_seg so we will hold off and get more acks?");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_hw_pacing),
 	    OID_AUTO, "pace_enobuf_mult", CTLFLAG_RW,
 	    &rack_enobuf_hw_boost_mult, 2,
 	    "By how many time_betweens should we boost the pacing time if we see a ENOBUFS?");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_hw_pacing),
 	    OID_AUTO, "pace_enobuf_max", CTLFLAG_RW,
 	    &rack_enobuf_hw_max, 2,
 	    "What is the max boost the pacing time if we see a ENOBUFS?");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_hw_pacing),
 	    OID_AUTO, "pace_enobuf_min", CTLFLAG_RW,
 	    &rack_enobuf_hw_min, 2,
 	    "What is the min boost the pacing time if we see a ENOBUFS?");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_hw_pacing),
 	    OID_AUTO, "enable", CTLFLAG_RW,
 	    &rack_enable_hw_pacing, 0,
 	    "Should RACK attempt to use hw pacing?");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_hw_pacing),
 	    OID_AUTO, "rate_cap", CTLFLAG_RW,
 	    &rack_hw_rate_caps, 1,
 	    "Does the highest hardware pacing rate cap the rate we will send at??");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_hw_pacing),
 	    OID_AUTO, "rate_min", CTLFLAG_RW,
 	    &rack_hw_rate_min, 0,
 	    "Do we need a minimum estimate of this many bytes per second in order to engage hw pacing?");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_hw_pacing),
 	    OID_AUTO, "rate_to_low", CTLFLAG_RW,
 	    &rack_hw_rate_to_low, 0,
 	    "If we fall below this rate, dis-engage hw pacing?");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_hw_pacing),
 	    OID_AUTO, "up_only", CTLFLAG_RW,
 	    &rack_hw_up_only, 1,
 	    "Do we allow hw pacing to lower the rate selected?");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_hw_pacing),
 	    OID_AUTO, "extra_mss_precise", CTLFLAG_RW,
 	    &rack_hw_pace_extra_slots, 2,
 	    "If the rates between software and hardware match precisely how many extra time_betweens do we get?");
 	rack_timely = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO,
 	    "timely",
 	    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
 	    "Rack Timely RTT Controls");
 	/* Timely based GP dynmics */
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_timely),
 	    OID_AUTO, "upper", CTLFLAG_RW,
 	    &rack_gp_per_bw_mul_up, 2,
 	    "Rack timely upper range for equal b/w (in percentage)");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_timely),
 	    OID_AUTO, "lower", CTLFLAG_RW,
 	    &rack_gp_per_bw_mul_down, 4,
 	    "Rack timely lower range for equal b/w (in percentage)");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_timely),
 	    OID_AUTO, "rtt_max_mul", CTLFLAG_RW,
 	    &rack_gp_rtt_maxmul, 3,
 	    "Rack timely multiplier of lowest rtt for rtt_max");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_timely),
 	    OID_AUTO, "rtt_min_div", CTLFLAG_RW,
 	    &rack_gp_rtt_mindiv, 4,
 	    "Rack timely divisor used for rtt + (rtt * mul/divisor) for check for lower rtt");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_timely),
 	    OID_AUTO, "rtt_min_mul", CTLFLAG_RW,
 	    &rack_gp_rtt_minmul, 1,
 	    "Rack timely multiplier used for rtt + (rtt * mul/divisor) for check for lower rtt");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_timely),
 	    OID_AUTO, "decrease", CTLFLAG_RW,
 	    &rack_gp_decrease_per, 20,
 	    "Rack timely decrease percentage of our GP multiplication factor");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_timely),
 	    OID_AUTO, "increase", CTLFLAG_RW,
 	    &rack_gp_increase_per, 2,
 	    "Rack timely increase perentage of our GP multiplication factor");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_timely),
 	    OID_AUTO, "lowerbound", CTLFLAG_RW,
 	    &rack_per_lower_bound, 50,
 	    "Rack timely lowest percentage we allow GP multiplier to fall to");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_timely),
 	    OID_AUTO, "upperboundss", CTLFLAG_RW,
 	    &rack_per_upper_bound_ss, 0,
 	    "Rack timely highest percentage we allow GP multiplier in SS to raise to (0 is no upperbound)");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_timely),
 	    OID_AUTO, "upperboundca", CTLFLAG_RW,
 	    &rack_per_upper_bound_ca, 0,
 	    "Rack timely highest percentage we allow GP multiplier to CA raise to (0 is no upperbound)");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_timely),
 	    OID_AUTO, "dynamicgp", CTLFLAG_RW,
 	    &rack_do_dyn_mul, 0,
 	    "Rack timely do we enable dynmaic timely goodput by default");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_timely),
 	    OID_AUTO, "no_rec_red", CTLFLAG_RW,
 	    &rack_gp_no_rec_chg, 1,
 	    "Rack timely do we prohibit the recovery multiplier from being lowered");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_timely),
 	    OID_AUTO, "red_clear_cnt", CTLFLAG_RW,
 	    &rack_timely_dec_clear, 6,
 	    "Rack timely what threshold do we count to before another boost during b/w decent");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_timely),
 	    OID_AUTO, "max_push_rise", CTLFLAG_RW,
 	    &rack_timely_max_push_rise, 3,
 	    "Rack timely how many times do we push up with b/w increase");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_timely),
 	    OID_AUTO, "max_push_drop", CTLFLAG_RW,
 	    &rack_timely_max_push_drop, 3,
 	    "Rack timely how many times do we push back on b/w decent");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_timely),
 	    OID_AUTO, "min_segs", CTLFLAG_RW,
 	    &rack_timely_min_segs, 4,
 	    "Rack timely when setting the cwnd what is the min num segments");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_timely),
 	    OID_AUTO, "noback_max", CTLFLAG_RW,
 	    &rack_use_max_for_nobackoff, 0,
 	    "Rack timely when deciding if to backoff on a loss, do we use under max rtt else min");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_timely),
 	    OID_AUTO, "interim_timely_only", CTLFLAG_RW,
 	    &rack_timely_int_timely_only, 0,
 	    "Rack timely when doing interim timely's do we only do timely (no b/w consideration)");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_timely),
 	    OID_AUTO, "nonstop", CTLFLAG_RW,
 	    &rack_timely_no_stopping, 0,
 	    "Rack timely don't stop increase");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_timely),
 	    OID_AUTO, "dec_raise_thresh", CTLFLAG_RW,
 	    &rack_down_raise_thresh, 100,
 	    "If the CA or SS is below this threshold raise on the first 3 b/w lowers (0=always)");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_timely),
 	    OID_AUTO, "bottom_drag_segs", CTLFLAG_RW,
 	    &rack_req_segs, 1,
 	    "Bottom dragging if not these many segments outstanding and room");
 
 	/* TLP and Rack related parameters */
 	rack_tlp = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO,
 	    "tlp",
 	    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
 	    "TLP and Rack related Controls");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_tlp),
 	    OID_AUTO, "use_rrr", CTLFLAG_RW,
 	    &use_rack_rr, 1,
 	    "Do we use Rack Rapid Recovery");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_tlp),
 	    OID_AUTO, "post_rec_labc", CTLFLAG_RW,
 	    &rack_max_abc_post_recovery, 2,
 	    "Since we do early recovery, do we override the l_abc to a value, if so what?");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_tlp),
 	    OID_AUTO, "nonrxt_use_cr", CTLFLAG_RW,
 	    &rack_non_rxt_use_cr, 0,
 	    "Do we use ss/ca rate if in recovery we are transmitting a new data chunk");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_tlp),
 	    OID_AUTO, "tlpmethod", CTLFLAG_RW,
 	    &rack_tlp_threshold_use, TLP_USE_TWO_ONE,
 	    "What method do we do for TLP time calc 0=no-de-ack-comp, 1=ID, 2=2.1, 3=2.2");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_tlp),
 	    OID_AUTO, "limit", CTLFLAG_RW,
 	    &rack_tlp_limit, 2,
 	    "How many TLP's can be sent without sending new data");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_tlp),
 	    OID_AUTO, "use_greater", CTLFLAG_RW,
 	    &rack_tlp_use_greater, 1,
 	    "Should we use the rack_rtt time if its greater than srtt");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_tlp),
 	    OID_AUTO, "tlpminto", CTLFLAG_RW,
 	    &rack_tlp_min, 10000,
 	    "TLP minimum timeout per the specification (in microseconds)");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_tlp),
 	    OID_AUTO, "send_oldest", CTLFLAG_RW,
 	    &rack_always_send_oldest, 0,
 	    "Should we always send the oldest TLP and RACK-TLP");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_tlp),
 	    OID_AUTO, "rack_tlimit", CTLFLAG_RW,
 	    &rack_limited_retran, 0,
 	    "How many times can a rack timeout drive out sends");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_tlp),
 	    OID_AUTO, "tlp_cwnd_flag", CTLFLAG_RW,
 	    &rack_lower_cwnd_at_tlp, 0,
 	    "When a TLP completes a retran should we enter recovery");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_tlp),
 	    OID_AUTO, "reorder_thresh", CTLFLAG_RW,
 	    &rack_reorder_thresh, 2,
 	    "What factor for rack will be added when seeing reordering (shift right)");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_tlp),
 	    OID_AUTO, "rtt_tlp_thresh", CTLFLAG_RW,
 	    &rack_tlp_thresh, 1,
 	    "What divisor for TLP rtt/retran will be added (1=rtt, 2=1/2 rtt etc)");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_tlp),
 	    OID_AUTO, "reorder_fade", CTLFLAG_RW,
 	    &rack_reorder_fade, 60000000,
 	    "Does reorder detection fade, if so how many microseconds (0 means never)");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_tlp),
 	    OID_AUTO, "pktdelay", CTLFLAG_RW,
 	    &rack_pkt_delay, 1000,
 	    "Extra RACK time (in microseconds) besides reordering thresh");
 
 	/* Timer related controls */
 	rack_timers = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO,
 	    "timers",
 	    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
 	    "Timer related controls");
 	SYSCTL_ADD_U32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_timers),
 	    OID_AUTO, "persmin", CTLFLAG_RW,
 	    &rack_persist_min, 250000,
 	    "What is the minimum time in microseconds between persists");
 	SYSCTL_ADD_U32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_timers),
 	    OID_AUTO, "persmax", CTLFLAG_RW,
 	    &rack_persist_max, 2000000,
 	    "What is the largest delay in microseconds between persists");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_timers),
 	    OID_AUTO, "delayed_ack", CTLFLAG_RW,
 	    &rack_delayed_ack_time, 40000,
 	    "Delayed ack time (40ms in microseconds)");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_timers),
 	    OID_AUTO, "minrto", CTLFLAG_RW,
 	    &rack_rto_min, 30000,
 	    "Minimum RTO in microseconds -- set with caution below 1000 due to TLP");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_timers),
 	    OID_AUTO, "maxrto", CTLFLAG_RW,
 	    &rack_rto_max, 4000000,
 	    "Maximum RTO in microseconds -- should be at least as large as min_rto");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_timers),
 	    OID_AUTO, "minto", CTLFLAG_RW,
 	    &rack_min_to, 1000,
 	    "Minimum rack timeout in microseconds");
 	/* Measure controls */
 	rack_measure = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO,
 	    "measure",
 	    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
 	    "Measure related controls");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_measure),
 	    OID_AUTO, "wma_divisor", CTLFLAG_RW,
 	    &rack_wma_divisor, 8,
 	    "When doing b/w calculation what is the  divisor for the WMA");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_measure),
 	    OID_AUTO, "end_cwnd", CTLFLAG_RW,
 	    &rack_cwnd_block_ends_measure, 0,
 	    "Does a cwnd just-return end the measurement window (app limited)");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_measure),
 	    OID_AUTO, "end_rwnd", CTLFLAG_RW,
 	    &rack_rwnd_block_ends_measure, 0,
 	    "Does an rwnd just-return end the measurement window (app limited -- not persists)");
 	SYSCTL_ADD_U32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_measure),
 	    OID_AUTO, "min_target", CTLFLAG_RW,
 	    &rack_def_data_window, 20,
 	    "What is the minimum target window (in mss) for a GP measurements");
 	SYSCTL_ADD_U32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_measure),
 	    OID_AUTO, "goal_bdp", CTLFLAG_RW,
 	    &rack_goal_bdp, 2,
 	    "What is the goal BDP to measure");
 	SYSCTL_ADD_U32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_measure),
 	    OID_AUTO, "min_srtts", CTLFLAG_RW,
 	    &rack_min_srtts, 1,
 	    "What is the goal BDP to measure");
 	SYSCTL_ADD_U32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_measure),
 	    OID_AUTO, "min_measure_tim", CTLFLAG_RW,
 	    &rack_min_measure_usec, 0,
 	    "What is the Minimum time time for a measurement if 0, this is off");
 	/* Features */
 	rack_features = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO,
 	    "features",
 	    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
 	    "Feature controls");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_features),
 	    OID_AUTO, "cmpack", CTLFLAG_RW,
 	    &rack_use_cmp_acks, 1,
 	    "Should RACK have LRO send compressed acks");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_features),
 	    OID_AUTO, "fsb", CTLFLAG_RW,
 	    &rack_use_fsb, 1,
 	    "Should RACK use the fast send block?");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_features),
 	    OID_AUTO, "rfo", CTLFLAG_RW,
 	    &rack_use_rfo, 1,
 	    "Should RACK use rack_fast_output()?");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_features),
 	    OID_AUTO, "rsmrfo", CTLFLAG_RW,
 	    &rack_use_rsm_rfo, 1,
 	    "Should RACK use rack_fast_rsm_output()?");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_features),
 	    OID_AUTO, "non_paced_lro_queue", CTLFLAG_RW,
 	    &rack_enable_mqueue_for_nonpaced, 0,
 	    "Should RACK use mbuf queuing for non-paced connections");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_features),
 	    OID_AUTO, "hystartplusplus", CTLFLAG_RW,
 	    &rack_do_hystart, 0,
 	    "Should RACK enable HyStart++ on connections?");
 	/* Misc rack controls */
 	rack_misc = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO,
 	    "misc",
 	    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
 	    "Misc related controls");
 #ifdef TCP_ACCOUNTING
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_misc),
 	    OID_AUTO, "tcp_acct", CTLFLAG_RW,
 	    &rack_tcp_accounting, 0,
 	    "Should we turn on TCP accounting for all rack sessions?");
 #endif
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_misc),
 	    OID_AUTO, "apply_rtt_with_low_conf", CTLFLAG_RW,
 	    &rack_apply_rtt_with_reduced_conf, 0,
 	    "When a persist or keep-alive probe is not answered do we calculate rtt on subsequent answers?");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_misc),
 	    OID_AUTO, "rack_dsack_ctl", CTLFLAG_RW,
 	    &rack_dsack_std_based, 3,
 	    "How do we process dsack with respect to rack timers, bit field, 3 is standards based?");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_misc),
 	    OID_AUTO, "prr_addback_max", CTLFLAG_RW,
 	    &rack_prr_addbackmax, 2,
 	    "What is the maximum number of MSS we allow to be added back if prr can't send all its data?");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_misc),
 	    OID_AUTO, "stats_gets_ms", CTLFLAG_RW,
 	    &rack_stats_gets_ms_rtt, 1,
 	    "What do we feed the stats framework (1 = ms_rtt, 0 = us_rtt, 2 = ms_rtt from hdwr, > 2 usec rtt from hdwr)?");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_misc),
 	    OID_AUTO, "clientlowbuf", CTLFLAG_RW,
 	    &rack_client_low_buf, 0,
 	    "Client low buffer level (below this we are more aggressive in DGP exiting recovery (0 = off)?");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_misc),
 	    OID_AUTO, "defprofile", CTLFLAG_RW,
 	    &rack_def_profile, 0,
 	    "Should RACK use a default profile (0=no, num == profile num)?");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_misc),
 	    OID_AUTO, "shared_cwnd", CTLFLAG_RW,
 	    &rack_enable_shared_cwnd, 1,
 	    "Should RACK try to use the shared cwnd on connections where allowed");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_misc),
 	    OID_AUTO, "limits_on_scwnd", CTLFLAG_RW,
 	    &rack_limits_scwnd, 1,
 	    "Should RACK place low end time limits on the shared cwnd feature");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_misc),
 	    OID_AUTO, "iMac_dack", CTLFLAG_RW,
 	    &rack_use_imac_dack, 0,
 	    "Should RACK try to emulate iMac delayed ack");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_misc),
 	    OID_AUTO, "no_prr", CTLFLAG_RW,
 	    &rack_disable_prr, 0,
 	    "Should RACK not use prr and only pace (must have pacing on)");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_misc),
 	    OID_AUTO, "bb_verbose", CTLFLAG_RW,
 	    &rack_verbose_logging, 0,
 	    "Should RACK black box logging be verbose");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_misc),
 	    OID_AUTO, "data_after_close", CTLFLAG_RW,
 	    &rack_ignore_data_after_close, 1,
 	    "Do we hold off sending a RST until all pending data is ack'd");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_misc),
 	    OID_AUTO, "no_sack_needed", CTLFLAG_RW,
 	    &rack_sack_not_required, 1,
 	    "Do we allow rack to run on connections not supporting SACK");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_misc),
 	    OID_AUTO, "prr_sendalot", CTLFLAG_RW,
 	    &rack_send_a_lot_in_prr, 1,
 	    "Send a lot in prr");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_misc),
 	    OID_AUTO, "autoscale", CTLFLAG_RW,
 	    &rack_autosndbuf_inc, 20,
 	    "What percentage should rack scale up its snd buffer by?");
 	/* Sack Attacker detection stuff */
 	SYSCTL_ADD_U32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_attack),
 	    OID_AUTO, "detect_highsackratio", CTLFLAG_RW,
 	    &rack_highest_sack_thresh_seen, 0,
 	    "Highest sack to ack ratio seen");
 	SYSCTL_ADD_U32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_attack),
 	    OID_AUTO, "detect_highmoveratio", CTLFLAG_RW,
 	    &rack_highest_move_thresh_seen, 0,
 	    "Highest move to non-move ratio seen");
 	rack_ack_total = counter_u64_alloc(M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_attack),
 	    OID_AUTO, "acktotal", CTLFLAG_RD,
 	    &rack_ack_total,
 	    "Total number of Ack's");
 	rack_express_sack = counter_u64_alloc(M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_attack),
 	    OID_AUTO, "exp_sacktotal", CTLFLAG_RD,
 	    &rack_express_sack,
 	    "Total expresss number of Sack's");
 	rack_sack_total = counter_u64_alloc(M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_attack),
 	    OID_AUTO, "sacktotal", CTLFLAG_RD,
 	    &rack_sack_total,
 	    "Total number of SACKs");
 	rack_move_none = counter_u64_alloc(M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_attack),
 	    OID_AUTO, "move_none", CTLFLAG_RD,
 	    &rack_move_none,
 	    "Total number of SACK index reuse of positions under threshold");
 	rack_move_some = counter_u64_alloc(M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_attack),
 	    OID_AUTO, "move_some", CTLFLAG_RD,
 	    &rack_move_some,
 	    "Total number of SACK index reuse of positions over threshold");
 	rack_sack_attacks_detected = counter_u64_alloc(M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_attack),
 	    OID_AUTO, "attacks", CTLFLAG_RD,
 	    &rack_sack_attacks_detected,
 	    "Total number of SACK attackers that had sack disabled");
 	rack_sack_attacks_reversed = counter_u64_alloc(M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_attack),
 	    OID_AUTO, "reversed", CTLFLAG_RD,
 	    &rack_sack_attacks_reversed,
 	    "Total number of SACK attackers that were later determined false positive");
 	rack_sack_used_next_merge = counter_u64_alloc(M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_attack),
 	    OID_AUTO, "nextmerge", CTLFLAG_RD,
 	    &rack_sack_used_next_merge,
 	    "Total number of times we used the next merge");
 	rack_sack_used_prev_merge = counter_u64_alloc(M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_attack),
 	    OID_AUTO, "prevmerge", CTLFLAG_RD,
 	    &rack_sack_used_prev_merge,
 	    "Total number of times we used the prev merge");
 	/* Counters */
 	rack_fto_send = counter_u64_alloc(M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_counters),
 	    OID_AUTO, "fto_send", CTLFLAG_RD,
 	    &rack_fto_send, "Total number of rack_fast_output sends");
 	rack_fto_rsm_send = counter_u64_alloc(M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_counters),
 	    OID_AUTO, "fto_rsm_send", CTLFLAG_RD,
 	    &rack_fto_rsm_send, "Total number of rack_fast_rsm_output sends");
 	rack_nfto_resend = counter_u64_alloc(M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_counters),
 	    OID_AUTO, "nfto_resend", CTLFLAG_RD,
 	    &rack_nfto_resend, "Total number of rack_output retransmissions");
 	rack_non_fto_send = counter_u64_alloc(M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_counters),
 	    OID_AUTO, "nfto_send", CTLFLAG_RD,
 	    &rack_non_fto_send, "Total number of rack_output first sends");
 	rack_extended_rfo = counter_u64_alloc(M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_counters),
 	    OID_AUTO, "rfo_extended", CTLFLAG_RD,
 	    &rack_extended_rfo, "Total number of times we extended rfo");
 
 	rack_hw_pace_init_fail = counter_u64_alloc(M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_counters),
 	    OID_AUTO, "hwpace_init_fail", CTLFLAG_RD,
 	    &rack_hw_pace_init_fail, "Total number of times we failed to initialize hw pacing");
 	rack_hw_pace_lost = counter_u64_alloc(M_WAITOK);
 
 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_counters),
 	    OID_AUTO, "hwpace_lost", CTLFLAG_RD,
 	    &rack_hw_pace_lost, "Total number of times we failed to initialize hw pacing");
 	rack_tlp_tot = counter_u64_alloc(M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_counters),
 	    OID_AUTO, "tlp_to_total", CTLFLAG_RD,
 	    &rack_tlp_tot,
 	    "Total number of tail loss probe expirations");
 	rack_tlp_newdata = counter_u64_alloc(M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_counters),
 	    OID_AUTO, "tlp_new", CTLFLAG_RD,
 	    &rack_tlp_newdata,
 	    "Total number of tail loss probe sending new data");
 	rack_tlp_retran = counter_u64_alloc(M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_counters),
 	    OID_AUTO, "tlp_retran", CTLFLAG_RD,
 	    &rack_tlp_retran,
 	    "Total number of tail loss probe sending retransmitted data");
 	rack_tlp_retran_bytes = counter_u64_alloc(M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_counters),
 	    OID_AUTO, "tlp_retran_bytes", CTLFLAG_RD,
 	    &rack_tlp_retran_bytes,
 	    "Total bytes of tail loss probe sending retransmitted data");
 	rack_to_tot = counter_u64_alloc(M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_counters),
 	    OID_AUTO, "rack_to_tot", CTLFLAG_RD,
 	    &rack_to_tot,
 	    "Total number of times the rack to expired");
 	rack_saw_enobuf = counter_u64_alloc(M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_counters),
 	    OID_AUTO, "saw_enobufs", CTLFLAG_RD,
 	    &rack_saw_enobuf,
 	    "Total number of times a sends returned enobuf for non-hdwr paced connections");
 	rack_saw_enobuf_hw = counter_u64_alloc(M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_counters),
 	    OID_AUTO, "saw_enobufs_hw", CTLFLAG_RD,
 	    &rack_saw_enobuf_hw,
 	    "Total number of times a send returned enobuf for hdwr paced connections");
 	rack_saw_enetunreach = counter_u64_alloc(M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_counters),
 	    OID_AUTO, "saw_enetunreach", CTLFLAG_RD,
 	    &rack_saw_enetunreach,
 	    "Total number of times a send received a enetunreachable");
 	rack_hot_alloc = counter_u64_alloc(M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_counters),
 	    OID_AUTO, "alloc_hot", CTLFLAG_RD,
 	    &rack_hot_alloc,
 	    "Total allocations from the top of our list");
 	rack_to_alloc = counter_u64_alloc(M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_counters),
 	    OID_AUTO, "allocs", CTLFLAG_RD,
 	    &rack_to_alloc,
 	    "Total allocations of tracking structures");
 	rack_to_alloc_hard = counter_u64_alloc(M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_counters),
 	    OID_AUTO, "allochard", CTLFLAG_RD,
 	    &rack_to_alloc_hard,
 	    "Total allocations done with sleeping the hard way");
 	rack_to_alloc_emerg = counter_u64_alloc(M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_counters),
 	    OID_AUTO, "allocemerg", CTLFLAG_RD,
 	    &rack_to_alloc_emerg,
 	    "Total allocations done from emergency cache");
 	rack_to_alloc_limited = counter_u64_alloc(M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_counters),
 	    OID_AUTO, "alloc_limited", CTLFLAG_RD,
 	    &rack_to_alloc_limited,
 	    "Total allocations dropped due to limit");
 	rack_alloc_limited_conns = counter_u64_alloc(M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_counters),
 	    OID_AUTO, "alloc_limited_conns", CTLFLAG_RD,
 	    &rack_alloc_limited_conns,
 	    "Connections with allocations dropped due to limit");
 	rack_split_limited = counter_u64_alloc(M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_counters),
 	    OID_AUTO, "split_limited", CTLFLAG_RD,
 	    &rack_split_limited,
 	    "Split allocations dropped due to limit");
 	rack_persists_sends = counter_u64_alloc(M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_counters),
 	    OID_AUTO, "persist_sends", CTLFLAG_RD,
 	    &rack_persists_sends,
 	    "Number of times we sent a persist probe");
 	rack_persists_acks = counter_u64_alloc(M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_counters),
 	    OID_AUTO, "persist_acks", CTLFLAG_RD,
 	    &rack_persists_acks,
 	    "Number of times a persist probe was acked");
 	rack_persists_loss = counter_u64_alloc(M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_counters),
 	    OID_AUTO, "persist_loss", CTLFLAG_RD,
 	    &rack_persists_loss,
 	    "Number of times we detected a lost persist probe (no ack)");
 	rack_persists_lost_ends = counter_u64_alloc(M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_counters),
 	    OID_AUTO, "persist_loss_ends", CTLFLAG_RD,
 	    &rack_persists_lost_ends,
 	    "Number of lost persist probe (no ack) that the run ended with a PERSIST abort");
 #ifdef INVARIANTS
 	rack_adjust_map_bw = counter_u64_alloc(M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_counters),
 	    OID_AUTO, "map_adjust_req", CTLFLAG_RD,
 	    &rack_adjust_map_bw,
 	    "Number of times we hit the case where the sb went up and down on a sendmap entry");
 #endif
 	rack_multi_single_eq = counter_u64_alloc(M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_counters),
 	    OID_AUTO, "cmp_ack_equiv", CTLFLAG_RD,
 	    &rack_multi_single_eq,
 	    "Number of compressed acks total represented");
 	rack_proc_non_comp_ack = counter_u64_alloc(M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_counters),
 	    OID_AUTO, "cmp_ack_not", CTLFLAG_RD,
 	    &rack_proc_non_comp_ack,
 	    "Number of non compresseds acks that we processed");
 
 
 	rack_sack_proc_all = counter_u64_alloc(M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_counters),
 	    OID_AUTO, "sack_long", CTLFLAG_RD,
 	    &rack_sack_proc_all,
 	    "Total times we had to walk whole list for sack processing");
 	rack_sack_proc_restart = counter_u64_alloc(M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_counters),
 	    OID_AUTO, "sack_restart", CTLFLAG_RD,
 	    &rack_sack_proc_restart,
 	    "Total times we had to walk whole list due to a restart");
 	rack_sack_proc_short = counter_u64_alloc(M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_counters),
 	    OID_AUTO, "sack_short", CTLFLAG_RD,
 	    &rack_sack_proc_short,
 	    "Total times we took shortcut for sack processing");
 	rack_sack_skipped_acked = counter_u64_alloc(M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_attack),
 	    OID_AUTO, "skipacked", CTLFLAG_RD,
 	    &rack_sack_skipped_acked,
 	    "Total number of times we skipped previously sacked");
 	rack_sack_splits = counter_u64_alloc(M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_attack),
 	    OID_AUTO, "ofsplit", CTLFLAG_RD,
 	    &rack_sack_splits,
 	    "Total number of times we did the old fashion tree split");
 	rack_input_idle_reduces = counter_u64_alloc(M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_counters),
 	    OID_AUTO, "idle_reduce_oninput", CTLFLAG_RD,
 	    &rack_input_idle_reduces,
 	    "Total number of idle reductions on input");
 	rack_collapsed_win_seen = counter_u64_alloc(M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_counters),
 	    OID_AUTO, "collapsed_win_seen", CTLFLAG_RD,
 	    &rack_collapsed_win_seen,
 	    "Total number of collapsed window events seen (where our window shrinks)");
 
 	rack_collapsed_win = counter_u64_alloc(M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_counters),
 	    OID_AUTO, "collapsed_win", CTLFLAG_RD,
 	    &rack_collapsed_win,
 	    "Total number of collapsed window events where we mark packets");
 	rack_collapsed_win_rxt = counter_u64_alloc(M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_counters),
 	    OID_AUTO, "collapsed_win_rxt", CTLFLAG_RD,
 	    &rack_collapsed_win_rxt,
 	    "Total number of packets that were retransmitted");
 	rack_collapsed_win_rxt_bytes = counter_u64_alloc(M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_counters),
 	    OID_AUTO, "collapsed_win_bytes", CTLFLAG_RD,
 	    &rack_collapsed_win_rxt_bytes,
 	    "Total number of bytes that were retransmitted");
 	rack_try_scwnd = counter_u64_alloc(M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_counters),
 	    OID_AUTO, "tried_scwnd", CTLFLAG_RD,
 	    &rack_try_scwnd,
 	    "Total number of scwnd attempts");
 	COUNTER_ARRAY_ALLOC(rack_out_size, TCP_MSS_ACCT_SIZE, M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64_ARRAY(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "outsize", CTLFLAG_RD,
 	    rack_out_size, TCP_MSS_ACCT_SIZE, "MSS send sizes");
 	COUNTER_ARRAY_ALLOC(rack_opts_arry, RACK_OPTS_SIZE, M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64_ARRAY(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "opts", CTLFLAG_RD,
 	    rack_opts_arry, RACK_OPTS_SIZE, "RACK Option Stats");
 	SYSCTL_ADD_PROC(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "clear", CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE,
 	    &rack_clear_counter, 0, sysctl_rack_clear, "IU", "Clear counters");
 }
 
 static __inline int
 rb_map_cmp(struct rack_sendmap *b, struct rack_sendmap *a)
 {
 	/*
 	 * Three-way comparison of two sendmap entries.  Only the
 	 * starting sequence of b is examined; it is tested against
 	 * the range [a->r_start, a->r_end).
 	 *
 	 * Returns 0 when b starts inside a:
 	 *   a --   |-------------|
 	 *   b --       |------|
 	 *
 	 * Returns 1 when b starts at or after the end of a, so a
 	 * is considered the smaller entry:
 	 *   a --   |------|
 	 *   b --          |--------|
 	 *
 	 * Returns -1 in every remaining case, where a is considered
 	 * the larger entry:
 	 *   a --         |-------|
 	 *   b --  |---|
 	 */
 	const int b_at_or_past_a_start = SEQ_GEQ(b->r_start, a->r_start);
 
 	if (b_at_or_past_a_start && SEQ_LT(b->r_start, a->r_end))
 		return (0);
 	return (SEQ_GEQ(b->r_start, a->r_end) ? 1 : -1);
 }
 
 /*
  * Declare and then emit the tree routines for rack_rb_tree_head.
  * Elements are struct rack_sendmap, linked through their r_next
  * member and ordered by rb_map_cmp().
  */
 RB_PROTOTYPE(rack_rb_tree_head, rack_sendmap, r_next, rb_map_cmp);
 RB_GENERATE(rack_rb_tree_head, rack_sendmap, r_next, rb_map_cmp);
 
 static uint32_t
 rc_init_window(struct tcp_rack *rack)
 {
 	/*
 	 * Return the initial window in bytes.  A non-zero
 	 * rc_init_win is treated as a segment count and scaled by
 	 * the fixed maxseg; otherwise we defer to the value computed
 	 * by tcp_compute_initwnd().
 	 */
 	if (rack->rc_init_win != 0)
 		return (ctf_fixed_maxseg(rack->rc_tp) * rack->rc_init_win);
 
 	/* Nothing set by the user, use the system stack default. */
 	return (tcp_compute_initwnd(tcp_maxseg(rack->rc_tp)));
 }
 
 static uint64_t
 rack_get_fixed_pacing_bw(struct tcp_rack *rack)
 {
 	/*
 	 * Select one of the three fixed pacing rates: the recovery
 	 * rate while in fast recovery, the slow-start rate while
 	 * cwnd_to_use is below ssthresh, and the congestion-avoidance
 	 * rate otherwise.
 	 */
 	if (IN_FASTRECOVERY(rack->rc_tp->t_flags))
 		return (rack->r_ctl.rc_fixed_pacing_rate_rec);
 	if (rack->r_ctl.cwnd_to_use < rack->rc_tp->snd_ssthresh)
 		return (rack->r_ctl.rc_fixed_pacing_rate_ss);
 	return (rack->r_ctl.rc_fixed_pacing_rate_ca);
 }
 
 static uint64_t
 rack_get_bw(struct tcp_rack *rack)
 {
 	/*
 	 * Return the bandwidth the connection should currently be
 	 * treated as having.  In order of preference this is the fixed
 	 * pacing rate, the goodput measurement, a user supplied
 	 * initial rate, or a guess derived from the TCP initial window
 	 * and the srtt.  Zero is returned when none of these exist.
 	 */
 	uint64_t bw;
 
 	if (rack->use_fixed_rate) {
 		/* Return the fixed pacing rate */
 		return (rack_get_fixed_pacing_bw(rack));
 	}
 	if (rack->r_ctl.gp_bw != 0) {
 		/*
 		 * We have a measurement.  Once RACK_REQ_AVG measurements
 		 * have been taken gp_bw is used directly; before that
 		 * it is divided by the number of measurements so far.
 		 */
 		if (rack->r_ctl.num_measurements >= RACK_REQ_AVG)
 			bw = rack->r_ctl.gp_bw;
 		else
 			bw = rack->r_ctl.gp_bw / rack->r_ctl.num_measurements;
 #ifdef NETFLIX_PEAKRATE
 		/*
 		 * The user has set a peak rate to pace at; don't
 		 * allow us to pace faster than that.
 		 */
 		if ((rack->rc_tp->t_maxpeakrate) &&
 		    (bw > rack->rc_tp->t_maxpeakrate))
 			return (rack->rc_tp->t_maxpeakrate);
 #endif
 		if (rack->r_ctl.bw_rate_cap && (bw > rack->r_ctl.bw_rate_cap))
 			bw = rack->r_ctl.bw_rate_cap;
 		return (bw);
 	}
 	/*
 	 * No b/w measurement yet.  A user set initial rate wins if
 	 * present.  Failing that, and provided we have an srtt, the
 	 * TCP initial window is spread over one srtt to produce a
 	 * rough estimate.  The rack specific initial window is
 	 * deliberately not used so that a large configured IW does
 	 * not turn into a "huge" b/w.
 	 */
 	if (rack->r_ctl.init_rate)
 		return (rack->r_ctl.init_rate);
 
 	/* Has the user set a max peak rate? */
 #ifdef NETFLIX_PEAKRATE
 	if (rack->rc_tp->t_maxpeakrate)
 		return (rack->rc_tp->t_maxpeakrate);
 #endif
 	if (rack->rc_tp->t_srtt == 0) {
 		/* No srtt: old pacing method, i.e. burst mitigation only. */
 		return (0);
 	}
 	/* Initial TCP window (not rack's) per srtt, scaled to a second. */
 	bw = tcp_compute_initwnd(tcp_maxseg(rack->rc_tp));
 	bw *= (uint64_t)USECS_IN_SECOND;
 	bw /= (uint64_t)rack->rc_tp->t_srtt;
 	if (rack->r_ctl.bw_rate_cap && (bw > rack->r_ctl.bw_rate_cap))
 		bw = rack->r_ctl.bw_rate_cap;
 	return (bw);
 }
 
 static uint16_t
 rack_get_output_gain(struct tcp_rack *rack, struct rack_sendmap *rsm)
 {
 	/*
 	 * Pick the percentage applied to the b/w estimate for this
 	 * send.  rsm is non-NULL when the send is a retransmission.
 	 */
 	if (rack->use_fixed_rate)
 		return (100);
 	if (rack->in_probe_rtt && (rsm == NULL))
 		return (rack->r_ctl.rack_per_of_gp_probertt);
 	if (IN_FASTRECOVERY(rack->rc_tp->t_flags) &&
 	    rack->r_ctl.rack_per_of_gp_rec) {
 		/* A retransmission always uses the recovery rate. */
 		if (rsm != NULL)
 			return (rack->r_ctl.rack_per_of_gp_rec);
 		if (!rack->rack_rec_nonrxt_use_cr) {
 			/* No PRR, lets just use the b/w estimate only. */
 			if (rack->rack_no_prr &&
 			    (rack->r_ctl.rack_per_of_gp_rec > 100))
 				return (100);
 			/*
 			 * A non-retransmit with no overrides, so
 			 * just use the recovery rate (prr is in
 			 * effect).
 			 */
 			return (rack->r_ctl.rack_per_of_gp_rec);
 		}
 		/* Directed to use the configured rate; drop through. */
 	}
 	/* For the configured rate we look at our cwnd vs the ssthresh. */
 	return ((rack->r_ctl.cwnd_to_use < rack->rc_tp->snd_ssthresh) ?
 	    rack->r_ctl.rack_per_of_gp_ss : rack->r_ctl.rack_per_of_gp_ca);
 }
 
 static void
 rack_log_dsack_event(struct tcp_rack *rack, uint8_t mod, uint32_t flex4, uint32_t flex5, uint32_t flex6)
 {
 	/*
 	 * Emit a RACK_DSACK_HANDLING log record.  The mod value is
 	 * stored in flex8 and identifies the type of event:
 	 * 1 = dsack_persists reduced by 1 via T-O or fast recovery exit.
 	 * 2 = a dsack round begins, persist is reset to 16.
 	 * 3 = a dsack round ends
 	 * 4 = Dsack option increases rack rtt flex5 is the srtt input, flex6 is thresh
 	 * 5 = Socket option set changing the control flags rc_rack_tmr_std_based, rc_rack_use_dsack
 	 * 6 = Final rack rtt, flex4 is srtt and flex6 is final limited thresh.
 	 */
 	union tcp_log_stackspecific log;
 	struct timeval tv;
 
 	/* Nothing to do unless logging is enabled on this connection. */
 	if (rack->rc_tp->t_logstate == TCP_LOG_STATE_OFF)
 		return;
 
 	memset(&log, 0, sizeof(log));
 	/* Caller supplied values pass straight through. */
 	log.u_bbr.flex4 = flex4;
 	log.u_bbr.flex5 = flex5;
 	log.u_bbr.flex6 = flex6;
 	log.u_bbr.flex8 = mod;
 	/*
 	 * flex1 packs three control flags; the first one shifted in
 	 * ends up in the highest position.
 	 */
 	log.u_bbr.flex1 = rack->rc_rack_tmr_std_based;
 	log.u_bbr.flex1 <<= 1;
 	log.u_bbr.flex1 |= rack->rc_rack_use_dsack;
 	log.u_bbr.flex1 <<= 1;
 	log.u_bbr.flex1 |= rack->rc_dsack_round_seen;
 	/* Current dsack bookkeeping. */
 	log.u_bbr.flex2 = rack->r_ctl.dsack_round_end;
 	log.u_bbr.flex3 = rack->r_ctl.num_dsack;
 	log.u_bbr.flex7 = rack->r_ctl.dsack_persist;
 	log.u_bbr.timeStamp = tcp_get_usecs(&tv);
 	TCP_LOG_EVENTP(rack->rc_tp, NULL,
 	    &rack->rc_inp->inp_socket->so_rcv,
 	    &rack->rc_inp->inp_socket->so_snd,
 	    RACK_DSACK_HANDLING, 0,
 	    0, &log, false, &tv);
 }
 
 static void
 rack_log_hdwr_pacing(struct tcp_rack *rack,
 		     uint64_t rate, uint64_t hw_rate, int line,
 		     int error, uint16_t mod)
 {
 	/*
 	 * Emit a BBR_LOG_HDWR_PACE log record.
 	 *
 	 * rate    - stored in bw_inuse.
 	 * hw_rate - split across flex1 (upper 32 bits) and flex2
 	 *           (lower 32 bits).
 	 * line    - stored in flex5; callers pass __LINE__.
 	 * error   - stored in flex6.
 	 * mod     - stored in flex7.
 	 */
 	union tcp_log_stackspecific log;
 	struct timeval tv;
 	const struct ifnet *ifp;
 
 	if (rack->rc_tp->t_logstate == TCP_LOG_STATE_OFF)
 		return;
 
 	memset(&log, 0, sizeof(log));
 	log.u_bbr.flex1 = ((hw_rate >> 32) & 0x00000000ffffffff);
 	log.u_bbr.flex2 = (hw_rate & 0x00000000ffffffff);
 	/*
 	 * Identify the interface: prefer the one behind the current
 	 * rate table entry, else the one on the cached route.
 	 */
 	if (rack->r_ctl.crte) {
 		ifp = rack->r_ctl.crte->ptbl->rs_ifp;
 	} else if (rack->rc_inp->inp_route.ro_nh &&
 		   rack->rc_inp->inp_route.ro_nh->nh_ifp) {
 		ifp = rack->rc_inp->inp_route.ro_nh->nh_ifp;
 	} else
 		ifp = NULL;
 	if (ifp) {
 		/*
 		 * Log the pointer value split into two 32 bit halves.
 		 * Go through uintptr_t so the conversion is well
 		 * formed (and zero extended) when pointers are
 		 * narrower than 64 bits.
 		 */
 		uint64_t ifp_bits = (uint64_t)(uintptr_t)ifp;
 
 		log.u_bbr.flex3 = ((ifp_bits >> 32) & 0x00000000ffffffff);
 		log.u_bbr.flex4 = (ifp_bits & 0x00000000ffffffff);
 	}
 	log.u_bbr.timeStamp = tcp_get_usecs(&tv);
 	log.u_bbr.bw_inuse = rate;
 	log.u_bbr.flex5 = line;
 	log.u_bbr.flex6 = error;
 	log.u_bbr.flex7 = mod;
 	log.u_bbr.applimited = rack->r_ctl.rc_pace_max_segs;
 	/* flex8: use_fixed_rate in bit 1, rack_hdrw_pacing in bit 0. */
 	log.u_bbr.flex8 = rack->use_fixed_rate;
 	log.u_bbr.flex8 <<= 1;
 	log.u_bbr.flex8 |= rack->rack_hdrw_pacing;
 	log.u_bbr.pkts_out = rack->rc_tp->t_maxseg;
 	log.u_bbr.delRate = rack->r_ctl.crte_prev_rate;
 	if (rack->r_ctl.crte)
 		log.u_bbr.cur_del_rate = rack->r_ctl.crte->rate;
 	else
 		log.u_bbr.cur_del_rate = 0;
 	log.u_bbr.rttProp = rack->r_ctl.last_hw_bw_req;
 	TCP_LOG_EVENTP(rack->rc_tp, NULL,
 	    &rack->rc_inp->inp_socket->so_rcv,
 	    &rack->rc_inp->inp_socket->so_snd,
 	    BBR_LOG_HDWR_PACE, 0,
 	    0, &log, false, &tv);
 }
 
 static uint64_t
 rack_get_output_bw(struct tcp_rack *rack, uint64_t bw, struct rack_sendmap *rsm, int *capped)
 {
 	/*
 	 * Scale bw by the rack_per_of_gp_xx percentage that applies
 	 * to this send and, when hardware rate caps are enabled,
 	 * clamp the result to the highest hardware rate.  If capped
 	 * is non-NULL it is set to 1 whenever a clamp is applied; it
 	 * is never cleared here.
 	 */
 	uint64_t bw_est, high_rate;
 
 	bw_est = bw * (uint64_t)rack_get_output_gain(rack, rsm);
 	bw_est /= (uint64_t)100;
 	/* Never fall below the minimum (def 64kbps) */
 	if (bw_est < RACK_MIN_BW)
 		bw_est = RACK_MIN_BW;
 
 	/* Without rate caps there is nothing more to do. */
 	if (!rack->r_rack_hw_rate_caps)
 		return (bw_est);
 
 	if (rack->r_ctl.crte != NULL) {
 		/* We have a hdwr rate already */
 		high_rate = tcp_hw_highest_rate(rack->r_ctl.crte);
 		if (bw_est >= high_rate) {
 			/* We are capping bw at the highest rate table entry */
 			rack_log_hdwr_pacing(rack,
 					     bw_est, high_rate, __LINE__,
 					     0, 3);
 			bw_est = high_rate;
 			if (capped)
 				*capped = 1;
 		}
 		return (bw_est);
 	}
 	if ((rack->rack_hdrw_pacing == 0) &&
 	    (rack->rack_hdw_pace_ena) &&
 	    (rack->rack_attempt_hdwr_pace == 0) &&
 	    (rack->rc_inp->inp_route.ro_nh != NULL) &&
 	    (rack->rc_inp->inp_route.ro_nh->nh_ifp != NULL)) {
 		/*
 		 * Hardware pacing has not been attempted yet, but
 		 * when it is we may turn out to be above the highest
 		 * rate.  Ask for the max b/w of the interface (if it
 		 * supports ratelimiting); a 0 comes back when the
 		 * interface is not found in the RL lists.
 		 */
 		high_rate = tcp_hw_highest_rate_ifp(rack->rc_inp->inp_route.ro_nh->nh_ifp, rack->rc_inp);
 		if (high_rate && (bw_est > high_rate)) {
 			/* Yep, we have a rate and we are above it. */
 			bw_est = high_rate;
 			if (capped)
 				*capped = 1;
 		}
 	}
 	return (bw_est);
 }
 
 static void
 rack_log_retran_reason(struct tcp_rack *rack, struct rack_sendmap *rsm, uint32_t tsused, uint32_t thresh, int mod)
 {
 	/*
 	 * Emit a BBR_LOG_SETTINGS_CHG record describing rsm.  We get
 	 * 3 values currently for mod:
 	 * 1 - We are retransmitting and this tells the reason.
 	 * 2 - We are clearing a dup-ack count.
 	 * 3 - We are incrementing a dup-ack count.
 	 *
 	 * Anything other than 1 is only logged when
 	 * rack_verbose_logging is non-zero.
 	 */
 	union tcp_log_stackspecific log;
 	struct timeval tv;
 
 	if (rack->rc_tp->t_logstate == TCP_LOG_STATE_OFF)
 		return;
 	if ((mod != 1) && (rack_verbose_logging == 0))
 		return;
 
 	memset(&log.u_bbr, 0, sizeof(log.u_bbr));
 	/* Caller supplied values. */
 	log.u_bbr.flex1 = tsused;
 	log.u_bbr.flex2 = thresh;
 	log.u_bbr.flex8 = mod;
 	/* The sendmap entry in question. */
 	log.u_bbr.flex3 = rsm->r_flags;
 	log.u_bbr.flex4 = rsm->r_dupack;
 	log.u_bbr.flex5 = rsm->r_start;
 	log.u_bbr.flex6 = rsm->r_end;
 	/* Connection state. */
 	log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
 	log.u_bbr.timeStamp = tcp_get_usecs(&tv);
 	log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
 	log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto;
 	log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto;
 	log.u_bbr.pacing_gain = rack->r_must_retran;
 	TCP_LOG_EVENTP(rack->rc_tp, NULL,
 	    &rack->rc_inp->inp_socket->so_rcv,
 	    &rack->rc_inp->inp_socket->so_snd,
 	    BBR_LOG_SETTINGS_CHG, 0,
 	    0, &log, false, &tv);
 }
 
 static void
 rack_log_to_start(struct tcp_rack *rack, uint32_t cts, uint32_t to, int32_t slot, uint8_t which)
 {
 	/*
 	 * Emit a BBR_LOG_TIMERSTAR log record.
 	 *
 	 * cts   - not used by this function.
 	 * to    - stored in flex2.
 	 * slot  - stored in flex4.
 	 * which - stored in flex8.
 	 */
 	if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
 		union tcp_log_stackspecific log;
 		struct timeval tv;
 
 		memset(&log.u_bbr, 0, sizeof(log.u_bbr));
 		log.u_bbr.flex1 = rack->rc_tp->t_srtt;
 		log.u_bbr.flex2 = to;
 		log.u_bbr.flex3 = rack->r_ctl.rc_hpts_flags;
 		log.u_bbr.flex4 = slot;
 		log.u_bbr.flex5 = rack->rc_inp->inp_hptsslot;
 		log.u_bbr.flex6 = rack->rc_tp->t_rxtcur;
 		log.u_bbr.flex7 = rack->rc_in_persist;
 		log.u_bbr.flex8 = which;
 		log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
 		log.u_bbr.timeStamp = tcp_get_usecs(&tv);
 		log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
 		/*
 		 * pkts_out carries rc_out_at_rto.  An earlier store of
 		 * the PRR send count into this same field was always
 		 * overwritten by this assignment before the record
 		 * was emitted, so that dead store has been dropped.
 		 */
 		log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto;
 		log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto;
 		log.u_bbr.pacing_gain = rack->r_must_retran;
 		log.u_bbr.cwnd_gain = rack->rc_has_collapsed;
 		log.u_bbr.lt_epoch = rack->rc_tp->t_rxtshift;
 		log.u_bbr.lost = rack_rto_min;
 		TCP_LOG_EVENTP(rack->rc_tp, NULL,
 		    &rack->rc_inp->inp_socket->so_rcv,
 		    &rack->rc_inp->inp_socket->so_snd,
 		    BBR_LOG_TIMERSTAR, 0,
 		    0, &log, false, &tv);
 	}
 }
 
 static void
 rack_log_to_event(struct tcp_rack *rack, int32_t to_num, struct rack_sendmap *rsm)
 {
 	/*
 	 * Emit a BBR_LOG_RTO record.  to_num is stored in flex8; rsm
 	 * may be NULL, in which case the logged length (flex3) is 0.
 	 */
 	union tcp_log_stackspecific log;
 	struct timeval tv;
 
 	if (rack->rc_tp->t_logstate == TCP_LOG_STATE_OFF)
 		return;
 
 	memset(&log.u_bbr, 0, sizeof(log.u_bbr));
 	log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
 	log.u_bbr.flex8 = to_num;
 	log.u_bbr.flex1 = rack->r_ctl.rc_rack_min_rtt;
 	log.u_bbr.flex2 = rack->rc_rack_rtt;
 	/* Length of the entry, when there is one. */
 	if (rsm != NULL)
 		log.u_bbr.flex3 = rsm->r_end - rsm->r_start;
 	else
 		log.u_bbr.flex3 = 0;
 	/* The PRR send count is reported as 0 when rack_no_prr is set. */
 	if (!rack->rack_no_prr)
 		log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt;
 	else
 		log.u_bbr.flex5 = 0;
 	log.u_bbr.timeStamp = tcp_get_usecs(&tv);
 	log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
 	log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto;
 	log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto;
 	log.u_bbr.pacing_gain = rack->r_must_retran;
 	TCP_LOG_EVENTP(rack->rc_tp, NULL,
 	    &rack->rc_inp->inp_socket->so_rcv,
 	    &rack->rc_inp->inp_socket->so_snd,
 	    BBR_LOG_RTO, 0,
 	    0, &log, false, &tv);
 }
 
 static void
 rack_log_map_chg(struct tcpcb *tp, struct tcp_rack *rack,
 		 struct rack_sendmap *prev,
 		 struct rack_sendmap *rsm,
 		 struct rack_sendmap *next,
 		 int flag, uint32_t th_ack, int line)
 {
 	/*
 	 * Emit a TCP_LOG_MAPCHG log record; only done when
 	 * rack_verbose_logging is non-zero and logging is enabled
 	 * on tp.
 	 *
 	 * prev/rsm/next - up to three sendmap entries, any of which
 	 *                 may be NULL.  Their addresses are logged
 	 *                 along with their start/end sequences, and
 	 *                 flex7 gets bit 0x4/0x2/0x1 set for each
 	 *                 of prev/rsm/next that is present.
 	 * flag          - stored in flex8.
 	 * th_ack        - stored in pkts_out.
 	 * line          - stored in applimited.
 	 */
 	if (rack_verbose_logging && (tp->t_logstate != TCP_LOG_STATE_OFF)) {
 		union tcp_log_stackspecific log;
 		struct timeval tv;
 
 		memset(&log.u_bbr, 0, sizeof(log.u_bbr));
 		log.u_bbr.flex8 = flag;
 		log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
 		/*
 		 * Convert the pointers via uintptr_t so the cast is
 		 * well formed (and zero extended) when pointers are
 		 * narrower than 64 bits.
 		 */
 		log.u_bbr.cur_del_rate = (uint64_t)(uintptr_t)prev;
 		log.u_bbr.delRate = (uint64_t)(uintptr_t)rsm;
 		log.u_bbr.rttProp = (uint64_t)(uintptr_t)next;
 		log.u_bbr.flex7 = 0;
 		if (prev) {
 			log.u_bbr.flex1 = prev->r_start;
 			log.u_bbr.flex2 = prev->r_end;
 			log.u_bbr.flex7 |= 0x4;
 		}
 		if (rsm) {
 			log.u_bbr.flex3 = rsm->r_start;
 			log.u_bbr.flex4 = rsm->r_end;
 			log.u_bbr.flex7 |= 0x2;
 		}
 		if (next) {
 			log.u_bbr.flex5 = next->r_start;
 			log.u_bbr.flex6 = next->r_end;
 			log.u_bbr.flex7 |= 0x1;
 		}
 		log.u_bbr.applimited = line;
 		log.u_bbr.pkts_out = th_ack;
 		log.u_bbr.timeStamp = tcp_get_usecs(&tv);
 		log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
 		/* The PRR send count is reported as 0 when rack_no_prr is set. */
 		if (rack->rack_no_prr)
 			log.u_bbr.lost = 0;
 		else
 			log.u_bbr.lost = rack->r_ctl.rc_prr_sndcnt;
 		TCP_LOG_EVENTP(rack->rc_tp, NULL,
 		    &rack->rc_inp->inp_socket->so_rcv,
 		    &rack->rc_inp->inp_socket->so_snd,
 		    TCP_LOG_MAPCHG, 0,
 		    0, &log, false, &tv);
 	}
 }
 
 static void
 rack_log_rtt_upd(struct tcpcb *tp, struct tcp_rack *rack, uint32_t t, uint32_t len,
 		 struct rack_sendmap *rsm, int conf)
 {
 	if (tp->t_logstate != TCP_LOG_STATE_OFF) {
 		union tcp_log_stackspecific log;
 		struct timeval tv;
 		memset(&log.u_bbr, 0, sizeof(log.u_bbr));
 		log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
 		log.u_bbr.flex1 = t;
 		log.u_bbr.flex2 = len;
 		log.u_bbr.flex3 = rack->r_ctl.rc_rack_min_rtt;
 		log.u_bbr.flex4 = rack->r_ctl.rack_rs.rs_rtt_lowest;
 		log.u_bbr.flex5 = rack->r_ctl.rack_rs.rs_rtt_highest;
 		log.u_bbr.flex6 = rack->r_ctl.rack_rs.rs_us_rtrcnt;
 		log.u_bbr.flex7 = conf;
 		log.u_bbr.rttProp = (uint64_t)rack->r_ctl.rack_rs.rs_rtt_tot;
 		log.u_bbr.flex8 = rack->r_ctl.rc_rate_sample_method;
 		log.u_bbr.timeStamp = tcp_get_usecs(&tv);
 		log.u_bbr.delivered = rack->r_ctl.rack_rs.rs_us_rtrcnt;
 		log.u_bbr.pkts_out = rack->r_ctl.rack_rs.rs_flags;
 		log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
 		if (rsm) {
 			log.u_bbr.pkt_epoch = rsm->r_start;
 			log.u_bbr.lost = rsm->r_end;
 			log.u_bbr.cwnd_gain = rsm->r_rtr_cnt;
 			/* We loose any upper of the 24 bits */
 			log.u_bbr.pacing_gain = (uint16_t)rsm->r_flags;
 		} else {
 			/* Its a SYN */
 			log.u_bbr.pkt_epoch = rack->rc_tp->iss;
 			log.u_bbr.lost = 0;
 			log.u_bbr.cwnd_gain = 0;
 			log.u_bbr.pacing_gain = 0;
 		}
 		/* Write out general bits of interest rrs here */
 		log.u_bbr.use_lt_bw = rack->rc_highly_buffered;
 		log.u_bbr.use_lt_bw <<= 1;
 		log.u_bbr.use_lt_bw |= rack->forced_ack;
 		log.u_bbr.use_lt_bw <<= 1;
 		log.u_bbr.use_lt_bw |= rack->rc_gp_dyn_mul;
 		log.u_bbr.use_lt_bw <<= 1;
 		log.u_bbr.use_lt_bw |= rack->in_probe_rtt;
 		log.u_bbr.use_lt_bw <<= 1;
 		log.u_bbr.use_lt_bw |= rack->measure_saw_probe_rtt;
 		log.u_bbr.use_lt_bw <<= 1;
 		log.u_bbr.use_lt_bw |= rack->app_limited_needs_set;
 		log.u_bbr.use_lt_bw <<= 1;
 		log.u_bbr.use_lt_bw |= rack->rc_gp_filled;
 		log.u_bbr.use_lt_bw <<= 1;
 		log.u_bbr.use_lt_bw |= rack->rc_dragged_bottom;
 		log.u_bbr.applimited = rack->r_ctl.rc_target_probertt_flight;
 		log.u_bbr.epoch = rack->r_ctl.rc_time_probertt_starts;
 		log.u_bbr.lt_epoch = rack->r_ctl.rc_time_probertt_entered;
 		log.u_bbr.cur_del_rate = rack->r_ctl.rc_lower_rtt_us_cts;
 		log.u_bbr.delRate = rack->r_ctl.rc_gp_srtt;
 		log.u_bbr.bw_inuse = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time);
 		log.u_bbr.bw_inuse <<= 32;
 		if (rsm)
 			log.u_bbr.bw_inuse |= ((uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)]);
 		TCP_LOG_EVENTP(tp, NULL,
 		    &rack->rc_inp->inp_socket->so_rcv,
 		    &rack->rc_inp->inp_socket->so_snd,
 		    BBR_LOG_BBRRTT, 0,
 		    0, &log, false, &tv);
 
 
 	}
 }
 
 static void
 rack_log_rtt_sample(struct tcp_rack *rack, uint32_t rtt)
 {
 	/*
 	 * Log the rtt sample we are applying to the srtt algorithm
 	 * in useconds, as a TCP_LOG_RTT record with flex7 set to 1.
 	 */
 	union tcp_log_stackspecific log;
 	struct timeval tv;
 
 	if (rack->rc_tp->t_logstate == TCP_LOG_STATE_OFF)
 		return;
 
 	memset(&log, 0, sizeof(log));
 	log.u_bbr.flex1 = rtt;
 	log.u_bbr.flex7 = 1;
 	/* Ack/sack accounting. */
 	log.u_bbr.flex2 = rack->r_ctl.ack_count;
 	log.u_bbr.flex3 = rack->r_ctl.sack_count;
 	log.u_bbr.flex4 = rack->r_ctl.sack_noextra_move;
 	log.u_bbr.flex5 = rack->r_ctl.sack_moved_extra;
 	log.u_bbr.flex8 = rack->sack_attack_disable;
 	log.u_bbr.flex6 = rack->rc_tp->t_rxtcur;
 	log.u_bbr.timeStamp = tcp_get_usecs(&tv);
 	log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
 	log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto;
 	log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto;
 	log.u_bbr.pacing_gain = rack->r_must_retran;
 	/*
 	 * delRate carries the declared confidence level in its upper
 	 * 32 bits and the actual RTT, taken from the arrival
 	 * timestamp, in its lower 32 bits.
 	 */
 	log.u_bbr.delRate = rack->r_ctl.rack_rs.confidence;
 	log.u_bbr.delRate <<= 32;
 	log.u_bbr.delRate |= rack->r_ctl.rack_rs.rs_us_rtt;
 	/* Lets capture all the things that make up t_rtxcur */
 	log.u_bbr.applimited = rack_rto_min;
 	log.u_bbr.epoch = rack_rto_max;
 	log.u_bbr.lt_epoch = rack->r_ctl.timer_slop;
 	log.u_bbr.lost = rack_rto_min;
 	log.u_bbr.pkt_epoch = TICKS_2_USEC(tcp_rexmit_slop);
 	log.u_bbr.rttProp = RACK_REXMTVAL(rack->rc_tp);
 	/* bw_inuse holds act_rcv_time flattened to microseconds. */
 	log.u_bbr.bw_inuse = rack->r_ctl.act_rcv_time.tv_sec;
 	log.u_bbr.bw_inuse *= HPTS_USEC_IN_SEC;
 	log.u_bbr.bw_inuse += rack->r_ctl.act_rcv_time.tv_usec;
 	TCP_LOG_EVENTP(rack->rc_tp, NULL,
 	    &rack->rc_inp->inp_socket->so_rcv,
 	    &rack->rc_inp->inp_socket->so_snd,
 	    TCP_LOG_RTT, 0,
 	    0, &log, false, &tv);
 }
 
 static void
 rack_log_rtt_sample_calc(struct tcp_rack *rack, uint32_t rtt, uint32_t send_time, uint32_t ack_time, int where)
 {
 	/*
 	 * Verbose-only TCP_LOG_RTT record (flex7 set to 2) holding
 	 * the inputs of an rtt calculation: the rtt itself, the send
 	 * and ack times, and a caller chosen "where" marker.
 	 */
 	union tcp_log_stackspecific log;
 	struct timeval tv;
 
 	if (!rack_verbose_logging)
 		return;
 	if (rack->rc_tp->t_logstate == TCP_LOG_STATE_OFF)
 		return;
 
 	memset(&log, 0, sizeof(log));
 	log.u_bbr.flex1 = rtt;
 	log.u_bbr.flex2 = send_time;
 	log.u_bbr.flex3 = ack_time;
 	log.u_bbr.flex4 = where;
 	log.u_bbr.flex7 = 2;
 	log.u_bbr.timeStamp = tcp_get_usecs(&tv);
 	TCP_LOG_EVENTP(rack->rc_tp, NULL,
 	    &rack->rc_inp->inp_socket->so_rcv,
 	    &rack->rc_inp->inp_socket->so_snd,
 	    TCP_LOG_RTT, 0,
 	    0, &log, false, &tv);
 }
 
 
 
 static inline void
 rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick,  int event, int line)
 {
 	/*
 	 * Verbose-only BBR_LOG_PROGRESS record.  line, tick and
 	 * event are stored in flex1, flex2 and flex8; the unack and
 	 * ack times of tp go in flex3 and flex4.
 	 */
 	union tcp_log_stackspecific log;
 	struct timeval tv;
 
 	if (!rack_verbose_logging)
 		return;
 	if (tp->t_logstate == TCP_LOG_STATE_OFF)
 		return;
 
 	memset(&log.u_bbr, 0, sizeof(log.u_bbr));
 	log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
 	/* Caller supplied values. */
 	log.u_bbr.flex1 = line;
 	log.u_bbr.flex2 = tick;
 	log.u_bbr.flex8 = event;
 	/* Progress timers from the tcpcb. */
 	log.u_bbr.flex3 = tp->t_maxunacktime;
 	log.u_bbr.flex4 = tp->t_acktime;
 	log.u_bbr.timeStamp = tcp_get_usecs(&tv);
 	log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
 	log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto;
 	log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto;
 	log.u_bbr.pacing_gain = rack->r_must_retran;
 	TCP_LOG_EVENTP(tp, NULL,
 	    &rack->rc_inp->inp_socket->so_rcv,
 	    &rack->rc_inp->inp_socket->so_snd,
 	    BBR_LOG_PROGRESS, 0,
 	    0, &log, false, &tv);
 }
 
 static void
 rack_log_type_bbrsnd(struct tcp_rack *rack, uint32_t len, uint32_t slot, uint32_t cts, struct timeval *tv)
 {
 	if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
 		union tcp_log_stackspecific log;
 
 		memset(&log.u_bbr, 0, sizeof(log.u_bbr));
 		log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
 		log.u_bbr.flex1 = slot;
 		if (rack->rack_no_prr)
 			log.u_bbr.flex2 = 0;
 		else
 			log.u_bbr.flex2 = rack->r_ctl.rc_prr_sndcnt;
 		log.u_bbr.flex7 = (0x0000ffff & rack->r_ctl.rc_hpts_flags);
 		log.u_bbr.flex8 = rack->rc_in_persist;
 		log.u_bbr.timeStamp = cts;
 		log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
 		log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto;
 		log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto;
 		log.u_bbr.pacing_gain = rack->r_must_retran;
 		TCP_LOG_EVENTP(rack->rc_tp, NULL,
 		    &rack->rc_inp->inp_socket->so_rcv,
 		    &rack->rc_inp->inp_socket->so_snd,
 		    BBR_LOG_BBRSND, 0,
 		    0, &log, false, tv);
 	}
 }
 
 static void
 rack_log_doseg_done(struct tcp_rack *rack, uint32_t cts, int32_t nxt_pkt, int32_t did_out, int way_out, int nsegs)
 {
 	if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
 		union tcp_log_stackspecific log;
 		struct timeval tv;
 
 		memset(&log, 0, sizeof(log));
 		log.u_bbr.flex1 = did_out;
 		log.u_bbr.flex2 = nxt_pkt;
 		log.u_bbr.flex3 = way_out;
 		log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags;
 		if (rack->rack_no_prr)
 			log.u_bbr.flex5 = 0;
 		else
 			log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt;
 		log.u_bbr.flex6 = nsegs;
 		log.u_bbr.applimited = rack->r_ctl.rc_pace_min_segs;
 		log.u_bbr.flex7 = rack->rc_ack_can_sendout_data;	/* Do we have ack-can-send set */
 		log.u_bbr.flex7 <<= 1;
 		log.u_bbr.flex7 |= rack->r_fast_output;	/* is fast output primed */
 		log.u_bbr.flex7 <<= 1;
 		log.u_bbr.flex7 |= rack->r_wanted_output;	/* Do we want output */
 		log.u_bbr.flex8 = rack->rc_in_persist;
 		log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
 		log.u_bbr.timeStamp = tcp_get_usecs(&tv);
 		log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
 		log.u_bbr.use_lt_bw = rack->r_ent_rec_ns;
 		log.u_bbr.use_lt_bw <<= 1;
 		log.u_bbr.use_lt_bw |= rack->r_might_revert;
 		log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto;
 		log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto;
 		log.u_bbr.pacing_gain = rack->r_must_retran;
 		TCP_LOG_EVENTP(rack->rc_tp, NULL,
 		    &rack->rc_inp->inp_socket->so_rcv,
 		    &rack->rc_inp->inp_socket->so_snd,
 		    BBR_LOG_DOSEG_DONE, 0,
 		    0, &log, false, &tv);
 	}
 }
 
 static void
 rack_log_type_pacing_sizes(struct tcpcb *tp, struct tcp_rack *rack, uint32_t arg1, uint32_t arg2, uint32_t arg3, uint8_t frm)
 {
 	if (tp->t_logstate != TCP_LOG_STATE_OFF) {
 		union tcp_log_stackspecific log;
 		struct timeval tv;
 
 		memset(&log, 0, sizeof(log));
 		log.u_bbr.flex1 = rack->r_ctl.rc_pace_min_segs;
 		log.u_bbr.flex3 = rack->r_ctl.rc_pace_max_segs;
 		log.u_bbr.flex4 = arg1;
 		log.u_bbr.flex5 = arg2;
 		log.u_bbr.flex6 = arg3;
 		log.u_bbr.flex8 = frm;
 		log.u_bbr.timeStamp = tcp_get_usecs(&tv);
 		log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
 		log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto;
 		log.u_bbr.applimited = rack->r_ctl.rc_sacked;
 		log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto;
 		log.u_bbr.pacing_gain = rack->r_must_retran;
 		TCP_LOG_EVENTP(tp, NULL,
 		    &tp->t_inpcb->inp_socket->so_rcv,
 		    &tp->t_inpcb->inp_socket->so_snd,
 		    TCP_HDWR_PACE_SIZE, 0,
 		    0, &log, false, &tv);
 	}
 }
 
 static void
 rack_log_type_just_return(struct tcp_rack *rack, uint32_t cts, uint32_t tlen, uint32_t slot,
 			  uint8_t hpts_calling, int reason, uint32_t cwnd_to_use)
 {
 	if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
 		union tcp_log_stackspecific log;
 		struct timeval tv;
 
 		memset(&log.u_bbr, 0, sizeof(log.u_bbr));
 		log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
 		log.u_bbr.flex1 = slot;
 		log.u_bbr.flex2 = rack->r_ctl.rc_hpts_flags;
 		log.u_bbr.flex4 = reason;
 		if (rack->rack_no_prr)
 			log.u_bbr.flex5 = 0;
 		else
 			log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt;
 		log.u_bbr.flex7 = hpts_calling;
 		log.u_bbr.flex8 = rack->rc_in_persist;
 		log.u_bbr.lt_epoch = cwnd_to_use;
 		log.u_bbr.timeStamp = tcp_get_usecs(&tv);
 		log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
 		log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto;
 		log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto;
 		log.u_bbr.pacing_gain = rack->r_must_retran;
 		log.u_bbr.cwnd_gain = rack->rc_has_collapsed;
 		TCP_LOG_EVENTP(rack->rc_tp, NULL,
 		    &rack->rc_inp->inp_socket->so_rcv,
 		    &rack->rc_inp->inp_socket->so_snd,
 		    BBR_LOG_JUSTRET, 0,
 		    tlen, &log, false, &tv);
 	}
 }
 
 static void
 rack_log_to_cancel(struct tcp_rack *rack, int32_t hpts_removed, int line, uint32_t us_cts,
 		   struct timeval *tv, uint32_t flags_on_entry)
 {
 	if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
 		union tcp_log_stackspecific log;
 
 		memset(&log.u_bbr, 0, sizeof(log.u_bbr));
 		log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
 		log.u_bbr.flex1 = line;
 		log.u_bbr.flex2 = rack->r_ctl.rc_last_output_to;
 		log.u_bbr.flex3 = flags_on_entry;
 		log.u_bbr.flex4 = us_cts;
 		if (rack->rack_no_prr)
 			log.u_bbr.flex5 = 0;
 		else
 			log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt;
 		log.u_bbr.flex6 = rack->rc_tp->t_rxtcur;
 		log.u_bbr.flex7 = hpts_removed;
 		log.u_bbr.flex8 = 1;
 		log.u_bbr.applimited = rack->r_ctl.rc_hpts_flags;
 		log.u_bbr.timeStamp = us_cts;
 		log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
 		log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto;
 		log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto;
 		log.u_bbr.pacing_gain = rack->r_must_retran;
 		TCP_LOG_EVENTP(rack->rc_tp, NULL,
 		    &rack->rc_inp->inp_socket->so_rcv,
 		    &rack->rc_inp->inp_socket->so_snd,
 		    BBR_LOG_TIMERCANC, 0,
 		    0, &log, false, tv);
 	}
 }
 
 static void
 rack_log_alt_to_to_cancel(struct tcp_rack *rack,
 			  uint32_t flex1, uint32_t flex2,
 			  uint32_t flex3, uint32_t flex4,
 			  uint32_t flex5, uint32_t flex6,
 			  uint16_t flex7, uint8_t mod)
 {
 	if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
 		union tcp_log_stackspecific log;
 		struct timeval tv;
 
 		if (mod == 1) {
 			/* No you can't use 1, its for the real to cancel */
 			return;
 		}
 		memset(&log.u_bbr, 0, sizeof(log.u_bbr));
 		log.u_bbr.timeStamp = tcp_get_usecs(&tv);
 		log.u_bbr.flex1 = flex1;
 		log.u_bbr.flex2 = flex2;
 		log.u_bbr.flex3 = flex3;
 		log.u_bbr.flex4 = flex4;
 		log.u_bbr.flex5 = flex5;
 		log.u_bbr.flex6 = flex6;
 		log.u_bbr.flex7 = flex7;
 		log.u_bbr.flex8 = mod;
 		TCP_LOG_EVENTP(rack->rc_tp, NULL,
 		    &rack->rc_inp->inp_socket->so_rcv,
 		    &rack->rc_inp->inp_socket->so_snd,
 		    BBR_LOG_TIMERCANC, 0,
 		    0, &log, false, &tv);
 	}
 }
 
 static void
 rack_log_to_processing(struct tcp_rack *rack, uint32_t cts, int32_t ret, int32_t timers)
 {
 	if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
 		union tcp_log_stackspecific log;
 		struct timeval tv;
 
 		memset(&log.u_bbr, 0, sizeof(log.u_bbr));
 		log.u_bbr.flex1 = timers;
 		log.u_bbr.flex2 = ret;
 		log.u_bbr.flex3 = rack->r_ctl.rc_timer_exp;
 		log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags;
 		log.u_bbr.flex5 = cts;
 		if (rack->rack_no_prr)
 			log.u_bbr.flex6 = 0;
 		else
 			log.u_bbr.flex6 = rack->r_ctl.rc_prr_sndcnt;
 		log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto;
 		log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto;
 		log.u_bbr.pacing_gain = rack->r_must_retran;
 		log.u_bbr.timeStamp = tcp_get_usecs(&tv);
 		log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
 		TCP_LOG_EVENTP(rack->rc_tp, NULL,
 		    &rack->rc_inp->inp_socket->so_rcv,
 		    &rack->rc_inp->inp_socket->so_snd,
 		    BBR_LOG_TO_PROCESS, 0,
 		    0, &log, false, &tv);
 	}
 }
 
 static void
 rack_log_to_prr(struct tcp_rack *rack, int frm, int orig_cwnd, int line)
 {
 	if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
 		union tcp_log_stackspecific log;
 		struct timeval tv;
 
 		memset(&log.u_bbr, 0, sizeof(log.u_bbr));
 		log.u_bbr.flex1 = rack->r_ctl.rc_prr_out;
 		log.u_bbr.flex2 = rack->r_ctl.rc_prr_recovery_fs;
 		if (rack->rack_no_prr)
 			log.u_bbr.flex3 = 0;
 		else
 			log.u_bbr.flex3 = rack->r_ctl.rc_prr_sndcnt;
 		log.u_bbr.flex4 = rack->r_ctl.rc_prr_delivered;
 		log.u_bbr.flex5 = rack->r_ctl.rc_sacked;
 		log.u_bbr.flex6 = rack->r_ctl.rc_holes_rxt;
 		log.u_bbr.flex7 = line;
 		log.u_bbr.flex8 = frm;
 		log.u_bbr.pkts_out = orig_cwnd;
 		log.u_bbr.timeStamp = tcp_get_usecs(&tv);
 		log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
 		log.u_bbr.use_lt_bw = rack->r_ent_rec_ns;
 		log.u_bbr.use_lt_bw <<= 1;
 		log.u_bbr.use_lt_bw |= rack->r_might_revert;
 		TCP_LOG_EVENTP(rack->rc_tp, NULL,
 		    &rack->rc_inp->inp_socket->so_rcv,
 		    &rack->rc_inp->inp_socket->so_snd,
 		    BBR_LOG_BBRUPD, 0,
 		    0, &log, false, &tv);
 	}
 }
 
 #ifdef NETFLIX_EXP_DETECTION
 /*
  * Emit a TCP_SAD_DETECTION record.
  *
  * The record combines the per-connection sack/ack counters with the
  * global detection tunables in force at the time.  event is stored in
  * flex8.  lt_epoch holds tcp_force_detection shifted left by 8, or'ed
  * with the connection's do_detection flag.
  */
 static void
 rack_log_sad(struct tcp_rack *rack, int event)
 {
 	union tcp_log_stackspecific rec;
 	struct timeval now;
 
 	if (rack->rc_tp->t_logstate == TCP_LOG_STATE_OFF)
 		return;
 
 	memset(&rec.u_bbr, 0, sizeof(rec.u_bbr));
 	rec.u_bbr.timeStamp = tcp_get_usecs(&now);
 	rec.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
 	rec.u_bbr.flex8 = event;
 
 	/* Per-connection counters and flags. */
 	rec.u_bbr.flex1 = rack->r_ctl.sack_count;
 	rec.u_bbr.flex2 = rack->r_ctl.ack_count;
 	rec.u_bbr.flex3 = rack->r_ctl.sack_moved_extra;
 	rec.u_bbr.flex4 = rack->r_ctl.sack_noextra_move;
 	rec.u_bbr.flex5 = rack->r_ctl.rc_num_maps_alloced;
 	rec.u_bbr.flex7 = rack->sack_attack_disable;
 
 	/* Global tunables. */
 	rec.u_bbr.flex6 = tcp_sack_to_ack_thresh;
 	rec.u_bbr.pkts_out = tcp_sack_to_move_thresh;
 	rec.u_bbr.applimited = tcp_map_minimum;
 	rec.u_bbr.delivered = tcp_sad_decay_val;
 	rec.u_bbr.lt_epoch = (tcp_force_detection << 8);
 	rec.u_bbr.lt_epoch |= rack->do_detection;
 	TCP_LOG_EVENTP(rack->rc_tp, NULL,
 	    &rack->rc_inp->inp_socket->so_rcv,
 	    &rack->rc_inp->inp_socket->so_snd,
 	    TCP_SAD_DETECTION, 0,
 	    0, &rec, false, &now);
 }
 #endif
 
 /*
  * Release every statistics counter owned by this stack.
  *
  * Each scalar counter is handed to counter_u64_free(); the two counter
  * arrays are released with COUNTER_ARRAY_FREE().  rack_adjust_map_bw
  * is only freed when the kernel is built with INVARIANTS.
  */
 static void
 rack_counter_destroy(void)
 {
 	/* Send path, hardware pacing and SACK processing counters. */
 	counter_u64_free(rack_fto_send);
 	counter_u64_free(rack_fto_rsm_send);
 	counter_u64_free(rack_nfto_resend);
 	counter_u64_free(rack_hw_pace_init_fail);
 	counter_u64_free(rack_hw_pace_lost);
 	counter_u64_free(rack_non_fto_send);
 	counter_u64_free(rack_extended_rfo);
 	counter_u64_free(rack_ack_total);
 	counter_u64_free(rack_express_sack);
 	counter_u64_free(rack_sack_total);
 	counter_u64_free(rack_move_none);
 	counter_u64_free(rack_move_some);
 	counter_u64_free(rack_sack_attacks_detected);
 	counter_u64_free(rack_sack_attacks_reversed);
 	counter_u64_free(rack_sack_used_next_merge);
 	counter_u64_free(rack_sack_used_prev_merge);
 	/* TLP, timeout and output error counters. */
 	counter_u64_free(rack_tlp_tot);
 	counter_u64_free(rack_tlp_newdata);
 	counter_u64_free(rack_tlp_retran);
 	counter_u64_free(rack_tlp_retran_bytes);
 	counter_u64_free(rack_to_tot);
 	counter_u64_free(rack_saw_enobuf);
 	counter_u64_free(rack_saw_enobuf_hw);
 	counter_u64_free(rack_saw_enetunreach);
 	/* Sendmap allocation counters (see rack_alloc() and friends). */
 	counter_u64_free(rack_hot_alloc);
 	counter_u64_free(rack_to_alloc);
 	counter_u64_free(rack_to_alloc_hard);
 	counter_u64_free(rack_to_alloc_emerg);
 	counter_u64_free(rack_to_alloc_limited);
 	counter_u64_free(rack_alloc_limited_conns);
 	counter_u64_free(rack_split_limited);
 	counter_u64_free(rack_multi_single_eq);
 	counter_u64_free(rack_proc_non_comp_ack);
 	counter_u64_free(rack_sack_proc_all);
 	counter_u64_free(rack_sack_proc_restart);
 	counter_u64_free(rack_sack_proc_short);
 	counter_u64_free(rack_sack_skipped_acked);
 	counter_u64_free(rack_sack_splits);
 	counter_u64_free(rack_input_idle_reduces);
 	/* Collapsed window and persist counters. */
 	counter_u64_free(rack_collapsed_win);
 	counter_u64_free(rack_collapsed_win_rxt);
 	counter_u64_free(rack_collapsed_win_rxt_bytes);
 	counter_u64_free(rack_collapsed_win_seen);
 	counter_u64_free(rack_try_scwnd);
 	counter_u64_free(rack_persists_sends);
 	counter_u64_free(rack_persists_acks);
 	counter_u64_free(rack_persists_loss);
 	counter_u64_free(rack_persists_lost_ends);
 #ifdef INVARIANTS
 	counter_u64_free(rack_adjust_map_bw);
 #endif
 	/* Counter arrays. */
 	COUNTER_ARRAY_FREE(rack_out_size, TCP_MSS_ACCT_SIZE);
 	COUNTER_ARRAY_FREE(rack_opts_arry, RACK_OPTS_SIZE);
 }
 
 /*
  * Allocate a sendmap entry for this connection.
  *
  * Sources are tried in this order:
  *  1. the head of the connection's free list, while that list holds
  *     more than rack_free_cache entries (the most recently freed and
  *     therefore in theory "hottest" entry);
  *  2. a fresh entry from UMA (non-sleeping), which also bumps
  *     rc_num_maps_alloced;
  *  3. whatever is left on the free list, as an emergency reserve.
  *
  * Returns NULL when all three fail.
  */
 static struct rack_sendmap *
 rack_alloc(struct tcp_rack *rack)
 {
 	struct rack_sendmap *entry;
 
 	if (rack->rc_free_cnt > rack_free_cache) {
 		/* The free list is above the cache mark, reuse its head. */
 		counter_u64_add(rack_hot_alloc, 1);
 	} else {
 		/*
 		 * At or under the cache mark there is probably no "hot"
 		 * entry any more, so go to UMA first.
 		 */
 		entry = uma_zalloc(rack_zone, M_NOWAIT);
 		if (entry != NULL) {
 			rack->r_ctl.rc_num_maps_alloced++;
 			counter_u64_add(rack_to_alloc, 1);
 			return (entry);
 		}
 		/* UMA came up empty, dig in to the reserve if we have one. */
 		if (rack->rc_free_cnt == 0)
 			return (NULL);
 		counter_u64_add(rack_to_alloc_emerg, 1);
 	}
 	entry = TAILQ_FIRST(&rack->r_ctl.rc_free);
 	TAILQ_REMOVE(&rack->r_ctl.rc_free, entry, r_tnext);
 	rack->rc_free_cnt--;
 	return (entry);
 }
 
 /*
  * Allocate a sendmap entry subject to the global per-connection cap.
  *
  * The cap applies when V_tcp_map_entries_limit is positive, the
  * connection is not doing detection (do_detection == 0) and
  * rc_num_maps_alloced has reached the limit.  In that case NULL is
  * returned, rack_to_alloc_limited is bumped and, the first time this
  * (or the split limit) trips on the connection,
  * rack_alloc_limited_conns as well.  Otherwise this is rack_alloc().
  */
 static struct rack_sendmap *
 rack_alloc_full_limit(struct tcp_rack *rack)
 {
 	int capped;
 
 	capped = ((V_tcp_map_entries_limit > 0) &&
 	    (rack->do_detection == 0) &&
 	    (rack->r_ctl.rc_num_maps_alloced >= V_tcp_map_entries_limit));
 	if (!capped)
 		return (rack_alloc(rack));
 
 	counter_u64_add(rack_to_alloc_limited, 1);
 	if (rack->alloc_limit_reported == 0) {
 		/* Count each connection only once. */
 		rack->alloc_limit_reported = 1;
 		counter_u64_add(rack_alloc_limited_conns, 1);
 	}
 	return (NULL);
 }
 
 /*
  * Allocate a sendmap entry on behalf of a specific limit class.
  *
  * A limit_type of 0 means "no limit": this is then just rack_alloc().
  * Any other value is treated as the (single) split limit: once
  * rc_num_split_allocs reaches a positive V_tcp_map_split_limit, and
  * the connection is not doing detection, the request is refused with
  * NULL.  A successful limited allocation is tagged with limit_type so
  * that rack_free() can give the credit back.
  */
 static struct rack_sendmap *
 rack_alloc_limit(struct tcp_rack *rack, uint8_t limit_type)
 {
 	struct rack_sendmap *entry;
 
 	/* currently there is only one limit type */
 	if (limit_type &&
 	    V_tcp_map_split_limit > 0 &&
 	    (rack->do_detection == 0) &&
 	    rack->r_ctl.rc_num_split_allocs >= V_tcp_map_split_limit) {
 		counter_u64_add(rack_split_limited, 1);
 		if (rack->alloc_limit_reported == 0) {
 			/* Count each connection only once. */
 			rack->alloc_limit_reported = 1;
 			counter_u64_add(rack_alloc_limited_conns, 1);
 		}
 		return (NULL);
 	}
 
 	entry = rack_alloc(rack);
 	if (entry == NULL || limit_type == 0)
 		return (entry);
 
 	/* Charge the entry against the limit it was allocated under. */
 	entry->r_limit_type = limit_type;
 	rack->r_ctl.rc_num_split_allocs++;
 	return (entry);
 }
 
 static void
 rack_free(struct tcp_rack *rack, struct rack_sendmap *rsm)
 {
 	if (rsm->r_flags & RACK_APP_LIMITED) {
 		if (rack->r_ctl.rc_app_limited_cnt > 0) {
 			rack->r_ctl.rc_app_limited_cnt--;
 		}
 	}
 	if (rsm->r_limit_type) {
 		/* currently there is only one limit type */
 		rack->r_ctl.rc_num_split_allocs--;
 	}
 	if (rsm == rack->r_ctl.rc_first_appl) {
 		if (rack->r_ctl.rc_app_limited_cnt == 0)
 			rack->r_ctl.rc_first_appl = NULL;
 		else {
 			/* Follow the next one out */
 			struct rack_sendmap fe;
 
 			fe.r_start = rsm->r_nseq_appl;
 			rack->r_ctl.rc_first_appl = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe);
 		}
 	}
 	if (rsm == rack->r_ctl.rc_resend)
 		rack->r_ctl.rc_resend = NULL;
 	if (rsm == rack->r_ctl.rc_end_appl)
 		rack->r_ctl.rc_end_appl = NULL;
 	if (rack->r_ctl.rc_tlpsend == rsm)
 		rack->r_ctl.rc_tlpsend = NULL;
 	if (rack->r_ctl.rc_sacklast == rsm)
 		rack->r_ctl.rc_sacklast = NULL;
 	memset(rsm, 0, sizeof(struct rack_sendmap));
 	TAILQ_INSERT_HEAD(&rack->r_ctl.rc_free, rsm, r_tnext);
 	rack->rc_free_cnt++;
 }
 
 /*
  * Shrink the connection's free list back down to rack_free_cache
  * entries, returning the surplus to UMA.
  *
  * Entries are taken from the tail of the list, i.e. the ones that
  * have been sitting unused the longest (rack_free() inserts at the
  * head and rack_alloc() takes from the head).
  *
  * Every entry handed back to UMA also gives up its slot in
  * rc_num_maps_alloced.  rack_alloc() counts an entry there when it
  * comes from uma_zalloc(), and rack_alloc_full_limit() compares that
  * count against V_tcp_map_entries_limit, so without the decrement the
  * count only ever grows and a long lived connection ends up refused
  * allocations it is entitled to.
  */
 static void
 rack_free_trim(struct tcp_rack *rack)
 {
 	struct rack_sendmap *rsm;
 
 	while (rack->rc_free_cnt > rack_free_cache) {
 		rsm = TAILQ_LAST(&rack->r_ctl.rc_free, rack_head);
 		TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_tnext);
 		rack->rc_free_cnt--;
 		/* Guarded the same way rack_free() guards its counts. */
 		if (rack->r_ctl.rc_num_maps_alloced > 0)
 			rack->r_ctl.rc_num_maps_alloced--;
 		uma_zfree(rack_zone, rsm);
 	}
 }
 
 
 /*
  * Work out how many bytes the next goodput measurement should span.
  *
  * Without a prior measurement (rc_gp_filled == 0, e.g. the initial
  * window is still in flight) the answer is simply the default window,
  * rack_def_data_window segments.
  *
  * Otherwise three concerns shape the window:
  *
  *  1) It should cover the desired number of BDPs: the known b/w times
  *     the SRTT, times rack_goal_bdp (at least 1), rounded up to a
  *     whole segment.
  *  2) It should last a minimum number of SRTTs.  That is not dealt
  *     with here; rack_enough_for_measurement() holds the measurement
  *     open until the time has gone by.
  *  3) If rack_min_measure_usec is set it should last at least that
  *     long at the current b/w, so the window is raised to that many
  *     bytes (rounded up to a segment, never less than one).
  *
  * An application limited period may still end a measurement earlier
  * than 1) and 3) would suggest.
  *
  * Finally, on low b/w paths the result can be smaller than the
  * default window.  That is allowed, so measurements do not span a
  * huge number of rtt's, but the window is never smaller than
  * MIN_GP_WIN + 2 segments; the extra two leave room for the
  * measurement to be shortened by an ack'ed packet.
  */
 static uint32_t
 rack_get_measure_window(struct tcpcb *tp, struct tcp_rack *rack)
 {
 	uint64_t rtt, rate, win, floor_tim;
 	uint32_t mss, dflt_win, floor_len;
 
 	mss = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs);
 	dflt_win = rack_def_data_window * mss;
 	if (rack->rc_gp_filled == 0)
 		return (dflt_win);
 
 	/* Concern 1: goal BDP's worth of data, in whole segments. */
 	rate = rack_get_bw(rack);
 	rtt = (uint64_t)tp->t_srtt;
 	win = (rate * rtt) / (uint64_t)HPTS_USEC_IN_SEC;
 	win *= max(1, rack_goal_bdp);
 	win = roundup(win, mss);
 
 	/* Concern 3: optional minimum duration at this b/w. */
 	if (rack_min_measure_usec) {
 		floor_tim = rack_min_measure_usec;
 		floor_len = (floor_tim * rate) / (uint64_t)HPTS_USEC_IN_SEC;
 		if (floor_len == 0)
 			floor_len = 1;
 		floor_len = roundup(floor_len, mss);
 		if (win < floor_len)
 			win = floor_len;
 	}
 
 	/* Small windows are fine, but not below the hard minimum. */
 	if (win < dflt_win)
 		return (max((uint32_t)win, ((MIN_GP_WIN+2) * mss)));
 	return (max((uint32_t)win, dflt_win));
 }
 
 /*
  * Decide whether the ack th_ack allows the current goodput
  * measurement to be closed.
  *
  * Returns 1 and sets *quality when the measurement may end:
  *   RACK_QUALITY_ALLACKED   - nothing is left outstanding,
  *   RACK_QUALITY_APPLIMITED - the ack reached the end of the first
  *                             app-limited entry,
  *   RACK_QUALITY_HIGH       - at least rack_min_srtts goodput SRTTs
  *                             have gone by since tp->gput_ts.
  * Returns 0, leaving *quality untouched, when the ack is still before
  * the measurement start, too few bytes have been acked, or not enough
  * time has passed.
  */
 static int
 rack_enough_for_measurement(struct tcpcb *tp, struct tcp_rack *rack, tcp_seq th_ack, uint8_t *quality)
 {
 	uint32_t elapsed, required, mss;
 
 	/* Everything outstanding has been acked. */
 	if ((tp->snd_max == tp->snd_una) ||
 	    (th_ack == tp->snd_max)){
 		*quality = RACK_QUALITY_ALLACKED;
 		return (1);
 	}
 	/* The ack is still before the start of the measurement. */
 	if (SEQ_LT(th_ack, tp->gput_seq))
 		return (0);
 
 	/*
 	 * Short of the measurement end and fewer bytes acked than the
 	 * larger of the initial window and MIN_GP_WIN segments.
 	 */
 	mss = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs);
 	if (SEQ_LT(th_ack, tp->gput_ack) &&
 	    ((th_ack - tp->gput_seq) < max(rc_init_window(rack), (MIN_GP_WIN * mss))))
 		return (0);
 
 	/*
 	 * Up to the app limited send point, we have to measure
 	 * irrespective of the time.
 	 */
 	if (rack->r_ctl.rc_first_appl &&
 	    (SEQ_GEQ(th_ack, rack->r_ctl.rc_first_appl->r_end))) {
 		*quality = RACK_QUALITY_APPLIMITED;
 		return (1);
 	}
 
 	/* Otherwise it comes down to how much time has gone by. */
 	required = (rack->r_ctl.rc_gp_srtt * rack_min_srtts);
 	elapsed = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time) - tp->gput_ts;
 	if (elapsed < required)
 		return (0);
 	*quality = RACK_QUALITY_HIGH;
 	return (1);
 }
 
 static void
 rack_log_timely(struct tcp_rack *rack,
 		uint32_t logged, uint64_t cur_bw, uint64_t low_bnd,
 		uint64_t up_bnd, int line, uint8_t method)
 {
 	if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
 		union tcp_log_stackspecific log;
 		struct timeval tv;
 
 		memset(&log, 0, sizeof(log));
 		log.u_bbr.flex1 = logged;
 		log.u_bbr.flex2 = rack->rc_gp_timely_inc_cnt;
 		log.u_bbr.flex2 <<= 4;
 		log.u_bbr.flex2 |= rack->rc_gp_timely_dec_cnt;
 		log.u_bbr.flex2 <<= 4;
 		log.u_bbr.flex2 |= rack->rc_gp_incr;
 		log.u_bbr.flex2 <<= 4;
 		log.u_bbr.flex2 |= rack->rc_gp_bwred;
 		log.u_bbr.flex3 = rack->rc_gp_incr;
 		log.u_bbr.flex4 = rack->r_ctl.rack_per_of_gp_ss;
 		log.u_bbr.flex5 = rack->r_ctl.rack_per_of_gp_ca;
 		log.u_bbr.flex6 = rack->r_ctl.rack_per_of_gp_rec;
 		log.u_bbr.flex7 = rack->rc_gp_bwred;
 		log.u_bbr.flex8 = method;
 		log.u_bbr.cur_del_rate = cur_bw;
 		log.u_bbr.delRate = low_bnd;
 		log.u_bbr.bw_inuse = up_bnd;
 		log.u_bbr.rttProp = rack_get_bw(rack);
 		log.u_bbr.pkt_epoch = line;
 		log.u_bbr.pkts_out = rack->r_ctl.rc_rtt_diff;
 		log.u_bbr.timeStamp = tcp_get_usecs(&tv);
 		log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
 		log.u_bbr.epoch = rack->r_ctl.rc_gp_srtt;
 		log.u_bbr.lt_epoch = rack->r_ctl.rc_prev_gp_srtt;
 		log.u_bbr.cwnd_gain = rack->rc_dragged_bottom;
 		log.u_bbr.cwnd_gain <<= 1;
 		log.u_bbr.cwnd_gain |= rack->rc_gp_saw_rec;
 		log.u_bbr.cwnd_gain <<= 1;
 		log.u_bbr.cwnd_gain |= rack->rc_gp_saw_ss;
 		log.u_bbr.cwnd_gain <<= 1;
 		log.u_bbr.cwnd_gain |= rack->rc_gp_saw_ca;
 		log.u_bbr.lost = rack->r_ctl.rc_loss_count;
 		TCP_LOG_EVENTP(rack->rc_tp, NULL,
 		    &rack->rc_inp->inp_socket->so_rcv,
 		    &rack->rc_inp->inp_socket->so_snd,
 		    TCP_TIMELY_WORK, 0,
 		    0, &log, false, &tv);
 	}
 }
 
 /*
  * Decide whether raising the pacing multiplier "mult" (a percentage)
  * can still buy us anything.
  *
  * The rate we actually pace at is cur_bw * mult / 100.  The most we
  * are willing to run ahead of what the path delivered is
  * last_bw_est * (100 + rack_max_per_above) / 100.  While the former
  * is below the latter an increase makes sense; once we already pace
  * that far above what comes back we are most likely limited by
  * something else (cwnd/rwnd/app) and pushing the multiplier up would
  * not speed us up.
  *
  * Returns 1 when the multiplier may be raised, 0 when it should hold.
  * Raising is always allowed when rack_timely_no_stopping is set, when
  * either b/w is still 0 (initial startup or everything acked), or
  * when mult is at or below 100.  Every decision except the
  * rack_timely_no_stopping one is logged via rack_log_timely(), method
  * 9 for "raise" and 8 for "hold".
  */
 static int
 rack_bw_can_be_raised(struct tcp_rack *rack, uint64_t cur_bw, uint64_t last_bw_est, uint16_t mult)
 {
 	uint64_t pace_rate, ceiling;
 
 	if (rack_timely_no_stopping)
 		return (1);
 
 	/* Initial startup case or everything is acked case. */
 	if ((cur_bw == 0) || (last_bw_est == 0)) {
 		rack_log_timely(rack,  mult, cur_bw, 0, 0,
 				__LINE__, 9);
 		return (1);
 	}
 	/* We can always pace at or slightly above our rate. */
 	if (mult <= 100) {
 		rack_log_timely(rack,  mult, cur_bw, 0, 0,
 				__LINE__, 9);
 		return (1);
 	}
 
 	pace_rate = (cur_bw * (uint64_t)mult) / 100;
 	ceiling = (last_bw_est * ((uint64_t)rack_max_per_above + (uint64_t)100)) / 100;
 	if (pace_rate >= ceiling) {
 		/*
 		 * Already pacing at least rack_max_per_above percent
 		 * over what we are getting back, hold steady.
 		 */
 		rack_log_timely(rack,  mult, cur_bw, pace_rate, ceiling,
 				__LINE__, 8);
 		return (0);
 	}
 	/*
 	 * Pacing below the last measurement plus the allowed overage,
 	 * so there is room to try for more.
 	 */
 	rack_log_timely(rack,  mult, cur_bw, pace_rate, ceiling,
 			__LINE__, 9);
 	return (1);
 }
 
 /*
  * Used when we drag bottom: make sure none of the three goodput
  * multipliers (slow start, congestion avoidance, recovery) is left
  * below 100, i.e. below 1.0, restoring any that are to exactly 100.
  */
 static void
 rack_validate_multipliers_at_or_above100(struct tcp_rack *rack)
 {
 	if (rack->r_ctl.rack_per_of_gp_ss < 100)
 		rack->r_ctl.rack_per_of_gp_ss = 100;
 	if (rack->r_ctl.rack_per_of_gp_ca < 100)
 		rack->r_ctl.rack_per_of_gp_ca = 100;
 	/* Unlikely to trigger, recovery is usually left alone. */
 	if (rack->r_ctl.rack_per_of_gp_rec  < 100)
 		rack->r_ctl.rack_per_of_gp_rec = 100;
 }
 
 /*
  * Cap the slow start and congestion avoidance multipliers at 100.
  * The recovery multiplier is not touched.
  */
 static void
 rack_validate_multipliers_at_or_below_100(struct tcp_rack *rack)
 {
 	if (rack->r_ctl.rack_per_of_gp_ss > 100)
 		rack->r_ctl.rack_per_of_gp_ss = 100;
 	if (rack->r_ctl.rack_per_of_gp_ca > 100)
 		rack->r_ctl.rack_per_of_gp_ca = 100;
 }
 
 /*
  * Timely wants more b/w: raise the goodput multipliers of the modes
  * that were seen during the last measurement.
  *
  * The step is rack_gp_increase_per, multiplied by
  * RACK_TIMELY_CNT_BOOST either when "override" is set (we are losing
  * b/w and make one last gasp at not losing out to a new-reno flow) or
  * when enough increases in a row have been counted; taking the
  * boosted step restarts that count.
  *
  * Each multiplier is only raised if rack_bw_can_be_raised() agrees,
  * saturates at 0xffff and, unless we dragged bottom, is clamped to
  * its configured upper bound.  The bits set in the logged mask are
  * 1 = recovery, 2 = congestion avoidance, 4 = slow start.
  */
 static void
 rack_increase_bw_mul(struct tcp_rack *rack, int timely_says, uint64_t cur_bw, uint64_t last_bw_est, int override)
 {
 	int32_t  next, raised, step;
 	int boost;
 
 	raised = 0;
 
 	/*
 	 * In classic timely we boost by 5x if we have 5 increases in a
 	 * row.  The count is 0 based so we have to add one.
 	 */
 	boost = override ||
 	    (rack->rc_gp_incr &&
 	     ((rack->rc_gp_timely_inc_cnt + 1) >= RACK_TIMELY_CNT_BOOST));
 	if (boost) {
 		/* Reset and get a full run of strokes before the next boost. */
 		step = (uint32_t)rack_gp_increase_per * RACK_TIMELY_CNT_BOOST;
 		rack->rc_gp_timely_inc_cnt = 0;
 	} else
 		step = (uint32_t)rack_gp_increase_per;
 
 	/* Must be at least 1% increase for true timely increases */
 	if ((step < 1) &&
 	    ((rack->r_ctl.rc_rtt_diff <= 0) || (timely_says <= 0)))
 		step = 1;
 
 	/* Recovery, note that it shares the slow start upper bound. */
 	if (rack->rc_gp_saw_rec &&
 	    (rack->rc_gp_no_rec_chg == 0) &&
 	    rack_bw_can_be_raised(rack, cur_bw, last_bw_est,
 				  rack->r_ctl.rack_per_of_gp_rec)) {
 		raised |= 1;
 		next = rack->r_ctl.rack_per_of_gp_rec + step;
 		if (next > 0xffff)
 			next = 0xffff;
 		rack->r_ctl.rack_per_of_gp_rec = (uint16_t)next;
 		if (rack_per_upper_bound_ss &&
 		    (rack->rc_dragged_bottom == 0) &&
 		    (rack->r_ctl.rack_per_of_gp_rec > rack_per_upper_bound_ss))
 			rack->r_ctl.rack_per_of_gp_rec = rack_per_upper_bound_ss;
 	}
 	/* Congestion avoidance, only if no slow start was seen. */
 	if (rack->rc_gp_saw_ca &&
 	    (rack->rc_gp_saw_ss == 0) &&
 	    rack_bw_can_be_raised(rack, cur_bw, last_bw_est,
 				  rack->r_ctl.rack_per_of_gp_ca)) {
 		raised |= 2;
 		next = rack->r_ctl.rack_per_of_gp_ca + step;
 		if (next > 0xffff)
 			next = 0xffff;
 		rack->r_ctl.rack_per_of_gp_ca = (uint16_t)next;
 		if (rack_per_upper_bound_ca &&
 		    (rack->rc_dragged_bottom == 0) &&
 		    (rack->r_ctl.rack_per_of_gp_ca > rack_per_upper_bound_ca))
 			rack->r_ctl.rack_per_of_gp_ca = rack_per_upper_bound_ca;
 	}
 	/* Slow start. */
 	if (rack->rc_gp_saw_ss &&
 	    rack_bw_can_be_raised(rack, cur_bw, last_bw_est,
 				  rack->r_ctl.rack_per_of_gp_ss)) {
 		raised |= 4;
 		next = rack->r_ctl.rack_per_of_gp_ss + step;
 		if (next > 0xffff)
 			next = 0xffff;
 		rack->r_ctl.rack_per_of_gp_ss = (uint16_t)next;
 		if (rack_per_upper_bound_ss &&
 		    (rack->rc_dragged_bottom == 0) &&
 		    (rack->r_ctl.rack_per_of_gp_ss > rack_per_upper_bound_ss))
 			rack->r_ctl.rack_per_of_gp_ss = rack_per_upper_bound_ss;
 	}
 
 	if (raised) {
 		if (rack->rc_gp_incr == 0) {
 			/* Go into increment mode */
 			rack->rc_gp_incr = 1;
 			rack->rc_gp_timely_inc_cnt = 0;
 		}
 		if (rack->rc_gp_incr &&
 		    (rack->rc_gp_timely_inc_cnt < RACK_TIMELY_CNT_BOOST))
 			rack->rc_gp_timely_inc_cnt++;
 	}
 	rack_log_timely(rack,  raised, step, 0, 0,
 			__LINE__, 1);
 }
 
 /*
  * Gradient based multiplier reduction:
  *
  *   norm_grad = rtt_diff / minrtt
  *   new_per   = curper * (1 - B * norm_grad)
  *
  * with B = rack_gp_decrease_per (a percentage) and minrtt taken from
  * the rc_gp_min_rtt filter.  Everything is done in integer arithmetic
  * scaled by 1000000.  Should the result come out above curper (this
  * should not happen) curper - 1 is returned instead.
  *
  * NOTE(review): rtt_diff is converted to uint64_t as is, so callers
  * are expected to pass a non-negative value.
  */
 static uint32_t
 rack_get_decrease(struct tcp_rack *rack, uint32_t curper, int32_t rtt_diff)
 {
 	uint64_t norm_grad, cut, keep, perf;
 
 	/* rtt_diff / minrtt, scaled by 1000000 */
 	norm_grad = ((uint64_t)rtt_diff * (uint64_t)1000000) /
 	    (uint64_t)get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt);
 	/* B * norm_grad, still scaled by 1000000 (B% = B * 10000 / 1000000) */
 	cut = ((uint64_t)rack_gp_decrease_per * (uint64_t)10000 * norm_grad) /
 	    (uint64_t)1000000;
 	/* (1 - B * norm_grad), scaled by 1000000 */
 	keep = (uint64_t)1000000 - cut;
 	perf = ((uint64_t)curper * keep) / (uint64_t)1000000;
 	if (perf > curper) {
 		/* TSNH */
 		perf = curper - 1;
 	}
 	return ((uint32_t)perf);
 }
 
 /*
  * High rtt based multiplier reduction:
  *
  *                                   highrttthresh
  * result = curper * (1 - (B * ( 1 -  ------          ))
  *                                        rtt
  *
  * B = rack_gp_decrease_per (a percentage)
  * highrttthresh = filter_min * rack_gp_rtt_maxmul
  *
  * where filter_min comes from the rc_gp_min_rtt filter.  Everything
  * is done in integer arithmetic scaled by 1000000.
  */
 static uint32_t
 rack_decrease_highrtt(struct tcp_rack *rack, uint32_t curper, uint32_t rtt)
 {
 	uint64_t ratio, over, keep, perf;
 	uint32_t highrttthresh;
 
 	highrttthresh = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack_gp_rtt_maxmul;
 
 	/* highrttthresh / rtt, scaled by 1000000 */
 	ratio = ((uint64_t)highrttthresh * (uint64_t)1000000) / (uint64_t)rtt;
 	/* (1 - highrttthresh / rtt), scaled by 1000000 */
 	over = (uint64_t)1000000 - ratio;
 	/* (1 - B * over), scaled by 1000000, the / 100 turns B into a fraction */
 	keep = (uint64_t)1000000 - (((uint64_t)rack_gp_decrease_per * over) / 100);
 	perf = ((uint64_t)curper * keep) / (uint64_t)1000000;
 	return (perf);
 }
 
 /*
  * Timely wants less b/w: lower the goodput multipliers of the modes
  * that were seen during the last measurement.
  *
  * rack        - connection whose rack_per_of_gp_{rec,ss,ca}
  *               multipliers are adjusted.
  * timely_says - timely verdict.  With the value 2 two candidates are
  *               computed, rack_decrease_highrtt() and
  *               rack_get_decrease(), and the smaller one is applied;
  *               otherwise only rack_get_decrease() is used.
  * rtt         - rtt handed to rack_decrease_highrtt().
  * rtt_diff    - rtt gradient, its absolute value is used.
  *
  * The recovery multiplier is adjusted when recovery was seen and
  * rc_gp_no_rec_chg is clear.  After that either the slow start
  * multiplier or, if no slow start was seen, the congestion avoidance
  * multiplier is adjusted.  No multiplier is left below
  * rack_per_lower_bound.  Any pending run of increases is cancelled
  * and the decrease count is advanced (wrapping to 0 when it reaches
  * rack_timely_dec_clear, if that is set).
  *
  * The bits set in the logged mask are 1 = recovery, 2 = congestion
  * avoidance, 4 = slow start.
  *
  * In the slow start and congestion avoidance cases the gradient
  * candidate is computed from that mode's own multiplier.  It used to
  * be computed from rack_per_of_gp_rec in both, which let the recovery
  * multiplier decide how far slow start / congestion avoidance pacing
  * was cut.
  */
 static void
 rack_decrease_bw_mul(struct tcp_rack *rack, int timely_says, uint32_t rtt, int32_t rtt_diff)
 {
 	uint64_t logvar, logvar2, logvar3;
 	uint32_t logged, new_per, ss_red, ca_red, rec_red, alt, val;
 
 	if (rack->rc_gp_incr) {
 		/* Turn off increment counting */
 		rack->rc_gp_incr = 0;
 		rack->rc_gp_timely_inc_cnt = 0;
 	}
 	ss_red = ca_red = rec_red = 0;
 	logged = 0;
 	/* Calculate the reduction value */
 	if (rtt_diff < 0) {
 		rtt_diff *= -1;
 	}
 	/* Must be at least 1% reduction */
 	if (rack->rc_gp_saw_rec && (rack->rc_gp_no_rec_chg == 0)) {
 		/* We have been in recovery ding it too */
 		if (timely_says == 2) {
 			new_per = rack_decrease_highrtt(rack, rack->r_ctl.rack_per_of_gp_rec, rtt);
 			alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_rec, rtt_diff);
 			if (alt < new_per)
 				val = alt;
 			else
 				val = new_per;
 		} else
 			 val = new_per = alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_rec, rtt_diff);
 		if (rack->r_ctl.rack_per_of_gp_rec > val) {
 			rec_red = (rack->r_ctl.rack_per_of_gp_rec - val);
 			rack->r_ctl.rack_per_of_gp_rec = (uint16_t)val;
 		} else {
 			/* The candidate is no reduction at all, go to the floor. */
 			rack->r_ctl.rack_per_of_gp_rec = rack_per_lower_bound;
 			rec_red = 0;
 		}
 		if (rack_per_lower_bound > rack->r_ctl.rack_per_of_gp_rec)
 			rack->r_ctl.rack_per_of_gp_rec = rack_per_lower_bound;
 		logged |= 1;
 	}
 	if (rack->rc_gp_saw_ss) {
 		/* Sent in SS */
 		if (timely_says == 2) {
 			new_per = rack_decrease_highrtt(rack, rack->r_ctl.rack_per_of_gp_ss, rtt);
 			/* Both candidates must start from the SS multiplier. */
 			alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_ss, rtt_diff);
 			if (alt < new_per)
 				val = alt;
 			else
 				val = new_per;
 		} else
 			val = new_per = alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_ss, rtt_diff);
 		/*
 		 * NOTE(review): this branch tests against new_per and
 		 * reports new_per as the reduction in the else case,
 		 * where the rec and ca branches use val and 0.  Left as
 		 * is, confirm whether the asymmetry is intended.
 		 */
 		if (rack->r_ctl.rack_per_of_gp_ss > new_per) {
 			ss_red = rack->r_ctl.rack_per_of_gp_ss - val;
 			rack->r_ctl.rack_per_of_gp_ss = (uint16_t)val;
 		} else {
 			ss_red = new_per;
 			rack->r_ctl.rack_per_of_gp_ss = rack_per_lower_bound;
 			/* Log the inputs that led to the odd candidate. */
 			logvar = new_per;
 			logvar <<= 32;
 			logvar |= alt;
 			logvar2 = (uint32_t)rtt;
 			logvar2 <<= 32;
 			logvar2 |= (uint32_t)rtt_diff;
 			logvar3 = rack_gp_rtt_maxmul;
 			logvar3 <<= 32;
 			logvar3 |= get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt);
 			rack_log_timely(rack, timely_says,
 					logvar2, logvar3,
 					logvar, __LINE__, 10);
 		}
 		if (rack_per_lower_bound > rack->r_ctl.rack_per_of_gp_ss)
 			rack->r_ctl.rack_per_of_gp_ss = rack_per_lower_bound;
 		logged |= 4;
 	} else if (rack->rc_gp_saw_ca) {
 		/* Sent in CA */
 		if (timely_says == 2) {
 			new_per = rack_decrease_highrtt(rack, rack->r_ctl.rack_per_of_gp_ca, rtt);
 			/* Both candidates must start from the CA multiplier. */
 			alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_ca, rtt_diff);
 			if (alt < new_per)
 				val = alt;
 			else
 				val = new_per;
 		} else
 			val = new_per = alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_ca, rtt_diff);
 		if (rack->r_ctl.rack_per_of_gp_ca > val) {
 			ca_red = rack->r_ctl.rack_per_of_gp_ca - val;
 			rack->r_ctl.rack_per_of_gp_ca = (uint16_t)val;
 		} else {
 			rack->r_ctl.rack_per_of_gp_ca = rack_per_lower_bound;
 			ca_red = 0;
 			/* Log the inputs that led to the odd candidate. */
 			logvar = new_per;
 			logvar <<= 32;
 			logvar |= alt;
 			logvar2 = (uint32_t)rtt;
 			logvar2 <<= 32;
 			logvar2 |= (uint32_t)rtt_diff;
 			logvar3 = rack_gp_rtt_maxmul;
 			logvar3 <<= 32;
 			logvar3 |= get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt);
 			rack_log_timely(rack, timely_says,
 					logvar2, logvar3,
 					logvar, __LINE__, 10);
 		}
 		if (rack_per_lower_bound > rack->r_ctl.rack_per_of_gp_ca)
 			rack->r_ctl.rack_per_of_gp_ca = rack_per_lower_bound;
 		logged |= 2;
 	}
 	if (rack->rc_gp_timely_dec_cnt < 0x7) {
 		rack->rc_gp_timely_dec_cnt++;
 		if (rack_timely_dec_clear &&
 		    (rack->rc_gp_timely_dec_cnt == rack_timely_dec_clear))
 			rack->rc_gp_timely_dec_cnt = 0;
 	}
 	/* Summary record: ss reduction in the high word, ca in the low. */
 	logvar = ss_red;
 	logvar <<= 32;
 	logvar |= ca_red;
 	rack_log_timely(rack,  logged, rec_red, rack_per_lower_bound, logvar,
 			__LINE__, 2);
 }
 
 /*
  * Emit a BBR_LOG_RTT_SHRINKS record that snapshots the probe-RTT and
  * pacing-multiplier state held in 'rack'.  Nothing is done when the
  * connection's t_logstate is TCP_LOG_STATE_OFF.
  *
  *   us_cts - timestamp supplied by the caller (upper half of rttProp)
  *   rtt    - rtt value to report (flex5)
  *   line   - caller's source line (flex1)
  *   reas   - reason code for this record (flex8)
  */
 static void
 rack_log_rtt_shrinks(struct tcp_rack *rack, uint32_t us_cts,
 		     uint32_t rtt, uint32_t line, uint8_t reas)
 {
 	union tcp_log_stackspecific log;
 	struct timeval tv;

 	/* Logging disabled for this connection, nothing to build. */
 	if (rack->rc_tp->t_logstate == TCP_LOG_STATE_OFF)
 		return;

 	memset(&log.u_bbr, 0, sizeof(log.u_bbr));

 	/* Values handed in by the caller. */
 	log.u_bbr.flex1 = line;
 	log.u_bbr.flex5 = rtt;
 	log.u_bbr.flex8 = reas;

 	/* Probe-rtt timing and the current multipliers. */
 	log.u_bbr.flex2 = rack->r_ctl.rc_time_probertt_starts;
 	log.u_bbr.flex3 = rack->r_ctl.rc_lower_rtt_us_cts;
 	log.u_bbr.flex4 = rack->r_ctl.rack_per_of_gp_ss;
 	log.u_bbr.flex7 = rack->r_ctl.rack_per_of_gp_probertt;
 	log.u_bbr.pacing_gain = rack->r_ctl.rack_per_of_gp_ca;
 	log.u_bbr.cwnd_gain = rack->r_ctl.rack_per_of_gp_rec;

 	/*
 	 * Pack the state flags, most significant first:
 	 * highly_buffered, forced_ack, gp_dyn_mul, in_probe_rtt,
 	 * measure_saw_probe_rtt.
 	 */
 	log.u_bbr.flex6 = (rack->rc_highly_buffered << 4) |
 	    (rack->forced_ack << 3) |
 	    (rack->rc_gp_dyn_mul << 2) |
 	    (rack->in_probe_rtt << 1) |
 	    rack->measure_saw_probe_rtt;

 	log.u_bbr.timeStamp = tcp_get_usecs(&tv);
 	log.u_bbr.delRate = rack_get_bw(rack);
 	/* Highest us rtt in the upper 32 bits, lowest in the lower 32. */
 	log.u_bbr.cur_del_rate = ((uint64_t)rack->r_ctl.rc_highest_us_rtt << 32) |
 	    rack->r_ctl.rc_lowest_us_rtt;
 	log.u_bbr.applimited = rack->r_ctl.rc_time_probertt_entered;
 	log.u_bbr.pkts_out = rack->r_ctl.rc_rtt_diff;
 	log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
 	log.u_bbr.epoch = rack->r_ctl.rc_gp_srtt;
 	log.u_bbr.lt_epoch = rack->r_ctl.rc_prev_gp_srtt;
 	log.u_bbr.pkt_epoch = rack->r_ctl.rc_lower_rtt_us_cts;
 	log.u_bbr.delivered = rack->r_ctl.rc_target_probertt_flight;
 	log.u_bbr.lost = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt);
 	/* Caller's timestamp in the upper 32 bits, entry gp rtt below it. */
 	log.u_bbr.rttProp = ((uint64_t)us_cts << 32) |
 	    rack->r_ctl.rc_entry_gp_rtt;

 	TCP_LOG_EVENTP(rack->rc_tp, NULL,
 	    &rack->rc_inp->inp_socket->so_rcv,
 	    &rack->rc_inp->inp_socket->so_snd,
 	    BBR_LOG_RTT_SHRINKS, 0,
 	    0, &log, false, &rack->r_ctl.act_rcv_time);
 }
 
 /*
  * Compute the flight-size target used for probe-rtt and store it in
  * rc_target_probertt_flight.  The target is the bandwidth reported by
  * rack_get_bw() multiplied by 'rtt' (usecs), scaled down by
  * HPTS_USEC_IN_SEC, rounded up to a multiple of 'segsiz' and never
  * allowed below segsiz * rack_timely_min_segs.
  */
 static void
 rack_set_prtt_target(struct tcp_rack *rack, uint32_t segsiz, uint32_t rtt)
 {
 	uint64_t bdp;

 	bdp = (rack_get_bw(rack) * (uint64_t)rtt) / (uint64_t)HPTS_USEC_IN_SEC;
 	rack->r_ctl.rc_target_probertt_flight = roundup((uint32_t)bdp, segsiz);
 	if (rack->r_ctl.rc_target_probertt_flight >= (segsiz * rack_timely_min_segs))
 		return;
 	/*
 	 * A window protocol needs a floor of rack_timely_min_segs
 	 * segments outstanding in order to function (especially
 	 * considering delayed ack).
 	 */
 	rack->r_ctl.rc_target_probertt_flight = (segsiz * rack_timely_min_segs);
 }
 
 /*
  * Enter probe-rtt.
  *
  * ProbeRTT in rack pacing resembles BBR in that a lowering of the
  * RTT is the signal that something new was seen, and the time since
  * that signal decides when to probe.  It differs in being much
  * simpler: the cwnd is not manipulated and we do not wait to drain
  * down to N segments and hold there for 200ms.  We only drop the
  * pacing percentage to a configured value (70 by default) and hold
  * that for a number of recent GP srtt's.
  *
  * Nothing happens when the dynamic multiplier is off or when there is
  * no outstanding data (snd_max == snd_una).
  */
 static void
 rack_enter_probertt(struct tcp_rack *rack, uint32_t us_cts)
 {
 	struct tcpcb *tp;
 	uint32_t segsiz;

 	tp = rack->rc_tp;
 	/* Dynamic multiplier not in use, or we are idle. */
 	if ((rack->rc_gp_dyn_mul == 0) || (tp->snd_max == tp->snd_una))
 		return;

 	if ((tp->t_flags & TF_GPUTINPROG) &&
 	    SEQ_GT(tp->snd_una, tp->gput_seq)) {
 		/*
 		 * Close out the running goodput measurement now.
 		 * Measurements taken with in_probe_rtt set won't
 		 * register unless they are greater, so collect
 		 * whatever information is available first.
 		 */
 		rack_do_goodput_measurement(tp, rack, tp->snd_una, __LINE__,
 					    RACK_QUALITY_PROBERTT);
 	}

 	/* Mark the state and remember where we started from. */
 	rack->in_probe_rtt = 1;
 	rack->measure_saw_probe_rtt = 1;
 	rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_probertt;
 	rack->r_ctl.rc_time_probertt_entered = us_cts;
 	rack->r_ctl.rc_lower_rtt_us_cts = us_cts;
 	rack->r_ctl.rc_time_probertt_starts = 0;
 	rack->r_ctl.rc_entry_gp_rtt = rack->r_ctl.rc_gp_srtt;

 	/* Flight target is built from either the min rtt or the gp srtt. */
 	segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs);
 	rack_set_prtt_target(rack, segsiz,
 	    rack_probertt_use_min_rtt_entry ?
 	    get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) :
 	    rack->r_ctl.rc_gp_srtt);

 	rack_log_rtt_shrinks(rack,  us_cts,  get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt),
 			     __LINE__, RACK_RTTS_ENTERPROBE);
 }
 
 /*
  * Leave probe-rtt.
  *
  * In order this routine:
  *  - clears in_probe_rtt,
  *  - finishes the goodput measurement in progress (or cancels it if
  *    snd_una has not moved past gput_seq),
  *  - marks the highest entry of the send map as app-limited,
  *  - optionally resets the timely increase/decrease counters,
  *  - optionally overwrites the CA/SS pacing multipliers,
  *  - zeroes rc_rtt_diff, t_bytes_acked and the CCF_ABC_SENTAWND flag,
  *  - optionally sets snd_cwnd/snd_ssthresh from a freshly computed
  *    probe-rtt flight target,
  *  - logs RACK_RTTS_EXITPROBE and finally resets the probe-rtt
  *    timestamps to 'us_cts'.
  *
  * 'us_cts' is the timestamp supplied by the caller and is used for
  * both the log record and the timestamp resets.
  */
 static void
 rack_exit_probertt(struct tcp_rack *rack, uint32_t us_cts)
 {
 	struct rack_sendmap *rsm;
 	uint32_t segsiz;

 	segsiz = min(ctf_fixed_maxseg(rack->rc_tp),
 		     rack->r_ctl.rc_pace_min_segs);
 	rack->in_probe_rtt = 0;
 	if ((rack->rc_tp->t_flags & TF_GPUTINPROG) &&
 	    SEQ_GT(rack->rc_tp->snd_una, rack->rc_tp->gput_seq)) {
 		/*
 		 * Stop the goodput now, the idea here is
 		 * that future measurements with in_probe_rtt
 		 * won't register if they are not greater so
 		 * we want to get what info (if any) is available
 		 * now.
 		 */
 		rack_do_goodput_measurement(rack->rc_tp, rack,
 					    rack->rc_tp->snd_una, __LINE__,
 					    RACK_QUALITY_PROBERTT);
 	} else if (rack->rc_tp->t_flags & TF_GPUTINPROG) {
 		/*
 		 * We don't have enough data to make a measurement.
 		 * So let's just stop and start here after exiting
 		 * probe-rtt. We probably are not interested in
 		 * the results anyway.
 		 */
 		rack->rc_tp->t_flags &= ~TF_GPUTINPROG;
 	}
 	/*
 	 * Measurements through the current snd_max are going
 	 * to be limited by the slower pacing rate.
 	 *
 	 * We need to mark these as app-limited so we
 	 * don't collapse the b/w.
 	 */
 	rsm = RB_MAX(rack_rb_tree_head, &rack->r_ctl.rc_mtree);
 	if (rsm && ((rsm->r_flags & RACK_APP_LIMITED) == 0)) {
 		if (rack->r_ctl.rc_app_limited_cnt == 0)
 			/* First app-limited entry: it is both head and tail. */
 			rack->r_ctl.rc_end_appl = rack->r_ctl.rc_first_appl = rsm;
 		else {
 			/*
 			 * Go out to the end app limited and mark
 			 * this new one as next and move the end_appl up
 			 * to this guy.
 			 */
 			if (rack->r_ctl.rc_end_appl)
 				rack->r_ctl.rc_end_appl->r_nseq_appl = rsm->r_start;
 			rack->r_ctl.rc_end_appl = rsm;
 		}
 		rsm->r_flags |= RACK_APP_LIMITED;
 		rack->r_ctl.rc_app_limited_cnt++;
 	}
 	/*
 	 * Now, we need to examine our pacing rate multipliers.
 	 *
 	 * If rack_probertt_clear_is is set, the timely increase and
 	 * decrease tracking is reset so we start counting afresh.
 	 */
 	if (rack_probertt_clear_is) {
 		rack->rc_gp_incr = 0;
 		rack->rc_gp_bwred = 0;
 		rack->rc_gp_timely_inc_cnt = 0;
 		rack->rc_gp_timely_dec_cnt = 0;
 	}
 	/*
 	 * Do we do any clamping at exit?  A highly buffered path uses
 	 * rack_atexit_prtt_hbp, any other path uses rack_atexit_prtt,
 	 * and the CA and SS multipliers are both set to that value.
 	 *
 	 * NOTE(review): an older comment here said a setting of 0 forces
 	 * CA/SS to 100% at exit; as written, a zero tunable leaves the
 	 * multipliers untouched -- confirm which is intended.
 	 */
 	if (rack->rc_highly_buffered && rack_atexit_prtt_hbp) {
 		rack->r_ctl.rack_per_of_gp_ca = rack_atexit_prtt_hbp;
 		rack->r_ctl.rack_per_of_gp_ss = rack_atexit_prtt_hbp;
 	}
 	if ((rack->rc_highly_buffered == 0) && rack_atexit_prtt) {
 		rack->r_ctl.rack_per_of_gp_ca = rack_atexit_prtt;
 		rack->r_ctl.rack_per_of_gp_ss = rack_atexit_prtt;
 	}
 	/*
 	 * Let's set rtt_diff to 0, so that we will get a "boost"
 	 * after exiting.
 	 */
 	rack->r_ctl.rc_rtt_diff = 0;

 	/* Clear all flags so we start fresh */
 	rack->rc_tp->t_bytes_acked = 0;
 	rack->rc_tp->ccv->flags &= ~CCF_ABC_SENTAWND;
 	/*
 	 * If configured to, set the cwnd and ssthresh to
 	 * our targets.
 	 */
 	if (rack_probe_rtt_sets_cwnd) {
 		uint64_t ebdp;
 		uint32_t setto;

 		/*
 		 * Set ssthresh so we get into CA once we hit our target.
 		 * rack_probertt_use_min_rtt_exit picks the rtt that the
 		 * flight target is rebuilt from.
 		 */
 		if (rack_probertt_use_min_rtt_exit == 1) {
 			/* Set to min rtt */
 			rack_set_prtt_target(rack, segsiz,
 					     get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt));
 		} else if (rack_probertt_use_min_rtt_exit == 2) {
 			/* Set to current gp rtt */
 			rack_set_prtt_target(rack, segsiz,
 					     rack->r_ctl.rc_gp_srtt);
 		} else if (rack_probertt_use_min_rtt_exit == 3) {
 			/* Set to entry gp rtt */
 			rack_set_prtt_target(rack, segsiz,
 					     rack->r_ctl.rc_entry_gp_rtt);
 		} else {
 			uint64_t sum;
 			uint32_t setval;

 			/*
 			 * Any other setting: pick based on the ratio of
 			 * the entry gp rtt to the current gp srtt, scaled
 			 * by 10 (so 20 means 2.0x and 15 means 1.5x).
 			 */
 			sum = rack->r_ctl.rc_entry_gp_rtt;
 			sum *= 10;
 			sum /= (uint64_t)(max(1, rack->r_ctl.rc_gp_srtt));
 			if (sum >= 20) {
 				/*
 				 * A highly buffered path needs
 				 * cwnd space for timely to work.
 				 * Let's set things up as if
 				 * we are heading back here again.
 				 */
 				setval = rack->r_ctl.rc_entry_gp_rtt;
 			} else if (sum >= 15) {
 				/*
 				 * Let's take the smaller of the
 				 * two since we are just somewhat
 				 * buffered.
 				 */
 				setval = rack->r_ctl.rc_gp_srtt;
 				if (setval > rack->r_ctl.rc_entry_gp_rtt)
 					setval = rack->r_ctl.rc_entry_gp_rtt;
 			} else {
 				/*
 				 * Here we are not highly buffered
 				 * and should pick the min we can to
 				 * keep from causing loss.
 				 */
 				setval = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt);
 			}
 			rack_set_prtt_target(rack, segsiz,
 					     setval);
 		}
 		if (rack_probe_rtt_sets_cwnd > 1) {
 			/*
 			 * There is a percentage here to boost: the target
 			 * plus rack_probe_rtt_sets_cwnd percent of it.
 			 */
 			ebdp = rack->r_ctl.rc_target_probertt_flight;
 			ebdp *= rack_probe_rtt_sets_cwnd;
 			ebdp /= 100;
 			setto = rack->r_ctl.rc_target_probertt_flight + ebdp;
 		} else
 			setto = rack->r_ctl.rc_target_probertt_flight;
 		rack->rc_tp->snd_cwnd = roundup(setto, segsiz);
 		if (rack->rc_tp->snd_cwnd < (segsiz * rack_timely_min_segs)) {
 			/* Enforce a min */
 			rack->rc_tp->snd_cwnd = segsiz * rack_timely_min_segs;
 		}
 		/* If we set in the cwnd also set the ssthresh point so we are in CA */
 		rack->rc_tp->snd_ssthresh = (rack->rc_tp->snd_cwnd - 1);
 	}
 	rack_log_rtt_shrinks(rack,  us_cts,
 			     get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt),
 			     __LINE__, RACK_RTTS_EXITPROBE);
 	/* Clear times last so log has all the info */
 	rack->r_ctl.rc_probertt_sndmax_atexit = rack->rc_tp->snd_max;
 	rack->r_ctl.rc_time_probertt_entered = us_cts;
 	rack->r_ctl.rc_time_probertt_starts = rack->r_ctl.rc_lower_rtt_us_cts = us_cts;
 	rack->r_ctl.rc_time_of_last_probertt = us_cts;
 }
 
 /*
  * Periodic probe-rtt check, driven with the caller's timestamp 'us_cts'.
  *
  * Nothing is done until a goodput measurement exists (rc_gp_filled).
  * When not in probe-rtt, it is entered once rack_time_between_probertt
  * has elapsed since rc_lower_rtt_us_cts.  When already in probe-rtt
  * this routine:
  *  - exits if we went idle for longer than rack_min_probertt_hold or
  *    the safety timer (rack_probe_rtt_safety_val) expired,
  *  - while the flight is still above the target, lowers the probe-rtt
  *    pacing percentage once per elapsed gp srtt (bounded below by
  *    rack_per_of_gp_lowthresh),
  *  - once the target is reached, stamps rc_time_probertt_starts and
  *    exits after the configured hold time has passed.
  */
 static void
 rack_check_probe_rtt(struct tcp_rack *rack, uint32_t us_cts)
 {
 	/* Check in on probe-rtt */
 	if (rack->rc_gp_filled == 0) {
 		/* We do not do p-rtt unless we have gp measurements */
 		return;
 	}
 	if (rack->in_probe_rtt) {
 		uint64_t no_overflow;
 		uint32_t endtime, must_stay;

 		/*
 		 * NOTE(review): neither branch below returns after calling
 		 * rack_exit_probertt(), so control falls through into the
 		 * drain/hold logic with the freshly reset timestamps --
 		 * confirm that this is intended.
 		 */
 		if (rack->r_ctl.rc_went_idle_time &&
 		    ((us_cts - rack->r_ctl.rc_went_idle_time) > rack_min_probertt_hold)) {
 			/*
 			 * We went idle during prtt, just exit now.
 			 */
 			rack_exit_probertt(rack, us_cts);
 		} else if (rack_probe_rtt_safety_val &&
 		    TSTMP_GT(us_cts, rack->r_ctl.rc_time_probertt_entered) &&
 		    ((us_cts - rack->r_ctl.rc_time_probertt_entered) > rack_probe_rtt_safety_val)) {
 			/*
 			 * Probe RTT safety value triggered!
 			 */
 			rack_log_rtt_shrinks(rack,  us_cts,
 					     get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt),
 					     __LINE__, RACK_RTTS_SAFETY);
 			rack_exit_probertt(rack, us_cts);
 		}
 		/* Calculate the max we will wait */
 		endtime = rack->r_ctl.rc_time_probertt_entered + (rack->r_ctl.rc_gp_srtt * rack_max_drain_wait);
 		if (rack->rc_highly_buffered)
 			endtime += (rack->r_ctl.rc_gp_srtt * rack_max_drain_hbp);
 		/* Calculate the min we must wait */
 		must_stay = rack->r_ctl.rc_time_probertt_entered + (rack->r_ctl.rc_gp_srtt * rack_must_drain);
 		if ((ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) > rack->r_ctl.rc_target_probertt_flight) &&
 		    TSTMP_LT(us_cts, endtime)) {
 			uint32_t calc;
 			/* Do we lower more? */
 no_exit:
 			/*
 			 * Also reached via goto from the "target not yet
 			 * reached" check below.  calc becomes the number of
 			 * whole gp srtt's spent in probe-rtt so far.
 			 */
 			if (TSTMP_GT(us_cts, rack->r_ctl.rc_time_probertt_entered))
 				calc = us_cts - rack->r_ctl.rc_time_probertt_entered;
 			else
 				calc = 0;
 			calc /= max(rack->r_ctl.rc_gp_srtt, 1);
 			if (calc) {
 				/* Maybe: reduce by a fixed step per elapsed srtt */
 				calc *= rack_per_of_gp_probertt_reduce;
 				rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_probertt - calc;
 				/* Limit it too */
 				if (rack->r_ctl.rack_per_of_gp_probertt < rack_per_of_gp_lowthresh)
 					rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_lowthresh;
 			}
 			/* We must reach target or the time set */
 			return;
 		}
 		if (rack->r_ctl.rc_time_probertt_starts == 0) {
 			/* The hold period has not been started yet. */
 			if ((TSTMP_LT(us_cts, must_stay) &&
 			     rack->rc_highly_buffered) ||
 			     (ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) >
 			      rack->r_ctl.rc_target_probertt_flight)) {
 				/*
 				 * We are not past the must_stay time (highly
 				 * buffered case) or the flight is still above
 				 * the target.
 				 */
 				goto no_exit;
 			}
 			rack_log_rtt_shrinks(rack,  us_cts,
 					     get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt),
 					     __LINE__, RACK_RTTS_REACHTARGET);
 			rack->r_ctl.rc_time_probertt_starts = us_cts;
 			/* Zero means "not started", so never store a 0 stamp. */
 			if (rack->r_ctl.rc_time_probertt_starts == 0)
 				rack->r_ctl.rc_time_probertt_starts = 1;
 			/* Restore back to our rate we want to pace at in prtt */
 			rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_probertt;
 		}
 		/*
 		 * Setup our end time: gp_srtt scaled by
 		 * rack_probertt_gpsrtt_cnt_mul / rack_probertt_gpsrtt_cnt_div
 		 * (0 if the divisor is 0) plus rack_min_probertt_hold,
 		 * counted from rc_time_probertt_starts.  The product is done
 		 * in 64 bits so it cannot overflow.
 		 */
 		no_overflow = ((uint64_t)rack->r_ctl.rc_gp_srtt *
 			       (uint64_t)rack_probertt_gpsrtt_cnt_mul);
 		if (rack_probertt_gpsrtt_cnt_div)
 			endtime = (uint32_t)(no_overflow / (uint64_t)rack_probertt_gpsrtt_cnt_div);
 		else
 			endtime = 0;
 		endtime += rack_min_probertt_hold;
 		endtime += rack->r_ctl.rc_time_probertt_starts;
 		if (TSTMP_GEQ(us_cts,  endtime)) {
 			/* yes, exit probertt */
 			rack_exit_probertt(rack, us_cts);
 		}
 
 	} else if ((us_cts - rack->r_ctl.rc_lower_rtt_us_cts) >= rack_time_between_probertt) {
 		/* Go into probertt, it's been too long since we went lower */
 		rack_enter_probertt(rack, us_cts);
 	}
 }
 
 /*
  * Adjust the goodput pacing multipliers after a measurement.
  *
  *   timely_says - verdict from the timely judgement: 0 asks for an
  *                 increase, 1 for a decrease and 2 means the rtt is at
  *                 or above the maximum target
  *   last_bw_est - bandwidth estimate compared against the bounds
  *   rtt         - rtt handed on to rack_decrease_bw_mul()
  *   rtt_diff    - rtt difference handed on to rack_decrease_bw_mul()
  *
  * Nothing is done unless dynamic multipliers are enabled, we always
  * pace, no fixed rate is in use and we are not in probe-rtt.  An upper
  * and lower bound are built around last_gp_comp_bw and the estimate is
  * classified as below, above or inside that range; inside the range
  * (or when pushing is given up) the timely verdict decides.
  */
 static void
 rack_update_multiplier(struct tcp_rack *rack, int32_t timely_says, uint64_t last_bw_est,
 		       uint32_t rtt, int32_t rtt_diff)
 {
 	uint64_t cur_bw, up_bnd, low_bnd, subfr;
 	uint32_t losses;

 	if ((rack->rc_gp_dyn_mul == 0) ||
 	    (rack->use_fixed_rate) ||
 	    (rack->in_probe_rtt) ||
 	    (rack->rc_always_pace == 0)) {
 		/* No dynamic GP multiplier in play */
 		return;
 	}
 	/* Difference between the loss counter now and rc_loss_at_start */
 	losses = rack->r_ctl.rc_loss_count - rack->r_ctl.rc_loss_at_start;
 	cur_bw = rack_get_bw(rack);
 	/*
 	 * Calculate our up and down range:
 	 * last_gp_comp_bw plus rack_gp_per_bw_mul_up percent and
 	 * last_gp_comp_bw minus rack_gp_per_bw_mul_down percent.
 	 */
 	up_bnd = rack->r_ctl.last_gp_comp_bw * (uint64_t)rack_gp_per_bw_mul_up;
 	up_bnd /= 100;
 	up_bnd += rack->r_ctl.last_gp_comp_bw;

 	subfr = (uint64_t)rack->r_ctl.last_gp_comp_bw * (uint64_t)rack_gp_per_bw_mul_down;
 	subfr /= 100;
 	low_bnd = rack->r_ctl.last_gp_comp_bw - subfr;
 	if ((timely_says == 2) && (rack->r_ctl.rc_no_push_at_mrtt)) {
 		/*
 		 * This is the case where our RTT is above
 		 * the max target and we have been configured
 		 * to just do timely no bonus up stuff in that case.
 		 *
 		 * There are two configurations, set to 1, and we
 		 * just do timely if we are over our max. If it's
 		 * set above 1 then we slam the multipliers down
 		 * to 100 and then decrement per timely.
 		 */
 		rack_log_timely(rack,  timely_says, cur_bw, low_bnd, up_bnd,
 				__LINE__, 3);
 		if (rack->r_ctl.rc_no_push_at_mrtt > 1)
 			rack_validate_multipliers_at_or_below_100(rack);
 		rack_decrease_bw_mul(rack, timely_says, rtt, rtt_diff);
 	} else if ((last_bw_est < low_bnd) && !losses) {
 		/*
 		 * We are decreasing; this is a bit complicated, it
 		 * means we are losing ground. This could be
 		 * because another flow entered and we are competing
 		 * for b/w with it. This will push the RTT up which
 		 * makes timely unusable unless we want to get shoved
 		 * into a corner and just be backed off (the age
 		 * old problem with delay based CC).
 		 *
 		 * On the other hand if it was a route change we
 		 * would like to stay somewhat contained and not
 		 * blow out the buffers.
 		 */
 		rack_log_timely(rack,  timely_says, cur_bw, low_bnd, up_bnd,
 				__LINE__, 3);
 		rack->r_ctl.last_gp_comp_bw = cur_bw;
 		if (rack->rc_gp_bwred == 0) {
 			/* Go into reduction counting */
 			rack->rc_gp_bwred = 1;
 			rack->rc_gp_timely_dec_cnt = 0;
 		}
 		if ((rack->rc_gp_timely_dec_cnt < rack_timely_max_push_drop) ||
 		    (timely_says == 0)) {
 			/*
 			 * Push another time with a faster pacing
 			 * to try to gain back (we include override to
 			 * get a full raise factor).
 			 */
 			if ((rack->rc_gp_saw_ca && rack->r_ctl.rack_per_of_gp_ca <= rack_down_raise_thresh) ||
 			    (rack->rc_gp_saw_ss && rack->r_ctl.rack_per_of_gp_ss <= rack_down_raise_thresh) ||
 			    (timely_says == 0) ||
 			    (rack_down_raise_thresh == 0)) {
 				/*
 				 * Do an override up in b/w if we were
 				 * below the threshold or if the threshold
 				 * is zero we always do the raise.
 				 */
 				rack_increase_bw_mul(rack, timely_says, cur_bw, last_bw_est, 1);
 			} else {
 				/* Log it stays the same */
 				rack_log_timely(rack,  0, last_bw_est, low_bnd, 0,
 						__LINE__, 11);
 			}
 			rack->rc_gp_timely_dec_cnt++;
 			/* We are not really incrementing, so don't count it */
 			rack->rc_gp_incr = 0;
 			rack->rc_gp_timely_inc_cnt = 0;
 		} else {
 			/*
 			 * Let's just use the RTT
 			 * information and give up
 			 * pushing.
 			 */
 			goto use_timely;
 		}
 	} else if ((timely_says != 2) &&
 		    !losses &&
 		    (last_bw_est > up_bnd)) {
 		/*
 		 * We are increasing b/w, let's keep going, updating
 		 * our b/w and ignoring any timely input, unless
 		 * of course we are at our max raise (if there is one).
 		 */

 		rack_log_timely(rack,  timely_says, cur_bw, low_bnd, up_bnd,
 				__LINE__, 3);
 		rack->r_ctl.last_gp_comp_bw = cur_bw;
 		if (rack->rc_gp_saw_ss &&
 		    rack_per_upper_bound_ss &&
 		     (rack->r_ctl.rack_per_of_gp_ss == rack_per_upper_bound_ss)) {
 			    /*
 			     * In cases where we can't go higher
 			     * we should just use timely.
 			     */
 			    goto use_timely;
 		}
 		if (rack->rc_gp_saw_ca &&
 		    rack_per_upper_bound_ca &&
 		    (rack->r_ctl.rack_per_of_gp_ca == rack_per_upper_bound_ca)) {
 			    /*
 			     * In cases where we can't go higher
 			     * we should just use timely.
 			     */
 			    goto use_timely;
 		}
 		rack->rc_gp_bwred = 0;
 		rack->rc_gp_timely_dec_cnt = 0;
 		/* You get a set number of pushes if timely is trying to reduce */
 		if ((rack->rc_gp_incr < rack_timely_max_push_rise) || (timely_says == 0)) {
 			rack_increase_bw_mul(rack, timely_says, cur_bw, last_bw_est, 0);
 		} else {
 			/* Log it stays the same */
 			rack_log_timely(rack,  0, last_bw_est, up_bnd, 0,
 			    __LINE__, 12);
 		}
 		return;
 	} else {
 		/*
 		 * We are staying between the lower and upper range bounds
 		 * so use timely to decide.
 		 */
 		rack_log_timely(rack,  timely_says, cur_bw, low_bnd, up_bnd,
 				__LINE__, 3);
 use_timely:
 		/*
 		 * Also the landing point for the branches above that gave
 		 * up pushing; those arrive here without the log call
 		 * just above.
 		 */
 		if (timely_says) {
 			/* Timely wants a decrease (1 or 2) */
 			rack->rc_gp_incr = 0;
 			rack->rc_gp_timely_inc_cnt = 0;
 			if ((rack->rc_gp_timely_dec_cnt < rack_timely_max_push_drop) &&
 			    !losses &&
 			    (last_bw_est < low_bnd)) {
 				/* We are losing ground */
 				rack_increase_bw_mul(rack, timely_says, cur_bw, last_bw_est, 0);
 				rack->rc_gp_timely_dec_cnt++;
 				/* We are not really incrementing, so don't count it */
 				rack->rc_gp_incr = 0;
 				rack->rc_gp_timely_inc_cnt = 0;
 			} else
 				rack_decrease_bw_mul(rack, timely_says, rtt, rtt_diff);
 		} else {
 			/* Timely wants an increase, leave reduction counting */
 			rack->rc_gp_bwred = 0;
 			rack->rc_gp_timely_dec_cnt = 0;
 			rack_increase_bw_mul(rack, timely_says, cur_bw, last_bw_est, 0);
 		}
 	}
 }
 
 /*
  * Decide, timely style, which way the b/w multiplier should move.
  *
  * Returns:
  *   2 - rtt is at or above min_rtt * rack_gp_rtt_maxmul, reduce
  *   0 - rtt is at or below min_rtt plus
  *       (min_rtt * rack_gp_rtt_minmul) / rack_gp_rtt_mindiv, or the
  *       rtt is between the two thresholds and rtt_diff is <= 0, increase
  *   1 - rtt is between the two thresholds and rtt_diff is > 0, reduce
  *
  * where min_rtt is the current value of the rc_gp_min_rtt filter.
  * Every outcome is logged through rack_log_timely() with its own
  * method number (4 through 7).
  */
 static int32_t
 rack_make_timely_judgement(struct tcp_rack *rack, uint32_t rtt, int32_t rtt_diff, uint32_t prev_rtt)
 {
 	uint64_t bound_and_prev, rtt_and_diff;

 	/* For the log: rtt in the upper 32 bits, rtt_diff in the lower. */
 	rtt_and_diff = ((uint64_t)rtt << 32) | (uint32_t)rtt_diff;

 	if (rtt >= (get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) *
 		    rack_gp_rtt_maxmul)) {
 		/* Above the max threshold, reduce the b/w multiplier */
 		bound_and_prev = (uint64_t)(get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack_gp_rtt_maxmul);
 		bound_and_prev = (bound_and_prev << 32) | prev_rtt;
 		rack_log_timely(rack,  2, bound_and_prev,
 				get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt),
 				rtt_and_diff, __LINE__, 4);
 		return (2);
 	}
 	if (rtt <= (get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) +
 		    ((get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack_gp_rtt_minmul) /
 		     max(rack_gp_rtt_mindiv , 1)))) {
 		/* Below the min threshold, increase the b/w multiplier */
 		bound_and_prev = (uint64_t)(get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) +
 			((get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack_gp_rtt_minmul) /
 			 max(rack_gp_rtt_mindiv , 1)));
 		bound_and_prev = (bound_and_prev << 32) | prev_rtt;
 		rack_log_timely(rack,  0, bound_and_prev,
 				get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt),
 				rtt_and_diff, __LINE__, 5);
 		return (0);
 	}
 	/*
 	 * Between the thresholds the gradient decides.  The timely
 	 * gradient is:
 	 *   grad = rc_rtt_diff / min_rtt;
 	 *
 	 * Zero or below indicates an increase, anything above zero a
 	 * decrease.  Only the sign matters here; the actual gradient
 	 * is calculated in the reduction (the increase has no use
 	 * for it).
 	 */
 	bound_and_prev = prev_rtt;
 	if (rtt_diff <= 0) {
 		/* Rtt is flat or shrinking, increase the b/w multiplier */
 		rack_log_timely(rack,  0, bound_and_prev,
 				get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), rtt_and_diff, __LINE__, 6);
 		return (0);
 	}
 	/* Rtt is growing, reduce the b/w multiplier */
 	rack_log_timely(rack,  1, bound_and_prev,
 			get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), rtt_and_diff, __LINE__, 7);
 	return (1);
 }
 
 static void
 rack_do_goodput_measurement(struct tcpcb *tp, struct tcp_rack *rack,
 			    tcp_seq th_ack, int line, uint8_t quality)
 {
 	uint64_t tim, bytes_ps, ltim, stim, utim;
 	uint32_t segsiz, bytes, reqbytes, us_cts;
 	int32_t gput, new_rtt_diff, timely_says;
 	uint64_t  resid_bw, subpart = 0, addpart = 0, srtt;
 	int did_add = 0;
 
 	us_cts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time);
 	segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs);
 	if (TSTMP_GEQ(us_cts, tp->gput_ts))
 		tim = us_cts - tp->gput_ts;
 	else
 		tim = 0;
 	if (rack->r_ctl.rc_gp_cumack_ts > rack->r_ctl.rc_gp_output_ts)
 		stim = rack->r_ctl.rc_gp_cumack_ts - rack->r_ctl.rc_gp_output_ts;
 	else
 		stim = 0;
 	/*
 	 * Use the larger of the send time or ack time. This prevents us
 	 * from being influenced by ack artifacts to come up with too
 	 * high of measurement. Note that since we are spanning over many more
 	 * bytes in most of our measurements hopefully that is less likely to
 	 * occur.
 	 */
 	if (tim > stim)
 		utim = max(tim, 1);
 	else
 		utim = max(stim, 1);
 	/* Lets get a msec time ltim too for the old stuff */
 	ltim = max(1, (utim / HPTS_USEC_IN_MSEC));
 	gput = (((uint64_t) (th_ack - tp->gput_seq)) << 3) / ltim;
 	reqbytes = min(rc_init_window(rack), (MIN_GP_WIN * segsiz));
 	if ((tim == 0) && (stim == 0)) {
 		/*
 		 * Invalid measurement time, maybe
 		 * all on one ack/one send?
 		 */
 		bytes = 0;
 		bytes_ps = 0;
 		rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes,
 					   0, 0, 0, 10, __LINE__, NULL, quality);
 		goto skip_measurement;
 	}
 	if (rack->r_ctl.rc_gp_lowrtt == 0xffffffff) {
 		/* We never made a us_rtt measurement? */
 		bytes = 0;
 		bytes_ps = 0;
 		rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes,
 					   0, 0, 0, 10, __LINE__, NULL, quality);
 		goto skip_measurement;
 	}
 	/*
 	 * Calculate the maximum possible b/w this connection
 	 * could have. We base our calculation on the lowest
 	 * rtt we have seen during the measurement and the
 	 * largest rwnd the client has given us in that time. This
 	 * forms a BDP that is the maximum that we could ever
 	 * get to the client. Anything larger is not valid.
 	 *
 	 * I originally had code here that rejected measurements
 	 * where the time was less than 1/2 the latest us_rtt.
 	 * But after thinking on that I realized its wrong since
 	 * say you had a 150Mbps or even 1Gbps link, and you
 	 * were a long way away.. example I am in Europe (100ms rtt)
 	 * talking to my 1Gbps link in S.C. Now measuring say 150,000
 	 * bytes my time would be 1.2ms, and yet my rtt would say
 	 * the measurement was invalid the time was < 50ms. The
 	 * same thing is true for 150Mb (8ms of time).
 	 *
 	 * A better way I realized is to look at what the maximum
 	 * the connection could possibly do. This is gated on
 	 * the lowest RTT we have seen and the highest rwnd.
 	 * We should in theory never exceed that, if we are
 	 * then something on the path is storing up packets
 	 * and then feeding them all at once to our endpoint
 	 * messing up our measurement.
 	 */
 	rack->r_ctl.last_max_bw = rack->r_ctl.rc_gp_high_rwnd;
 	rack->r_ctl.last_max_bw *= HPTS_USEC_IN_SEC;
 	rack->r_ctl.last_max_bw /= rack->r_ctl.rc_gp_lowrtt;
 	if (SEQ_LT(th_ack, tp->gput_seq)) {
 		/* No measurement can be made */
 		bytes = 0;
 		bytes_ps = 0;
 		rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes,
 					   0, 0, 0, 10, __LINE__, NULL, quality);
 		goto skip_measurement;
 	} else
 		bytes = (th_ack - tp->gput_seq);
 	bytes_ps = (uint64_t)bytes;
 	/*
 	 * Don't measure a b/w for pacing unless we have gotten at least
 	 * an initial windows worth of data in this measurement interval.
 	 *
 	 * Small numbers of bytes get badly influenced by delayed ack and
 	 * other artifacts. Note we take the initial window or our
 	 * defined minimum GP (defaulting to 10 which hopefully is the
 	 * IW).
 	 */
 	if (rack->rc_gp_filled == 0) {
 		/*
 		 * The initial estimate is special. We
 		 * have blasted out an IW worth of packets
 		 * without a real valid ack ts results. We
 		 * then setup the app_limited_needs_set flag,
 		 * this should get the first ack in (probably 2
 		 * MSS worth) to be recorded as the timestamp.
 		 * We thus allow a smaller number of bytes i.e.
 		 * IW - 2MSS.
 		 */
 		reqbytes -= (2 * segsiz);
 		/* Also lets fill previous for our first measurement to be neutral */
 		rack->r_ctl.rc_prev_gp_srtt = rack->r_ctl.rc_gp_srtt;
 	}
 	if ((bytes_ps < reqbytes) || rack->app_limited_needs_set) {
 		rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes,
 					   rack->r_ctl.rc_app_limited_cnt,
 					   0, 0, 10, __LINE__, NULL, quality);
 		goto skip_measurement;
 	}
 	/*
 	 * We now need to calculate the Timely like status so
 	 * we can update (possibly) the b/w multipliers.
 	 */
 	new_rtt_diff = (int32_t)rack->r_ctl.rc_gp_srtt - (int32_t)rack->r_ctl.rc_prev_gp_srtt;
 	if (rack->rc_gp_filled == 0) {
 		/* No previous reading */
 		rack->r_ctl.rc_rtt_diff = new_rtt_diff;
 	} else {
 		if (rack->measure_saw_probe_rtt == 0) {
 			/*
 			 * We don't want a probertt to be counted
 			 * since it will be negative incorrectly. We
 			 * expect to be reducing the RTT when we
 			 * pace at a slower rate.
 			 */
 			rack->r_ctl.rc_rtt_diff -= (rack->r_ctl.rc_rtt_diff / 8);
 			rack->r_ctl.rc_rtt_diff += (new_rtt_diff / 8);
 		}
 	}
 	timely_says = rack_make_timely_judgement(rack,
 		rack->r_ctl.rc_gp_srtt,
 		rack->r_ctl.rc_rtt_diff,
 	        rack->r_ctl.rc_prev_gp_srtt
 		);
 	bytes_ps *= HPTS_USEC_IN_SEC;
 	bytes_ps /= utim;
 	if (bytes_ps > rack->r_ctl.last_max_bw) {
 		/*
 		 * Something is on path playing
 		 * since this b/w is not possible based
 		 * on our BDP (highest rwnd and lowest rtt
 		 * we saw in the measurement window).
 		 *
 		 * Another option here would be to
 		 * instead skip the measurement.
 		 */
 		rack_log_pacing_delay_calc(rack, bytes, reqbytes,
 					   bytes_ps, rack->r_ctl.last_max_bw, 0,
 					   11, __LINE__, NULL, quality);
 		bytes_ps = rack->r_ctl.last_max_bw;
 	}
 	/* We store gp for b/w in bytes per second */
 	if (rack->rc_gp_filled == 0) {
 		/* Initial measurement */
 		if (bytes_ps) {
 			rack->r_ctl.gp_bw = bytes_ps;
 			rack->rc_gp_filled = 1;
 			rack->r_ctl.num_measurements = 1;
 			rack_set_pace_segments(rack->rc_tp, rack, __LINE__, NULL);
 		} else {
 			rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes,
 						   rack->r_ctl.rc_app_limited_cnt,
 						   0, 0, 10, __LINE__, NULL, quality);
 		}
 		if (tcp_in_hpts(rack->rc_inp) &&
 		    (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) {
 			/*
 			 * Ok we can't trust the pacer in this case
 			 * where we transition from un-paced to paced.
 			 * Or for that matter when the burst mitigation
 			 * was making a wild guess and got it wrong.
 			 * Stop the pacer and clear up all the aggregate
 			 * delays etc.
 			 */
 			tcp_hpts_remove(rack->rc_inp);
 			rack->r_ctl.rc_hpts_flags = 0;
 			rack->r_ctl.rc_last_output_to = 0;
 		}
 		did_add = 2;
 	} else if (rack->r_ctl.num_measurements < RACK_REQ_AVG) {
 		/* Still a small number run an average */
 		rack->r_ctl.gp_bw += bytes_ps;
 		addpart = rack->r_ctl.num_measurements;
 		rack->r_ctl.num_measurements++;
 		if (rack->r_ctl.num_measurements >= RACK_REQ_AVG) {
 			/* We have collected enough to move forward */
 			rack->r_ctl.gp_bw /= (uint64_t)rack->r_ctl.num_measurements;
 		}
 		did_add = 3;
 	} else {
 		/*
 		 * We want to take 1/wma of the goodput and add in to 7/8th
 		 * of the old value weighted by the srtt. So if your measurement
 		 * period is say 2 SRTT's long you would get 1/4 as the
 		 * value, if it was like 1/2 SRTT then you would get 1/16th.
 		 *
 		 * But we must be careful not to take too much i.e. if the
 		 * srtt is say 20ms and the measurement is taken over
 		 * 400ms our weight would be 400/20 i.e. 20. On the
 		 * other hand if we get a measurement over 1ms with a
 		 * 10ms rtt we only want to take a much smaller portion.
 		 */
 		if (rack->r_ctl.num_measurements < 0xff) {
 			rack->r_ctl.num_measurements++;
 		}
 		srtt = (uint64_t)tp->t_srtt;
 		if (srtt == 0) {
 			/*
 			 * Strange why did t_srtt go back to zero?
 			 */
 			if (rack->r_ctl.rc_rack_min_rtt)
 				srtt = rack->r_ctl.rc_rack_min_rtt;
 			else
 				srtt = HPTS_USEC_IN_MSEC;
 		}
 		/*
 		 * XXXrrs: Note for reviewers, in playing with
 		 * dynamic pacing I discovered this GP calculation
 		 * as done originally leads to some undesired results.
 		 * Basically you can get longer measurements contributing
 		 * too much to the WMA. Thus I changed it if you are doing
 		 * dynamic adjustments to only do the aportioned adjustment
 		 * if we have a very small (time wise) measurement. Longer
 		 * measurements just get there weight (defaulting to 1/8)
 		 * add to the WMA. We may want to think about changing
 		 * this to always do that for both sides i.e. dynamic
 		 * and non-dynamic... but considering lots of folks
 		 * were playing with this I did not want to change the
 		 * calculation per.se. without your thoughts.. Lawerence?
 		 * Peter??
 		 */
 		if (rack->rc_gp_dyn_mul == 0) {
 			subpart = rack->r_ctl.gp_bw * utim;
 			subpart /= (srtt * 8);
 			if (subpart < (rack->r_ctl.gp_bw / 2)) {
 				/*
 				 * The b/w update takes no more
 				 * away then 1/2 our running total
 				 * so factor it in.
 				 */
 				addpart = bytes_ps * utim;
 				addpart /= (srtt * 8);
 			} else {
 				/*
 				 * Don't allow a single measurement
 				 * to account for more than 1/2 of the
 				 * WMA. This could happen on a retransmission
 				 * where utim becomes huge compared to
 				 * srtt (multiple retransmissions when using
 				 * the sending rate which factors in all the
 				 * transmissions from the first one).
 				 */
 				subpart = rack->r_ctl.gp_bw / 2;
 				addpart = bytes_ps / 2;
 			}
 			resid_bw = rack->r_ctl.gp_bw - subpart;
 			rack->r_ctl.gp_bw = resid_bw + addpart;
 			did_add = 1;
 		} else {
 			if ((utim / srtt) <= 1) {
 				/*
 				 * The b/w update was over a small period
 				 * of time. The idea here is to prevent a small
 				 * measurement time period from counting
 				 * too much. So we scale it based on the
 				 * time so it attributes less than 1/rack_wma_divisor
 				 * of its measurement.
 				 */
 				subpart = rack->r_ctl.gp_bw * utim;
 				subpart /= (srtt * rack_wma_divisor);
 				addpart = bytes_ps * utim;
 				addpart /= (srtt * rack_wma_divisor);
 			} else {
 				/*
 				 * The scaled measurement was long
 				 * enough so lets just add in the
 				 * portion of the measurement i.e. 1/rack_wma_divisor
 				 */
 				subpart = rack->r_ctl.gp_bw / rack_wma_divisor;
 				addpart = bytes_ps / rack_wma_divisor;
 			}
 			if ((rack->measure_saw_probe_rtt == 0) ||
 		            (bytes_ps > rack->r_ctl.gp_bw)) {
 				/*
 				 * For probe-rtt we only add it in
 				 * if its larger, all others we just
 				 * add in.
 				 */
 				did_add = 1;
 				resid_bw = rack->r_ctl.gp_bw - subpart;
 				rack->r_ctl.gp_bw = resid_bw + addpart;
 			}
 		}
 	}
 	if ((rack->gp_ready == 0) &&
 	    (rack->r_ctl.num_measurements >= rack->r_ctl.req_measurements)) {
 		/* We have enough measurements now */
 		rack->gp_ready = 1;
 		rack_set_cc_pacing(rack);
 		if (rack->defer_options)
 			rack_apply_deferred_options(rack);
 	}
 	rack_log_pacing_delay_calc(rack, subpart, addpart, bytes_ps, stim,
 				   rack_get_bw(rack), 22, did_add, NULL, quality);
 	/* We do not update any multipliers if we are in or have seen a probe-rtt */
 	if ((rack->measure_saw_probe_rtt == 0) && rack->rc_gp_rtt_set)
 		rack_update_multiplier(rack, timely_says, bytes_ps,
 				       rack->r_ctl.rc_gp_srtt,
 				       rack->r_ctl.rc_rtt_diff);
 	rack_log_pacing_delay_calc(rack, bytes, tim, bytes_ps, stim,
 				   rack_get_bw(rack), 3, line, NULL, quality);
 	/* reset the gp srtt and setup the new prev */
 	rack->r_ctl.rc_prev_gp_srtt = rack->r_ctl.rc_gp_srtt;
 	/* Record the lost count for the next measurement */
 	rack->r_ctl.rc_loss_at_start = rack->r_ctl.rc_loss_count;
 	/*
 	 * We restart our diffs based on the gpsrtt in the
 	 * measurement window.
 	 */
 	rack->rc_gp_rtt_set = 0;
 	rack->rc_gp_saw_rec = 0;
 	rack->rc_gp_saw_ca = 0;
 	rack->rc_gp_saw_ss = 0;
 	rack->rc_dragged_bottom = 0;
 skip_measurement:
 
 #ifdef STATS
 	stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_GPUT,
 				 gput);
 	/*
 	 * XXXLAS: This is a temporary hack, and should be
 	 * chained off VOI_TCP_GPUT when stats(9) grows an
 	 * API to deal with chained VOIs.
 	 */
 	if (tp->t_stats_gput_prev > 0)
 		stats_voi_update_abs_s32(tp->t_stats,
 					 VOI_TCP_GPUT_ND,
 					 ((gput - tp->t_stats_gput_prev) * 100) /
 					 tp->t_stats_gput_prev);
 #endif
 	tp->t_flags &= ~TF_GPUTINPROG;
 	tp->t_stats_gput_prev = gput;
 	/*
 	 * Now are we app limited now and there is space from where we
 	 * were to where we want to go?
 	 *
 	 * We don't do the other case i.e. non-applimited here since
 	 * the next send will trigger us picking up the missing data.
 	 */
 	if (rack->r_ctl.rc_first_appl &&
 	    TCPS_HAVEESTABLISHED(tp->t_state) &&
 	    rack->r_ctl.rc_app_limited_cnt &&
 	    (SEQ_GT(rack->r_ctl.rc_first_appl->r_start, th_ack)) &&
 	    ((rack->r_ctl.rc_first_appl->r_end - th_ack) >
 	     max(rc_init_window(rack), (MIN_GP_WIN * segsiz)))) {
 		/*
 		 * Yep there is enough outstanding to make a measurement here.
 		 */
 		struct rack_sendmap *rsm, fe;
 
 		rack->r_ctl.rc_gp_lowrtt = 0xffffffff;
 		rack->r_ctl.rc_gp_high_rwnd = rack->rc_tp->snd_wnd;
 		tp->gput_ts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time);
 		rack->app_limited_needs_set = 0;
 		tp->gput_seq = th_ack;
 		if (rack->in_probe_rtt)
 			rack->measure_saw_probe_rtt = 1;
 		else if ((rack->measure_saw_probe_rtt) &&
 			 (SEQ_GEQ(tp->gput_seq, rack->r_ctl.rc_probertt_sndmax_atexit)))
 			rack->measure_saw_probe_rtt = 0;
 		if ((rack->r_ctl.rc_first_appl->r_end - th_ack) >= rack_get_measure_window(tp, rack)) {
 			/* There is a full window to gain info from */
 			tp->gput_ack = tp->gput_seq + rack_get_measure_window(tp, rack);
 		} else {
 			/* We can only measure up to the applimited point */
 			tp->gput_ack = tp->gput_seq + (rack->r_ctl.rc_first_appl->r_end - th_ack);
 			if ((tp->gput_ack - tp->gput_seq) < (MIN_GP_WIN * segsiz)) {
 				/*
 				 * We don't have enough to make a measurement.
 				 */
 				tp->t_flags &= ~TF_GPUTINPROG;
 				rack_log_pacing_delay_calc(rack, tp->gput_ack, tp->gput_seq,
 							   0, 0, 0, 6, __LINE__, NULL, quality);
 				return;
 			}
 		}
 		if (tp->t_state >= TCPS_FIN_WAIT_1) {
 			/*
 			 * We will get no more data into the SB
 			 * this means we need to have the data available
 			 * before we start a measurement.
 			 */
 			if (sbavail(&tp->t_inpcb->inp_socket->so_snd) < (tp->gput_ack - tp->gput_seq)) {
 				/* Nope not enough data. */
 				return;
 			}
 		}
 		tp->t_flags |= TF_GPUTINPROG;
 		/*
 		 * Now we need to find the timestamp of the send at tp->gput_seq
 		 * for the send based measurement.
 		 */
 		fe.r_start = tp->gput_seq;
 		rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe);
 		if (rsm) {
 			/* Ok send-based limit is set */
 			if (SEQ_LT(rsm->r_start, tp->gput_seq)) {
 				/*
 				 * Move back to include the earlier part
 				 * so our ack time lines up right (this may
 				 * make an overlapping measurement but thats
 				 * ok).
 				 */
 				tp->gput_seq = rsm->r_start;
 			}
 			if (rsm->r_flags & RACK_ACKED)
 				tp->gput_ts = (uint32_t)rsm->r_ack_arrival;
 			else
 				rack->app_limited_needs_set = 1;
 			rack->r_ctl.rc_gp_output_ts = rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)];
 		} else {
 			/*
 			 * If we don't find the rsm due to some
 			 * send-limit set the current time, which
 			 * basically disables the send-limit.
 			 */
 			struct timeval tv;
 
 			microuptime(&tv);
 			rack->r_ctl.rc_gp_output_ts = rack_to_usec_ts(&tv);
 		}
 		rack_log_pacing_delay_calc(rack,
 					   tp->gput_seq,
 					   tp->gput_ack,
 					   (uint64_t)rsm,
 					   tp->gput_ts,
 					   rack->r_ctl.rc_app_limited_cnt,
 					   9,
 					   __LINE__, NULL, quality);
 	}
 }
 
 /*
  * CC wrapper hook functions
  */
 /*
  * Per-ACK congestion control hook.
  *
  * Records in the CC state how many bytes this ACK newly covers
  * (optionally capped while in recovery), completes a pending goodput
  * measurement once enough data has been acked, maintains the
  * CWND-limited flag and the ABC byte accounting, and then hands the
  * ACK to the CC module's ack_received() method if it has one.
  * Afterwards it updates the r_must_retran / rc_out_at_rto bookkeeping
  * and remembers the largest cwnd_to_use seen.
  *
  * tp, rack - the connection and its rack state; the inp write lock is
  *            asserted.
  * th_ack   - cumulative ack value from the incoming segment.
  * nsegs    - segment count, stored in the CC state and used to bound
  *            the ABC increment.
  * type     - ack type passed through to the CC module.
  * recovery - non-zero selects the early-recovery byte cap and the
  *            post-recovery ABC limit handling.
  */
 static void
 rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack, uint32_t th_ack, uint16_t nsegs,
     uint16_t type, int32_t recovery)
 {
 	uint32_t prior_cwnd, acked;
 	struct tcp_log_buffer *lgb = NULL;
 	uint8_t labc_to_use, quality;
 
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 	tp->ccv->nsegs = nsegs;
 	/* 'acked' keeps the uncapped count; bytes_this_ack may be capped below. */
 	acked = tp->ccv->bytes_this_ack = (th_ack - tp->snd_una);
 	if ((recovery) && (rack->r_ctl.rc_early_recovery_segs)) {
 		uint32_t max;
 
 		/* Limit what the CC module sees to rc_early_recovery_segs segments. */
 		max = rack->r_ctl.rc_early_recovery_segs * ctf_fixed_maxseg(tp);
 		if (tp->ccv->bytes_this_ack > max) {
 			tp->ccv->bytes_this_ack = max;
 		}
 	}
 #ifdef STATS
 	stats_voi_update_abs_s32(tp->t_stats, VOI_TCP_CALCFRWINDIFF,
 	    ((int32_t)rack->r_ctl.cwnd_to_use) - tp->snd_wnd);
 #endif
 	quality = RACK_QUALITY_NONE;
 	if ((tp->t_flags & TF_GPUTINPROG) &&
 	    rack_enough_for_measurement(tp, rack, th_ack, &quality)) {
 		/* Measure the Goodput */
 		rack_do_goodput_measurement(tp, rack, th_ack, __LINE__, quality);
 #ifdef NETFLIX_PEAKRATE
 		if ((type == CC_ACK) &&
 		    (tp->t_maxpeakrate)) {
 			/*
 			 * We update t_peakrate_thr. This gives us roughly
 			 * one update per round trip time. Note
 			 * it will only be used if pace_always is off i.e
 			 * we don't do this for paced flows.
 			 */
 			rack_update_peakrate_thr(tp);
 		}
 #endif
 	}
 	/* Which way are we limited? If not cwnd limited there is no advance in CA */
 	if (tp->snd_cwnd <= tp->snd_wnd)
 		tp->ccv->flags |= CCF_CWND_LIMITED;
 	else
 		tp->ccv->flags &= ~CCF_CWND_LIMITED;
 	if (tp->snd_cwnd > tp->snd_ssthresh) {
 		/* Above ssthresh: accumulate acked bytes, bounded per segment. */
 		tp->t_bytes_acked += min(tp->ccv->bytes_this_ack,
 			 nsegs * V_tcp_abc_l_var * ctf_fixed_maxseg(tp));
 		/* For the setting of a window past use the actual scwnd we are using */
 		if (tp->t_bytes_acked >= rack->r_ctl.cwnd_to_use) {
 			tp->t_bytes_acked -= rack->r_ctl.cwnd_to_use;
 			tp->ccv->flags |= CCF_ABC_SENTAWND;
 		}
 	} else {
 		tp->ccv->flags &= ~CCF_ABC_SENTAWND;
 		tp->t_bytes_acked = 0;
 	}
 	prior_cwnd = tp->snd_cwnd;
 	/*
 	 * Use the connection's own rc_labc unless this is a recovery ACK
 	 * with a post-recovery ABC limit configured and neither the
 	 * r_use_labc_for_rec override nor the low client buffer condition
 	 * applies.
 	 */
 	if ((recovery == 0) || (rack_max_abc_post_recovery == 0) || rack->r_use_labc_for_rec ||
 	    (rack_client_low_buf && (rack->client_bufferlvl < rack_client_low_buf)))
 		labc_to_use = rack->rc_labc;
 	else
 		labc_to_use = rack_max_abc_post_recovery;
 	if (rack_verbose_logging && (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) {
 		union tcp_log_stackspecific log;
 		struct timeval tv;
 
 		memset(&log.u_bbr, 0, sizeof(log.u_bbr));
 		log.u_bbr.timeStamp = tcp_get_usecs(&tv);
 		log.u_bbr.flex1 = th_ack;
 		log.u_bbr.flex2 = tp->ccv->flags;
 		log.u_bbr.flex3 = tp->ccv->bytes_this_ack;
 		log.u_bbr.flex4 = tp->ccv->nsegs;
 		log.u_bbr.flex5 = labc_to_use;
 		log.u_bbr.flex6 = prior_cwnd;
 		log.u_bbr.flex7 = V_tcp_do_newsack;
 		log.u_bbr.flex8 = 1;
 		lgb = tcp_log_event_(tp, NULL, NULL, NULL, BBR_LOG_CWND, 0,
 				     0, &log, false, NULL, NULL, 0, &tv);
 	}
 	if (CC_ALGO(tp)->ack_received != NULL) {
 		/* XXXLAS: Find a way to live without this */
 		tp->ccv->curack = th_ack;
 		tp->ccv->labc = labc_to_use;
 		tp->ccv->flags |= CCF_USE_LOCAL_ABC;
 		CC_ALGO(tp)->ack_received(tp->ccv, type);
 	}
 	if (lgb) {
 		/* Replace the logged prior cwnd with the cwnd after the CC call. */
 		lgb->tlb_stackinfo.u_bbr.flex6 = tp->snd_cwnd;
 	}
 	if (rack->r_must_retran) {
 		if (SEQ_GEQ(th_ack, rack->r_ctl.rc_snd_max_at_rto)) {
 			/*
 			 * We now are beyond the rxt point so lets disable
 			 * the flag.
 			 */
 			rack->r_ctl.rc_out_at_rto = 0;
 			rack->r_must_retran = 0;
 		} else if ((prior_cwnd + ctf_fixed_maxseg(tp)) <= tp->snd_cwnd) {
 			/*
 			 * Only decrement the rc_out_at_rto if the cwnd advances
 			 * at least a whole segment. Otherwise next time the peer
 			 * acks, we won't be able to send. This generally happens
 			 * when we are in Congestion Avoidance.
 			 */
 			if (acked <= rack->r_ctl.rc_out_at_rto){
 				rack->r_ctl.rc_out_at_rto -= acked;
 			} else {
 				rack->r_ctl.rc_out_at_rto = 0;
 			}
 		}
 	}
 #ifdef STATS
 	stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_LCWIN, rack->r_ctl.cwnd_to_use);
 #endif
 	if (rack->r_ctl.rc_rack_largest_cwnd < rack->r_ctl.cwnd_to_use) {
 		rack->r_ctl.rc_rack_largest_cwnd = rack->r_ctl.cwnd_to_use;
 	}
 #ifdef NETFLIX_PEAKRATE
 	/* we enforce max peak rate if it is set and we are not pacing */
 	if ((rack->rc_always_pace == 0) &&
 	    tp->t_peakrate_thr &&
 	    (tp->snd_cwnd > tp->t_peakrate_thr)) {
 		tp->snd_cwnd = tp->t_peakrate_thr;
 	}
 #endif
 }
 
 /*
  * Partial-ack handling: decide whether the output path should be asked
  * to run.  Output is wanted when PRR has send credit available, or when
  * PRR is disabled for this connection.
  */
 static void
 tcp_rack_partialack(struct tcpcb *tp)
 {
 	struct tcp_rack *rk = (struct tcp_rack *)tp->t_fb_ptr;
 
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 	/* PRR is active but has nothing to send: leave r_wanted_output alone. */
 	if ((rk->r_ctl.rc_prr_sndcnt <= 0) && (rk->rack_no_prr == 0))
 		return;
 	rk->r_wanted_output = 1;
 }
 
 /*
  * Leave loss recovery.
  *
  * Runs the CC module's post_recovery() method when it has one and then
  * raises cwnd to ssthresh if the module left it lower.  When PRR is in
  * use and add-back is allowed, unsent PRR credit (bounded by
  * rack_prr_addbackmax segments) is added back to cwnd provided the
  * connection is not application limited.  Finally snd_recover is reset
  * to snd_una, the DSACK persistence counter is aged and the recovery
  * flags are cleared.
  *
  * th_ack - ack value stored in ccv->curack for the CC module and logged.
  */
 static void
 rack_post_recovery(struct tcpcb *tp, uint32_t th_ack)
 {
 	struct tcp_rack *rack;
 	uint32_t orig_cwnd;
 
 	/* Snapshot cwnd before anything below can modify it (used for logging). */
 	orig_cwnd = tp->snd_cwnd;
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 	rack = (struct tcp_rack *)tp->t_fb_ptr;
 	/* only alert CC if we alerted when we entered */
 	if (CC_ALGO(tp)->post_recovery != NULL) {
 		tp->ccv->curack = th_ack;
 		CC_ALGO(tp)->post_recovery(tp->ccv);
 		if (tp->snd_cwnd < tp->snd_ssthresh) {
 			/*
 			 * Rack has burst control and pacing
 			 * so lets not set this any lower than
 			 * snd_ssthresh per RFC-6582 (option 2).
 			 */
 			tp->snd_cwnd = tp->snd_ssthresh;
 		}
 	}
 	if (rack_verbose_logging && (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) {
 		union tcp_log_stackspecific log;
 		struct timeval tv;
 
 		memset(&log.u_bbr, 0, sizeof(log.u_bbr));
 		log.u_bbr.timeStamp = tcp_get_usecs(&tv);
 		log.u_bbr.flex1 = th_ack;
 		log.u_bbr.flex2 = tp->ccv->flags;
 		log.u_bbr.flex3 = tp->ccv->bytes_this_ack;
 		log.u_bbr.flex4 = tp->ccv->nsegs;
 		log.u_bbr.flex5 = V_tcp_abc_l_var;
 		log.u_bbr.flex6 = orig_cwnd;
 		log.u_bbr.flex7 = V_tcp_do_newsack;
 		log.u_bbr.pkts_out = rack->r_ctl.rc_prr_sndcnt;
 		log.u_bbr.flex8 = 2;
 		tcp_log_event_(tp, NULL, NULL, NULL, BBR_LOG_CWND, 0,
 			       0, &log, false, NULL, NULL, 0, &tv);
 	}
 	if ((rack->rack_no_prr == 0) &&
 	    (rack->no_prr_addback == 0) &&
 	    (rack->r_ctl.rc_prr_sndcnt > 0)) {
 		/*
 		 * Suck the next prr cnt back into cwnd, but
 		 * only do that if we are not application limited.
 		 */
 		if (ctf_outstanding(tp) <= sbavail(&(tp->t_inpcb->inp_socket->so_snd))) {
 			/*
 			 * We are allowed to add back to the cwnd the amount we did
 			 * not get out if:
 			 * a) no_prr_addback is off.
 			 * b) we are not app limited
 			 * c) we are doing prr
 			 * <and>
 			 * d) it is bounded by rack_prr_addbackmax (if addback is 0, then none).
 			 */
 			tp->snd_cwnd += min((ctf_fixed_maxseg(tp) * rack_prr_addbackmax),
 					    rack->r_ctl.rc_prr_sndcnt);
 		}
 		/* The PRR credit is dropped whether or not it was added back. */
 		rack->r_ctl.rc_prr_sndcnt = 0;
 		rack_log_to_prr(rack, 1, 0, __LINE__);
 	}
 	rack_log_to_prr(rack, 14, orig_cwnd, __LINE__);
 	tp->snd_recover = tp->snd_una;
 	if (rack->r_ctl.dsack_persist) {
 		/* Age the DSACK state; forget the DSACK count once it reaches zero. */
 		rack->r_ctl.dsack_persist--;
 		if (rack->r_ctl.num_dsack && (rack->r_ctl.dsack_persist == 0)) {
 			rack->r_ctl.num_dsack = 0;
 		}
 		rack_log_dsack_event(rack, 1, __LINE__, 0, 0);
 	}
 	EXIT_RECOVERY(tp->t_flags);
 }
 
 /*
  * Apply a congestion signal of the given type to the connection.
  *
  * The switch performs the stack-local state changes for each signal
  * type.  Every type except CC_RTO is then forwarded to the CC module's
  * cong_signal() method when it has one.  If the connection was not in
  * recovery on entry but is in recovery afterwards, the cwnd and
  * ssthresh seen on entry are saved in the rack control block.
  *
  * type - one of the CC_* signal types handled in the switch below.
  * ack  - stored in ccv->curack for the CC module; also compared with
  *        snd_recover in the CC_ECN case.
  * line - caller's line number, used only for logging.
  */
 static void
 rack_cong_signal(struct tcpcb *tp, uint32_t type, uint32_t ack, int line)
 {
 	struct tcp_rack *rack;
 	uint32_t ssthresh_enter, cwnd_enter, in_rec_at_entry, orig_cwnd;
 
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 #ifdef STATS
 	stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_CSIG, type);
 #endif
 	/*
 	 * ssthresh_enter/cwnd_enter are only set (and only read later)
 	 * when we were not already in recovery on entry.
 	 */
 	if (IN_RECOVERY(tp->t_flags) == 0) {
 		in_rec_at_entry = 0;
 		ssthresh_enter = tp->snd_ssthresh;
 		cwnd_enter = tp->snd_cwnd;
 	} else
 		in_rec_at_entry = 1;
 	rack = (struct tcp_rack *)tp->t_fb_ptr;
 	switch (type) {
 	case CC_NDUPACK:
 		/*
 		 * Clear the saved was-in-recovery flags; if not already in
 		 * fast recovery reset the PRR counters and move the recovery
 		 * point to snd_max.
 		 */
 		tp->t_flags &= ~TF_WASFRECOVERY;
 		tp->t_flags &= ~TF_WASCRECOVERY;
 		if (!IN_FASTRECOVERY(tp->t_flags)) {
 			rack->r_ctl.rc_prr_delivered = 0;
 			rack->r_ctl.rc_prr_out = 0;
 			if (rack->rack_no_prr == 0) {
 				rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp);
 				rack_log_to_prr(rack, 2, in_rec_at_entry, line);
 			}
 			rack->r_ctl.rc_prr_recovery_fs = tp->snd_max - tp->snd_una;
 			tp->snd_recover = tp->snd_max;
 			if (tp->t_flags2 & TF2_ECN_PERMIT)
 				tp->t_flags2 |= TF2_ECN_SND_CWR;
 		}
 		break;
 	case CC_ECN:
 		if (!IN_CONGRECOVERY(tp->t_flags) ||
 		    /*
 		     * Allow ECN reaction on ACK to CWR, if
 		     * that data segment was also CE marked.
 		     */
 		    SEQ_GEQ(ack, tp->snd_recover)) {
 			EXIT_CONGRECOVERY(tp->t_flags);
 			KMOD_TCPSTAT_INC(tcps_ecn_rcwnd);
 			tp->snd_recover = tp->snd_max + 1;
 			if (tp->t_flags2 & TF2_ECN_PERMIT)
 				tp->t_flags2 |= TF2_ECN_SND_CWR;
 		}
 		break;
 	case CC_RTO:
 		/*
 		 * Leave recovery, set ssthresh to half of the smaller of the
 		 * send window and cwnd_to_use (in whole segments, at least
 		 * two) and collapse cwnd to one segment.  This type is not
 		 * forwarded to the CC module below.
 		 */
 		tp->t_dupacks = 0;
 		tp->t_bytes_acked = 0;
 		EXIT_RECOVERY(tp->t_flags);
 		tp->snd_ssthresh = max(2, min(tp->snd_wnd, rack->r_ctl.cwnd_to_use) / 2 /
 		    ctf_fixed_maxseg(tp)) * ctf_fixed_maxseg(tp);
 		orig_cwnd = tp->snd_cwnd;
 		tp->snd_cwnd = ctf_fixed_maxseg(tp);
 		rack_log_to_prr(rack, 16, orig_cwnd, line);
 		if (tp->t_flags2 & TF2_ECN_PERMIT)
 			tp->t_flags2 |= TF2_ECN_SND_CWR;
 		break;
 	case CC_RTO_ERR:
 		KMOD_TCPSTAT_INC(tcps_sndrexmitbad);
 		/* RTO was unnecessary, so reset everything. */
 		tp->snd_cwnd = tp->snd_cwnd_prev;
 		tp->snd_ssthresh = tp->snd_ssthresh_prev;
 		tp->snd_recover = tp->snd_recover_prev;
 		/* Re-enter whichever recovery states were saved in the WAS* flags. */
 		if (tp->t_flags & TF_WASFRECOVERY) {
 			ENTER_FASTRECOVERY(tp->t_flags);
 			tp->t_flags &= ~TF_WASFRECOVERY;
 		}
 		if (tp->t_flags & TF_WASCRECOVERY) {
 			ENTER_CONGRECOVERY(tp->t_flags);
 			tp->t_flags &= ~TF_WASCRECOVERY;
 		}
 		tp->snd_nxt = tp->snd_max;
 		tp->t_badrxtwin = 0;
 		break;
 	}
 	if ((CC_ALGO(tp)->cong_signal != NULL)  &&
 	    (type != CC_RTO)){
 		tp->ccv->curack = ack;
 		CC_ALGO(tp)->cong_signal(tp->ccv, type);
 	}
 	if ((in_rec_at_entry == 0) && IN_RECOVERY(tp->t_flags)) {
 		/* This call moved us into recovery: record the entry state. */
 		rack_log_to_prr(rack, 15, cwnd_enter, line);
 		rack->r_ctl.dsack_byte_cnt = 0;
 		rack->r_ctl.retran_during_recovery = 0;
 		rack->r_ctl.rc_cwnd_at_erec = cwnd_enter;
 		rack->r_ctl.rc_ssthresh_at_erec = ssthresh_enter;
 		rack->r_ent_rec_ns = 1;
 	}
 }
 
 /*
  * Restart after an idle period: give the CC module its after_idle()
  * callback (when implemented) and then make sure cwnd is not left
  * below the window we would start a connection with.
  */
 static inline void
 rack_cc_after_idle(struct tcp_rack *rack, struct tcpcb *tp)
 {
 	uint32_t floor_cwnd;
 
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 
 #ifdef NETFLIX_STATS
 	KMOD_TCPSTAT_INC(tcps_idle_restarts);
 	if (tp->t_state == TCPS_ESTABLISHED)
 		KMOD_TCPSTAT_INC(tcps_idle_estrestarts);
 #endif
 	if (CC_ALGO(tp)->after_idle != NULL)
 		CC_ALGO(tp)->after_idle(tp->ccv);
 
 	/*
 	 * Pick the lower bound for cwnd.  A cwnd of exactly 1 marks a
 	 * lost SYN(-ACK), in which case a single segment is used rather
 	 * than the full initial window.
 	 */
 	if (tp->snd_cwnd != 1)
 		floor_cwnd = rc_init_window(rack);
 	else
 		floor_cwnd = tp->t_maxseg;
 
 	/*
 	 * Idle restart is treated like a fresh start: if the CC module
 	 * clamped cwnd under the bound, lift it back up.
 	 */
 	if (floor_cwnd > tp->snd_cwnd)
 		tp->snd_cwnd = floor_cwnd;
 }
 
 /*
  * Indicate whether this ack should be delayed.  We can delay the ack if
  * following conditions are met:
  *	- Our last ack wasn't a 0-sized window. We never want to delay
  *	  the ack that opens up a 0-sized window.
  *	- There is no delayed ack timer in progress.
  *	- LRO wasn't used for this segment. We make sure by checking that the
  *	  segment size is not larger than the MSS.
  *	- Delayed acks are enabled or this is a half-synchronized T/TCP
  *	  connection.
  *
  * tp is a pointer to the tcpcb and tlen is the segment length.  Both
  * arguments are parenthesized in the expansion so arbitrary expressions
  * may be passed; note that tp is still evaluated more than once, so it
  * must not have side effects.
  */
 #define DELAY_ACK(tp, tlen)				 \
 	((((tp)->t_flags & TF_RXWIN0SENT) == 0) &&	 \
 	(((tp)->t_flags & TF_DELACK) == 0) &&		 \
 	((tlen) <= (tp)->t_maxseg) &&			 \
 	((tp)->t_delayed_ack || ((tp)->t_flags & TF_NEEDSYN)))
 
 /*
  * Return the first entry on the time-ordered transmit list that has
  * not been acked, i.e. the outstanding send that was transmitted the
  * longest time ago.  Returns NULL when every entry is acked or the
  * list is empty.
  */
 static struct rack_sendmap *
 rack_find_lowest_rsm(struct tcp_rack *rack)
 {
 	struct rack_sendmap *cur;
 
 	TAILQ_FOREACH(cur, &rack->r_ctl.rc_tmap, r_tnext) {
 		/* Stop at the first entry without the ACKED flag. */
 		if ((cur->r_flags & RACK_ACKED) == 0)
 			break;
 	}
 	/* TAILQ_FOREACH leaves the cursor NULL when the walk runs off the end. */
 	return (cur);
 }
 
 /*
  * Starting at rsm, step backward through the sequence-ordered tree and
  * return the first entry that is neither acked nor carrying a FIN,
  * i.e. the highest sequence still un-acked at or below rsm.  Returns
  * NULL if no such entry exists.
  */
 static struct rack_sendmap *
 rack_find_high_nonack(struct tcp_rack *rack, struct rack_sendmap *rsm)
 {
 	struct rack_sendmap *cand = rsm;
 
 	RB_FOREACH_REVERSE_FROM(cand, rack_rb_tree_head, rsm) {
 		if ((cand->r_flags & (RACK_ACKED | RACK_HAS_FIN)) == 0)
 			return (cand);
 	}
 	return (NULL);
 }
 
 /*
  * Compute the RACK timer threshold from the supplied srtt.
  *
  * The base is srtt plus either the configured packet delay or, in the
  * standards based mode, srtt/4.  Extra time is added when reordering
  * has been seen, and more again when DSACKs have been counted.  The
  * result never exceeds 2 * srtt or rack_rto_max.
  *
  * cts is the current timestamp, used to decide whether a previously
  * seen reordering event has faded.
  */
 static uint32_t
 rack_calc_thresh_rack(struct tcp_rack *rack, uint32_t srtt, uint32_t cts)
 {
 	int32_t reorder = 0;
 	uint32_t thresh;
 
 	if (srtt == 0)
 		srtt = 1;
 	/*
 	 * 'reorder' ends up non-zero when reordering is considered
 	 * present.  With no reorder-fade configured, a single reordering
 	 * event marks the connection for good.  With a fade configured,
 	 * the event only counts until more than rc_reorder_fade has
 	 * elapsed since it was recorded; after that the record is cleared.
 	 */
 	if (rack->r_ctl.rc_reorder_ts != 0) {
 		if (rack->r_ctl.rc_reorder_fade == 0) {
 			/* Reordering never fades. */
 			reorder = 1;
 		} else {
 			if (SEQ_GEQ(cts, rack->r_ctl.rc_reorder_ts)) {
 				reorder = cts - rack->r_ctl.rc_reorder_ts;
 				/* Zero elapsed time still counts as reordering. */
 				if (reorder == 0)
 					reorder = 1;
 			}
 			/* Otherwise time appears negative: stays 0. */
 			if (reorder > rack->r_ctl.rc_reorder_fade) {
 				/* Faded: forget the reordering event too. */
 				rack->r_ctl.rc_reorder_ts = 0;
 				reorder = 0;
 			}
 		}
 	}
 	if (rack->rc_rack_tmr_std_based) {
 		/* Standards based packet delay is 1/4 srtt. */
 		thresh = srtt + (srtt >> 2);
 	} else {
 		thresh = srtt + rack->r_ctl.rc_pkt_delay;
 		if (reorder) {
 			/* Without a configured shift the addition is 1/4 srtt. */
 			if (rack->r_ctl.rc_reorder_shift)
 				thresh += (srtt >> rack->r_ctl.rc_reorder_shift);
 			else
 				thresh += (srtt >> 2);
 		}
 	}
 	if (rack->rc_rack_use_dsack && reorder && (rack->r_ctl.num_dsack > 0)) {
 		/*
 		 * The reordering window only grows when reordering was
 		 * seen <and> a DSACK count exists.
 		 */
 		thresh += rack->r_ctl.num_dsack * (srtt >> 2);
 		rack_log_dsack_event(rack, 4, __LINE__, srtt, thresh);
 	}
 	/* Ceiling of twice the srtt */
 	if (thresh > (srtt * 2))
 		thresh = srtt * 2;
 	/* ... and never beyond the maximum RTO */
 	if (thresh > rack_rto_max)
 		thresh = rack_rto_max;
 	rack_log_dsack_event(rack, 6, __LINE__, srtt, thresh);
 	return (thresh);
 }
 
 /*
  * Compute the tail-loss-probe threshold for rsm from the supplied srtt.
  *
  * The base value is srtt + srtt / rc_tlp_threshold when a TLP threshold
  * is configured, otherwise 2 * srtt.  Depending on rack_tlp_threshold_use
  * it is then adjusted:
  *  - TLP_USE_ID:      if at most one segment's worth of data is
  *                     outstanding, use at least 1.5 * srtt plus the
  *                     delayed-ack time.
  *  - TLP_USE_TWO_ONE: if rsm is no larger than a segment and has a
  *                     predecessor on the transmit list, add the gap
  *                     between the two last transmissions; with no
  *                     predecessor apply the delayed-ack compensation.
  *  - TLP_USE_TWO_TWO: if rsm is no larger than a segment apply the
  *                     delayed-ack compensation.
  * The result is capped at t_rxtcur and rack_rto_max and then raised to
  * at least rack_tlp_min.
  *
  * rsm must not be NULL.
  */
 static uint32_t
 rack_calc_thresh_tlp(struct tcpcb *tp, struct tcp_rack *rack,
 		     struct rack_sendmap *rsm, uint32_t srtt)
 {
 	struct rack_sendmap *prsm;
 	uint32_t thresh, len;
 	int segsiz;
 
 	if (srtt == 0)
 		srtt = 1;
 	if (rack->r_ctl.rc_tlp_threshold)
 		thresh = srtt + (srtt / rack->r_ctl.rc_tlp_threshold);
 	else
 		thresh = (srtt * 2);
 
 	segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs);
 	len = rsm->r_end - rsm->r_start;
 	if (rack->rack_tlp_threshold_use == TLP_USE_ID) {
 		/* Exactly like the ID */
 		if (((tp->snd_max - tp->snd_una) - rack->r_ctl.rc_sacked + rack->r_ctl.rc_holes_rxt) <= segsiz) {
 			uint32_t alt_thresh;
 			/*
 			 * Compensate for delayed-ack with the d-ack time.
 			 */
 			alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time;
 			if (alt_thresh > thresh)
 				thresh = alt_thresh;
 		}
 	} else if (rack->rack_tlp_threshold_use == TLP_USE_TWO_ONE) {
 		/* 2.1 behavior */
 		/* Get the previous sent packet, if any */
 		prsm = TAILQ_PREV(rsm, rack_head, r_tnext);
 		if (prsm && (len <= segsiz)) {
 			/*
 			 * Two packets outstanding, thresh should be (2*srtt) +
 			 * possible inter-packet delay (if any).
 			 */
 			uint32_t inter_gap = 0;
 			int idx, nidx;
 
 			/*
 			 * idx selects rsm's most recent transmission and
 			 * nidx selects prsm's.  Each index must only be
 			 * applied to its own entry: the check below guards
 			 * the subtraction against going negative, so it has
 			 * to compare exactly the two values being subtracted.
 			 */
 			idx = rsm->r_rtr_cnt - 1;
 			nidx = prsm->r_rtr_cnt - 1;
 			if (rsm->r_tim_lastsent[idx] >= prsm->r_tim_lastsent[nidx]) {
 				/* Yes it was sent later (or at the same time) */
 				inter_gap = rsm->r_tim_lastsent[idx] - prsm->r_tim_lastsent[nidx];
 			}
 			thresh += inter_gap;
 		} else if (len <= segsiz) {
 			/*
 			 * Possibly compensate for delayed-ack.
 			 */
 			uint32_t alt_thresh;
 
 			alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time;
 			if (alt_thresh > thresh)
 				thresh = alt_thresh;
 		}
 	} else if (rack->rack_tlp_threshold_use == TLP_USE_TWO_TWO) {
 		/* 2.2 behavior */
 		if (len <= segsiz) {
 			uint32_t alt_thresh;
 			/*
 			 * Compensate for delayed-ack with the d-ack time.
 			 */
 			alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time;
 			if (alt_thresh > thresh)
 				thresh = alt_thresh;
 		}
 	}
 	/* Not above an RTO */
 	if (thresh > tp->t_rxtcur) {
 		thresh = tp->t_rxtcur;
 	}
 	/* Not above a RTO max */
 	if (thresh > rack_rto_max) {
 		thresh = rack_rto_max;
 	}
 	/* Apply user supplied min TLP */
 	if (thresh < rack_tlp_min) {
 		thresh = rack_tlp_min;
 	}
 	return (thresh);
 }
 
 /*
  * Return the RTT value to base timers on, in order of preference:
  * the last measured rack RTT, then the smoothed RTT, and finally
  * RACK_INITIAL_RTO when neither has been set yet.
  */
 static uint32_t
 rack_grab_rtt(struct tcpcb *tp, struct tcp_rack *rack)
 {
 	if (rack->rc_rack_rtt != 0)
 		return (rack->rc_rack_rtt);
 	if (tp->t_srtt != 0)
 		return (tp->t_srtt);
 	return (RACK_INITIAL_RTO);
 }
 
 /*
  * Decide whether the connection has to drop into recovery: that is the
  * case when the oldest un-acked transmission has been outstanding for
  * at least the RACK threshold.  When it has, a CC_NDUPACK congestion
  * signal is raised and that send-map entry is returned; otherwise NULL.
  *
  * tsused is the current timestamp to measure against.
  */
 static struct rack_sendmap *
 rack_check_recovery_mode(struct tcpcb *tp, uint32_t tsused)
 {
 	struct tcp_rack *rack = (struct tcp_rack *)tp->t_fb_ptr;
 	struct rack_sendmap *oldest;
 	uint32_t rtt, limit, sent_at;
 	int32_t last;
 
 	if (RB_EMPTY(&rack->r_ctl.rc_mtree))
 		return (NULL);
 
 	/* Oldest transmission; skip forward if that one is already acked. */
 	oldest = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
 	if ((oldest != NULL) && (oldest->r_flags & RACK_ACKED))
 		oldest = rack_find_lowest_rsm(rack);
 	if (oldest == NULL)
 		return (NULL);
 
 	last = oldest->r_rtr_cnt - 1;
 	rtt = rack_grab_rtt(tp, rack);
 	limit = rack_calc_thresh_rack(rack, rtt, tsused);
 	sent_at = (uint32_t)oldest->r_tim_lastsent[last];
 	/* Sent "after" now, or not yet outstanding long enough: nothing to do. */
 	if (TSTMP_LT(tsused, sent_at) || ((tsused - sent_at) < limit))
 		return (NULL);
 
 	/* Over-due, so this entry may be retransmitted. */
 	rack_cong_signal(tp, CC_NDUPACK, tp->snd_una, __LINE__);
 	return (oldest);
 }
 
 /*
  * Compute the persist timer value: (srtt + 4 * rttvar) scaled by the
  * current backoff multiplier and range-limited by RACK_TCPT_RANGESET
  * using rack_persist_min, rack_persist_max and the timer slop.  Also
  * marks the persist timer in rc_hpts_flags.
  */
 static uint32_t
 rack_get_persists_timer_val(struct tcpcb *tp, struct tcp_rack *rack)
 {
 	int32_t base = (tp->t_srtt + (tp->t_rttvar << 2));
 	int32_t clamped;
 
 	RACK_TCPT_RANGESET(clamped, base * tcp_backoff[tp->t_rxtshift],
 	    rack_persist_min, rack_persist_max, rack->r_ctl.timer_slop);
 	rack->r_ctl.rc_hpts_flags |= PACE_TMR_PERSIT;
 	return ((uint32_t)clamped);
 }
 
 /*
  * Select which loss-related timer should run next and return its
  * timeout.  The chosen timer type is OR'd into rc_hpts_flags
  * (PACE_TMR_RXT, PACE_TMR_RACK or PACE_TMR_TLP; in persist mode the
  * persist helper sets its own flag).  Returns 0 when no timer is to be
  * started: either all timers are stopped, or the retransmit path finds
  * nothing outstanding and nothing in the send buffer.
  *
  * cts      - current timestamp, compared against r_tim_lastsent[].
  * sup_rack - non-zero forces the plain retransmit (RXT) timer path.
  *
  * Three labels are shared between the paths below: activate_rxt (plain
  * retransmit timer), activate_tlp (tail loss probe timer) and the
  * fall-through RACK timer computation.
  */
 static uint32_t
 rack_timer_start(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int sup_rack)
 {
 	/*
 	 * Start the FR timer, we do this based on getting the first one in
 	 * the rc_tmap. Note that if its NULL we must stop the timer. in all
 	 * events we need to stop the running timer (if its running) before
 	 * starting the new one.
 	 */
 	uint32_t thresh, exp, to, srtt, time_since_sent, tstmp_touse;
 	uint32_t srtt_cur;
 	int32_t idx;
 	int32_t is_tlp_timer = 0;
 	struct rack_sendmap *rsm;
 
 	if (rack->t_timers_stopped) {
 		/* All timers have been stopped none are to run */
 		return (0);
 	}
 	if (rack->rc_in_persist) {
 		/* We can't start any timer in persists */
 		return (rack_get_persists_timer_val(tp, rack));
 	}
 	rack->rc_on_min_to = 0;
 	/* Before ESTABLISHED, or without SACK, only the RXT timer is used. */
 	if ((tp->t_state < TCPS_ESTABLISHED) ||
 	    ((tp->t_flags & TF_SACK_PERMIT) == 0)) {
 		goto activate_rxt;
 	}
 	rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
 	if ((rsm == NULL) || sup_rack) {
 		/* Nothing on the send map or no rack */
 activate_rxt:
 		time_since_sent = 0;
 		rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
 		if (rsm) {
 			/*
 			 * Should we discount the RTX timer any?
 			 *
 			 * We want to discount it the smallest amount.
 			 * If a timer (Rack/TLP or RXT) has gone off more
 			 * recently thats the discount we want to use (now - timer time).
 			 * If the retransmit of the oldest packet was more recent then
 			 * we want to use that (now - oldest-packet-last_transmit_time).
 			 *
 			 */
 			idx = rsm->r_rtr_cnt - 1;
 			if (TSTMP_GEQ(rack->r_ctl.rc_tlp_rxt_last_time, ((uint32_t)rsm->r_tim_lastsent[idx])))
 				tstmp_touse = (uint32_t)rack->r_ctl.rc_tlp_rxt_last_time;
 			else
 				tstmp_touse = (uint32_t)rsm->r_tim_lastsent[idx];
 			if (TSTMP_GT(cts, tstmp_touse))
 			    time_since_sent = cts - tstmp_touse;
 		}
 		if (SEQ_LT(tp->snd_una, tp->snd_max) || sbavail(&(tp->t_inpcb->inp_socket->so_snd))) {
 			rack->r_ctl.rc_hpts_flags |= PACE_TMR_RXT;
 			to = tp->t_rxtcur;
 			/* Discount the time already elapsed; fall back to rc_min_to if used up. */
 			if (to > time_since_sent)
 				to -= time_since_sent;
 			else
 				to = rack->r_ctl.rc_min_to;
 			if (to == 0)
 				to = 1;
 			/* Special case for KEEPINIT */
 			if ((TCPS_HAVEESTABLISHED(tp->t_state) == 0) &&
 			    (TP_KEEPINIT(tp) != 0) &&
 			    rsm) {
 				/*
 				 * We have to put a ceiling on the rxt timer
 				 * of the keep-init timeout.
 				 */
 				uint32_t max_time, red;
 
 				max_time = TICKS_2_USEC(TP_KEEPINIT(tp));
 				if (TSTMP_GT(cts, (uint32_t)rsm->r_tim_lastsent[0])) {
 					red = (cts - (uint32_t)rsm->r_tim_lastsent[0]);
 					if (red < max_time)
 						max_time -= red;
 					else
 						max_time = 1;
 				}
 				/* Reduce timeout to the keep value if needed */
 				if (max_time < to)
 					to = max_time;
 			}
 			return (to);
 		}
 		return (0);
 	}
 	if (rsm->r_flags & RACK_ACKED) {
 		rsm = rack_find_lowest_rsm(rack);
 		if (rsm == NULL) {
 			/* No lowest? */
 			goto activate_rxt;
 		}
 	}
 	if (rack->sack_attack_disable) {
 		/*
 		 * We don't want to do
 		 * any TLP's if you are an attacker.
 		 * Though if you are doing what
 		 * is expected you may still have
 		 * SACK-PASSED marks.
 		 */
 		goto activate_rxt;
 	}
 	/*
 	 * Use the RACK timer when the oldest outstanding entry has been
 	 * SACK-passed, is marked rwnd-collapsed, or has reached the
 	 * dup-ack threshold; otherwise fall to the TLP timer.
 	 */
 	if ((rsm->r_flags & RACK_SACK_PASSED) ||
 	    (rsm->r_flags & RACK_RWND_COLLAPSED) ||
 	    (rsm->r_dupack >= DUP_ACK_THRESHOLD)) {
 		if ((tp->t_flags & TF_SENTFIN) &&
 		    ((tp->snd_max - tp->snd_una) == 1) &&
 		    (rsm->r_flags & RACK_HAS_FIN)) {
 			/*
 			 * We don't start a rack timer if all we have is a
 			 * FIN outstanding.
 			 */
 			goto activate_rxt;
 		}
 		if ((rack->use_rack_rr == 0) &&
 		    (IN_FASTRECOVERY(tp->t_flags)) &&
 		    (rack->rack_no_prr == 0) &&
 		     (rack->r_ctl.rc_prr_sndcnt  < ctf_fixed_maxseg(tp))) {
 			/*
 			 * We are not cheating, in recovery  and
 			 * not enough ack's to yet get our next
 			 * retransmission out.
 			 *
 			 * Note that classified attackers do not
 			 * get to use the rack-cheat.
 			 */
 			goto activate_tlp;
 		}
 		srtt = rack_grab_rtt(tp, rack);
 		thresh = rack_calc_thresh_rack(rack, srtt, cts);
 		idx = rsm->r_rtr_cnt - 1;
 		/* Expiry is the last send time plus the RACK threshold. */
 		exp = ((uint32_t)rsm->r_tim_lastsent[idx]) + thresh;
 		if (SEQ_GEQ(exp, cts)) {
 			to = exp - cts;
 			if (to < rack->r_ctl.rc_min_to) {
 				to = rack->r_ctl.rc_min_to;
 				if (rack->r_rr_config == 3)
 					rack->rc_on_min_to = 1;
 			}
 		} else {
 			/* Already past the expiry: use the minimum timeout. */
 			to = rack->r_ctl.rc_min_to;
 			if (rack->r_rr_config == 3)
 				rack->rc_on_min_to = 1;
 		}
 	} else {
 		/* Ok we need to do a TLP not RACK */
 activate_tlp:
 		if ((rack->rc_tlp_in_progress != 0) &&
 		    (rack->r_ctl.rc_tlp_cnt_out >= rack_tlp_limit)) {
 			/*
 			 * The previous send was a TLP and we have sent
 			 * N TLP's without sending new data.
 			 */
 			goto activate_rxt;
 		}
 		/* The TLP is timed from the most recently transmitted entry. */
 		rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_tmap, rack_sendmap, r_tnext);
 		if (rsm == NULL) {
 			/* We found no rsm to TLP with. */
 			goto activate_rxt;
 		}
 		if (rsm->r_flags & RACK_HAS_FIN) {
 			/* If its a FIN we dont do TLP */
 			rsm = NULL;
 			goto activate_rxt;
 		}
 		idx = rsm->r_rtr_cnt - 1;
 		time_since_sent = 0;
 		/* Measure from the later of the last send and the last TLP/RXT timer. */
 		if (TSTMP_GEQ(((uint32_t)rsm->r_tim_lastsent[idx]), rack->r_ctl.rc_tlp_rxt_last_time))
 			tstmp_touse = (uint32_t)rsm->r_tim_lastsent[idx];
 		else
 			tstmp_touse = (uint32_t)rack->r_ctl.rc_tlp_rxt_last_time;
 		if (TSTMP_GT(cts, tstmp_touse))
 		    time_since_sent = cts - tstmp_touse;
 		is_tlp_timer = 1;
 		if (tp->t_srtt) {
 			if ((rack->rc_srtt_measure_made == 0) &&
 			    (tp->t_srtt == 1)) {
 				/*
 				 * If another stack has run and set srtt to 1,
 				 * then the srtt was 0, so lets use the initial.
 				 */
 				srtt = RACK_INITIAL_RTO;
 			} else {
 				srtt_cur = tp->t_srtt;
 				srtt = srtt_cur;
 			}
 		} else
 			srtt = RACK_INITIAL_RTO;
 		/*
 		 * If the SRTT is not keeping up and the
 		 * rack RTT has spiked we want to use
 		 * the last RTT not the smoothed one.
 		 */
 		if (rack_tlp_use_greater &&
 		    tp->t_srtt &&
 		    (srtt < rack_grab_rtt(tp, rack))) {
 			srtt = rack_grab_rtt(tp, rack);
 		}
 		thresh = rack_calc_thresh_tlp(tp, rack, rsm, srtt);
 		if (thresh > time_since_sent) {
 			to = thresh - time_since_sent;
 		} else {
 			to = rack->r_ctl.rc_min_to;
 			rack_log_alt_to_to_cancel(rack,
 						  thresh,		/* flex1 */
 						  time_since_sent,	/* flex2 */
 						  tstmp_touse,		/* flex3 */
 						  rack->r_ctl.rc_tlp_rxt_last_time, /* flex4 */
 						  (uint32_t)rsm->r_tim_lastsent[idx],
 						  srtt,
 						  idx, 99);
 		}
 		if (to < rack_tlp_min) {
 			to = rack_tlp_min;
 		}
 		if (to > TICKS_2_USEC(TCPTV_REXMTMAX)) {
 			/*
 			 * If the TLP time works out to larger than the max
 			 * RTO lets not do TLP.. just RTO.
 			 */
 			goto activate_rxt;
 		}
 	}
 	/* Record which of the two timers computed above was selected. */
 	if (is_tlp_timer == 0) {
 		rack->r_ctl.rc_hpts_flags |= PACE_TMR_RACK;
 	} else {
 		rack->r_ctl.rc_hpts_flags |= PACE_TMR_TLP;
 	}
 	if (to == 0)
 		to = 1;
 	return (to);
 }
 
 /*
  * Put the connection into persist mode.
  *
  * Does nothing when rc_in_persist is already set. Otherwise any goodput
  * measurement in progress is ended, the idle timestamp is recorded, the
  * running rack timer is cancelled, the persist bookkeeping and the
  * retransmit backoff are reset, and rc_in_persist is set.
  */
 static void
 rack_enter_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
 {
 	if (rack->rc_in_persist != 0)
 		return;
 
 	/*
 	 * End the goodput measurement now; the measurement routine is
 	 * what clears TF_GPUTINPROG.
 	 */
 	if (tp->t_flags & TF_GPUTINPROG)
 		rack_do_goodput_measurement(tp, rack, tp->snd_una, __LINE__,
 					    RACK_QUALITY_PERSIST);
 #ifdef NETFLIX_SHARED_CWND
 	if (rack->r_ctl.rc_scw) {
 		tcp_shared_cwnd_idle(rack->r_ctl.rc_scw, rack->r_ctl.rc_scw_index);
 		rack->rack_scwnd_is_idle = 1;
 	}
 #endif
 	/* Record when we went idle, never storing a timestamp of 0. */
 	rack->r_ctl.rc_went_idle_time = tcp_get_usecs(NULL);
 	if (rack->r_ctl.rc_went_idle_time == 0)
 		rack->r_ctl.rc_went_idle_time = 1;
 	rack_timer_cancel(tp, rack, cts, __LINE__);
 	/* Fresh persist episode: clear the probe accounting. */
 	rack->r_ctl.persist_lost_ends = 0;
 	rack->probe_not_answered = 0;
 	rack->forced_ack = 0;
 	/* Drop any backoff and recompute the retransmit timeout. */
 	tp->t_rxtshift = 0;
 	RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp),
 		      rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop);
 	rack->rc_in_persist = 1;
 }
 
 /*
  * Take the connection out of persist mode.
  *
  * The connection is removed from the hpts wheel if it is on it, the
  * idle period may be credited as a probe-rtt (dynamic goodput pacing
  * only), and the persist, backoff and early/late pacing state is reset.
  */
 static void
 rack_exit_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
 {
 	if (tcp_in_hpts(rack->rc_inp)) {
 		tcp_hpts_remove(rack->rc_inp);
 		rack->r_ctl.rc_hpts_flags = 0;
 	}
 #ifdef NETFLIX_SHARED_CWND
 	if (rack->r_ctl.rc_scw) {
 		tcp_shared_cwnd_active(rack->r_ctl.rc_scw, rack->r_ctl.rc_scw_index);
 		rack->rack_scwnd_is_idle = 0;
 	}
 #endif
 	if (rack->rc_gp_dyn_mul && (rack->use_fixed_rate == 0) &&
 	    rack->rc_always_pace) {
 		/*
 		 * Was the idle period long enough to be treated as if a
 		 * probe-rtt had just completed?
 		 */
 		uint32_t idle_usecs, needed_idle;
 
 		idle_usecs = tcp_get_usecs(NULL) - rack->r_ctl.rc_went_idle_time;
 		needed_idle = rack_min_probertt_hold;
 		if (rack_probertt_gpsrtt_cnt_div) {
 			/* Add gp_srtt * mul / div on top of the base hold time. */
 			uint64_t srtt_share;
 
 			srtt_share = (uint64_t)rack->r_ctl.rc_gp_srtt *
 				(uint64_t)rack_probertt_gpsrtt_cnt_mul;
 			srtt_share /= (uint64_t)rack_probertt_gpsrtt_cnt_div;
 			needed_idle += (uint32_t)srtt_share;
 		}
 		if (idle_usecs >= needed_idle) {
 			/* It qualifies as a probe-rtt. */
 			uint32_t now_us;
 
 			now_us = tcp_get_usecs(NULL);
 			if (rack->in_probe_rtt != 0) {
 				rack_exit_probertt(rack, now_us);
 			} else {
 				/* Stamp every probe-rtt time marker with "now". */
 				rack->r_ctl.rc_lower_rtt_us_cts = now_us;
 				rack->r_ctl.rc_time_probertt_entered = rack->r_ctl.rc_lower_rtt_us_cts;
 				rack->r_ctl.rc_time_probertt_starts = rack->r_ctl.rc_lower_rtt_us_cts;
 				rack->r_ctl.rc_time_of_last_probertt = rack->r_ctl.rc_lower_rtt_us_cts;
 			}
 		}
 	}
 	rack->rc_in_persist = 0;
 	rack->r_ctl.rc_went_idle_time = 0;
 	/* Drop any backoff and recompute the retransmit timeout. */
 	tp->t_rxtshift = 0;
 	RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp),
 	   rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop);
 	/* Forget any early/late pacing carry-over. */
 	rack->r_ctl.rc_agg_delayed = 0;
 	rack->r_early = 0;
 	rack->r_late = 0;
 	rack->r_ctl.rc_agg_early = 0;
 }
 
 /*
  * Emit a BBR_LOG_HPTSDIAG record carrying the contents of an hpts_diag.
  *
  * Nothing is logged unless rack_verbose_logging is set and the
  * connection's log state is not TCP_LOG_STATE_OFF.
  */
 static void
 rack_log_hpts_diag(struct tcp_rack *rack, uint32_t cts,
 		   struct hpts_diag *diag, struct timeval *tv)
 {
 	union tcp_log_stackspecific log;
 
 	if (!rack_verbose_logging ||
 	    rack->rc_tp->t_logstate == TCP_LOG_STATE_OFF)
 		return;
 
 	memset(&log.u_bbr, 0, sizeof(log.u_bbr));
 	log.u_bbr.flex1 = diag->p_nxt_slot;
 	log.u_bbr.flex2 = diag->p_cur_slot;
 	log.u_bbr.flex3 = diag->slot_req;
 	log.u_bbr.flex4 = diag->inp_hptsslot;
 	log.u_bbr.flex5 = diag->slot_remaining;
 	log.u_bbr.flex6 = diag->need_new_to;
 	log.u_bbr.flex7 = diag->p_hpts_active;
 	log.u_bbr.flex8 = diag->p_on_min_sleep;
 	/* The remaining diag values are carried in otherwise unrelated fields. */
 	log.u_bbr.epoch = diag->have_slept;
 	log.u_bbr.lt_epoch = diag->yet_to_sleep;
 	log.u_bbr.pkts_out = diag->co_ret;
 	log.u_bbr.applimited = diag->hpts_sleep_time;
 	log.u_bbr.delivered = diag->p_prev_slot;
 	log.u_bbr.inflight = diag->p_runningslot;
 	log.u_bbr.bw_inuse = diag->wheel_slot;
 	log.u_bbr.rttProp = diag->wheel_cts;
 	log.u_bbr.timeStamp = cts;
 	log.u_bbr.delRate = diag->maxslots;
 	/* p_curtick goes in the upper 32 bits, p_lasttick is OR'd in below it. */
 	log.u_bbr.cur_del_rate = diag->p_curtick;
 	log.u_bbr.cur_del_rate <<= 32;
 	log.u_bbr.cur_del_rate |= diag->p_lasttick;
 	TCP_LOG_EVENTP(rack->rc_tp, NULL,
 	    &rack->rc_inp->inp_socket->so_rcv,
 	    &rack->rc_inp->inp_socket->so_snd,
 	    BBR_LOG_HPTSDIAG, 0,
 	    0, &log, false, tv);
 }
 
 /*
  * Emit a TCP_LOG_SB_WAKE record describing a socket-buffer wakeup:
  * the sockbuf flags and state, the length and the caller-supplied type.
  *
  * Nothing is logged unless rack_verbose_logging is set and the
  * connection's log state is not TCP_LOG_STATE_OFF.
  */
 static void
 rack_log_wakeup(struct tcpcb *tp, struct tcp_rack *rack, struct sockbuf *sb, uint32_t len, int type)
 {
 	union tcp_log_stackspecific log;
 	struct timeval tv;
 
 	if (!rack_verbose_logging ||
 	    rack->rc_tp->t_logstate == TCP_LOG_STATE_OFF)
 		return;
 
 	memset(&log.u_bbr, 0, sizeof(log.u_bbr));
 	log.u_bbr.flex1 = sb->sb_flags;
 	log.u_bbr.flex2 = len;
 	log.u_bbr.flex3 = sb->sb_state;
 	log.u_bbr.flex8 = type;
 	/* The same clock read supplies both the record stamp and tv. */
 	log.u_bbr.timeStamp = tcp_get_usecs(&tv);
 	TCP_LOG_EVENTP(rack->rc_tp, NULL,
 	    &rack->rc_inp->inp_socket->so_rcv,
 	    &rack->rc_inp->inp_socket->so_snd,
 	    TCP_LOG_SB_WAKE, 0,
 	    len, &log, false, &tv);
 }
 
 /*
  * Place the connection on the hpts wheel, combining the requested pacing
  * delay with whatever protocol timer is needed.
  *
  * Returns without doing anything when the connection is in TCPS_CLOSED
  * or TCPS_LISTEN, or when it is already on the hpts wheel.
  *
  * Otherwise rc_timer_exp and rc_hpts_flags are rebuilt from scratch:
  *  - slot is adjusted by the early/late carry-over (rc_agg_early /
  *    rc_agg_delayed); a non-zero slot sets PACE_PKT_OUTPUT.
  *  - rack_timer_start() supplies the protocol timeout (hpts_timeout).
  *  - a pending delayed ack (TF_DELACK) may replace that timeout when it
  *    is shorter, and a keepalive timeout is used when there is neither a
  *    timer nor a pacing slot.
  *  - the inp_flags2 LRO queueing flags are reset and then set according
  *    to whether a pacing slot and/or a rack timer is in place.
  *  - the connection is inserted on the wheel for either slot or
  *    hpts_timeout.
  *
  * rack              - rack control block for the connection.
  * tp                - the tcpcb.
  * cts               - current timestamp; used for the expiry arithmetic
  *                     and passed to rack_timer_start().
  * slot              - requested pacing delay; treated as microseconds
  *                     (it is converted with HPTS_USEC_TO_SLOTS()).
  * tot_len_this_send - only used for logging and in the INVARIANTS panic.
  * sup_rack          - passed through unchanged to rack_timer_start().
  */
 static void
 rack_start_hpts_timer(struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts,
       int32_t slot, uint32_t tot_len_this_send, int sup_rack)
 {
 	struct hpts_diag diag;
 	struct inpcb *inp;
 	struct timeval tv;
 	uint32_t delayed_ack = 0;
 	uint32_t hpts_timeout;
 	uint32_t entry_slot = slot;	/* slot as passed in, kept for logging */
 	uint8_t stopped;
 	uint32_t left = 0;
 	uint32_t us_cts;
 
 	inp = tp->t_inpcb;
 	if ((tp->t_state == TCPS_CLOSED) ||
 	    (tp->t_state == TCPS_LISTEN)) {
 		return;
 	}
 	if (tcp_in_hpts(inp)) {
 		/* Already on the pacer */
 		return;
 	}
 	/*
 	 * If a timer was stopped before it expired, remember how much
 	 * time it had left (used for keep-alive/delayed-ack below).
 	 */
 	stopped = rack->rc_tmr_stopped;
 	if (stopped && TSTMP_GT(rack->r_ctl.rc_timer_exp, cts)) {
 		left = rack->r_ctl.rc_timer_exp - cts;
 	}
 	rack->r_ctl.rc_timer_exp = 0;
 	rack->r_ctl.rc_hpts_flags = 0;
 	us_cts = tcp_get_usecs(&tv);
 	/* Now early/late accounting */
 	rack_log_pacing_delay_calc(rack, entry_slot, slot, 0, 0, 0, 26, __LINE__, NULL, 0);
 	if (rack->r_early && (rack->rc_ack_can_sendout_data == 0)) {
 		/*
 		 * We have a early carry over set,
 		 * we can always add more time so we
 		 * can always make this compensation.
 		 *
 		 * Note if ack's are allowed to wake us do not
 		 * penalize the next timer for being awoke
 		 * by an ack aka the rc_agg_early (non-paced mode).
 		 */
 		slot += rack->r_ctl.rc_agg_early;
 		rack->r_early = 0;
 		rack->r_ctl.rc_agg_early = 0;
 	}
 	if (rack->r_late) {
 		/*
 		 * This is harder, we can
 		 * compensate some but it
 		 * really depends on what
 		 * the current pacing time is.
 		 */
 		if (rack->r_ctl.rc_agg_delayed >= slot) {
 			/*
 			 * We can't compensate for it all.
 			 * And we have to have some time
 			 * on the clock. We always have a min
 			 * 10 slots (10 x 10 i.e. 100 usecs).
 			 */
 			if (slot <= HPTS_TICKS_PER_SLOT) {
 				/* We gain delay */
 				rack->r_ctl.rc_agg_delayed += (HPTS_TICKS_PER_SLOT - slot);
 				slot = HPTS_TICKS_PER_SLOT;
 			} else {
 				/* We take off some */
 				rack->r_ctl.rc_agg_delayed -= (slot - HPTS_TICKS_PER_SLOT);
 				slot = HPTS_TICKS_PER_SLOT;
 			}
 		} else {
 			/* The whole accumulated delay fits inside this slot. */
 			slot -= rack->r_ctl.rc_agg_delayed;
 			rack->r_ctl.rc_agg_delayed = 0;
 			/* Make sure we have 100 useconds at minimum */
 			if (slot < HPTS_TICKS_PER_SLOT) {
 				rack->r_ctl.rc_agg_delayed = HPTS_TICKS_PER_SLOT - slot;
 				slot = HPTS_TICKS_PER_SLOT;
 			}
 			if (rack->r_ctl.rc_agg_delayed == 0)
 				rack->r_late = 0;
 		}
 	}
 	if (slot) {
 		/* We are pacing too */
 		rack->r_ctl.rc_hpts_flags |= PACE_PKT_OUTPUT;
 	}
 	hpts_timeout = rack_timer_start(tp, rack, cts, sup_rack);
 #ifdef NETFLIX_EXP_DETECTION
 	if (rack->sack_attack_disable &&
 	    (slot < tcp_sad_pacing_interval)) {
 		/*
 		 * We have a potential attacker on
 		 * the line. We have possibly some
 		 * (or no) pacing time set. We want to
 		 * slow down the processing of sacks by some
 		 * amount (if it is an attacker). Set the default
 		 * slot for attackers in place (unless the original
 		 * interval is longer).
 		 *
 		 * NOTE(review): an earlier version of this comment spoke
 		 * of converting micro-seconds to msecs, but no unit
 		 * conversion is performed here; the value is assigned
 		 * to slot as-is.
 		 */
 		slot = tcp_sad_pacing_interval;
 	}
 #endif
 	if (tp->t_flags & TF_DELACK) {
 		delayed_ack = TICKS_2_USEC(tcp_delacktime);
 		rack->r_ctl.rc_hpts_flags |= PACE_TMR_DELACK;
 	}
 	/*
 	 * The delayed-ack time wins only when there is no other timeout
 	 * or it is the shorter of the two; otherwise the DELACK timer
 	 * flag set just above is taken back off.
 	 */
 	if (delayed_ack && ((hpts_timeout == 0) ||
 			    (delayed_ack < hpts_timeout)))
 		hpts_timeout = delayed_ack;
 	else
 		rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_DELACK;
 	/*
 	 * If no timers are going to run and we will fall off the hptsi
 	 * wheel, we resort to a keep-alive timer if its configured.
 	 */
 	if ((hpts_timeout == 0) &&
 	    (slot == 0)) {
 		if ((V_tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) &&
 		    (tp->t_state <= TCPS_CLOSING)) {
 			/*
 			 * Ok we have no timer (persists, rack, tlp, rxt  or
 			 * del-ack), we don't have segments being paced. So
 			 * all that is left is the keepalive timer.
 			 */
 			if (TCPS_HAVEESTABLISHED(tp->t_state)) {
 				/* Get the established keep-alive time */
 				hpts_timeout = TICKS_2_USEC(TP_KEEPIDLE(tp));
 			} else {
 				/*
 				 * Get the initial setup keep-alive time,
 				 * note that this is probably not going to
 				 * happen, since rack will be running a rxt timer
 				 * if a SYN of some sort is outstanding. It is
 				 * actually handled in rack_timeout_rxt().
 				 */
 				hpts_timeout = TICKS_2_USEC(TP_KEEPINIT(tp));
 			}
 			rack->r_ctl.rc_hpts_flags |= PACE_TMR_KEEP;
 			if (rack->in_probe_rtt) {
 				/*
 				 * We want to instead not wake up a long time from
 				 * now but to wake up about the time we would
 				 * exit probe-rtt and initiate a keep-alive ack.
 				 * This will get us out of probe-rtt and update
 				 * our min-rtt.
 				 */
 				hpts_timeout = rack_min_probertt_hold;
 			}
 		}
 	}
 	if (left && (stopped & (PACE_TMR_KEEP | PACE_TMR_DELACK)) ==
 	    (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK)) {
 		/*
 		 * RACK, TLP, persists and RXT timers all are restartable
 		 * based on actions input .. i.e we received a packet (ack
 		 * or sack) and that changes things (rw, or snd_una etc).
 		 * Thus we can restart them with a new value. For
 		 * keep-alive, delayed_ack we keep track of what was left
 		 * and restart the timer with a smaller value.
 		 */
 		if (left < hpts_timeout)
 			hpts_timeout = left;
 	}
 	if (hpts_timeout) {
 		/*
 		 * Hack alert for now we can't time-out over 2,147,483
 		 * seconds (a bit more than 596 hours), which is probably ok
 		 * :).
 		 */
 		if (hpts_timeout > 0x7ffffffe)
 			hpts_timeout = 0x7ffffffe;
 		rack->r_ctl.rc_timer_exp = cts + hpts_timeout;
 	}
 	rack_log_pacing_delay_calc(rack, entry_slot, slot, hpts_timeout, 0, 0, 27, __LINE__, NULL, 0);
 	if ((rack->gp_ready == 0) &&
 	    (rack->use_fixed_rate == 0) &&
 	    (hpts_timeout < slot) &&
 	    (rack->r_ctl.rc_hpts_flags & (PACE_TMR_TLP|PACE_TMR_RXT))) {
 		/*
 		 * We have no good estimate yet for the
 		 * old clunky burst mitigation or the
 		 * real pacing. And the tlp or rxt is smaller
 		 * than the pacing calculation. Lets not
 		 * pace that long since we know the calculation
 		 * so far is not accurate.
 		 */
 		slot = hpts_timeout;
 	}
 	/**
 	 * Turn off all the flags for queuing by default. The
 	 * flags have important meanings to what happens when
 	 * LRO interacts with the transport. Most likely (by default now)
 	 * mbuf_queueing and ack compression are on. So the transport
 	 * has a couple of flags that control what happens (if those
 	 * are not on then these flags won't have any effect since it
 	 * won't go through the queuing LRO path).
 	 *
 	 * INP_MBUF_QUEUE_READY - This flags says that I am busy
 	 *                        pacing output, so don't disturb. But
 	 *                        it also means LRO can wake me if there
 	 *                        is a SACK arrival.
 	 *
 	 * INP_DONT_SACK_QUEUE - This flag is used in conjunction
 	 *                       with the above flag (QUEUE_READY) and
 	 *                       when present it says don't even wake me
 	 *                       if a SACK arrives.
 	 *
 	 * The idea behind these flags is that if we are pacing we
 	 * set the MBUF_QUEUE_READY and only get woken up if
 	 * a SACK arrives (which could change things) or if
 	 * our pacing timer expires. If, however, we have a rack
 	 * timer running, then we don't even want a sack to wake
 	 * us since the rack timer has to expire before we can send.
 	 *
 	 * Other cases should usually have none of the flags set
 	 * so LRO can call into us.
 	 */
 	inp->inp_flags2 &= ~(INP_DONT_SACK_QUEUE|INP_MBUF_QUEUE_READY);
 	if (slot) {
 		rack->r_ctl.rc_last_output_to = us_cts + slot;
 		/*
 		 * A pacing timer (slot) is being set, in
 		 * such a case we cannot send (we are blocked by
 		 * the timer). So lets tell LRO that it should not
 		 * wake us unless there is a SACK. Note this only
 		 * will be effective if mbuf queueing is on or
 		 * compressed acks are being processed.
 		 */
 		inp->inp_flags2 |= INP_MBUF_QUEUE_READY;
 		/*
 		 * But wait if we have a Rack timer running
 		 * even a SACK should not disturb us (with
 		 * the exception of r_rr_config 3).
 		 */
 		if ((rack->r_ctl.rc_hpts_flags & PACE_TMR_RACK) &&
 		    (rack->r_rr_config != 3))
 			inp->inp_flags2 |= INP_DONT_SACK_QUEUE;
 		if (rack->rc_ack_can_sendout_data) {
 			/*
 			 * Ahh but wait, this is that special case
 			 * where the pacing timer can be disturbed
 			 * backout the changes (used for non-paced
 			 * burst limiting).
 			 */
 			inp->inp_flags2 &= ~(INP_DONT_SACK_QUEUE|INP_MBUF_QUEUE_READY);
 		}
 		if ((rack->use_rack_rr) &&
 		    (rack->r_rr_config < 2) &&
 		    ((hpts_timeout) && (hpts_timeout < slot))) {
 			/*
 			 * Arrange for the hpts to kick back in after the
 			 * t-o if the t-o does not cause a send.
 			 */
 			(void)tcp_hpts_insert_diag(tp->t_inpcb, HPTS_USEC_TO_SLOTS(hpts_timeout),
 						   __LINE__, &diag);
 			rack_log_hpts_diag(rack, us_cts, &diag, &tv);
 			rack_log_to_start(rack, cts, hpts_timeout, slot, 0);
 		} else {
 			/* Wake up when the pacing slot expires. */
 			(void)tcp_hpts_insert_diag(tp->t_inpcb, HPTS_USEC_TO_SLOTS(slot),
 						   __LINE__, &diag);
 			rack_log_hpts_diag(rack, us_cts, &diag, &tv);
 			rack_log_to_start(rack, cts, hpts_timeout, slot, 1);
 		}
 	} else if (hpts_timeout) {
 		/*
 		 * With respect to inp_flags2 here, lets let any new acks wake
 		 * us up here. Since we are not pacing (no pacing timer), output
 		 * can happen so we should let it. If its a Rack timer, then any inbound
 		 * packet probably won't change the sending (we will be blocked)
 		 * but it may change the prr stats so letting it in (the set defaults
 		 * at the start of this block) are good enough.
 		 */
 		(void)tcp_hpts_insert_diag(tp->t_inpcb, HPTS_USEC_TO_SLOTS(hpts_timeout),
 					   __LINE__, &diag);
 		rack_log_hpts_diag(rack, us_cts, &diag, &tv);
 		rack_log_to_start(rack, cts, hpts_timeout, slot, 0);
 	} else {
 		/* No timer starting */
 #ifdef INVARIANTS
 		/* Having unacked data outstanding with no timer is treated as fatal. */
 		if (SEQ_GT(tp->snd_max, tp->snd_una)) {
 			panic("tp:%p rack:%p tlts:%d cts:%u slot:%u pto:%u -- no timer started?",
 			    tp, rack, tot_len_this_send, cts, slot, hpts_timeout);
 		}
 #endif
 	}
 	rack->rc_tmr_stopped = 0;
 	if (slot)
 		rack_log_type_bbrsnd(rack, tot_len_this_send, slot, us_cts, &tv);
 }
 
 /*
  * RACK timer expiry. Only logging and bookkeeping happen here; the
  * regular rack_output() path is what actually performs any RACK
  * retransmission.
  *
  * Returns 1 (do not proceed with rack_output) when the timers have been
  * stopped (TT_STOPPED), or when no segment needs resending, in which
  * case the hpts timer is restarted first. Returns 0 when a segment has
  * been queued in rc_resend for the output path.
  */
 static int
 rack_timeout_rack(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
 {
 	struct rack_sendmap *resend;
 
 	if (tp->t_timers->tt_flags & TT_STOPPED)
 		return (1);
 
 	counter_u64_add(rack_to_tot, 1);
 	if (rack->r_state && (rack->r_state != tp->t_state))
 		rack_set_state(tp, rack);
 	rack->rc_on_min_to = 0;
 	/*
 	 * The timer is just an internal trigger: the recovery check tells
 	 * us whether something needs to be retransmitted.
 	 */
 	resend = rack_check_recovery_mode(tp, cts);
 	rack_log_to_event(rack, RACK_TO_FRM_RACK, resend);
 	if (resend == NULL) {
 		/* Nothing to resend: re-arm a timer and stop here. */
 		rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_RACK;
 		rack_start_hpts_timer(rack, tp, cts,
 				      0, 0, 0);
 		return (1);
 	}
 	rack->r_ctl.rc_resend = resend;
 	rack->r_timer_override = 1;
 	if (rack->use_rack_rr) {
 		/*
 		 * The rack timer is allowed to override pacing here, so
 		 * drop the pacing flag rather than accumulate extra
 		 * pacing delay.
 		 */
 		rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT;
 	}
 	rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_RACK;
 	return (0);
 }
 
 /*
  * Re-sync rsm->orig_m_len with the current length of rsm->m.
  *
  * If the mbuf got longer only the cached length is refreshed. If it got
  * shorter, soff is also pulled back by the number of bytes that went
  * away.
  */
 static void
 rack_adjust_orig_mlen(struct rack_sendmap *rsm)
 {
 	if (rsm->m->m_len == rsm->orig_m_len) {
 		/* Nothing changed. */
 		return;
 	}
 	if (rsm->m->m_len < rsm->orig_m_len) {
 		/*
 		 * The mbuf shrank (trimmed off the top by an ack), so
 		 * our offset into it moves down by the same amount.
 		 */
 		rsm->soff -= (rsm->orig_m_len - rsm->m->m_len);
 	}
 	/*
 	 * Growth (caused by sbcompress) leaves the offset alone; in both
 	 * cases remember the new length.
 	 */
 	rsm->orig_m_len = rsm->m->m_len;
 }
 
 /*
  * Point rsm at the mbuf/offset that immediately follows the data
  * covered by src_rsm.
  *
  * The starting offset is src_rsm's offset plus its sequence length; the
  * mbuf chain is then walked forward until that offset falls inside an
  * mbuf. rsm->m, rsm->soff and rsm->orig_m_len are set from the result.
  * src_rsm->m is dereferenced unconditionally by the walk.
  */
 static void
 rack_setup_offset_for_rsm(struct rack_sendmap *src_rsm, struct rack_sendmap *rsm)
 {
 	struct mbuf *cur;
 	uint32_t off;
 
 	/* Bring src_rsm's cached length (and maybe its offset) up to date. */
 	if ((src_rsm->m != NULL) && (src_rsm->m->m_len != src_rsm->orig_m_len))
 		rack_adjust_orig_mlen(src_rsm);
 
 	off = src_rsm->soff + (src_rsm->r_end - src_rsm->r_start);
 	for (cur = src_rsm->m; off >= cur->m_len; ) {
 		/* The offset lies beyond this mbuf; step to the next one. */
 		off -= cur->m_len;
 		cur = cur->m_next;
 		KASSERT((cur != NULL),
 			("rsm:%p nrsm:%p hit at soff:%u null m",
 			 src_rsm, rsm, off));
 	}
 	rsm->m = cur;
 	rsm->soff = off;
 	rsm->orig_m_len = cur->m_len;
 }
 
 /*
  * Split rsm at sequence 'start': nrsm becomes the right-hand piece
  * [start, old rsm->r_end) and rsm is shortened to end at 'start'.
  *
  * The retransmit count, flags, dup-ack count, send times and the other
  * per-segment state are copied to nrsm. The SYN flag stays with the
  * left piece (rsm); FIN and PUSH stay with the right piece (nrsm).
  * Finally nrsm's mbuf and offset are located when rsm has an mbuf.
  */
 static __inline void
 rack_clone_rsm(struct tcp_rack *rack, struct rack_sendmap *nrsm,
 	       struct rack_sendmap *rsm, uint32_t start)
 {
 	int i;
 
 	/* Carve up the sequence range. */
 	nrsm->r_start = start;
 	nrsm->r_end = rsm->r_end;
 	rsm->r_end = nrsm->r_start;
 
 	/* Copy the per-segment state; retransmitted bytes start over. */
 	nrsm->r_rtr_cnt = rsm->r_rtr_cnt;
 	nrsm->r_flags = rsm->r_flags;
 	nrsm->r_dupack = rsm->r_dupack;
 	nrsm->r_no_rtt_allowed = rsm->r_no_rtt_allowed;
 	nrsm->r_rtr_bytes = 0;
 	nrsm->r_fas = rsm->r_fas;
 	nrsm->r_just_ret = rsm->r_just_ret;
 	/* Clone over the state of the hw_tls flag */
 	nrsm->r_hw_tls = rsm->r_hw_tls;
 	for (i = 0; i < nrsm->r_rtr_cnt; i++)
 		nrsm->r_tim_lastsent[i] = rsm->r_tim_lastsent[i];
 
 	/* A SYN belongs to the left edge, so the new piece never has it. */
 	nrsm->r_flags &= ~RACK_HAS_SYN;
 	/* FIN and PUSH belong to the right edge, so the old piece loses them. */
 	rsm->r_flags &= ~(RACK_HAS_FIN | RACK_HAD_PUSH);
 
 	/*
 	 * Work out where nrsm starts in the mbuf chain: that is rsm's
 	 * offset plus what remains in rsm, walked forward through the
 	 * chain. It may land in the same mbuf or a later one.
 	 */
 	KASSERT(((rsm->m != NULL) ||
 		 (rsm->r_flags & (RACK_HAS_SYN|RACK_HAS_FIN))),
 		("rsm:%p nrsm:%p rack:%p -- rsm->m is NULL?", rsm, nrsm, rack));
 	if (rsm->m != NULL)
 		rack_setup_offset_for_rsm(rsm, nrsm);
 }
 
 /*
  * Merge two adjacent ack'd sendmap entries into one.
  *
  * l_rsm covers the lower sequence range and r_rsm the higher one. The
  * right entry is folded into the left: the left's end is extended, the
  * larger dup-ack count is kept, the retransmitted bytes are summed and
  * the FIN/TLP/RWND_COLLAPSED/APP_LIMITED flags are carried over. r_rsm
  * is then removed from the tmap (if on it) and the RB tree, and freed.
  *
  * Returns l_rsm.
  */
 static struct rack_sendmap *
 rack_merge_rsm(struct tcp_rack *rack,
 	       struct rack_sendmap *l_rsm,
 	       struct rack_sendmap *r_rsm)
 {
 #ifdef INVARIANTS
 	struct rack_sendmap *removed;
 #endif
 
 	rack_log_map_chg(rack->rc_tp, rack, NULL,
 			 l_rsm, r_rsm, MAP_MERGE, r_rsm->r_end, __LINE__);
 	/* The left entry now runs through the right entry's end. */
 	l_rsm->r_end = r_rsm->r_end;
 	if (r_rsm->r_dupack > l_rsm->r_dupack)
 		l_rsm->r_dupack = r_rsm->r_dupack;
 	l_rsm->r_rtr_bytes += r_rsm->r_rtr_bytes;
 	if (r_rsm->r_in_tmap) {
 		/* This really should not happen */
 		TAILQ_REMOVE(&rack->r_ctl.rc_tmap, r_rsm, r_tnext);
 		r_rsm->r_in_tmap = 0;
 	}
 
 	/* Carry over the flags that must survive the merge. */
 	if (r_rsm->r_flags & RACK_HAS_FIN)
 		l_rsm->r_flags |= RACK_HAS_FIN;
 	if (r_rsm->r_flags & RACK_TLP)
 		l_rsm->r_flags |= RACK_TLP;
 	if (r_rsm->r_flags & RACK_RWND_COLLAPSED)
 		l_rsm->r_flags |= RACK_RWND_COLLAPSED;
 	if (((l_rsm->r_flags & RACK_APP_LIMITED) == 0) &&
 	    (r_rsm->r_flags & RACK_APP_LIMITED)) {
 		/*
 		 * Only the right one is app-limited: hand the marking
 		 * (and the first-app-limited pointer, if it is ours) to
 		 * the left. When both are app-limited nothing is moved
 		 * and the free below lowers the count.
 		 */
 		l_rsm->r_flags |= RACK_APP_LIMITED;
 		r_rsm->r_flags &= ~RACK_APP_LIMITED;
 		if (rack->r_ctl.rc_first_appl == r_rsm)
 			rack->r_ctl.rc_first_appl = l_rsm;
 	}
 #ifdef INVARIANTS
 	removed = RB_REMOVE(rack_rb_tree_head, &rack->r_ctl.rc_mtree, r_rsm);
 	if (removed != r_rsm) {
 		panic("removing head in rack:%p rsm:%p rm:%p",
 		      rack, r_rsm, removed);
 	}
 #else
 	(void)RB_REMOVE(rack_rb_tree_head, &rack->r_ctl.rc_mtree, r_rsm);
 #endif
 	if ((l_rsm->r_limit_type != 0) && (r_rsm->r_limit_type == 0)) {
 		/* Transfer the split limit to the map we free */
 		r_rsm->r_limit_type = l_rsm->r_limit_type;
 		l_rsm->r_limit_type = 0;
 	}
 	rack_free(rack, r_rsm);
 	return (l_rsm);
 }
 
 /*
  * TLP Timer, here we simply setup what segment we want to
  * have the TLP expire on, the normal rack_output() will then
  * send it out.
  *
  * Either new data is selected (rc_tlp_new_data is set and rc_tlpsend
  * is cleared) or an existing sendmap entry is chosen for retransmission
  * and stored in rc_tlpsend. An entry longer than one segment is split
  * so that only its last segment's worth is re-sent.
  *
  * tp        - the tcpcb.
  * rack      - rack control block for the connection.
  * cts       - current timestamp, compared against rc_timer_exp.
  * doing_tlp - set to 1 when a TLP send has been arranged; left
  *             untouched on every other path.
  *
  * Return values:
  *   1           - all timers have been stopped (destroyed PCB?), don't
  *                 proceed with rack_output.
  *   -ETIMEDOUT  - the progress timeout check failed; the connection is
  *                 to be dropped.
  *   0           - otherwise (timer not yet due, TLP arranged, or
  *                 nothing could be arranged).
  */
 static int
 rack_timeout_tlp(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, uint8_t *doing_tlp)
 {
 	/*
 	 * Tail Loss Probe.
 	 */
 	struct rack_sendmap *rsm = NULL;
 #ifdef INVARIANTS
 	struct rack_sendmap *insret;
 #endif
 	struct socket *so;
 	uint32_t amm;
 	uint32_t out, avail;
 	int collapsed_win = 0;
 
 	if (tp->t_timers->tt_flags & TT_STOPPED) {
 		return (1);
 	}
 	if (TSTMP_LT(cts, rack->r_ctl.rc_timer_exp)) {
 		/* Its not time yet */
 		return (0);
 	}
 	if (ctf_progress_timeout_check(tp, true)) {
 		rack_log_progress_event(rack, tp, tick, PROGRESS_DROP, __LINE__);
 		return (-ETIMEDOUT);	/* tcp_drop() */
 	}
 	/*
 	 * A TLP timer has expired. We have been idle for 2 rtts. So we now
 	 * need to figure out how to force a full MSS segment out.
 	 */
 	rack_log_to_event(rack, RACK_TO_FRM_TLP, NULL);
 	rack->r_ctl.retran_during_recovery = 0;
 	rack->r_ctl.dsack_byte_cnt = 0;
 	counter_u64_add(rack_tlp_tot, 1);
 	if (rack->r_state && (rack->r_state != tp->t_state))
 		rack_set_state(tp, rack);
 	so = tp->t_inpcb->inp_socket;
 	/* avail: bytes in the send buffer; out: bytes sent but not yet acked. */
 	avail = sbavail(&so->so_snd);
 	out = tp->snd_max - tp->snd_una;
 	if ((out > tp->snd_wnd) || rack->rc_has_collapsed) {
 		/* special case, we need a retransmission */
 		collapsed_win = 1;
 		goto need_retran;
 	}
 	/* Age the dsack persistence once at least one TLP is already out. */
 	if (rack->r_ctl.dsack_persist && (rack->r_ctl.rc_tlp_cnt_out >= 1)) {
 		rack->r_ctl.dsack_persist--;
 		if (rack->r_ctl.num_dsack && (rack->r_ctl.dsack_persist == 0)) {
 			rack->r_ctl.num_dsack = 0;
 		}
 		rack_log_dsack_event(rack, 1, __LINE__, 0, 0);
 	}
 	if ((tp->t_flags & TF_GPUTINPROG) &&
 	    (rack->r_ctl.rc_tlp_cnt_out == 1)) {
 		/*
 		 * If this is the second in a row
 		 * TLP and we are doing a measurement
 		 * its time to abandon the measurement.
 		 * Something is likely broken on
 		 * the clients network and measuring a
 		 * broken network does us no good.
 		 */
 		tp->t_flags &= ~TF_GPUTINPROG;
 		rack_log_pacing_delay_calc(rack, (tp->gput_ack - tp->gput_seq) /*flex2*/,
 					   rack->r_ctl.rc_gp_srtt /*flex1*/,
 					   tp->gput_seq,
 					   0, 0, 18, __LINE__, NULL, 0);
 	}
 	/*
 	 * Check our send oldest always settings, and if
 	 * there is an oldest to send jump to the need_retran.
 	 */
 	if (rack_always_send_oldest && (TAILQ_EMPTY(&rack->r_ctl.rc_tmap) == 0))
 		goto need_retran;
 
 	if (avail > out) {
 		/* New data is available */
 		amm = avail - out;
 		if (amm > ctf_fixed_maxseg(tp)) {
 			/* Cap the probe at one segment. */
 			amm = ctf_fixed_maxseg(tp);
 			if ((amm + out) > tp->snd_wnd) {
 				/* We are rwnd limited */
 				goto need_retran;
 			}
 		} else if (amm < ctf_fixed_maxseg(tp)) {
 			/* not enough to fill a MTU */
 			goto need_retran;
 		}
 		if (IN_FASTRECOVERY(tp->t_flags)) {
 			/* Unlikely */
 			if (rack->rack_no_prr == 0) {
 				if (out + amm <= tp->snd_wnd) {
 					rack->r_ctl.rc_prr_sndcnt = amm;
 					rack->r_ctl.rc_tlp_new_data = amm;
 					rack_log_to_prr(rack, 4, 0, __LINE__);
 				}
 			} else
 				goto need_retran;
 		} else {
 			/* Set the send-new override */
 			if (out + amm <= tp->snd_wnd)
 				rack->r_ctl.rc_tlp_new_data = amm;
 			else
 				goto need_retran;
 		}
 		/* New data will be probed with; no specific rsm to re-send. */
 		rack->r_ctl.rc_tlpsend = NULL;
 		counter_u64_add(rack_tlp_newdata, 1);
 		goto send;
 	}
 need_retran:
 	/*
 	 * Ok we need to arrange the last un-acked segment to be re-sent, or
 	 * optionally the first un-acked segment.
 	 */
 	if (collapsed_win == 0) {
 		if (rack_always_send_oldest)
 			rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
 		else {
 			/*
 			 * Start from the highest entry in the map; if it is
 			 * already acked or carries the FIN, look for a
 			 * different one via rack_find_high_nonack().
 			 */
 			rsm = RB_MAX(rack_rb_tree_head, &rack->r_ctl.rc_mtree);
 			if (rsm && (rsm->r_flags & (RACK_ACKED | RACK_HAS_FIN))) {
 				rsm = rack_find_high_nonack(rack, rsm);
 			}
 		}
 		if (rsm == NULL) {
 #ifdef TCP_BLACKBOX
 			tcp_log_dump_tp_logbuf(tp, "nada counter trips", M_NOWAIT, true);
 #endif
 			goto out;
 		}
 	} else {
 		/*
 		 * We must find the last segment
 		 * that was acceptable by the client.
 		 */
 		RB_FOREACH_REVERSE(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree) {
 			if ((rsm->r_flags & RACK_RWND_COLLAPSED) == 0) {
 				/* Found one */
 				break;
 			}
 		}
 		if (rsm == NULL) {
 			/* None? if so send the first */
 			rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree);
 			if (rsm == NULL) {
 #ifdef TCP_BLACKBOX
 				tcp_log_dump_tp_logbuf(tp, "nada counter trips", M_NOWAIT, true);
 #endif
 				goto out;
 			}
 		}
 	}
 	if ((rsm->r_end - rsm->r_start) > ctf_fixed_maxseg(tp)) {
 		/*
 		 * We need to split this the last segment in two.
 		 */
 		struct rack_sendmap *nrsm;
 
 		nrsm = rack_alloc_full_limit(rack);
 		if (nrsm == NULL) {
 			/*
 			 * No memory to split, we will just exit and punt
 			 * off to the RXT timer.
 			 */
 			goto out;
 		}
 		/* nrsm becomes the final maxseg bytes of rsm. */
 		rack_clone_rsm(rack, nrsm, rsm,
 			       (rsm->r_end - ctf_fixed_maxseg(tp)));
 		rack_log_map_chg(tp, rack, NULL, rsm, nrsm, MAP_SPLIT, 0, __LINE__);
 #ifndef INVARIANTS
 		(void)RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm);
 #else
 		insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm);
 		if (insret != NULL) {
 			panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p",
 			      nrsm, insret, rack, rsm);
 		}
 #endif
 		/* Keep the new piece next to the old one on the tmap. */
 		if (rsm->r_in_tmap) {
 			TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
 			nrsm->r_in_tmap = 1;
 		}
 		rsm = nrsm;
 	}
 	rack->r_ctl.rc_tlpsend = rsm;
 send:
 	/* Make sure output path knows we are doing a TLP */
 	*doing_tlp = 1;
 	rack->r_timer_override = 1;
 	rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_TLP;
 	return (0);
 out:
 	/* Nothing could be arranged; just clear the TLP timer flag. */
 	rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_TLP;
 	return (0);
 }
 
 /*
  * Delayed-ack timer expiry.
  *
  * TF_DELACK is swapped for TF_ACKNOW so that the output routine sends
  * the ack, and the delayed-ack timer flag is cleared.
  *
  * Returns 1 (don't proceed) only when all timers are stopped
  * (destroyed PCB?), otherwise 0.
  */
 static int
 rack_timeout_delack(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
 {
 	if (tp->t_timers->tt_flags & TT_STOPPED)
 		return (1);
 
 	rack_log_to_event(rack, RACK_TO_FRM_DELACK, NULL);
 	/* The ack is no longer delayed: ask for it to go out now. */
 	tp->t_flags = (tp->t_flags & ~TF_DELACK) | TF_ACKNOW;
 	KMOD_TCPSTAT_INC(tcps_delack);
 	rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_DELACK;
 	return (0);
 }
 
 /*
  * Persist timer expiry. A window probe is sent with tcp_respond(), the
  * same way a keepalive probe is.
  *
  * Return values:
  *   1           - all timers are stopped (destroyed PCB?), or the probe
  *                 path ran and the hpts timer has been restarted.
  *   0           - the connection is not in persist mode.
  *   -ETIMEDOUT  - the progress check failed or the peer has been
  *                 unresponsive for too long; the connection is to be
  *                 dropped.
  */
 static int
 rack_timeout_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
 {
 #ifdef INVARIANTS
 	struct inpcb *inp = tp->t_inpcb;
 #endif
 	struct tcptemp *t_template;
 	int32_t retval = 1;
 
 	if (tp->t_timers->tt_flags & TT_STOPPED)
 		return (1);
 	if (rack->rc_in_persist == 0)
 		return (0);
 	if (ctf_progress_timeout_check(tp, false)) {
 		tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX);
 		rack_log_progress_event(rack, tp, tick, PROGRESS_DROP, __LINE__);
 		counter_u64_add(rack_persists_lost_ends, rack->r_ctl.persist_lost_ends);
 		return (-ETIMEDOUT);	/* tcp_drop() */
 	}
 	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
 	/*
 	 * The persist timer fired into a zero window: force a byte out
 	 * if that is possible.
 	 */
 	KMOD_TCPSTAT_INC(tcps_persisttimeo);
 	/*
 	 * Hack: with the window closed a dead/unreachable peer would
 	 * never make us time out. So once the backoff is maxed out, drop
 	 * the connection when the time without a response reaches the
 	 * maximum backoff a retransmit would have used.
 	 */
 	if (tp->t_rxtshift == TCP_MAXRXTSHIFT &&
 	    (ticks - tp->t_rcvtime >= tcp_maxpersistidle ||
 	     TICKS_2_USEC(ticks - tp->t_rcvtime) >= RACK_REXMTVAL(tp) * tcp_totbackoff)) {
 		KMOD_TCPSTAT_INC(tcps_persistdrop);
 		tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX);
 		counter_u64_add(rack_persists_lost_ends, rack->r_ctl.persist_lost_ends);
 		retval = -ETIMEDOUT;	/* tcp_drop() */
 		goto out;
 	}
 	/* Nothing queued and nothing outstanding: persist mode is over. */
 	if ((sbavail(&rack->rc_inp->inp_socket->so_snd) == 0) &&
 	    tp->snd_una == tp->snd_max)
 		rack_exit_persist(tp, rack, cts);
 	rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_PERSIT;
 	/*
 	 * Once the user has closed the socket a persisting connection
 	 * is dropped after a much shorter timeout.
 	 */
 	if (tp->t_state > TCPS_CLOSE_WAIT &&
 	    (ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) {
 		KMOD_TCPSTAT_INC(tcps_persistdrop);
 		tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX);
 		counter_u64_add(rack_persists_lost_ends, rack->r_ctl.persist_lost_ends);
 		retval = -ETIMEDOUT;	/* tcp_drop() */
 		goto out;
 	}
 	t_template = tcpip_maketemplate(rack->rc_inp);
 	if (t_template != NULL) {
 		if (rack->forced_ack != 0) {
 			/* The previous probe is still unanswered. */
 			rack->probe_not_answered = 1;
 			counter_u64_add(rack_persists_loss, 1);
 			rack->r_ctl.persist_lost_ends++;
 		} else {
 			/* only set it if we were answered */
 			rack->forced_ack = 1;
 			rack->r_ctl.forced_ack_ts = tcp_get_usecs(NULL);
 		}
 		counter_u64_add(rack_persists_sends, 1);
 		tcp_respond(tp, t_template->tt_ipgen,
 			    &t_template->tt_t, (struct mbuf *)NULL,
 			    tp->rcv_nxt, tp->snd_una - 1, 0);
 		/* The probe carries an ack, so no delayed ack remains pending. */
 		tp->t_flags &= ~TF_DELACK;
 		free(t_template, M_TEMP);
 	}
 	if (tp->t_rxtshift < TCP_MAXRXTSHIFT)
 		tp->t_rxtshift++;
 out:
 	rack_log_to_event(rack, RACK_TO_FRM_PERSIST, NULL);
 	rack_start_hpts_timer(rack, tp, cts,
 			      0, 0, 0);
 	return (retval);
 }
 
 /*
  * If a keepalive goes off, we had no other timers
  * happening. We always return 1 here since this
  * routine either drops the connection or sends
  * out a segment with respond.
  */
 static int
 rack_timeout_keepalive(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
 {
 	struct tcptemp *t_template;
 	struct inpcb *inp;
 
 	if (tp->t_timers->tt_flags & TT_STOPPED) {
 		return (1);
 	}
 	rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_KEEP;
 	inp = tp->t_inpcb;
 	rack_log_to_event(rack, RACK_TO_FRM_KEEP, NULL);
 	/*
 	 * Keep-alive timer went off; send something or drop connection if
 	 * idle for too long.
 	 */
 	KMOD_TCPSTAT_INC(tcps_keeptimeo);
 	if (tp->t_state < TCPS_ESTABLISHED)
 		goto dropit;
 	if ((V_tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) &&
 	    tp->t_state <= TCPS_CLOSING) {
 		if (ticks - tp->t_rcvtime >= TP_KEEPIDLE(tp) + TP_MAXIDLE(tp))
 			goto dropit;
 		/*
 		 * Send a packet designed to force a response if the peer is
 		 * up and reachable: either an ACK if the connection is
 		 * still alive, or an RST if the peer has closed the
 		 * connection due to timeout or reboot. Using sequence
 		 * number tp->snd_una-1 causes the transmitted zero-length
 		 * segment to lie outside the receive window; by the
 		 * protocol spec, this requires the correspondent TCP to
 		 * respond.
 		 */
 		KMOD_TCPSTAT_INC(tcps_keepprobe);
 		t_template = tcpip_maketemplate(inp);
 		if (t_template) {
 			if (rack->forced_ack == 0) {
 				rack->forced_ack = 1;
 				rack->r_ctl.forced_ack_ts = tcp_get_usecs(NULL);
 			} else {
 				rack->probe_not_answered = 1;
 			}
 			tcp_respond(tp, t_template->tt_ipgen,
 			    &t_template->tt_t, (struct mbuf *)NULL,
 			    tp->rcv_nxt, tp->snd_una - 1, 0);
 			free(t_template, M_TEMP);
 		}
 	}
 	rack_start_hpts_timer(rack, tp, cts, 0, 0, 0);
 	return (1);
 dropit:
 	KMOD_TCPSTAT_INC(tcps_keepdrops);
 	tcp_log_end_status(tp, TCP_EI_STATUS_KEEP_MAX);
 	return (-ETIMEDOUT);	/* tcp_drop() */
 }
 
 /*
  * Retransmit-timeout bookkeeping: cancel the running timer, un-ack
  * every entry of the send map, rebuild the transmit map in sequence
  * order and reset the per-connection recovery counters.
  */
 static void
 rack_remxt_tmr(struct tcpcb *tp)
 {
 	struct tcp_rack *rack = (struct tcp_rack *)tp->t_fb_ptr;
 	struct rack_sendmap *cur;
 	int no_sack;
 
 	rack_timer_cancel(tp, rack, tcp_get_usecs(NULL), __LINE__);
 	rack_log_to_event(rack, RACK_TO_FRM_TMR, NULL);
 	if (rack->r_state && (rack->r_state != tp->t_state))
 		rack_set_state(tp, rack);
 	/*
 	 * The retransmit timer fired, so every sack'd block becomes
 	 * un-acked again.  We do not mark the entries SACK-PASS here:
 	 * that would burst everything out 1ms apart, so the normal rxt
 	 * and tlp timers are left to deal with it.
 	 *
 	 * The transmit map is emptied and refilled by walking the RB
 	 * tree, which puts every rsm back in sequence order.  Sends then
 	 * happen in order and late sacks simply "re-ack" the data.
 	 */
 	TAILQ_INIT(&rack->r_ctl.rc_tmap);
 	RB_FOREACH(cur, rack_rb_tree_head, &rack->r_ctl.rc_mtree) {
 		cur->r_dupack = 0;
 		rack_log_retran_reason(rack, cur, __LINE__, 0, 2);
 		/* Appending in tree-walk order keeps the tmap seq sorted. */
 		TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, cur, r_tnext);
 		cur->r_in_tmap = 1;
 		/* Remember that it had been acked before wiping the flag. */
 		if (cur->r_flags & RACK_ACKED)
 			cur->r_flags |= RACK_WAS_ACKED;
 		cur->r_flags &= ~(RACK_ACKED | RACK_SACK_PASSED | RACK_WAS_SACKPASS | RACK_RWND_COLLAPSED);
 		cur->r_flags |= RACK_MUST_RXT;
 	}
 	/* Nothing is sacked any more; reset the related accounting. */
 	rack->r_ctl.rc_last_timeout_snduna = tp->snd_una;
 	rack->r_ctl.rc_sacked = 0;
 	rack->r_ctl.rc_sacklast = NULL;
 	rack->r_ctl.rc_agg_delayed = 0;
 	rack->r_early = 0;
 	rack->r_ctl.rc_agg_early = 0;
 	rack->r_late = 0;
 	/* The lowest-sequence entry, if any, is the one to resend. */
 	rack->r_ctl.rc_resend = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree);
 	if (rack->r_ctl.rc_resend != NULL)
 		rack->r_ctl.rc_resend->r_flags |= RACK_TO_REXT;
 	rack->r_ctl.rc_prr_sndcnt = 0;
 	rack_log_to_prr(rack, 6, 0, __LINE__);
 	rack->r_timer_override = 1;
 
 	no_sack = ((tp->t_flags & TF_SACK_PERMIT) == 0);
 #ifdef NETFLIX_EXP_DETECTION
 	if (rack->sack_attack_disable != 0)
 		no_sack = 1;
 #endif
 	if (no_sack && ((tp->t_flags & TF_SENTFIN) == 0)) {
 		/*
 		 * Without sack, new data has to go out as retransmits
 		 * until we have retransmitted up to snd_max.
 		 */
 		rack->r_must_retran = 1;
 		rack->r_ctl.rc_out_at_rto = ctf_flight_size(rack->rc_tp,
 		    rack->r_ctl.rc_sacked);
 	}
 	rack->r_ctl.rc_snd_max_at_rto = tp->snd_max;
 }
 
 static void
 rack_convert_rtts(struct tcpcb *tp)
 {
 	if (tp->t_srtt > 1) {
 		uint32_t val, frac;
 
 		val = tp->t_srtt >> TCP_RTT_SHIFT;
 		frac = tp->t_srtt & 0x1f;
 		tp->t_srtt = TICKS_2_USEC(val);
 		/*
 		 * frac is the fractional part of the srtt (if any)
 		 * but its in ticks and every bit represents
 		 * 1/32nd of a hz.
 		 */
 		if (frac) {
 			if (hz == 1000) {
 				frac = (((uint64_t)frac * (uint64_t)HPTS_USEC_IN_MSEC) / (uint64_t)TCP_RTT_SCALE);
 			} else {
 				frac = (((uint64_t)frac * (uint64_t)HPTS_USEC_IN_SEC) / ((uint64_t)(hz) * (uint64_t)TCP_RTT_SCALE));
 			}
 			tp->t_srtt += frac;
 		}
 	}
 	if (tp->t_rttvar) {
 		uint32_t val, frac;
 
 		val = tp->t_rttvar >> TCP_RTTVAR_SHIFT;
 		frac = tp->t_rttvar & 0x1f;
 		tp->t_rttvar = TICKS_2_USEC(val);
 		/*
 		 * frac is the fractional part of the srtt (if any)
 		 * but its in ticks and every bit represents
 		 * 1/32nd of a hz.
 		 */
 		if (frac) {
 			if (hz == 1000) {
 				frac = (((uint64_t)frac * (uint64_t)HPTS_USEC_IN_MSEC) / (uint64_t)TCP_RTT_SCALE);
 			} else {
 				frac = (((uint64_t)frac * (uint64_t)HPTS_USEC_IN_SEC) / ((uint64_t)(hz) * (uint64_t)TCP_RTT_SCALE));
 			}
 			tp->t_rttvar += frac;
 		}
 	}
 	tp->t_rxtcur = RACK_REXMTVAL(tp);
 	if (TCPS_HAVEESTABLISHED(tp->t_state)) {
 		tp->t_rxtcur += TICKS_2_USEC(tcp_rexmit_slop);
 	}
 	if (tp->t_rxtcur > rack_rto_max) {
 		tp->t_rxtcur = rack_rto_max;
 	}
 }
 
 /*
  * Wrapper around cc_conn_init() that afterwards adjusts the rtt
  * values, ssthresh and cwnd for rack.
  */
 static void
 rack_cc_conn_init(struct tcpcb *tp)
 {
 	struct tcp_rack *rack = (struct tcp_rack *)tp->t_fb_ptr;
 	uint32_t srtt_before = tp->t_srtt;
 
 	cc_conn_init(tp);
 	/*
 	 * If cc_conn_init() installed an srtt where there was none
 	 * before, convert it (and rttvar) to rack's internal format.
 	 */
 	if ((srtt_before == 0) && (tp->t_srtt != 0))
 		rack_convert_rtts(tp);
 	/*
 	 * Give a new connection a chance to stay in slow start.  The TCP
 	 * spec says ssthresh starts out infinite; for our purposes that
 	 * means no lower than snd_wnd.
 	 */
 	if (tp->snd_wnd > tp->snd_ssthresh)
 		tp->snd_ssthresh = tp->snd_wnd;
 	/* Cap cwnd at rack's initial window. */
 	if (tp->snd_cwnd > rc_init_window(rack))
 		tp->snd_cwnd = rc_init_window(rack);
 }
 
 /*
  * Re-transmit timeout.
  *
  * Return values:
  *   1        - the timers are stopped (TT_STOPPED); nothing was done.
  *   0        - either nothing was outstanding, or the connection has
  *              been set up to retransmit the lowest outstanding
  *              sequence number.
  *   negative - a negated errno (-ETIMEDOUT or -tp->t_softerror)
  *              when the connection should be dropped.
  */
 static int
 rack_timeout_rxt(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
 {
 	int32_t rexmt;
 	int32_t retval = 0;
 	bool isipv6;
 
 	if (tp->t_timers->tt_flags & TT_STOPPED) {
 		return (1);
 	}
 	if ((tp->t_flags & TF_GPUTINPROG) &&
 	    (tp->t_rxtshift)) {
 		/*
 		 * We have had a second timeout
 		 * measurements on successive rxt's are not profitable.
 		 * It is unlikely to be of any use (the network is
 		 * broken or the client went away).
 		 */
 		tp->t_flags &= ~TF_GPUTINPROG;
 		rack_log_pacing_delay_calc(rack, (tp->gput_ack - tp->gput_seq) /*flex2*/,
 					   rack->r_ctl.rc_gp_srtt /*flex1*/,
 					   tp->gput_seq,
 					   0, 0, 18, __LINE__, NULL, 0);
 	}
 	/* A progress timeout ends the connection right here. */
 	if (ctf_progress_timeout_check(tp, false)) {
 		tcp_log_end_status(tp, TCP_EI_STATUS_RETRAN);
 		rack_log_progress_event(rack, tp, tick, PROGRESS_DROP, __LINE__);
 		return (-ETIMEDOUT);	/* tcp_drop() */
 	}
 	rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_RXT;
 	rack->r_ctl.retran_during_recovery = 0;
 	rack->rc_ack_required = 1;
 	rack->r_ctl.dsack_byte_cnt = 0;
 	/* Record in the TF_WAS* flags whether we were in recovery. */
 	if (IN_FASTRECOVERY(tp->t_flags))
 		tp->t_flags |= TF_WASFRECOVERY;
 	else
 		tp->t_flags &= ~TF_WASFRECOVERY;
 	if (IN_CONGRECOVERY(tp->t_flags))
 		tp->t_flags |= TF_WASCRECOVERY;
 	else
 		tp->t_flags &= ~TF_WASCRECOVERY;
 	if (TCPS_HAVEESTABLISHED(tp->t_state) &&
 	    (tp->snd_una == tp->snd_max)) {
 		/* Nothing outstanding .. nothing to do */
 		return (0);
 	}
 	/* Age the dsack state; forget the dsack count once it runs out. */
 	if (rack->r_ctl.dsack_persist) {
 		rack->r_ctl.dsack_persist--;
 		if (rack->r_ctl.num_dsack && (rack->r_ctl.dsack_persist == 0)) {
 			rack->r_ctl.num_dsack = 0;
 		}
 		rack_log_dsack_event(rack, 1, __LINE__, 0, 0);
 	}
 	/*
 	 * Rack can only run one timer  at a time, so we cannot
 	 * run a KEEPINIT (gating SYN sending) and a retransmit
 	 * timer for the SYN. So if we are in a front state and
 	 * have a KEEPINIT timer we need to check the first transmit
 	 * against now to see if we have exceeded the KEEPINIT time
 	 * (if one is set).
 	 */
 	if ((TCPS_HAVEESTABLISHED(tp->t_state) == 0) &&
 	    (TP_KEEPINIT(tp) != 0)) {
 		struct rack_sendmap *rsm;
 
 		rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree);
 		if (rsm) {
 			/* Ok we have something outstanding to test keepinit with */
 			if ((TSTMP_GT(cts, (uint32_t)rsm->r_tim_lastsent[0])) &&
 			    ((cts - (uint32_t)rsm->r_tim_lastsent[0]) >= TICKS_2_USEC(TP_KEEPINIT(tp)))) {
 				/* We have exceeded the KEEPINIT time */
 				tcp_log_end_status(tp, TCP_EI_STATUS_KEEP_MAX);
 				goto drop_it;
 			}
 		}
 	}
 	/*
 	 * Retransmission timer went off.  Message has not been acked within
 	 * retransmit interval.  Back off to a longer retransmit interval
 	 * and retransmit one segment.
 	 */
 	rack_remxt_tmr(tp);
 	if ((rack->r_ctl.rc_resend == NULL) ||
 	    ((rack->r_ctl.rc_resend->r_flags & RACK_RWND_COLLAPSED) == 0)) {
 		/*
 		 * If the rwnd collapsed on
 		 * the one we are retransmitting
 		 * it does not count against the
 		 * rxt count.
 		 */
 		tp->t_rxtshift++;
 	}
 	if (tp->t_rxtshift > TCP_MAXRXTSHIFT) {
 		tcp_log_end_status(tp, TCP_EI_STATUS_RETRAN);
 		/*
 		 * drop_it is also the target of the KEEPINIT check above,
 		 * which logs its own end status before jumping here.
 		 */
 drop_it:
 		tp->t_rxtshift = TCP_MAXRXTSHIFT;
 		KMOD_TCPSTAT_INC(tcps_timeoutdrop);
 		/* XXXGL: previously t_softerror was casted to uint16_t */
 		MPASS(tp->t_softerror >= 0);
 		/* Report the soft error if one is recorded, else ETIMEDOUT. */
 		retval = tp->t_softerror ? -tp->t_softerror : -ETIMEDOUT;
 		goto out;	/* tcp_drop() */
 	}
 	if (tp->t_state == TCPS_SYN_SENT) {
 		/*
 		 * If the SYN was retransmitted, indicate CWND to be limited
 		 * to 1 segment in cc_conn_init().
 		 */
 		tp->snd_cwnd = 1;
 	} else if (tp->t_rxtshift == 1) {
 		/*
 		 * first retransmit; record ssthresh and cwnd so they can be
 		 * recovered if this turns out to be a "bad" retransmit. A
 		 * retransmit is considered "bad" if an ACK for this segment
 		 * is received within RTT/2 interval; the assumption here is
 		 * that the ACK was already in flight.  See "On Estimating
 		 * End-to-End Network Path Properties" by Allman and Paxson
 		 * for more details.
 		 */
 		tp->snd_cwnd_prev = tp->snd_cwnd;
 		tp->snd_ssthresh_prev = tp->snd_ssthresh;
 		tp->snd_recover_prev = tp->snd_recover;
 		tp->t_badrxtwin = ticks + (USEC_2_TICKS(tp->t_srtt)/2);
 		tp->t_flags |= TF_PREVVALID;
 	} else if ((tp->t_flags & TF_RCVD_TSTMP) == 0)
 		tp->t_flags &= ~TF_PREVVALID;
 	KMOD_TCPSTAT_INC(tcps_rexmttimeo);
 	/*
 	 * Compute the backed-off timeout: SYN states start from
 	 * RACK_INITIAL_RTO, all others from srtt + 4 * rttvar (but no
 	 * less than rack_rto_min), scaled by the backoff table entry
 	 * for the current shift.
 	 */
 	if ((tp->t_state == TCPS_SYN_SENT) ||
 	    (tp->t_state == TCPS_SYN_RECEIVED))
 		rexmt = RACK_INITIAL_RTO * tcp_backoff[tp->t_rxtshift];
 	else
 		rexmt = max(rack_rto_min, (tp->t_srtt + (tp->t_rttvar << 2))) * tcp_backoff[tp->t_rxtshift];
 
 	RACK_TCPT_RANGESET(tp->t_rxtcur, rexmt,
 	   max(rack_rto_min, rexmt), rack_rto_max, rack->r_ctl.timer_slop);
 	/*
 	 * We enter the path for PLMTUD if connection is established or, if
 	 * connection is FIN_WAIT_1 status, reason for the last is that if
 	 * amount of data we send is very small, we could send it in couple
 	 * of packets and process straight to FIN. In that case we won't
 	 * catch ESTABLISHED state.
 	 */
 #ifdef INET6
 	isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) ? true : false;
 #else
 	isipv6 = false;
 #endif
 	/*
 	 * The checks below treat V_tcp_pmtud_blackhole_detect as:
 	 * 1 = any address family, 2 = non-IPv6 only, 3 = IPv6 only.
 	 */
 	if (((V_tcp_pmtud_blackhole_detect == 1) ||
 	    (V_tcp_pmtud_blackhole_detect == 2 && !isipv6) ||
 	    (V_tcp_pmtud_blackhole_detect == 3 && isipv6)) &&
 	    ((tp->t_state == TCPS_ESTABLISHED) ||
 	    (tp->t_state == TCPS_FIN_WAIT_1))) {
 		/*
 		 * Idea here is that at each stage of mtu probe (usually,
 		 * 1448 -> 1188 -> 524) should be given 2 chances to recover
 		 * before further clamping down. 'tp->t_rxtshift % 2 == 0'
 		 * should take care of that.
 		 */
 		if (((tp->t_flags2 & (TF2_PLPMTU_PMTUD | TF2_PLPMTU_MAXSEGSNT)) ==
 		    (TF2_PLPMTU_PMTUD | TF2_PLPMTU_MAXSEGSNT)) &&
 		    (tp->t_rxtshift >= 2 && tp->t_rxtshift < 6 &&
 		    tp->t_rxtshift % 2 == 0)) {
 			/*
 			 * Enter Path MTU Black-hole Detection mechanism: -
 			 * Disable Path MTU Discovery (IP "DF" bit). -
 			 * Reduce MTU to lower value than what we negotiated
 			 * with peer.
 			 */
 			if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) == 0) {
 				/* Record that we may have found a black hole. */
 				tp->t_flags2 |= TF2_PLPMTU_BLACKHOLE;
 				/* Keep track of previous MSS. */
 				tp->t_pmtud_saved_maxseg = tp->t_maxseg;
 			}
 
 			/*
 			 * Reduce the MSS to blackhole value or to the
 			 * default in an attempt to retransmit.
 			 */
 #ifdef INET6
 			if (isipv6 &&
 			    tp->t_maxseg > V_tcp_v6pmtud_blackhole_mss) {
 				/* Use the sysctl tuneable blackhole MSS. */
 				tp->t_maxseg = V_tcp_v6pmtud_blackhole_mss;
 				KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_activated);
 			} else if (isipv6) {
 				/* Use the default MSS. */
 				tp->t_maxseg = V_tcp_v6mssdflt;
 				/*
 				 * Disable Path MTU Discovery when we switch
 				 * to minmss.
 				 */
 				tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
 				KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss);
 			}
 #endif
 			/*
 			 * When both INET6 and INET are compiled in, the
 			 * 'else' below chains the IPv4 handling onto the
 			 * IPv6 if/else-if above.
 			 */
 #if defined(INET6) && defined(INET)
 			else
 #endif
 #ifdef INET
 			if (tp->t_maxseg > V_tcp_pmtud_blackhole_mss) {
 				/* Use the sysctl tuneable blackhole MSS. */
 				tp->t_maxseg = V_tcp_pmtud_blackhole_mss;
 				KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_activated);
 			} else {
 				/* Use the default MSS. */
 				tp->t_maxseg = V_tcp_mssdflt;
 				/*
 				 * Disable Path MTU Discovery when we switch
 				 * to minmss.
 				 */
 				tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
 				KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss);
 			}
 #endif
 		} else {
 			/*
 			 * If further retransmissions are still unsuccessful
 			 * with a lowered MTU, maybe this isn't a blackhole
 			 * and we restore the previous MSS and blackhole
 			 * detection flags. The limit '6' is determined by
 			 * giving each probe stage (1448, 1188, 524) 2
 			 * chances to recover.
 			 */
 			if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) &&
 			    (tp->t_rxtshift >= 6)) {
 				tp->t_flags2 |= TF2_PLPMTU_PMTUD;
 				tp->t_flags2 &= ~TF2_PLPMTU_BLACKHOLE;
 				tp->t_maxseg = tp->t_pmtud_saved_maxseg;
 				KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_failed);
 			}
 		}
 	}
 	/*
 	 * Disable RFC1323 and SACK if we haven't got any response to
 	 * our third SYN to work-around some broken terminal servers
 	 * (most of which have hopefully been retired) that have bad VJ
 	 * header compression code which trashes TCP segments containing
 	 * unknown-to-them TCP options.
 	 */
 	if (tcp_rexmit_drop_options && (tp->t_state == TCPS_SYN_SENT) &&
 	    (tp->t_rxtshift == 3))
 		tp->t_flags &= ~(TF_REQ_SCALE|TF_REQ_TSTMP|TF_SACK_PERMIT);
 	/*
 	 * If we backed off this far, our srtt estimate is probably bogus.
 	 * Clobber it so we'll take the next rtt measurement as our srtt;
 	 * move the current srtt into rttvar to keep the current retransmit
 	 * times until then.
 	 */
 	if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) {
 #ifdef INET6
 		if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0)
 			in6_losing(tp->t_inpcb);
 		else
 #endif
 			in_losing(tp->t_inpcb);
 		tp->t_rttvar += tp->t_srtt;
 		tp->t_srtt = 0;
 	}
 	sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una);
 	tp->snd_recover = tp->snd_max;
 	tp->t_flags |= TF_ACKNOW;
 	tp->t_rtttime = 0;
 	/* Report the RTO to the congestion-control code. */
 	rack_cong_signal(tp, CC_RTO, tp->snd_una, __LINE__);
 out:
 	return (retval);
 }
 
 static int
 rack_process_timers(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, uint8_t hpts_calling, uint8_t *doing_tlp)
 {
 	int32_t ret = 0;
 	int32_t timers = (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK);
 
 	if ((tp->t_state >= TCPS_FIN_WAIT_1) &&
 	    (tp->t_flags & TF_GPUTINPROG)) {
 		/*
 		 * We have a goodput in progress
 		 * and we have entered a late state.
 		 * Do we have enough data in the sb
 		 * to handle the GPUT request?
 		 */
 		uint32_t bytes;
 
 		bytes = tp->gput_ack - tp->gput_seq;
 		if (SEQ_GT(tp->gput_seq, tp->snd_una))
 			bytes += tp->gput_seq - tp->snd_una;
 		if (bytes > sbavail(&tp->t_inpcb->inp_socket->so_snd)) {
 			/*
 			 * There are not enough bytes in the socket
 			 * buffer that have been sent to cover this
 			 * measurement. Cancel it.
 			 */
 			rack_log_pacing_delay_calc(rack, (tp->gput_ack - tp->gput_seq) /*flex2*/,
 						   rack->r_ctl.rc_gp_srtt /*flex1*/,
 						   tp->gput_seq,
 						   0, 0, 18, __LINE__, NULL, 0);
 			tp->t_flags &= ~TF_GPUTINPROG;
 		}
 	}
 	if (timers == 0) {
 		return (0);
 	}
 	if (tp->t_state == TCPS_LISTEN) {
 		/* no timers on listen sockets */
 		if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)
 			return (0);
 		return (1);
 	}
 	if ((timers & PACE_TMR_RACK) &&
 	    rack->rc_on_min_to) {
 		/*
 		 * For the rack timer when we
 		 * are on a min-timeout (which means rrr_conf = 3)
 		 * we don't want to check the timer. It may
 		 * be going off for a pace and thats ok we
 		 * want to send the retransmit (if its ready).
 		 *
 		 * If its on a normal rack timer (non-min) then
 		 * we will check if its expired.
 		 */
 		goto skip_time_check;
 	}
 	if (TSTMP_LT(cts, rack->r_ctl.rc_timer_exp)) {
 		uint32_t left;
 
 		if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) {
 			ret = -1;
 			rack_log_to_processing(rack, cts, ret, 0);
 			return (0);
 		}
 		if (hpts_calling == 0) {
 			/*
 			 * A user send or queued mbuf (sack) has called us? We
 			 * return 0 and let the pacing guards
 			 * deal with it if they should or
 			 * should not cause a send.
 			 */
 			ret = -2;
 			rack_log_to_processing(rack, cts, ret, 0);
 			return (0);
 		}
 		/*
 		 * Ok our timer went off early and we are not paced false
 		 * alarm, go back to sleep.
 		 */
 		ret = -3;
 		left = rack->r_ctl.rc_timer_exp - cts;
 		tcp_hpts_insert(tp->t_inpcb, HPTS_MS_TO_SLOTS(left));
 		rack_log_to_processing(rack, cts, ret, left);
 		return (1);
 	}
 skip_time_check:
 	rack->rc_tmr_stopped = 0;
 	rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_MASK;
 	if (timers & PACE_TMR_DELACK) {
 		ret = rack_timeout_delack(tp, rack, cts);
 	} else if (timers & PACE_TMR_RACK) {
 		rack->r_ctl.rc_tlp_rxt_last_time = cts;
 		rack->r_fast_output = 0;
 		ret = rack_timeout_rack(tp, rack, cts);
 	} else if (timers & PACE_TMR_TLP) {
 		rack->r_ctl.rc_tlp_rxt_last_time = cts;
 		ret = rack_timeout_tlp(tp, rack, cts, doing_tlp);
 	} else if (timers & PACE_TMR_RXT) {
 		rack->r_ctl.rc_tlp_rxt_last_time = cts;
 		rack->r_fast_output = 0;
 		ret = rack_timeout_rxt(tp, rack, cts);
 	} else if (timers & PACE_TMR_PERSIT) {
 		ret = rack_timeout_persist(tp, rack, cts);
 	} else if (timers & PACE_TMR_KEEP) {
 		ret = rack_timeout_keepalive(tp, rack, cts);
 	}
 	rack_log_to_processing(rack, cts, ret, timers);
 	return (ret);
 }
 
 static void
 rack_timer_cancel(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int line)
 {
 	struct timeval tv;
 	uint32_t us_cts, flags_on_entry;
 	uint8_t hpts_removed = 0;
 
 	flags_on_entry = rack->r_ctl.rc_hpts_flags;
 	us_cts = tcp_get_usecs(&tv);
 	if ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) &&
 	    ((TSTMP_GEQ(us_cts, rack->r_ctl.rc_last_output_to)) ||
 	     ((tp->snd_max - tp->snd_una) == 0))) {
 		tcp_hpts_remove(rack->rc_inp);
 		hpts_removed = 1;
 		/* If we were not delayed cancel out the flag. */
 		if ((tp->snd_max - tp->snd_una) == 0)
 			rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT;
 		rack_log_to_cancel(rack, hpts_removed, line, us_cts, &tv, flags_on_entry);
 	}
 	if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) {
 		rack->rc_tmr_stopped = rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK;
 		if (tcp_in_hpts(rack->rc_inp) &&
 		    ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0)) {
 			/*
 			 * Canceling timer's when we have no output being
 			 * paced. We also must remove ourselves from the
 			 * hpts.
 			 */
 			tcp_hpts_remove(rack->rc_inp);
 			hpts_removed = 1;
 		}
 		rack->r_ctl.rc_hpts_flags &= ~(PACE_TMR_MASK);
 	}
 	if (hpts_removed == 0)
 		rack_log_to_cancel(rack, hpts_removed, line, us_cts, &tv, flags_on_entry);
 }
 
 /* Intentionally does nothing; both arguments are ignored. */
 static void
 rack_timer_stop(struct tcpcb *tp, uint32_t timer_type)
 {
 }
 
 /* Flag the rack control block as having its timers stopped; returns 0. */
 static int
 rack_stopall(struct tcpcb *tp)
 {
 	struct tcp_rack *rack = (struct tcp_rack *)tp->t_fb_ptr;
 
 	rack->t_timers_stopped = 1;
 	return (0);
 }
 
 /* Intentionally does nothing; all arguments are ignored. */
 static void
 rack_timer_activate(struct tcpcb *tp, uint32_t timer_type, uint32_t delta)
 {
 }
 
 /* Always reports "not active" (0), whatever timer type is asked about. */
 static int
 rack_timer_active(struct tcpcb *tp, uint32_t timer_type)
 {
 	int active = 0;
 
 	return (active);
 }
 
 /*
  * Suspend the persist, retransmit, keep-alive and delayed-ack timers
  * through tcp_timer_suspend().  If the persist timer was active,
  * rack is first marked as being in persist.
  */
 static void
 rack_stop_all_timers(struct tcpcb *tp)
 {
 	const uint32_t to_suspend[] = { TT_PERSIST, TT_REXMT, TT_KEEP, TT_DELACK };
 	unsigned int i;
 
 	if (tcp_timer_active(tp, TT_PERSIST)) {
 		/* We enter in persists, set the flag appropriately */
 		((struct tcp_rack *)tp->t_fb_ptr)->rc_in_persist = 1;
 	}
 	/* Make sure none of these timers keeps running. */
 	for (i = 0; i < sizeof(to_suspend) / sizeof(to_suspend[0]); i++)
 		tcp_timer_suspend(tp, to_suspend[i]);
 }
 
 static void
 rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack,
     struct rack_sendmap *rsm, uint64_t ts, uint16_t add_flag)
 {
 	int32_t idx;
 
 	rsm->r_rtr_cnt++;
 	rack_log_retran_reason(rack, rsm, __LINE__, 0, 2);
 	rsm->r_dupack = 0;
 	if (rsm->r_rtr_cnt > RACK_NUM_OF_RETRANS) {
 		rsm->r_rtr_cnt = RACK_NUM_OF_RETRANS;
 		rsm->r_flags |= RACK_OVERMAX;
 	}
 	if ((rsm->r_rtr_cnt > 1) && ((rsm->r_flags & RACK_TLP) == 0)) {
 		rack->r_ctl.rc_holes_rxt += (rsm->r_end - rsm->r_start);
 		rsm->r_rtr_bytes += (rsm->r_end - rsm->r_start);
 	}
 	idx = rsm->r_rtr_cnt - 1;
 	rsm->r_tim_lastsent[idx] = ts;
 	/*
 	 * Here we don't add in the len of send, since its already
 	 * in snduna <->snd_max.
 	 */
 	rsm->r_fas = ctf_flight_size(rack->rc_tp,
 				     rack->r_ctl.rc_sacked);
 	if (rsm->r_flags & RACK_ACKED) {
 		/* Problably MTU discovery messing with us */
 		rsm->r_flags &= ~RACK_ACKED;
 		rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start);
 	}
 	if (rsm->r_in_tmap) {
 		TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext);
 		rsm->r_in_tmap = 0;
 	}
 	TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext);
 	rsm->r_in_tmap = 1;
 	/* Take off the must retransmit flag, if its on */
 	if (rsm->r_flags & RACK_MUST_RXT) {
 		if (rack->r_must_retran)
 			rack->r_ctl.rc_out_at_rto -= (rsm->r_end - rsm->r_start);
 		if (SEQ_GEQ(rsm->r_end, rack->r_ctl.rc_snd_max_at_rto)) {
 			/*
 			 * We have retransmitted all we need. Clear
 			 * any must retransmit flags.
 			 */
 			rack->r_must_retran = 0;
 			rack->r_ctl.rc_out_at_rto = 0;
 		}
 		rsm->r_flags &= ~RACK_MUST_RXT;
 	}
 	if (rsm->r_flags & RACK_SACK_PASSED) {
 		/* We have retransmitted due to the SACK pass */
 		rsm->r_flags &= ~RACK_SACK_PASSED;
 		rsm->r_flags |= RACK_WAS_SACKPASS;
 	}
 }
 
 /*
  * Record a (re)transmission of *lenp bytes starting at rsm->r_start.
  *
  * If the send covers the whole rsm, the rsm is updated.  When the send
  * extends past r_end, *lenp is set to the number of bytes left over
  * and r_end is returned so the caller can continue from there.
  * If the send covers only part of the rsm, the rsm is split: the rsm
  * keeps the part that was sent and a new entry holds the tail that
  * was not.  In that case, in the exact-fit case, and when no entry
  * can be allocated for the split, *lenp is set to 0 and 0 is returned.
  */
 static uint32_t
 rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack,
     struct rack_sendmap *rsm, uint64_t ts, int32_t *lenp, uint16_t add_flag)
 {
 	struct rack_sendmap *tail;
 #ifdef INVARIANTS
 	struct rack_sendmap *dup;
 #endif
 	uint32_t sent_end;
 	int32_t sent_len = *lenp;
 
 	sent_end = rsm->r_start + sent_len;
 	if (SEQ_GEQ(sent_end, rsm->r_end)) {
 		int32_t covered;
 
 		/* The whole piece went out, possibly with more behind it. */
 		rack_update_rsm(tp, rack, rsm, ts, add_flag);
 		if (sent_end == rsm->r_end) {
 			*lenp = 0;
 			return (0);
 		}
 		/* It hangs over the end: hand back what is left. */
 		covered = rsm->r_end - rsm->r_start;
 		*lenp = (sent_len - covered);
 		return (rsm->r_end);
 	}
 	/*
 	 * Less than the whole piece was sent, so it has to be split into
 	 * the part that was transmitted and the part that was not.
 	 */
 	tail = rack_alloc_full_limit(rack);
 	if (tail == NULL) {
 		/* No memory for the split; do not proceed. */
 		*lenp = 0;
 		return (0);
 	}
 	/*
 	 * The original rsm becomes the part that was retransmitted and
 	 * 'tail' the remainder.  For example, if the chunk was 1..11
 	 * (10 bytes) and 5 bytes were sent, the original shrinks to
 	 * 1..6 and the new piece is 6..11.
 	 */
 	rack_clone_rsm(rack, tail, rsm, sent_end);
 	tail->r_dupack = 0;
 	rack_log_retran_reason(rack, tail, __LINE__, 0, 2);
 #ifdef INVARIANTS
 	dup = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, tail);
 	if (dup != NULL) {
 		panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p",
 		      tail, dup, rack, rsm);
 	}
 #else
 	(void)RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, tail);
 #endif
 	/* Keep the tail right behind the original in the transmit map. */
 	if (rsm->r_in_tmap) {
 		TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, tail, r_tnext);
 		tail->r_in_tmap = 1;
 	}
 	rsm->r_flags &= (~RACK_HAS_FIN);
 	rack_update_rsm(tp, rack, rsm, ts, add_flag);
 	/* Log the split of rsm into rsm and tail. */
 	rack_log_map_chg(tp, rack, NULL, rsm, tail, MAP_SPLIT, 0, __LINE__);
 	*lenp = 0;
 	return (0);
 }
 
 static void
 rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len,
 		uint32_t seq_out, uint16_t th_flags, int32_t err, uint64_t cts,
 		struct rack_sendmap *hintrsm, uint16_t add_flag, struct mbuf *s_mb, uint32_t s_moff, int hw_tls)
 {
 	struct tcp_rack *rack;
 	struct rack_sendmap *rsm, *nrsm, fe;
 #ifdef INVARIANTS
 	struct rack_sendmap *insret;
 #endif
 	register uint32_t snd_max, snd_una;
 
 	/*
 	 * Add to the RACK log of packets in flight or retransmitted. If
 	 * there is a TS option we will use the TS echoed, if not we will
 	 * grab a TS.
 	 *
 	 * Retransmissions will increment the count and move the ts to its
 	 * proper place. Note that if options do not include TS's then we
 	 * won't be able to effectively use the ACK for an RTT on a retran.
 	 *
 	 * Notes about r_start and r_end. Lets consider a send starting at
 	 * sequence 1 for 10 bytes. In such an example the r_start would be
 	 * 1 (starting sequence) but the r_end would be r_start+len i.e. 11.
 	 * This means that r_end is actually the first sequence for the next
 	 * slot (11).
 	 *
 	 */
 	/*
 	 * If err is set what do we do XXXrrs? should we not add the thing?
 	 * -- i.e. return if err != 0 or should we pretend we sent it? --
 	 * i.e. proceed with add ** do this for now.
 	 */
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 	if (err)
 		/*
 		 * We don't log errors -- we could but snd_max does not
 		 * advance in this case either.
 		 */
 		return;
 
 	if (th_flags & TH_RST) {
 		/*
 		 * We don't log resets and we return immediately from
 		 * sending
 		 */
 		return;
 	}
 	rack = (struct tcp_rack *)tp->t_fb_ptr;
 	snd_una = tp->snd_una;
 	snd_max = tp->snd_max;
 	if (th_flags & (TH_SYN | TH_FIN)) {
 		/*
 		 * The call to rack_log_output is made before bumping
 		 * snd_max. This means we can record one extra byte on a SYN
 		 * or FIN if seq_out is adding more on and a FIN is present
 		 * (and we are not resending).
 		 */
 		if ((th_flags & TH_SYN) && (seq_out == tp->iss))
 			len++;
 		if (th_flags & TH_FIN)
 			len++;
 		if (SEQ_LT(snd_max, tp->snd_nxt)) {
 			/*
 			 * The add/update as not been done for the FIN/SYN
 			 * yet.
 			 */
 			snd_max = tp->snd_nxt;
 		}
 	}
 	if (SEQ_LEQ((seq_out + len), snd_una)) {
 		/* Are sending an old segment to induce an ack (keep-alive)? */
 		return;
 	}
 	if (SEQ_LT(seq_out, snd_una)) {
 		/* huh? should we panic? */
 		uint32_t end;
 
 		end = seq_out + len;
 		seq_out = snd_una;
 		if (SEQ_GEQ(end, seq_out))
 			len = end - seq_out;
 		else
 			len = 0;
 	}
 	if (len == 0) {
 		/* We don't log zero window probes */
 		return;
 	}
 	if (IN_FASTRECOVERY(tp->t_flags)) {
 		rack->r_ctl.rc_prr_out += len;
 	}
 	/* First question is it a retransmission or new? */
 	if (seq_out == snd_max) {
 		/* Its new */
 again:
 		rsm = rack_alloc(rack);
 		if (rsm == NULL) {
 			/*
 			 * Hmm out of memory and the tcb got destroyed while
 			 * we tried to wait.
 			 */
 			return;
 		}
 		if (th_flags & TH_FIN) {
 			rsm->r_flags = RACK_HAS_FIN|add_flag;
 		} else {
 			rsm->r_flags = add_flag;
 		}
 		if (hw_tls)
 			rsm->r_hw_tls = 1;
 		rsm->r_tim_lastsent[0] = cts;
 		rsm->r_rtr_cnt = 1;
 		rsm->r_rtr_bytes = 0;
 		if (th_flags & TH_SYN) {
 			/* The data space is one beyond snd_una */
 			rsm->r_flags |= RACK_HAS_SYN;
 		}
 		rsm->r_start = seq_out;
 		rsm->r_end = rsm->r_start + len;
 		rsm->r_dupack = 0;
 		/*
 		 * save off the mbuf location that
 		 * sndmbuf_noadv returned (which is
 		 * where we started copying from)..
 		 */
 		rsm->m = s_mb;
 		rsm->soff = s_moff;
 		/*
 		 * Here we do add in the len of send, since its not yet
 		 * reflected in in snduna <->snd_max
 		 */
 		rsm->r_fas = (ctf_flight_size(rack->rc_tp,
 					      rack->r_ctl.rc_sacked) +
 			      (rsm->r_end - rsm->r_start));
 		/* rsm->m will be NULL if RACK_HAS_SYN or RACK_HAS_FIN is set */
 		if (rsm->m) {
 			if (rsm->m->m_len <= rsm->soff) {
 				/*
 				 * XXXrrs Question, will this happen?
 				 *
 				 * If sbsndptr is set at the correct place
 				 * then s_moff should always be somewhere
 				 * within rsm->m. But if the sbsndptr was
 				 * off then that won't be true. If it occurs
 				 * we need to walkout to the correct location.
 				 */
 				struct mbuf *lm;
 
 				lm = rsm->m;
 				while (lm->m_len <= rsm->soff) {
 					rsm->soff -= lm->m_len;
 					lm = lm->m_next;
 					KASSERT(lm != NULL, ("%s rack:%p lm goes null orig_off:%u origmb:%p rsm->soff:%u",
 							     __func__, rack, s_moff, s_mb, rsm->soff));
 				}
 				rsm->m = lm;
 			}
 			rsm->orig_m_len = rsm->m->m_len;
 		} else
 			rsm->orig_m_len = 0;
 		rack_log_retran_reason(rack, rsm, __LINE__, 0, 2);
 		/* Log a new rsm */
 		rack_log_map_chg(tp, rack, NULL, rsm, NULL, MAP_NEW, 0, __LINE__);
 #ifndef INVARIANTS
 		(void)RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
 #else
 		insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
 		if (insret != NULL) {
 			panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p",
 			      nrsm, insret, rack, rsm);
 		}
 #endif
 		TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext);
 		rsm->r_in_tmap = 1;
 		/*
 		 * Special case detection, is there just a single
 		 * packet outstanding when we are not in recovery?
 		 *
 		 * If this is true mark it so.
 		 */
 		if ((IN_FASTRECOVERY(tp->t_flags) == 0) &&
 		    (ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) == ctf_fixed_maxseg(tp))) {
 			struct rack_sendmap *prsm;
 
 			prsm = RB_PREV(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
 			if (prsm)
 				prsm->r_one_out_nr = 1;
 		}
 		return;
 	}
 	/*
 	 * If we reach here its a retransmission and we need to find it.
 	 */
 	memset(&fe, 0, sizeof(fe));
 more:
 	if (hintrsm && (hintrsm->r_start == seq_out)) {
 		rsm = hintrsm;
 		hintrsm = NULL;
 	} else {
 		/* No hints sorry */
 		rsm = NULL;
 	}
 	if ((rsm) && (rsm->r_start == seq_out)) {
 		seq_out = rack_update_entry(tp, rack, rsm, cts, &len, add_flag);
 		if (len == 0) {
 			return;
 		} else {
 			goto more;
 		}
 	}
 	/* Ok it was not the last pointer go through it the hard way. */
 refind:
 	fe.r_start = seq_out;
 	rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe);
 	if (rsm) {
 		if (rsm->r_start == seq_out) {
 			seq_out = rack_update_entry(tp, rack, rsm, cts, &len, add_flag);
 			if (len == 0) {
 				return;
 			} else {
 				goto refind;
 			}
 		}
 		if (SEQ_GEQ(seq_out, rsm->r_start) && SEQ_LT(seq_out, rsm->r_end)) {
 			/* Transmitted within this piece */
 			/*
 			 * Ok we must split off the front and then let the
 			 * update do the rest
 			 */
 			nrsm = rack_alloc_full_limit(rack);
 			if (nrsm == NULL) {
 				rack_update_rsm(tp, rack, rsm, cts, add_flag);
 				return;
 			}
 			/*
 			 * copy rsm to nrsm and then trim the front of rsm
 			 * to not include this part.
 			 */
 			rack_clone_rsm(rack, nrsm, rsm, seq_out);
 			rack_log_map_chg(tp, rack, NULL, rsm, nrsm, MAP_SPLIT, 0, __LINE__);
 #ifndef INVARIANTS
 			(void)RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm);
 #else
 			insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm);
 			if (insret != NULL) {
 				panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p",
 				      nrsm, insret, rack, rsm);
 			}
 #endif
 			if (rsm->r_in_tmap) {
 				TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
 				nrsm->r_in_tmap = 1;
 			}
 			rsm->r_flags &= (~RACK_HAS_FIN);
 			seq_out = rack_update_entry(tp, rack, nrsm, cts, &len, add_flag);
 			if (len == 0) {
 				return;
 			} else if (len > 0)
 				goto refind;
 		}
 	}
 	/*
 	 * Hmm not found in map did they retransmit both old and on into the
 	 * new?
 	 */
 	if (seq_out == tp->snd_max) {
 		goto again;
 	} else if (SEQ_LT(seq_out, tp->snd_max)) {
 #ifdef INVARIANTS
 		printf("seq_out:%u len:%d snd_una:%u snd_max:%u -- but rsm not found?\n",
 		       seq_out, len, tp->snd_una, tp->snd_max);
 		printf("Starting Dump of all rack entries\n");
 		RB_FOREACH(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree) {
 			printf("rsm:%p start:%u end:%u\n",
 			       rsm, rsm->r_start, rsm->r_end);
 		}
 		printf("Dump complete\n");
 		panic("seq_out not found rack:%p tp:%p",
 		      rack, tp);
 #endif
 	} else {
 #ifdef INVARIANTS
 		/*
 		 * Hmm beyond sndmax? (only if we are using the new rtt-pack
 		 * flag)
 		 */
 		panic("seq_out:%u(%d) is beyond snd_max:%u tp:%p",
 		      seq_out, len, tp->snd_max, tp);
 #endif
 	}
 }
 
 /*
  * Fold one RTT measurement taken from an ack into the per-ack
  * sample structure (rack->r_ctl.rack_rs).
  *
  * rtt feeds the lowest/highest/total/count fields of the sample.
  * us_rtt, confidence and rtrcnt are stored together, and only when
  * the sample is still empty or us_rtt is smaller than the stored
  * rs_us_rtt.  rsm may be NULL.
  */
 static void
 tcp_rack_xmit_timer(struct tcp_rack *rack, int32_t rtt, uint32_t len, uint32_t us_rtt,
 		    int confidence, struct rack_sendmap *rsm, uint16_t rtrcnt)
 {
 	int rs_empty;
 
 	/* rs_flags is not written until the very end, so test it once */
 	rs_empty = ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) != 0);
 	/* Track the smallest and largest rtt seen in this sample */
 	if (rs_empty || (rack->r_ctl.rack_rs.rs_rtt_lowest > rtt))
 		rack->r_ctl.rack_rs.rs_rtt_lowest = rtt;
 	if (rs_empty || (rack->r_ctl.rack_rs.rs_rtt_highest < rtt))
 		rack->r_ctl.rack_rs.rs_rtt_highest = rtt;
 	if (rack->rc_tp->t_flags & TF_GPUTINPROG) {
 		/*
 		 * While TF_GPUTINPROG is set also remember the lowest
 		 * us_rtt and the highest send window observed.
 		 */
 		if (us_rtt < rack->r_ctl.rc_gp_lowrtt)
 			rack->r_ctl.rc_gp_lowrtt = us_rtt;
 		if (rack->rc_tp->snd_wnd > rack->r_ctl.rc_gp_high_rwnd)
 			rack->r_ctl.rc_gp_high_rwnd = rack->rc_tp->snd_wnd;
 	}
 	if (confidence == 1) {
 		/*
 		 * A confidence of 1 is lowered to 0 when we cannot
 		 * trust the measurement for buffer determination:
 		 * there is no rsm, the rsm had a just-return hit it,
 		 * or the rsm is marked r_one_out_nr and less than two
 		 * segments worth of data was acked. A confidence of 2
 		 * (SACK'd) is never lowered here, it overrides
 		 * r_just_ret and r_one_out_nr.
 		 */
 		if ((rsm == NULL) ||
 		    rsm->r_just_ret ||
 		    (rsm->r_one_out_nr &&
 		     (len < (ctf_fixed_maxseg(rack->rc_tp) * 2))))
 			confidence = 0;
 	}
 	if (rs_empty || (rack->r_ctl.rack_rs.rs_us_rtt > us_rtt)) {
 		/*
 		 * With no confidence saved yet we take anything. Once
 		 * a confident number is saved it may still be replaced
 		 * by a smaller value, since the confident number may
 		 * include the DSACK time until the next segment (the
 		 * second one) arrived. Either way the new values are
 		 * stored.
 		 */
 		rack->r_ctl.rack_rs.rs_us_rtt = us_rtt;
 		rack->r_ctl.rack_rs.confidence = confidence;
 		rack->r_ctl.rack_rs.rs_us_rtrcnt = rtrcnt;
 	}
 	rack_log_rtt_upd(rack->rc_tp, rack, us_rtt, len, rsm, confidence);
 	/* The sample now holds data, account for this measurement */
 	rack->r_ctl.rack_rs.rs_flags = RACK_RTT_VALID;
 	rack->r_ctl.rack_rs.rs_rtt_tot += rtt;
 	rack->r_ctl.rack_rs.rs_rtt_cnt++;
 }
 
 /*
  * Collect new round-trip time estimate
  * and update averages and current timeout.
  *
  * Consumes the sample accumulated in rack->r_ctl.rack_rs. Nothing
  * is done when the sample is still marked RACK_RTT_EMPTY. Otherwise
  * the rtt used for smoothing is picked from the sample according to
  * rc_rate_sample_method (lowest, highest or average), the rc_gp_srtt
  * and the lowest/highest/last microsecond RTT trackers in rack->r_ctl
  * are refreshed, tp->t_srtt, tp->t_rttvar and tp->t_rttbest are
  * updated, and finally t_rxtshift and t_softerror are cleared and
  * t_rxtcur is recomputed.
  */
 static void
 tcp_rack_xmit_timer_commit(struct tcp_rack *rack, struct tcpcb *tp)
 {
 	int32_t delta;
 	int32_t rtt;
 
 	if (rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY)
 		/* No valid sample */
 		return;
 	if (rack->r_ctl.rc_rate_sample_method == USE_RTT_LOW) {
 		/* We are to use the lowest RTT seen in a single ack */
 		rtt = rack->r_ctl.rack_rs.rs_rtt_lowest;
 	} else if (rack->r_ctl.rc_rate_sample_method == USE_RTT_HIGH) {
 		/* We are to use the highest RTT seen in a single ack */
 		rtt = rack->r_ctl.rack_rs.rs_rtt_highest;
 	} else if (rack->r_ctl.rc_rate_sample_method == USE_RTT_AVG) {
 		/* We are to use the average RTT seen in a single ack */
 		rtt = (int32_t)(rack->r_ctl.rack_rs.rs_rtt_tot /
 				(uint64_t)rack->r_ctl.rack_rs.rs_rtt_cnt);
 	} else {
 		/* Unknown method: panic with INVARIANTS, otherwise ignore the sample */
 #ifdef INVARIANTS
 		panic("Unknown rtt variant %d", rack->r_ctl.rc_rate_sample_method);
 #endif
 		return;
 	}
 	/* Never feed a zero rtt into the smoothing below */
 	if (rtt == 0)
 		rtt = 1;
 	if (rack->rc_gp_rtt_set == 0) {
 		/*
 		 * With no RTT we have to accept
 		 * even one we are not confident of.
 		 */
 		rack->r_ctl.rc_gp_srtt = rack->r_ctl.rack_rs.rs_us_rtt;
 		rack->rc_gp_rtt_set = 1;
 	} else if (rack->r_ctl.rack_rs.confidence) {
 		/* update the running gp srtt (7/8 old + 1/8 new) */
 		rack->r_ctl.rc_gp_srtt -= (rack->r_ctl.rc_gp_srtt/8);
 		rack->r_ctl.rc_gp_srtt += rack->r_ctl.rack_rs.rs_us_rtt / 8;
 	}
 	if (rack->r_ctl.rack_rs.confidence) {
 		/*
 		 * record the low and high for highly buffered path computation,
 		 * we only do this if we are confident (not a retransmission).
 		 */
 		if (rack->r_ctl.rc_highest_us_rtt < rack->r_ctl.rack_rs.rs_us_rtt) {
 			rack->r_ctl.rc_highest_us_rtt = rack->r_ctl.rack_rs.rs_us_rtt;
 		}
 		if (rack->rc_highly_buffered == 0) {
 			/*
 			 * Currently once we declare a path has
 			 * highly buffered there is no going
 			 * back, which may be a problem...
 			 *
 			 * NOTE(review): this divides by rc_lowest_us_rtt.
 			 * The update below never leaves it at 0, but its
 			 * initial value is not visible here -- confirm it
 			 * cannot be 0 on the first confident sample.
 			 */
 			if ((rack->r_ctl.rc_highest_us_rtt / rack->r_ctl.rc_lowest_us_rtt) > rack_hbp_thresh) {
 				rack_log_rtt_shrinks(rack, rack->r_ctl.rack_rs.rs_us_rtt,
 						     rack->r_ctl.rc_highest_us_rtt,
 						     rack->r_ctl.rc_lowest_us_rtt,
 						     RACK_RTTS_SEEHBP);
 				rack->rc_highly_buffered = 1;
 			}
 		}
 	}
 	if ((rack->r_ctl.rack_rs.confidence) ||
 	    (rack->r_ctl.rack_rs.rs_us_rtrcnt == 1)) {
 		/*
 		 * If we are highly confident of it <or> it was
 		 * never retransmitted we accept it as the last us_rtt.
 		 */
 		rack->r_ctl.rc_last_us_rtt = rack->r_ctl.rack_rs.rs_us_rtt;
 		/* The lowest rtt can be set if it was not retransmitted */
 		if (rack->r_ctl.rc_lowest_us_rtt > rack->r_ctl.rack_rs.rs_us_rtt) {
 			rack->r_ctl.rc_lowest_us_rtt = rack->r_ctl.rack_rs.rs_us_rtt;
 			/* Keep it non-zero, it is used as a divisor above */
 			if (rack->r_ctl.rc_lowest_us_rtt == 0)
 				rack->r_ctl.rc_lowest_us_rtt = 1;
 		}
 	}
 	/*
 	 * NOTE(review): rack is re-derived from tp here; presumably it
 	 * is the same object the caller passed in -- confirm whether
 	 * this reassignment is still needed.
 	 */
 	rack = (struct tcp_rack *)tp->t_fb_ptr;
 	if (tp->t_srtt != 0) {
 		/*
 		 * We keep a simple srtt in microseconds, like our rtt
 		 * measurement. We don't need to do any tricks with shifting
 		 * etc. Instead we just add in 1/8th of the new measurement
 		 * and subtract out 1/8 of the old srtt. We do the same with
 		 * the variance after finding the absolute value of the
 		 * difference between this sample and the current srtt.
 		 */
 		delta = tp->t_srtt - rtt;
 		/* Take off 1/8th of the current sRTT */
 		tp->t_srtt -= (tp->t_srtt >> 3);
 		/* Add in 1/8th of the new RTT just measured */
 		tp->t_srtt += (rtt >> 3);
 		if (tp->t_srtt <= 0)
 			tp->t_srtt = 1;
 		/* Now lets make the absolute value of the variance */
 		if (delta < 0)
 			delta = -delta;
 		/* Subtract out 1/8th */
 		tp->t_rttvar -= (tp->t_rttvar >> 3);
 		/* Add in 1/8th of the new variance we just saw */
 		tp->t_rttvar += (delta >> 3);
 		if (tp->t_rttvar <= 0)
 			tp->t_rttvar = 1;
 		/* t_rttbest only ever moves down to the smallest srtt + rttvar */
 		if (tp->t_rttbest > tp->t_srtt + tp->t_rttvar)
 			tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
 	} else {
 		/*
 		 * No rtt measurement yet - use the unsmoothed rtt. Set the
 		 * variance to half the rtt (so our first retransmit happens
 		 * at 3*rtt).
 		 */
 		tp->t_srtt = rtt;
 		tp->t_rttvar = rtt >> 1;
 		tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
 	}
 	rack->rc_srtt_measure_made = 1;
 	KMOD_TCPSTAT_INC(tcps_rttupdated);
 	tp->t_rttupdated++;
 #ifdef STATS
 	/* rack_stats_gets_ms_rtt selects which value and unit is reported */
 	if (rack_stats_gets_ms_rtt == 0) {
 		/* Send in the microsecond rtt used for rxt timeout purposes */
 		stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT, imax(0, rtt));
 	} else if (rack_stats_gets_ms_rtt == 1) {
 		/* Send in the millisecond rtt used for rxt timeout purposes */
 		int32_t ms_rtt;
 
 		/* Round up */
 		ms_rtt = (rtt + HPTS_USEC_IN_MSEC - 1) / HPTS_USEC_IN_MSEC;
 		stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT, imax(0, ms_rtt));
 	} else if (rack_stats_gets_ms_rtt == 2) {
 		/* Send in the millisecond rtt as close to the path RTT as we can get */
 		int32_t ms_rtt;
 
 		/* Round up */
 		ms_rtt = (rack->r_ctl.rack_rs.rs_us_rtt + HPTS_USEC_IN_MSEC - 1) / HPTS_USEC_IN_MSEC;
 		stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT, imax(0, ms_rtt));
 	}  else {
 		/* Send in the microsecond rtt as close to the path RTT as we can get */
 		stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT, imax(0, rack->r_ctl.rack_rs.rs_us_rtt));
 	}
 
 #endif
 	/*
 	 * the retransmit should happen at rtt + 4 * rttvar. Because of the
 	 * way we do the smoothing, srtt and rttvar will each average +1/2
 	 * tick of bias.  When we compute the retransmit timer, we want 1/2
 	 * tick of rounding and 1 extra tick because of +-1/2 tick
 	 * uncertainty in the firing of the timer.  The bias will give us
 	 * exactly the 1.5 tick we need.  But, because the bias is
 	 * statistical, we have to test that we don't drop below the minimum
 	 * feasible timer (which is 2 ticks).
 	 */
 	tp->t_rxtshift = 0;
 	/* Lower bound is the larger of rack_rto_min and rtt + 2 */
 	RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp),
 		      max(rack_rto_min, rtt + 2), rack_rto_max, rack->r_ctl.timer_slop);
 	rack_log_rtt_sample(rack, rtt);
 	tp->t_softerror = 0;
 }
 
 
 /*
  * Apply the inbound us_rtt, measured at time us_cts, to the
  * rc_gp_min_rtt filter.
  *
  * When the new value is below what the filter held before, the
  * shrink is logged. If it shrank by more than rack_min_rtt_movement
  * we record us_cts as the time of the last lower rtt and, when the
  * knobs allow it, may enter probe-rtt early.
  */
 static void
 rack_apply_updated_usrtt(struct tcp_rack *rack, uint32_t us_rtt, uint32_t us_cts)
 {
 	uint32_t prev_min, slack;
 
 	/* Grab the filter value before the new sample is applied */
 	prev_min = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt);
 	apply_filter_min_small(&rack->r_ctl.rc_gp_min_rtt, us_rtt, us_cts);
 	if (prev_min <= us_rtt) {
 		/* Not a new low, nothing else to do */
 		return;
 	}
 	/* We just hit a new lower rtt time */
 	rack_log_rtt_shrinks(rack, us_cts, prev_min,
 			     __LINE__, RACK_RTTS_NEWRTT);
 	if ((prev_min - us_rtt) <= rack_min_rtt_movement) {
 		/*
 		 * We only count it if it moved down by more than
 		 * the minimum movement.
 		 */
 		return;
 	}
 	if (rack_probertt_lower_within &&
 	    rack->rc_gp_dyn_mul &&
 	    (rack->use_fixed_rate == 0) &&
 	    rack->rc_always_pace &&
 	    (rack->in_probe_rtt == 0)) {
 		/*
 		 * If this new lower rtt shows up very close to the
 		 * time we would have entered probe-rtt anyway it is
 		 * probably because a peer flow has entered probe-rtt,
 		 * so go in now too. "Very close" is within
 		 * rack_probertt_lower_within percent of the time
 		 * between probe-rtt's.
 		 */
 		slack = rack_probertt_lower_within * rack_time_between_probertt;
 		slack /= 100;
 		if ((us_cts - rack->r_ctl.rc_lower_rtt_us_cts) >=
 		    (rack_time_between_probertt - slack))
 			rack_enter_probertt(rack, us_cts);
 	}
 	rack->r_ctl.rc_lower_rtt_us_cts = us_cts;
 }
 
 /*
  * Process the acknowledgment of rsm for RTT purposes.
  *
  * tp, rack  - the connection being processed.
  * rsm       - the send map entry being acked.
  * to        - TCP options of the ack; may be NULL (checked before use).
  * cts       - current time; it is subtracted directly from
  *             r_tim_lastsent[] so it must be in the same units.
  * ack_type  - CUM_ACKED or SACKED.
  * th_ack    - the cumulative ack point, only consulted for CUM_ACKED.
  *
  * An rsm that was sent exactly once yields an RTT sample that is
  * handed to the CC module (if it has an rttsample hook), to the
  * rc_gp_min_rtt filter and to tcp_rack_xmit_timer(). For an rsm that
  * was retransmitted the retransmit timer state is reset and we try to
  * work out which transmission the ack belongs to, first by the
  * timestamp echo (when usable) and otherwise by comparing against the
  * smallest rtt rack has seen.
  *
  * Returns 1 on the paths that accept the ack as matching the last
  * transmission of rsm, and 0 when the rsm is skipped (already acked,
  * no rtt allowed) or the ack is attributed to an earlier or improper
  * transmission.
  */
 static int
 rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack,
     struct rack_sendmap *rsm, struct tcpopt *to, uint32_t cts, int32_t ack_type, tcp_seq th_ack)
 {
 	uint32_t us_rtt;
 	int32_t i, all;
 	uint32_t t, len_acked;
 
 	if ((rsm->r_flags & RACK_ACKED) ||
 	    (rsm->r_flags & RACK_WAS_ACKED))
 		/* Already done */
 		return (0);
 	if (rsm->r_no_rtt_allowed) {
 		/* Not allowed */
 		return (0);
 	}
 	/*
 	 * Work out how much of the rsm this ack covers. Note that
 	 * "all" is only set when the cum-ack goes strictly beyond
 	 * the end of the rsm.
 	 */
 	if (ack_type == CUM_ACKED) {
 		if (SEQ_GT(th_ack, rsm->r_end)) {
 			len_acked = rsm->r_end - rsm->r_start;
 			all = 1;
 		} else {
 			len_acked = th_ack - rsm->r_start;
 			all = 0;
 		}
 	} else {
 		len_acked = rsm->r_end - rsm->r_start;
 		all = 0;
 	}
 	if (rsm->r_rtr_cnt == 1) {
 
 		/* Sent only once, so the measurement is unambiguous */
 		t = cts - (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)];
 		if ((int)t <= 0)
 			t = 1;
 		if (!tp->t_rttlow || tp->t_rttlow > t)
 			tp->t_rttlow = t;
 		if (!rack->r_ctl.rc_rack_min_rtt ||
 		    SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) {
 			rack->r_ctl.rc_rack_min_rtt = t;
 			if (rack->r_ctl.rc_rack_min_rtt == 0) {
 				rack->r_ctl.rc_rack_min_rtt = 1;
 			}
 		}
 		/*
 		 * Prefer the recorded receive time of the ack for the
 		 * microsecond rtt; fall back to the current time when
 		 * that is not later than the send time.
 		 */
 		if (TSTMP_GT(tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time), rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)]))
 			us_rtt = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time) - (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)];
 		else
 			us_rtt = tcp_get_usecs(NULL) - (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)];
 		if (us_rtt == 0)
 			us_rtt = 1;
 		if (CC_ALGO(tp)->rttsample != NULL) {
 			/* Kick the RTT to the CC */
 			CC_ALGO(tp)->rttsample(tp->ccv, us_rtt, 1, rsm->r_fas);
 		}
 		rack_apply_updated_usrtt(rack, us_rtt, tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time));
 		if (ack_type == SACKED) {
 			/* A SACK'd sample goes in with a confidence of 2 */
 			rack_log_rtt_sample_calc(rack, t, (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)], cts, 1);
 			tcp_rack_xmit_timer(rack, t + 1, len_acked, us_rtt, 2 , rsm, rsm->r_rtr_cnt);
 		} else {
 			/*
 			 * We need to setup what our confidence
 			 * is in this ack.
 			 *
 			 * If the rsm was app limited and it is
 			 * less than a mss in length (the end
 			 * of the send) then we have a gap. If we
 			 * were app limited but say we were sending
 			 * multiple MSS's then we are more confident
 			 * in it.
 			 *
 			 * When we are not app-limited then we see if
 			 * the rsm is being included in the current
 			 * measurement, we tell this by the app_limited_needs_set
 			 * flag.
 			 *
 			 * Note that being cwnd blocked is not applimited
 			 * as well as the pacing delay between packets which
 			 * are sending only 1 or 2 MSS's also will show up
 			 * in the RTT. We probably need to examine this algorithm
 			 * a bit more and enhance it to account for the delay
 			 * between rsm's. We could do that by saving off the
 			 * pacing delay of each rsm (in an rsm) and then
 			 * factoring that in somehow though for now I am
 			 * not sure how :)
 			 */
 			int calc_conf = 0;
 
 			if (rsm->r_flags & RACK_APP_LIMITED) {
 				if (all && (len_acked <= ctf_fixed_maxseg(tp)))
 					calc_conf = 0;
 				else
 					calc_conf = 1;
 			} else if (rack->app_limited_needs_set == 0) {
 				calc_conf = 1;
 			} else {
 				calc_conf = 0;
 			}
 			rack_log_rtt_sample_calc(rack, t, (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)], cts, 2);
 			tcp_rack_xmit_timer(rack, t + 1, len_acked, us_rtt,
 					    calc_conf, rsm, rsm->r_rtr_cnt);
 		}
 		if ((rsm->r_flags & RACK_TLP) &&
 		    (!IN_FASTRECOVERY(tp->t_flags))) {
 			/* Segment was a TLP and our retrans matched */
 			if (rack->r_ctl.rc_tlp_cwnd_reduce) {
 				rack_cong_signal(tp, CC_NDUPACK, tp->snd_una, __LINE__);
 			}
 		}
 		if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)])) {
 			/* New more recent rack_tmit_time */
 			rack->r_ctl.rc_rack_tmit_time = (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)];
 			rack->rc_rack_rtt = t;
 		}
 		return (1);
 	}
 	/*
 	 * We clear the soft/rxtshift since we got an ack.
 	 * There is no assurance we will call the commit() function
 	 * so we need to clear these to avoid incorrect handling.
 	 */
 	tp->t_rxtshift = 0;
 	RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp),
 		      rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop);
 	tp->t_softerror = 0;
 	if (to && (to->to_flags & TOF_TS) &&
 	    (ack_type == CUM_ACKED) &&
 	    (to->to_tsecr) &&
 	    ((rsm->r_flags & RACK_OVERMAX) == 0)) {
 		/*
 		 * Now which timestamp does it match? In this block the ACK
 		 * must be coming from a previous transmission.
 		 */
 		for (i = 0; i < rsm->r_rtr_cnt; i++) {
 			if (rack_ts_to_msec(rsm->r_tim_lastsent[i]) == to->to_tsecr) {
 				t = cts - (uint32_t)rsm->r_tim_lastsent[i];
 				if ((int)t <= 0)
 					t = 1;
 				if (CC_ALGO(tp)->rttsample != NULL) {
 					/*
 					 * Kick the RTT to the CC, here
 					 * we lie a bit in that we know the
 					 * retransmission is correct even though
 					 * we retransmitted. This is because
 					 * we match the timestamps.
 					 */
 					if (TSTMP_GT(tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time), rsm->r_tim_lastsent[i]))
 						us_rtt = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time) - (uint32_t)rsm->r_tim_lastsent[i];
 					else
 						us_rtt = tcp_get_usecs(NULL) - (uint32_t)rsm->r_tim_lastsent[i];
 					CC_ALGO(tp)->rttsample(tp->ccv, us_rtt, 1, rsm->r_fas);
 				}
 				if ((i + 1) < rsm->r_rtr_cnt) {
 					/*
 					 * The peer ack'd from our previous
 					 * transmission. We have a spurious
 					 * retransmission and thus we dont
 					 * want to update our rack_rtt.
 					 *
 					 * Hmm should there be a CC revert here?
 					 *
 					 */
 					return (0);
 				}
 				/* The echo matches the last transmission */
 				if (!tp->t_rttlow || tp->t_rttlow > t)
 					tp->t_rttlow = t;
 				if (!rack->r_ctl.rc_rack_min_rtt || SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) {
 					rack->r_ctl.rc_rack_min_rtt = t;
 					if (rack->r_ctl.rc_rack_min_rtt == 0) {
 						rack->r_ctl.rc_rack_min_rtt = 1;
 					}
 				}
 				if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time,
 					   (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)])) {
 					/* New more recent rack_tmit_time */
 					rack->r_ctl.rc_rack_tmit_time = (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)];
 					rack->rc_rack_rtt = t;
 				}
 				rack_log_rtt_sample_calc(rack, t, (uint32_t)rsm->r_tim_lastsent[i], cts, 3);
 				/*
 				 * Note t (not us_rtt) is passed as the
 				 * microsecond rtt here, with a confidence
 				 * of 0.
 				 */
 				tcp_rack_xmit_timer(rack, t + 1, len_acked, t, 0, rsm,
 						    rsm->r_rtr_cnt);
 				return (1);
 			}
 		}
 		/* No transmission matched the echoed timestamp */
 		goto ts_not_found;
 	} else {
 		/*
 		 * Ok its a SACK block that we retransmitted. or a windows
 		 * machine without timestamps. We can tell nothing from the
 		 * time-stamp since its not there or the time the peer last
 		 * received a segment that moved forward its cum-ack point.
 		 */
 ts_not_found:
 		i = rsm->r_rtr_cnt - 1;
 		t = cts - (uint32_t)rsm->r_tim_lastsent[i];
 		if ((int)t <= 0)
 			t = 1;
 		if (rack->r_ctl.rc_rack_min_rtt && SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) {
 			/*
 			 * We retransmitted and the ack came back in less
 			 * than the smallest rtt we have observed. We most
 			 * likely did an improper retransmit as outlined in
 			 * 6.2 Step 2 point 2 in the rack-draft so we
 			 * don't want to update our rack_rtt. We in
 			 * theory (in future) might want to think about reverting our
 			 * cwnd state but we won't for now.
 			 */
 			return (0);
 		} else if (rack->r_ctl.rc_rack_min_rtt) {
 			/*
 			 * We retransmitted it and the retransmit did the
 			 * job.
 			 *
 			 * NOTE(review): rc_rack_min_rtt is non-zero and t is
 			 * not below it when we get here, so the min_rtt
 			 * check just below looks like it can never be
 			 * true -- confirm whether it is still wanted.
 			 */
 			if (!rack->r_ctl.rc_rack_min_rtt ||
 			    SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) {
 				rack->r_ctl.rc_rack_min_rtt = t;
 				if (rack->r_ctl.rc_rack_min_rtt == 0) {
 					rack->r_ctl.rc_rack_min_rtt = 1;
 				}
 			}
 			if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, (uint32_t)rsm->r_tim_lastsent[i])) {
 				/* New more recent rack_tmit_time */
 				rack->r_ctl.rc_rack_tmit_time = (uint32_t)rsm->r_tim_lastsent[i];
 				rack->rc_rack_rtt = t;
 			}
 			return (1);
 		}
 	}
 	/* No rack min rtt has been recorded yet, nothing to compare with */
 	return (0);
 }
 
 /*
  * Walk the tmap backwards starting at rsm and set RACK_SACK_PASSED
  * (clearing RACK_WAS_SACKPASS) on the entries sent before it.
  *
  * rsm itself is skipped, as is anything already acked or anything
  * the peer collapsed the rwnd on. The walk stops at the first entry
  * that already carries RACK_SACK_PASSED.
  */
 static void
 rack_log_sack_passed(struct tcpcb *tp,
     struct tcp_rack *rack, struct rack_sendmap *rsm)
 {
 	struct rack_sendmap *cur;
 
 	cur = rsm;
 	TAILQ_FOREACH_REVERSE_FROM(cur, &rack->r_ctl.rc_tmap,
 	    rack_head, r_tnext) {
 		/*
 		 * Leave alone the original segment (he is acked),
 		 * ack'd segments (though the tmap should not hold
 		 * any) and segments the peer dropped the rwnd on.
 		 */
 		if ((cur == rsm) ||
 		    (cur->r_flags & (RACK_ACKED | RACK_RWND_COLLAPSED)))
 			continue;
 		/*
 		 * One that is already marked means we have been
 		 * here before, so all others below it are marked.
 		 */
 		if (cur->r_flags & RACK_SACK_PASSED)
 			break;
 		cur->r_flags |= RACK_SACK_PASSED;
 		cur->r_flags &= ~RACK_WAS_SACKPASS;
 	}
 }
 
 /*
  * Re-anchor the measurement that is in progress (TF_GPUTINPROG) at
  * the rsm that was just acked.
  *
  * Nothing is done unless TF_GPUTINPROG is set and rsm->r_end is at
  * or beyond tp->gput_seq, or if the rsm was retransmitted
  * (r_rtr_cnt > 1). Otherwise app_limited_needs_set is cleared,
  * tp->gput_ts is taken from rack->r_ctl.act_rcv_time and
  * tp->gput_seq / rc_gp_output_ts are moved according to use_which:
  *
  *   RACK_USE_BEG          - to rsm->r_start (if at or past gput_seq)
  *   RACK_USE_END          - to rsm->r_end
  *   RACK_USE_END_OR_THACK - to the later of th_ack and rsm->r_end
  *
  * tp->gput_ack is pushed out when gput_seq moves past it. If what is
  * left to measure is too small the measurement is either dropped
  * (TF_GPUTINPROG cleared) or its end is moved further out.
  *
  * line is the caller's line number and is only used for logging.
  */
 static void
 rack_need_set_test(struct tcpcb *tp,
 		   struct tcp_rack *rack,
 		   struct rack_sendmap *rsm,
 		   tcp_seq th_ack,
 		   int line,
 		   int use_which)
 {
 
 	if ((tp->t_flags & TF_GPUTINPROG) &&
 	    SEQ_GEQ(rsm->r_end, tp->gput_seq)) {
 		/*
 		 * We were app limited, and this ack
 		 * butts up or goes beyond the point where we want
 		 * to start our next measurement. We need
 		 * to record the new gput_ts as here and
 		 * possibly update the start sequence.
 		 */
 		uint32_t seq, ts;
 
 		if (rsm->r_rtr_cnt > 1) {
 			/*
 			 * This is a retransmit, can we
 			 * really make any assessment at this
 			 * point?  We are not really sure of
 			 * the timestamp, is it this or the
 			 * previous transmission?
 			 *
 			 * Lets wait for something better that
 			 * is not retransmitted.
 			 */
 			return;
 		}
 		/* Save the old start sequence and timestamp for the log below */
 		seq = tp->gput_seq;
 		ts = tp->gput_ts;
 		rack->app_limited_needs_set = 0;
 		tp->gput_ts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time);
 		/* Do we start at a new end? */
 		if ((use_which == RACK_USE_BEG) &&
 		    SEQ_GEQ(rsm->r_start, tp->gput_seq)) {
 			/*
 			 * When we get an ACK that just eats
 			 * up some of the rsm, we set RACK_USE_BEG
 			 * since whats at r_start (i.e. th_ack)
 			 * is left unacked and thats where the
 			 * measurement now starts.
 			 */
 			tp->gput_seq = rsm->r_start;
 			rack->r_ctl.rc_gp_output_ts = rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)];
 		}
 		if ((use_which == RACK_USE_END) &&
 		    SEQ_GEQ(rsm->r_end, tp->gput_seq)) {
 			    /*
 			     * We use the end when the cumack
 			     * is moving forward and completely
 			     * deleting the rsm passed so basically
 			     * r_end holds th_ack.
 			     *
 			     * For SACK's we also want to use the end
 			     * since this piece just got sacked and
 			     * we want to target anything after that
 			     * in our measurement.
 			     */
 			    tp->gput_seq = rsm->r_end;
 			    rack->r_ctl.rc_gp_output_ts = rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)];
 		}
 		if (use_which == RACK_USE_END_OR_THACK) {
 			/*
 			 * special case for ack moving forward,
 			 * not a sack, we need to move all the
 			 * way up to where this ack cum-ack moves
 			 * to.
 			 */
 			if (SEQ_GT(th_ack, rsm->r_end))
 				tp->gput_seq = th_ack;
 			else
 				tp->gput_seq = rsm->r_end;
 			rack->r_ctl.rc_gp_output_ts = rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)];
 		}
 		if (SEQ_GT(tp->gput_seq, tp->gput_ack)) {
 			/*
 			 * We moved beyond this guy's range, re-calculate
 			 * the new end point.
 			 */
 			if (rack->rc_gp_filled == 0) {
 				tp->gput_ack = tp->gput_seq + max(rc_init_window(rack), (MIN_GP_WIN * ctf_fixed_maxseg(tp)));
 			} else {
 				tp->gput_ack = tp->gput_seq + rack_get_measure_window(tp, rack);
 			}
 		}
 		/*
 		 * We are moving the goal post, we may be able to clear the
 		 * measure_saw_probe_rtt flag.
 		 */
 		if ((rack->in_probe_rtt == 0) &&
 		    (rack->measure_saw_probe_rtt) &&
 		    (SEQ_GEQ(tp->gput_seq, rack->r_ctl.rc_probertt_sndmax_atexit)))
 			rack->measure_saw_probe_rtt = 0;
 		rack_log_pacing_delay_calc(rack, ts, tp->gput_ts,
 					   seq, tp->gput_seq, 0, 5, line, NULL, 0);
 		/*
 		 * Is what remains to be measured smaller than the
 		 * minimum window we use above?
 		 */
 		if (rack->rc_gp_filled &&
 		    ((tp->gput_ack - tp->gput_seq) <
 		     max(rc_init_window(rack), (MIN_GP_WIN *
 						ctf_fixed_maxseg(tp))))) {
 			uint32_t ideal_amount;
 
 			ideal_amount = rack_get_measure_window(tp, rack);
 			if (ideal_amount > sbavail(&tp->t_inpcb->inp_socket->so_snd)) {
 				/*
 				 * There is no sense of continuing this measurement
 				 * because its too small to gain us anything we
 				 * trust. Skip it and that way we can start a new
 				 * measurement quicker.
 				 */
 				tp->t_flags &= ~TF_GPUTINPROG;
 				rack_log_pacing_delay_calc(rack, tp->gput_ack, tp->gput_seq,
 							   0, 0, 0, 6, __LINE__, NULL, 0);
 			} else {
 				/*
 				 * Reset the window further out.
 				 */
 				tp->gput_ack = tp->gput_seq + ideal_amount;
 			}
 		}
 	}
 }
 
 /*
  * Does rsm touch the TLP block recorded in last_tlp_acked_start
  * and last_tlp_acked_end?
  *
  * Returns 0 when rsm ends strictly before the recorded start or
  * starts strictly after the recorded end, and 1 otherwise. An rsm
  * that ends exactly at the recorded start, or starts exactly at the
  * recorded end, therefore counts as inside.
  */
 static inline int
 is_rsm_inside_declared_tlp_block(struct tcp_rack *rack, struct rack_sendmap *rsm)
 {
 	if (SEQ_LT(rsm->r_end, rack->r_ctl.last_tlp_acked_start) ||
 	    SEQ_GT(rsm->r_start, rack->r_ctl.last_tlp_acked_end)) {
 		/* Entirely behind or entirely beyond our TLP definition */
 		return (0);
 	}
 	/* It has to be a sub-part of the original TLP recorded */
 	return (1);
 }
 
 
 static uint32_t
 rack_proc_sack_blk(struct tcpcb *tp, struct tcp_rack *rack, struct sackblk *sack,
 		   struct tcpopt *to, struct rack_sendmap **prsm, uint32_t cts, int *moved_two)
 {
 	uint32_t start, end, changed = 0;
 	struct rack_sendmap stack_map;
 	struct rack_sendmap *rsm, *nrsm, fe, *prev, *next;
 #ifdef INVARIANTS
 	struct rack_sendmap *insret;
 #endif
 	int32_t used_ref = 1;
 	int moved = 0;
 
 	start = sack->start;
 	end = sack->end;
 	rsm = *prsm;
 	memset(&fe, 0, sizeof(fe));
 do_rest_ofb:
 	if ((rsm == NULL) ||
 	    (SEQ_LT(end, rsm->r_start)) ||
 	    (SEQ_GEQ(start, rsm->r_end)) ||
 	    (SEQ_LT(start, rsm->r_start))) {
 		/*
 		 * We are not in the right spot,
 		 * find the correct spot in the tree.
 		 */
 		used_ref = 0;
 		fe.r_start = start;
 		rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe);
 		moved++;
 	}
 	if (rsm == NULL) {
 		/* TSNH */
 		goto out;
 	}
 	/* Ok we have an ACK for some piece of this rsm */
 	if (rsm->r_start != start) {
 		if ((rsm->r_flags & RACK_ACKED) == 0) {
 			/*
 			 * Before any splitting or hookery is
 			 * done is it a TLP of interest i.e. rxt?
 			 */
 			if ((rsm->r_flags & RACK_TLP) &&
 			    (rsm->r_rtr_cnt > 1)) {
 				/*
 				 * We are splitting a rxt TLP, check
 				 * if we need to save off the start/end
 				 */
 				if (rack->rc_last_tlp_acked_set &&
 				    (is_rsm_inside_declared_tlp_block(rack, rsm))) {
 					/*
 					 * We already turned this on since we are inside
 					 * the previous one was a partially sack now we
 					 * are getting another one (maybe all of it).
 					 *
 					 */
 					rack_log_dsack_event(rack, 10, __LINE__, rsm->r_start, rsm->r_end);
 					/*
 					 * Lets make sure we have all of it though.
 					 */
 					if (SEQ_LT(rsm->r_start, rack->r_ctl.last_tlp_acked_start)) {
 						rack->r_ctl.last_tlp_acked_start = rsm->r_start;
 						rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start,
 								     rack->r_ctl.last_tlp_acked_end);
 					}
 					if (SEQ_GT(rsm->r_end, rack->r_ctl.last_tlp_acked_end)) {
 						rack->r_ctl.last_tlp_acked_end = rsm->r_end;
 						rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start,
 								     rack->r_ctl.last_tlp_acked_end);
 					}
 				} else {
 					rack->r_ctl.last_tlp_acked_start = rsm->r_start;
 					rack->r_ctl.last_tlp_acked_end = rsm->r_end;
 					rack->rc_last_tlp_past_cumack = 0;
 					rack->rc_last_tlp_acked_set = 1;
 					rack_log_dsack_event(rack, 8, __LINE__, rsm->r_start, rsm->r_end);
 				}
 			}
 			/**
 			 * Need to split this in two pieces the before and after,
 			 * the before remains in the map, the after must be
 			 * added. In other words we have:
 			 * rsm        |--------------|
 			 * sackblk        |------->
 			 * rsm will become
 			 *     rsm    |---|
 			 * and nrsm will be  the sacked piece
 			 *     nrsm       |----------|
 			 *
 			 * But before we start down that path lets
 			 * see if the sack spans over on top of
 			 * the next guy and it is already sacked.
 			 *
 			 */
 			next = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
 			if (next && (next->r_flags & RACK_ACKED) &&
 			    SEQ_GEQ(end, next->r_start)) {
 				/**
 				 * So the next one is already acked, and
 				 * we can thus by hookery use our stack_map
 				 * to reflect the piece being sacked and
 				 * then adjust the two tree entries moving
 				 * the start and ends around. So we start like:
 				 *  rsm     |------------|             (not-acked)
 				 *  next                 |-----------| (acked)
 				 *  sackblk        |-------->
 				 *  We want to end like so:
 				 *  rsm     |------|                   (not-acked)
 				 *  next           |-----------------| (acked)
 				 *  nrsm           |-----|
 				 * Where nrsm is a temporary stack piece we
 				 * use to update all the gizmos.
 				 */
 				/* Copy up our fudge block */
 				nrsm = &stack_map;
 				memcpy(nrsm, rsm, sizeof(struct rack_sendmap));
 				/* Now adjust our tree blocks */
 				rsm->r_end = start;
 				next->r_start = start;
 				/* Now we must adjust back where next->m is */
 				rack_setup_offset_for_rsm(rsm, next);
 
 				/* We don't need to adjust rsm, it did not change */
 				/* Clear out the dup ack count of the remainder */
 				rsm->r_dupack = 0;
 				rsm->r_just_ret = 0;
 				rack_log_retran_reason(rack, rsm, __LINE__, 0, 2);
 				/* Now lets make sure our fudge block is right */
 				nrsm->r_start = start;
 				/* Now lets update all the stats and such */
 				rack_update_rtt(tp, rack, nrsm, to, cts, SACKED, 0);
 				if (rack->app_limited_needs_set)
 					rack_need_set_test(tp, rack, nrsm, tp->snd_una, __LINE__, RACK_USE_END);
 				changed += (nrsm->r_end - nrsm->r_start);
 				rack->r_ctl.rc_sacked += (nrsm->r_end - nrsm->r_start);
 				if (nrsm->r_flags & RACK_SACK_PASSED) {
 					rack->r_ctl.rc_reorder_ts = cts;
 				}
 				/*
 				 * Now we want to go up from rsm (the
 				 * one left un-acked) to the next one
 				 * in the tmap. We do this so when
 				 * we walk backwards we include marking
 				 * sack-passed on rsm (The one passed in
 				 * is skipped since it is generally called
 				 * on something sacked before removing it
 				 * from the tmap).
 				 */
 				if (rsm->r_in_tmap) {
 					nrsm = TAILQ_NEXT(rsm, r_tnext);
 					/*
 					 * Now that we have the next
 					 * one walk backwards from there.
 					 */
 					if (nrsm && nrsm->r_in_tmap)
 						rack_log_sack_passed(tp, rack, nrsm);
 				}
 				/* Now are we done? */
 				if (SEQ_LT(end, next->r_end) ||
 				    (end == next->r_end)) {
 					/* Done with block */
 					goto out;
 				}
 				rack_log_map_chg(tp, rack, &stack_map, rsm, next, MAP_SACK_M1, end, __LINE__);
 				counter_u64_add(rack_sack_used_next_merge, 1);
 				/* Position for the next block */
 				start = next->r_end;
 				rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, next);
 				if (rsm == NULL)
 					goto out;
 			} else {
 				/**
 				 * We can't use any hookery here, so we
 				 * need to split the map. We enter like
 				 * so:
 				 *  rsm      |--------|
 				 *  sackblk       |----->
 				 * We will add the new block nrsm and
 				 * that will be the new portion, and then
 				 * fall through after reseting rsm. So we
 				 * split and look like this:
 				 *  rsm      |----|
 				 *  sackblk       |----->
 				 *  nrsm          |---|
 				 * We then fall through reseting
 				 * rsm to nrsm, so the next block
 				 * picks it up.
 				 */
 				nrsm = rack_alloc_limit(rack, RACK_LIMIT_TYPE_SPLIT);
 				if (nrsm == NULL) {
 					/*
 					 * failed XXXrrs what can we do but loose the sack
 					 * info?
 					 */
 					goto out;
 				}
 				counter_u64_add(rack_sack_splits, 1);
 				rack_clone_rsm(rack, nrsm, rsm, start);
 				rsm->r_just_ret = 0;
 #ifndef INVARIANTS
 				(void)RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm);
 #else
 				insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm);
 				if (insret != NULL) {
 					panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p",
 					      nrsm, insret, rack, rsm);
 				}
 #endif
 				if (rsm->r_in_tmap) {
 					TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
 					nrsm->r_in_tmap = 1;
 				}
 				rack_log_map_chg(tp, rack, NULL, rsm, nrsm, MAP_SACK_M2, end, __LINE__);
 				rsm->r_flags &= (~RACK_HAS_FIN);
 				/* Position us to point to the new nrsm that starts the sack blk */
 				rsm = nrsm;
 			}
 		} else {
 			/* Already sacked this piece */
 			counter_u64_add(rack_sack_skipped_acked, 1);
 			moved++;
 			if (end == rsm->r_end) {
 				/* Done with block */
 				rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
 				goto out;
 			} else if (SEQ_LT(end, rsm->r_end)) {
 				/* A partial sack to a already sacked block */
 				moved++;
 				rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
 				goto out;
 			} else {
 				/*
 				 * The end goes beyond this guy
 				 * reposition the start to the
 				 * next block.
 				 */
 				start = rsm->r_end;
 				rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
 				if (rsm == NULL)
 					goto out;
 			}
 		}
 	}
 	if (SEQ_GEQ(end, rsm->r_end)) {
 		/**
 		 * The end of this block is either beyond this guy or right
 		 * at this guy. I.e.:
 		 *  rsm ---                 |-----|
 		 *  end                     |-----|
 		 *  <or>
 		 *  end                     |---------|
 		 */
 		if ((rsm->r_flags & RACK_ACKED) == 0) {
 			/*
 			 * Is it a TLP of interest?
 			 */
 			if ((rsm->r_flags & RACK_TLP) &&
 			    (rsm->r_rtr_cnt > 1)) {
 				/*
 				 * We are splitting a rxt TLP, check
 				 * if we need to save off the start/end
 				 */
 				if (rack->rc_last_tlp_acked_set &&
 				    (is_rsm_inside_declared_tlp_block(rack, rsm))) {
 					/*
 					 * We already turned this on since we are inside
 					 * the previous one was a partially sack now we
 					 * are getting another one (maybe all of it).
 					 */
 					rack_log_dsack_event(rack, 10, __LINE__, rsm->r_start, rsm->r_end);
 					/*
 					 * Lets make sure we have all of it though.
 					 */
 					if (SEQ_LT(rsm->r_start, rack->r_ctl.last_tlp_acked_start)) {
 						rack->r_ctl.last_tlp_acked_start = rsm->r_start;
 						rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start,
 								     rack->r_ctl.last_tlp_acked_end);
 					}
 					if (SEQ_GT(rsm->r_end, rack->r_ctl.last_tlp_acked_end)) {
 						rack->r_ctl.last_tlp_acked_end = rsm->r_end;
 						rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start,
 								     rack->r_ctl.last_tlp_acked_end);
 					}
 				} else {
 					rack->r_ctl.last_tlp_acked_start = rsm->r_start;
 					rack->r_ctl.last_tlp_acked_end = rsm->r_end;
 					rack->rc_last_tlp_past_cumack = 0;
 					rack->rc_last_tlp_acked_set = 1;
 					rack_log_dsack_event(rack, 8, __LINE__, rsm->r_start, rsm->r_end);
 				}
 			}
 			rack_update_rtt(tp, rack, rsm, to, cts, SACKED, 0);
 			changed += (rsm->r_end - rsm->r_start);
 			rack->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start);
 			if (rsm->r_in_tmap) /* should be true */
 				rack_log_sack_passed(tp, rack, rsm);
 			/* Is Reordering occuring? */
 			if (rsm->r_flags & RACK_SACK_PASSED) {
 				rsm->r_flags &= ~RACK_SACK_PASSED;
 				rack->r_ctl.rc_reorder_ts = cts;
 			}
 			if (rack->app_limited_needs_set)
 				rack_need_set_test(tp, rack, rsm, tp->snd_una, __LINE__, RACK_USE_END);
 			rsm->r_ack_arrival = rack_to_usec_ts(&rack->r_ctl.act_rcv_time);
 			rsm->r_flags |= RACK_ACKED;
 			if (rsm->r_in_tmap) {
 				TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext);
 				rsm->r_in_tmap = 0;
 			}
 			rack_log_map_chg(tp, rack, NULL, rsm, NULL, MAP_SACK_M3, end, __LINE__);
 		} else {
 			counter_u64_add(rack_sack_skipped_acked, 1);
 			moved++;
 		}
 		if (end == rsm->r_end) {
 			/* This block only - done, setup for next */
 			goto out;
 		}
 		/*
 		 * There is more not covered by this rsm; move on
 		 * to the next block in the RB tree.
 		 */
 		nrsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
 		start = rsm->r_end;
 		rsm = nrsm;
 		if (rsm == NULL)
 			goto out;
 		goto do_rest_ofb;
 	}
 	/**
 	 * The end of this sack block is smaller than
 	 * our rsm i.e.:
 	 *  rsm ---                 |-----|
 	 *  end                     |--|
 	 */
 	if ((rsm->r_flags & RACK_ACKED) == 0) {
 		/*
 		 * Is it a TLP of interest?
 		 */
 		if ((rsm->r_flags & RACK_TLP) &&
 		    (rsm->r_rtr_cnt > 1)) {
 			/*
 			 * We are splitting a rxt TLP, check
 			 * if we need to save off the start/end
 			 */
 			if (rack->rc_last_tlp_acked_set &&
 			    (is_rsm_inside_declared_tlp_block(rack, rsm))) {
 				/*
 				 * We already turned this on since we are inside
 				 * the previous one was a partially sack now we
 				 * are getting another one (maybe all of it).
 				 */
 				rack_log_dsack_event(rack, 10, __LINE__, rsm->r_start, rsm->r_end);
 				/*
 				 * Lets make sure we have all of it though.
 				 */
 				if (SEQ_LT(rsm->r_start, rack->r_ctl.last_tlp_acked_start)) {
 					rack->r_ctl.last_tlp_acked_start = rsm->r_start;
 					rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start,
 							     rack->r_ctl.last_tlp_acked_end);
 				}
 				if (SEQ_GT(rsm->r_end, rack->r_ctl.last_tlp_acked_end)) {
 					rack->r_ctl.last_tlp_acked_end = rsm->r_end;
 					rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start,
 							     rack->r_ctl.last_tlp_acked_end);
 				}
 			} else {
 				rack->r_ctl.last_tlp_acked_start = rsm->r_start;
 				rack->r_ctl.last_tlp_acked_end = rsm->r_end;
 				rack->rc_last_tlp_past_cumack = 0;
 				rack->rc_last_tlp_acked_set = 1;
 				rack_log_dsack_event(rack, 8, __LINE__, rsm->r_start, rsm->r_end);
 			}
 		}
 		prev = RB_PREV(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
 		if (prev &&
 		    (prev->r_flags & RACK_ACKED)) {
 			/**
 			 * Goal, we want the right remainder of rsm to shrink
 			 * in place and span from (rsm->r_start = end) to rsm->r_end.
 			 * We want to expand prev to go all the way
 			 * to prev->r_end <- end.
 			 * so in the tree we have before:
 			 *   prev     |--------|         (acked)
 			 *   rsm               |-------| (non-acked)
 			 *   sackblk           |-|
 			 * We churn it so we end up with
 			 *   prev     |----------|       (acked)
 			 *   rsm                 |-----| (non-acked)
 			 *   nrsm              |-| (temporary)
 			 *
 			 * Note if either prev/rsm is a TLP we don't
 			 * do this.
 			 */
 			nrsm = &stack_map;
 			memcpy(nrsm, rsm, sizeof(struct rack_sendmap));
 			prev->r_end = end;
 			rsm->r_start = end;
 			/* Now adjust nrsm (stack copy) to be
 			 * the one that is the small
 			 * piece that was "sacked".
 			 */
 			nrsm->r_end = end;
 			rsm->r_dupack = 0;
 			rack_log_retran_reason(rack, rsm, __LINE__, 0, 2);
 			/*
 			 * Now that the rsm has had its start moved forward
 			 * lets go ahead and get its new place in the world.
 			 */
 			rack_setup_offset_for_rsm(prev, rsm);
 			/*
 			 * Now nrsm is our new little piece
 			 * that is acked (which was merged
 			 * to prev). Update the rtt and changed
 			 * based on that. Also check for reordering.
 			 */
 			rack_update_rtt(tp, rack, nrsm, to, cts, SACKED, 0);
 			if (rack->app_limited_needs_set)
 				rack_need_set_test(tp, rack, nrsm, tp->snd_una, __LINE__, RACK_USE_END);
 			changed += (nrsm->r_end - nrsm->r_start);
 			rack->r_ctl.rc_sacked += (nrsm->r_end - nrsm->r_start);
 			if (nrsm->r_flags & RACK_SACK_PASSED) {
 				rack->r_ctl.rc_reorder_ts = cts;
 			}
 			rack_log_map_chg(tp, rack, prev, &stack_map, rsm, MAP_SACK_M4, end, __LINE__);
 			rsm = prev;
 			counter_u64_add(rack_sack_used_prev_merge, 1);
 		} else {
 			/**
 			 * This is the case where our previous
 			 * block is not acked either, so we must
 			 * split the block in two.
 			 */
 			nrsm = rack_alloc_limit(rack, RACK_LIMIT_TYPE_SPLIT);
 			if (nrsm == NULL) {
 				/* failed rrs what can we do but loose the sack info? */
 				goto out;
 			}
 			if ((rsm->r_flags & RACK_TLP) &&
 			    (rsm->r_rtr_cnt > 1)) {
 				/*
 				 * We are splitting a rxt TLP, check
 				 * if we need to save off the start/end
 				 */
 				if (rack->rc_last_tlp_acked_set &&
 				    (is_rsm_inside_declared_tlp_block(rack, rsm))) {
 					    /*
 					     * We already turned this on since this block is inside
 					     * the previous one was a partially sack now we
 					     * are getting another one (maybe all of it).
 					     */
 					    rack_log_dsack_event(rack, 10, __LINE__, rsm->r_start, rsm->r_end);
 					    /*
 					     * Lets make sure we have all of it though.
 					     */
 					    if (SEQ_LT(rsm->r_start, rack->r_ctl.last_tlp_acked_start)) {
 						    rack->r_ctl.last_tlp_acked_start = rsm->r_start;
 						    rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start,
 									 rack->r_ctl.last_tlp_acked_end);
 					    }
 					    if (SEQ_GT(rsm->r_end, rack->r_ctl.last_tlp_acked_end)) {
 						    rack->r_ctl.last_tlp_acked_end = rsm->r_end;
 						    rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start,
 									 rack->r_ctl.last_tlp_acked_end);
 					    }
 				    } else {
 					    rack->r_ctl.last_tlp_acked_start = rsm->r_start;
 					    rack->r_ctl.last_tlp_acked_end = rsm->r_end;
 					    rack->rc_last_tlp_acked_set = 1;
 					    rack->rc_last_tlp_past_cumack = 0;
 					    rack_log_dsack_event(rack, 8, __LINE__, rsm->r_start, rsm->r_end);
 				    }
 			}
 			/**
 			 * In this case nrsm becomes
 			 * nrsm->r_start = end;
 			 * nrsm->r_end = rsm->r_end;
 			 * which is un-acked.
 			 * <and>
 			 * rsm->r_end = nrsm->r_start;
 			 * i.e. the remaining un-acked
 			 * piece is left on the left
 			 * hand side.
 			 *
 			 * So we start like this
 			 * rsm      |----------| (not acked)
 			 * sackblk  |---|
 			 * build it so we have
 			 * rsm      |---|         (acked)
 			 * nrsm         |------|  (not acked)
 			 */
 			counter_u64_add(rack_sack_splits, 1);
 			rack_clone_rsm(rack, nrsm, rsm, end);
 			rsm->r_flags &= (~RACK_HAS_FIN);
 			rsm->r_just_ret = 0;
 #ifndef INVARIANTS
 			(void)RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm);
 #else
 			insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm);
 			if (insret != NULL) {
 				panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p",
 				      nrsm, insret, rack, rsm);
 			}
 #endif
 			if (rsm->r_in_tmap) {
 				TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
 				nrsm->r_in_tmap = 1;
 			}
 			nrsm->r_dupack = 0;
 			rack_log_retran_reason(rack, nrsm, __LINE__, 0, 2);
 			rack_update_rtt(tp, rack, rsm, to, cts, SACKED, 0);
 			changed += (rsm->r_end - rsm->r_start);
 			rack->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start);
 			if (rsm->r_in_tmap) /* should be true */
 				rack_log_sack_passed(tp, rack, rsm);
 			/* Is Reordering occuring? */
 			if (rsm->r_flags & RACK_SACK_PASSED) {
 				rsm->r_flags &= ~RACK_SACK_PASSED;
 				rack->r_ctl.rc_reorder_ts = cts;
 			}
 			if (rack->app_limited_needs_set)
 				rack_need_set_test(tp, rack, rsm, tp->snd_una, __LINE__, RACK_USE_END);
 			rsm->r_ack_arrival = rack_to_usec_ts(&rack->r_ctl.act_rcv_time);
 			rsm->r_flags |= RACK_ACKED;
 			rack_log_map_chg(tp, rack, NULL, rsm, nrsm, MAP_SACK_M5, end, __LINE__);
 			if (rsm->r_in_tmap) {
 				TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext);
 				rsm->r_in_tmap = 0;
 			}
 		}
 	} else if (start != end){
 		/*
 		 * The block was already acked.
 		 */
 		counter_u64_add(rack_sack_skipped_acked, 1);
 		moved++;
 	}
 out:
 	if (rsm &&
 	    ((rsm->r_flags & RACK_TLP) == 0) &&
 	    (rsm->r_flags & RACK_ACKED)) {
 		/*
 		 * Now can we merge where we worked
 		 * with either the previous or
 		 * next block?
 		 */
 		next = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
 		while (next) {
 			if (next->r_flags & RACK_TLP)
 				break;
 			if (next->r_flags & RACK_ACKED) {
 			/* yep this and next can be merged */
 				rsm = rack_merge_rsm(rack, rsm, next);
 				next = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
 			} else
 				break;
 		}
 		/* Now what about the previous? */
 		prev = RB_PREV(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
 		while (prev) {
 			if (prev->r_flags & RACK_TLP)
 				break;
 			if (prev->r_flags & RACK_ACKED) {
 				/* yep the previous and this can be merged */
 				rsm = rack_merge_rsm(rack, prev, rsm);
 				prev = RB_PREV(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
 			} else
 				break;
 		}
 	}
 	if (used_ref == 0) {
 		counter_u64_add(rack_sack_proc_all, 1);
 	} else {
 		counter_u64_add(rack_sack_proc_short, 1);
 	}
 	/* Save off the next one for quick reference. */
 	if (rsm)
 		nrsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
 	else
 		nrsm = NULL;
 	*prsm = rack->r_ctl.rc_sacklast = nrsm;
 	/* Pass back the moved. */
 	*moved_two = moved;
 	return (changed);
 }
 
 /*
  * Undo SACK markings after the peer has reneged.
  *
  * Starting at rsm, walk forward through the send map for as long as
  * the entries carry RACK_ACKED.  Each such entry has its bytes taken
  * back out of rc_sacked, its ACKED/SACK_PASSED/WAS_SACKPASS flags
  * cleared, and is linked back into the transmit map: the first one at
  * the head, every later one directly after the one before it.  Once the
  * walk is finished the sack filter is cleared against th_ack.
  */
 static inline void
 rack_peer_reneges(struct tcp_rack *rack, struct rack_sendmap *rsm, tcp_seq th_ack)
 {
 	struct rack_sendmap *last_added = NULL;
 
 	for (; (rsm != NULL) && (rsm->r_flags & RACK_ACKED);
 	     rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm)) {
 		/* These bytes no longer count as sacked */
 		rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start);
 #ifdef INVARIANTS
 		if (rsm->r_in_tmap) {
 			panic("rack:%p rsm:%p flags:0x%x in tmap?",
 			      rack, rsm, rsm->r_flags);
 		}
 #endif
 		rsm->r_flags &= ~(RACK_ACKED|RACK_SACK_PASSED|RACK_WAS_SACKPASS);
 		/* Put it back in the tmap, preserving the walk order */
 		if (last_added != NULL) {
 			TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, last_added, rsm, r_tnext);
 		} else {
 			TAILQ_INSERT_HEAD(&rack->r_ctl.rc_tmap, rsm, r_tnext);
 		}
 		rsm->r_in_tmap = 1;
 		last_added = rsm;
 	}
 	/*
 	 * Possibly clear the sack filter so that sacks covering
 	 * this area are recognized again.
 	 */
 	sack_filter_clear(&rack->r_ctl.rack_sf, th_ack);
 }
 
 static void
 rack_do_decay(struct tcp_rack *rack)
 {
 	struct timeval res;
 
 #define	timersub(tvp, uvp, vvp)						\
 	do {								\
 		(vvp)->tv_sec = (tvp)->tv_sec - (uvp)->tv_sec;		\
 		(vvp)->tv_usec = (tvp)->tv_usec - (uvp)->tv_usec;	\
 		if ((vvp)->tv_usec < 0) {				\
 			(vvp)->tv_sec--;				\
 			(vvp)->tv_usec += 1000000;			\
 		}							\
 	} while (0)
 
 	timersub(&rack->r_ctl.act_rcv_time, &rack->r_ctl.rc_last_time_decay, &res);
 #undef timersub
 
 	rack->r_ctl.input_pkt++;
 	if ((rack->rc_in_persist) ||
 	    (res.tv_sec >= 1) ||
 	    (rack->rc_tp->snd_max == rack->rc_tp->snd_una)) {
 		/*
 		 * Check for decay of non-SAD,
 		 * we want all SAD detection metrics to
 		 * decay 1/4 per second (or more) passed.
 		 */
 #ifdef NETFLIX_EXP_DETECTION
 		uint32_t pkt_delta;
 
 		pkt_delta = rack->r_ctl.input_pkt - rack->r_ctl.saved_input_pkt;
 #endif
 		/* Update our saved tracking values */
 		rack->r_ctl.saved_input_pkt = rack->r_ctl.input_pkt;
 		rack->r_ctl.rc_last_time_decay = rack->r_ctl.act_rcv_time;
 		/* Now do we escape without decay? */
 #ifdef NETFLIX_EXP_DETECTION
 		if (rack->rc_in_persist ||
 		    (rack->rc_tp->snd_max == rack->rc_tp->snd_una) ||
 		    (pkt_delta < tcp_sad_low_pps)){
 			/*
 			 * We don't decay idle connections
 			 * or ones that have a low input pps.
 			 */
 			return;
 		}
 		/* Decay the counters */
 		rack->r_ctl.ack_count = ctf_decay_count(rack->r_ctl.ack_count,
 							tcp_sad_decay_val);
 		rack->r_ctl.sack_count = ctf_decay_count(rack->r_ctl.sack_count,
 							 tcp_sad_decay_val);
 		rack->r_ctl.sack_moved_extra = ctf_decay_count(rack->r_ctl.sack_moved_extra,
 							       tcp_sad_decay_val);
 		rack->r_ctl.sack_noextra_move = ctf_decay_count(rack->r_ctl.sack_noextra_move,
 								tcp_sad_decay_val);
 #endif
 	}
 }
 
 /*
  * Process a cumulative ack that moves the ack point up to th_ack.
  *
  * The function marks that output is wanted, ages the "last acked TLP"
  * and "last sent TLP" tracking state against th_ack, and then works on
  * the lowest entry of the send map (RB_MIN of rc_mtree):
  *
  *  - If th_ack covers the whole entry, the entry is removed from the
  *    tree and from the tmap, the accounting (rc_holes_rxt, rc_sacked,
  *    reordering timestamp, possible RTO-error signal) is updated and the
  *    entry is freed.  If th_ack reaches beyond the entry's end the
  *    function loops (label "more") onto the next lowest entry.  When
  *    th_ack lands exactly on the entry's end, the new lowest entry is
  *    checked for reneging.
  *  - Otherwise the entry is only partially covered: its start (and
  *    its mbuf/offset, when it has an mbuf) is trimmed forward to th_ack.
  *
  * tp:     the tcpcb of the connection.
  * rack:   the rack state of the connection.
  * th_ack: the cumulative ack sequence being processed.
  * cts:    timestamp stored into rc_reorder_ts and handed to
  *         rack_update_rtt().
  * to:     the tcp options of the ack (its timestamp echo is examined
  *         for the RACK_TO_REXT check).
  */
 static void
 rack_process_to_cumack(struct tcpcb *tp, struct tcp_rack *rack, register uint32_t th_ack, uint32_t cts, struct tcpopt *to)
 {
 	struct rack_sendmap *rsm;
 #ifdef INVARIANTS
 	struct rack_sendmap *rm;
 #endif
 
 	/*
 	 * The ACK point is advancing to th_ack, we must drop off
 	 * the packets in the rack log and calculate any eligible
 	 * RTT's.
 	 */
 	rack->r_wanted_output = 1;
 
 	/* Tend any TLP that has been marked for 1/2 the seq space (its old)  */
 	if ((rack->rc_last_tlp_acked_set == 1)&&
 	    (rack->rc_last_tlp_past_cumack == 1) &&
 	    (SEQ_GT(rack->r_ctl.last_tlp_acked_start, th_ack))) {
 		/*
 		 * We have reached the point where our last rack
 		 * tlp retransmit sequence is ahead of the cum-ack.
 		 * This can only happen when the cum-ack moves all
 		 * the way around (its been a full 2^^31+1 bytes
 		 * or more since we sent a retransmitted TLP). Lets
 		 * turn off the valid flag since its not really valid.
 		 *
 		 * Note since sack's also turn on this event we have
 		 * a complication, we have to wait to age it out until
 		 * the cum-ack is by the TLP before checking which is
 		 * what the next else clause does.
 		 */
 		rack_log_dsack_event(rack, 9, __LINE__,
 				     rack->r_ctl.last_tlp_acked_start,
 				     rack->r_ctl.last_tlp_acked_end);
 		rack->rc_last_tlp_acked_set = 0;
 		rack->rc_last_tlp_past_cumack = 0;
 	} else if ((rack->rc_last_tlp_acked_set == 1) &&
 		   (rack->rc_last_tlp_past_cumack == 0) &&
 		   (SEQ_GEQ(th_ack, rack->r_ctl.last_tlp_acked_end))) {
 		/*
 		 * It is safe to start aging TLP's out.
 		 */
 		rack->rc_last_tlp_past_cumack = 1;
 	}
 	/* We do the same for the tlp send seq as well */
 	if ((rack->rc_last_sent_tlp_seq_valid == 1) &&
 	    (rack->rc_last_sent_tlp_past_cumack == 1) &&
 	    (SEQ_GT(rack->r_ctl.last_sent_tlp_seq,  th_ack))) {
 		rack_log_dsack_event(rack, 9, __LINE__,
 				     rack->r_ctl.last_sent_tlp_seq,
 				     (rack->r_ctl.last_sent_tlp_seq +
 				      rack->r_ctl.last_sent_tlp_len));
 		rack->rc_last_sent_tlp_seq_valid = 0;
 		rack->rc_last_sent_tlp_past_cumack = 0;
 	} else if ((rack->rc_last_sent_tlp_seq_valid == 1) &&
 		   (rack->rc_last_sent_tlp_past_cumack == 0) &&
 		   (SEQ_GEQ(th_ack, rack->r_ctl.last_sent_tlp_seq))) {
 		/*
 		 * It is safe to start aging TLP's send.
 		 */
 		rack->rc_last_sent_tlp_past_cumack = 1;
 	}
 more:
 	/* Always work on the lowest entry currently in the send map */
 	rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree);
 	if (rsm == NULL) {
 		if ((th_ack - 1) == tp->iss) {
 			/*
 			 * For the SYN incoming case we will not
 			 * have called tcp_output for the sending of
 			 * the SYN, so there will be no map. All
 			 * other cases should probably be a panic.
 			 */
 			return;
 		}
 		if (tp->t_flags & TF_SENTFIN) {
 			/* if we sent a FIN we often will not have map */
 			return;
 		}
 #ifdef INVARIANTS
 		panic("No rack map tp:%p for state:%d ack:%u rack:%p snd_una:%u snd_max:%u snd_nxt:%u\n",
 		      tp,
 		      tp->t_state, th_ack, rack,
 		      tp->snd_una, tp->snd_max, tp->snd_nxt);
 #endif
 		return;
 	}
 	if (SEQ_LT(th_ack, rsm->r_start)) {
 		/* Huh map is missing this */
 #ifdef INVARIANTS
 		printf("Rack map starts at r_start:%u for th_ack:%u huh? ts:%d rs:%d\n",
 		       rsm->r_start,
 		       th_ack, tp->t_state, rack->r_state);
 #endif
 		return;
 	}
 	rack_update_rtt(tp, rack, rsm, to, cts, CUM_ACKED, th_ack);
 
 	/* Now was it a retransmitted TLP? */
 	if ((rsm->r_flags & RACK_TLP) &&
 	    (rsm->r_rtr_cnt > 1)) {
 		/*
 		 * Yes, this rsm was a TLP and retransmitted, remember that
 		 * since if a DSACK comes back on this we don't want
 		 * to think of it as a reordered segment. This may
 		 * get updated again with possibly even other TLPs
 		 * in flight, but thats ok. Only when we don't send
 		 * a retransmitted TLP for 1/2 the sequences space
 		 * will it get turned off (above).
 		 */
 		if (rack->rc_last_tlp_acked_set &&
 		    (is_rsm_inside_declared_tlp_block(rack, rsm))) {
 			/*
 			 * We already turned this on since the end matches,
 			 * the previous one was a partially ack now we
 			 * are getting another one (maybe all of it).
 			 */
 			rack_log_dsack_event(rack, 10, __LINE__, rsm->r_start, rsm->r_end);
 			/*
 			 * Lets make sure we have all of it though.
 			 */
 			if (SEQ_LT(rsm->r_start, rack->r_ctl.last_tlp_acked_start)) {
 				rack->r_ctl.last_tlp_acked_start = rsm->r_start;
 				rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start,
 						     rack->r_ctl.last_tlp_acked_end);
 			}
 			if (SEQ_GT(rsm->r_end, rack->r_ctl.last_tlp_acked_end)) {
 				rack->r_ctl.last_tlp_acked_end = rsm->r_end;
 				rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start,
 						     rack->r_ctl.last_tlp_acked_end);
 			}
 		} else {
 			/* Start a fresh declared TLP block from this rsm */
 			rack->rc_last_tlp_past_cumack = 1;
 			rack->r_ctl.last_tlp_acked_start = rsm->r_start;
 			rack->r_ctl.last_tlp_acked_end = rsm->r_end;
 			rack->rc_last_tlp_acked_set = 1;
 			rack_log_dsack_event(rack, 8, __LINE__, rsm->r_start, rsm->r_end);
 		}
 	}
 	/* Now do we consume the whole thing? */
 	if (SEQ_GEQ(th_ack, rsm->r_end)) {
 		/* Its all consumed. */
 		uint32_t left;
 		uint8_t newly_acked;
 
 		rack_log_map_chg(tp, rack, NULL, rsm, NULL, MAP_FREE, rsm->r_end, __LINE__);
 		rack->r_ctl.rc_holes_rxt -= rsm->r_rtr_bytes;
 		rsm->r_rtr_bytes = 0;
 		/* Record the time of highest cumack sent */
 		rack->r_ctl.rc_gp_cumack_ts = rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)];
 #ifndef INVARIANTS
 		(void)RB_REMOVE(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
 #else
 		rm = RB_REMOVE(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
 		if (rm != rsm) {
 			panic("removing head in rack:%p rsm:%p rm:%p",
 			      rack, rsm, rm);
 		}
 #endif
 		if (rsm->r_in_tmap) {
 			TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext);
 			rsm->r_in_tmap = 0;
 		}
 		/* Assume this is new information unless it was already sacked */
 		newly_acked = 1;
 		if (rsm->r_flags & RACK_ACKED) {
 			/*
 			 * It was acked on the scoreboard -- remove
 			 * it from total
 			 */
 			rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start);
 			newly_acked = 0;
 		} else if (rsm->r_flags & RACK_SACK_PASSED) {
 			/*
 			 * There are segments ACKED on the
 			 * scoreboard further up. We are seeing
 			 * reordering.
 			 */
 			rsm->r_flags &= ~RACK_SACK_PASSED;
 			rsm->r_ack_arrival = rack_to_usec_ts(&rack->r_ctl.act_rcv_time);
 			rsm->r_flags |= RACK_ACKED;
 			rack->r_ctl.rc_reorder_ts = cts;
 			if (rack->r_ent_rec_ns) {
 				/*
 				 * We have sent no more, and we saw an sack
 				 * then ack arrive.
 				 */
 				rack->r_might_revert = 1;
 			}
 		}
 		if ((rsm->r_flags & RACK_TO_REXT) &&
 		    (tp->t_flags & TF_RCVD_TSTMP) &&
 		    (to->to_flags & TOF_TS) &&
 		    (to->to_tsecr != 0) &&
 		    (tp->t_flags & TF_PREVVALID)) {
 			/*
 			 * We can use the timestamp to see
 			 * if this retransmission was from the
 			 * first transmit. If so we made a mistake.
 			 */
 			tp->t_flags &= ~TF_PREVVALID;
 			if (to->to_tsecr == rack_ts_to_msec(rsm->r_tim_lastsent[0])) {
 				/* The first transmit is what this ack is for */
 				rack_cong_signal(tp, CC_RTO_ERR, th_ack, __LINE__);
 			}
 		}
 		/*
 		 * How far th_ack reaches beyond this entry; computed
 		 * before the entry is freed below.
 		 */
 		left = th_ack - rsm->r_end;
 		if (rack->app_limited_needs_set && newly_acked)
 			rack_need_set_test(tp, rack, rsm, th_ack, __LINE__, RACK_USE_END_OR_THACK);
 		/* Free back to zone */
 		rack_free(rack, rsm);
 		if (left) {
 			/* The ack covers more, go process the new lowest entry */
 			goto more;
 		}
 		/* Check for reneging */
 		rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree);
 		if (rsm && (rsm->r_flags & RACK_ACKED) && (th_ack == rsm->r_start)) {
 			/*
 			 * The peer has moved snd_una up to
 			 * the edge of this send, i.e. one
 			 * that it had previously acked. The only
 			 * way that can be true if the peer threw
 			 * away data (space issues) that it had
 			 * previously sacked (else it would have
 			 * given us snd_una up to (rsm->r_end).
 			 * We need to undo the acked markings here.
 			 *
 			 * Note we have to look to make sure th_ack is
 			 * our rsm->r_start in case we get an old ack
 			 * where th_ack is behind snd_una.
 			 */
 			rack_peer_reneges(rack, rsm, th_ack);
 		}
 		return;
 	}
 	/* From here on the entry is only partially covered by th_ack */
 	if (rsm->r_flags & RACK_ACKED) {
 		/*
 		 * It was acked on the scoreboard -- remove it from
 		 * total for the part being cum-acked.
 		 */
 		rack->r_ctl.rc_sacked -= (th_ack - rsm->r_start);
 	}
 	/*
 	 * Clear the dup ack count for
 	 * the piece that remains.
 	 */
 	rsm->r_dupack = 0;
 	rack_log_retran_reason(rack, rsm, __LINE__, 0, 2);
 	if (rsm->r_rtr_bytes) {
 		/*
 		 * It was retransmitted adjust the
 		 * sack holes for what was acked.
 		 *
 		 * NOTE(review): the adjustment is only made when the
 		 * acked amount is >= r_rtr_bytes, and then the full
 		 * acked amount is subtracted from both counters. If
 		 * ack_am is strictly larger than r_rtr_bytes that
 		 * subtraction goes below zero for r_rtr_bytes --
 		 * confirm whether this comparison/amount is intended.
 		 */
 		int ack_am;
 
 		ack_am = (th_ack - rsm->r_start);
 		if (ack_am >= rsm->r_rtr_bytes) {
 			rack->r_ctl.rc_holes_rxt -= ack_am;
 			rsm->r_rtr_bytes -= ack_am;
 		}
 	}
 	/*
 	 * Update where the piece starts and record
 	 * the time of send of highest cumack sent.
 	 */
 	rack->r_ctl.rc_gp_cumack_ts = rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)];
 	rack_log_map_chg(tp, rack, NULL, rsm, NULL, MAP_TRIM_HEAD, th_ack, __LINE__);
 	/* Now we need to move our offset forward too */
 	if (rsm->m && (rsm->orig_m_len != rsm->m->m_len)) {
 		/* Fix up the orig_m_len and possibly the mbuf offset */
 		rack_adjust_orig_mlen(rsm);
 	}
 	/* Advance the offset by the number of bytes trimmed off the front */
 	rsm->soff += (th_ack - rsm->r_start);
 	rsm->r_start = th_ack;
 	/* Now do we need to move the mbuf fwd too? */
 	if (rsm->m) {
 		/*
 		 * Step along the mbuf chain until soff lands inside
 		 * the current mbuf.
 		 *
 		 * NOTE(review): the NULL check on the next mbuf is only
 		 * a KASSERT; presumably that is compiled out of
 		 * non-INVARIANTS kernels, in which case running off the
 		 * chain would dereference NULL in the loop condition --
 		 * confirm.
 		 */
 		while (rsm->soff >= rsm->m->m_len) {
 			rsm->soff -= rsm->m->m_len;
 			rsm->m = rsm->m->m_next;
 			KASSERT((rsm->m != NULL),
 				(" nrsm:%p hit at soff:%u null m",
 				 rsm, rsm->soff));
 		}
 		rsm->orig_m_len = rsm->m->m_len;
 	}
 	if (rack->app_limited_needs_set)
 		rack_need_set_test(tp, rack, rsm, tp->snd_una, __LINE__, RACK_USE_BEG);
 }
 
 static void
 rack_handle_might_revert(struct tcpcb *tp, struct tcp_rack *rack)
 {
 	struct rack_sendmap *rsm;
 	int sack_pass_fnd = 0;
 
 	if (rack->r_might_revert) {
 		/*
 		 * Ok we have reordering, have not sent anything, we
 		 * might want to revert the congestion state if nothing
 		 * further has SACK_PASSED on it. Lets check.
 		 *
 		 * We also get here when we have DSACKs come in for
 		 * all the data that we FR'd. Note that a rxt or tlp
 		 * timer clears this from happening.
 		 */
 
 		TAILQ_FOREACH(rsm, &rack->r_ctl.rc_tmap, r_tnext) {
 			if (rsm->r_flags & RACK_SACK_PASSED) {
 				sack_pass_fnd = 1;
 				break;
 			}
 		}
 		if (sack_pass_fnd == 0) {
 			/*
 			 * We went into recovery
 			 * incorrectly due to reordering!
 			 */
 			int orig_cwnd;
 
 			rack->r_ent_rec_ns = 0;
 			orig_cwnd = tp->snd_cwnd;
 			tp->snd_ssthresh = rack->r_ctl.rc_ssthresh_at_erec;
 			tp->snd_recover = tp->snd_una;
 			rack_log_to_prr(rack, 14, orig_cwnd, __LINE__);
 			EXIT_RECOVERY(tp->t_flags);
 		}
 		rack->r_might_revert = 0;
 	}
 }
 
 #ifdef NETFLIX_EXP_DETECTION
 /*
  * Sack-attack detection.
  *
  * Runs only when detection is enabled (per connection or forced), both
  * thresholds are non-zero, and either more than tcp_map_minimum maps
  * are allocated or sack processing is already disabled.  Two ratios,
  * both scaled by 1000, are computed: sacks per ack and "extra" moves
  * per total moves.  While sack is enabled, the highest ratios seen are
  * recorded and sack processing is disabled (with cwnd clamped to the
  * flight size) once both ratios exceed their thresholds.  While sack is
  * disabled, it is re-enabled, the counters restarted and the saved
  * cwnd restored once the ack ratio drops to tcp_restoral_thresh or the
  * map count falls below tcp_map_minimum.
  */
 static void
 rack_do_detection(struct tcpcb *tp, struct tcp_rack *rack,  uint32_t bytes_this_ack, uint32_t segsiz)
 {
 	uint64_t ackratio, moveratio, movetotal;
 
 	if ((rack->do_detection == 0) && (tcp_force_detection == 0))
 		return;
 	if ((tcp_sack_to_ack_thresh == 0) || (tcp_sack_to_move_thresh == 0))
 		return;
 	if ((rack->r_ctl.rc_num_maps_alloced <= tcp_map_minimum) &&
 	    (rack->sack_attack_disable == 0))
 		return;
 
 	/*
 	 * We have thresholds set to find possible
 	 * attackers and disable sack.  Log that we are detecting.
 	 */
 	rack_log_sad(rack, 1);
 
 	/* Sacks per ack, times 1000 */
 	if (rack->r_ctl.ack_count) {
 		ackratio = (uint64_t)(rack->r_ctl.sack_count);
 		ackratio *= (uint64_t)(1000);
 		ackratio /= (uint64_t)(rack->r_ctl.ack_count);
 	} else {
 		/* We really should not hit here */
 		ackratio = 1000;
 	}
 	/* Extra moves per total moves, times 1000 */
 	movetotal = rack->r_ctl.sack_moved_extra;
 	movetotal += rack->r_ctl.sack_noextra_move;
 	if (movetotal) {
 		moveratio = rack->r_ctl.sack_moved_extra;
 		moveratio *= (uint64_t)1000;
 		moveratio /= movetotal;
 	} else {
 		/* No moves, thats pretty good */
 		moveratio = 0;
 	}
 
 	if (rack->sack_attack_disable == 0) {
 		/* Sack is enabled: track the peaks, then look for an attack */
 		if (ackratio > rack_highest_sack_thresh_seen)
 			rack_highest_sack_thresh_seen = (uint32_t)ackratio;
 		if (moveratio > rack_highest_move_thresh_seen)
 			rack_highest_move_thresh_seen = (uint32_t)moveratio;
 		if ((ackratio > tcp_sack_to_ack_thresh) &&
 		    (moveratio > tcp_sack_to_move_thresh)) {
 			/* Disable sack processing */
 			rack->sack_attack_disable = 1;
 			if (rack->r_rep_attack == 0) {
 				rack->r_rep_attack = 1;
 				counter_u64_add(rack_sack_attacks_detected, 1);
 			}
 			if (tcp_attack_on_turns_on_logging) {
 				/*
 				 * Turn on logging, used for debugging
 				 * false positives.
 				 */
 				rack->rc_tp->t_logstate = tcp_attack_on_turns_on_logging;
 			}
 			/* Clamp the cwnd at flight size */
 			rack->r_ctl.rc_saved_cwnd = rack->rc_tp->snd_cwnd;
 			rack->rc_tp->snd_cwnd = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
 			rack_log_sad(rack, 2);
 		}
 		return;
 	}
 
 	/* We are sack-disabled check for false positives */
 	if ((ackratio > tcp_restoral_thresh) &&
 	    (rack->r_ctl.rc_num_maps_alloced  >= tcp_map_minimum))
 		return;
 
 	rack->sack_attack_disable = 0;
 	rack_log_sad(rack, 3);
 	/* Restart counting */
 	rack->r_ctl.sack_count = 0;
 	rack->r_ctl.sack_moved_extra = 0;
 	rack->r_ctl.sack_noextra_move = 1;
 	rack->r_ctl.ack_count = max(1,
 	      (bytes_this_ack / segsiz));
 
 	if (rack->r_rep_reverse == 0) {
 		rack->r_rep_reverse = 1;
 		counter_u64_add(rack_sack_attacks_reversed, 1);
 	}
 	/* Restore the cwnd */
 	if (rack->r_ctl.rc_saved_cwnd > rack->rc_tp->snd_cwnd)
 		rack->rc_tp->snd_cwnd = rack->r_ctl.rc_saved_cwnd;
 }
 #endif
 
 /*
  * Account for a DSACK block covering [start, end).
  *
  * A DSACK that falls inside the last acked TLP block, or inside the
  * last sent TLP, is attributed to that TLP and does not open a DSACK
  * round.  Any other DSACK opens a round if one is not already in
  * progress.  In every case the DSACKed byte count is accumulated, and
  * once we are out of fast recovery and that count reaches the bytes
  * retransmitted during recovery, a congestion-state revert is
  * attempted.
  *
  * Returns 1 when the DSACK was attributed to a TLP, 0 otherwise.
  */
 static int
 rack_note_dsack(struct tcp_rack *rack, tcp_seq start, tcp_seq end)
 {
 	uint32_t dsack_len, sent_tlp_end;
 	int from_tlp = 0;
 
 	dsack_len = SEQ_GT(end, start) ? (end - start) : 0;
 
 	if ((rack->rc_last_tlp_acked_set ) &&
 	    (SEQ_GEQ(start, rack->r_ctl.last_tlp_acked_start)) &&
 	    (SEQ_LEQ(end, rack->r_ctl.last_tlp_acked_end))) {
 		/*
 		 * The DSACK is because of a TLP which we don't
 		 * do anything with the reordering window over since
 		 * it was not reordering that caused the DSACK but
 		 * our previous retransmit TLP.
 		 */
 		rack_log_dsack_event(rack, 7, __LINE__, start, end);
 		from_tlp = 1;
 	} else if (rack->rc_last_sent_tlp_seq_valid) {
 		sent_tlp_end = rack->r_ctl.last_sent_tlp_seq + rack->r_ctl.last_sent_tlp_len;
 		if (SEQ_GEQ(start, rack->r_ctl.last_sent_tlp_seq) &&
 		    (SEQ_LEQ(end, sent_tlp_end))) {
 			/*
 			 * This dsack is from the last sent TLP, ignore it
 			 * for reordering purposes.
 			 */
 			rack_log_dsack_event(rack, 7, __LINE__, start, end);
 			from_tlp = 1;
 		}
 	}
 	if ((from_tlp == 0) && (rack->rc_dsack_round_seen == 0)) {
 		/* Not TLP induced and no round in progress: open one */
 		rack->rc_dsack_round_seen = 1;
 		rack->r_ctl.dsack_round_end = rack->rc_tp->snd_max;
 		rack->r_ctl.num_dsack++;
 		rack->r_ctl.dsack_persist = 16;	/* 16 is from the standard */
 		rack_log_dsack_event(rack, 2, __LINE__, 0, 0);
 	}
 	/*
 	 * We keep track of how many DSACK blocks we get
 	 * after a recovery incident.
 	 */
 	rack->r_ctl.dsack_byte_cnt += dsack_len;
 	if (!IN_FASTRECOVERY(rack->rc_tp->t_flags) &&
 	    rack->r_ctl.retran_during_recovery &&
 	    (rack->r_ctl.dsack_byte_cnt >= rack->r_ctl.retran_during_recovery)) {
 		/*
 		 * False recovery most likely culprit is reordering. If
 		 * nothing else is missing we need to revert.
 		 */
 		rack->r_might_revert = 1;
 		rack_handle_might_revert(rack->rc_tp, rack);
 		rack->r_might_revert = 0;
 		rack->r_ctl.retran_during_recovery = 0;
 		rack->r_ctl.dsack_byte_cnt = 0;
 	}
 	return (from_tlp);
 }
 
 /*
  * Compute the "pipe" relative to the supplied snd_una: the bytes
  * between snd_una and snd_max, less the bytes recorded as SACKed,
  * plus the bytes recorded in rc_holes_rxt.
  */
 static uint32_t
 do_rack_compute_pipe(struct tcpcb *tp, struct tcp_rack *rack, uint32_t snd_una)
 {
 	uint32_t outstanding, not_sacked;
 
 	outstanding = tp->snd_max - snd_una;
 	not_sacked = outstanding - rack->r_ctl.rc_sacked;
 	return (not_sacked + rack->r_ctl.rc_holes_rxt);
 }
 
 /*
  * Convenience wrapper: compute the pipe for a connection using its
  * own rack state (tp->t_fb_ptr) and its current snd_una.
  */
 static int32_t
 rack_compute_pipe(struct tcpcb *tp)
 {
 	struct tcp_rack *rack;
 	uint32_t pipe;
 
 	rack = (struct tcp_rack *)tp->t_fb_ptr;
 	pipe = do_rack_compute_pipe(tp, rack, tp->snd_una);
 	return ((int32_t)pipe);
 }
 
 /*
  * Update the PRR (proportional rate reduction) send allowance,
  * rc_prr_sndcnt, after an ack has been processed.  The caller in this
  * file only invokes it while in fast recovery.
  *
  * tp      - the connection.
  * rack    - rack state for the connection.
  * changed - bytes newly delivered by this ack; added to
  *           rc_prr_delivered.
  * th_ack  - cumulative ack carried by the segment; the larger of
  *           th_ack and tp->snd_una is used as the left edge when the
  *           pipe is computed.
  */
 static void
 rack_update_prr(struct tcpcb *tp, struct tcp_rack *rack, uint32_t changed, tcp_seq th_ack)
 {
 	/* Deal with changed and PRR here (in recovery only) */
 	uint32_t pipe, snd_una;
 
 	rack->r_ctl.rc_prr_delivered += changed;
 
 	if (sbavail(&rack->rc_inp->inp_socket->so_snd) <= (tp->snd_max - tp->snd_una)) {
 		/*
 		 * It is all outstanding, we are application limited
 		 * and thus we don't need more room to send anything.
 		 * Note we use tp->snd_una here and not th_ack because
 		 * the data has not yet been cut from the sb.
 		 */
 		rack->r_ctl.rc_prr_sndcnt = 0;
 		return;
 	}
 	/* Compute prr_sndcnt */
 	if (SEQ_GT(tp->snd_una, th_ack)) {
 		snd_una = tp->snd_una;
 	} else {
 		snd_una = th_ack;
 	}
 	pipe = do_rack_compute_pipe(tp, rack, snd_una);
 	if (pipe > tp->snd_ssthresh) {
 		long sndcnt;
 
 		if (rack->r_ctl.rc_prr_recovery_fs > 0) {
 			uint64_t scaled;
 
 			/*
 			 * sndcnt = delivered * ssthresh / recovery_fs.
 			 * Do the multiply in 64 bits: the operands
 			 * appear to be 32-bit quantities, and their
 			 * product would wrap before being widened if
 			 * multiplied at their own width.
 			 */
 			scaled = (uint64_t)rack->r_ctl.rc_prr_delivered *
 			    (uint64_t)tp->snd_ssthresh;
 			scaled /= (uint64_t)rack->r_ctl.rc_prr_recovery_fs;
 			sndcnt = (long)scaled;
 		} else {
 			/* No recorded flight size to scale by. */
 			rack->r_ctl.rc_prr_sndcnt = 0;
 			rack_log_to_prr(rack, 9, 0, __LINE__);
 			sndcnt = 0;
 		}
 		sndcnt++;
 		/* Subtract what has already been sent, flooring at zero. */
 		if (sndcnt > (long)rack->r_ctl.rc_prr_out)
 			sndcnt -= rack->r_ctl.rc_prr_out;
 		else
 			sndcnt = 0;
 		rack->r_ctl.rc_prr_sndcnt = sndcnt;
 		rack_log_to_prr(rack, 10, 0, __LINE__);
 	} else {
 		uint32_t limit;
 
 		/*
 		 * The pipe is at or below ssthresh: allow the larger of
 		 * (delivered - out) and the newly delivered bytes, plus
 		 * one segment, but never more than ssthresh - pipe.
 		 */
 		if (rack->r_ctl.rc_prr_delivered > rack->r_ctl.rc_prr_out)
 			limit = (rack->r_ctl.rc_prr_delivered - rack->r_ctl.rc_prr_out);
 		else
 			limit = 0;
 		if (changed > limit)
 			limit = changed;
 		limit += ctf_fixed_maxseg(tp);
 		if (tp->snd_ssthresh > pipe) {
 			rack->r_ctl.rc_prr_sndcnt = min((tp->snd_ssthresh - pipe), limit);
 			rack_log_to_prr(rack, 11, 0, __LINE__);
 		} else {
 			/*
 			 * pipe == ssthresh here.
 			 * NOTE(review): min(0, limit) presumably always
 			 * yields 0 (limit is unsigned) - confirm that
 			 * "no allowance" is the intended result.
 			 */
 			rack->r_ctl.rc_prr_sndcnt = min(0, limit);
 			rack_log_to_prr(rack, 12, 0, __LINE__);
 		}
 	}
 }
 
 /*
  * Process the acknowledgment information carried by an inbound
  * segment: the cumulative ack plus any SACK / D-SACK blocks.  Updates
  * the ack/sack accounting counters, feeds valid SACK blocks to
  * rack_proc_sack_blk(), notes D-SACKs, and finally decides whether to
  * enter recovery or update PRR.
  *
  * tp               - the connection (inp write lock asserted).
  * to               - parsed TCP options; to_flags, to_nsacks and
  *                    to_sacks are read.
  * th               - TCP header; th_ack and the flags are read.
  *                    Segments with TH_RST set are ignored.
  * entered_recovery - non-zero when the caller saw fast recovery being
  *                    entered while handling this ack; suppresses the
  *                    PRR update at the end.  It is also set locally
  *                    when this function enters recovery itself.
  * dup_ack_struck   - non-zero when the caller counted this segment as
  *                    a duplicate ack.
  */
 static void
 rack_log_ack(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th, int entered_recovery, int dup_ack_struck)
 {
 	uint32_t changed;
 	struct tcp_rack *rack;
 	struct rack_sendmap *rsm;
 	struct sackblk sack, sack_blocks[TCP_MAX_SACK + 1];
 	register uint32_t th_ack;
 	int32_t i, j, k, num_sack_blks = 0;
 	uint32_t cts, acked, ack_point;
 	int loop_start = 0, moved_two = 0;
 	uint32_t tsused;
 
 
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 	if (tcp_get_flags(th) & TH_RST) {
 		/* We don't log resets */
 		return;
 	}
 	rack = (struct tcp_rack *)tp->t_fb_ptr;
 	cts = tcp_get_usecs(NULL);
 	rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree);
 	changed = 0;
 	th_ack = th->th_ack;
 	if (rack->sack_attack_disable == 0)
 		rack_do_decay(rack);
 	if (BYTES_THIS_ACK(tp, th) >= ctf_fixed_maxseg(rack->rc_tp)) {
 		/*
 		 * You only get credit for
 		 * MSS and greater (and you get extra
 		 * credit for larger cum-ack moves).
 		 */
 		int ac;
 
 		ac = BYTES_THIS_ACK(tp, th) / ctf_fixed_maxseg(rack->rc_tp);
 		rack->r_ctl.ack_count += ac;
 		counter_u64_add(rack_ack_total, ac);
 	}
 	if (rack->r_ctl.ack_count > 0xfff00000) {
 		/*
 		 * Halve both counts (keeping their ratio) so that
 		 * neither overflows a uint32_t.
 		 */
 		rack->r_ctl.ack_count /= 2;
 		rack->r_ctl.sack_count /= 2;
 	}
 	if (SEQ_GT(th_ack, tp->snd_una)) {
 		/* The cumulative ack advanced: record forward progress. */
 		rack_log_progress_event(rack, tp, ticks, PROGRESS_UPDATE, __LINE__);
 		tp->t_acktime = ticks;
 	}
 	/* Bytes the cum-ack covers beyond the start of the first map entry. */
 	if (rsm && SEQ_GT(th_ack, rsm->r_start))
 		changed = th_ack - rsm->r_start;
 	if (changed) {
 		rack_process_to_cumack(tp, rack, th_ack, cts, to);
 	}
 	if ((to->to_flags & TOF_SACK) == 0) {
 		/* We are done nothing left and no sack. */
 		rack_handle_might_revert(tp, rack);
 		/*
 		 * For cases where we struck a dup-ack
 		 * with no SACK, add to the changes so
 		 * PRR will work right.
 		 */
 		if (dup_ack_struck && (changed == 0)) {
 			changed += ctf_fixed_maxseg(rack->rc_tp);
 		}
 		goto out;
 	}
 	/* Sack block processing */
 	if (SEQ_GT(th_ack, tp->snd_una))
 		ack_point = th_ack;
 	else
 		ack_point = tp->snd_una;
 	for (i = 0; i < to->to_nsacks; i++) {
 		/* Blocks are copied out of the option and byte-swapped. */
 		bcopy((to->to_sacks + i * TCPOLEN_SACK),
 		      &sack, sizeof(sack));
 		sack.start = ntohl(sack.start);
 		sack.end = ntohl(sack.end);
 		if (SEQ_GT(sack.end, sack.start) &&
 		    SEQ_GT(sack.start, ack_point) &&
 		    SEQ_LT(sack.start, tp->snd_max) &&
 		    SEQ_GT(sack.end, ack_point) &&
 		    SEQ_LEQ(sack.end, tp->snd_max)) {
 			/* Well formed and inside (ack_point, snd_max]: keep it. */
 			sack_blocks[num_sack_blks] = sack;
 			num_sack_blks++;
 		} else if (SEQ_LEQ(sack.start, th_ack) &&
 			   SEQ_LEQ(sack.end, th_ack)) {
 			int was_tlp;
 
 			was_tlp = rack_note_dsack(rack, sack.start, sack.end);
 			/*
 			 * It's a D-SACK block (entirely at or below the
 			 * cumulative ack).
 			 */
 			tcp_record_dsack(tp, sack.start, sack.end, was_tlp);
 		}
 	}
 	if (rack->rc_dsack_round_seen) {
 		/* Is the dsack round over? */
 		if (SEQ_GEQ(th_ack, rack->r_ctl.dsack_round_end)) {
 			/* Yes it is */
 			rack->rc_dsack_round_seen = 0;
 			rack_log_dsack_event(rack, 3, __LINE__, 0, 0);
 		}
 	}
 	/*
 	 * Run the blocks through the sack filter, then sort what is
 	 * left so we can update the rack scoreboard with just one pass.
 	 */
 	num_sack_blks = sack_filter_blks(&rack->r_ctl.rack_sf, sack_blocks,
 					 num_sack_blks, th->th_ack);
 	ctf_log_sack_filter(rack->rc_tp, num_sack_blks, sack_blocks);
 	if (num_sack_blks == 0) {
 		/* Nothing to sack (DSACKs?) */
 		goto out_with_totals;
 	}
 	if (num_sack_blks < 2) {
 		/* Only one, we don't need to sort */
 		goto do_sack_work;
 	}
 	/* Sort the sacks into ascending order of their end sequence. */
 	for (i = 0; i < num_sack_blks; i++) {
 		for (j = i + 1; j < num_sack_blks; j++) {
 			if (SEQ_GT(sack_blocks[i].end, sack_blocks[j].end)) {
 				sack = sack_blocks[i];
 				sack_blocks[i] = sack_blocks[j];
 				sack_blocks[j] = sack;
 			}
 		}
 	}
 	/*
 	 * Now are any of the sack block ends the same (yes some
 	 * implementations send these)?
 	 */
 again:
 	if (num_sack_blks == 0)
 		goto out_with_totals;
 	if (num_sack_blks > 1) {
 		for (i = 0; i < num_sack_blks; i++) {
 			for (j = i + 1; j < num_sack_blks; j++) {
 				if (sack_blocks[i].end == sack_blocks[j].end) {
 					/*
 					 * Ok these two have the same end: keep
 					 * the smaller start in block i (so it
 					 * covers both), throw away block j and
 					 * start again.
 					 */
 					if (SEQ_LT(sack_blocks[j].start, sack_blocks[i].start)) {
 						/*
 						 * The second block covers
 						 * more area use that
 						 */
 						sack_blocks[i].start = sack_blocks[j].start;
 					}
 					/*
 					 * Now collapse out the dup-sack by
 					 * shifting the later blocks down one
 					 * slot, and lower the count.
 					 */
 					for (k = (j + 1); k < num_sack_blks; k++) {
 						sack_blocks[j].start = sack_blocks[k].start;
 						sack_blocks[j].end = sack_blocks[k].end;
 						j++;
 					}
 					num_sack_blks--;
 					goto again;
 				}
 			}
 		}
 	}
 do_sack_work:
 	/*
 	 * First lets look to see if
 	 * we have retransmitted and
 	 * can use the transmit next?
 	 */
 	rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
 	if (rsm &&
 	    SEQ_GT(sack_blocks[0].end, rsm->r_start) &&
 	    SEQ_LT(sack_blocks[0].start, rsm->r_end)) {
 		/*
 		 * The first block overlaps the head of the transmit
 		 * map: we probably did the FR and the next SACK
 		 * continues as we would expect.
 		 */
 		acked = rack_proc_sack_blk(tp, rack, &sack_blocks[0], to, &rsm, cts, &moved_two);
 		if (acked) {
 			rack->r_wanted_output = 1;
 			changed += acked;
 		}
 		if (num_sack_blks == 1) {
 			/*
 			 * This is what we would expect from
 			 * a normal implementation to happen
 			 * after we have retransmitted the FR,
 			 * i.e. the sack-filter pushes down
 			 * to 1 block and the next to be retransmitted
 			 * is the sequence in the sack block (as more
 			 * are acked). Count this as ACK'd data to boost
 			 * up the chances of recovering any false positives.
 			 */
 			rack->r_ctl.ack_count += (acked / ctf_fixed_maxseg(rack->rc_tp));
 			counter_u64_add(rack_ack_total, (acked / ctf_fixed_maxseg(rack->rc_tp)));
 			counter_u64_add(rack_express_sack, 1);
 			if (rack->r_ctl.ack_count > 0xfff00000) {
 				/*
 				 * Halve both counts to keep them under
 				 * a uint32_t.
 				 */
 				rack->r_ctl.ack_count /= 2;
 				rack->r_ctl.sack_count /= 2;
 			}
 			goto out_with_totals;
 		} else {
 			/*
 			 * Start the loop through the
 			 * rest of blocks, past the first block.
 			 */
 			moved_two = 0;
 			loop_start = 1;
 		}
 	}
 	/* It's a sack of some sort */
 	rack->r_ctl.sack_count++;
 	if (rack->r_ctl.sack_count > 0xfff00000) {
 		/*
 		 * Halve both counts to keep them under
 		 * a uint32_t.
 		 */
 		rack->r_ctl.ack_count /= 2;
 		rack->r_ctl.sack_count /= 2;
 	}
 	counter_u64_add(rack_sack_total, 1);
 	if (rack->sack_attack_disable) {
 		/*
 		 * An attacker disablement is in place: only do the
 		 * accounting, the remaining blocks are not processed.
 		 */
 		if (num_sack_blks > 1) {
 			rack->r_ctl.sack_count += (num_sack_blks - 1);
 			rack->r_ctl.sack_moved_extra++;
 			counter_u64_add(rack_move_some, 1);
 			if (rack->r_ctl.sack_moved_extra > 0xfff00000) {
 				rack->r_ctl.sack_moved_extra /= 2;
 				rack->r_ctl.sack_noextra_move /= 2;
 			}
 		}
 		goto out;
 	}
 	rsm = rack->r_ctl.rc_sacklast;
 	for (i = loop_start; i < num_sack_blks; i++) {
 		acked = rack_proc_sack_blk(tp, rack, &sack_blocks[i], to, &rsm, cts, &moved_two);
 		if (acked) {
 			rack->r_wanted_output = 1;
 			changed += acked;
 		}
 		if (moved_two) {
 			/*
 			 * If we did not get a SACK for at least a MSS and
 			 * had to move at all, or if we moved more than our
 			 * threshold, it counts against the "extra" move.
 			 */
 			rack->r_ctl.sack_moved_extra += moved_two;
 			counter_u64_add(rack_move_some, 1);
 		} else {
 			/*
 			 * else we did not have to move
 			 * any more than we would expect.
 			 */
 			rack->r_ctl.sack_noextra_move++;
 			counter_u64_add(rack_move_none, 1);
 		}
 		if (moved_two && (acked < ctf_fixed_maxseg(rack->rc_tp))) {
 			/*
 			 * If the SACK was not a full MSS then
 			 * we add to sack_count the number of
 			 * MSS's (or possibly more than
 			 * a MSS if its a TSO send) we had to skip by.
 			 */
 			rack->r_ctl.sack_count += moved_two;
 			counter_u64_add(rack_sack_total, moved_two);
 		}
 		/*
 		 * Now we need to setup for the next
 		 * round. First we make sure we won't
 		 * exceed the size of our uint32_t on
 		 * the various counts, and then clear out
 		 * moved_two.
 		 */
 		if ((rack->r_ctl.sack_moved_extra > 0xfff00000) ||
 		    (rack->r_ctl.sack_noextra_move > 0xfff00000)) {
 			rack->r_ctl.sack_moved_extra /= 2;
 			rack->r_ctl.sack_noextra_move /= 2;
 		}
 		if (rack->r_ctl.sack_count > 0xfff00000) {
 			rack->r_ctl.ack_count /= 2;
 			rack->r_ctl.sack_count /= 2;
 		}
 		moved_two = 0;
 	}
 out_with_totals:
 	if (num_sack_blks > 1) {
 		/*
 		 * You get an extra stroke if
 		 * you have more than one sack-blk, this
 		 * could be where we are skipping forward
 		 * and the sack-filter is still working, or
 		 * it could be an attacker constantly
 		 * moving us.
 		 */
 		rack->r_ctl.sack_moved_extra++;
 		counter_u64_add(rack_move_some, 1);
 	}
 out:
 #ifdef NETFLIX_EXP_DETECTION
 	rack_do_detection(tp, rack, BYTES_THIS_ACK(tp, th), ctf_fixed_maxseg(rack->rc_tp));
 #endif
 	if (changed) {
 		/* Something changed cancel the rack timer */
 		rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
 	}
 	/* Ask rack whether there is something that should be retransmitted. */
 	tsused = tcp_get_usecs(NULL);
 	rsm = tcp_rack_output(tp, rack, tsused);
 	if ((!IN_FASTRECOVERY(tp->t_flags)) &&
 	    rsm &&
 	    ((rsm->r_flags & RACK_MUST_RXT) == 0)) {
 		/* Enter recovery */
 		entered_recovery = 1;
 		rack_cong_signal(tp, CC_NDUPACK, tp->snd_una, __LINE__);
 		/*
 		 * When we enter recovery we need to assure we send
 		 * one packet.
 		 */
 		if (rack->rack_no_prr == 0) {
 			rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp);
 			rack_log_to_prr(rack, 8, 0, __LINE__);
 		}
 		rack->r_timer_override = 1;
 		rack->r_early = 0;
 		rack->r_ctl.rc_agg_early = 0;
 	} else if (IN_FASTRECOVERY(tp->t_flags) &&
 		   rsm &&
 		   (rack->r_rr_config == 3)) {
 		/*
 		 * Assure we can output and we get no
 		 * remembered pace time except the retransmit.
 		 */
 		rack->r_timer_override = 1;
 		rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT;
 		rack->r_ctl.rc_resend = rsm;
 	}
 	if (IN_FASTRECOVERY(tp->t_flags) &&
 	    (rack->rack_no_prr == 0) &&
 	    (entered_recovery == 0)) {
 		/* Already in recovery before this ack: update PRR. */
 		rack_update_prr(tp, rack, changed, th_ack);
 		if ((rsm && (rack->r_ctl.rc_prr_sndcnt >= ctf_fixed_maxseg(tp)) &&
 		     ((tcp_in_hpts(rack->rc_inp) == 0) &&
 		      ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0)))) {
 			/*
 			 * If you are pacing output you don't want
 			 * to override.
 			 */
 			rack->r_early = 0;
 			rack->r_ctl.rc_agg_early = 0;
 			rack->r_timer_override = 1;
 		}
 	}
 }
 
 /*
  * Strike one duplicate ack against the first eligible entry of the
  * transmit map.
  *
  * Entries that have already reached DUP_ACK_THRESHOLD, and entries
  * flagged RACK_MUST_RXT, are skipped.  If an eligible entry is found
  * its dupack count is incremented (saturating at 0xff); when the
  * count reaches the threshold rack is asked whether a retransmission
  * is due and, if so, congestion is signalled (unless already in fast
  * recovery) and output is requested.
  */
 static void
 rack_strike_dupack(struct tcp_rack *rack)
 {
 	struct rack_sendmap *rsm;
 
 	rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
 	while ((rsm != NULL) &&
 	       ((rsm->r_dupack >= DUP_ACK_THRESHOLD) ||
 		(rsm->r_flags & RACK_MUST_RXT))) {
 		/*
 		 * Skip entries that already reached the threshold, and
 		 * sendmap entries that are marked to be retransmitted:
 		 * those do not need dupack's struck. We get these marks
 		 * for a number of reasons (rxt timeout with no sack,
 		 * mtu change, or rwnd collapses). When these events
 		 * occur, we know we must retransmit them and mark the
 		 * sendmap entries. Dupack counting is not needed since
 		 * we are already set to retransmit it as soon as we can.
 		 *
 		 * The NULL test is made before every dereference, so
 		 * running off the end of the list is safe.
 		 */
 		rsm = TAILQ_NEXT(rsm, r_tnext);
 	}
 	if (rsm && (rsm->r_dupack < 0xff)) {
 		rsm->r_dupack++;
 		if (rsm->r_dupack >= DUP_ACK_THRESHOLD) {
 			struct timeval tv;
 			uint32_t cts;
 			/*
 			 * Here we see if we need to retransmit. For
 			 * a SACK type connection if enough time has passed
 			 * we will get a return of the rsm. For a non-sack
 			 * connection we will get the rsm returned if the
 			 * dupack value is 3 or more.
 			 */
 			cts = tcp_get_usecs(&tv);
 			rack->r_ctl.rc_resend = tcp_rack_output(rack->rc_tp, rack, cts);
 			if (rack->r_ctl.rc_resend != NULL) {
 				if (!IN_FASTRECOVERY(rack->rc_tp->t_flags)) {
 					rack_cong_signal(rack->rc_tp, CC_NDUPACK,
 							 rack->rc_tp->snd_una, __LINE__);
 				}
 				rack->r_wanted_output = 1;
 				rack->r_timer_override = 1;
 				rack_log_retran_reason(rack, rsm, __LINE__, 1, 3);
 			}
 		} else {
 			rack_log_retran_reason(rack, rsm, __LINE__, 0, 3);
 		}
 	}
 }
 
 static void
 rack_check_bottom_drag(struct tcpcb *tp,
 		       struct tcp_rack *rack,
 		       struct socket *so, int32_t acked)
 {
 	uint32_t segsiz, minseg;
 
 	segsiz = ctf_fixed_maxseg(tp);
 	minseg = segsiz;
 
 	if (tp->snd_max == tp->snd_una) {
 		/*
 		 * We are doing dynamic pacing and we are way
 		 * under. Basically everything got acked while
 		 * we were still waiting on the pacer to expire.
 		 *
 		 * This means we need to boost the b/w in
 		 * addition to any earlier boosting of
 		 * the multiplier.
 		 */
 		rack->rc_dragged_bottom = 1;
 		rack_validate_multipliers_at_or_above100(rack);
 		/*
 		 * Lets use the segment bytes acked plus
 		 * the lowest RTT seen as the basis to
 		 * form a b/w estimate. This will be off
 		 * due to the fact that the true estimate
 		 * should be around 1/2 the time of the RTT
 		 * but we can settle for that.
 		 */
 		if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_VALID) &&
 		    acked) {
 			uint64_t bw, calc_bw, rtt;
 
 			rtt = rack->r_ctl.rack_rs.rs_us_rtt;
 			if (rtt == 0) {
 				/* no us sample is there a ms one? */
 				if (rack->r_ctl.rack_rs.rs_rtt_lowest) {
 					rtt = rack->r_ctl.rack_rs.rs_rtt_lowest;
 				} else {
 					goto no_measurement;
 				}
 			}
 			bw = acked;
 			calc_bw = bw * 1000000;
 			calc_bw /= rtt;
 			if (rack->r_ctl.last_max_bw &&
 			    (rack->r_ctl.last_max_bw < calc_bw)) {
 				/*
 				 * If we have a last calculated max bw
 				 * enforce it.
 				 */
 				calc_bw = rack->r_ctl.last_max_bw;
 			}
 			/* now plop it in */
 			if (rack->rc_gp_filled == 0) {
 				if (calc_bw > ONE_POINT_TWO_MEG) {
 					/*
 					 * If we have no measurement
 					 * don't let us set in more than
 					 * 1.2Mbps. If we are still too
 					 * low after pacing with this we
 					 * will hopefully have a max b/w
 					 * available to sanity check things.
 					 */
 					calc_bw = ONE_POINT_TWO_MEG;
 				}
 				rack->r_ctl.rc_rtt_diff = 0;
 				rack->r_ctl.gp_bw = calc_bw;
 				rack->rc_gp_filled = 1;
 				if (rack->r_ctl.num_measurements < RACK_REQ_AVG)
 					rack->r_ctl.num_measurements = RACK_REQ_AVG;
 				rack_set_pace_segments(rack->rc_tp, rack, __LINE__, NULL);
 			} else if (calc_bw > rack->r_ctl.gp_bw) {
 				rack->r_ctl.rc_rtt_diff = 0;
 				if (rack->r_ctl.num_measurements < RACK_REQ_AVG)
 					rack->r_ctl.num_measurements = RACK_REQ_AVG;
 				rack->r_ctl.gp_bw = calc_bw;
 				rack_set_pace_segments(rack->rc_tp, rack, __LINE__, NULL);
 			} else
 				rack_increase_bw_mul(rack, -1, 0, 0, 1);
 			if ((rack->gp_ready == 0) &&
 			    (rack->r_ctl.num_measurements >= rack->r_ctl.req_measurements)) {
 				/* We have enough measurements now */
 				rack->gp_ready = 1;
 				rack_set_cc_pacing(rack);
 				if (rack->defer_options)
 					rack_apply_deferred_options(rack);
 			}
 			/*
 			 * For acks over 1mss we do a extra boost to simulate
 			 * where we would get 2 acks (we want 110 for the mul).
 			 */
 			if (acked > segsiz)
 				rack_increase_bw_mul(rack, -1, 0, 0, 1);
 		} else {
 			/*
 			 * zero rtt possibly?, settle for just an old increase.
 			 */
 no_measurement:
 			rack_increase_bw_mul(rack, -1, 0, 0, 1);
 		}
 	} else if ((IN_FASTRECOVERY(tp->t_flags) == 0) &&
 		   (sbavail(&so->so_snd) > max((segsiz * (4 + rack_req_segs)),
 					       minseg)) &&
 		   (rack->r_ctl.cwnd_to_use > max((segsiz * (rack_req_segs + 2)), minseg)) &&
 		   (tp->snd_wnd > max((segsiz * (rack_req_segs + 2)), minseg)) &&
 		   (ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) <=
 		    (segsiz * rack_req_segs))) {
 		/*
 		 * We are doing dynamic GP pacing and
 		 * we have everything except 1MSS or less
 		 * bytes left out. We are still pacing away.
 		 * And there is data that could be sent, This
 		 * means we are inserting delayed ack time in
 		 * our measurements because we are pacing too slow.
 		 */
 		rack_validate_multipliers_at_or_above100(rack);
 		rack->rc_dragged_bottom = 1;
 		rack_increase_bw_mul(rack, -1, 0, 0, 1);
 	}
 }
 
 
 
 /*
  * The fast output path is enabled and the cumack has moved forward:
  * try to extend the fast path block (fsb.left_to_send) by the amount
  * that was acked.
  *
  * The extension is applied only if the new total stays within both
  *  1) the peer's window beyond what is already outstanding, and
  *  2) the data in the socket buffer beyond what is already
  *     outstanding (we can't send beyond what is in the buffer).
  * Otherwise left_to_send is left untouched.
  *
  * Any growth of the cwnd is deliberately not taken into account; the
  * fast path is only extended by what was acked.
  */
 static void
 rack_gain_for_fastoutput(struct tcp_rack *rack, struct tcpcb *tp, struct socket *so, uint32_t acked_amount)
 {
 	uint32_t outstanding, extended, ceiling;
 
 	extended = acked_amount + rack->r_ctl.fsb.left_to_send;
 	outstanding = tp->snd_max - tp->snd_una;
 	ceiling = min((sbavail(&so->so_snd) - outstanding),
 		      (tp->snd_wnd - outstanding));
 	if (extended > ceiling) {
 		/* Would exceed the window or the buffer, leave it alone. */
 		return;
 	}
 	/* We can increase left_to_send by the acked amount */
 	counter_u64_add(rack_extended_rfo, 1);
 	rack->r_ctl.fsb.left_to_send = extended;
 	KASSERT((rack->r_ctl.fsb.left_to_send <= (sbavail(&rack->rc_inp->inp_socket->so_snd) - (tp->snd_max - tp->snd_una))),
 		("rack:%p left_to_send:%u sbavail:%u out:%u",
 		 rack, rack->r_ctl.fsb.left_to_send,
 		 sbavail(&rack->rc_inp->inp_socket->so_snd),
 		 (tp->snd_max - tp->snd_una)));
 }
 
 /*
  * Re-point the leading sendmap entries after the socket buffer has
  * been trimmed.
  *
  * Every entry at the front of the sendmap that still references the
  * first mbuf of the socket buffer gets its mbuf pointer, offset and
  * recorded mbuf length recomputed through sbsndmbuf().  The sb
  * functions are used because an append may have happened as well as
  * the trim that was just completed.
  *
  * Must be called with:
  * 1) the socket buffer locked, and
  * 2) snd_una already moved to its new position (which implies
  *    rack_ack_received has also been called).
  */
 static void
 rack_adjust_sendmap(struct tcp_rack *rack, struct sockbuf *sb, tcp_seq snd_una)
 {
 	struct mbuf *head;
 	struct rack_sendmap *rsm;
 
 	SOCKBUF_LOCK_ASSERT(sb);
 	head = sb->sb_mb;
 	rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree);
 	if ((head == NULL) || (rsm == NULL)) {
 		/* Nothing outstanding */
 		return;
 	}
 	/*
 	 * head is known to be non-NULL here, so matching it also means
 	 * the entry has an mbuf.  Stop at the first entry that does not
 	 * reference head, or at the end of the tree.
 	 */
 	for (; (rsm != NULL) && (rsm->m == head);
 	     rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm)) {
 #ifdef INVARIANTS
 		struct mbuf *tm;
 		uint32_t soff;
 
 		tm = sbsndmbuf(sb, (rsm->r_start - snd_una), &soff);
 		if (rsm->orig_m_len != head->m_len) {
 			rack_adjust_orig_mlen(rsm);
 		}
 		if (rsm->soff != soff) {
 			/*
 			 * Not fatal and anticipated (the non-INVARIANTS
 			 * code simply overwrites the offset); it is
 			 * counted so that under invariants we can see
 			 * that it really does happen.
 			 */
 			counter_u64_add(rack_adjust_map_bw, 1);
 		}
 		rsm->m = tm;
 		rsm->soff = soff;
 		rsm->orig_m_len = (tm != NULL) ? rsm->m->m_len : 0;
 #else
 		rsm->m = sbsndmbuf(sb, (rsm->r_start - snd_una), &rsm->soff);
 		rsm->orig_m_len = (rsm->m != NULL) ? rsm->m->m_len : 0;
 #endif
 	}
 }
 
 /*
  * Process the ack field of an inbound segment: hand the ack/SACK
  * information to rack_log_ack(), advance snd_una, trim the acked data
  * from the send socket buffer, and update timers, congestion control
  * and pacing state.
  *
  * Return value of 1, we do not need to call rack_process_data().
  * Return value of 0, rack_process_data() can be called.
  * For ret_val if it is 0 the TCP is locked, if it is non-zero
  * it is unlocked and probably unsafe to touch the TCB.
  *
  * ofia, when non-NULL, receives 1 if this ack acknowledged our FIN
  * and 0 otherwise.  It is only written on the paths that get past the
  * old/duplicate ack check; the early returns leave it untouched.
  */
 static int
 rack_process_ack(struct mbuf *m, struct tcphdr *th, struct socket *so,
     struct tcpcb *tp, struct tcpopt *to,
     uint32_t tiwin, int32_t tlen,
     int32_t * ofia, int32_t thflags, int32_t *ret_val)
 {
 	int32_t ourfinisacked = 0;
 	int32_t nsegs, acked_amount;
 	int32_t acked;
 	struct mbuf *mfree;
 	struct tcp_rack *rack;
 	int32_t under_pacing = 0;
 	int32_t recovery = 0;
 
 	rack = (struct tcp_rack *)tp->t_fb_ptr;
 	if (SEQ_GT(th->th_ack, tp->snd_max)) {
 		/* The peer acked something we have not sent yet. */
 		__ctf_do_dropafterack(m, tp, th, thflags, tlen, ret_val,
 				      &rack->r_ctl.challenge_ack_ts,
 				      &rack->r_ctl.challenge_ack_cnt);
 		rack->r_wanted_output = 1;
 		return (1);
 	}
 	if (rack->gp_ready &&
 	    (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) {
 		/* Remembered for the bottom-drag check further down. */
 		under_pacing = 1;
 	}
 	if (SEQ_GEQ(th->th_ack, tp->snd_una) || to->to_nsacks) {
 		int in_rec, dup_ack_struck = 0;
 
 		in_rec = IN_FASTRECOVERY(tp->t_flags);
 		if (rack->rc_in_persist) {
 			tp->t_rxtshift = 0;
 			RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp),
 				      rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop);
 		}
 		/*
 		 * Same ack, same window and no SACK option: count it
 		 * as a duplicate ack.
 		 */
 		if ((th->th_ack == tp->snd_una) &&
 		    (tiwin == tp->snd_wnd) &&
 		    ((to->to_flags & TOF_SACK) == 0)) {
 			rack_strike_dupack(rack);
 			dup_ack_struck = 1;
 		}
 		/*
 		 * The fourth argument is non-zero only if fast recovery
 		 * was entered between the in_rec sample above and here.
 		 */
 		rack_log_ack(tp, to, th, ((in_rec == 0) && IN_FASTRECOVERY(tp->t_flags)), dup_ack_struck);
 	}
 	if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) {
 		/*
 		 * Old ack, behind (or duplicate to) the last one rcv'd
 		 * Note: We mark reordering as occurring if it is
 		 * less than and we have not closed our window.
 		 */
 		if (SEQ_LT(th->th_ack, tp->snd_una) && (sbspace(&so->so_rcv) > ctf_fixed_maxseg(tp))) {
 			rack->r_ctl.rc_reorder_ts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time);
 		}
 		return (0);
 	}
 	/*
 	 * If we reach this point, ACK is not a duplicate, i.e., it ACKs
 	 * something we sent.
 	 */
 	if (tp->t_flags & TF_NEEDSYN) {
 		/*
 		 * T/TCP: Connection was half-synchronized, and our SYN has
 		 * been ACK'd (so connection is now fully synchronized).  Go
 		 * to non-starred state, increment snd_una for ACK of SYN,
 		 * and check if we can do window scaling.
 		 */
 		tp->t_flags &= ~TF_NEEDSYN;
 		tp->snd_una++;
 		/* Do window scaling? */
 		if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) ==
 		    (TF_RCVD_SCALE | TF_REQ_SCALE)) {
 			tp->rcv_scale = tp->request_r_scale;
 			/* Send window already scaled. */
 		}
 	}
 	/* Count at least one segment even when LRO reports none. */
 	nsegs = max(1, m->m_pkthdr.lro_nsegs);
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 
 	acked = BYTES_THIS_ACK(tp, th);
 	if (acked) {
 		/*
 		 * Any time we move the cum-ack forward clear
 		 * keep-alive tied probe-not-answered. The
 		 * persists clears its own on entry.
 		 */
 		rack->probe_not_answered = 0;
 	}
 	KMOD_TCPSTAT_ADD(tcps_rcvackpack, nsegs);
 	KMOD_TCPSTAT_ADD(tcps_rcvackbyte, acked);
 	/*
 	 * If we just performed our first retransmit, and the ACK arrives
 	 * within our recovery window, then it was a mistake to do the
 	 * retransmit in the first place.  Recover our original cwnd and
 	 * ssthresh, and proceed to transmit where we left off.
 	 * This check is only made when no timestamp was received.
 	 */
 	if ((tp->t_flags & TF_PREVVALID) &&
 	    ((tp->t_flags & TF_RCVD_TSTMP) == 0)) {
 		tp->t_flags &= ~TF_PREVVALID;
 		if (tp->t_rxtshift == 1 &&
 		    (int)(ticks - tp->t_badrxtwin) < 0)
 			rack_cong_signal(tp, CC_RTO_ERR, th->th_ack, __LINE__);
 	}
 	if (acked) {
 		/* assure we are not backed off */
 		tp->t_rxtshift = 0;
 		RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp),
 			      rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop);
 		rack->rc_tlp_in_progress = 0;
 		rack->r_ctl.rc_tlp_cnt_out = 0;
 		/*
 		 * If it is the RXT timer we want to
 		 * stop it, so we can restart a TLP.
 		 */
 		if (rack->r_ctl.rc_hpts_flags & PACE_TMR_RXT)
 			rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
 #ifdef NETFLIX_HTTP_LOGGING
 		tcp_http_check_for_comp(rack->rc_tp, th->th_ack);
 #endif
 	}
 	/*
 	 * If we have a timestamp reply, update smoothed round trip time. If
 	 * no timestamp is present but transmit timer is running and timed
 	 * sequence number was acked, update smoothed round trip time. Since
 	 * we now have an rtt measurement, cancel the timer backoff (cf.,
 	 * Phil Karn's retransmit alg.). Recompute the initial retransmit
 	 * timer.
 	 *
 	 * Some boxes send broken timestamp replies during the SYN+ACK
 	 * phase, ignore timestamps of 0 or we could calculate a huge RTT
 	 * and blow up the retransmit timer.
 	 *
 	 * NOTE(review): no RTT update code follows this comment in this
 	 * function; the text looks like a leftover - confirm.
 	 */
 	/*
 	 * If all outstanding data is acked, stop retransmit timer and
 	 * remember to restart (more output or persist). If there is more
 	 * data to be acked, restart retransmit timer, using current
 	 * (possibly backed-off) value.
 	 */
 	if (acked == 0) {
 		/* Nothing new was acked: report "FIN not acked" and stop. */
 		if (ofia)
 			*ofia = ourfinisacked;
 		return (0);
 	}
 	if (IN_RECOVERY(tp->t_flags)) {
 		if (SEQ_LT(th->th_ack, tp->snd_recover) &&
 		    (SEQ_LT(th->th_ack, tp->snd_max))) {
 			/* Partial ack: recovery continues. */
 			tcp_rack_partialack(tp);
 		} else {
 			/* The ack covers the recovery point: leave recovery. */
 			rack_post_recovery(tp, th->th_ack);
 			recovery = 1;
 		}
 	}
 	/*
 	 * Let the congestion control algorithm update congestion control
 	 * related information. This typically means increasing the
 	 * congestion window.
 	 */
 	rack_ack_received(tp, rack, th->th_ack, nsegs, CC_ACK, recovery);
 	SOCKBUF_LOCK(&so->so_snd);
 	/* Never cut more than the socket buffer actually holds. */
 	acked_amount = min(acked, (int)sbavail(&so->so_snd));
 	tp->snd_wnd -= acked_amount;
 	mfree = sbcut_locked(&so->so_snd, acked_amount);
 	if ((sbused(&so->so_snd) == 0) &&
 	    (acked > acked_amount) &&
 	    (tp->t_state >= TCPS_FIN_WAIT_1) &&
 	    (tp->t_flags & TF_SENTFIN)) {
 		/*
 		 * We must be sure our fin
 		 * was sent and acked (we can be
 		 * in FIN_WAIT_1 without having
 		 * sent the fin).
 		 */
 		ourfinisacked = 1;
 	}
 	tp->snd_una = th->th_ack;
 	/* Data remains in the buffer: fix up sendmap mbuf references. */
 	if (acked_amount && sbavail(&so->so_snd))
 		rack_adjust_sendmap(rack, &so->so_snd, tp->snd_una);
 	rack_log_wakeup(tp,rack, &so->so_snd, acked, 2);
 	/* NB: sowwakeup_locked() does an implicit unlock. */
 	sowwakeup_locked(so);
 	/* Free the cut mbufs only after the sockbuf lock is dropped. */
 	m_freem(mfree);
 	if (SEQ_GT(tp->snd_una, tp->snd_recover))
 		tp->snd_recover = tp->snd_una;
 
 	if (SEQ_LT(tp->snd_nxt, tp->snd_una)) {
 		tp->snd_nxt = tp->snd_una;
 	}
 	if (under_pacing &&
 	    (rack->use_fixed_rate == 0) &&
 	    (rack->in_probe_rtt == 0) &&
 	    rack->rc_gp_dyn_mul &&
 	    rack->rc_always_pace) {
 		/* Check if we are dragging bottom */
 		rack_check_bottom_drag(tp, rack, so, acked);
 	}
 	if (tp->snd_una == tp->snd_max) {
 		/* Nothing left outstanding */
 		tp->t_flags &= ~TF_PREVVALID;
 		rack->r_ctl.rc_went_idle_time = tcp_get_usecs(NULL);
 		rack->r_ctl.retran_during_recovery = 0;
 		rack->r_ctl.dsack_byte_cnt = 0;
 		/* Avoid storing 0 as the idle time. */
 		if (rack->r_ctl.rc_went_idle_time == 0)
 			rack->r_ctl.rc_went_idle_time = 1;
 		rack_log_progress_event(rack, tp, 0, PROGRESS_CLEAR, __LINE__);
 		if (sbavail(&tp->t_inpcb->inp_socket->so_snd) == 0)
 			tp->t_acktime = 0;
 		rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
 		/* Set need output so persist might get set */
 		rack->r_wanted_output = 1;
 		sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una);
 		if ((tp->t_state >= TCPS_FIN_WAIT_1) &&
 		    (sbavail(&so->so_snd) == 0) &&
 		    (tp->t_flags2 & TF2_DROP_AF_DATA)) {
 			/*
 			 * The socket was gone and the
 			 * peer sent data (now or in the past), time to
 			 * reset him.
 			 */
 			*ret_val = 1;
 			/* tcp_close will kill the inp pre-log the Reset */
 			tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST);
 			tp = tcp_close(tp);
 			ctf_do_dropwithreset(m, tp, th, BANDLIM_UNLIMITED, tlen);
 			return (1);
 		}
 	}
 	if (ofia)
 		*ofia = ourfinisacked;
 	return (0);
 }
 
 
 /*
  * Emit a TCP_RACK_LOG_COLLAPSE black-box log record describing a
  * window collapse related event.  Nothing is done when logging is
  * off for the connection.
  *
  * cnt, split, out, line and flags are stored verbatim in flex1-4 and
  * flex6.  dir (flex8) identifies the event:
  *   1 is collapsed, 0 is uncollapsed,
  *   2 is log of a rsm being marked, 3 is a split.
  * rsm, when not NULL, has its address recorded in rttProp.
  */
 static void
 rack_log_collapse(struct tcp_rack *rack, uint32_t cnt, uint32_t split, uint32_t out, int line,
 		  int dir, uint32_t flags, struct rack_sendmap *rsm)
 {
 	union tcp_log_stackspecific log;
 	struct timeval tv;
 
 	if (rack->rc_tp->t_logstate == TCP_LOG_STATE_OFF) {
 		/* Logging is disabled, nothing to record. */
 		return;
 	}
 	memset(&log, 0, sizeof(log));
 	/* Caller supplied values. */
 	log.u_bbr.flex1 = cnt;
 	log.u_bbr.flex2 = split;
 	log.u_bbr.flex3 = out;
 	log.u_bbr.flex4 = line;
 	log.u_bbr.flex6 = flags;
 	log.u_bbr.flex8 = dir;
 	log.u_bbr.rttProp = (rsm == NULL) ? 0 : (uint64_t)rsm;
 	/* Connection state at the time of the event. */
 	log.u_bbr.flex5 = rack->r_must_retran;
 	log.u_bbr.flex7 = rack->rc_has_collapsed;
 	log.u_bbr.timeStamp = tcp_get_usecs(&tv);
 	log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
 	TCP_LOG_EVENTP(rack->rc_tp, NULL,
 	    &rack->rc_inp->inp_socket->so_rcv,
 	    &rack->rc_inp->inp_socket->so_snd,
 	    TCP_RACK_LOG_COLLAPSE, 0,
 	    0, &log, false, &tv);
 }
 
 static void
 rack_collapsed_window(struct tcp_rack *rack, uint32_t out, int line)
 {
 	/*
 	 * Here all we do is mark the collapsed point and set the flag.
 	 * This may happen again and again, but there is no
 	 * sense splitting our map until we know where the
 	 * peer finally lands in the collapse.
 	 */
 	rack_trace_point(rack, RACK_TP_COLLAPSED_WND);
 	if ((rack->rc_has_collapsed == 0) ||
 	    (rack->r_ctl.last_collapse_point != (rack->rc_tp->snd_una + rack->rc_tp->snd_wnd)))
 		counter_u64_add(rack_collapsed_win_seen, 1);
 	rack->r_ctl.last_collapse_point = rack->rc_tp->snd_una + rack->rc_tp->snd_wnd;
 	rack->r_ctl.high_collapse_point = rack->rc_tp->snd_max;
 	rack->rc_has_collapsed = 1;
 	rack->r_collapse_point_valid = 1;
 	rack_log_collapse(rack, 0, 0, rack->r_ctl.last_collapse_point, line, 1, 0, NULL);
 }
 
 /*
  * The window is no longer collapsed.  Clear rc_has_collapsed, look up
  * the send-map entry for last_collapse_point and flag that entry and
  * everything after it in the tree as RACK_RWND_COLLAPSED.  When the
  * collapse point lands inside an entry the entry is split first so
  * only the part at/after the point is flagged; if no rsm can be
  * allocated for the split the whole entry is flagged instead.
  */
 static void
 rack_un_collapse_window(struct rack_sendmap_placeholder_unused *unused_never, int line);
 
 /*
  * Decide between delaying the ACK for an in-order segment and
  * acknowledging it right away.
  *
  * The ACK is delayed (TF_DELACK set, timers cancelled) when
  * DELAY_ACK() allows it or this is a TFO SYN, unless rc_dack_mode is
  * on, the segment carries more than 500 bytes and rc_dack_toggle is
  * 1.  Otherwise TF_ACKNOW is set and output is requested; in
  * rc_dack_mode the toggle is then set from whether TF_DELACK is
  * currently set.
  */
 static void
 rack_handle_delayed_ack(struct tcpcb *tp, struct tcp_rack *rack,
 			int32_t tlen, int32_t tfo_syn)
 {
 	int delay_it;
 
 	delay_it = (DELAY_ACK(tp, tlen) || tfo_syn);
 	if (delay_it &&
 	    rack->rc_dack_mode &&
 	    (tlen > 500) &&
 	    (rack->rc_dack_toggle == 1)) {
 		/* Large segment while toggled: ack this one now. */
 		delay_it = 0;
 	}
 	if (delay_it) {
 		rack_timer_cancel(tp, rack,
 				  rack->r_ctl.rc_rcvtime, __LINE__);
 		tp->t_flags |= TF_DELACK;
 		return;
 	}
 	rack->r_wanted_output = 1;
 	tp->t_flags |= TF_ACKNOW;
 	if (rack->rc_dack_mode)
 		rack->rc_dack_toggle = (tp->t_flags & TF_DELACK) ? 1 : 0;
 }
 
 /*
  * Called after snd_wnd changed.  If fast output is in progress, make
  * sure what it still plans to send (fsb.left_to_send) fits into the
  * new window on top of what is already outstanding.  When it does
  * not, either trim left_to_send to the remaining room or, if the
  * window is already full/collapsed or less than one full segment of
  * room remains, turn fast output off.
  */
 static void
 rack_validate_fo_sendwin_up(struct tcpcb *tp, struct tcp_rack *rack)
 {
 	uint32_t outstanding;
 
 	if (rack->r_fast_output == 0)
 		return;
 	/* What is in flight plus what fast output would add. */
 	outstanding = ctf_outstanding(tp);
 	if ((outstanding + rack->r_ctl.fsb.left_to_send) <= tp->snd_wnd) {
 		/* Still fits, nothing to adjust. */
 		return;
 	}
 	if (outstanding >= tp->snd_wnd) {
 		/* The window is met or collapsed, no room at all. */
 		rack->r_fast_output = 0;
 		return;
 	}
 	/* Some room is left, shrink the plan to exactly that. */
 	rack->r_ctl.fsb.left_to_send = tp->snd_wnd - outstanding;
 	if (rack->r_ctl.fsb.left_to_send < ctf_fixed_maxseg(tp)) {
 		/* Not even one full segment, not worth it. */
 		rack->r_fast_output = 0;
 	}
 }
 
 
 /*
  * Process the window update, data and FIN carried by an inbound
  * segment.
  *
  * m is consumed on every path: it is freed, appended to so_rcv, or
  * handed to tcp_reass().  drop_hdrlen is the number of leading bytes
  * trimmed from m before the payload is queued, tlen the payload
  * length, tiwin the window used to update snd_wnd and thflags the
  * TCP flags of the segment.  nxt_pkt is not referenced here.
  *
  * Return value of 1, the TCB is unlocked and most
  * likely gone, return value of 0, the TCP is still
  * locked.  The only path returning 1 here is a FIN received in
  * FIN_WAIT_2, after tcp_twstart().
  */
 static int
 rack_process_data(struct mbuf *m, struct tcphdr *th, struct socket *so,
     struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen,
     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt)
 {
 	/*
 	 * Update window information. Don't look at window if no ACK: TAC's
 	 * send garbage on first SYN.
 	 */
 	int32_t nsegs;
 	int32_t tfo_syn;
 	struct tcp_rack *rack;
 
 	rack = (struct tcp_rack *)tp->t_fb_ptr;
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 	/* Count at least one segment even when lro_nsegs is 0. */
 	nsegs = max(1, m->m_pkthdr.lro_nsegs);
 	if ((thflags & TH_ACK) &&
 	    (SEQ_LT(tp->snd_wl1, th->th_seq) ||
 	    (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) ||
 	    (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) {
 		/* keep track of pure window updates */
 		if (tlen == 0 &&
 		    tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd)
 			KMOD_TCPSTAT_INC(tcps_rcvwinupd);
 		tp->snd_wnd = tiwin;
 		rack_validate_fo_sendwin_up(tp, rack);
 		tp->snd_wl1 = th->th_seq;
 		tp->snd_wl2 = th->th_ack;
 		if (tp->snd_wnd > tp->max_sndwnd)
 			tp->max_sndwnd = tp->snd_wnd;
 		rack->r_wanted_output = 1;
 	} else if (thflags & TH_ACK) {
 		/* Same ack point but a smaller window: take the shrink. */
 		if ((tp->snd_wl2 == th->th_ack) && (tiwin < tp->snd_wnd)) {
 			tp->snd_wnd = tiwin;
 			rack_validate_fo_sendwin_up(tp, rack);
 			tp->snd_wl1 = th->th_seq;
 			tp->snd_wl2 = th->th_ack;
 		}
 	}
 	if (tp->snd_wnd < ctf_outstanding(tp))
 		/* The peer collapsed the window */
 		rack_collapsed_window(rack, ctf_outstanding(tp), __LINE__);
 	else if (rack->rc_has_collapsed)
 		rack_un_collapse_window(rack, __LINE__);
 	/* Once the ack passes high_collapse_point, drop the saved point. */
 	if ((rack->r_collapse_point_valid) &&
 	    (SEQ_GT(th->th_ack, rack->r_ctl.high_collapse_point)))
 		rack->r_collapse_point_valid = 0;
 	/* Was persist timer active and now we have window space? */
 	if ((rack->rc_in_persist != 0) &&
 	    (tp->snd_wnd >= min((rack->r_ctl.rc_high_rwnd/2),
 				rack->r_ctl.rc_pace_min_segs))) {
 		rack_exit_persist(tp, rack, rack->r_ctl.rc_rcvtime);
 		tp->snd_nxt = tp->snd_max;
 		/* Make sure we output to start the timer */
 		rack->r_wanted_output = 1;
 	}
 	/* Do we enter persists? */
 	if ((rack->rc_in_persist == 0) &&
 	    (tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), rack->r_ctl.rc_pace_min_segs)) &&
 	    TCPS_HAVEESTABLISHED(tp->t_state) &&
 	    ((tp->snd_max == tp->snd_una) || rack->rc_has_collapsed) &&
 	    sbavail(&tp->t_inpcb->inp_socket->so_snd) &&
 	    (sbavail(&tp->t_inpcb->inp_socket->so_snd) > tp->snd_wnd)) {
 		/*
 		 * Here the rwnd is less than
 		 * the pacing size, we are established,
 		 * nothing is outstanding (or the window
 		 * has collapsed), and there is more data
 		 * to send than the window allows. Enter persists.
 		 */
 		rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime);
 	}
 	/* TF2_DROP_AF_DATA set: discard the segment, skip data processing. */
 	if (tp->t_flags2 & TF2_DROP_AF_DATA) {
 		m_freem(m);
 		return (0);
 	}
 	/*
 	 * We don't process the URG bit; just drag rcv_up
 	 * along with rcv_nxt.
 	 */
 	tp->rcv_up = tp->rcv_nxt;
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 
 	/*
 	 * Process the segment text, merging it into the TCP sequencing
 	 * queue, and arranging for acknowledgment of receipt if necessary.
 	 * This process logically involves adjusting tp->rcv_wnd as data is
 	 * presented to the user (this happens in tcp_usrreq.c, case
 	 * PRU_RCVD).  If a FIN has already been received on this connection
 	 * then we just ignore the text.
 	 */
 	tfo_syn = ((tp->t_state == TCPS_SYN_RECEIVED) &&
 		   IS_FASTOPEN(tp->t_flags));
 	if ((tlen || (thflags & TH_FIN) || (tfo_syn && tlen > 0)) &&
 	    TCPS_HAVERCVDFIN(tp->t_state) == 0) {
 		/* Saved for the SACK/DSACK bookkeeping further down. */
 		tcp_seq save_start = th->th_seq;
 		tcp_seq save_rnxt  = tp->rcv_nxt;
 		int     save_tlen  = tlen;
 
 		m_adj(m, drop_hdrlen);	/* delayed header drop */
 		/*
 		 * Insert segment which includes th into TCP reassembly
 		 * queue with control block tp.  Set thflags to whether
 		 * reassembly now includes a segment with FIN.  This handles
 		 * the common case inline (segment is the next to be
 		 * received on an established connection, and the queue is
 		 * empty), avoiding linkage into and removal from the queue
 		 * and repetition of various conversions. Set DELACK for
 		 * segments received in order, but ack immediately when
 		 * segments are out of order (so fast retransmit can work).
 		 */
 		if (th->th_seq == tp->rcv_nxt &&
 		    SEGQ_EMPTY(tp) &&
 		    (TCPS_HAVEESTABLISHED(tp->t_state) ||
 		    tfo_syn)) {
 #ifdef NETFLIX_SB_LIMITS
 			u_int mcnt, appended;
 
 			if (so->so_rcv.sb_shlim) {
 				mcnt = m_memcnt(m);
 				appended = 0;
 				/*
 				 * If mcnt cannot be obtained from the
 				 * limit, count the failure and drop the
 				 * segment.
 				 */
 				if (counter_fo_get(so->so_rcv.sb_shlim, mcnt,
 				    CFO_NOSLEEP, NULL) == false) {
 					counter_u64_add(tcp_sb_shlim_fails, 1);
 					m_freem(m);
 					return (0);
 				}
 			}
 #endif
 			rack_handle_delayed_ack(tp, rack, tlen, tfo_syn);
 			tp->rcv_nxt += tlen;
 			/*
 			 * Stamp t_fbyte_in with the current tick on the
 			 * first data seen; 0 means "not set", so a tick
 			 * value of 0 is stored as 1.
 			 */
 			if (tlen &&
 			    ((tp->t_flags2 & TF2_FBYTES_COMPLETE) == 0) &&
 			    (tp->t_fbyte_in == 0)) {
 				tp->t_fbyte_in = ticks;
 				if (tp->t_fbyte_in == 0)
 					tp->t_fbyte_in = 1;
 				if (tp->t_fbyte_out && tp->t_fbyte_in)
 					tp->t_flags2 |= TF2_FBYTES_COMPLETE;
 			}
 			/* From here on thflags only carries the FIN bit. */
 			thflags = tcp_get_flags(th) & TH_FIN;
 			KMOD_TCPSTAT_ADD(tcps_rcvpack, nsegs);
 			KMOD_TCPSTAT_ADD(tcps_rcvbyte, tlen);
 			SOCKBUF_LOCK(&so->so_rcv);
 			if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
 				/* Receive side is shut down, discard the data. */
 				m_freem(m);
 			} else
 #ifdef NETFLIX_SB_LIMITS
 				appended =
 #endif
 					sbappendstream_locked(&so->so_rcv, m, 0);
 
 			rack_log_wakeup(tp,rack, &so->so_rcv, tlen, 1);
 			/* NB: sorwakeup_locked() does an implicit unlock. */
 			sorwakeup_locked(so);
 #ifdef NETFLIX_SB_LIMITS
 			/* Give back whatever part of mcnt was not appended. */
 			if (so->so_rcv.sb_shlim && appended != mcnt)
 				counter_fo_release(so->so_rcv.sb_shlim,
 				    mcnt - appended);
 #endif
 		} else {
 			/*
 			 * XXX: Due to the header drop above "th" is
 			 * theoretically invalid by now.  Fortunately
 			 * m_adj() doesn't actually free any mbufs when
 			 * trimming from the head.
 			 */
 			tcp_seq temp = save_start;
 
 			/* Not the in-order fast case: reassemble and ack now. */
 			thflags = tcp_reass(tp, th, &temp, &tlen, m);
 			tp->t_flags |= TF_ACKNOW;
 			if (tp->t_flags & TF_WAKESOR) {
 				tp->t_flags &= ~TF_WAKESOR;
 				/* NB: sorwakeup_locked() does an implicit unlock. */
 				sorwakeup_locked(so);
 			}
 		}
 		/*
 		 * Update the SACK/DSACK blocks we will report, based on
 		 * the segment as it arrived (save_*) versus what is left
 		 * of it now (tlen, rcv_nxt).
 		 */
 		if ((tp->t_flags & TF_SACK_PERMIT) &&
 		    (save_tlen > 0) &&
 		    TCPS_HAVEESTABLISHED(tp->t_state)) {
 			if ((tlen == 0) && (SEQ_LT(save_start, save_rnxt))) {
 				/*
 				 * DSACK actually handled in the fastpath
 				 * above.
 				 */
 				RACK_OPTS_INC(tcp_sack_path_1);
 				tcp_update_sack_list(tp, save_start,
 				    save_start + save_tlen);
 			} else if ((tlen > 0) && SEQ_GT(tp->rcv_nxt, save_rnxt)) {
 				if ((tp->rcv_numsacks >= 1) &&
 				    (tp->sackblks[0].end == save_start)) {
 					/*
 					 * Partial overlap, recorded at todrop
 					 * above.
 					 */
 					RACK_OPTS_INC(tcp_sack_path_2a);
 					tcp_update_sack_list(tp,
 					    tp->sackblks[0].start,
 					    tp->sackblks[0].end);
 				} else {
 					RACK_OPTS_INC(tcp_sack_path_2b);
 					tcp_update_dsack_list(tp, save_start,
 					    save_start + save_tlen);
 				}
 			} else if (tlen >= save_tlen) {
 				/* Update of sackblks. */
 				RACK_OPTS_INC(tcp_sack_path_3);
 				tcp_update_dsack_list(tp, save_start,
 				    save_start + save_tlen);
 			} else if (tlen > 0) {
 				RACK_OPTS_INC(tcp_sack_path_4);
 				tcp_update_dsack_list(tp, save_start,
 				    save_start + tlen);
 			}
 		}
 	} else {
 		/* Nothing to queue (or a FIN was already received). */
 		m_freem(m);
 		thflags &= ~TH_FIN;
 	}
 
 	/*
 	 * If FIN is received ACK the FIN and let the user know that the
 	 * connection is closing.
 	 */
 	if (thflags & TH_FIN) {
 		if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
 			/* The socket upcall is handled by socantrcvmore. */
 			socantrcvmore(so);
 			/*
 			 * If connection is half-synchronized (ie NEEDSYN
 			 * flag on) then delay ACK, so it may be piggybacked
 			 * when SYN is sent. Otherwise, since we received a
 			 * FIN then no more input can be expected, send ACK
 			 * now.
 			 */
 			if (tp->t_flags & TF_NEEDSYN) {
 				rack_timer_cancel(tp, rack,
 				    rack->r_ctl.rc_rcvtime, __LINE__);
 				tp->t_flags |= TF_DELACK;
 			} else {
 				tp->t_flags |= TF_ACKNOW;
 			}
 			/* The FIN takes up one sequence number. */
 			tp->rcv_nxt++;
 		}
 		switch (tp->t_state) {
 			/*
 			 * In SYN_RECEIVED and ESTABLISHED STATES enter the
 			 * CLOSE_WAIT state.
 			 */
 		case TCPS_SYN_RECEIVED:
 			tp->t_starttime = ticks;
 			/* FALLTHROUGH */
 		case TCPS_ESTABLISHED:
 			rack_timer_cancel(tp, rack,
 			    rack->r_ctl.rc_rcvtime, __LINE__);
 			tcp_state_change(tp, TCPS_CLOSE_WAIT);
 			break;
 
 			/*
 			 * If still in FIN_WAIT_1 STATE FIN has not been
 			 * acked so enter the CLOSING state.
 			 */
 		case TCPS_FIN_WAIT_1:
 			rack_timer_cancel(tp, rack,
 			    rack->r_ctl.rc_rcvtime, __LINE__);
 			tcp_state_change(tp, TCPS_CLOSING);
 			break;
 
 			/*
 			 * In FIN_WAIT_2 state enter the TIME_WAIT state,
 			 * starting the time-wait timer, turning off the
 			 * other standard timers.
 			 */
 		case TCPS_FIN_WAIT_2:
 			rack_timer_cancel(tp, rack,
 			    rack->r_ctl.rc_rcvtime, __LINE__);
 			tcp_twstart(tp);
 			return (1);
 		}
 	}
 	/*
 	 * Return any desired output: an immediate ACK is pending or the
 	 * send buffer holds more than what is currently outstanding.
 	 */
 	if ((tp->t_flags & TF_ACKNOW) ||
 	    (sbavail(&so->so_snd) > (tp->snd_max - tp->snd_una))) {
 		rack->r_wanted_output = 1;
 	}
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 	return (0);
 }
 
 /*
  * Here nothing is really faster, its just that we
  * have broken out the fast-data path also just like
  * the fast-ack.
  *
  * Returns 0, leaving m untouched, when any of the eligibility
  * checks at the top fails.  Returns 1 once the mbuf has been
  * consumed: appended to so_rcv or freed.  nxt_pkt and iptos are
  * not referenced here.
  */
 static int
 rack_do_fastnewdata(struct mbuf *m, struct tcphdr *th, struct socket *so,
     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
     uint32_t tiwin, int32_t nxt_pkt, uint8_t iptos)
 {
 	int32_t nsegs;
 	int32_t newsize = 0;	/* automatic sockbuf scaling */
 	struct tcp_rack *rack;
 #ifdef NETFLIX_SB_LIMITS
 	u_int mcnt, appended;
 #endif
 #ifdef TCPDEBUG
 	/*
 	 * The size of tcp_saveipgen must be the size of the max ip header,
 	 * now IPv6.
 	 */
 	u_char tcp_saveipgen[IP6_HDR_LEN];
 	struct tcphdr tcp_savetcp;
 	short ostate = 0;
 
 #endif
 	/*
 	 * Eligibility checks.  Any failure returns 0 before the
 	 * connection state or the mbuf has been modified.
 	 */
 	/* Must be exactly the next expected sequence. */
 	if (__predict_false(th->th_seq != tp->rcv_nxt)) {
 		return (0);
 	}
 	/* snd_nxt must be at snd_max. */
 	if (__predict_false(tp->snd_nxt != tp->snd_max)) {
 		return (0);
 	}
 	/* A non-zero window that differs from snd_wnd is a window update. */
 	if (tiwin && tiwin != tp->snd_wnd) {
 		return (0);
 	}
 	if (__predict_false((tp->t_flags & (TF_NEEDSYN | TF_NEEDFIN)))) {
 		return (0);
 	}
 	/* Timestamp older than ts_recent. */
 	if (__predict_false((to->to_flags & TOF_TS) &&
 	    (TSTMP_LT(to->to_tsval, tp->ts_recent)))) {
 		return (0);
 	}
 	/* The segment must not acknowledge anything new. */
 	if (__predict_false((th->th_ack != tp->snd_una))) {
 		return (0);
 	}
 	/* The payload must fit into the receive buffer. */
 	if (__predict_false(tlen > sbspace(&so->so_rcv))) {
 		return (0);
 	}
 	/*
 	 * If last ACK falls within this segment's sequence numbers, record
 	 * the timestamp. NOTE that the test is modified according to the
 	 * latest proposal of the tcplw@cray.com list (Braden 1993/04/26).
 	 */
 	if ((to->to_flags & TOF_TS) != 0 &&
 	    SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
 		tp->ts_recent_age = tcp_ts_getticks();
 		tp->ts_recent = to->to_tsval;
 	}
 	rack = (struct tcp_rack *)tp->t_fb_ptr;
 	/*
 	 * This is a pure, in-sequence data packet with nothing on the
 	 * reassembly queue and we have enough buffer space to take it.
 	 */
 	nsegs = max(1, m->m_pkthdr.lro_nsegs);
 
 #ifdef NETFLIX_SB_LIMITS
 	if (so->so_rcv.sb_shlim) {
 		mcnt = m_memcnt(m);
 		appended = 0;
 		/*
 		 * If mcnt cannot be obtained from the limit, count the
 		 * failure and drop the segment; it is reported as handled.
 		 */
 		if (counter_fo_get(so->so_rcv.sb_shlim, mcnt,
 		    CFO_NOSLEEP, NULL) == false) {
 			counter_u64_add(tcp_sb_shlim_fails, 1);
 			m_freem(m);
 			return (1);
 		}
 	}
 #endif
 	/* Clean receiver SACK report if present */
 	if (tp->rcv_numsacks)
 		tcp_clean_sackreport(tp);
 	KMOD_TCPSTAT_INC(tcps_preddat);
 	tp->rcv_nxt += tlen;
 	/*
 	 * Stamp t_fbyte_in with the current tick on the first data seen;
 	 * 0 means "not set", so a tick value of 0 is stored as 1.
 	 */
 	if (tlen &&
 	    ((tp->t_flags2 & TF2_FBYTES_COMPLETE) == 0) &&
 	    (tp->t_fbyte_in == 0)) {
 		tp->t_fbyte_in = ticks;
 		if (tp->t_fbyte_in == 0)
 			tp->t_fbyte_in = 1;
 		if (tp->t_fbyte_out && tp->t_fbyte_in)
 			tp->t_flags2 |= TF2_FBYTES_COMPLETE;
 	}
 	/*
 	 * Pull snd_wl1 up to prevent seq wrap relative to th_seq.
 	 */
 	tp->snd_wl1 = th->th_seq;
 	/*
 	 * Pull rcv_up up to prevent seq wrap relative to rcv_nxt.
 	 */
 	tp->rcv_up = tp->rcv_nxt;
 	KMOD_TCPSTAT_ADD(tcps_rcvpack, nsegs);
 	KMOD_TCPSTAT_ADD(tcps_rcvbyte, tlen);
 #ifdef TCPDEBUG
 	if (so->so_options & SO_DEBUG)
 		tcp_trace(TA_INPUT, ostate, tp,
 		    (void *)tcp_saveipgen, &tcp_savetcp, 0);
 #endif
 	/* A non-zero result is the new size to reserve for so_rcv below. */
 	newsize = tcp_autorcvbuf(m, th, so, tp, tlen);
 
 	/* Add data to socket buffer. */
 	SOCKBUF_LOCK(&so->so_rcv);
 	if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
 		/* Receive side is shut down, discard the data. */
 		m_freem(m);
 	} else {
 		/*
 		 * Set new socket buffer size. Give up when limit is
 		 * reached.
 		 */
 		if (newsize)
 			if (!sbreserve_locked(so, SO_RCV, newsize, NULL))
 				so->so_rcv.sb_flags &= ~SB_AUTOSIZE;
 		m_adj(m, drop_hdrlen);	/* delayed header drop */
 #ifdef NETFLIX_SB_LIMITS
 		appended =
 #endif
 			sbappendstream_locked(&so->so_rcv, m, 0);
 		ctf_calc_rwin(so, tp);
 	}
 	rack_log_wakeup(tp,rack, &so->so_rcv, tlen, 1);
 	/* NB: sorwakeup_locked() does an implicit unlock. */
 	sorwakeup_locked(so);
 #ifdef NETFLIX_SB_LIMITS
 	/* Give back whatever part of mcnt was not appended. */
 	if (so->so_rcv.sb_shlim && mcnt != appended)
 		counter_fo_release(so->so_rcv.sb_shlim, mcnt - appended);
 #endif
 	rack_handle_delayed_ack(tp, rack, tlen, 0);
 	/* Nothing outstanding: reset the SACK filter at snd_una. */
 	if (tp->snd_una == tp->snd_max)
 		sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una);
 	return (1);
 }
 
 /*
  * This subfunction is used to try to highly optimize the
  * fast path. We again allow window updates that are
  * in sequence to remain in the fast-path. We also add
  * in the __predict's to attempt to help the compiler.
  * Note that if we return a 0, then we can *not* process
  * it and the caller should push the packet into the
  * slow-path.
  *
  * On a return of 0 neither m nor the connection state has been
  * modified.  On a return of 1 the ack has been fully processed and
  * m has been freed.  cts is the time passed to rack_exit_persist();
  * drop_hdrlen, tlen and nxt_pkt are not referenced here.
  */
 static int
 rack_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so,
     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
     uint32_t tiwin, int32_t nxt_pkt, uint32_t cts)
 {
 	int32_t acked;
 	int32_t nsegs;
 #ifdef TCPDEBUG
 	/*
 	 * The size of tcp_saveipgen must be the size of the max ip header,
 	 * now IPv6.
 	 */
 	u_char tcp_saveipgen[IP6_HDR_LEN];
 	struct tcphdr tcp_savetcp;
 	short ostate = 0;
 #endif
 	int32_t under_pacing = 0;
 	struct tcp_rack *rack;
 
 	if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) {
 		/* Old ack, behind (or duplicate to) the last one rcv'd */
 		return (0);
 	}
 	if (__predict_false(SEQ_GT(th->th_ack, tp->snd_max))) {
 		/* Above what we have sent? */
 		return (0);
 	}
 	if (__predict_false(tp->snd_nxt != tp->snd_max)) {
 		/* We are retransmitting */
 		return (0);
 	}
 	if (__predict_false(tiwin == 0)) {
 		/* zero window */
 		return (0);
 	}
 	if (__predict_false(tp->t_flags & (TF_NEEDSYN | TF_NEEDFIN))) {
 		/* We need a SYN or a FIN, unlikely.. */
 		return (0);
 	}
 	if ((to->to_flags & TOF_TS) && __predict_false(TSTMP_LT(to->to_tsval, tp->ts_recent))) {
 		/* Timestamp is behind .. old ack with seq wrap? */
 		return (0);
 	}
 	if (__predict_false(IN_RECOVERY(tp->t_flags))) {
 		/* Still recovering */
 		return (0);
 	}
 	rack = (struct tcp_rack *)tp->t_fb_ptr;
 	if (rack->r_ctl.rc_sacked) {
 		/* We have sack holes on our scoreboard */
 		return (0);
 	}
 	/* Ok if we reach here, we can process a fast-ack */
 	if (rack->gp_ready &&
 	    (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) {
 		/* Remembered for the bottom-drag check further down. */
 		under_pacing = 1;
 	}
 	nsegs = max(1, m->m_pkthdr.lro_nsegs);
 	rack_log_ack(tp, to, th, 0, 0);
 	/* Did the window get updated? */
 	if (tiwin != tp->snd_wnd) {
 		tp->snd_wnd = tiwin;
 		rack_validate_fo_sendwin_up(tp, rack);
 		tp->snd_wl1 = th->th_seq;
 		if (tp->snd_wnd > tp->max_sndwnd)
 			tp->max_sndwnd = tp->snd_wnd;
 	}
 	/* Do we exit persists? */
 	if ((rack->rc_in_persist != 0) &&
 	    (tp->snd_wnd >= min((rack->r_ctl.rc_high_rwnd/2),
 			       rack->r_ctl.rc_pace_min_segs))) {
 		rack_exit_persist(tp, rack, cts);
 	}
 	/* Do we enter persists? */
 	if ((rack->rc_in_persist == 0) &&
 	    (tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), rack->r_ctl.rc_pace_min_segs)) &&
 	    TCPS_HAVEESTABLISHED(tp->t_state) &&
 	    ((tp->snd_max == tp->snd_una) || rack->rc_has_collapsed) &&
 	    sbavail(&tp->t_inpcb->inp_socket->so_snd) &&
 	    (sbavail(&tp->t_inpcb->inp_socket->so_snd) > tp->snd_wnd)) {
 		/*
 		 * Here the rwnd is less than
 		 * the pacing size, we are established,
 		 * nothing is outstanding (or the window
 		 * has collapsed), and there is more data
 		 * to send than the window allows. Enter persists.
 		 */
 		rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime);
 	}
 	/*
 	 * If last ACK falls within this segment's sequence numbers, record
 	 * the timestamp. NOTE that the test is modified according to the
 	 * latest proposal of the tcplw@cray.com list (Braden 1993/04/26).
 	 */
 	if ((to->to_flags & TOF_TS) != 0 &&
 	    SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
 		tp->ts_recent_age = tcp_ts_getticks();
 		tp->ts_recent = to->to_tsval;
 	}
 	/*
 	 * This is a pure ack for outstanding data.
 	 */
 	KMOD_TCPSTAT_INC(tcps_predack);
 
 	/*
 	 * "bad retransmit" recovery.
 	 */
 	if ((tp->t_flags & TF_PREVVALID) &&
 	    ((tp->t_flags & TF_RCVD_TSTMP) == 0)) {
 		tp->t_flags &= ~TF_PREVVALID;
 		if (tp->t_rxtshift == 1 &&
 		    (int)(ticks - tp->t_badrxtwin) < 0)
 			rack_cong_signal(tp, CC_RTO_ERR, th->th_ack, __LINE__);
 	}
 	/*
 	 * Recalculate the transmit timer / rtt.
 	 *
 	 * Some boxes send broken timestamp replies during the SYN+ACK
 	 * phase, ignore timestamps of 0 or we could calculate a huge RTT
 	 * and blow up the retransmit timer.
 	 */
 	acked = BYTES_THIS_ACK(tp, th);
 
 #ifdef TCP_HHOOK
 	/* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */
 	hhook_run_tcp_est_in(tp, th, to);
 #endif
 	KMOD_TCPSTAT_ADD(tcps_rcvackpack, nsegs);
 	KMOD_TCPSTAT_ADD(tcps_rcvackbyte, acked);
 	if (acked) {
 		struct mbuf *mfree;
 
 		rack_ack_received(tp, rack, th->th_ack, nsegs, CC_ACK, 0);
 		SOCKBUF_LOCK(&so->so_snd);
 		/* Cut the acked bytes out now, free them after the wakeup. */
 		mfree = sbcut_locked(&so->so_snd, acked);
 		tp->snd_una = th->th_ack;
 		/* Note we want to hold the sb lock through the sendmap adjust */
 		rack_adjust_sendmap(rack, &so->so_snd, tp->snd_una);
 		/* Wake up the socket if we have room to write more */
 		rack_log_wakeup(tp,rack, &so->so_snd, acked, 2);
 		sowwakeup_locked(so);
 		m_freem(mfree);
 		tp->t_rxtshift = 0;
 		RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp),
 			      rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop);
 		rack->rc_tlp_in_progress = 0;
 		rack->r_ctl.rc_tlp_cnt_out = 0;
 		/*
 		 * If it is the RXT timer we want to
 		 * stop it, so we can restart a TLP.
 		 */
 		if (rack->r_ctl.rc_hpts_flags & PACE_TMR_RXT)
 			rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
 #ifdef NETFLIX_HTTP_LOGGING
 		tcp_http_check_for_comp(rack->rc_tp, th->th_ack);
 #endif
 	}
 	/*
 	 * With snd_una and snd_wnd updated, re-evaluate whether the
 	 * peer's window still covers what is outstanding.
 	 */
 	if (tp->snd_wnd < ctf_outstanding(tp)) {
 		/* The peer collapsed the window */
 		rack_collapsed_window(rack, ctf_outstanding(tp), __LINE__);
 	} else if (rack->rc_has_collapsed)
 		rack_un_collapse_window(rack, __LINE__);
 	/* Once snd_una passes high_collapse_point, drop the saved point. */
 	if ((rack->r_collapse_point_valid) &&
 	    (SEQ_GT(tp->snd_una, rack->r_ctl.high_collapse_point)))
 		rack->r_collapse_point_valid = 0;
 	/*
 	 * Pull snd_wl2 up to prevent seq wrap relative to th_ack.
 	 */
 	tp->snd_wl2 = th->th_ack;
 	tp->t_dupacks = 0;
 	m_freem(m);
 	/* ND6_HINT(tp);	 *//* Some progress has been made. */
 
 	/*
 	 * If all outstanding data are acked, stop retransmit timer,
 	 * otherwise restart timer using current (possibly backed-off)
 	 * value. If process is waiting for space, wakeup/selwakeup/signal.
 	 * If data are ready to send, let tcp_output decide between more
 	 * output or persist.
 	 */
 #ifdef TCPDEBUG
 	if (so->so_options & SO_DEBUG)
 		tcp_trace(TA_INPUT, ostate, tp,
 		    (void *)tcp_saveipgen,
 		    &tcp_savetcp, 0);
 #endif
 	if (under_pacing &&
 	    (rack->use_fixed_rate == 0) &&
 	    (rack->in_probe_rtt == 0) &&
 	    rack->rc_gp_dyn_mul &&
 	    rack->rc_always_pace) {
 		/* Check if we are dragging bottom */
 		rack_check_bottom_drag(tp, rack, so, acked);
 	}
 	if (tp->snd_una == tp->snd_max) {
 		/* Everything sent has been acked. */
 		tp->t_flags &= ~TF_PREVVALID;
 		rack->r_ctl.retran_during_recovery = 0;
 		rack->r_ctl.dsack_byte_cnt = 0;
 		/* Note when we went idle; never store 0. */
 		rack->r_ctl.rc_went_idle_time = tcp_get_usecs(NULL);
 		if (rack->r_ctl.rc_went_idle_time == 0)
 			rack->r_ctl.rc_went_idle_time = 1;
 		rack_log_progress_event(rack, tp, 0, PROGRESS_CLEAR, __LINE__);
 		if (sbavail(&tp->t_inpcb->inp_socket->so_snd) == 0)
 			tp->t_acktime = 0;
 		rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
 	}
 	if (acked && rack->r_fast_output)
 		rack_gain_for_fastoutput(rack, tp, so, (uint32_t)acked);
 	/* Data is still waiting in the send buffer, ask for output. */
 	if (sbavail(&so->so_snd)) {
 		rack->r_wanted_output = 1;
 	}
 	return (1);
 }
 
 /*
  * Input processing for a connection in the SYN_SENT state.
  *
  * Segments with an unacceptable ACK are answered with a reset,
  * RST+ACK drops the connection with ECONNREFUSED, and a bare RST or
  * a segment without SYN is dropped.  An acceptable SYN|ACK moves the
  * connection to ESTABLISHED (or FIN_WAIT_1 when TF_NEEDFIN is set);
  * a SYN without ACK is a simultaneous open and moves it to
  * SYN_RECEIVED.  Ack and data processing then continue through
  * rack_process_ack() and rack_process_data().
  *
  * Return value of 1, the TCB is unlocked and most
  * likely gone, return value of 0, the TCP is still
  * locked.
  */
 static int
 rack_do_syn_sent(struct mbuf *m, struct tcphdr *th, struct socket *so,
     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
 {
 	int32_t ret_val = 0;
 	int32_t todrop;
 	int32_t ourfinisacked = 0;
 	struct tcp_rack *rack;
 
 	ctf_calc_rwin(so, tp);
 	/*
 	 * If the state is SYN_SENT: if seg contains an ACK, but not for our
 	 * SYN, drop the input. if seg contains a RST, then drop the
 	 * connection. if seg does not contain SYN, then drop it. Otherwise
 	 * this is an acceptable SYN segment initialize tp->rcv_nxt and
 	 * tp->irs if seg contains ack then advance tp->snd_una if seg
 	 * contains an ECE and ECN support is enabled, the stream is ECN
 	 * capable. if SYN has been acked change to ESTABLISHED else
 	 * SYN_RCVD state arrange for segment to be acked (eventually)
 	 * continue processing rest of data/controls.
 	 */
 	if ((thflags & TH_ACK) &&
 	    (SEQ_LEQ(th->th_ack, tp->iss) ||
 	    SEQ_GT(th->th_ack, tp->snd_max))) {
 		/* The ACK is outside (iss, snd_max]: answer with a reset. */
 		tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
 		ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
 		return (1);
 	}
 	if ((thflags & (TH_ACK | TH_RST)) == (TH_ACK | TH_RST)) {
 		/* RST with an acceptable ACK: the connection was refused. */
 		TCP_PROBE5(connect__refused, NULL, tp,
 		    mtod(m, const char *), tp, th);
 		tp = tcp_drop(tp, ECONNREFUSED);
 		ctf_do_drop(m, tp);
 		return (1);
 	}
 	if (thflags & TH_RST) {
 		/* RST without ACK: just drop the segment. */
 		ctf_do_drop(m, tp);
 		return (1);
 	}
 	if (!(thflags & TH_SYN)) {
 		/* No SYN: just drop the segment. */
 		ctf_do_drop(m, tp);
 		return (1);
 	}
 	/* Acceptable SYN: record the peer's initial sequence number. */
 	tp->irs = th->th_seq;
 	tcp_rcvseqinit(tp);
 	rack = (struct tcp_rack *)tp->t_fb_ptr;
 	if (thflags & TH_ACK) {
 		int tfo_partial = 0;
 
 		KMOD_TCPSTAT_INC(tcps_connects);
 		soisconnected(so);
 #ifdef MAC
 		mac_socketpeer_set_from_mbuf(m, so);
 #endif
 		/* Do window scaling on this connection? */
 		if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) ==
 		    (TF_RCVD_SCALE | TF_REQ_SCALE)) {
 			tp->rcv_scale = tp->request_r_scale;
 		}
 		tp->rcv_adv += min(tp->rcv_wnd,
 		    TCP_MAXWIN << tp->rcv_scale);
 		/*
 		 * If not all the data that was sent in the TFO SYN
 		 * has been acked, resend the remainder right away.
 		 */
 		if (IS_FASTOPEN(tp->t_flags) &&
 		    (tp->snd_una != tp->snd_max)) {
 			tp->snd_nxt = th->th_ack;
 			tfo_partial = 1;
 		}
 		/*
 		 * If there's data, delay ACK; if there's also a FIN ACKNOW
 		 * will be turned on later.
 		 */
 		if (DELAY_ACK(tp, tlen) && tlen != 0 && !tfo_partial) {
 			rack_timer_cancel(tp, rack,
 					  rack->r_ctl.rc_rcvtime, __LINE__);
 			tp->t_flags |= TF_DELACK;
 		} else {
 			rack->r_wanted_output = 1;
 			tp->t_flags |= TF_ACKNOW;
 			rack->rc_dack_toggle = 0;
 		}
 
 		tcp_ecn_input_syn_sent(tp, thflags, iptos);
 
 		if (SEQ_GT(th->th_ack, tp->snd_una)) {
 			/*
 			 * We advance snd_una for the
 			 * fast open case. If th_ack is
 			 * acknowledging data beyond
 			 * snd_una we can't just call
 			 * ack-processing since the
 			 * data stream in our send-map
 			 * will start at snd_una + 1 (one
 			 * beyond the SYN). If its just
 			 * equal we don't need to do that
 			 * and there is no send_map.
 			 */
 			tp->snd_una++;
 		}
 		/*
 		 * Received <SYN,ACK> in SYN_SENT[*] state. Transitions:
 		 * SYN_SENT  --> ESTABLISHED SYN_SENT* --> FIN_WAIT_1
 		 */
 		tp->t_starttime = ticks;
 		if (tp->t_flags & TF_NEEDFIN) {
 			tcp_state_change(tp, TCPS_FIN_WAIT_1);
 			tp->t_flags &= ~TF_NEEDFIN;
 			thflags &= ~TH_SYN;
 		} else {
 			tcp_state_change(tp, TCPS_ESTABLISHED);
 			TCP_PROBE5(connect__established, NULL, tp,
 			    mtod(m, const char *), tp, th);
 			rack_cc_conn_init(tp);
 		}
 	} else {
 		/*
 		 * Received initial SYN in SYN-SENT[*] state => simultaneous
 		 * open.  If segment contains CC option and there is a
 		 * cached CC, apply TAO test. If it succeeds, connection is *
 		 * half-synchronized. Otherwise, do 3-way handshake:
 		 * SYN-SENT -> SYN-RECEIVED SYN-SENT* -> SYN-RECEIVED* If
 		 * there was no CC option, clear cached CC value.
 		 */
 		tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN | TF_SONOTCONN);
 		tcp_state_change(tp, TCPS_SYN_RECEIVED);
 	}
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 	/*
 	 * Advance th->th_seq to correspond to first data byte. If data,
 	 * trim to stay within window, dropping FIN if necessary.
 	 */
 	th->th_seq++;
 	if (tlen > tp->rcv_wnd) {
 		todrop = tlen - tp->rcv_wnd;
 		/* A negative length trims from the tail of the chain. */
 		m_adj(m, -todrop);
 		tlen = tp->rcv_wnd;
 		thflags &= ~TH_FIN;
 		KMOD_TCPSTAT_INC(tcps_rcvpackafterwin);
 		KMOD_TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop);
 	}
 	tp->snd_wl1 = th->th_seq - 1;
 	tp->rcv_up = th->th_seq;
 	/*
 	 * Client side of transaction: already sent SYN and data. If the
 	 * remote host used T/TCP to validate the SYN, our data will be
 	 * ACK'd; if so, enter normal data segment processing in the middle
 	 * of step 5, ack processing. Otherwise, goto step 6.
 	 */
 	if (thflags & TH_ACK) {
 		/* For syn-sent we need to possibly update the rtt */
 		if ((to->to_flags & TOF_TS) != 0 && to->to_tsecr) {
 			uint32_t t, mcts;
 
 			/*
 			 * t is the time since the echoed timestamp,
 			 * scaled by HPTS_USEC_IN_MSEC.
 			 */
 			mcts = tcp_ts_getticks();
 			t = (mcts - to->to_tsecr) * HPTS_USEC_IN_MSEC;
 			if (!tp->t_rttlow || tp->t_rttlow > t)
 				tp->t_rttlow = t;
 			rack_log_rtt_sample_calc(rack, t, (to->to_tsecr * 1000), (mcts * 1000), 4);
 			tcp_rack_xmit_timer(rack, t + 1, 1, t, 0, NULL, 2);
 			tcp_rack_xmit_timer_commit(rack, tp);
 		}
 		/* A non-zero return means ret_val is the answer to hand back. */
 		if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val))
 			return (ret_val);
 		/* We may have changed to FIN_WAIT_1 above */
 		if (tp->t_state == TCPS_FIN_WAIT_1) {
 			/*
 			 * In FIN_WAIT_1 STATE in addition to the processing
 			 * for the ESTABLISHED state if our FIN is now
 			 * acknowledged then enter FIN_WAIT_2.
 			 */
 			if (ourfinisacked) {
 				/*
 				 * If we can't receive any more data, then
 				 * closing user can proceed. Starting the
 				 * timer is contrary to the specification,
 				 * but if we don't get a FIN we'll hang
 				 * forever.
 				 *
 				 * XXXjl: we should release the tp also, and
 				 * use a compressed state.
 				 */
 				if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
 					soisdisconnected(so);
 					tcp_timer_activate(tp, TT_2MSL,
 					    (tcp_fast_finwait2_recycle ?
 					    tcp_finwait2_timeout :
 					    TP_MAXIDLE(tp)));
 				}
 				tcp_state_change(tp, TCPS_FIN_WAIT_2);
 			}
 		}
 	}
 	/* Window update, data and FIN handling are shared with other states. */
 	return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
 	   tiwin, thflags, nxt_pkt));
 }
 
 /*
  * Input handler for a connection in SYN_RECEIVED.
  *
  * The segment is validated (ACK range, TFO restrictions, RST, PAWS,
  * IRS and the common drop checks).  A segment without the ACK bit is
  * handed straight to rack_process_data().  A segment with the ACK bit
  * completes the handshake: the state moves to ESTABLISHED, or to
  * FIN_WAIT_1 when TF_NEEDFIN is set, and then ACK processing and data
  * processing run.
  *
  * Return value of 1, the TCB is unlocked and most
  * likely gone, return value of 0, the TCB is still
  * locked.
  */
 static int
 rack_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so,
     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
 {
 	struct tcp_rack *rack;
 	int32_t ret_val = 0;
 	int32_t ourfinisacked = 0;
 
 	ctf_calc_rwin(so, tp);
 	/*
 	 * An ACK outside (snd_una, snd_max] does not acknowledge anything
 	 * we sent on this connection: answer with a reset.
 	 */
 	if ((thflags & TH_ACK) &&
 	    (SEQ_LEQ(th->th_ack, tp->snd_una) ||
 	    SEQ_GT(th->th_ack, tp->snd_max))) {
 		tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
 		ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
 		return (1);
 	}
 	rack = (struct tcp_rack *)tp->t_fb_ptr;
 	if (IS_FASTOPEN(tp->t_flags)) {
 		/*
 		 * When a TFO connection is in SYN_RECEIVED, the
 		 * only valid packets are the initial SYN, a
 		 * retransmit/copy of the initial SYN (possibly with
 		 * a subset of the original data), a valid ACK, a
 		 * FIN, or a RST.
 		 */
 		if ((thflags & (TH_SYN | TH_ACK)) == (TH_SYN | TH_ACK)) {
 			tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
 			ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
 			return (1);
 		} else if (thflags & TH_SYN) {
 			/*
 			 * non-initial SYN is ignored while a RXT, TLP or
 			 * RACK pacer timer is pending
 			 */
 			if ((rack->r_ctl.rc_hpts_flags & PACE_TMR_RXT) ||
 			    (rack->r_ctl.rc_hpts_flags & PACE_TMR_TLP) ||
 			    (rack->r_ctl.rc_hpts_flags & PACE_TMR_RACK)) {
 				ctf_do_drop(m, NULL);
 				return (0);
 			}
 		} else if (!(thflags & (TH_ACK | TH_FIN | TH_RST))) {
 			/* None of ACK, FIN or RST: not a valid segment here. */
 			ctf_do_drop(m, NULL);
 			return (0);
 		}
 	}
 
 	/* A RST, or a FIN when t_fin_is_rst is set, goes to RST processing. */
 	if ((thflags & TH_RST) ||
 	    (tp->t_fin_is_rst && (thflags & TH_FIN)))
 		return (__ctf_process_rst(m, th, so, tp,
 					  &rack->r_ctl.challenge_ack_ts,
 					  &rack->r_ctl.challenge_ack_cnt));
 	/*
 	 * RFC 1323 PAWS: If we have a timestamp reply on this segment and
 	 * it's less than ts_recent, drop it.
 	 */
 	if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
 	    TSTMP_LT(to->to_tsval, tp->ts_recent)) {
 		if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val))
 			return (ret_val);
 	}
 	/*
 	 * In the SYN-RECEIVED state, validate that the packet belongs to
 	 * this connection before trimming the data to fit the receive
 	 * window.  Check the sequence number versus IRS since we know the
 	 * sequence numbers haven't wrapped.  This is a partial fix for the
 	 * "LAND" DoS attack.
 	 */
 	if (SEQ_LT(th->th_seq, tp->irs)) {
 		tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
 		ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
 		return (1);
 	}
 	/* tlen, thflags and drop_hdrlen are passed by pointer and may be adjusted. */
 	if (_ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val,
 			      &rack->r_ctl.challenge_ack_ts,
 			      &rack->r_ctl.challenge_ack_cnt)) {
 		return (ret_val);
 	}
 	/*
 	 * If last ACK falls within this segment's sequence numbers, record
 	 * its timestamp. NOTE: 1) That the test incorporates suggestions
 	 * from the latest proposal of the tcplw@cray.com list (Braden
 	 * 1993/04/26). 2) That updating only on newer timestamps interferes
 	 * with our earlier PAWS tests, so this check should be solely
 	 * predicated on the sequence space of this segment. 3) That we
 	 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
 	 * + SEG.Len  instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
 	 * SEG.Len, This modified check allows us to overcome RFC1323's
 	 * limitations as described in Stevens TCP/IP Illustrated Vol. 2
 	 * p.869. In such cases, we can still calculate the RTT correctly
 	 * when RCV.NXT == Last.ACK.Sent.
 	 */
 	if ((to->to_flags & TOF_TS) != 0 &&
 	    SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
 	    SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
 	    ((thflags & (TH_SYN | TH_FIN)) != 0))) {
 		tp->ts_recent_age = tcp_ts_getticks();
 		tp->ts_recent = to->to_tsval;
 	}
 	tp->snd_wnd = tiwin;
 	rack_validate_fo_sendwin_up(tp, rack);
 	/*
 	 * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN flag
 	 * is on (half-synchronized state), then queue data for later
 	 * processing; else drop segment and return.
 	 */
 	if ((thflags & TH_ACK) == 0) {
 		/*
 		 * TFO connections get their congestion control set up
 		 * here, during SYN processing, instead of at the
 		 * ESTABLISHED transition further down.
 		 */
 		if (IS_FASTOPEN(tp->t_flags)) {
 			rack_cc_conn_init(tp);
 		}
 		return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
 		    tiwin, thflags, nxt_pkt));
 	}
 	KMOD_TCPSTAT_INC(tcps_connects);
 	/* Mark the socket connected exactly once. */
 	if (tp->t_flags & TF_SONOTCONN) {
 		tp->t_flags &= ~TF_SONOTCONN;
 		soisconnected(so);
 	}
 	/* Do window scaling? */
 	if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) ==
 	    (TF_RCVD_SCALE | TF_REQ_SCALE)) {
 		tp->rcv_scale = tp->request_r_scale;
 	}
 	/*
 	 * Make transitions: SYN-RECEIVED  -> ESTABLISHED SYN-RECEIVED* ->
 	 * FIN-WAIT-1
 	 */
 	tp->t_starttime = ticks;
 	/* Release the pending-TFO counter reference, if one is held. */
 	if (IS_FASTOPEN(tp->t_flags) && tp->t_tfo_pending) {
 		tcp_fastopen_decrement_counter(tp->t_tfo_pending);
 		tp->t_tfo_pending = NULL;
 	}
 	if (tp->t_flags & TF_NEEDFIN) {
 		tcp_state_change(tp, TCPS_FIN_WAIT_1);
 		tp->t_flags &= ~TF_NEEDFIN;
 	} else {
 		tcp_state_change(tp, TCPS_ESTABLISHED);
 		TCP_PROBE5(accept__established, NULL, tp,
 		    mtod(m, const char *), tp, th);
 		/*
 		 * TFO connections call cc_conn_init() during SYN
 		 * processing.  Calling it again here for such connections
 		 * is not harmless as it would undo the snd_cwnd reduction
 		 * that occurs when a TFO SYN|ACK is retransmitted.
 		 */
 		if (!IS_FASTOPEN(tp->t_flags))
 			rack_cc_conn_init(tp);
 	}
 	/*
 	 * Account for the ACK of our SYN prior to
 	 * regular ACK processing below, except for
 	 * simultaneous SYN, which is handled later.
 	 */
 	if (SEQ_GT(th->th_ack, tp->snd_una) && !(tp->t_flags & TF_NEEDSYN))
 		tp->snd_una++;
 	/*
 	 * If segment contains data or ACK, will call tcp_reass() later; if
 	 * not, do so now to pass queued data to user.
 	 */
 	if (tlen == 0 && (thflags & TH_FIN) == 0) {
 		(void) tcp_reass(tp, (struct tcphdr *)0, NULL, 0,
 		    (struct mbuf *)0);
 		if (tp->t_flags & TF_WAKESOR) {
 			tp->t_flags &= ~TF_WAKESOR;
 			/* NB: sorwakeup_locked() does an implicit unlock. */
 			sorwakeup_locked(so);
 		}
 	}
 	tp->snd_wl1 = th->th_seq - 1;
 	/* For syn-recv we need to possibly update the rtt */
 	if ((to->to_flags & TOF_TS) != 0 && to->to_tsecr) {
 		uint32_t t, mcts;
 
 		/*
 		 * RTT sample from the echoed timestamp: current timestamp
 		 * clock minus to_tsecr, scaled by HPTS_USEC_IN_MSEC
 		 * (presumably milliseconds to microseconds -- confirm).
 		 */
 		mcts = tcp_ts_getticks();
 		t = (mcts - to->to_tsecr) * HPTS_USEC_IN_MSEC;
 		/* Track the lowest RTT seen so far (0 means "not set yet"). */
 		if (!tp->t_rttlow || tp->t_rttlow > t)
 			tp->t_rttlow = t;
 		rack_log_rtt_sample_calc(rack, t, (to->to_tsecr * 1000), (mcts * 1000), 5);
 		tcp_rack_xmit_timer(rack, t + 1, 1, t, 0, NULL, 2);
 		tcp_rack_xmit_timer_commit(rack, tp);
 	}
 	if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) {
 		return (ret_val);
 	}
 	if (tp->t_state == TCPS_FIN_WAIT_1) {
 		/* We could have gone to FIN_WAIT_1 (or EST) above */
 		/*
 		 * In FIN_WAIT_1 STATE in addition to the processing for the
 		 * ESTABLISHED state if our FIN is now acknowledged then
 		 * enter FIN_WAIT_2.
 		 */
 		if (ourfinisacked) {
 			/*
 			 * If we can't receive any more data, then closing
 			 * user can proceed. Starting the timer is contrary
 			 * to the specification, but if we don't get a FIN
 			 * we'll hang forever.
 			 *
 			 * XXXjl: we should release the tp also, and use a
 			 * compressed state.
 			 */
 			if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
 				soisdisconnected(so);
 				tcp_timer_activate(tp, TT_2MSL,
 				    (tcp_fast_finwait2_recycle ?
 				    tcp_finwait2_timeout :
 				    TP_MAXIDLE(tp)));
 			}
 			tcp_state_change(tp, TCPS_FIN_WAIT_2);
 		}
 	}
 	return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
 	    tiwin, thflags, nxt_pkt));
 }
 
 /*
  * Input handler for a connection in ESTABLISHED.
  *
  * Tries the two header-prediction fast paths first (rack_fastack() for
  * a pure ACK, rack_do_fastnewdata() for in-order data).  If neither
  * takes the segment, it goes through the RST, SYN, PAWS and common drop
  * checks, then ACK processing and rack_process_data().
  *
  * Return value of 1, the TCB is unlocked and most
  * likely gone, return value of 0, the TCB is still
  * locked.
  */
 static int
 rack_do_established(struct mbuf *m, struct tcphdr *th, struct socket *so,
     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
 {
 	int32_t ret_val = 0;
 	struct tcp_rack *rack;
 
 	/*
 	 * Header prediction: check for the two common cases of a
 	 * uni-directional data xfer.  If the packet has no control flags,
 	 * is in-sequence, the window didn't change and we're not
 	 * retransmitting, it's a candidate.  If the length is zero and the
 	 * ack moved forward, we're the sender side of the xfer.  Just free
 	 * the data acked & wake any higher level process that was blocked
 	 * waiting for space.  If the length is non-zero and the ack didn't
 	 * move, we're the receiver side.  If we're getting packets in-order
 	 * (the reassembly queue is empty), add the data to the socket
 	 * buffer and note that we need a delayed ack. Make sure that the
 	 * hidden state-flags are also off. Since we check for
 	 * TCPS_ESTABLISHED first, it can only be TF_NEEDSYN.
 	 */
 	rack = (struct tcp_rack *)tp->t_fb_ptr;
 	/*
 	 * Candidate for a fast path: no SACK option, ACK is the only one
 	 * of SYN/FIN/RST/ACK set, empty reassembly queue and the segment
 	 * starts exactly at rcv_nxt.
 	 */
 	if (__predict_true(((to->to_flags & TOF_SACK) == 0)) &&
 	    __predict_true((thflags & (TH_SYN | TH_FIN | TH_RST | TH_ACK)) == TH_ACK) &&
 	    __predict_true(SEGQ_EMPTY(tp)) &&
 	    __predict_true(th->th_seq == tp->rcv_nxt)) {
 		/*
 		 * A non-zero return from either helper means the segment
 		 * was fully handled; otherwise fall through to the slow
 		 * path below.
 		 */
 		if (tlen == 0) {
 			if (rack_fastack(m, th, so, tp, to, drop_hdrlen, tlen,
 			    tiwin, nxt_pkt, rack->r_ctl.rc_rcvtime)) {
 				return (0);
 			}
 		} else {
 			if (rack_do_fastnewdata(m, th, so, tp, to, drop_hdrlen, tlen,
 			    tiwin, nxt_pkt, iptos)) {
 				return (0);
 			}
 		}
 	}
 	ctf_calc_rwin(so, tp);
 
 	/* A RST, or a FIN when t_fin_is_rst is set, goes to RST processing. */
 	if ((thflags & TH_RST) ||
 	    (tp->t_fin_is_rst && (thflags & TH_FIN)))
 		return (__ctf_process_rst(m, th, so, tp,
 					  &rack->r_ctl.challenge_ack_ts,
 					  &rack->r_ctl.challenge_ack_cnt));
 
 	/*
 	 * RFC5961 Section 4.2 Send challenge ACK for any SYN in
 	 * synchronized state.
 	 */
 	if (thflags & TH_SYN) {
 		ctf_challenge_ack(m, th, tp, &ret_val);
 		return (ret_val);
 	}
 	/*
 	 * RFC 1323 PAWS: If we have a timestamp reply on this segment and
 	 * it's less than ts_recent, drop it.
 	 */
 	if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
 	    TSTMP_LT(to->to_tsval, tp->ts_recent)) {
 		if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val))
 			return (ret_val);
 	}
 	/* tlen, thflags and drop_hdrlen are passed by pointer and may be adjusted. */
 	if (_ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val,
 			      &rack->r_ctl.challenge_ack_ts,
 			      &rack->r_ctl.challenge_ack_cnt)) {
 		return (ret_val);
 	}
 	/*
 	 * If last ACK falls within this segment's sequence numbers, record
 	 * its timestamp. NOTE: 1) That the test incorporates suggestions
 	 * from the latest proposal of the tcplw@cray.com list (Braden
 	 * 1993/04/26). 2) That updating only on newer timestamps interferes
 	 * with our earlier PAWS tests, so this check should be solely
 	 * predicated on the sequence space of this segment. 3) That we
 	 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
 	 * + SEG.Len  instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
 	 * SEG.Len, This modified check allows us to overcome RFC1323's
 	 * limitations as described in Stevens TCP/IP Illustrated Vol. 2
 	 * p.869. In such cases, we can still calculate the RTT correctly
 	 * when RCV.NXT == Last.ACK.Sent.
 	 */
 	if ((to->to_flags & TOF_TS) != 0 &&
 	    SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
 	    SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
 	    ((thflags & (TH_SYN | TH_FIN)) != 0))) {
 		tp->ts_recent_age = tcp_ts_getticks();
 		tp->ts_recent = to->to_tsval;
 	}
 	/*
 	 * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN flag
 	 * is on (half-synchronized state), then queue data for later
 	 * processing; else drop segment and return.
 	 */
 	if ((thflags & TH_ACK) == 0) {
 		if (tp->t_flags & TF_NEEDSYN) {
 			return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
 			    tiwin, thflags, nxt_pkt));
 
 		} else if (tp->t_flags & TF_ACKNOW) {
 			/* Drop the segment and ask for output to be run. */
 			ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
 			((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1;
 			return (ret_val);
 		} else {
 			ctf_do_drop(m, NULL);
 			return (0);
 		}
 	}
 	/*
 	 * Ack processing.
 	 */
 	if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, NULL, thflags, &ret_val)) {
 		return (ret_val);
 	}
 	/*
 	 * With data still queued in the send buffer, reset the connection
 	 * if the progress timeout check reports a failure.
 	 */
 	if (sbavail(&so->so_snd)) {
 		if (ctf_progress_timeout_check(tp, true)) {
 			/*
 			 * NOTE(review): this passes `tick`, which is not a
 			 * local here, while rack_do_syn_recv() reads the
 			 * clock as `ticks` -- confirm `tick` is intended.
 			 */
 			rack_log_progress_event(rack, tp, tick, PROGRESS_DROP, __LINE__);
 			ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
 			return (1);
 		}
 	}
 	/* State changes only happen in rack_process_data() */
 	return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
 	    tiwin, thflags, nxt_pkt));
 }
 
 /*
  * Input handler for a connection in CLOSE_WAIT.
  *
  * The segment is run through the RST, SYN (challenge ACK), PAWS and
  * common drop checks, its timestamp is recorded when applicable, and
  * it is then given to ACK processing followed by rack_process_data().
  *
  * Returns 1 when the TCB has been unlocked and is most likely gone,
  * 0 when the TCB is still locked.
  */
 static int
 rack_do_close_wait(struct mbuf *m, struct tcphdr *th, struct socket *so,
     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
 {
 	struct tcp_rack *rack = (struct tcp_rack *)tp->t_fb_ptr;
 	int32_t ret_val = 0;
 
 	ctf_calc_rwin(so, tp);
 
 	/* A RST, or a FIN when t_fin_is_rst is set, goes to RST processing. */
 	if ((thflags & TH_RST) ||
 	    (tp->t_fin_is_rst && (thflags & TH_FIN))) {
 		return (__ctf_process_rst(m, th, so, tp,
 		    &rack->r_ctl.challenge_ack_ts,
 		    &rack->r_ctl.challenge_ack_cnt));
 	}
 
 	/* RFC5961 Section 4.2: any SYN in a synchronized state gets a challenge ACK. */
 	if (thflags & TH_SYN) {
 		ctf_challenge_ack(m, th, tp, &ret_val);
 		return (ret_val);
 	}
 
 	/*
 	 * RFC 1323 PAWS: a segment whose timestamp is older than ts_recent
 	 * is handed to ctf_ts_check(), which decides whether to drop it.
 	 */
 	if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
 	    TSTMP_LT(to->to_tsval, tp->ts_recent) &&
 	    ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) {
 		return (ret_val);
 	}
 
 	/* tlen, thflags and drop_hdrlen may be adjusted by the drop checks. */
 	if (_ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val,
 	    &rack->r_ctl.challenge_ack_ts,
 	    &rack->r_ctl.challenge_ack_cnt)) {
 		return (ret_val);
 	}
 
 	/*
 	 * Record the peer's timestamp when Last.ACK.Sent lies within this
 	 * segment.  The upper bound is inclusive (Last.ACK.Sent <=
 	 * SEG.SEQ + SEG.Len) rather than RFC1323's strict "<", so the RTT
 	 * can still be computed when RCV.NXT == Last.ACK.Sent (Stevens,
 	 * TCP/IP Illustrated Vol. 2, p.869).  The test looks only at the
 	 * sequence space so it does not interfere with the PAWS check.
 	 */
 	if ((to->to_flags & TOF_TS) != 0 &&
 	    SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
 	    SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
 	    ((thflags & (TH_SYN | TH_FIN)) != 0))) {
 		tp->ts_recent_age = tcp_ts_getticks();
 		tp->ts_recent = to->to_tsval;
 	}
 
 	/*
 	 * No ACK bit: with TF_NEEDSYN set (half-synchronized) the data is
 	 * still processed; with TF_ACKNOW set the segment is dropped via
 	 * ctf_do_dropafterack() and output is requested; otherwise the
 	 * segment is simply dropped.
 	 */
 	if (!(thflags & TH_ACK)) {
 		if (tp->t_flags & TF_NEEDSYN) {
 			return (rack_process_data(m, th, so, tp, drop_hdrlen,
 			    tlen, tiwin, thflags, nxt_pkt));
 		}
 		if (tp->t_flags & TF_ACKNOW) {
 			ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
 			((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1;
 			return (ret_val);
 		}
 		ctf_do_drop(m, NULL);
 		return (0);
 	}
 
 	/* ACK processing. */
 	if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, NULL, thflags, &ret_val)) {
 		return (ret_val);
 	}
 
 	/* Unsent data plus a failed progress check: reset the connection. */
 	if (sbavail(&so->so_snd) && ctf_progress_timeout_check(tp, true)) {
 		rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr,
 		    tp, tick, PROGRESS_DROP, __LINE__);
 		ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
 		return (1);
 	}
 
 	return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
 	    tiwin, thflags, nxt_pkt));
 }
 
 /*
  * Decide what to do with data received after the local side has closed.
  *
  * When rc_allow_data_af_clo is set and the send buffer still holds
  * data, the incoming data is skipped over: rcv_nxt is advanced past
  * it, TF2_DROP_AF_DATA is set, output is requested, *tlen is zeroed
  * and 0 is returned.  In every other case the connection is closed,
  * a reset is sent and 1 is returned.
  */
 static int
 rack_check_data_after_close(struct mbuf *m,
     struct tcpcb *tp, int32_t *tlen, struct tcphdr *th, struct socket *so)
 {
 	struct tcp_rack *rack = (struct tcp_rack *)tp->t_fb_ptr;
 
 	if (rack->rc_allow_data_af_clo != 0 && sbavail(&so->so_snd) != 0) {
 		/* Ok we allow data that is ignored and a followup reset */
 		tcp_log_end_status(tp, TCP_EI_STATUS_DATA_A_CLOSE);
 		tp->rcv_nxt = th->th_seq + *tlen;
 		tp->t_flags2 |= TF2_DROP_AF_DATA;
 		rack->r_wanted_output = 1;
 		*tlen = 0;
 		return (0);
 	}
 
 	/* Not allowed, or nothing left to send: close and reset right away. */
 	tcp_log_end_status(tp, TCP_EI_STATUS_DATA_A_CLOSE);
 	/* tcp_close will kill the inp pre-log the Reset */
 	tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST);
 	tp = tcp_close(tp);
 	KMOD_TCPSTAT_INC(tcps_rcvafterclose);
 	ctf_do_dropwithreset(m, tp, th, BANDLIM_UNLIMITED, (*tlen));
 	return (1);
 }
 
 /*
  * Input handler for a connection in FIN_WAIT_1.
  *
  * After the RST, SYN (challenge ACK), PAWS, common drop and
  * data-after-close checks, the segment goes through ACK processing.
  * When that reports our FIN as acknowledged the connection moves to
  * FIN_WAIT_2.  The segment is finally given to rack_process_data().
  *
  * Returns 1 when the TCB has been unlocked and is most likely gone,
  * 0 when the TCB is still locked.
  */
 static int
 rack_do_fin_wait_1(struct mbuf *m, struct tcphdr *th, struct socket *so,
     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
 {
 	struct tcp_rack *rack = (struct tcp_rack *)tp->t_fb_ptr;
 	int32_t ourfinisacked = 0;
 	int32_t ret_val = 0;
 
 	ctf_calc_rwin(so, tp);
 
 	/* A RST, or a FIN when t_fin_is_rst is set, goes to RST processing. */
 	if ((thflags & TH_RST) ||
 	    (tp->t_fin_is_rst && (thflags & TH_FIN))) {
 		return (__ctf_process_rst(m, th, so, tp,
 		    &rack->r_ctl.challenge_ack_ts,
 		    &rack->r_ctl.challenge_ack_cnt));
 	}
 
 	/* RFC5961 Section 4.2: any SYN in a synchronized state gets a challenge ACK. */
 	if (thflags & TH_SYN) {
 		ctf_challenge_ack(m, th, tp, &ret_val);
 		return (ret_val);
 	}
 
 	/*
 	 * RFC 1323 PAWS: a segment whose timestamp is older than ts_recent
 	 * is handed to ctf_ts_check(), which decides whether to drop it.
 	 */
 	if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
 	    TSTMP_LT(to->to_tsval, tp->ts_recent) &&
 	    ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) {
 		return (ret_val);
 	}
 
 	/* tlen, thflags and drop_hdrlen may be adjusted by the drop checks. */
 	if (_ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val,
 	    &rack->r_ctl.challenge_ack_ts,
 	    &rack->r_ctl.challenge_ack_cnt)) {
 		return (ret_val);
 	}
 
 	/*
 	 * New data on a connection whose user processes are gone: let
 	 * rack_check_data_after_close() either reset the peer (returns
 	 * non-zero) or arrange for the data to be ignored.
 	 */
 	if ((tp->t_flags & TF_CLOSED) && tlen &&
 	    rack_check_data_after_close(m, tp, &tlen, th, so)) {
 		return (1);
 	}
 
 	/*
 	 * Record the peer's timestamp when Last.ACK.Sent lies within this
 	 * segment.  The upper bound is inclusive (Last.ACK.Sent <=
 	 * SEG.SEQ + SEG.Len) rather than RFC1323's strict "<", so the RTT
 	 * can still be computed when RCV.NXT == Last.ACK.Sent (Stevens,
 	 * TCP/IP Illustrated Vol. 2, p.869).  The test looks only at the
 	 * sequence space so it does not interfere with the PAWS check.
 	 */
 	if ((to->to_flags & TOF_TS) != 0 &&
 	    SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
 	    SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
 	    ((thflags & (TH_SYN | TH_FIN)) != 0))) {
 		tp->ts_recent_age = tcp_ts_getticks();
 		tp->ts_recent = to->to_tsval;
 	}
 
 	/*
 	 * No ACK bit: with TF_NEEDSYN set (half-synchronized) the data is
 	 * still processed; with TF_ACKNOW set the segment is dropped via
 	 * ctf_do_dropafterack() and output is requested; otherwise the
 	 * segment is simply dropped.
 	 */
 	if (!(thflags & TH_ACK)) {
 		if (tp->t_flags & TF_NEEDSYN) {
 			return (rack_process_data(m, th, so, tp, drop_hdrlen,
 			    tlen, tiwin, thflags, nxt_pkt));
 		}
 		if (tp->t_flags & TF_ACKNOW) {
 			ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
 			((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1;
 			return (ret_val);
 		}
 		ctf_do_drop(m, NULL);
 		return (0);
 	}
 
 	/* ACK processing. */
 	if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) {
 		return (ret_val);
 	}
 
 	if (ourfinisacked) {
 		/*
 		 * Our FIN is acknowledged.  If the socket can no longer
 		 * receive, the closing user may proceed.  Arming the
 		 * timer is contrary to the specification, but without it
 		 * we would hang forever if the peer's FIN never arrives.
 		 *
 		 * XXXjl: we should release the tp also, and use a
 		 * compressed state.
 		 */
 		if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
 			soisdisconnected(so);
 			tcp_timer_activate(tp, TT_2MSL,
 			    (tcp_fast_finwait2_recycle ?
 			    tcp_finwait2_timeout :
 			    TP_MAXIDLE(tp)));
 		}
 		tcp_state_change(tp, TCPS_FIN_WAIT_2);
 	}
 
 	/* Unsent data plus a failed progress check: reset the connection. */
 	if (sbavail(&so->so_snd) && ctf_progress_timeout_check(tp, true)) {
 		rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr,
 		    tp, tick, PROGRESS_DROP, __LINE__);
 		ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
 		return (1);
 	}
 
 	return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
 	    tiwin, thflags, nxt_pkt));
 }
 
 /*
  * Input handler for a connection in CLOSING.
  *
  * After the RST, SYN (challenge ACK), PAWS, common drop and
  * data-after-close checks, the segment goes through ACK processing.
  * When that reports our FIN as acknowledged, tcp_twstart() is called
  * and the mbuf is freed.  Otherwise the segment is given to
  * rack_process_data().
  *
  * Returns 1 when the TCB has been unlocked and is most likely gone,
  * 0 when the TCB is still locked.
  */
 static int
 rack_do_closing(struct mbuf *m, struct tcphdr *th, struct socket *so,
     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
 {
 	struct tcp_rack *rack = (struct tcp_rack *)tp->t_fb_ptr;
 	int32_t ourfinisacked = 0;
 	int32_t ret_val = 0;
 
 	ctf_calc_rwin(so, tp);
 
 	/* A RST, or a FIN when t_fin_is_rst is set, goes to RST processing. */
 	if ((thflags & TH_RST) ||
 	    (tp->t_fin_is_rst && (thflags & TH_FIN))) {
 		return (__ctf_process_rst(m, th, so, tp,
 		    &rack->r_ctl.challenge_ack_ts,
 		    &rack->r_ctl.challenge_ack_cnt));
 	}
 
 	/* RFC5961 Section 4.2: any SYN in a synchronized state gets a challenge ACK. */
 	if (thflags & TH_SYN) {
 		ctf_challenge_ack(m, th, tp, &ret_val);
 		return (ret_val);
 	}
 
 	/*
 	 * RFC 1323 PAWS: a segment whose timestamp is older than ts_recent
 	 * is handed to ctf_ts_check(), which decides whether to drop it.
 	 */
 	if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
 	    TSTMP_LT(to->to_tsval, tp->ts_recent) &&
 	    ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) {
 		return (ret_val);
 	}
 
 	/* tlen, thflags and drop_hdrlen may be adjusted by the drop checks. */
 	if (_ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val,
 	    &rack->r_ctl.challenge_ack_ts,
 	    &rack->r_ctl.challenge_ack_cnt)) {
 		return (ret_val);
 	}
 
 	/*
 	 * New data on a connection whose user processes are gone: let
 	 * rack_check_data_after_close() either reset the peer (returns
 	 * non-zero) or arrange for the data to be ignored.
 	 */
 	if ((tp->t_flags & TF_CLOSED) && tlen &&
 	    rack_check_data_after_close(m, tp, &tlen, th, so)) {
 		return (1);
 	}
 
 	/*
 	 * Record the peer's timestamp when Last.ACK.Sent lies within this
 	 * segment.  The upper bound is inclusive (Last.ACK.Sent <=
 	 * SEG.SEQ + SEG.Len) rather than RFC1323's strict "<", so the RTT
 	 * can still be computed when RCV.NXT == Last.ACK.Sent (Stevens,
 	 * TCP/IP Illustrated Vol. 2, p.869).  The test looks only at the
 	 * sequence space so it does not interfere with the PAWS check.
 	 */
 	if ((to->to_flags & TOF_TS) != 0 &&
 	    SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
 	    SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
 	    ((thflags & (TH_SYN | TH_FIN)) != 0))) {
 		tp->ts_recent_age = tcp_ts_getticks();
 		tp->ts_recent = to->to_tsval;
 	}
 
 	/*
 	 * No ACK bit: with TF_NEEDSYN set (half-synchronized) the data is
 	 * still processed; with TF_ACKNOW set the segment is dropped via
 	 * ctf_do_dropafterack() and output is requested; otherwise the
 	 * segment is simply dropped.
 	 */
 	if (!(thflags & TH_ACK)) {
 		if (tp->t_flags & TF_NEEDSYN) {
 			return (rack_process_data(m, th, so, tp, drop_hdrlen,
 			    tlen, tiwin, thflags, nxt_pkt));
 		}
 		if (tp->t_flags & TF_ACKNOW) {
 			ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
 			((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1;
 			return (ret_val);
 		}
 		ctf_do_drop(m, NULL);
 		return (0);
 	}
 
 	/* ACK processing. */
 	if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) {
 		return (ret_val);
 	}
 
 	/* Our FIN is acknowledged: start time-wait and consume the mbuf. */
 	if (ourfinisacked) {
 		tcp_twstart(tp);
 		m_freem(m);
 		return (1);
 	}
 
 	/* Unsent data plus a failed progress check: reset the connection. */
 	if (sbavail(&so->so_snd) && ctf_progress_timeout_check(tp, true)) {
 		rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr,
 		    tp, tick, PROGRESS_DROP, __LINE__);
 		ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
 		return (1);
 	}
 
 	return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
 	    tiwin, thflags, nxt_pkt));
 }
 
 /*
  * Input handler for a connection in LAST_ACK.
  *
  * After the RST, SYN (challenge ACK), PAWS, common drop and
  * data-after-close checks, the segment goes through ACK processing.
  * When that reports our FIN as acknowledged, the connection is closed
  * with tcp_close() and the segment is dropped.  Otherwise the segment
  * is given to rack_process_data().
  *
  * Returns 1 when the TCB has been unlocked and is most likely gone,
  * 0 when the TCB is still locked.
  */
 static int
 rack_do_lastack(struct mbuf *m, struct tcphdr *th, struct socket *so,
     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
 {
 	struct tcp_rack *rack = (struct tcp_rack *)tp->t_fb_ptr;
 	int32_t ourfinisacked = 0;
 	int32_t ret_val = 0;
 
 	ctf_calc_rwin(so, tp);
 
 	/* A RST, or a FIN when t_fin_is_rst is set, goes to RST processing. */
 	if ((thflags & TH_RST) ||
 	    (tp->t_fin_is_rst && (thflags & TH_FIN))) {
 		return (__ctf_process_rst(m, th, so, tp,
 		    &rack->r_ctl.challenge_ack_ts,
 		    &rack->r_ctl.challenge_ack_cnt));
 	}
 
 	/* RFC5961 Section 4.2: any SYN in a synchronized state gets a challenge ACK. */
 	if (thflags & TH_SYN) {
 		ctf_challenge_ack(m, th, tp, &ret_val);
 		return (ret_val);
 	}
 
 	/*
 	 * RFC 1323 PAWS: a segment whose timestamp is older than ts_recent
 	 * is handed to ctf_ts_check(), which decides whether to drop it.
 	 */
 	if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
 	    TSTMP_LT(to->to_tsval, tp->ts_recent) &&
 	    ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) {
 		return (ret_val);
 	}
 
 	/* tlen, thflags and drop_hdrlen may be adjusted by the drop checks. */
 	if (_ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val,
 	    &rack->r_ctl.challenge_ack_ts,
 	    &rack->r_ctl.challenge_ack_cnt)) {
 		return (ret_val);
 	}
 
 	/*
 	 * New data on a connection whose user processes are gone: let
 	 * rack_check_data_after_close() either reset the peer (returns
 	 * non-zero) or arrange for the data to be ignored.
 	 */
 	if ((tp->t_flags & TF_CLOSED) && tlen &&
 	    rack_check_data_after_close(m, tp, &tlen, th, so)) {
 		return (1);
 	}
 
 	/*
 	 * Record the peer's timestamp when Last.ACK.Sent lies within this
 	 * segment.  The upper bound is inclusive (Last.ACK.Sent <=
 	 * SEG.SEQ + SEG.Len) rather than RFC1323's strict "<", so the RTT
 	 * can still be computed when RCV.NXT == Last.ACK.Sent (Stevens,
 	 * TCP/IP Illustrated Vol. 2, p.869).  The test looks only at the
 	 * sequence space so it does not interfere with the PAWS check.
 	 */
 	if ((to->to_flags & TOF_TS) != 0 &&
 	    SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
 	    SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
 	    ((thflags & (TH_SYN | TH_FIN)) != 0))) {
 		tp->ts_recent_age = tcp_ts_getticks();
 		tp->ts_recent = to->to_tsval;
 	}
 
 	/*
 	 * No ACK bit: with TF_NEEDSYN set (half-synchronized) the data is
 	 * still processed; with TF_ACKNOW set the segment is dropped via
 	 * ctf_do_dropafterack() and output is requested; otherwise the
 	 * segment is simply dropped.
 	 */
 	if (!(thflags & TH_ACK)) {
 		if (tp->t_flags & TF_NEEDSYN) {
 			return (rack_process_data(m, th, so, tp, drop_hdrlen,
 			    tlen, tiwin, thflags, nxt_pkt));
 		}
 		if (tp->t_flags & TF_ACKNOW) {
 			ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
 			((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1;
 			return (ret_val);
 		}
 		ctf_do_drop(m, NULL);
 		return (0);
 	}
 
 	/* case TCPS_LAST_ACK: ACK processing. */
 	if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) {
 		return (ret_val);
 	}
 
 	/* Our FIN is acknowledged: close the connection and drop the segment. */
 	if (ourfinisacked) {
 		tp = tcp_close(tp);
 		ctf_do_drop(m, tp);
 		return (1);
 	}
 
 	/* Unsent data plus a failed progress check: reset the connection. */
 	if (sbavail(&so->so_snd) && ctf_progress_timeout_check(tp, true)) {
 		rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr,
 		    tp, tick, PROGRESS_DROP, __LINE__);
 		ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
 		return (1);
 	}
 
 	return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
 	    tiwin, thflags, nxt_pkt));
 }
 
 /*
  * Input handler for a connection in FIN_WAIT_2.
  *
  * After the RST, SYN (challenge ACK), PAWS, common drop and
  * data-after-close checks, the segment goes through ACK processing
  * and is then given to rack_process_data().
  *
  * Returns 1 when the TCB has been unlocked and is most likely gone,
  * 0 when the TCB is still locked.
  */
 static int
 rack_do_fin_wait_2(struct mbuf *m, struct tcphdr *th, struct socket *so,
     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
 {
 	struct tcp_rack *rack = (struct tcp_rack *)tp->t_fb_ptr;
 	int32_t ourfinisacked = 0;
 	int32_t ret_val = 0;
 
 	ctf_calc_rwin(so, tp);
 
 	/* A RST, or a FIN when t_fin_is_rst is set, goes to RST processing. */
 	if ((thflags & TH_RST) ||
 	    (tp->t_fin_is_rst && (thflags & TH_FIN))) {
 		return (__ctf_process_rst(m, th, so, tp,
 		    &rack->r_ctl.challenge_ack_ts,
 		    &rack->r_ctl.challenge_ack_cnt));
 	}
 
 	/* RFC5961 Section 4.2: any SYN in a synchronized state gets a challenge ACK. */
 	if (thflags & TH_SYN) {
 		ctf_challenge_ack(m, th, tp, &ret_val);
 		return (ret_val);
 	}
 
 	/*
 	 * RFC 1323 PAWS: a segment whose timestamp is older than ts_recent
 	 * is handed to ctf_ts_check(), which decides whether to drop it.
 	 */
 	if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
 	    TSTMP_LT(to->to_tsval, tp->ts_recent) &&
 	    ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) {
 		return (ret_val);
 	}
 
 	/* tlen, thflags and drop_hdrlen may be adjusted by the drop checks. */
 	if (_ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val,
 	    &rack->r_ctl.challenge_ack_ts,
 	    &rack->r_ctl.challenge_ack_cnt)) {
 		return (ret_val);
 	}
 
 	/*
 	 * New data on a connection whose user processes are gone: let
 	 * rack_check_data_after_close() either reset the peer (returns
 	 * non-zero) or arrange for the data to be ignored.
 	 */
 	if ((tp->t_flags & TF_CLOSED) && tlen &&
 	    rack_check_data_after_close(m, tp, &tlen, th, so)) {
 		return (1);
 	}
 
 	/*
 	 * Record the peer's timestamp when Last.ACK.Sent lies within this
 	 * segment.  The upper bound is inclusive (Last.ACK.Sent <=
 	 * SEG.SEQ + SEG.Len) rather than RFC1323's strict "<", so the RTT
 	 * can still be computed when RCV.NXT == Last.ACK.Sent (Stevens,
 	 * TCP/IP Illustrated Vol. 2, p.869).  The test looks only at the
 	 * sequence space so it does not interfere with the PAWS check.
 	 */
 	if ((to->to_flags & TOF_TS) != 0 &&
 	    SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
 	    SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
 	    ((thflags & (TH_SYN | TH_FIN)) != 0))) {
 		tp->ts_recent_age = tcp_ts_getticks();
 		tp->ts_recent = to->to_tsval;
 	}
 
 	/*
 	 * No ACK bit: with TF_NEEDSYN set (half-synchronized) the data is
 	 * still processed; with TF_ACKNOW set the segment is dropped via
 	 * ctf_do_dropafterack() and output is requested; otherwise the
 	 * segment is simply dropped.
 	 */
 	if (!(thflags & TH_ACK)) {
 		if (tp->t_flags & TF_NEEDSYN) {
 			return (rack_process_data(m, th, so, tp, drop_hdrlen,
 			    tlen, tiwin, thflags, nxt_pkt));
 		}
 		if (tp->t_flags & TF_ACKNOW) {
 			ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
 			((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1;
 			return (ret_val);
 		}
 		ctf_do_drop(m, NULL);
 		return (0);
 	}
 
 	/* ACK processing; ourfinisacked is filled in but not consulted here. */
 	if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) {
 		return (ret_val);
 	}
 
 	/* Unsent data plus a failed progress check: reset the connection. */
 	if (sbavail(&so->so_snd) && ctf_progress_timeout_check(tp, true)) {
 		rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr,
 		    tp, tick, PROGRESS_DROP, __LINE__);
 		ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
 		return (1);
 	}
 
 	return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
 	    tiwin, thflags, nxt_pkt));
 }
 
 /*
  * Reset the per-connection RTT rate-sample accumulator: zero the running
  * total and sample count and mark the sample as empty.
  */
 static inline void
 rack_clear_rate_sample(struct tcp_rack *rack)
 {
 	rack->r_ctl.rack_rs.rs_rtt_tot = 0;
 	rack->r_ctl.rack_rs.rs_rtt_cnt = 0;
 	rack->r_ctl.rack_rs.rs_flags = RACK_RTT_EMPTY;
 }
 
 /*
  * Recompute the pacing segment bounds (rc_pace_min_segs and
  * rc_pace_max_segs) for this connection.
  *
  * tp            - the TCP control block; supplies the fixed max segment size.
  * rack          - the rack state whose r_ctl pacing fields are updated.
  * line          - caller supplied line number, only passed on to the log call.
  * fill_override - when non-NULL (and we are in the always-pace path with
  *                 some rate known) the pointed-to rate is used instead of
  *                 the value from rack_get_output_bw().
  *
  * If either bound ends up different from what it was on entry (tracked in
  * chged) the original values are logged via rack_log_type_pacing_sizes().
  */
 static void
 rack_set_pace_segments(struct tcpcb *tp, struct tcp_rack *rack, uint32_t line, uint64_t *fill_override)
 {
 	uint64_t bw_est, rate_wanted;
 	int chged = 0;
 	uint32_t user_max, orig_min, orig_max;
 
 	/* Remember the entry values so the log shows what we changed from */
 	orig_min = rack->r_ctl.rc_pace_min_segs;
 	orig_max = rack->r_ctl.rc_pace_max_segs;
 	/* The user limit is expressed in segments, convert it to bytes */
 	user_max = ctf_fixed_maxseg(tp) * rack->rc_user_set_max_segs;
 	if (ctf_fixed_maxseg(tp) != rack->r_ctl.rc_pace_min_segs)
 		chged = 1;
 	/* The minimum is always one fixed max segment */
 	rack->r_ctl.rc_pace_min_segs = ctf_fixed_maxseg(tp);
 	if (rack->use_fixed_rate || rack->rc_force_max_seg) {
 		if (user_max != rack->r_ctl.rc_pace_max_segs)
 			chged = 1;
 	}
 	if (rack->rc_force_max_seg) {
 		/* The user forced the max, just use it */
 		rack->r_ctl.rc_pace_max_segs = user_max;
 	} else if (rack->use_fixed_rate) {
 		bw_est = rack_get_bw(rack);
 		if ((rack->r_ctl.crte == NULL) ||
 		    (bw_est != rack->r_ctl.crte->rate)) {
 			/* No rate entry, or it does not match our rate */
 			rack->r_ctl.rc_pace_max_segs = user_max;
 		} else {
 			/* We are pacing right at the hardware rate */
 			uint32_t segsiz;
 
 			/*
 			 * NOTE(review): rc_pace_min_segs was just set from
 			 * ctf_fixed_maxseg(tp) above, so both operands of
 			 * this min() look identical here - confirm.
 			 */
 			segsiz = min(ctf_fixed_maxseg(tp),
 				     rack->r_ctl.rc_pace_min_segs);
 			rack->r_ctl.rc_pace_max_segs = tcp_get_pacing_burst_size(
 				                           tp, bw_est, segsiz, 0,
 							   rack->r_ctl.crte, NULL);
 		}
 	} else if (rack->rc_always_pace) {
 		if (rack->r_ctl.gp_bw ||
 #ifdef NETFLIX_PEAKRATE
 		    rack->rc_tp->t_maxpeakrate ||
 #endif
 		    rack->r_ctl.init_rate) {
 			/* We have a rate of some sort set */
 			uint32_t  orig;
 
 			bw_est = rack_get_bw(rack);
 			orig = rack->r_ctl.rc_pace_max_segs;
 			if (fill_override)
 				rate_wanted = *fill_override;
 			else
 				rate_wanted = rack_get_output_bw(rack, bw_est, NULL, NULL);
 			if (rate_wanted) {
 				/* We have something */
 				rack->r_ctl.rc_pace_max_segs = rack_get_pacing_len(rack,
 										   rate_wanted,
 										   ctf_fixed_maxseg(rack->rc_tp));
 			} else
 				/* A zero rate, fall back to the minimum */
 				rack->r_ctl.rc_pace_max_segs = rack->r_ctl.rc_pace_min_segs;
 			if (orig != rack->r_ctl.rc_pace_max_segs)
 				chged = 1;
 		} else if ((rack->r_ctl.gp_bw == 0) &&
 			   (rack->r_ctl.rc_pace_max_segs == 0)) {
 			/*
 			 * If we have nothing limit us to bursting
 			 * out IW sized pieces.
 			 */
 			chged = 1;
 			rack->r_ctl.rc_pace_max_segs = rc_init_window(rack);
 		}
 	}
 	/* Whatever we selected, never exceed PACE_MAX_IP_BYTES */
 	if (rack->r_ctl.rc_pace_max_segs > PACE_MAX_IP_BYTES) {
 		chged = 1;
 		rack->r_ctl.rc_pace_max_segs = PACE_MAX_IP_BYTES;
 	}
 	if (chged)
 		rack_log_type_pacing_sizes(tp, rack, orig_min, orig_max, line, 2);
 }
 
 
 /*
  * Fill in the fast-send-block header template held at
  * rack->r_ctl.fsb.tcp_ip_hdr.
  *
  * Depending on rack->r_is_v6 the template is laid out as an IPv6 or IPv4
  * header, optionally followed by a UDP header when tp->t_port is non-zero,
  * followed by the TCP header. The fsb.udp and fsb.th pointers and
  * fsb.tcp_ip_hdr_len are set to match, tcpip_fillheaders() is called to
  * populate the headers and r_fsb_inited is set.
  *
  * NOTE(review): this assumes the buffer was already allocated (see
  * rack_init_fsb()); there is no NULL check here.
  */
 static void
 rack_init_fsb_block(struct tcpcb *tp, struct tcp_rack *rack)
 {
 #ifdef INET6
 	struct ip6_hdr *ip6 = NULL;
 #endif
 #ifdef INET
 	struct ip *ip = NULL;
 #endif
 	struct udphdr *udp = NULL;
 
 	/* Ok lets fill in the fast block, it can only be used with no IP options! */
 #ifdef INET6
 	if (rack->r_is_v6) {
 		rack->r_ctl.fsb.tcp_ip_hdr_len = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
 		ip6 = (struct ip6_hdr *)rack->r_ctl.fsb.tcp_ip_hdr;
 		if (tp->t_port) {
 			/* A port is set: place a UDP header between IP and TCP */
 			rack->r_ctl.fsb.tcp_ip_hdr_len += sizeof(struct udphdr);
 			udp = (struct udphdr *)((caddr_t)ip6 + sizeof(struct ip6_hdr));
 			udp->uh_sport = htons(V_tcp_udp_tunneling_port);
 			/* t_port is stored without conversion, unlike the source port */
 			udp->uh_dport = tp->t_port;
 			rack->r_ctl.fsb.udp = udp;
 			rack->r_ctl.fsb.th = (struct tcphdr *)(udp + 1);
 		} else
 		{
 			/* No UDP header, TCP directly follows the IPv6 header */
 			rack->r_ctl.fsb.th = (struct tcphdr *)(ip6 + 1);
 			rack->r_ctl.fsb.udp = NULL;
 		}
 		tcpip_fillheaders(rack->rc_inp,
 				  tp->t_port,
 				  ip6, rack->r_ctl.fsb.th);
 	} else
 #endif				/* INET6 */
 	{
 		/* IPv4 case (also the only case when INET6 is not compiled in) */
 		rack->r_ctl.fsb.tcp_ip_hdr_len = sizeof(struct tcpiphdr);
 		ip = (struct ip *)rack->r_ctl.fsb.tcp_ip_hdr;
 		if (tp->t_port) {
 			/* A port is set: place a UDP header between IP and TCP */
 			rack->r_ctl.fsb.tcp_ip_hdr_len += sizeof(struct udphdr);
 			udp = (struct udphdr *)((caddr_t)ip + sizeof(struct ip));
 			udp->uh_sport = htons(V_tcp_udp_tunneling_port);
 			udp->uh_dport = tp->t_port;
 			rack->r_ctl.fsb.udp = udp;
 			rack->r_ctl.fsb.th = (struct tcphdr *)(udp + 1);
 		} else
 		{
 			/* No UDP header, TCP directly follows the IPv4 header */
 			rack->r_ctl.fsb.udp = NULL;
 			rack->r_ctl.fsb.th = (struct tcphdr *)(ip + 1);
 		}
 		tcpip_fillheaders(rack->rc_inp,
 				  tp->t_port,
 				  ip, rack->r_ctl.fsb.th);
 	}
 	/* The template is now usable */
 	rack->r_fsb_inited = 1;
 }
 
 /*
  * Allocate the zeroed buffer that will hold the fast-send-block header
  * template. The buffer is sized for the biggest layout we could need
  * (IPv6 when compiled in, otherwise IPv4) and always leaves room for a
  * UDP header. The template itself is not filled in here, so
  * r_fsb_inited is left clear.
  *
  * Returns 0 on success or ENOMEM if the allocation fails.
  */
 static int
 rack_init_fsb(struct tcpcb *tp, struct tcp_rack *rack)
 {
 	rack->r_ctl.fsb.tcp_ip_hdr_len =
 #ifdef INET6
 	    sizeof(struct ip6_hdr) + sizeof(struct tcphdr) +
 #else
 	    sizeof(struct tcpiphdr) +
 #endif
 	    sizeof(struct udphdr);
 	rack->r_ctl.fsb.tcp_ip_hdr = malloc(rack->r_ctl.fsb.tcp_ip_hdr_len,
 	    M_TCPFSB, M_NOWAIT | M_ZERO);
 	if (rack->r_ctl.fsb.tcp_ip_hdr != NULL) {
 		rack->r_fsb_inited = 0;
 		return (0);
 	}
 	return (ENOMEM);
 }
 
 static int
 rack_init(struct tcpcb *tp)
 {
 	struct tcp_rack *rack = NULL;
 #ifdef INVARIANTS
 	struct rack_sendmap *insret;
 #endif
 	uint32_t iwin, snt, us_cts;
 	int err;
 
 	tp->t_fb_ptr = uma_zalloc(rack_pcb_zone, M_NOWAIT);
 	if (tp->t_fb_ptr == NULL) {
 		/*
 		 * We need to allocate memory but cant. The INP and INP_INFO
 		 * locks and they are recursive (happens during setup. So a
 		 * scheme to drop the locks fails :(
 		 *
 		 */
 		return (ENOMEM);
 	}
 	memset(tp->t_fb_ptr, 0, sizeof(struct tcp_rack));
 
 	rack = (struct tcp_rack *)tp->t_fb_ptr;
 	RB_INIT(&rack->r_ctl.rc_mtree);
 	TAILQ_INIT(&rack->r_ctl.rc_free);
 	TAILQ_INIT(&rack->r_ctl.rc_tmap);
 	rack->rc_tp = tp;
 	rack->rc_inp = tp->t_inpcb;
 	/* Set the flag */
 	rack->r_is_v6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0;
 	/* Probably not needed but lets be sure */
 	rack_clear_rate_sample(rack);
 	/*
 	 * Save off the default values, socket options will poke
 	 * at these if pacing is not on or we have not yet
 	 * reached where pacing is on (gp_ready/fixed enabled).
 	 * When they get set into the CC module (when gp_ready
 	 * is enabled or we enable fixed) then we will set these
 	 * values into the CC and place in here the old values
 	 * so we have a restoral. Then we will set the flag
 	 * rc_pacing_cc_set. That way whenever we turn off pacing
 	 * or switch off this stack, we will know to go restore
 	 * the saved values.
 	 */
 	rack->r_ctl.rc_saved_beta.beta = V_newreno_beta_ecn;
 	rack->r_ctl.rc_saved_beta.beta_ecn = V_newreno_beta_ecn;
 	/* We want abe like behavior as well */
 	rack->r_ctl.rc_saved_beta.newreno_flags |= CC_NEWRENO_BETA_ECN_ENABLED;
 	rack->r_ctl.rc_reorder_fade = rack_reorder_fade;
 	rack->rc_allow_data_af_clo = rack_ignore_data_after_close;
 	rack->r_ctl.rc_tlp_threshold = rack_tlp_thresh;
 	rack->r_ctl.roundends = tp->snd_max;
 	if (use_rack_rr)
 		rack->use_rack_rr = 1;
 	if (V_tcp_delack_enabled)
 		tp->t_delayed_ack = 1;
 	else
 		tp->t_delayed_ack = 0;
 #ifdef TCP_ACCOUNTING
 	if (rack_tcp_accounting) {
 		tp->t_flags2 |= TF2_TCP_ACCOUNTING;
 	}
 #endif
 	if (rack_enable_shared_cwnd)
 		rack->rack_enable_scwnd = 1;
 	rack->rc_user_set_max_segs = rack_hptsi_segments;
 	rack->rc_force_max_seg = 0;
 	if (rack_use_imac_dack)
 		rack->rc_dack_mode = 1;
 	TAILQ_INIT(&rack->r_ctl.opt_list);
 	rack->r_ctl.rc_reorder_shift = rack_reorder_thresh;
 	rack->r_ctl.rc_pkt_delay = rack_pkt_delay;
 	rack->r_ctl.rc_tlp_cwnd_reduce = rack_lower_cwnd_at_tlp;
 	rack->r_ctl.rc_lowest_us_rtt = 0xffffffff;
 	rack->r_ctl.rc_highest_us_rtt = 0;
 	rack->r_ctl.bw_rate_cap = rack_bw_rate_cap;
 	rack->r_ctl.timer_slop = TICKS_2_USEC(tcp_rexmit_slop);
 	if (rack_use_cmp_acks)
 		rack->r_use_cmp_ack = 1;
 	if (rack_disable_prr)
 		rack->rack_no_prr = 1;
 	if (rack_gp_no_rec_chg)
 		rack->rc_gp_no_rec_chg = 1;
 	if (rack_pace_every_seg && tcp_can_enable_pacing()) {
 		rack->rc_always_pace = 1;
 		if (rack->use_fixed_rate || rack->gp_ready)
 			rack_set_cc_pacing(rack);
 	} else
 		rack->rc_always_pace = 0;
 	if (rack_enable_mqueue_for_nonpaced || rack->r_use_cmp_ack)
 		rack->r_mbuf_queue = 1;
 	else
 		rack->r_mbuf_queue = 0;
 	if  (rack->r_mbuf_queue || rack->rc_always_pace || rack->r_use_cmp_ack)
 		tp->t_inpcb->inp_flags2 |= INP_SUPPORTS_MBUFQ;
 	else
 		tp->t_inpcb->inp_flags2 &= ~INP_SUPPORTS_MBUFQ;
 	rack_set_pace_segments(tp, rack, __LINE__, NULL);
 	if (rack_limits_scwnd)
 		rack->r_limit_scw = 1;
 	else
 		rack->r_limit_scw = 0;
 	rack->rc_labc = V_tcp_abc_l_var;
 	rack->r_ctl.rc_high_rwnd = tp->snd_wnd;
 	rack->r_ctl.cwnd_to_use = tp->snd_cwnd;
 	rack->r_ctl.rc_rate_sample_method = rack_rate_sample_method;
 	rack->rack_tlp_threshold_use = rack_tlp_threshold_use;
 	rack->r_ctl.rc_prr_sendalot = rack_send_a_lot_in_prr;
 	rack->r_ctl.rc_min_to = rack_min_to;
 	microuptime(&rack->r_ctl.act_rcv_time);
 	rack->r_ctl.rc_last_time_decay = rack->r_ctl.act_rcv_time;
 	rack->rc_init_win = rack_default_init_window;
 	rack->r_ctl.rack_per_of_gp_ss = rack_per_of_gp_ss;
 	if (rack_hw_up_only)
 		rack->r_up_only = 1;
 	if (rack_do_dyn_mul) {
 		/* When dynamic adjustment is on CA needs to start at 100% */
 		rack->rc_gp_dyn_mul = 1;
 		if (rack_do_dyn_mul >= 100)
 			rack->r_ctl.rack_per_of_gp_ca = rack_do_dyn_mul;
 	} else
 		rack->r_ctl.rack_per_of_gp_ca = rack_per_of_gp_ca;
 	rack->r_ctl.rack_per_of_gp_rec = rack_per_of_gp_rec;
 	rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_probertt;
 	rack->r_ctl.rc_tlp_rxt_last_time = tcp_tv_to_mssectick(&rack->r_ctl.act_rcv_time);
 	setup_time_filter_small(&rack->r_ctl.rc_gp_min_rtt, FILTER_TYPE_MIN,
 				rack_probertt_filter_life);
 	us_cts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time);
 	rack->r_ctl.rc_lower_rtt_us_cts = us_cts;
 	rack->r_ctl.rc_time_of_last_probertt = us_cts;
 	rack->r_ctl.challenge_ack_ts = tcp_ts_getticks();
 	rack->r_ctl.rc_time_probertt_starts = 0;
 	if (rack_dsack_std_based & 0x1) {
 		/* Basically this means all rack timers are at least (srtt + 1/4 srtt) */
 		rack->rc_rack_tmr_std_based = 1;
 	}
 	if (rack_dsack_std_based & 0x2) {
 		/* Basically this means  rack timers are extended based on dsack by up to (2 * srtt) */
 		rack->rc_rack_use_dsack = 1;
 	}
 	/* We require at least one measurement, even if the sysctl is 0 */
 	if (rack_req_measurements)
 		rack->r_ctl.req_measurements = rack_req_measurements;
 	else
 		rack->r_ctl.req_measurements = 1;
 	if (rack_enable_hw_pacing)
 		rack->rack_hdw_pace_ena = 1;
 	if (rack_hw_rate_caps)
 		rack->r_rack_hw_rate_caps = 1;
 	/* Do we force on detection? */
 #ifdef NETFLIX_EXP_DETECTION
 	if (tcp_force_detection)
 		rack->do_detection = 1;
 	else
 #endif
 		rack->do_detection = 0;
 	if (rack_non_rxt_use_cr)
 		rack->rack_rec_nonrxt_use_cr = 1;
 	err = rack_init_fsb(tp, rack);
 	if (err) {
 		uma_zfree(rack_pcb_zone, tp->t_fb_ptr);
 		tp->t_fb_ptr = NULL;
 		return (err);
 	}
 	if (tp->snd_una != tp->snd_max) {
 		/* Create a send map for the current outstanding data */
 		struct rack_sendmap *rsm;
 
 		rsm = rack_alloc(rack);
 		if (rsm == NULL) {
 			uma_zfree(rack_pcb_zone, tp->t_fb_ptr);
 			tp->t_fb_ptr = NULL;
 			return (ENOMEM);
 		}
 		rsm->r_no_rtt_allowed = 1;
 		rsm->r_tim_lastsent[0] = rack_to_usec_ts(&rack->r_ctl.act_rcv_time);
 		rsm->r_rtr_cnt = 1;
 		rsm->r_rtr_bytes = 0;
 		if (tp->t_flags & TF_SENTFIN)
 			rsm->r_flags |= RACK_HAS_FIN;
 		if ((tp->snd_una == tp->iss) &&
 		    !TCPS_HAVEESTABLISHED(tp->t_state))
 			rsm->r_flags |= RACK_HAS_SYN;
 		rsm->r_start = tp->snd_una;
 		rsm->r_end = tp->snd_max;
 		rsm->r_dupack = 0;
 		if (rack->rc_inp->inp_socket->so_snd.sb_mb != NULL) {
 			rsm->m = sbsndmbuf(&rack->rc_inp->inp_socket->so_snd, 0, &rsm->soff);
 			if (rsm->m)
 				rsm->orig_m_len = rsm->m->m_len;
 			else
 				rsm->orig_m_len = 0;
 		} else {
 			/*
 			 * This can happen if we have a stand-alone FIN or
 			 *  SYN.
 			 */
 			rsm->m = NULL;
 			rsm->orig_m_len = 0;
 			rsm->soff = 0;
 		}
 #ifndef INVARIANTS
 		(void)RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
 #else
 		insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
 		if (insret != NULL) {
 			panic("Insert in rb tree fails ret:%p rack:%p rsm:%p",
 			      insret, rack, rsm);
 		}
 #endif
 		TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext);
 		rsm->r_in_tmap = 1;
 	}
 	/*
 	 * Timers in Rack are kept in microseconds so lets
 	 * convert any initial incoming variables
 	 * from ticks into usecs. Note that we
 	 * also change the values of t_srtt and t_rttvar, if
 	 * they are non-zero. They are kept with a 5
 	 * bit decimal so we have to carefully convert
 	 * these to get the full precision.
 	 */
 	rack_convert_rtts(tp);
 	tp->t_rttlow = TICKS_2_USEC(tp->t_rttlow);
 	if (rack_do_hystart) {
 		tp->ccv->flags |= CCF_HYSTART_ALLOWED;
 		if (rack_do_hystart > 1)
 			tp->ccv->flags |= CCF_HYSTART_CAN_SH_CWND;
 		if (rack_do_hystart > 2)
 			tp->ccv->flags |= CCF_HYSTART_CONS_SSTH;
 	}
 	if (rack_def_profile)
 		rack_set_profile(rack, rack_def_profile);
 	/* Cancel the GP measurement in progress */
 	tp->t_flags &= ~TF_GPUTINPROG;
 	if (SEQ_GT(tp->snd_max, tp->iss))
 		snt = tp->snd_max - tp->iss;
 	else
 		snt = 0;
 	iwin = rc_init_window(rack);
 	if (snt < iwin) {
 		/* We are not past the initial window
 		 * so we need to make sure cwnd is
 		 * correct.
 		 */
 		if (tp->snd_cwnd < iwin)
 			tp->snd_cwnd = iwin;
 		/*
 		 * If we are within the initial window
 		 * we want ssthresh to be unlimited. Setting
 		 * it to the rwnd (which the default stack does
 		 * and older racks) is not really a good idea
 		 * since we want to be in SS and grow both the
 		 * cwnd and the rwnd (via dynamic rwnd growth). If
 		 * we set it to the rwnd then as the peer grows its
 		 * rwnd we will be stuck in CA and never hit SS.
 		 *
 		 * Its far better to raise it up high (this takes the
 		 * risk that there as been a loss already, probably
 		 * we should have an indicator in all stacks of loss
 		 * but we don't), but considering the normal use this
 		 * is a risk worth taking. The consequences of not
 		 * hitting SS are far worse than going one more time
 		 * into it early on (before we have sent even a IW).
 		 * It is highly unlikely that we will have had a loss
 		 * before getting the IW out.
 		 */
 		tp->snd_ssthresh = 0xffffffff;
 	}
 	rack_stop_all_timers(tp);
 	/* Lets setup the fsb block */
 	rack_start_hpts_timer(rack, tp, tcp_get_usecs(NULL), 0, 0, 0);
 	rack_log_rtt_shrinks(rack,  us_cts,  tp->t_rxtcur,
 			     __LINE__, RACK_RTTS_INIT);
 	return (0);
 }
 
 /*
  * Decide whether an existing connection may be handed over to rack.
  *
  * Returns 0 if the switch is fine now, EAGAIN if it can not be done in
  * the current state but might be possible later, and EINVAL if the
  * connection can never use rack.
  */
 static int
 rack_handoff_ok(struct tcpcb *tp)
 {
 	switch (tp->t_state) {
 	case TCPS_CLOSED:
 	case TCPS_LISTEN:
 		/* Sure no problem though it may not stick */
 		return (0);
 	case TCPS_SYN_SENT:
 	case TCPS_SYN_RECEIVED:
 		/*
 		 * SACK support is not known until the connection gets
 		 * to ESTAB or beyond, so ask the caller to try again.
 		 */
 		return (EAGAIN);
 	default:
 		break;
 	}
 	/*
 	 * Rack only sends a FIN once all data has been acknowledged. If
 	 * a FIN went out and more than the FIN itself is outstanding we
 	 * have to wait: either until only the FIN is left (rack_init()
 	 * copes with that) or until everything is acknowledged.
 	 */
 	if ((tp->t_flags & TF_SENTFIN) && ((tp->snd_max - tp->snd_una) > 1))
 		return (EAGAIN);
 	/*
 	 * Without SACK on the connection (and with SACK being required)
 	 * rack can never be used.
 	 */
 	if (!((tp->t_flags & TF_SACK_PERMIT) || rack_sack_not_required))
 		return (EINVAL);
 	return (0);
 }
 
 
 /*
  * Tear down the rack state of a connection.
  *
  * If rack state is attached (tp->t_fb_ptr) any queued input packets are
  * discarded, the fast-send-block header buffer, deferred options, pacing
  * rate entry and all send-map entries are released, the RTT related
  * fields of the tcpcb are converted from microseconds back to ticks and
  * the rack pcb itself is freed. Independent of that the mbuf-queue
  * related inpcb flags are cleared and snd_nxt is set to snd_max.
  *
  * tcb_is_purged is not used by this function.
  */
 static void
 rack_fini(struct tcpcb *tp, int32_t tcb_is_purged)
 {
 	if (tp->t_fb_ptr) {
 		struct tcp_rack *rack;
 		struct rack_sendmap *rsm, *nrsm;
 #ifdef INVARIANTS
 		struct rack_sendmap *rm;
 #endif
 
 		rack = (struct tcp_rack *)tp->t_fb_ptr;
 		if (tp->t_in_pkt) {
 			/*
 			 * It is unsafe to process the packets since a
 			 * reset may be lurking in them (its rare but it
 			 * can occur). If we were to find a RST, then we
 			 * would end up dropping the connection and the
 			 * INP lock, so when we return the caller (tcp_usrreq)
 			 * will blow up when it trys to unlock the inp.
 			 */
 			struct mbuf *save, *m;
 
 			m = tp->t_in_pkt;
 			tp->t_in_pkt = NULL;
 			tp->t_tail_pkt = NULL;
 			while (m) {
 				save = m->m_nextpkt;
 				m->m_nextpkt = NULL;
 				m_freem(m);
 				m = save;
 			}
 		}
 		/* rack does not use force data but other stacks may clear it */
 		tp->t_flags &= ~TF_FORCEDATA;
 #ifdef NETFLIX_SHARED_CWND
 		if (rack->r_ctl.rc_scw) {
 			uint32_t limit;
 
 			if (rack->r_limit_scw)
 				limit = max(1, rack->r_ctl.rc_lowest_us_rtt);
 			else
 				limit = 0;
 			tcp_shared_cwnd_free_full(tp, rack->r_ctl.rc_scw,
 						  rack->r_ctl.rc_scw_index,
 						  limit);
 			rack->r_ctl.rc_scw = NULL;
 		}
 #endif
 		if (rack->r_ctl.fsb.tcp_ip_hdr) {
 			free(rack->r_ctl.fsb.tcp_ip_hdr, M_TCPFSB);
 			rack->r_ctl.fsb.tcp_ip_hdr = NULL;
 			rack->r_ctl.fsb.th = NULL;
 		}
 		/*
 		 * Convert the RTT state back to ticks, keeping the
 		 * fractional bits.
 		 */
 		if (tp->t_srtt > 1) {
 			uint32_t val, frac;
 
 			val = USEC_2_TICKS(tp->t_srtt);
 			frac = tp->t_srtt % (HPTS_USEC_IN_SEC / hz);
 			tp->t_srtt = val << TCP_RTT_SHIFT;
 			/*
 			 * frac is the fractional part here is left
 			 * over from converting to hz and shifting.
 			 * We need to convert this to the 5 bit
 			 * remainder.
 			 */
 			if (frac) {
 				if (hz == 1000) {
 					frac = (((uint64_t)frac *  (uint64_t)TCP_RTT_SCALE) / (uint64_t)HPTS_USEC_IN_MSEC);
 				} else {
 					frac = (((uint64_t)frac * (uint64_t)(hz) * (uint64_t)TCP_RTT_SCALE) /(uint64_t)HPTS_USEC_IN_SEC);
 				}
 				tp->t_srtt += frac;
 			}
 		}
 		if (tp->t_rttvar) {
 			uint32_t val, frac;
 
 			val = USEC_2_TICKS(tp->t_rttvar);
 			/*
 			 * The remainder has to come from t_rttvar itself;
 			 * t_srtt was already rewritten in tick units just
 			 * above and says nothing about the variance.
 			 */
 			frac = tp->t_rttvar % (HPTS_USEC_IN_SEC / hz);
 			tp->t_rttvar = val <<  TCP_RTTVAR_SHIFT;
 			/*
 			 * frac is the sub-tick part left over from
 			 * converting to hz. Scale it to the number of
 			 * fractional bits t_rttvar really has, i.e.
 			 * (1 << TCP_RTTVAR_SHIFT), so that it can never
 			 * carry into the whole-tick bits set above.
 			 */
 			if (frac) {
 				if (hz == 1000) {
 					frac = (((uint64_t)frac * ((uint64_t)1 << TCP_RTTVAR_SHIFT)) / (uint64_t)HPTS_USEC_IN_MSEC);
 				} else {
 					frac = (((uint64_t)frac * (uint64_t)(hz) * ((uint64_t)1 << TCP_RTTVAR_SHIFT)) / (uint64_t)HPTS_USEC_IN_SEC);
 				}
 				tp->t_rttvar += frac;
 			}
 		}
 		tp->t_rxtcur = USEC_2_TICKS(tp->t_rxtcur);
 		tp->t_rttlow = USEC_2_TICKS(tp->t_rttlow);
 		if (rack->rc_always_pace) {
 			/* Give back our pacing slot and undo any CC changes */
 			tcp_decrement_paced_conn();
 			rack_undo_cc_pacing(rack);
 			rack->rc_always_pace = 0;
 		}
 		/* Clean up any options if they were not applied */
 		while (!TAILQ_EMPTY(&rack->r_ctl.opt_list)) {
 			struct deferred_opt_list *dol;
 
 			dol = TAILQ_FIRST(&rack->r_ctl.opt_list);
 			TAILQ_REMOVE(&rack->r_ctl.opt_list, dol, next);
 			free(dol, M_TCPDO);
 		}
 		/* Release the pacing rate entry if we hold one */
 		if (rack->r_ctl.crte != NULL) {
 			tcp_rel_pacing_rate(rack->r_ctl.crte, tp);
 			rack->rack_hdrw_pacing = 0;
 			rack->r_ctl.crte = NULL;
 		}
 #ifdef TCP_BLACKBOX
 		tcp_log_flowend(tp);
 #endif
 		/* Free every send-map entry still in the tree */
 		RB_FOREACH_SAFE(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm) {
 #ifndef INVARIANTS
 			(void)RB_REMOVE(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
 #else
 			rm = RB_REMOVE(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
 			if (rm != rsm) {
 				panic("At fini, rack:%p rsm:%p rm:%p",
 				      rack, rsm, rm);
 			}
 #endif
 			uma_zfree(rack_zone, rsm);
 		}
 		/* And the ones cached on the free list */
 		rsm = TAILQ_FIRST(&rack->r_ctl.rc_free);
 		while (rsm) {
 			TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_tnext);
 			uma_zfree(rack_zone, rsm);
 			rsm = TAILQ_FIRST(&rack->r_ctl.rc_free);
 		}
 		rack->rc_free_cnt = 0;
 		uma_zfree(rack_pcb_zone, tp->t_fb_ptr);
 		tp->t_fb_ptr = NULL;
 	}
 	if (tp->t_inpcb) {
 		tp->t_inpcb->inp_flags2 &= ~INP_SUPPORTS_MBUFQ;
 		tp->t_inpcb->inp_flags2 &= ~INP_MBUF_QUEUE_READY;
 		tp->t_inpcb->inp_flags2 &= ~INP_DONT_SACK_QUEUE;
 		tp->t_inpcb->inp_flags2 &= ~INP_MBUF_ACKCMP;
 		/* Cancel the GP measurement in progress */
 		tp->t_flags &= ~TF_GPUTINPROG;
 		tp->t_inpcb->inp_flags2 &= ~INP_MBUF_L_ACKS;
 	}
 	/* Make sure snd_nxt is correctly set */
 	tp->snd_nxt = tp->snd_max;
 }
 
 /*
  * Bring rack's cached view of the connection state (r_state) and the
  * matching per-state input handler (r_substate) in line with
  * tp->t_state.
  *
  * For ESTABLISHED and every later state handled here the pacing segment
  * sizes are recomputed before the state is recorded. LISTEN, CLOSED,
  * TIME_WAIT and unknown states leave r_state and r_substate untouched.
  */
 static void
 rack_set_state(struct tcpcb *tp, struct tcp_rack *rack)
 {
 	if ((rack->r_state == TCPS_CLOSED) && (tp->t_state != TCPS_CLOSED)) {
 		/* Leaving CLOSED: refresh the v6 flag from the inpcb */
 		rack->r_is_v6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0;
 	}
 	switch (tp->t_state) {
 	case TCPS_SYN_SENT:
 		rack->r_state = TCPS_SYN_SENT;
 		rack->r_substate = rack_do_syn_sent;
 		break;
 	case TCPS_SYN_RECEIVED:
 		rack->r_state = TCPS_SYN_RECEIVED;
 		rack->r_substate = rack_do_syn_recv;
 		break;
 	case TCPS_ESTABLISHED:
 		rack_set_pace_segments(tp, rack, __LINE__, NULL);
 		rack->r_state = TCPS_ESTABLISHED;
 		rack->r_substate = rack_do_established;
 		break;
 	case TCPS_CLOSE_WAIT:
 		rack_set_pace_segments(tp, rack, __LINE__, NULL);
 		rack->r_state = TCPS_CLOSE_WAIT;
 		rack->r_substate = rack_do_close_wait;
 		break;
 	case TCPS_FIN_WAIT_1:
 		rack_set_pace_segments(tp, rack, __LINE__, NULL);
 		rack->r_state = TCPS_FIN_WAIT_1;
 		rack->r_substate = rack_do_fin_wait_1;
 		break;
 	case TCPS_CLOSING:
 		rack_set_pace_segments(tp, rack, __LINE__, NULL);
 		rack->r_state = TCPS_CLOSING;
 		rack->r_substate = rack_do_closing;
 		break;
 	case TCPS_LAST_ACK:
 		rack_set_pace_segments(tp, rack, __LINE__, NULL);
 		rack->r_state = TCPS_LAST_ACK;
 		rack->r_substate = rack_do_lastack;
 		break;
 	case TCPS_FIN_WAIT_2:
 		rack_set_pace_segments(tp, rack, __LINE__, NULL);
 		rack->r_state = TCPS_FIN_WAIT_2;
 		rack->r_substate = rack_do_fin_wait_2;
 		break;
 	case TCPS_LISTEN:
 	case TCPS_CLOSED:
 	case TCPS_TIME_WAIT:
 	default:
 		/* No handler for these, keep whatever was recorded before */
 		break;
 	};
 	/* Compressed acks are only flagged once we are established */
 	if (rack->r_use_cmp_ack && TCPS_HAVEESTABLISHED(tp->t_state))
 		rack->rc_inp->inp_flags2 |= INP_MBUF_ACKCMP;
 
 }
 
 /*
  * Check that the timer currently recorded in rc_hpts_flags is one that
  * makes sense for the present state of the connection. If it is, return
  * and leave it alone. If not, remove the connection from the hpts (if it
  * is on it), cancel the timer and start a fresh one.
  *
  * sb is not referenced by this function.
  */
 static void
 rack_timer_audit(struct tcpcb *tp, struct tcp_rack *rack, struct sockbuf *sb)
 {
 	/*
 	 * We received an ack, and then did not
 	 * call send or were bounced out because the
 	 * hpts was running. Now a timer is up as well, is
 	 * it the right timer?
 	 */
 	struct rack_sendmap *rsm;
 	int tmr_up;
 
 	/* Which timer (if any) is currently pending? */
 	tmr_up = rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK;
 	if (rack->rc_in_persist && (tmr_up == PACE_TMR_PERSIT))
 		/* In persist with the persist timer up, all is well */
 		return;
 	rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
 	if (((rsm == NULL) || (tp->t_state < TCPS_ESTABLISHED)) &&
 	    (tmr_up == PACE_TMR_RXT)) {
 		/* Should be an RXT */
 		return;
 	}
 	if (rsm == NULL) {
 		/* Nothing outstanding? */
 		if (tp->t_flags & TF_DELACK) {
 			if (tmr_up == PACE_TMR_DELACK)
 				/* We are supposed to have delayed ack up and we do */
 				return;
 		} else if (sbavail(&tp->t_inpcb->inp_socket->so_snd) && (tmr_up == PACE_TMR_RXT)) {
 			/*
 			 * if we hit enobufs then we would expect the possibility
 			 * of nothing outstanding and the RXT up (and the hptsi timer).
 			 */
 			return;
 		} else if (((V_tcp_always_keepalive ||
 			     rack->rc_inp->inp_socket->so_options & SO_KEEPALIVE) &&
 			    (tp->t_state <= TCPS_CLOSING)) &&
 			   (tmr_up == PACE_TMR_KEEP) &&
 			   (tp->snd_max == tp->snd_una)) {
 			/* We should have keep alive up and we do */
 			return;
 		}
 	}
 	if (SEQ_GT(tp->snd_max, tp->snd_una) &&
 		   ((tmr_up == PACE_TMR_TLP) ||
 		    (tmr_up == PACE_TMR_RACK) ||
 		    (tmr_up == PACE_TMR_RXT))) {
 		/*
 		 * Either a Rack, TLP or RXT is fine if  we
 		 * have outstanding data.
 		 */
 		return;
 	} else if (tmr_up == PACE_TMR_DELACK) {
 		/*
 		 * If the delayed ack was going to go off
 		 * before the rtx/tlp/rack timer were going to
 		 * expire, then that would be the timer in control.
 		 * Note we don't check the time here trusting the
 		 * code is correct.
 		 */
 		return;
 	}
 	/*
 	 * Ok the timer originally started is not what we want now.
 	 * We will force the hpts to be stopped if any, and restart
 	 * with the slot set to what was in the saved slot.
 	 */
 	if (tcp_in_hpts(rack->rc_inp)) {
 		if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) {
 			uint32_t us_cts;
 
 			us_cts = tcp_get_usecs(NULL);
 			if (TSTMP_GT(rack->r_ctl.rc_last_output_to, us_cts)) {
 				/*
 				 * The pacing output time has not been
 				 * reached yet, record how early we are.
 				 */
 				rack->r_early = 1;
 				rack->r_ctl.rc_agg_early += (rack->r_ctl.rc_last_output_to - us_cts);
 			}
 			rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT;
 		}
 		tcp_hpts_remove(tp->t_inpcb);
 	}
 	rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
 	rack_start_hpts_timer(rack, tp, tcp_get_usecs(NULL), 0, 0, 0);
 }
 
 
 /*
  * Apply the window advertised in an incoming ack (tiwin, carried with
  * sequence seq and acknowledgment ack) to the send window, and then see
  * whether that moves us out of or into persist mode.
  *
  * An update is taken either when it is newer than the last one recorded
  * in snd_wl1/snd_wl2 (or the same but with a larger window), or when it
  * carries the same ack and a smaller window. Anything else is ignored.
  * high_seq is not referenced by this function.
  */
 static void
 rack_do_win_updates(struct tcpcb *tp, struct tcp_rack *rack, uint32_t tiwin, uint32_t seq, uint32_t ack, uint32_t cts, uint32_t high_seq)
 {
 	int newer_update;
 
 	newer_update = (SEQ_LT(tp->snd_wl1, seq) ||
 	    (tp->snd_wl1 == seq && (SEQ_LT(tp->snd_wl2, ack) ||
 	    (tp->snd_wl2 == ack && tiwin > tp->snd_wnd))));
 	if (!newer_update &&
 	    !((tp->snd_wl2 == ack) && (tiwin < tp->snd_wnd))) {
 		/* Not a valid win update */
 		return;
 	}
 	if (newer_update && (tp->snd_wl2 == ack) && (tiwin > tp->snd_wnd)) {
 		/* keep track of pure window updates */
 		KMOD_TCPSTAT_INC(tcps_rcvwinupd);
 	}
 	/* Common to both kinds of accepted update */
 	tp->snd_wnd = tiwin;
 	rack_validate_fo_sendwin_up(tp, rack);
 	tp->snd_wl1 = seq;
 	tp->snd_wl2 = ack;
 	if (newer_update) {
 		/* Only a newer update raises the max and requests output */
 		if (tp->snd_wnd > tp->max_sndwnd)
 			tp->max_sndwnd = tp->snd_wnd;
 		rack->r_wanted_output = 1;
 	}
 	/* Do we exit persists? */
 	if (rack->rc_in_persist != 0) {
 		if (tp->snd_wnd >= min((rack->r_ctl.rc_high_rwnd/2),
 		    rack->r_ctl.rc_pace_min_segs))
 			rack_exit_persist(tp, rack, cts);
 	}
 	/* Do we enter persists? */
 	if ((rack->rc_in_persist == 0) &&
 	    (tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), rack->r_ctl.rc_pace_min_segs)) &&
 	    TCPS_HAVEESTABLISHED(tp->t_state) &&
 	    ((tp->snd_max == tp->snd_una) || rack->rc_has_collapsed)) {
 		struct sockbuf *snd_sb;
 
 		snd_sb = &tp->t_inpcb->inp_socket->so_snd;
 		if (sbavail(snd_sb) && (sbavail(snd_sb) > tp->snd_wnd)) {
 			/*
 			 * The rwnd is below the pacing size, we are
 			 * established, nothing is outstanding (or the
 			 * window collapsed) and more data is waiting
 			 * than the window allows. Enter persists.
 			 */
 			rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime);
 		}
 	}
 }
 
 /*
  * Emit a TCP_LOG_IN log event for one entry (ae) of a compressed ack.
  *
  * Nothing is done unless logging is enabled for the connection. Since
  * there is no real packet header for a compressed ack entry, a TCP
  * header (plus a timestamp option when the entry has one) is
  * reconstructed in a local buffer from the fields of ae and handed to
  * the logging macro.
  *
  * ackval   - stored in the cwnd_gain field of the log record.
  * high_seq - the value snd_una is temporarily set to while logging, see
  *            the comment further down.
  */
 static void
 rack_log_input_packet(struct tcpcb *tp, struct tcp_rack *rack, struct tcp_ackent *ae, int ackval, uint32_t high_seq)
 {
 
 	if (tp->t_logstate != TCP_LOG_STATE_OFF) {
 		union tcp_log_stackspecific log;
 		struct timeval ltv;
 		/* Scratch space for the reconstructed TCP header and option */
 		char tcp_hdr_buf[60];
 		struct tcphdr *th;
 		struct timespec ts;
 		uint32_t orig_snd_una;
 		/* Set when snd_una was temporarily changed and must be restored */
 		uint8_t xx = 0;
 
 #ifdef NETFLIX_HTTP_LOGGING
 		struct http_sendfile_track *http_req;
 
 		if (SEQ_GT(ae->ack, tp->snd_una)) {
 			http_req = tcp_http_find_req_for_seq(tp, (ae->ack-1));
 		} else {
 			http_req = tcp_http_find_req_for_seq(tp, ae->ack);
 		}
 #endif
 		memset(&log.u_bbr, 0, sizeof(log.u_bbr));
 		log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
 		if (rack->rack_no_prr == 0)
 			log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt;
 		else
 			log.u_bbr.flex1 = 0;
 		/* Pack r_ent_rec_ns and r_might_revert into two bits */
 		log.u_bbr.use_lt_bw = rack->r_ent_rec_ns;
 		log.u_bbr.use_lt_bw <<= 1;
 		log.u_bbr.use_lt_bw |= rack->r_might_revert;
 		log.u_bbr.flex2 = rack->r_ctl.rc_num_maps_alloced;
 		log.u_bbr.inflight = ctf_flight_size(tp, rack->r_ctl.rc_sacked);
 		log.u_bbr.pkts_out = tp->t_maxseg;
 		log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags;
 		log.u_bbr.flex7 = 1;
 		log.u_bbr.lost = ae->flags;
 		log.u_bbr.cwnd_gain = ackval;
 		log.u_bbr.pacing_gain = 0x2;
 		/* ae->timestamp is split below as a nanosecond count */
 		if (ae->flags & TSTMP_HDWR) {
 			/* Record the hardware timestamp if present */
 			log.u_bbr.flex3 = M_TSTMP;
 			ts.tv_sec = ae->timestamp / 1000000000;
 			ts.tv_nsec = ae->timestamp % 1000000000;
 			ltv.tv_sec = ts.tv_sec;
 			ltv.tv_usec = ts.tv_nsec / 1000;
 			log.u_bbr.lt_epoch = tcp_tv_to_usectick(&ltv);
 		} else if (ae->flags & TSTMP_LRO) {
 			/* Record the LRO arrival timestamp */
 			log.u_bbr.flex3 = M_TSTMP_LRO;
 			ts.tv_sec = ae->timestamp / 1000000000;
 			ts.tv_nsec = ae->timestamp % 1000000000;
 			ltv.tv_sec = ts.tv_sec;
 			ltv.tv_usec = ts.tv_nsec / 1000;
 			log.u_bbr.flex5 = tcp_tv_to_usectick(&ltv);
 		}
 		/* ltv is reused here and passed to the log call below */
 		log.u_bbr.timeStamp = tcp_get_usecs(&ltv);
 		/* Log the rcv time */
 		log.u_bbr.delRate = ae->timestamp;
 #ifdef NETFLIX_HTTP_LOGGING
 		/* Pack closed/open/req into one value, 8 bits apart */
 		log.u_bbr.applimited = tp->t_http_closed;
 		log.u_bbr.applimited <<= 8;
 		log.u_bbr.applimited |= tp->t_http_open;
 		log.u_bbr.applimited <<= 8;
 		log.u_bbr.applimited |= tp->t_http_req;
 		if (http_req) {
 			/* Copy out any client req info */
 			/* seconds */
 			log.u_bbr.pkt_epoch = (http_req->localtime / HPTS_USEC_IN_SEC);
 			/* useconds */
 			log.u_bbr.delivered = (http_req->localtime % HPTS_USEC_IN_SEC);
 			log.u_bbr.rttProp = http_req->timestamp;
 			log.u_bbr.cur_del_rate = http_req->start;
 			/* flex8 bits: 1 = open, 2 = not open, 4 = complete */
 			if (http_req->flags & TCP_HTTP_TRACK_FLG_OPEN) {
 				log.u_bbr.flex8 |= 1;
 			} else {
 				log.u_bbr.flex8 |= 2;
 				log.u_bbr.bw_inuse = http_req->end;
 			}
 			log.u_bbr.flex6 = http_req->start_seq;
 			if (http_req->flags & TCP_HTTP_TRACK_FLG_COMP) {
 				log.u_bbr.flex8 |= 4;
 				log.u_bbr.epoch = http_req->end_seq;
 			}
 		}
 #endif
 		/* Rebuild a TCP header for the logger from the ack entry */
 		memset(tcp_hdr_buf, 0, sizeof(tcp_hdr_buf));
 		th = (struct tcphdr *)tcp_hdr_buf;
 		th->th_seq = ae->seq;
 		th->th_ack = ae->ack;
 		th->th_win = ae->win;
 		/*
 		 * Now fill in the ports. This is an inbound segment, so the
 		 * foreign port is the source and the local port the
 		 * destination.
 		 */
 		th->th_sport = tp->t_inpcb->inp_fport;
 		th->th_dport = tp->t_inpcb->inp_lport;
 		tcp_set_flags(th, ae->flags);
 		/* Now do we have a timestamp option? */
 		if (ae->flags & HAS_TSTMP) {
 			u_char *cp;
 			uint32_t val;
 
 			th->th_off = ((sizeof(struct tcphdr) + TCPOLEN_TSTAMP_APPA) >> 2);
 			/* Option layout: NOP, NOP, kind, length, tsval, tsecr */
 			cp = (u_char *)(th + 1);
 			*cp = TCPOPT_NOP;
 			cp++;
 			*cp = TCPOPT_NOP;
 			cp++;
 			*cp = TCPOPT_TIMESTAMP;
 			cp++;
 			*cp = TCPOLEN_TIMESTAMP;
 			cp++;
 			val = htonl(ae->ts_value);
 			bcopy((char *)&val,
 			      (char *)cp, sizeof(uint32_t));
 			val = htonl(ae->ts_echo);
 			bcopy((char *)&val,
 			      (char *)(cp + 4), sizeof(uint32_t));
 		} else
 			th->th_off = (sizeof(struct tcphdr) >> 2);
 
 		/*
 		 * For sane logging we need to play a little trick.
 		 * If the ack were fully processed we would have moved
 		 * snd_una to high_seq, but since compressed acks are
 		 * processed in two phases, at this point (logging) snd_una
 		 * won't be advanced. So we would see multiple acks showing
 		 * the advancement. We can prevent that by "pretending" that
 		 * snd_una was advanced and then un-advancing it so that the
 		 * logging code has the right value for tlb_snd_una.
 		 */
 		if (tp->snd_una != high_seq) {
 			orig_snd_una = tp->snd_una;
 			tp->snd_una = high_seq;
 			xx = 1;
 		} else
 			xx = 0;
 		TCP_LOG_EVENTP(tp, th,
 			       &tp->t_inpcb->inp_socket->so_rcv,
 			       &tp->t_inpcb->inp_socket->so_snd, TCP_LOG_IN, 0,
 			       0, &log, true, &ltv);
 		if (xx) {
 			/* Put the real snd_una back */
 			tp->snd_una = orig_snd_una;
 		}
 	}
 
 }
 
 /*
  * Process the answer to a forced-out probe (persist or keep-alive).
  *
  * The forced_ack flag and the retransmit shift are always cleared. An
  * RTT sample, measured from forced_ack_ts to us_cts, is then applied
  * unless:
  *  - we are in persist and the advertised window (tiwin) differs from
  *    snd_wnd, in which case this is a window update and not the answer
  *    to our probe (for keep-alive any window is accepted), or
  *  - an earlier probe went unanswered (probe_not_answered) and the
  *    rack_apply_rtt_with_reduced_conf tunable is off.
  *
  * When an earlier probe went unanswered, forced_ack_ts is that of the
  * oldest probe, so any RTT computed errs on the long side; it is then
  * applied with confidence 0 instead of 3.
  */
 static void
 rack_handle_probe_response(struct tcp_rack *rack, uint32_t tiwin, uint32_t us_cts)
 {
 	uint32_t rtt_sample;
 	int confidence;
 
 	rack->forced_ack = 0;
 	rack->rc_tp->t_rxtshift = 0;
 	if (rack->rc_in_persist) {
 		if (tiwin != rack->rc_tp->snd_wnd) {
 			/* A window update, not our probe's response */
 			return;
 		}
 		counter_u64_add(rack_persists_acks, 1);
 	}
 	rtt_sample = us_cts - rack->r_ctl.forced_ack_ts;
 	if (rtt_sample == 0)
 		rtt_sample = 1;
 	if (rack->probe_not_answered == 0) {
 		confidence = 3;
 	} else if (rack_apply_rtt_with_reduced_conf) {
 		/* We have a retransmitted probe here too */
 		confidence = 0;
 	} else {
 		/* Configured to not trust the sample at all */
 		return;
 	}
 	rack_apply_updated_usrtt(rack, rtt_sample, us_cts);
 	tcp_rack_xmit_timer(rack, rtt_sample, 0, rtt_sample, confidence, NULL, 1);
 }
 
 /*
  * Process a "compressed ack" mbuf, i.e. an mbuf whose data is an array of
  * struct tcp_ackent entries instead of a real TCP segment.
  *
  * tp      - the tcpcb of the connection.
  * so      - the socket of the connection.
  * m       - the compressed-ack mbuf; m_len / sizeof(struct tcp_ackent)
  *           entries are processed. The mbuf is consumed on every path
  *           (m_freem() or ctf_do_drop()).
  * nxt_pkt - non-zero when another packet follows; in that case no output
  *           is attempted here.
  * tv      - arrival time supplied by the caller.
  *
  * Processing happens in two phases: first each entry is classified and
  * handled individually (timestamp validation, ECN, window updates, sendmap
  * processing), while only tracking the highest cumulative ack in high_seq.
  * Afterwards snd_una and the socket buffer are advanced once to high_seq.
  *
  * Returns 1 when the connection was closed, dropped or moved to time-wait
  * (or tcp_output() returned an error) so the caller must not touch tp
  * further; returns 0 otherwise.
  */
 static int
 rack_do_compressed_ack_processing(struct tcpcb *tp, struct socket *so, struct mbuf *m, int nxt_pkt, struct timeval *tv)
 {
 	/*
 	 * Handle a "special" compressed ack mbuf. Each incoming
 	 * ack has only four possible dispositions:
 	 *
 	 * A) It moves the cum-ack forward
 	 * B) It is behind the cum-ack.
 	 * C) It is a window-update ack.
 	 * D) It is a dup-ack.
 	 *
 	 * Note that we can have between 1 -> TCP_COMP_ACK_ENTRIES
 	 * in the incoming mbuf. We also need to still pay attention
 	 * to nxt_pkt since there may be another packet after this
 	 * one.
 	 */
 #ifdef TCP_ACCOUNTING
 	uint64_t ts_val;
 	uint64_t rdstc;
 #endif
 	int segsiz;
 	struct timespec ts;
 	struct tcp_rack *rack;
 	struct tcp_ackent *ae;
 	uint32_t tiwin, ms_cts, cts, acked, acked_amount, high_seq, win_seq, the_win, win_upd_ack;
 	int cnt, i, did_out, ourfinisacked = 0;
 	struct tcpopt to_holder, *to = NULL;
 #ifdef TCP_ACCOUNTING
 	int win_up_req = 0;
 #endif
 	int nsegs = 0;
 	int under_pacing = 1;
 	int recovery = 0;
 #ifdef TCP_ACCOUNTING
 	sched_pin();
 #endif
 	rack = (struct tcp_rack *)tp->t_fb_ptr;
 	if (rack->gp_ready &&
 	    (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT))
 		under_pacing = 0;
 	else
 		under_pacing = 1;
 
 	if (rack->r_state != tp->t_state)
 		rack_set_state(tp, rack);
 	if ((tp->t_state >= TCPS_FIN_WAIT_1) &&
 	    (tp->t_flags & TF_GPUTINPROG)) {
 		/*
 		 * We have a goodput in progress
 		 * and we have entered a late state.
 		 * Do we have enough data in the sb
 		 * to handle the GPUT request?
 		 */
 		uint32_t bytes;
 
 		bytes = tp->gput_ack - tp->gput_seq;
 		if (SEQ_GT(tp->gput_seq, tp->snd_una))
 			bytes += tp->gput_seq - tp->snd_una;
 		if (bytes > sbavail(&tp->t_inpcb->inp_socket->so_snd)) {
 			/*
 			 * There are not enough bytes in the socket
 			 * buffer that have been sent to cover this
 			 * measurement. Cancel it.
 			 */
 			rack_log_pacing_delay_calc(rack, (tp->gput_ack - tp->gput_seq) /*flex2*/,
 						   rack->r_ctl.rc_gp_srtt /*flex1*/,
 						   tp->gput_seq,
 						   0, 0, 18, __LINE__, NULL, 0);
 			tp->t_flags &= ~TF_GPUTINPROG;
 		}
 	}
 	to = &to_holder;
 	to->to_flags = 0;
 	KASSERT((m->m_len >= sizeof(struct tcp_ackent)),
 		("tp:%p m_cmpack:%p with invalid len:%u", tp, m, m->m_len));
 	cnt = m->m_len / sizeof(struct tcp_ackent);
 	counter_u64_add(rack_multi_single_eq, cnt);
 	/*
 	 * Start from the current connection state; these locals track
 	 * the cum-ack and window as we walk the entries, tp->snd_una
 	 * itself is only advanced after the loop.
 	 */
 	high_seq = tp->snd_una;
 	the_win = tp->snd_wnd;
 	win_seq = tp->snd_wl1;
 	win_upd_ack = tp->snd_wl2;
 	cts = tcp_tv_to_usectick(tv);
 	ms_cts = tcp_tv_to_mssectick(tv);
 	rack->r_ctl.rc_rcvtime = cts;
 	segsiz = ctf_fixed_maxseg(tp);
 	if ((rack->rc_gp_dyn_mul) &&
 	    (rack->use_fixed_rate == 0) &&
 	    (rack->rc_always_pace)) {
 		/* Check in on probertt */
 		rack_check_probe_rtt(rack, cts);
 	}
 	for (i = 0; i < cnt; i++) {
 #ifdef TCP_ACCOUNTING
 		ts_val = get_cyclecount();
 #endif
 		rack_clear_rate_sample(rack);
 		ae = ((mtod(m, struct tcp_ackent *)) + i);
 		/* Setup the window */
 		tiwin = ae->win << tp->snd_scale;
 		if (tiwin > rack->r_ctl.rc_high_rwnd)
 			rack->r_ctl.rc_high_rwnd = tiwin;
 		/* figure out the type of ack */
 		if (SEQ_LT(ae->ack, high_seq)) {
 			/* Case B*/
 			ae->ack_val_set = ACK_BEHIND;
 		} else if (SEQ_GT(ae->ack, high_seq)) {
 			/* Case A */
 			ae->ack_val_set = ACK_CUMACK;
 		} else if ((tiwin == the_win) && (rack->rc_in_persist == 0)){
 			/* Case D */
 			ae->ack_val_set = ACK_DUPACK;
 		} else {
 			/* Case C */
 			ae->ack_val_set = ACK_RWND;
 		}
 		rack_log_input_packet(tp, rack, ae, ae->ack_val_set, high_seq);
 		/* Validate timestamp */
 		if (ae->flags & HAS_TSTMP) {
 			/* Setup for a timestamp */
 			to->to_flags = TOF_TS;
 			ae->ts_echo -= tp->ts_offset;
 			to->to_tsecr = ae->ts_echo;
 			to->to_tsval = ae->ts_value;
 			/*
 			 * If echoed timestamp is later than the current time, fall back to
 			 * non RFC1323 RTT calculation.  Normalize timestamp if syncookies
 			 * were used when this connection was established.
 			 */
 			if (TSTMP_GT(ae->ts_echo, ms_cts))
 				to->to_tsecr = 0;
 			if (tp->ts_recent &&
 			    TSTMP_LT(ae->ts_value, tp->ts_recent)) {
 				if (ctf_ts_check_ac(tp, (ae->flags & 0xff))) {
 					/* Timestamp check failed, skip this entry */
 #ifdef TCP_ACCOUNTING
 					rdstc = get_cyclecount();
 					if (rdstc > ts_val) {
 						counter_u64_add(tcp_proc_time[ae->ack_val_set] ,
 								(rdstc - ts_val));
 						if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
 							tp->tcp_proc_time[ae->ack_val_set] += (rdstc - ts_val);
 						}
 					}
 #endif
 					continue;
 				}
 			}
 			/* The two SEQ_LEQ tests together mean ae->seq == last_ack_sent */
 			if (SEQ_LEQ(ae->seq, tp->last_ack_sent) &&
 			    SEQ_LEQ(tp->last_ack_sent, ae->seq)) {
 				tp->ts_recent_age = tcp_ts_getticks();
 				tp->ts_recent = ae->ts_value;
 			}
 		} else {
 			/* Setup for a no options */
 			to->to_flags = 0;
 		}
 		/* Update the rcv time and perform idle reduction possibly */
 		if  (tp->t_idle_reduce &&
 		     (tp->snd_max == tp->snd_una) &&
 		     (TICKS_2_USEC(ticks - tp->t_rcvtime) >= tp->t_rxtcur)) {
 			counter_u64_add(rack_input_idle_reduces, 1);
 			rack_cc_after_idle(rack, tp);
 		}
 		tp->t_rcvtime = ticks;
 		/* Now what about ECN? */
 		if (tcp_ecn_input_segment(tp, ae->flags, ae->codepoint))
 			rack_cong_signal(tp, CC_ECN, ae->ack, __LINE__);
 #ifdef TCP_ACCOUNTING
 		/* Count for the specific type of ack in */
 		counter_u64_add(tcp_cnt_counters[ae->ack_val_set], 1);
 		if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
 			tp->tcp_cnt_counters[ae->ack_val_set]++;
 		}
 #endif
 		/*
 		 * Note how we could move up these in the determination
 		 * above, but we don't so that way the timestamp checks (and ECN)
 		 * is done first before we do any processing on the ACK.
 		 * The non-compressed path through the code has this
 		 * weakness (noted by @jtl) that it actually does some
 		 * processing before verifying the timestamp information.
 		 * We don't take that path here which is why we set
 		 * the ack_val_set first, do the timestamp and ecn
 		 * processing, and then look at what we have setup.
 		 */
 		if (ae->ack_val_set == ACK_BEHIND) {
 			/*
 			 * Case B flag reordering, if window is not closed
 			 * or it could be a keep-alive or persists
 			 */
 			if (SEQ_LT(ae->ack, tp->snd_una) && (sbspace(&so->so_rcv) > segsiz)) {
 				rack->r_ctl.rc_reorder_ts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time);
 			}
 		} else if (ae->ack_val_set == ACK_DUPACK) {
 			/* Case D */
 			rack_strike_dupack(rack);
 		} else if (ae->ack_val_set == ACK_RWND) {
 			/* Case C */
 			if ((ae->flags & TSTMP_LRO) || (ae->flags & TSTMP_HDWR)) {
 				/* ae->timestamp is in nanoseconds */
 				ts.tv_sec = ae->timestamp / 1000000000;
 				ts.tv_nsec = ae->timestamp % 1000000000;
 				rack->r_ctl.act_rcv_time.tv_sec = ts.tv_sec;
 				rack->r_ctl.act_rcv_time.tv_usec = ts.tv_nsec/1000;
 			} else {
 				rack->r_ctl.act_rcv_time = *tv;
 			}
 			if (rack->forced_ack) {
 				rack_handle_probe_response(rack, tiwin,
 							   tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time));
 			}
 #ifdef TCP_ACCOUNTING
 			win_up_req = 1;
 #endif
 			win_upd_ack = ae->ack;
 			win_seq = ae->seq;
 			the_win = tiwin;
 			rack_do_win_updates(tp, rack, the_win, win_seq, win_upd_ack, cts, high_seq);
 		} else {
 			/* Case A */
 			if (SEQ_GT(ae->ack, tp->snd_max)) {
 				/*
 				 * We just send an ack since the incoming
 				 * ack is beyond the largest seq we sent.
 				 */
 				if ((tp->t_flags & TF_ACKNOW) == 0) {
 					ctf_ack_war_checks(tp, &rack->r_ctl.challenge_ack_ts, &rack->r_ctl.challenge_ack_cnt);
 					/*
 					 * Only ask for output if the ack-war
 					 * check really set TF_ACKNOW. This must
 					 * be a bitwise test of the flag.
 					 */
 					if (tp->t_flags & TF_ACKNOW)
 						rack->r_wanted_output = 1;
 				}
 			} else {
 				nsegs++;
 				/* If the window changed setup to update */
 				if (tiwin != tp->snd_wnd) {
 					win_upd_ack = ae->ack;
 					win_seq = ae->seq;
 					the_win = tiwin;
 					rack_do_win_updates(tp, rack, the_win, win_seq, win_upd_ack, cts, high_seq);
 				}
 #ifdef TCP_ACCOUNTING
 				/* Account for the acks */
 				if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
 					tp->tcp_cnt_counters[CNT_OF_ACKS_IN] += (((ae->ack - high_seq) + segsiz - 1) / segsiz);
 				}
 				counter_u64_add(tcp_cnt_counters[CNT_OF_ACKS_IN],
 						(((ae->ack - high_seq) + segsiz - 1) / segsiz));
 #endif
 				high_seq = ae->ack;
 				if (rack_verbose_logging && (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) {
 					union tcp_log_stackspecific log;
 					/* Named ltv so it does not shadow the tv argument */
 					struct timeval ltv;
 
 					memset(&log.u_bbr, 0, sizeof(log.u_bbr));
 					log.u_bbr.timeStamp = tcp_get_usecs(&ltv);
 					log.u_bbr.flex1 = high_seq;
 					log.u_bbr.flex2 = rack->r_ctl.roundends;
 					log.u_bbr.flex3 = rack->r_ctl.current_round;
 					log.u_bbr.rttProp = (uint64_t)CC_ALGO(tp)->newround;
 					log.u_bbr.flex8 = 8;
 					tcp_log_event_(tp, NULL, NULL, NULL, BBR_LOG_CWND, 0,
 						       0, &log, false, NULL, NULL, 0, &ltv);
 				}
 				/*
 				 * The draft (v3) calls for us to use SEQ_GEQ, but that
 				 * causes issues when we are just going app limited. Lets
 				 * instead use SEQ_GT <or> where its equal but more data
 				 * is outstanding.
 				 */
 				if ((SEQ_GT(high_seq, rack->r_ctl.roundends)) ||
 				    ((high_seq == rack->r_ctl.roundends) &&
 				     SEQ_GT(tp->snd_max, tp->snd_una))) {
 					rack->r_ctl.current_round++;
 					rack->r_ctl.roundends = tp->snd_max;
 					if (CC_ALGO(tp)->newround != NULL) {
 						CC_ALGO(tp)->newround(tp->ccv, rack->r_ctl.current_round);
 					}
 				}
 				/* Setup our act_rcv_time */
 				if ((ae->flags & TSTMP_LRO) || (ae->flags & TSTMP_HDWR)) {
 					ts.tv_sec = ae->timestamp / 1000000000;
 					ts.tv_nsec = ae->timestamp % 1000000000;
 					rack->r_ctl.act_rcv_time.tv_sec = ts.tv_sec;
 					rack->r_ctl.act_rcv_time.tv_usec = ts.tv_nsec/1000;
 				} else {
 					rack->r_ctl.act_rcv_time = *tv;
 				}
 				rack_process_to_cumack(tp, rack, ae->ack, cts, to);
 				if (rack->rc_dsack_round_seen) {
 					/* Is the dsack round over? */
 					if (SEQ_GEQ(ae->ack, rack->r_ctl.dsack_round_end)) {
 						/* Yes it is */
 						rack->rc_dsack_round_seen = 0;
 						rack_log_dsack_event(rack, 3, __LINE__, 0, 0);
 					}
 				}
 			}
 		}
 		/* And lets be sure to commit the rtt measurements for this ack */
 		tcp_rack_xmit_timer_commit(rack, tp);
 #ifdef TCP_ACCOUNTING
 		rdstc = get_cyclecount();
 		if (rdstc > ts_val) {
 			counter_u64_add(tcp_proc_time[ae->ack_val_set] , (rdstc - ts_val));
 			if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
 				tp->tcp_proc_time[ae->ack_val_set] += (rdstc - ts_val);
 				if (ae->ack_val_set == ACK_CUMACK)
 					tp->tcp_proc_time[CYC_HANDLE_MAP] += (rdstc - ts_val);
 			}
 		}
 #endif
 	}
 #ifdef TCP_ACCOUNTING
 	ts_val = get_cyclecount();
 #endif
 	/* Tend to any collapsed window */
 	if (SEQ_GT(tp->snd_max, high_seq) && (tp->snd_wnd < (tp->snd_max - high_seq))) {
 		/* The peer collapsed the window */
 		rack_collapsed_window(rack, (tp->snd_max - high_seq), __LINE__);
 	} else if (rack->rc_has_collapsed)
 		rack_un_collapse_window(rack, __LINE__);
 	if ((rack->r_collapse_point_valid) &&
 	    (SEQ_GT(high_seq, rack->r_ctl.high_collapse_point)))
 		rack->r_collapse_point_valid = 0;
 	/*
 	 * Second phase: apply the total cum-ack movement. acked is the
 	 * sequence space covered, acked_amount is what we will actually
 	 * trim from the send socket buffer.
 	 */
 	acked_amount = acked = (high_seq - tp->snd_una);
 	if (acked) {
 		/*
 		 * Clear the probe not answered flag
 		 * since cum-ack moved forward.
 		 */
 		rack->probe_not_answered = 0;
 		if (rack->sack_attack_disable == 0)
 			rack_do_decay(rack);
 		if (acked >= segsiz) {
 			/*
 			 * You only get credit for
 			 * MSS and greater (and you get extra
 			 * credit for larger cum-ack moves).
 			 */
 			int ac;
 
 			ac = acked / segsiz;
 			rack->r_ctl.ack_count += ac;
 			counter_u64_add(rack_ack_total, ac);
 		}
 		if (rack->r_ctl.ack_count > 0xfff00000) {
 			/*
 			 * reduce the number to keep us under
 			 * a uint32_t.
 			 */
 			rack->r_ctl.ack_count /= 2;
 			rack->r_ctl.sack_count /= 2;
 		}
 		if (tp->t_flags & TF_NEEDSYN) {
 			/*
 			 * T/TCP: Connection was half-synchronized, and our SYN has
 			 * been ACK'd (so connection is now fully synchronized).  Go
 			 * to non-starred state, increment snd_una for ACK of SYN,
 			 * and check if we can do window scaling.
 			 */
 			tp->t_flags &= ~TF_NEEDSYN;
 			tp->snd_una++;
 			acked_amount = acked = (high_seq - tp->snd_una);
 		}
 		if (acked > sbavail(&so->so_snd))
 			acked_amount = sbavail(&so->so_snd);
 #ifdef NETFLIX_EXP_DETECTION
 		/*
 		 * We only care on a cum-ack move if we are in a sack-disabled
 		 * state. We have already added in to the ack_count, and we never
 		 * would disable on a cum-ack move, so we only care to do the
 		 * detection if it may "undo" it, i.e. we were in disabled already.
 		 */
 		if (rack->sack_attack_disable)
 			rack_do_detection(tp, rack, acked_amount, segsiz);
 #endif
 		if (IN_FASTRECOVERY(tp->t_flags) &&
 		    (rack->rack_no_prr == 0))
 			rack_update_prr(tp, rack, acked_amount, high_seq);
 		if (IN_RECOVERY(tp->t_flags)) {
 			if (SEQ_LT(high_seq, tp->snd_recover) &&
 			    (SEQ_LT(high_seq, tp->snd_max))) {
 				tcp_rack_partialack(tp);
 			} else {
 				rack_post_recovery(tp, high_seq);
 				recovery = 1;
 			}
 		}
 		/* Handle the rack-log-ack part (sendmap) */
 		if ((sbused(&so->so_snd) == 0) &&
 		    (acked > acked_amount) &&
 		    (tp->t_state >= TCPS_FIN_WAIT_1) &&
 		    (tp->t_flags & TF_SENTFIN)) {
 			/*
 			 * We must be sure our fin
 			 * was sent and acked (we can be
 			 * in FIN_WAIT_1 without having
 			 * sent the fin).
 			 */
 			ourfinisacked = 1;
 			/*
 			 * Lets make sure snd_una is updated
 			 * since most likely acked_amount = 0 (it
 			 * should be).
 			 */
 			tp->snd_una = high_seq;
 		}
 		/* Did we make a RTO error? */
 		if ((tp->t_flags & TF_PREVVALID) &&
 		    ((tp->t_flags & TF_RCVD_TSTMP) == 0)) {
 			tp->t_flags &= ~TF_PREVVALID;
 			if (tp->t_rxtshift == 1 &&
 			    (int)(ticks - tp->t_badrxtwin) < 0)
 				rack_cong_signal(tp, CC_RTO_ERR, high_seq, __LINE__);
 		}
 		/* Handle the data in the socket buffer */
 		KMOD_TCPSTAT_ADD(tcps_rcvackpack, 1);
 		KMOD_TCPSTAT_ADD(tcps_rcvackbyte, acked);
 		if (acked_amount > 0) {
 			struct mbuf *mfree;
 
 			rack_ack_received(tp, rack, high_seq, nsegs, CC_ACK, recovery);
 			SOCKBUF_LOCK(&so->so_snd);
 			mfree = sbcut_locked(&so->so_snd, acked_amount);
 			tp->snd_una = high_seq;
 			/* Note we want to hold the sb lock through the sendmap adjust */
 			rack_adjust_sendmap(rack, &so->so_snd, tp->snd_una);
 			/* Wake up the socket if we have room to write more */
 			rack_log_wakeup(tp,rack, &so->so_snd, acked, 2);
 			sowwakeup_locked(so);
 			m_freem(mfree);
 		}
 		/* update progress */
 		tp->t_acktime = ticks;
 		rack_log_progress_event(rack, tp, tp->t_acktime,
 					PROGRESS_UPDATE, __LINE__);
 		/* Clear out shifts and such */
 		tp->t_rxtshift = 0;
 		RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp),
 				   rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop);
 		rack->rc_tlp_in_progress = 0;
 		rack->r_ctl.rc_tlp_cnt_out = 0;
 		/* Send recover and snd_nxt must be dragged along */
 		if (SEQ_GT(tp->snd_una, tp->snd_recover))
 			tp->snd_recover = tp->snd_una;
 		if (SEQ_LT(tp->snd_nxt, tp->snd_una))
 			tp->snd_nxt = tp->snd_una;
 		/*
 		 * If the RXT timer is running we want to
 		 * stop it, so we can restart a TLP (or new RXT).
 		 */
 		if (rack->r_ctl.rc_hpts_flags & PACE_TMR_RXT)
 			rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
 #ifdef NETFLIX_HTTP_LOGGING
 		tcp_http_check_for_comp(rack->rc_tp, high_seq);
 #endif
 		tp->snd_wl2 = high_seq;
 		tp->t_dupacks = 0;
 		if (under_pacing &&
 		    (rack->use_fixed_rate == 0) &&
 		    (rack->in_probe_rtt == 0) &&
 		    rack->rc_gp_dyn_mul &&
 		    rack->rc_always_pace) {
 			/* Check if we are dragging bottom */
 			rack_check_bottom_drag(tp, rack, so, acked);
 		}
 		if (tp->snd_una == tp->snd_max) {
 			/* Everything we sent has been acked */
 			tp->t_flags &= ~TF_PREVVALID;
 			rack->r_ctl.retran_during_recovery = 0;
 			rack->r_ctl.dsack_byte_cnt = 0;
 			rack->r_ctl.rc_went_idle_time = tcp_get_usecs(NULL);
 			if (rack->r_ctl.rc_went_idle_time == 0)
 				rack->r_ctl.rc_went_idle_time = 1;
 			rack_log_progress_event(rack, tp, 0, PROGRESS_CLEAR, __LINE__);
 			if (sbavail(&tp->t_inpcb->inp_socket->so_snd) == 0)
 				tp->t_acktime = 0;
 			/* Set so we might enter persists... */
 			rack->r_wanted_output = 1;
 			rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
 			sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una);
 			if ((tp->t_state >= TCPS_FIN_WAIT_1) &&
 			    (sbavail(&so->so_snd) == 0) &&
 			    (tp->t_flags2 & TF2_DROP_AF_DATA)) {
 				/*
 				 * The socket was gone and the
 				 * peer sent data (not now in the past), time to
 				 * reset him.
 				 */
 				rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
 				/* tcp_close will kill the inp pre-log the Reset */
 				tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST);
 #ifdef TCP_ACCOUNTING
 				rdstc = get_cyclecount();
 				if (rdstc > ts_val) {
 					counter_u64_add(tcp_proc_time[ACK_CUMACK] , (rdstc - ts_val));
 					if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
 						tp->tcp_proc_time[ACK_CUMACK] += (rdstc - ts_val);
 						tp->tcp_proc_time[CYC_HANDLE_ACK] += (rdstc - ts_val);
 					}
 				}
 #endif
 				/* m is released here; the rst path below does not touch it */
 				m_freem(m);
 				tp = tcp_close(tp);
 				if (tp == NULL) {
 #ifdef TCP_ACCOUNTING
 					sched_unpin();
 #endif
 					return (1);
 				}
 				/*
 				 * We would normally do drop-with-reset which would
 				 * send back a reset. We can't since we don't have
 				 * all the needed bits. Instead lets arrange for
 				 * a call to tcp_output(). That way since we
 				 * are in the closed state we will generate a reset.
 				 *
 				 * Note if tcp_accounting is on we don't unpin since
 				 * we do that after the goto label.
 				 */
 				goto send_out_a_rst;
 			}
 			if ((sbused(&so->so_snd) == 0) &&
 			    (tp->t_state >= TCPS_FIN_WAIT_1) &&
 			    (tp->t_flags & TF_SENTFIN)) {
 				/*
 				 * If we can't receive any more data, then closing user can
 				 * proceed. Starting the timer is contrary to the
 				 * specification, but if we don't get a FIN we'll hang
 				 * forever.
 				 *
 				 */
 				if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
 					soisdisconnected(so);
 					tcp_timer_activate(tp, TT_2MSL,
 							   (tcp_fast_finwait2_recycle ?
 							    tcp_finwait2_timeout :
 							    TP_MAXIDLE(tp)));
 				}
 				if (ourfinisacked == 0) {
 					/*
 					 * We don't change to fin-wait-2 if we have our fin acked
 					 * which means we are probably in TCPS_CLOSING.
 					 */
 					tcp_state_change(tp, TCPS_FIN_WAIT_2);
 				}
 			}
 		}
 		/* Wake up the socket if we have room to write more */
 		if (sbavail(&so->so_snd)) {
 			rack->r_wanted_output = 1;
 			if (ctf_progress_timeout_check(tp, true)) {
 				/*
 				 * NOTE(review): this passes "tick" and not
 				 * "ticks" as the logged time -- confirm that
 				 * is intended.
 				 */
 				rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr,
 							tp, tick, PROGRESS_DROP, __LINE__);
 				/*
 				 * We cheat here and don't send a RST, we should send one
 				 * when the pacer drops the connection.
 				 */
 #ifdef TCP_ACCOUNTING
 				rdstc = get_cyclecount();
 				if (rdstc > ts_val) {
 					counter_u64_add(tcp_proc_time[ACK_CUMACK] , (rdstc - ts_val));
 					if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
 						tp->tcp_proc_time[ACK_CUMACK] += (rdstc - ts_val);
 						tp->tcp_proc_time[CYC_HANDLE_ACK] += (rdstc - ts_val);
 					}
 				}
 				sched_unpin();
 #endif
 				(void)tcp_drop(tp, ETIMEDOUT);
 				m_freem(m);
 				return (1);
 			}
 		}
 		if (ourfinisacked) {
 			switch(tp->t_state) {
 			case TCPS_CLOSING:
 #ifdef TCP_ACCOUNTING
 				rdstc = get_cyclecount();
 				if (rdstc > ts_val) {
 					counter_u64_add(tcp_proc_time[ACK_CUMACK] ,
 							(rdstc - ts_val));
 					if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
 						tp->tcp_proc_time[ACK_CUMACK] += (rdstc - ts_val);
 						tp->tcp_proc_time[CYC_HANDLE_ACK] += (rdstc - ts_val);
 					}
 				}
 				sched_unpin();
 #endif
 				tcp_twstart(tp);
 				m_freem(m);
 				return (1);
 				break;
 			case TCPS_LAST_ACK:
 #ifdef TCP_ACCOUNTING
 				rdstc = get_cyclecount();
 				if (rdstc > ts_val) {
 					counter_u64_add(tcp_proc_time[ACK_CUMACK] ,
 							(rdstc - ts_val));
 					if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
 						tp->tcp_proc_time[ACK_CUMACK] += (rdstc - ts_val);
 						tp->tcp_proc_time[CYC_HANDLE_ACK] += (rdstc - ts_val);
 					}
 				}
 				sched_unpin();
 #endif
 				tp = tcp_close(tp);
 				ctf_do_drop(m, tp);
 				return (1);
 				break;
 			case TCPS_FIN_WAIT_1:
 #ifdef TCP_ACCOUNTING
 				rdstc = get_cyclecount();
 				if (rdstc > ts_val) {
 					counter_u64_add(tcp_proc_time[ACK_CUMACK] ,
 							(rdstc - ts_val));
 					if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
 						tp->tcp_proc_time[ACK_CUMACK] += (rdstc - ts_val);
 						tp->tcp_proc_time[CYC_HANDLE_ACK] += (rdstc - ts_val);
 					}
 				}
 #endif
 				if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
 					soisdisconnected(so);
 					tcp_timer_activate(tp, TT_2MSL,
 							   (tcp_fast_finwait2_recycle ?
 							    tcp_finwait2_timeout :
 							    TP_MAXIDLE(tp)));
 				}
 				tcp_state_change(tp, TCPS_FIN_WAIT_2);
 				break;
 			default:
 				break;
 			}
 		}
 		if (rack->r_fast_output) {
 			/*
 			 * We re doing fast output.. can we expand that?
 			 */
 			rack_gain_for_fastoutput(rack, tp, so, acked_amount);
 		}
 #ifdef TCP_ACCOUNTING
 		rdstc = get_cyclecount();
 		if (rdstc > ts_val) {
 			counter_u64_add(tcp_proc_time[ACK_CUMACK] , (rdstc - ts_val));
 			if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
 				tp->tcp_proc_time[ACK_CUMACK] += (rdstc - ts_val);
 				tp->tcp_proc_time[CYC_HANDLE_ACK] += (rdstc - ts_val);
 			}
 		}
 
 	} else if (win_up_req) {
 		rdstc = get_cyclecount();
 		if (rdstc > ts_val) {
 			counter_u64_add(tcp_proc_time[ACK_RWND] , (rdstc - ts_val));
 			if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
 				tp->tcp_proc_time[ACK_RWND] += (rdstc - ts_val);
 			}
 		}
 #endif
 	}
 	/* Now is there a next packet, if so we are done */
 	m_freem(m);
 	did_out = 0;
 	if (nxt_pkt) {
 #ifdef TCP_ACCOUNTING
 		sched_unpin();
 #endif
 		rack_log_doseg_done(rack, cts, nxt_pkt, did_out, 5, nsegs);
 		return (0);
 	}
 	rack_handle_might_revert(tp, rack);
 	ctf_calc_rwin(so, tp);
 	if ((rack->r_wanted_output != 0) || (rack->r_fast_output != 0)) {
 	send_out_a_rst:
 		if (tcp_output(tp) < 0) {
 #ifdef TCP_ACCOUNTING
 			sched_unpin();
 #endif
 			return (1);
 		}
 		did_out = 1;
 	}
 	rack_free_trim(rack);
 #ifdef TCP_ACCOUNTING
 	sched_unpin();
 #endif
 	rack_timer_audit(tp, rack, &so->so_snd);
 	rack_log_doseg_done(rack, cts, nxt_pkt, did_out, 6, nsegs);
 	return (0);
 }
 
 
 static int
 rack_do_segment_nounlock(struct mbuf *m, struct tcphdr *th, struct socket *so,
     struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, uint8_t iptos,
     int32_t nxt_pkt, struct timeval *tv)
 {
 #ifdef TCP_ACCOUNTING
 	uint64_t ts_val;
 #endif
 	int32_t thflags, retval, did_out = 0;
 	int32_t way_out = 0;
 	/*
 	 * cts - is the current time from tv (caller gets ts) in microseconds.
 	 * ms_cts - is the current time from tv in milliseconds.
 	 * us_cts - is the time that LRO or hardware actually got the packet in microseconds.
 	 */
 	uint32_t cts, us_cts, ms_cts;
 	uint32_t tiwin, high_seq;
 	struct timespec ts;
 	struct tcpopt to;
 	struct tcp_rack *rack;
 	struct rack_sendmap *rsm;
 	int32_t prev_state = 0;
 #ifdef TCP_ACCOUNTING
 	int ack_val_set = 0xf;
 #endif
 	int nsegs;
 	/*
 	 * tv passed from common code is from either M_TSTMP_LRO or
 	 * tcp_get_usecs() if no LRO m_pkthdr timestamp is present.
 	 */
 	rack = (struct tcp_rack *)tp->t_fb_ptr;
 	if (m->m_flags & M_ACKCMP) {
 		/*
 		 * All compressed ack's are ack's by definition so
 		 * remove any ack required flag and then do the processing.
 		 */
 		rack->rc_ack_required = 0;
 		return (rack_do_compressed_ack_processing(tp, so, m, nxt_pkt, tv));
 	}
 	if (m->m_flags & M_ACKCMP) {
 		panic("Impossible reach m has ackcmp? m:%p tp:%p", m, tp);
 	}
 	cts = tcp_tv_to_usectick(tv);
 	ms_cts =  tcp_tv_to_mssectick(tv);
 	nsegs = m->m_pkthdr.lro_nsegs;
 	counter_u64_add(rack_proc_non_comp_ack, 1);
 	thflags = tcp_get_flags(th);
 #ifdef TCP_ACCOUNTING
 	sched_pin();
 	if (thflags & TH_ACK)
 		ts_val = get_cyclecount();
 #endif
 	if ((m->m_flags & M_TSTMP) ||
 	    (m->m_flags & M_TSTMP_LRO)) {
 		mbuf_tstmp2timespec(m, &ts);
 		rack->r_ctl.act_rcv_time.tv_sec = ts.tv_sec;
 		rack->r_ctl.act_rcv_time.tv_usec = ts.tv_nsec/1000;
 	} else
 		rack->r_ctl.act_rcv_time = *tv;
 	kern_prefetch(rack, &prev_state);
 	prev_state = 0;
 	/*
 	 * Unscale the window into a 32-bit value. For the SYN_SENT state
 	 * the scale is zero.
 	 */
 	tiwin = th->th_win << tp->snd_scale;
 #ifdef TCP_ACCOUNTING
 	if (thflags & TH_ACK) {
 		/*
 		 * We have a tradeoff here. We can either do what we are
 		 * doing i.e. pinning to this CPU and then doing the accounting
 		 * <or> we could do a critical enter, setup the rdtsc and cpu
 		 * as in below, and then validate we are on the same CPU on
 		 * exit. I have chosen to not do the critical enter since
 		 * that often will gain you a context switch, and instead lock
 		 * us (line above this if) to the same CPU with sched_pin(). This
 		 * means we may be context switched out for a higher priority
 		 * interrupt but we won't be moved to another CPU.
 		 *
 		 * If this occurs (which it won't very often since we most likely
 		 * are running this code in interrupt context and only a higher
 		 * priority will bump us ... clock?) we will falsely add in
 		 * to the time the interrupt processing time plus the ack processing
 		 * time. This is ok since it's a rare event.
 		 */
 		ack_val_set = tcp_do_ack_accounting(tp, th, &to, tiwin,
 						    ctf_fixed_maxseg(tp));
 	}
 #endif
 	/*
 	 * Parse options on any incoming segment.
 	 */
 	memset(&to, 0, sizeof(to));
 	tcp_dooptions(&to, (u_char *)(th + 1),
 	    (th->th_off << 2) - sizeof(struct tcphdr),
 	    (thflags & TH_SYN) ? TO_SYN : 0);
 	NET_EPOCH_ASSERT();
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 	KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN",
 	    __func__));
 
 	if ((tp->t_state >= TCPS_FIN_WAIT_1) &&
 	    (tp->t_flags & TF_GPUTINPROG)) {
 		/*
 		 * We have a goodput in progress
 		 * and we have entered a late state.
 		 * Do we have enough data in the sb
 		 * to handle the GPUT request?
 		 */
 		uint32_t bytes;
 
 		bytes = tp->gput_ack - tp->gput_seq;
 		if (SEQ_GT(tp->gput_seq, tp->snd_una))
 			bytes += tp->gput_seq - tp->snd_una;
 		if (bytes > sbavail(&tp->t_inpcb->inp_socket->so_snd)) {
 			/*
 			 * There are not enough bytes in the socket
 			 * buffer that have been sent to cover this
 			 * measurement. Cancel it.
 			 */
 			rack_log_pacing_delay_calc(rack, (tp->gput_ack - tp->gput_seq) /*flex2*/,
 						   rack->r_ctl.rc_gp_srtt /*flex1*/,
 						   tp->gput_seq,
 						   0, 0, 18, __LINE__, NULL, 0);
 			tp->t_flags &= ~TF_GPUTINPROG;
 		}
 	}
 	high_seq = th->th_ack;
 	if (tp->t_logstate != TCP_LOG_STATE_OFF) {
 		union tcp_log_stackspecific log;
 		struct timeval ltv;
 #ifdef NETFLIX_HTTP_LOGGING
 		struct http_sendfile_track *http_req;
 
 		if (SEQ_GT(th->th_ack, tp->snd_una)) {
 			http_req = tcp_http_find_req_for_seq(tp, (th->th_ack-1));
 		} else {
 			http_req = tcp_http_find_req_for_seq(tp, th->th_ack);
 		}
 #endif
 		memset(&log.u_bbr, 0, sizeof(log.u_bbr));
 		log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
 		if (rack->rack_no_prr == 0)
 			log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt;
 		else
 			log.u_bbr.flex1 = 0;
 		log.u_bbr.use_lt_bw = rack->r_ent_rec_ns;
 		log.u_bbr.use_lt_bw <<= 1;
 		log.u_bbr.use_lt_bw |= rack->r_might_revert;
 		log.u_bbr.flex2 = rack->r_ctl.rc_num_maps_alloced;
 		log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
 		log.u_bbr.pkts_out = rack->rc_tp->t_maxseg;
 		log.u_bbr.flex3 = m->m_flags;
 		log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags;
 		log.u_bbr.lost = thflags;
 		log.u_bbr.pacing_gain = 0x1;
 #ifdef TCP_ACCOUNTING
 		log.u_bbr.cwnd_gain = ack_val_set;
 #endif
 		log.u_bbr.flex7 = 2;
 		if (m->m_flags & M_TSTMP) {
 			/* Record the hardware timestamp if present */
 			mbuf_tstmp2timespec(m, &ts);
 			ltv.tv_sec = ts.tv_sec;
 			ltv.tv_usec = ts.tv_nsec / 1000;
 			log.u_bbr.lt_epoch = tcp_tv_to_usectick(&ltv);
 		} else if (m->m_flags & M_TSTMP_LRO) {
 			/* Record the LRO the arrival timestamp */
 			mbuf_tstmp2timespec(m, &ts);
 			ltv.tv_sec = ts.tv_sec;
 			ltv.tv_usec = ts.tv_nsec / 1000;
 			log.u_bbr.flex5 = tcp_tv_to_usectick(&ltv);
 		}
 		log.u_bbr.timeStamp = tcp_get_usecs(&ltv);
 		/* Log the rcv time */
 		log.u_bbr.delRate = m->m_pkthdr.rcv_tstmp;
 #ifdef NETFLIX_HTTP_LOGGING
 		log.u_bbr.applimited = tp->t_http_closed;
 		log.u_bbr.applimited <<= 8;
 		log.u_bbr.applimited |= tp->t_http_open;
 		log.u_bbr.applimited <<= 8;
 		log.u_bbr.applimited |= tp->t_http_req;
 		if (http_req) {
 			/* Copy out any client req info */
 			/* seconds */
 			log.u_bbr.pkt_epoch = (http_req->localtime / HPTS_USEC_IN_SEC);
 			/* useconds */
 			log.u_bbr.delivered = (http_req->localtime % HPTS_USEC_IN_SEC);
 			log.u_bbr.rttProp = http_req->timestamp;
 			log.u_bbr.cur_del_rate = http_req->start;
 			if (http_req->flags & TCP_HTTP_TRACK_FLG_OPEN) {
 				log.u_bbr.flex8 |= 1;
 			} else {
 				log.u_bbr.flex8 |= 2;
 				log.u_bbr.bw_inuse = http_req->end;
 			}
 			log.u_bbr.flex6 = http_req->start_seq;
 			if (http_req->flags & TCP_HTTP_TRACK_FLG_COMP) {
 				log.u_bbr.flex8 |= 4;
 				log.u_bbr.epoch = http_req->end_seq;
 			}
 		}
 #endif
 		TCP_LOG_EVENTP(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_IN, 0,
 		    tlen, &log, true, &ltv);
 	}
 	/* Remove ack required flag if set, we have one  */
 	if (thflags & TH_ACK)
 		rack->rc_ack_required = 0;
 	if ((thflags & TH_SYN) && (thflags & TH_FIN) && V_drop_synfin) {
 		way_out = 4;
 		retval = 0;
 		m_freem(m);
 		goto done_with_input;
 	}
 	/*
 	 * If a segment with the ACK-bit set arrives in the SYN-SENT state
 	 * check SEQ.ACK first as described on page 66 of RFC 793, section 3.9.
 	 */
 	if ((tp->t_state == TCPS_SYN_SENT) && (thflags & TH_ACK) &&
 	    (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) {
 		tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
 		ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
 #ifdef TCP_ACCOUNTING
 		sched_unpin();
 #endif
 		return (1);
 	}
 	/*
 	 * If timestamps were negotiated during SYN/ACK and a
 	 * segment without a timestamp is received, silently drop
 	 * the segment, unless it is a RST segment or missing timestamps are
 	 * tolerated.
 	 * See section 3.2 of RFC 7323.
 	 */
 	if ((tp->t_flags & TF_RCVD_TSTMP) && !(to.to_flags & TOF_TS) &&
 	    ((thflags & TH_RST) == 0) && (V_tcp_tolerate_missing_ts == 0)) {
 		way_out = 5;
 		retval = 0;
 		m_freem(m);
 		goto done_with_input;
 	}
 
 	/*
 	 * Segment received on connection. Reset idle time and keep-alive
 	 * timer. XXX: This should be done after segment validation to
 	 * ignore broken/spoofed segs.
 	 */
 	if  (tp->t_idle_reduce &&
 	     (tp->snd_max == tp->snd_una) &&
 	     (TICKS_2_USEC(ticks - tp->t_rcvtime) >= tp->t_rxtcur)) {
 		counter_u64_add(rack_input_idle_reduces, 1);
 		rack_cc_after_idle(rack, tp);
 	}
 	tp->t_rcvtime = ticks;
 #ifdef STATS
 	stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_FRWIN, tiwin);
 #endif
 	if (tiwin > rack->r_ctl.rc_high_rwnd)
 		rack->r_ctl.rc_high_rwnd = tiwin;
 	/*
 	 * TCP ECN processing. XXXJTL: If we ever use ECN, we need to move
 	 * this to occur after we've validated the segment.
 	 */
 	if (tcp_ecn_input_segment(tp, thflags, iptos))
 		rack_cong_signal(tp, CC_ECN, th->th_ack, __LINE__);
 
 	/*
 	 * If echoed timestamp is later than the current time, fall back to
 	 * non RFC1323 RTT calculation.  Normalize timestamp if syncookies
 	 * were used when this connection was established.
 	 */
 	if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) {
 		to.to_tsecr -= tp->ts_offset;
 		if (TSTMP_GT(to.to_tsecr, ms_cts))
 			to.to_tsecr = 0;
 	}
 
 	/*
 	 * If its the first time in we need to take care of options and
 	 * verify we can do SACK for rack!
 	 */
 	if (rack->r_state == 0) {
 		/* Should be init'd by rack_init() */
 		KASSERT(rack->rc_inp != NULL,
 		    ("%s: rack->rc_inp unexpectedly NULL", __func__));
 		if (rack->rc_inp == NULL) {
 			rack->rc_inp = tp->t_inpcb;
 		}
 
 		/*
 		 * Process options only when we get SYN/ACK back. The SYN
 		 * case for incoming connections is handled in tcp_syncache.
 		 * According to RFC1323 the window field in a SYN (i.e., a
 		 * <SYN> or <SYN,ACK>) segment itself is never scaled. XXX
 		 * this is traditional behavior, may need to be cleaned up.
 		 */
 		if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) {
 			/* Handle parallel SYN for ECN */
 			tcp_ecn_input_parallel_syn(tp, thflags, iptos);
 			if ((to.to_flags & TOF_SCALE) &&
 			    (tp->t_flags & TF_REQ_SCALE)) {
 				tp->t_flags |= TF_RCVD_SCALE;
 				tp->snd_scale = to.to_wscale;
 			} else
 				tp->t_flags &= ~TF_REQ_SCALE;
 			/*
 			 * Initial send window.  It will be updated with the
 			 * next incoming segment to the scaled value.
 			 */
 			tp->snd_wnd = th->th_win;
 			rack_validate_fo_sendwin_up(tp, rack);
 			if ((to.to_flags & TOF_TS) &&
 			    (tp->t_flags & TF_REQ_TSTMP)) {
 				tp->t_flags |= TF_RCVD_TSTMP;
 				tp->ts_recent = to.to_tsval;
 				tp->ts_recent_age = cts;
 			} else
 				tp->t_flags &= ~TF_REQ_TSTMP;
 			if (to.to_flags & TOF_MSS) {
 				tcp_mss(tp, to.to_mss);
 			}
 			if ((tp->t_flags & TF_SACK_PERMIT) &&
 			    (to.to_flags & TOF_SACKPERM) == 0)
 				tp->t_flags &= ~TF_SACK_PERMIT;
 			if (IS_FASTOPEN(tp->t_flags)) {
 				if (to.to_flags & TOF_FASTOPEN) {
 					uint16_t mss;
 
 					if (to.to_flags & TOF_MSS)
 						mss = to.to_mss;
 					else
 						if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0)
 							mss = TCP6_MSS;
 						else
 							mss = TCP_MSS;
 					tcp_fastopen_update_cache(tp, mss,
 					    to.to_tfo_len, to.to_tfo_cookie);
 				} else
 					tcp_fastopen_disable_path(tp);
 			}
 		}
 		/*
 		 * At this point we are at the initial call. Here we decide
 		 * if we are doing RACK or not. We do this by seeing if
 		 * TF_SACK_PERMIT is set and the sack-not-required is clear.
 		 * The code now does do dup-ack counting so if you don't
 		 * switch back you won't get rack & TLP, but you will still
 		 * get this stack.
 		 */
 
 		if ((rack_sack_not_required == 0) &&
 		    ((tp->t_flags & TF_SACK_PERMIT) == 0)) {
 			tcp_switch_back_to_default(tp);
 			(*tp->t_fb->tfb_tcp_do_segment) (m, th, so, tp, drop_hdrlen,
 			    tlen, iptos);
 #ifdef TCP_ACCOUNTING
 			sched_unpin();
 #endif
 			return (1);
 		}
 		tcp_set_hpts(tp->t_inpcb);
 		sack_filter_clear(&rack->r_ctl.rack_sf, th->th_ack);
 	}
 	if (thflags & TH_FIN)
 		tcp_log_end_status(tp, TCP_EI_STATUS_CLIENT_FIN);
 	us_cts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time);
 	if ((rack->rc_gp_dyn_mul) &&
 	    (rack->use_fixed_rate == 0) &&
 	    (rack->rc_always_pace)) {
 		/* Check in on probertt */
 		rack_check_probe_rtt(rack, us_cts);
 	}
 	rack_clear_rate_sample(rack);
 	if ((rack->forced_ack) &&
 	    ((tcp_get_flags(th) & TH_RST) == 0)) {
 		rack_handle_probe_response(rack, tiwin, us_cts);
 	}
 	/*
 	 * This is the one exception case where we set the rack state
 	 * always. All other times (timers etc) we must have a rack-state
 	 * set (so we assure we have done the checks above for SACK).
 	 */
 	rack->r_ctl.rc_rcvtime = cts;
 	if (rack->r_state != tp->t_state)
 		rack_set_state(tp, rack);
 	if (SEQ_GT(th->th_ack, tp->snd_una) &&
 	    (rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree)) != NULL)
 		kern_prefetch(rsm, &prev_state);
 	prev_state = rack->r_state;
 	retval = (*rack->r_substate) (m, th, so,
 	    tp, &to, drop_hdrlen,
 	    tlen, tiwin, thflags, nxt_pkt, iptos);
 #ifdef INVARIANTS
 	if ((retval == 0) &&
 	    (tp->t_inpcb == NULL)) {
 		panic("retval:%d tp:%p t_inpcb:NULL state:%d",
 		    retval, tp, prev_state);
 	}
 #endif
 	if (retval == 0) {
 		/*
 		 * If retval is 1 the tcb is unlocked and most likely the tp
 		 * is gone.
 		 */
 		INP_WLOCK_ASSERT(tp->t_inpcb);
 		if ((rack->rc_gp_dyn_mul) &&
 		    (rack->rc_always_pace) &&
 		    (rack->use_fixed_rate == 0) &&
 		    rack->in_probe_rtt &&
 		    (rack->r_ctl.rc_time_probertt_starts == 0)) {
 			/*
 			 * If we are going for target, lets recheck before
 			 * we output.
 			 */
 			rack_check_probe_rtt(rack, us_cts);
 		}
 		if (rack->set_pacing_done_a_iw == 0) {
 			/* How much has been acked? */
 			if ((tp->snd_una - tp->iss) > (ctf_fixed_maxseg(tp) * 10)) {
 				/* We have enough to set in the pacing segment size */
 				rack->set_pacing_done_a_iw = 1;
 				rack_set_pace_segments(tp, rack, __LINE__, NULL);
 			}
 		}
 		tcp_rack_xmit_timer_commit(rack, tp);
 #ifdef TCP_ACCOUNTING
 		/*
 		 * If we set the ack_val_se to what ack processing we are doing
 		 * we also want to track how many cycles we burned. Note
 		 * the bits after tcp_output we let be "free". This is because
 		 * we are also tracking the tcp_output times as well. Note the
 		 * use of 0xf here since we only have 11 counter (0 - 0xa) and
 		 * 0xf cannot be returned and is what we initialize it too to
 		 * indicate we are not doing the tabulations.
 		 */
 		if (ack_val_set != 0xf) {
 			uint64_t crtsc;
 
 			crtsc = get_cyclecount();
 			counter_u64_add(tcp_proc_time[ack_val_set] , (crtsc - ts_val));
 			if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
 				tp->tcp_proc_time[ack_val_set] += (crtsc - ts_val);
 			}
 		}
 #endif
 		if (nxt_pkt == 0) {
 			if ((rack->r_wanted_output != 0) || (rack->r_fast_output != 0)) {
 do_output_now:
 				if (tcp_output(tp) < 0)
 					return (1);
 				did_out = 1;
 			}
 			rack_start_hpts_timer(rack, tp, cts, 0, 0, 0);
 			rack_free_trim(rack);
 		}
 		/* Update any rounds needed */
 		if (rack_verbose_logging &&  (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) {
 			union tcp_log_stackspecific log;
 			struct timeval tv;
 
 			memset(&log.u_bbr, 0, sizeof(log.u_bbr));
 			log.u_bbr.timeStamp = tcp_get_usecs(&tv);
 			log.u_bbr.flex1 = high_seq;
 			log.u_bbr.flex2 = rack->r_ctl.roundends;
 			log.u_bbr.flex3 = rack->r_ctl.current_round;
 			log.u_bbr.rttProp = (uint64_t)CC_ALGO(tp)->newround;
 			log.u_bbr.flex8 = 9;
 			tcp_log_event_(tp, NULL, NULL, NULL, BBR_LOG_CWND, 0,
 				       0, &log, false, NULL, NULL, 0, &tv);
 		}
 		/*
 		 * The draft (v3) calls for us to use SEQ_GEQ, but that
 		 * causes issues when we are just going app limited. Lets
 		 * instead use SEQ_GT <or> where its equal but more data
 		 * is outstanding.
 		 */
 		if ((SEQ_GT(tp->snd_una, rack->r_ctl.roundends)) ||
 		    ((tp->snd_una == rack->r_ctl.roundends) && SEQ_GT(tp->snd_max, tp->snd_una))) {
 			rack->r_ctl.current_round++;
 			rack->r_ctl.roundends = tp->snd_max;
 			if (CC_ALGO(tp)->newround != NULL) {
 				CC_ALGO(tp)->newround(tp->ccv, rack->r_ctl.current_round);
 			}
 		}
 		if ((nxt_pkt == 0) &&
 		    ((rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) == 0) &&
 		    (SEQ_GT(tp->snd_max, tp->snd_una) ||
 		     (tp->t_flags & TF_DELACK) ||
 		     ((V_tcp_always_keepalive || rack->rc_inp->inp_socket->so_options & SO_KEEPALIVE) &&
 		      (tp->t_state <= TCPS_CLOSING)))) {
 			/* We could not send (probably in the hpts but stopped the timer earlier)? */
 			if ((tp->snd_max == tp->snd_una) &&
 			    ((tp->t_flags & TF_DELACK) == 0) &&
 			    (tcp_in_hpts(rack->rc_inp)) &&
 			    (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) {
 				/* keep alive not needed if we are hptsi output yet */
 				;
 			} else {
 				int late = 0;
 				if (tcp_in_hpts(rack->rc_inp)) {
 					if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) {
 						us_cts = tcp_get_usecs(NULL);
 						if (TSTMP_GT(rack->r_ctl.rc_last_output_to, us_cts)) {
 							rack->r_early = 1;
 							rack->r_ctl.rc_agg_early += (rack->r_ctl.rc_last_output_to - us_cts);
 						} else
 							late = 1;
 						rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT;
 					}
 					tcp_hpts_remove(tp->t_inpcb);
 				}
 				if (late && (did_out == 0)) {
 					/*
 					 * We are late in the sending
 					 * and we did not call the output
 					 * (this probably should not happen).
 					 */
 					goto do_output_now;
 				}
 				rack_start_hpts_timer(rack, tp, tcp_get_usecs(NULL), 0, 0, 0);
 			}
 			way_out = 1;
 		} else if (nxt_pkt == 0) {
 			/* Do we have the correct timer running? */
 			rack_timer_audit(tp, rack, &so->so_snd);
 			way_out = 2;
 		}
 	done_with_input:
 		rack_log_doseg_done(rack, cts, nxt_pkt, did_out, way_out, max(1, nsegs));
 		if (did_out)
 			rack->r_wanted_output = 0;
 #ifdef INVARIANTS
 		if (tp->t_inpcb == NULL) {
 			panic("OP:%d retval:%d tp:%p t_inpcb:NULL state:%d",
 			      did_out,
 			      retval, tp, prev_state);
 		}
 #endif
 #ifdef TCP_ACCOUNTING
 	} else {
 		/*
 		 * Track the time (see above).
 		 */
 		if (ack_val_set != 0xf) {
 			uint64_t crtsc;
 
 			crtsc = get_cyclecount();
 			counter_u64_add(tcp_proc_time[ack_val_set] , (crtsc - ts_val));
 			/*
 			 * Note we *DO NOT* increment the per-tcb counters since
 			 * in the else the TP may be gone!!
 			 */
 		}
 #endif
 	}
 #ifdef TCP_ACCOUNTING
 	sched_unpin();
 #endif
 	return (retval);
 }
 
 void
 rack_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
     struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, uint8_t iptos)
 {
 	struct timeval tv;
 
 	/* First lets see if we have old packets */
 	if (tp->t_in_pkt) {
 		if (ctf_do_queued_segments(so, tp, 1)) {
 			m_freem(m);
 			return;
 		}
 	}
 	if (m->m_flags & M_TSTMP_LRO) {
 		mbuf_tstmp2timeval(m, &tv);
 	} else {
 		/* Should not be should we kassert instead? */
 		tcp_get_usecs(&tv);
 	}
 	if (rack_do_segment_nounlock(m, th, so, tp,
 				     drop_hdrlen, tlen, iptos, 0, &tv) == 0) {
 		INP_WUNLOCK(tp->t_inpcb);
 	}
 }
 
 /*
  * Return the sendmap entry that should be retransmitted at time tsused,
  * or NULL when no entry qualifies.
  *
  * The candidate is the head of rc_tmap unless that entry is missing or
  * already marked RACK_ACKED, in which case rack_find_lowest_rsm() supplies
  * it.  A head entry flagged RACK_MUST_RXT is returned immediately while
  * r_must_retran is set.  When a candidate is returned on the timed path
  * the reason is logged and r_fast_output is cleared.
  */
 struct rack_sendmap *
 tcp_rack_output(struct tcpcb *tp, struct tcp_rack *rack, uint32_t tsused)
 {
 	struct rack_sendmap *cand;
 	uint32_t rtt, limit, sent_at;
 	int32_t last;
 
 	/* Nothing is tracked, or the FIN has been sent: no candidate. */
 	if (RB_EMPTY(&rack->r_ctl.rc_mtree) || (tp->t_flags & TF_SENTFIN))
 		return (NULL);
 
 	cand = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
 	if ((cand != NULL) && rack->r_must_retran &&
 	    (cand->r_flags & RACK_MUST_RXT))
 		return (cand);
 	if ((cand == NULL) || (cand->r_flags & RACK_ACKED)) {
 		/* The head is unusable, fall back to the lowest entry. */
 		cand = rack_find_lowest_rsm(rack);
 		if (cand == NULL)
 			return (NULL);
 	}
 	/*
 	 * Without SACK the dup-ack threshold alone triggers the
 	 * retransmit (no rack timer would be started).
 	 */
 	if (((rack->rc_tp->t_flags & TF_SACK_PERMIT) == 0) &&
 	    (cand->r_dupack >= DUP_ACK_THRESHOLD))
 		return (cand);
 	if (cand->r_flags & RACK_ACKED)
 		return (NULL);
 	/* Neither SACK-passed nor enough dup-acks: not ready yet. */
 	if (((cand->r_flags & RACK_SACK_PASSED) == 0) &&
 	    (cand->r_dupack < DUP_ACK_THRESHOLD))
 		return (NULL);
 
 	rtt = rack_grab_rtt(tp, rack);
 	last = cand->r_rtr_cnt - 1;
 	sent_at = (uint32_t)cand->r_tim_lastsent[last];
 	limit = rack_calc_thresh_rack(rack, rtt, tsused);
 	/*
 	 * No time has passed since the last send, or less than the
 	 * threshold has elapsed.
 	 */
 	if ((tsused == sent_at) || TSTMP_LT(tsused, sent_at) ||
 	    ((tsused - sent_at) < limit))
 		return (NULL);
 	/*
 	 * Below the dup-ack threshold a SACK-passed mark only counts
 	 * while sack_attack_disable is clear.
 	 */
 	if ((cand->r_dupack < DUP_ACK_THRESHOLD) &&
 	    (((cand->r_flags & RACK_SACK_PASSED) == 0) ||
 	     (rack->sack_attack_disable != 0)))
 		return (NULL);
 	/* log retransmit reason */
 	rack_log_retran_reason(rack, cand, (tsused - sent_at), limit, 1);
 	rack->r_fast_output = 0;
 	return (cand);
 }
 
 /*
  * Record a BBR_LOG_HPTSI_CALC event describing a pacing-delay
  * calculation.  Nothing is done when the connection's t_logstate is
  * TCP_LOG_STATE_OFF.  The arguments are copied into the log record along
  * with a snapshot of pacing related state taken from rack; "method" goes
  * into flex8 and "line" into pkt_epoch so the call site can be identified.
  */
 static void
 rack_log_pacing_delay_calc(struct tcp_rack *rack, uint32_t len, uint32_t slot,
 			   uint64_t bw_est, uint64_t bw, uint64_t len_time, int method,
 			   int line, struct rack_sendmap *rsm, uint8_t quality)
 {
 	union tcp_log_stackspecific log;
 	struct timeval tv;
 
 	if (rack->rc_tp->t_logstate == TCP_LOG_STATE_OFF)
 		return;
 
 	memset(&log, 0, sizeof(log));
 	log.u_bbr.flex1 = slot;
 	log.u_bbr.flex2 = len;
 	log.u_bbr.flex3 = rack->r_ctl.rc_pace_min_segs;
 	log.u_bbr.flex4 = rack->r_ctl.rc_pace_max_segs;
 	log.u_bbr.flex5 = rack->r_ctl.rack_per_of_gp_ss;
 	log.u_bbr.flex6 = rack->r_ctl.rack_per_of_gp_ca;
 	/*
 	 * Pack eight state flags into use_lt_bw, first flag ending up
 	 * in the highest position and gp_ready in the lowest.
 	 */
 	log.u_bbr.use_lt_bw = rack->rc_ack_can_sendout_data;
 	log.u_bbr.use_lt_bw = (log.u_bbr.use_lt_bw << 1) | rack->r_late;
 	log.u_bbr.use_lt_bw = (log.u_bbr.use_lt_bw << 1) | rack->r_early;
 	log.u_bbr.use_lt_bw = (log.u_bbr.use_lt_bw << 1) | rack->app_limited_needs_set;
 	log.u_bbr.use_lt_bw = (log.u_bbr.use_lt_bw << 1) | rack->rc_gp_filled;
 	log.u_bbr.use_lt_bw = (log.u_bbr.use_lt_bw << 1) | rack->measure_saw_probe_rtt;
 	log.u_bbr.use_lt_bw = (log.u_bbr.use_lt_bw << 1) | rack->in_probe_rtt;
 	log.u_bbr.use_lt_bw = (log.u_bbr.use_lt_bw << 1) | rack->gp_ready;
 	log.u_bbr.pkt_epoch = line;
 	log.u_bbr.epoch = rack->r_ctl.rc_agg_delayed;
 	log.u_bbr.lt_epoch = rack->r_ctl.rc_agg_early;
 	log.u_bbr.applimited = rack->r_ctl.rack_per_of_gp_rec;
 	log.u_bbr.bw_inuse = bw_est;
 	log.u_bbr.delRate = bw;
 	/* Only ask for the current b/w once a goodput value exists. */
 	log.u_bbr.cur_del_rate = (rack->r_ctl.gp_bw == 0) ? 0 : rack_get_bw(rack);
 	log.u_bbr.rttProp = len_time;
 	log.u_bbr.pkts_out = rack->r_ctl.rc_rack_min_rtt;
 	log.u_bbr.lost = rack->r_ctl.rc_probertt_sndmax_atexit;
 	log.u_bbr.pacing_gain = rack_get_output_gain(rack, rsm);
 	/* flex7: 1 while in slow start, 0 in congestion avoidance. */
 	log.u_bbr.flex7 =
 	    (rack->r_ctl.cwnd_to_use < rack->rc_tp->snd_ssthresh) ? 1 : 0;
 	log.u_bbr.flex8 = method;
 	log.u_bbr.timeStamp = tcp_get_usecs(&tv);
 	log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
 	/* cwnd_gain carries the saw-rec / saw-ss / saw-ca bits. */
 	log.u_bbr.cwnd_gain = rack->rc_gp_saw_rec;
 	log.u_bbr.cwnd_gain = (log.u_bbr.cwnd_gain << 1) | rack->rc_gp_saw_ss;
 	log.u_bbr.cwnd_gain = (log.u_bbr.cwnd_gain << 1) | rack->rc_gp_saw_ca;
 	log.u_bbr.bbr_substate = quality;
 	TCP_LOG_EVENTP(rack->rc_tp, NULL,
 	    &rack->rc_inp->inp_socket->so_rcv,
 	    &rack->rc_inp->inp_socket->so_snd,
 	    BBR_LOG_HPTSI_CALC, 0,
 	    0, &log, false, &tv);
 }
 
 /*
  * Return the pacing burst length in bytes for bandwidth bw and segment
  * size mss.  The result never exceeds rc_user_set_max_segs * mss; that
  * maximum is returned directly when rc_force_max_seg is set, or when a
  * fixed rate is in use and there is no hardware rate entry whose rate
  * equals bw.
  */
 static uint32_t
 rack_get_pacing_len(struct tcp_rack *rack, uint64_t bw, uint32_t mss)
 {
 	uint32_t burst, ceiling;
 
 	ceiling = rack->rc_user_set_max_segs * mss;
 	if (rack->rc_force_max_seg ||
 	    (rack->use_fixed_rate &&
 	     ((rack->r_ctl.crte == NULL) ||
 	      (rack->r_ctl.crte->rate != bw)))) {
 		/* Forced, or not exactly matched to the hw rate. */
 		return (ceiling);
 	}
 	burst = tcp_get_pacing_burst_size(rack->rc_tp, bw, mss, rack_pace_one_seg, rack->r_ctl.crte, NULL);
 	return ((burst > ceiling) ? ceiling : burst);
 }
 
 static int32_t
 pace_to_fill_cwnd(struct tcp_rack *rack, int32_t slot, uint32_t len, uint32_t segsiz, int *capped, uint64_t *rate_wanted, uint8_t non_paced)
 {
 	uint64_t lentim, fill_bw;
 
 	/* Lets first see if we are full, if so continue with normal rate */
 	rack->r_via_fill_cw = 0;
 	if (ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) > rack->r_ctl.cwnd_to_use)
 		return (slot);
 	if ((ctf_outstanding(rack->rc_tp) + (segsiz-1)) > rack->rc_tp->snd_wnd)
 		return (slot);
 	if (rack->r_ctl.rc_last_us_rtt == 0)
 		return (slot);
 	if (rack->rc_pace_fill_if_rttin_range &&
 	    (rack->r_ctl.rc_last_us_rtt >=
 	     (get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack->rtt_limit_mul))) {
 		/* The rtt is huge, N * smallest, lets not fill */
 		return (slot);
 	}
 	/*
 	 * first lets calculate the b/w based on the last us-rtt
 	 * and the sndwnd.
 	 */
 	fill_bw = rack->r_ctl.cwnd_to_use;
 	/* Take the rwnd if its smaller */
 	if (fill_bw > rack->rc_tp->snd_wnd)
 		fill_bw = rack->rc_tp->snd_wnd;
 	if (rack->r_fill_less_agg) {
 		/*
 		 * Now take away the inflight (this will reduce our
 		 * aggressiveness and yeah, if we get that much out in 1RTT
 		 * we will have had acks come back and still be behind).
 		 */
 		fill_bw -= ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
 	}
 	/* Now lets make it into a b/w */
 	fill_bw *= (uint64_t)HPTS_USEC_IN_SEC;
 	fill_bw /= (uint64_t)rack->r_ctl.rc_last_us_rtt;
 	/* We are below the min b/w */
 	if (non_paced)
 		*rate_wanted = fill_bw;
 	if ((fill_bw < RACK_MIN_BW) || (fill_bw < *rate_wanted))
 		return (slot);
 	if (rack->r_ctl.bw_rate_cap && (fill_bw > rack->r_ctl.bw_rate_cap))
 		fill_bw = rack->r_ctl.bw_rate_cap;
 	rack->r_via_fill_cw = 1;
 	if (rack->r_rack_hw_rate_caps &&
 	    (rack->r_ctl.crte != NULL)) {
 		uint64_t high_rate;
 
 		high_rate = tcp_hw_highest_rate(rack->r_ctl.crte);
 		if (fill_bw > high_rate) {
 			/* We are capping bw at the highest rate table entry */
 			if (*rate_wanted > high_rate) {
 				/* The original rate was also capped */
 				rack->r_via_fill_cw = 0;
 			}
 			rack_log_hdwr_pacing(rack,
 					     fill_bw, high_rate, __LINE__,
 					     0, 3);
 			fill_bw = high_rate;
 			if (capped)
 				*capped = 1;
 		}
 	} else if ((rack->r_ctl.crte == NULL) &&
 		   (rack->rack_hdrw_pacing == 0) &&
 		   (rack->rack_hdw_pace_ena) &&
 		   rack->r_rack_hw_rate_caps &&
 		   (rack->rack_attempt_hdwr_pace == 0) &&
 		   (rack->rc_inp->inp_route.ro_nh != NULL) &&
 		   (rack->rc_inp->inp_route.ro_nh->nh_ifp != NULL)) {
 		/*
 		 * Ok we may have a first attempt that is greater than our top rate
 		 * lets check.
 		 */
 		uint64_t high_rate;
 
 		high_rate = tcp_hw_highest_rate_ifp(rack->rc_inp->inp_route.ro_nh->nh_ifp, rack->rc_inp);
 		if (high_rate) {
 			if (fill_bw > high_rate) {
 				fill_bw = high_rate;
 				if (capped)
 					*capped = 1;
 			}
 		}
 	}
 	/*
 	 * Ok fill_bw holds our mythical b/w to fill the cwnd
 	 * in a rtt, what does that time wise equate too?
 	 */
 	lentim = (uint64_t)(len) * (uint64_t)HPTS_USEC_IN_SEC;
 	lentim /= fill_bw;
 	*rate_wanted = fill_bw;
 	if (non_paced || (lentim < slot)) {
 		rack_log_pacing_delay_calc(rack, len, slot, fill_bw,
 					   0, lentim, 12, __LINE__, NULL, 0);
 		return ((int32_t)lentim);
 	} else
 		return (slot);
 }
 
 /*
  * Compute the pacing delay ("slot") to apply after sending len bytes.
  *
  * Two methods are used:
  *  - The "old method" (rc_always_pace == 0, or no usable b/w estimate
  *    yet) derives a coarse delay from cwnd and rtt; it only mitigates
  *    bursts.
  *  - Otherwise the delay is len (plus per-segment overhead) divided by
  *    the wanted rate, optionally shortened by pace_to_fill_cwnd(), with
  *    hardware pacing being set up, adjusted or released along the way.
  *
  * rsm is the sendmap entry being (re)sent, or NULL; segsiz is the segment
  * size used for overhead and burst-size calculations.  When the current
  * hardware rate entry has seen ENOBUFS an extra, clamped, delay is added.
  * NOTE(review): slot appears to be in microseconds (HPTS_USEC_* scaling)
  * -- confirm against the callers.
  */
 static int32_t
 rack_get_pacing_delay(struct tcp_rack *rack, struct tcpcb *tp, uint32_t len, struct rack_sendmap *rsm, uint32_t segsiz)
 {
 	uint64_t srtt;
 	int32_t slot = 0;
 	int can_start_hw_pacing = 1;
 	int err;
 
 	if (rack->rc_always_pace == 0) {
 		/*
 		 * We use the most optimistic possible cwnd/srtt for
 		 * sending calculations. This will make our
 		 * calculation anticipate getting more through
 		 * quicker then possible. But thats ok we don't want
 		 * the peer to have a gap in data sending.
 		 */
 		uint64_t cwnd, tr_perms;
 		int32_t reduce;
 
 	old_method:
 		/*
 		 * This label is also reached by a goto from the paced
 		 * branch below, which bypasses the declarations above.
 		 * Initialize here (not in the declarations) so that
 		 * "reduce" never reaches the log call below with an
 		 * indeterminate value.
 		 */
 		tr_perms = 0;
 		reduce = 0;
 		/*
 		 * We keep no precise pacing with the old method
 		 * instead we use the pacer to mitigate bursts.
 		 */
 		if (rack->r_ctl.rc_rack_min_rtt)
 			srtt = rack->r_ctl.rc_rack_min_rtt;
 		else
 			srtt = max(tp->t_srtt, 1);
 		if (rack->r_ctl.rc_rack_largest_cwnd)
 			cwnd = rack->r_ctl.rc_rack_largest_cwnd;
 		else
 			cwnd = rack->r_ctl.cwnd_to_use;
 		/* Inflate cwnd by 1000 so srtt of usecs is in ms */
 		tr_perms = (cwnd * 1000) / srtt;
 		if (tr_perms == 0) {
 			tr_perms = ctf_fixed_maxseg(tp);
 		}
 		/*
 		 * Calculate how long this will take to drain, if
 		 * the calculation comes out to zero, thats ok we
 		 * will use send_a_lot to possibly spin around for
 		 * more increasing tot_len_this_send to the point
 		 * that its going to require a pace, or we hit the
 		 * cwnd. Which in that case we are just waiting for
 		 * a ACK.
 		 */
 		slot = len / tr_perms;
 		/* Now do we reduce the time so we don't run dry? */
 		if (slot && rack_slot_reduction) {
 			reduce = (slot / rack_slot_reduction);
 			if (reduce < slot) {
 				slot -= reduce;
 			} else
 				slot = 0;
 		}
 		slot *= HPTS_USEC_IN_MSEC;
 		if (rack->rc_pace_to_cwnd) {
 			uint64_t rate_wanted = 0;
 
 			slot = pace_to_fill_cwnd(rack, slot, len, segsiz, NULL, &rate_wanted, 1);
 			rack->rc_ack_can_sendout_data = 1;
 			rack_log_pacing_delay_calc(rack, len, slot, rate_wanted, 0, 0, 14, __LINE__, NULL, 0);
 		} else
 			rack_log_pacing_delay_calc(rack, len, slot, tr_perms, reduce, 0, 7, __LINE__, NULL, 0);
 	} else {
 		uint64_t bw_est, res, lentim, rate_wanted;
 		uint32_t orig_val, segs, oh;
 		int capped = 0;
 		int prev_fill;
 
 		if ((rack->r_rr_config == 1) && rsm) {
 			return (rack->r_ctl.rc_min_to);
 		}
 		if (rack->use_fixed_rate) {
 			rate_wanted = bw_est = rack_get_fixed_pacing_bw(rack);
 		} else if ((rack->r_ctl.init_rate == 0) &&
 #ifdef NETFLIX_PEAKRATE
 			   (rack->rc_tp->t_maxpeakrate == 0) &&
 #endif
 			   (rack->r_ctl.gp_bw == 0)) {
 			/* no way to yet do an estimate */
 			bw_est = rate_wanted = 0;
 		} else {
 			bw_est = rack_get_bw(rack);
 			rate_wanted = rack_get_output_bw(rack, bw_est, rsm, &capped);
 		}
 		if ((bw_est == 0) || (rate_wanted == 0) ||
 		    ((rack->gp_ready == 0) && (rack->use_fixed_rate == 0))) {
 			/*
 			 * No way yet to make a b/w estimate or
 			 * our raise is set incorrectly.
 			 */
 			goto old_method;
 		}
 		/* We need to account for all the overheads */
 		segs = (len + segsiz - 1) / segsiz;
 		/*
 		 * We need the diff between 1514 bytes (e-mtu with e-hdr)
 		 * and how much data we put in each packet. Yes this
 		 * means we may be off if we are larger than 1500 bytes
 		 * or smaller. But this just makes us more conservative.
 		 */
 		if (rack_hw_rate_min &&
 		    (bw_est < rack_hw_rate_min))
 			can_start_hw_pacing = 0;
 		if (ETHERNET_SEGMENT_SIZE > segsiz)
 			oh = ETHERNET_SEGMENT_SIZE - segsiz;
 		else
 			oh = 0;
 		segs *= oh;
 		lentim = (uint64_t)(len + segs) * (uint64_t)HPTS_USEC_IN_SEC;
 		res = lentim / rate_wanted;
 		slot = (uint32_t)res;
 		orig_val = rack->r_ctl.rc_pace_max_segs;
 		if (rack->r_ctl.crte == NULL) {
 			/*
 			 * Only do this if we are not hardware pacing
 			 * since if we are doing hw-pacing below we will
 			 * set make a call after setting up or changing
 			 * the rate.
 			 */
 			rack_set_pace_segments(rack->rc_tp, rack, __LINE__, NULL);
 		} else if (rack->rc_inp->inp_snd_tag == NULL) {
 			/*
 			 * We lost our rate somehow, this can happen
 			 * if the interface changed underneath us.
 			 */
 			tcp_rel_pacing_rate(rack->r_ctl.crte, rack->rc_tp);
 			rack->r_ctl.crte = NULL;
 			/* Lets re-allow attempting to setup pacing */
 			rack->rack_hdrw_pacing = 0;
 			rack->rack_attempt_hdwr_pace = 0;
 			rack_log_hdwr_pacing(rack,
 					     rate_wanted, bw_est, __LINE__,
 					     0, 6);
 		}
 		/* Did we change the TSO size, if so log it */
 		if (rack->r_ctl.rc_pace_max_segs != orig_val)
 			rack_log_pacing_delay_calc(rack, len, slot, orig_val, 0, 0, 15, __LINE__, NULL, 0);
 		prev_fill = rack->r_via_fill_cw;
 		if ((rack->rc_pace_to_cwnd) &&
 		    (capped == 0) &&
 		    (rack->use_fixed_rate == 0) &&
 		    (rack->in_probe_rtt == 0) &&
 		    (IN_FASTRECOVERY(rack->rc_tp->t_flags) == 0)) {
 			/*
 			 * We want to pace at our rate *or* faster to
 			 * fill the cwnd to the max if its not full.
 			 */
 			slot = pace_to_fill_cwnd(rack, slot, (len+segs), segsiz, &capped, &rate_wanted, 0);
 		}
 		if ((rack->rc_inp->inp_route.ro_nh != NULL) &&
 		    (rack->rc_inp->inp_route.ro_nh->nh_ifp != NULL)) {
 			if ((rack->rack_hdw_pace_ena) &&
 			    (can_start_hw_pacing > 0) &&
 			    (rack->rack_hdrw_pacing == 0) &&
 			    (rack->rack_attempt_hdwr_pace == 0)) {
 				/*
 				 * Lets attempt to turn on hardware pacing
 				 * if we can.
 				 */
 				rack->rack_attempt_hdwr_pace = 1;
 				rack->r_ctl.crte = tcp_set_pacing_rate(rack->rc_tp,
 								       rack->rc_inp->inp_route.ro_nh->nh_ifp,
 								       rate_wanted,
 								       RS_PACING_GEQ,
 								       &err, &rack->r_ctl.crte_prev_rate);
 				if (rack->r_ctl.crte) {
 					rack->rack_hdrw_pacing = 1;
 					rack->r_ctl.rc_pace_max_segs = tcp_get_pacing_burst_size(tp, rate_wanted, segsiz,
 												 0, rack->r_ctl.crte,
 												 NULL);
 					rack_log_hdwr_pacing(rack,
 							     rate_wanted, rack->r_ctl.crte->rate, __LINE__,
 							     err, 0);
 					rack->r_ctl.last_hw_bw_req = rate_wanted;
 				} else {
 					counter_u64_add(rack_hw_pace_init_fail, 1);
 				}
 			} else if (rack->rack_hdrw_pacing &&
 				   (rack->r_ctl.last_hw_bw_req != rate_wanted)) {
 				/* Do we need to adjust our rate? */
 				const struct tcp_hwrate_limit_table *nrte;
 
 				if (rack->r_up_only &&
 				    (rate_wanted < rack->r_ctl.crte->rate)) {
 					/**
 					 * We have four possible states here
 					 * having to do with the previous time
 					 * and this time.
 					 *   previous  |  this-time
 					 * A)     0      |     0   -- fill_cw not in the picture
 					 * B)     1      |     0   -- we were doing a fill-cw but now are not
 					 * C)     1      |     1   -- all rates from fill_cw
 					 * D)     0      |     1   -- we were doing non-fill and now we are filling
 					 *
 					 * For case A, C and D we don't allow a drop. But for
 					 * case B where we now our on our steady rate we do
 					 * allow a drop.
 					 *
 					 */
 					if (!((prev_fill == 1) && (rack->r_via_fill_cw == 0)))
 						goto done_w_hdwr;
 				}
 				if ((rate_wanted > rack->r_ctl.crte->rate) ||
 				    (rate_wanted <= rack->r_ctl.crte_prev_rate)) {
 					if (rack_hw_rate_to_low &&
 					    (bw_est < rack_hw_rate_to_low)) {
 						/*
 						 * The pacing rate is too low for hardware, but
 						 * do allow hardware pacing to be restarted.
 						 */
 						rack_log_hdwr_pacing(rack,
 							     bw_est, rack->r_ctl.crte->rate, __LINE__,
 							     0, 5);
 						tcp_rel_pacing_rate(rack->r_ctl.crte, rack->rc_tp);
 						rack->r_ctl.crte = NULL;
 						rack->rack_attempt_hdwr_pace = 0;
 						rack->rack_hdrw_pacing = 0;
 						rack_set_pace_segments(rack->rc_tp, rack, __LINE__, &rate_wanted);
 						goto done_w_hdwr;
 					}
 					nrte = tcp_chg_pacing_rate(rack->r_ctl.crte,
 								   rack->rc_tp,
 								   rack->rc_inp->inp_route.ro_nh->nh_ifp,
 								   rate_wanted,
 								   RS_PACING_GEQ,
 								   &err, &rack->r_ctl.crte_prev_rate);
 					if (nrte == NULL) {
 						/* Lost the rate */
 						rack->rack_hdrw_pacing = 0;
 						rack->r_ctl.crte = NULL;
 						rack_log_hdwr_pacing(rack,
 								     rate_wanted, 0, __LINE__,
 								     err, 1);
 						rack_set_pace_segments(rack->rc_tp, rack, __LINE__, &rate_wanted);
 						counter_u64_add(rack_hw_pace_lost, 1);
 					} else if (nrte != rack->r_ctl.crte) {
 						rack->r_ctl.crte = nrte;
 						rack->r_ctl.rc_pace_max_segs = tcp_get_pacing_burst_size(tp, rate_wanted,
 													 segsiz, 0,
 													 rack->r_ctl.crte,
 													 NULL);
 						rack_log_hdwr_pacing(rack,
 								     rate_wanted, rack->r_ctl.crte->rate, __LINE__,
 								     err, 2);
 						rack->r_ctl.last_hw_bw_req = rate_wanted;
 					}
 				} else {
 					/* We just need to adjust the segment size */
 					rack_set_pace_segments(rack->rc_tp, rack, __LINE__, &rate_wanted);
 					rack_log_hdwr_pacing(rack,
 							     rate_wanted, rack->r_ctl.crte->rate, __LINE__,
 							     0, 4);
 					rack->r_ctl.last_hw_bw_req = rate_wanted;
 				}
 			}
 		}
 		if ((rack->r_ctl.crte != NULL) &&
 		    (rack->r_ctl.crte->rate == rate_wanted)) {
 			/*
 			 * We need to add a extra if the rates
 			 * are exactly matched. The idea is
 			 * we want the software to make sure the
 			 * queue is empty before adding more, this
 			 * gives us N MSS extra pace times where
 			 * N is our sysctl
 			 */
 			slot += (rack->r_ctl.crte->time_between * rack_hw_pace_extra_slots);
 		}
 done_w_hdwr:
 		if (rack_limit_time_with_srtt &&
 		    (rack->use_fixed_rate == 0) &&
 #ifdef NETFLIX_PEAKRATE
 		    (rack->rc_tp->t_maxpeakrate == 0) &&
 #endif
 		    (rack->rack_hdrw_pacing == 0)) {
 			/*
 			 * Sanity check, we do not allow the pacing delay
 			 * to be longer than the SRTT of the path. If it is
 			 * a slow path, then adding a packet should increase
 			 * the RTT and compensate for this i.e. the srtt will
 			 * be greater so the allowed pacing time will be greater.
 			 *
 			 * Note this restriction is not for where a peak rate
 			 * is set, we are doing fixed pacing or hardware pacing.
 			 */
 			if (rack->rc_tp->t_srtt)
 				srtt = rack->rc_tp->t_srtt;
 			else
 				srtt = RACK_INITIAL_RTO * HPTS_USEC_IN_MSEC;	/* its in ms convert */
 			if (srtt < (uint64_t)slot) {
 				rack_log_pacing_delay_calc(rack, srtt, slot, rate_wanted, bw_est, lentim, 99, __LINE__, NULL, 0);
 				slot = srtt;
 			}
 		}
 		rack_log_pacing_delay_calc(rack, len, slot, rate_wanted, bw_est, lentim, 2, __LINE__, rsm, 0);
 	}
 	if (rack->r_ctl.crte && (rack->r_ctl.crte->rs_num_enobufs > 0)) {
 		/*
 		 * If this rate is seeing enobufs when it
 		 * goes to send then either the nic is out
 		 * of gas or we are mis-estimating the time
 		 * somehow and not letting the queue empty
 		 * completely. Lets add to the pacing time.
 		 */
 		int hw_boost_delay;
 
 		hw_boost_delay = rack->r_ctl.crte->time_between * rack_enobuf_hw_boost_mult;
 		if (hw_boost_delay > rack_enobuf_hw_max)
 			hw_boost_delay = rack_enobuf_hw_max;
 		else if (hw_boost_delay < rack_enobuf_hw_min)
 			hw_boost_delay = rack_enobuf_hw_min;
 		slot += hw_boost_delay;
 	}
 	return (slot);
 }
 
 /*
  * Begin a new GP measurement (the in-code comments call it a "GP
  * measurement"; presumably goodput -- confirm) starting at startseq.
  *
  * Marks the measurement as in progress (TF_GPUTINPROG) and fills in
  * tp->gput_seq, tp->gput_ack and tp->gput_ts together with
  * rack->r_ctl.rc_gp_output_ts.  Nothing is started when the connection
  * is not yet established, or when it is at/after FIN_WAIT_1 and the
  * send buffer holds less than the minimum measurement window.
  *
  * tp        - the TCP control block being measured.
  * rack      - the rack state attached to tp.
  * startseq  - sequence number the measurement should start from.
  * sb_offset - non-zero when we are "out somewhere in the sb"; in that
  *             case an already outstanding send-map entry is used as the
  *             starting point when one qualifies.
  */
 static void
 rack_start_gp_measurement(struct tcpcb *tp, struct tcp_rack *rack,
     tcp_seq startseq, uint32_t sb_offset)
 {
 	struct rack_sendmap *my_rsm = NULL;
 	struct rack_sendmap fe;
 
 	if (tp->t_state < TCPS_ESTABLISHED) {
 		/*
 		 * We don't start any measurements if we are
 		 * not at least established.
 		 */
 		return;
 	}
 	if (tp->t_state >= TCPS_FIN_WAIT_1) {
 		/*
 		 * We will get no more data into the SB
 		 * this means we need to have the data available
 		 * before we start a measurement.
 		 */
 
 		if (sbavail(&tp->t_inpcb->inp_socket->so_snd) <
 		    max(rc_init_window(rack),
 			(MIN_GP_WIN * ctf_fixed_maxseg(tp)))) {
 			/* Nope not enough data */
 			return;
 		}
 	}
 	/* Common setup: flag the measurement and reset the per-run trackers. */
 	tp->t_flags |= TF_GPUTINPROG;
 	rack->r_ctl.rc_gp_lowrtt = 0xffffffff;
 	rack->r_ctl.rc_gp_high_rwnd = rack->rc_tp->snd_wnd;
 	tp->gput_seq = startseq;
 	rack->app_limited_needs_set = 0;
 	/*
 	 * Remember whether this measurement overlaps a probe-rtt episode;
 	 * the flag is cleared once the start sequence is at or beyond
 	 * rc_probertt_sndmax_atexit.
 	 */
 	if (rack->in_probe_rtt)
 		rack->measure_saw_probe_rtt = 1;
 	else if ((rack->measure_saw_probe_rtt) &&
 		 (SEQ_GEQ(tp->gput_seq, rack->r_ctl.rc_probertt_sndmax_atexit)))
 		rack->measure_saw_probe_rtt = 0;
 	/*
 	 * With a previous measurement on record the start time is taken
 	 * from act_rcv_time; otherwise the current time is used.
 	 */
 	if (rack->rc_gp_filled)
 		tp->gput_ts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time);
 	else {
 		/* Special case initial measurement */
 		struct timeval tv;
 
 		tp->gput_ts = tcp_get_usecs(&tv);
 		rack->r_ctl.rc_gp_output_ts = rack_to_usec_ts(&tv);
 	}
 	/*
 	 * We take a guess out into the future,
 	 * if we have no measurement and no
 	 * initial rate, we measure the first
 	 * initial-windows worth of data to
 	 * speed up getting some GP measurement and
 	 * thus start pacing.
 	 */
 	if ((rack->rc_gp_filled == 0) && (rack->r_ctl.init_rate == 0)) {
 		rack->app_limited_needs_set = 1;
 		tp->gput_ack = startseq + max(rc_init_window(rack),
 					      (MIN_GP_WIN * ctf_fixed_maxseg(tp)));
 		rack_log_pacing_delay_calc(rack,
 					   tp->gput_seq,
 					   tp->gput_ack,
 					   0,
 					   tp->gput_ts,
 					   rack->r_ctl.rc_app_limited_cnt,
 					   9,
 					   __LINE__, NULL, 0);
 		return;
 	}
 	if (sb_offset) {
 		/*
 		 * We are out somewhere in the sb
 		 * can we use the already outstanding data?
 		 */
 		if (rack->r_ctl.rc_app_limited_cnt == 0) {
 			/*
 			 * Yes first one is good and in this case
 			 * the tp->gput_ts is correctly set based on
 			 * the last ack that arrived (no need to
 			 * set things up when an ack comes in).
 			 */
 			my_rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree);
 			if ((my_rsm == NULL) ||
 			    (my_rsm->r_rtr_cnt != 1)) {
 				/* retransmission? */
 				goto use_latest;
 			}
 		} else {
 			if (rack->r_ctl.rc_first_appl == NULL) {
 				/*
 				 * If rc_first_appl is NULL
 				 * then the cnt should be 0.
 				 * This is probably an error, maybe
 				 * a KASSERT would be appropriate.
 				 */
 				goto use_latest;
 			}
 			/*
 			 * If we have a marker pointer to the last one that is
 			 * app limited we can use that, but we need to set
 			 * things up so that when it gets ack'ed we record
 			 * the ack time (if its not already acked).
 			 */
 			rack->app_limited_needs_set = 1;
 			/*
 			 * We want to get to the rsm that is either
 			 * next with space i.e. over 1 MSS or the one
 			 * after that (after the app-limited).
 			 */
 			my_rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree,
 					 rack->r_ctl.rc_first_appl);
 			if (my_rsm) {
 				if ((my_rsm->r_end - my_rsm->r_start) <= ctf_fixed_maxseg(tp))
 					/* Have to use the next one */
 					my_rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree,
 							 my_rsm);
 				else {
 					/* Use after the first MSS of it is acked */
 					tp->gput_seq = my_rsm->r_start + ctf_fixed_maxseg(tp);
 					goto start_set;
 				}
 			}
 			if ((my_rsm == NULL) ||
 			    (my_rsm->r_rtr_cnt != 1)) {
 				/*
 				 * Either its a retransmit or
 				 * the last is the app-limited one.
 				 */
 				goto use_latest;
 			}
 		}
 		/*
 		 * Both branches above leave my_rsm non-NULL here (the NULL
 		 * cases jumped to use_latest); start at the entry's start.
 		 */
 		tp->gput_seq = my_rsm->r_start;
 start_set:
 		/*
 		 * Also entered via goto with gput_seq already set one MSS
 		 * into my_rsm.
 		 */
 		if (my_rsm->r_flags & RACK_ACKED) {
 			/*
 			 * This one has been acked use the arrival ack time
 			 */
 			tp->gput_ts = (uint32_t)my_rsm->r_ack_arrival;
 			rack->app_limited_needs_set = 0;
 		}
 		/* The send time comes from the entry's most recent transmission. */
 		rack->r_ctl.rc_gp_output_ts = my_rsm->r_tim_lastsent[(my_rsm->r_rtr_cnt-1)];
 		tp->gput_ack = tp->gput_seq + rack_get_measure_window(tp, rack);
 		rack_log_pacing_delay_calc(rack,
 					   tp->gput_seq,
 					   tp->gput_ack,
 					   (uint64_t)my_rsm,
 					   tp->gput_ts,
 					   rack->r_ctl.rc_app_limited_cnt,
 					   9,
 					   __LINE__, NULL, 0);
 		return;
 	}
 
 use_latest:
 	/*
 	 * We don't know how long we may have been
 	 * idle or if this is the first-send. Lets
 	 * setup the flag so we will trim off
 	 * the first ack'd data so we get a true
 	 * measurement.
 	 */
 	rack->app_limited_needs_set = 1;
 	tp->gput_ack = startseq + rack_get_measure_window(tp, rack);
 	/* Find this guy so we can pull the send time */
 	/* Only r_start of the lookup key is filled in before the search. */
 	fe.r_start = startseq;
 	my_rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe);
 	if (my_rsm) {
 		rack->r_ctl.rc_gp_output_ts = my_rsm->r_tim_lastsent[(my_rsm->r_rtr_cnt-1)];
 		if (my_rsm->r_flags & RACK_ACKED) {
 			/*
 			 * Unlikely since its probably what was
 			 * just transmitted (but I am paranoid).
 			 */
 			tp->gput_ts = (uint32_t)my_rsm->r_ack_arrival;
 			rack->app_limited_needs_set = 0;
 		}
 		if (SEQ_LT(my_rsm->r_start, tp->gput_seq)) {
 			/* This also is unlikely */
 			tp->gput_seq = my_rsm->r_start;
 		}
 	} else {
 		/*
 		 * TSNH unless we have some send-map limit,
 		 * and even at that it should not be hitting
 		 * that limit (we should have stopped sending).
 		 */
 		struct timeval tv;
 
 		/* No send-map entry found: fall back to the current uptime. */
 		microuptime(&tv);
 		rack->r_ctl.rc_gp_output_ts = rack_to_usec_ts(&tv);
 	}
 	rack_log_pacing_delay_calc(rack,
 				   tp->gput_seq,
 				   tp->gput_ack,
 				   (uint64_t)my_rsm,
 				   tp->gput_ts,
 				   rack->r_ctl.rc_app_limited_cnt,
 				   9, __LINE__, NULL, 0);
 }
 
 /*
  * Work out how many bytes may be sent right now.
  *
  * The result is bounded by the smaller of cwnd_to_use and the peer's
  * advertised window (tp->snd_wnd), less what is already in flight,
  * and by the data actually present in the socket buffer (avail)
  * beyond sb_offset.  Returns 0 when either window is already full.
  */
 static inline uint32_t
 rack_what_can_we_send(struct tcpcb *tp, struct tcp_rack *rack,  uint32_t cwnd_to_use,
     uint32_t avail, int32_t sb_offset)
 {
 	uint32_t win_limit, in_flight, outstanding, can_send;
 
 	outstanding = ctf_outstanding(tp);
 	if (outstanding >= tp->snd_wnd) {
 		/* The peer's receive window is already fully used. */
 		return (0);
 	}
 	/* The effective send window is min(cwnd, rwnd). */
 	win_limit = (tp->snd_wnd > cwnd_to_use) ? cwnd_to_use : tp->snd_wnd;
 	in_flight = ctf_flight_size(tp, rack->r_ctl.rc_sacked);
 	if (in_flight >= win_limit) {
 		/*
 		 * Everything the congestion window allows is in flight
 		 * (an rwnd block would have been caught above).
 		 */
 		return (0);
 	}
 	can_send = win_limit - in_flight;
 	if ((can_send + outstanding) > tp->snd_wnd) {
 		/* Trim so we never run past the peer's rwnd. */
 		can_send = tp->snd_wnd - outstanding;
 	}
 	if ((can_send + sb_offset) > avail) {
 		/* The socket buffer holds less than that; send what is there. */
 		can_send = avail - sb_offset;
 	}
 	return (can_send);
 }
 
 /*
  * Emit a TCP_LOG_FSB record describing a fast-send-block event.
  * Does nothing when logging is switched off for this connection.
  */
 static void
 rack_log_fsb(struct tcp_rack *rack, struct tcpcb *tp, struct socket *so, uint32_t flags,
 	     unsigned ipoptlen, int32_t orig_len, int32_t len, int error,
 	     int rsm_is_null, int optlen, int line, uint16_t mode)
 {
 	union tcp_log_stackspecific log;
 	struct timeval tv;
 
 	if (tp->t_logstate == TCP_LOG_STATE_OFF)
 		return;
 
 	memset(&log.u_bbr, 0, sizeof(log.u_bbr));
 	log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
 	/* Values handed in by the caller. */
 	log.u_bbr.flex1 = error;
 	log.u_bbr.flex2 = flags;
 	log.u_bbr.flex3 = rsm_is_null;
 	log.u_bbr.flex4 = ipoptlen;
 	log.u_bbr.flex7 = optlen;
 	log.u_bbr.cwnd_gain = mode;
 	log.u_bbr.pkts_out = orig_len;
 	log.u_bbr.lt_epoch = len;
 	log.u_bbr.delivered = line;
 	/* Snapshot of connection and rack state. */
 	log.u_bbr.flex5 = tp->rcv_numsacks;
 	log.u_bbr.flex6 = rack->r_ctl.rc_agg_early;
 	log.u_bbr.flex8 = rack->r_fsb_inited;
 	log.u_bbr.applimited = rack->r_fast_output;
 	log.u_bbr.bw_inuse = rack_get_bw(rack);
 	log.u_bbr.pacing_gain = rack_get_output_gain(rack, NULL);
 	log.u_bbr.timeStamp = tcp_get_usecs(&tv);
 	log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
 	tcp_log_event_(tp, NULL, &so->so_rcv, &so->so_snd, TCP_LOG_FSB, 0,
 		       len, &log, false, NULL, NULL, 0, &tv);
 }
 
 
 /*
  * Copy up to *plen bytes starting the_off bytes into the mbuf chain
  * the_m, honoring the TSO segment count (seglimit) and segment size
  * (segsize) limits.  Storage flagged M_EXT/M_EXTPG is shared via
  * mb_dupcl(); other data is copied into the new mbuf.
  *
  * On return *plen holds the number of bytes actually placed in the
  * copy whenever it had to be shortened.  When fsb is non-NULL it is
  * updated to describe where the next copy should resume (mbuf, offset
  * and that mbuf's current length).  With hw_tls set, the copy stops
  * before an mbuf whose TLS session differs from the first one.
  *
  * Returns the head of the new chain (NULL when nothing was copied),
  * or NULL after freeing any partial chain if an mbuf allocation fails.
  */
 static struct mbuf *
 rack_fo_base_copym(struct mbuf *the_m, uint32_t the_off, int32_t *plen,
 		   struct rack_fast_send_blk *fsb,
 		   int32_t seglimit, int32_t segsize, int hw_tls)
 {
 #ifdef KERN_TLS
 	struct ktls_session *cur_tls, *mb_tls;
 #ifdef INVARIANTS
 	struct mbuf *first_mb;
 #endif
 #endif
 	struct mbuf *src, *cp, **link, *resume_mb;
 	struct mbuf *head;
 	int32_t src_off, resume_off;
 	int32_t remain = *plen;
 	int32_t seg_bytes;
 	int32_t copied = 0;
 	uint32_t chunk, nsegs;
 
 	src = resume_mb = the_m;
 	src_off = resume_off = the_off;
 	head = NULL;
 	link = &head;
 #ifdef KERN_TLS
 	/* Remember the TLS session (if any) of the first mbuf. */
 	cur_tls = NULL;
 	if (hw_tls && (src->m_flags & M_EXTPG))
 		cur_tls = src->m_epg_tls;
 #ifdef INVARIANTS
 	first_mb = src;
 #endif
 #endif
 	while (remain > 0) {
 		if (src == NULL) {
 			/* Ran off the end of the source chain. */
 			*plen = copied;
 			break;
 		}
 #ifdef KERN_TLS
 		if (hw_tls) {
 			mb_tls = (src->m_flags & M_EXTPG) ? src->m_epg_tls : NULL;
 
 			/*
 			 * Never mix TLS records with handshake data,
 			 * nor records belonging to different sessions.
 			 */
 			if (mb_tls != cur_tls) {
 				MPASS(src != first_mb);
 				*plen = copied;
 				break;
 			}
 		}
 #endif
 		chunk = min(remain, src->m_len - src_off);
 		if (seglimit) {
 			/*
 			 * M_EXTPG mbufs are charged 3 extra segments
 			 * (1 for a possible page crossing, 2 for the TLS
 			 * header/trailer).  Charging them blindly is
 			 * cheaper than the cache miss needed to inspect
 			 * the ext_pgs state.
 			 */
 			if (src->m_flags & M_EXTPG) {
 				nsegs = 3;
 				seg_bytes = min(segsize, PAGE_SIZE);
 			} else {
 				nsegs = 0;
 				seg_bytes = segsize;
 			}
 
 			/* No room for even one more segment: stop here. */
 			if ((nsegs + 1) >= seglimit) {
 				*plen =	copied;
 				break;
 			}
 
 			/*
 			 * The whole mbuf will not fit; take what does and
 			 * set the remaining length so that the loop ends
 			 * once this piece has been copied.
 			 */
 			if ((nsegs + howmany(chunk, seg_bytes)) >= seglimit) {
 				chunk = (seglimit - nsegs - 1) * seg_bytes;
 				remain = chunk;
 				*plen = copied + remain;
 			}
 			nsegs += howmany(chunk, seg_bytes);
 			if (nsegs == 0)
 				nsegs++;
 			seglimit -= nsegs;
 			KASSERT(seglimit > 0,
 			    ("%s: seglimit went too low", __func__));
 		}
 		cp = m_get(M_NOWAIT, src->m_type);
 		*link = cp;
 		if (cp == NULL) {
 			/* Out of mbufs: drop whatever was built so far. */
 			if (head != NULL)
 				m_freem(head);
 			return (NULL);
 		}
 		cp->m_len = chunk;
 		resume_off += chunk;
 		copied += cp->m_len;
 		if (src->m_flags & (M_EXT|M_EXTPG)) {
 			/* Share the external storage rather than copying. */
 			cp->m_data = src->m_data + src_off;
 			mb_dupcl(cp, src);
 		} else {
 			bcopy(mtod(src, caddr_t)+src_off, mtod(cp, caddr_t),
 			    (u_int)cp->m_len);
 		}
 		remain -= cp->m_len;
 		src_off = 0;
 		link = &cp->m_next;
 		src = src->m_next;
 		if ((remain != 0) || (resume_off == resume_mb->m_len)) {
 			/*
 			 * Either more is wanted, or this mbuf was used up
 			 * exactly as the length reached zero: the resume
 			 * point moves to the start of the next mbuf.
 			 */
 			resume_off = 0;
 			resume_mb = src;
 		}
 
 	}
 	if (fsb == NULL)
 		return (head);
 
 	fsb->m = resume_mb;
 	fsb->off = resume_off;
 	if (resume_mb != NULL) {
 		/*
 		 * Record the mbuf's length so a later trim by sbcut()
 		 * (as acks arrive) can be recognized.
 		 */
 		fsb->o_m_len = resume_mb->m_len;
 	} else {
 		/*
 		 * The next mbuf is NULL, i.e. this copy sent everything
 		 * in the sb.  The fast_output flag is deliberately left
 		 * alone: more data may be appended and acks calling the
 		 * extend function would let us send more.
 		 */
 		fsb->o_m_len = 0;
 	}
 	return (head);
 }
 
 /*
  * m_copym()-style copy driven by the fast send block: it starts at the
  * position remembered in rack->r_ctl.fsb, respects the TSO segment
  * size/limit constraints and advances the send pointer as it goes.
  * The position the copy started from is handed back through
  * s_mb/s_soff.
  */
 static struct mbuf *
 rack_fo_m_copym(struct tcp_rack *rack, int32_t *plen,
 		int32_t seglimit, int32_t segsize, struct mbuf **s_mb, int *s_soff)
 {
 	struct rack_fast_send_blk *fsb = &rack->r_ctl.fsb;
 	struct mbuf *m;
 	int32_t soff;
 
 	m = fsb->m;
 	soff = fsb->off;
 	if (fsb->o_m_len > m->m_len) {
 		/*
 		 * An ack chopped the front off this mbuf; pull the
 		 * offset back by the number of bytes removed.
 		 */
 		uint32_t trimmed;
 
 		trimmed = fsb->o_m_len - m->m_len;
 		soff -= trimmed;
 	} else if (fsb->o_m_len < m->m_len) {
 		/*
 		 * The mbuf grew (probably an m_compress); simply
 		 * refresh the remembered length.
 		 */
 		fsb->o_m_len = m->m_len;
 	}
 	KASSERT(soff >= 0, ("%s, negative off %d", __FUNCTION__, soff));
 	KASSERT(*plen >= 0, ("%s, negative len %d", __FUNCTION__, *plen));
 	KASSERT(soff < m->m_len, ("%s rack:%p len:%u m:%p m->m_len:%u < off?",
 				 __FUNCTION__,
 				 rack, *plen, m, m->m_len));
 	/* Report the starting location before the copy moves it on. */
 	*s_soff = soff;
 	*s_mb = fsb->m;
 	return (rack_fo_base_copym(m, soff, plen, fsb,
 				   seglimit, segsize, fsb->hw_tls));
 }
 
 /*
  * Fast-path retransmission of the data described by a send-map entry.
  *
  * tp        - the TCP control block.
  * rack      - the rack state for tp; its fsb holds the header template.
  * rsm       - send-map entry to retransmit (mbuf, offset, sequence range).
  * ts_val    - cycle count taken on entry; only read under TCP_ACCOUNTING.
  * cts       - timestamp stored in the log record and handed to the
  *             hpts timer.
  * ms_cts    - value used (plus tp->ts_offset) for the TCP timestamp option.
  * tv        - time of the send, used for logging and send-map bookkeeping.
  * len       - upper bound on the number of bytes to send.
  * doing_tlp - non-zero when this retransmission is a TLP.
  *
  * Returns 0 when the segment was handed to IP successfully, and -1 when
  * this path declined or failed to send.
  */
 static int
 rack_fast_rsm_output(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendmap *rsm,
 		     uint64_t ts_val, uint32_t cts, uint32_t ms_cts, struct timeval *tv, int len, uint8_t doing_tlp)
 {
 	/*
 	 * Enter the fast retransmit path. We are given that a sched_pin is
 	 * in place (if accounting is compiled in) and the cycle count taken
 	 * at the entry is in the ts_val. The concept here is that the rsm
 	 * now holds the mbuf offsets and such so we can directly transmit
 	 * without a lot of overhead, the len field is already set for
 	 * us to prohibit us from sending too much (usually its 1MSS).
 	 */
 	struct ip *ip = NULL;
 	struct udphdr *udp = NULL;
 	struct tcphdr *th = NULL;
 	struct mbuf *m = NULL;
 	struct inpcb *inp;
 	uint8_t *cpto;
 	struct tcp_log_buffer *lgb;
 #ifdef TCP_ACCOUNTING
 	uint64_t crtsc;
 	int cnt_thru = 1;
 #endif
 	struct tcpopt to;
 	u_char opt[TCP_MAXOLEN];
 	uint32_t hdrlen, optlen;
 	int32_t slot, segsiz, max_val, tso = 0, error, ulen = 0;
 	uint16_t flags;
 	uint32_t if_hw_tsomaxsegcount = 0, startseq;
 	uint32_t if_hw_tsomaxsegsize;
 
 #ifdef INET6
 	struct ip6_hdr *ip6 = NULL;
 
 	if (rack->r_is_v6) {
 		ip6 = (struct ip6_hdr *)rack->r_ctl.fsb.tcp_ip_hdr;
 		hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
 	} else
 #endif				/* INET6 */
 	{
 		ip = (struct ip *)rack->r_ctl.fsb.tcp_ip_hdr;
 		hdrlen = sizeof(struct tcpiphdr);
 	}
 	/*
 	 * tp->t_port set (the UDP checksum branches below key off it) but
 	 * no tunneling port configured: this path cannot send.
 	 */
 	if (tp->t_port && (V_tcp_udp_tunneling_port == 0)) {
 		goto failed;
 	}
 	if (doing_tlp) {
 		/* Its a TLP add the flag, it may already be there but be sure */
 		rsm->r_flags |= RACK_TLP;
 	} else {
 		/* If it was a TLP it is not on this retransmit */
 		rsm->r_flags &= ~RACK_TLP;
 	}
 	startseq = rsm->r_start;
 	segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs);
 	inp = rack->rc_inp;
 	to.to_flags = 0;
 	flags = tcp_outflags[tp->t_state];
 	/* States whose output flags carry SYN or RST are not handled here. */
 	if (flags & (TH_SYN|TH_RST)) {
 		goto failed;
 	}
 	if (rsm->r_flags & RACK_HAS_FIN) {
 		/* We can't send a FIN here */
 		goto failed;
 	}
 	if (flags & TH_FIN) {
 		/* We never send a FIN */
 		flags &= ~TH_FIN;
 	}
 	/* The only option built here is the timestamp, when in use. */
 	if (tp->t_flags & TF_RCVD_TSTMP) {
 		to.to_tsval = ms_cts + tp->ts_offset;
 		to.to_tsecr = tp->ts_recent;
 		to.to_flags = TOF_TS;
 	}
 	optlen = tcp_addoptions(&to, opt);
 	hdrlen += optlen;
 	udp = rack->r_ctl.fsb.udp;
 	if (udp)
 		hdrlen += sizeof(struct udphdr);
 	/*
 	 * Upper bound for this send: the pacing max segs if set, else the
 	 * user supplied segment count times segsiz, else the requested len.
 	 */
 	if (rack->r_ctl.rc_pace_max_segs)
 		max_val = rack->r_ctl.rc_pace_max_segs;
 	else if (rack->rc_user_set_max_segs)
 		max_val = rack->rc_user_set_max_segs * segsiz;
 	else
 		max_val = len;
 	/* TSO is only considered for more than one segment and when t_port is 0. */
 	if ((tp->t_flags & TF_TSO) &&
 	    V_tcp_do_tso &&
 	    (len > segsiz) &&
 	    (tp->t_port == 0))
 		tso = 1;
 	/* Header mbuf: a cluster is used when the headers exceed MHLEN. */
 #ifdef INET6
 	if (MHLEN < hdrlen + max_linkhdr)
 		m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
 	else
 #endif
 		m = m_gethdr(M_NOWAIT, MT_DATA);
 	if (m == NULL)
 		goto failed;
 	m->m_data += max_linkhdr;
 	m->m_len = hdrlen;
 	/*
 	 * th points into the fsb header template for now; it is re-pointed
 	 * at the copy inside the mbuf once the template has been copied.
 	 */
 	th = rack->r_ctl.fsb.th;
 	/* Establish the len to send */
 	if (len > max_val)
 		len = max_val;
 	if ((tso) && (len + optlen > tp->t_maxseg)) {
 		uint32_t if_hw_tsomax;
 		int32_t max_len;
 
 		/* extract TSO information */
 		if_hw_tsomax = tp->t_tsomax;
 		if_hw_tsomaxsegcount = tp->t_tsomaxsegcount;
 		if_hw_tsomaxsegsize = tp->t_tsomaxsegsize;
 		/*
 		 * Check if we should limit by maximum payload
 		 * length:
 		 */
 		if (if_hw_tsomax != 0) {
 			/* compute maximum TSO length */
 			max_len = (if_hw_tsomax - hdrlen -
 				   max_linkhdr);
 			if (max_len <= 0) {
 				goto failed;
 			} else if (len > max_len) {
 				len = max_len;
 			}
 		}
 		if (len <= segsiz) {
 			/*
 			 * In case there are too many small fragments don't
 			 * use TSO:
 			 */
 			tso = 0;
 		}
 	} else {
 		tso = 0;
 	}
 	if ((tso == 0) && (len > segsiz))
 		len = segsiz;
 	/*
 	 * Decline when there is nothing to send or the payload is no
 	 * larger than MHLEN - hdrlen - max_linkhdr.
 	 * NOTE(review): hdrlen is uint32_t, so this subtraction is done
 	 * unsigned -- confirm the intended result when
 	 * hdrlen + max_linkhdr exceeds MHLEN.
 	 */
 	if ((len == 0) ||
 	    (len <= MHLEN - hdrlen - max_linkhdr)) {
 		goto failed;
 	}
 	th->th_seq = htonl(rsm->r_start);
 	th->th_ack = htonl(tp->rcv_nxt);
 	/*
 	 * The PUSH bit should only be applied
 	 * if the full retransmission is made. If
 	 * we are sending less than this is the
 	 * left hand edge and should not have
 	 * the PUSH bit.
 	 */
 	if ((rsm->r_flags & RACK_HAD_PUSH) &&
 	    (len == (rsm->r_end - rsm->r_start)))
 		flags |= TH_PUSH;
 	th->th_win = htons((u_short)(rack->r_ctl.fsb.recwin >> tp->rcv_scale));
 	if (th->th_win == 0) {
 		tp->t_sndzerowin++;
 		tp->t_flags |= TF_RXWIN0SENT;
 	} else
 		tp->t_flags &= ~TF_RXWIN0SENT;
 	/* Note: these counters are bumped before the remaining failure checks. */
 	if (rsm->r_flags & RACK_TLP) {
 		/*
 		 * TLP should not count in retran count, but
 		 * in its own bin
 		 */
 		counter_u64_add(rack_tlp_retran, 1);
 		counter_u64_add(rack_tlp_retran_bytes, len);
 	} else {
 		tp->t_sndrexmitpack++;
 		KMOD_TCPSTAT_INC(tcps_sndrexmitpack);
 		KMOD_TCPSTAT_ADD(tcps_sndrexmitbyte, len);
 	}
 #ifdef STATS
 	stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RETXPB,
 				 len);
 #endif
 	if (rsm->m == NULL)
 		goto failed;
 	if (rsm->orig_m_len != rsm->m->m_len) {
 		/* Fix up the orig_m_len and possibly the mbuf offset */
 		rack_adjust_orig_mlen(rsm);
 	}
 	/*
 	 * Copy the payload; fsb is passed as NULL so the fast send block
 	 * is not advanced by a retransmission.  len may come back smaller.
 	 */
 	m->m_next = rack_fo_base_copym(rsm->m, rsm->soff, &len, NULL, if_hw_tsomaxsegcount, if_hw_tsomaxsegsize, rsm->r_hw_tls);
 	if (len <= segsiz) {
 		/*
 		 * Must have ran out of mbufs for the copy
 		 * shorten it to no longer need tso. Lets
 		 * not put on sendalot since we are low on
 		 * mbufs.
 		 */
 		tso = 0;
 	}
 	if ((m->m_next == NULL) || (len <= 0)){
 		goto failed;
 	}
 	if (udp) {
 		/* UDP length covers everything after the IP header. */
 		if (rack->r_is_v6)
 			ulen = hdrlen + len - sizeof(struct ip6_hdr);
 		else
 			ulen = hdrlen + len - sizeof(struct ip);
 		udp->uh_ulen = htons(ulen);
 	}
 	m->m_pkthdr.rcvif = (struct ifnet *)0;
 	/* Fold the ECN codepoint into the template's traffic class / TOS. */
 	if (TCPS_HAVERCVDSYN(tp->t_state) &&
 	    (tp->t_flags2 & (TF2_ECN_PERMIT | TF2_ACE_PERMIT))) {
 		int ect = tcp_ecn_output_established(tp, &flags, len, true);
 		if ((tp->t_state == TCPS_SYN_RECEIVED) &&
 		    (tp->t_flags2 & TF2_ECN_SND_ECE))
 		    tp->t_flags2 &= ~TF2_ECN_SND_ECE;
 #ifdef INET6
 		if (rack->r_is_v6) {
 		    ip6->ip6_flow &= ~htonl(IPTOS_ECN_MASK << 20);
 		    ip6->ip6_flow |= htonl(ect << 20);
 		}
 		else
 #endif
 		{
 		    ip->ip_tos &= ~IPTOS_ECN_MASK;
 		    ip->ip_tos |= ect;
 		}
 	}
 	tcp_set_flags(th, flags);
 	m->m_pkthdr.len = hdrlen + len;	/* in6_cksum() need this */
 	/*
 	 * Set up checksum offload: only the pseudo-header sum is stored
 	 * here, in the UDP header when tp->t_port is set and in the TCP
 	 * header otherwise.
 	 */
 #ifdef INET6
 	if (rack->r_is_v6) {
 		if (tp->t_port) {
 			m->m_pkthdr.csum_flags = CSUM_UDP_IPV6;
 			m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
 			udp->uh_sum = in6_cksum_pseudo(ip6, ulen, IPPROTO_UDP, 0);
 			th->th_sum = htons(0);
 			UDPSTAT_INC(udps_opackets);
 		} else {
 			m->m_pkthdr.csum_flags = CSUM_TCP_IPV6;
 			m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
 			th->th_sum = in6_cksum_pseudo(ip6,
 						      sizeof(struct tcphdr) + optlen + len, IPPROTO_TCP,
 						      0);
 		}
 	}
 #endif
 #if defined(INET6) && defined(INET)
 	else
 #endif
 #ifdef INET
 	{
 		if (tp->t_port) {
 			m->m_pkthdr.csum_flags = CSUM_UDP;
 			m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
 			udp->uh_sum = in_pseudo(ip->ip_src.s_addr,
 						ip->ip_dst.s_addr, htons(ulen + IPPROTO_UDP));
 			th->th_sum = htons(0);
 			UDPSTAT_INC(udps_opackets);
 		} else {
 			m->m_pkthdr.csum_flags = CSUM_TCP;
 			m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
 			th->th_sum = in_pseudo(ip->ip_src.s_addr,
 					       ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) +
 									IPPROTO_TCP + len + optlen));
 		}
 		/* IP version must be set here for ipv4/ipv6 checking later */
 		KASSERT(ip->ip_v == IPVERSION,
 			("%s: IP version incorrect: %d", __func__, ip->ip_v));
 	}
 #endif
 	if (tso) {
 		KASSERT(len > tp->t_maxseg - optlen,
 			("%s: len <= tso_segsz tp:%p", __func__, tp));
 		m->m_pkthdr.csum_flags |= CSUM_TSO;
 		m->m_pkthdr.tso_segsz = tp->t_maxseg - optlen;
 	}
 	/* Finish the IP header fields and the path-MTU discovery flag. */
 #ifdef INET6
 	if (rack->r_is_v6) {
 		ip6->ip6_hlim = rack->r_ctl.fsb.hoplimit;
 		ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(*ip6));
 		if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss)
 			tp->t_flags2 |= TF2_PLPMTU_PMTUD;
 		else
 			tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
 	}
 #endif
 #if defined(INET) && defined(INET6)
 	else
 #endif
 #ifdef INET
 	{
 		ip->ip_len = htons(m->m_pkthdr.len);
 		ip->ip_ttl = rack->r_ctl.fsb.hoplimit;
 		if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) {
 			tp->t_flags2 |= TF2_PLPMTU_PMTUD;
 			if (tp->t_port == 0 || len < V_tcp_minmss) {
 				ip->ip_off |= htons(IP_DF);
 			}
 		} else {
 			tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
 		}
 	}
 #endif
 	/* Time to copy in our header */
 	cpto = mtod(m, uint8_t *);
 	memcpy(cpto, rack->r_ctl.fsb.tcp_ip_hdr, rack->r_ctl.fsb.tcp_ip_hdr_len);
 	/* Re-point th at the same offset within the copy in the mbuf. */
 	th = (struct tcphdr *)(cpto + ((uint8_t *)rack->r_ctl.fsb.th - rack->r_ctl.fsb.tcp_ip_hdr));
 	if (optlen) {
 		bcopy(opt, th + 1, optlen);
 		th->th_off = (sizeof(struct tcphdr) + optlen) >> 2;
 	} else {
 		th->th_off = sizeof(struct tcphdr) >> 2;
 	}
 	/* Record a TCP_LOG_OUT event when logging is enabled. */
 	if (tp->t_logstate != TCP_LOG_STATE_OFF) {
 		union tcp_log_stackspecific log;
 
 		if (rsm->r_flags & RACK_RWND_COLLAPSED) {
 			rack_log_collapse(rack, rsm->r_start, rsm->r_end, 0, __LINE__, 5, rsm->r_flags, rsm);
 			counter_u64_add(rack_collapsed_win_rxt, 1);
 			counter_u64_add(rack_collapsed_win_rxt_bytes, (rsm->r_end - rsm->r_start));
 		}
 		memset(&log.u_bbr, 0, sizeof(log.u_bbr));
 		log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
 		if (rack->rack_no_prr)
 			log.u_bbr.flex1 = 0;
 		else
 			log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt;
 		log.u_bbr.flex2 = rack->r_ctl.rc_pace_min_segs;
 		log.u_bbr.flex3 = rack->r_ctl.rc_pace_max_segs;
 		log.u_bbr.flex4 = max_val;
 		log.u_bbr.flex5 = 0;
 		/* Save off the early/late values */
 		log.u_bbr.flex6 = rack->r_ctl.rc_agg_early;
 		log.u_bbr.applimited = rack->r_ctl.rc_agg_delayed;
 		log.u_bbr.bw_inuse = rack_get_bw(rack);
 		/* flex8: 1 for a regular retransmit, 2 for a TLP. */
 		if (doing_tlp == 0)
 			log.u_bbr.flex8 = 1;
 		else
 			log.u_bbr.flex8 = 2;
 		log.u_bbr.pacing_gain = rack_get_output_gain(rack, NULL);
 		log.u_bbr.flex7 = 55;
 		log.u_bbr.pkts_out = tp->t_maxseg;
 		log.u_bbr.timeStamp = cts;
 		log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
 		log.u_bbr.lt_epoch = rack->r_ctl.cwnd_to_use;
 		log.u_bbr.delivered = 0;
 		lgb = tcp_log_event_(tp, th, NULL, NULL, TCP_LOG_OUT, ERRNO_UNK,
 				     len, &log, false, NULL, NULL, 0, tv);
 	} else
 		lgb = NULL;
 	/* Hand the packet to the IP layer. */
 #ifdef INET6
 	if (rack->r_is_v6) {
 		error = ip6_output(m, NULL,
 				   &inp->inp_route6,
 				   0, NULL, NULL, inp);
 	}
 #endif
 #if defined(INET) && defined(INET6)
 	else
 #endif
 #ifdef INET
 	{
 		error = ip_output(m, NULL,
 				  &inp->inp_route,
 				  0, 0, inp);
 	}
 #endif
 	/*
 	 * Clear m so the failed path below does not free it (presumably
 	 * the output routine took ownership of the mbuf -- confirm).
 	 */
 	m = NULL;
 	if (lgb) {
 		/* The log entry was created with ERRNO_UNK; fill in the result. */
 		lgb->tlb_errno = error;
 		lgb = NULL;
 	}
 	if (error) {
 		goto failed;
 	}
 	/* From here on error is known to be 0. */
 	rack_log_output(tp, &to, len, rsm->r_start, flags, error, rack_to_usec_ts(tv),
 			rsm, RACK_SENT_FP, rsm->m, rsm->soff, rsm->r_hw_tls);
 	if (doing_tlp && (rack->fast_rsm_hack == 0)) {
 		rack->rc_tlp_in_progress = 1;
 		rack->r_ctl.rc_tlp_cnt_out++;
 	}
 	/* NOTE(review): always true here, see the goto failed above. */
 	if (error == 0) {
 		tcp_account_for_send(tp, len, 1, doing_tlp, rsm->r_hw_tls);
 		if (doing_tlp) {
 			rack->rc_last_sent_tlp_past_cumack = 0;
 			rack->rc_last_sent_tlp_seq_valid = 1;
 			rack->r_ctl.last_sent_tlp_seq = rsm->r_start;
 			rack->r_ctl.last_sent_tlp_len = rsm->r_end - rsm->r_start;
 		}
 	}
 	tp->t_flags &= ~(TF_ACKNOW | TF_DELACK);
 	rack->forced_ack = 0;	/* If we send something zap the FA flag */
 	if (IN_FASTRECOVERY(tp->t_flags) && rsm)
 		rack->r_ctl.retran_during_recovery += len;
 	{
 		/* Count this send in the output-size histogram (last bin is a cap). */
 		int idx;
 
 		idx = (len / segsiz) + 3;
 		if (idx >= TCP_MSS_ACCT_ATIMER)
 			counter_u64_add(rack_out_size[(TCP_MSS_ACCT_ATIMER-1)], 1);
 		else
 			counter_u64_add(rack_out_size[idx], 1);
 	}
 	if (tp->t_rtttime == 0) {
 		/* No RTT sample is being timed; start timing with this send. */
 		tp->t_rtttime = ticks;
 		tp->t_rtseq = startseq;
 		KMOD_TCPSTAT_INC(tcps_segstimed);
 	}
 	counter_u64_add(rack_fto_rsm_send, 1);
 	/*
 	 * NOTE(review): error is 0 on every path that reaches this point
 	 * (non-zero errors jumped to failed above), so the ENOBUFS branch
 	 * cannot be taken as written -- confirm whether ENOBUFS was meant
 	 * to be handled here.
 	 */
 	if (error && (error == ENOBUFS)) {
 		if (rack->r_ctl.crte != NULL) {
 			rack_trace_point(rack, RACK_TP_HWENOBUF);
 		} else
 			rack_trace_point(rack, RACK_TP_ENOBUF);
 		slot = ((1 + rack->rc_enobuf) * HPTS_USEC_IN_MSEC);
 		if (rack->rc_enobuf < 0x7f)
 			rack->rc_enobuf++;
 		if (slot < (10 * HPTS_USEC_IN_MSEC))
 			slot = 10 * HPTS_USEC_IN_MSEC;
 	} else
 		slot = rack_get_pacing_delay(rack, tp, len, NULL, segsiz);
 	if ((slot == 0) ||
 	    (rack->rc_always_pace == 0) ||
 	    (rack->r_rr_config == 1)) {
 		/*
 		 * We have no pacing set or we
 		 * are using old-style rack or
 		 * we are overridden to use the old 1ms pacing.
 		 */
 		slot = rack->r_ctl.rc_min_to;
 	}
 	rack_start_hpts_timer(rack, tp, cts, slot, len, 0);
 #ifdef TCP_ACCOUNTING
 	/* Charge the cycles spent since entry (ts_val) to SND_OUT_DATA. */
 	crtsc = get_cyclecount();
 	if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
 		tp->tcp_cnt_counters[SND_OUT_DATA] += cnt_thru;
 	}
 	counter_u64_add(tcp_cnt_counters[SND_OUT_DATA], cnt_thru);
 	if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
 		tp->tcp_proc_time[SND_OUT_DATA] += (crtsc - ts_val);
 	}
 	counter_u64_add(tcp_proc_time[SND_OUT_DATA], (crtsc - ts_val));
 	if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
 		tp->tcp_cnt_counters[CNT_OF_MSS_OUT] += ((len + segsiz - 1) / segsiz);
 	}
 	counter_u64_add(tcp_cnt_counters[CNT_OF_MSS_OUT], ((len + segsiz - 1) / segsiz));
 	sched_unpin();
 #endif
 	return (0);
 failed:
 	/*
 	 * Only the header mbuf (if still owned here) is released; the
 	 * sched_pin mentioned above is not dropped on this path.
 	 * NOTE(review): m_free() is called rather than m_freem() --
 	 * confirm no path arrives here with a payload chain still linked
 	 * at m->m_next.
 	 */
 	if (m)
 		m_free(m);
 	return (-1);
 }
 
 /*
  * Grow the send socket buffer of the connection owned by rack when the
  * auto-sizing criteria listed below are met.  Returns nothing; on a
  * failed reservation SB_AUTOSIZE is cleared on the send buffer.
  */
 static void
 rack_sndbuf_autoscale(struct tcp_rack *rack)
 {
 	/*
 	 * Automatic sizing of send socket buffer.  Often the send buffer
 	 * size is not optimally adjusted to the actual network conditions
 	 * at hand (delay bandwidth product).  Setting the buffer size too
 	 * small limits throughput on links with high bandwidth and high
 	 * delay (eg. trans-continental/oceanic links).  Setting the
 	 * buffer size too big consumes too much real kernel memory,
 	 * especially with many connections on busy servers.
 	 *
 	 * The criteria to step up the send buffer one notch are:
 	 *  1. receive window of remote host is larger than send buffer
 	 *     (with a fudge factor of 5/4th);
 	 *  2. send buffer is filled to 7/8th with data (so we actually
 	 *     have data to make use of it);
 	 *  3. send buffer fill has not hit maximal automatic size;
 	 *  4. our send window (slow start and congestion controlled) is
 	 *     larger than sent but unacknowledged data in send buffer.
 	 *
 	 * Note that the rack version moves things much faster since
 	 * we want to avoid hitting cache lines in the rack_fast_output()
 	 * path so this is called much less often and thus moves
 	 * the SB forward by a percentage.
 	 */
 	struct socket *so;
 	struct tcpcb *tp;
 	uint32_t sendwin, scaleup;
 	uint64_t target;
 
 	tp = rack->rc_tp;
 	so = rack->rc_inp->inp_socket;
 	sendwin = min(rack->r_ctl.cwnd_to_use, tp->snd_wnd);
 	if (V_tcp_do_autosndbuf && (so->so_snd.sb_flags & SB_AUTOSIZE)) {
 		if ((tp->snd_wnd / 4 * 5) >= so->so_snd.sb_hiwat &&
 		    sbused(&so->so_snd) >=
 		    (so->so_snd.sb_hiwat / 8 * 7) &&
 		    sbused(&so->so_snd) < V_tcp_autosndbuf_max &&
 		    sendwin >= (sbused(&so->so_snd) -
 		    (tp->snd_nxt - tp->snd_una))) {
 			/*
 			 * Work out the new size in 64 bits: both the
 			 * percentage product and the addition of the
 			 * current high-water mark can exceed 32 bits for
 			 * large buffers, and a wrapped value would slip
 			 * under the clamp to the configured maximum.
 			 */
 			if (rack_autosndbuf_inc)
 				target = ((uint64_t)rack_autosndbuf_inc *
 				    so->so_snd.sb_hiwat) / 100;
 			else
 				target = V_tcp_autosndbuf_inc;
 			/* Never step by less than the system-wide increment. */
 			if (target < V_tcp_autosndbuf_inc)
 				target = V_tcp_autosndbuf_inc;
 			target += so->so_snd.sb_hiwat;
 			if (target > V_tcp_autosndbuf_max)
 				target = V_tcp_autosndbuf_max;
 			/* Narrow only after clamping to the maximum. */
 			scaleup = (uint32_t)target;
 			/* If the reservation fails, stop auto-sizing this buffer. */
 			if (!sbreserve_locked(so, SO_SND, scaleup, curthread))
 				so->so_snd.sb_flags &= ~SB_AUTOSIZE;
 		}
 	}
 }
 
 static int
 rack_fast_output(struct tcpcb *tp, struct tcp_rack *rack, uint64_t ts_val,
 		 uint32_t cts, uint32_t ms_cts, struct timeval *tv, long tot_len, int *send_err)
 {
 	/*
 	 * Enter to do fast output. We are given that the sched_pin is
 	 * in place (if accounting is compiled in) and the cycle count taken
 	 * at entry is in place in ts_val. The idea here is that
 	 * we know how many more bytes needs to be sent (presumably either
 	 * during pacing or to fill the cwnd and that was greater than
 	 * the max-burst). We have how much to send and all the info we
 	 * need to just send.
 	 */
 	struct ip *ip = NULL;
 	struct udphdr *udp = NULL;
 	struct tcphdr *th = NULL;
 	struct mbuf *m, *s_mb;
 	struct inpcb *inp;
 	uint8_t *cpto;
 	struct tcp_log_buffer *lgb;
 #ifdef TCP_ACCOUNTING
 	uint64_t crtsc;
 #endif
 	struct tcpopt to;
 	u_char opt[TCP_MAXOLEN];
 	uint32_t hdrlen, optlen;
 #ifdef TCP_ACCOUNTING
 	int cnt_thru = 1;
 #endif
 	int32_t slot, segsiz, len, max_val, tso = 0, sb_offset, error, ulen = 0;
 	uint16_t flags;
 	uint32_t s_soff;
 	uint32_t if_hw_tsomaxsegcount = 0, startseq;
 	uint32_t if_hw_tsomaxsegsize;
 	uint16_t add_flag = RACK_SENT_FP;
 #ifdef INET6
 	struct ip6_hdr *ip6 = NULL;
 
 	if (rack->r_is_v6) {
 		ip6 = (struct ip6_hdr *)rack->r_ctl.fsb.tcp_ip_hdr;
 		hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
 	} else
 #endif				/* INET6 */
 	{
 		ip = (struct ip *)rack->r_ctl.fsb.tcp_ip_hdr;
 		hdrlen = sizeof(struct tcpiphdr);
 	}
 	if (tp->t_port && (V_tcp_udp_tunneling_port == 0)) {
 		m = NULL;
 		goto failed;
 	}
 	startseq = tp->snd_max;
 	segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs);
 	inp = rack->rc_inp;
 	len = rack->r_ctl.fsb.left_to_send;
 	to.to_flags = 0;
 	flags = rack->r_ctl.fsb.tcp_flags;
 	if (tp->t_flags & TF_RCVD_TSTMP) {
 		to.to_tsval = ms_cts + tp->ts_offset;
 		to.to_tsecr = tp->ts_recent;
 		to.to_flags = TOF_TS;
 	}
 	optlen = tcp_addoptions(&to, opt);
 	hdrlen += optlen;
 	udp = rack->r_ctl.fsb.udp;
 	if (udp)
 		hdrlen += sizeof(struct udphdr);
 	if (rack->r_ctl.rc_pace_max_segs)
 		max_val = rack->r_ctl.rc_pace_max_segs;
 	else if (rack->rc_user_set_max_segs)
 		max_val = rack->rc_user_set_max_segs * segsiz;
 	else
 		max_val = len;
 	if ((tp->t_flags & TF_TSO) &&
 	    V_tcp_do_tso &&
 	    (len > segsiz) &&
 	    (tp->t_port == 0))
 		tso = 1;
 again:
 #ifdef INET6
 	if (MHLEN < hdrlen + max_linkhdr)
 		m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
 	else
 #endif
 		m = m_gethdr(M_NOWAIT, MT_DATA);
 	if (m == NULL)
 		goto failed;
 	m->m_data += max_linkhdr;
 	m->m_len = hdrlen;
 	th = rack->r_ctl.fsb.th;
 	/* Establish the len to send */
 	if (len > max_val)
 		len = max_val;
 	if ((tso) && (len + optlen > tp->t_maxseg)) {
 		uint32_t if_hw_tsomax;
 		int32_t max_len;
 
 		/* extract TSO information */
 		if_hw_tsomax = tp->t_tsomax;
 		if_hw_tsomaxsegcount = tp->t_tsomaxsegcount;
 		if_hw_tsomaxsegsize = tp->t_tsomaxsegsize;
 		/*
 		 * Check if we should limit by maximum payload
 		 * length:
 		 */
 		if (if_hw_tsomax != 0) {
 			/* compute maximum TSO length */
 			max_len = (if_hw_tsomax - hdrlen -
 				   max_linkhdr);
 			if (max_len <= 0) {
 				goto failed;
 			} else if (len > max_len) {
 				len = max_len;
 			}
 		}
 		if (len <= segsiz) {
 			/*
 			 * In case there are too many small fragments don't
 			 * use TSO:
 			 */
 			tso = 0;
 		}
 	} else {
 		tso = 0;
 	}
 	if ((tso == 0) && (len > segsiz))
 		len = segsiz;
 	if ((len == 0) ||
 	    (len <= MHLEN - hdrlen - max_linkhdr)) {
 		goto failed;
 	}
 	sb_offset = tp->snd_max - tp->snd_una;
 	th->th_seq = htonl(tp->snd_max);
 	th->th_ack = htonl(tp->rcv_nxt);
 	th->th_win = htons((u_short)(rack->r_ctl.fsb.recwin >> tp->rcv_scale));
 	if (th->th_win == 0) {
 		tp->t_sndzerowin++;
 		tp->t_flags |= TF_RXWIN0SENT;
 	} else
 		tp->t_flags &= ~TF_RXWIN0SENT;
 	tp->snd_up = tp->snd_una;	/* drag it along, its deprecated */
 	KMOD_TCPSTAT_INC(tcps_sndpack);
 	KMOD_TCPSTAT_ADD(tcps_sndbyte, len);
 #ifdef STATS
 	stats_voi_update_abs_u64(tp->t_stats, VOI_TCP_TXPB,
 				 len);
 #endif
 	if (rack->r_ctl.fsb.m == NULL)
 		goto failed;
 
 	/* s_mb and s_soff are saved for rack_log_output */
 	m->m_next = rack_fo_m_copym(rack, &len, if_hw_tsomaxsegcount, if_hw_tsomaxsegsize,
 				    &s_mb, &s_soff);
 	if (len <= segsiz) {
 		/*
 		 * Must have ran out of mbufs for the copy
 		 * shorten it to no longer need tso. Lets
 		 * not put on sendalot since we are low on
 		 * mbufs.
 		 */
 		tso = 0;
 	}
 	if (rack->r_ctl.fsb.rfo_apply_push &&
 	    (len == rack->r_ctl.fsb.left_to_send)) {
 		flags |= TH_PUSH;
 		add_flag |= RACK_HAD_PUSH;
 	}
 	if ((m->m_next == NULL) || (len <= 0)){
 		goto failed;
 	}
 	if (udp) {
 		if (rack->r_is_v6)
 			ulen = hdrlen + len - sizeof(struct ip6_hdr);
 		else
 			ulen = hdrlen + len - sizeof(struct ip);
 		udp->uh_ulen = htons(ulen);
 	}
 	m->m_pkthdr.rcvif = (struct ifnet *)0;
 	if (TCPS_HAVERCVDSYN(tp->t_state) &&
 	    (tp->t_flags2 & (TF2_ECN_PERMIT | TF2_ACE_PERMIT))) {
 		int ect = tcp_ecn_output_established(tp, &flags, len, false);
 		if ((tp->t_state == TCPS_SYN_RECEIVED) &&
 		    (tp->t_flags2 & TF2_ECN_SND_ECE))
 			tp->t_flags2 &= ~TF2_ECN_SND_ECE;
 #ifdef INET6
 		if (rack->r_is_v6) {
 			ip6->ip6_flow &= ~htonl(IPTOS_ECN_MASK << 20);
 			ip6->ip6_flow |= htonl(ect << 20);
 		}
 		else
 #endif
 		{
 			ip->ip_tos &= ~IPTOS_ECN_MASK;
 			ip->ip_tos |= ect;
 		}
 	}
 	tcp_set_flags(th, flags);
 	m->m_pkthdr.len = hdrlen + len;	/* in6_cksum() need this */
 #ifdef INET6
 	if (rack->r_is_v6) {
 		if (tp->t_port) {
 			m->m_pkthdr.csum_flags = CSUM_UDP_IPV6;
 			m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
 			udp->uh_sum = in6_cksum_pseudo(ip6, ulen, IPPROTO_UDP, 0);
 			th->th_sum = htons(0);
 			UDPSTAT_INC(udps_opackets);
 		} else {
 			m->m_pkthdr.csum_flags = CSUM_TCP_IPV6;
 			m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
 			th->th_sum = in6_cksum_pseudo(ip6,
 						      sizeof(struct tcphdr) + optlen + len, IPPROTO_TCP,
 						      0);
 		}
 	}
 #endif
 #if defined(INET6) && defined(INET)
 	else
 #endif
 #ifdef INET
 	{
 		if (tp->t_port) {
 			m->m_pkthdr.csum_flags = CSUM_UDP;
 			m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
 			udp->uh_sum = in_pseudo(ip->ip_src.s_addr,
 						ip->ip_dst.s_addr, htons(ulen + IPPROTO_UDP));
 			th->th_sum = htons(0);
 			UDPSTAT_INC(udps_opackets);
 		} else {
 			m->m_pkthdr.csum_flags = CSUM_TCP;
 			m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
 			th->th_sum = in_pseudo(ip->ip_src.s_addr,
 					       ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) +
 									IPPROTO_TCP + len + optlen));
 		}
 		/* IP version must be set here for ipv4/ipv6 checking later */
 		KASSERT(ip->ip_v == IPVERSION,
 			("%s: IP version incorrect: %d", __func__, ip->ip_v));
 	}
 #endif
 	if (tso) {
 		KASSERT(len > tp->t_maxseg - optlen,
 			("%s: len <= tso_segsz tp:%p", __func__, tp));
 		m->m_pkthdr.csum_flags |= CSUM_TSO;
 		m->m_pkthdr.tso_segsz = tp->t_maxseg - optlen;
 	}
 #ifdef INET6
 	if (rack->r_is_v6) {
 		ip6->ip6_hlim = rack->r_ctl.fsb.hoplimit;
 		ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(*ip6));
 		if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss)
 			tp->t_flags2 |= TF2_PLPMTU_PMTUD;
 		else
 			tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
 	}
 #endif
 #if defined(INET) && defined(INET6)
 	else
 #endif
 #ifdef INET
 	{
 		ip->ip_len = htons(m->m_pkthdr.len);
 		ip->ip_ttl = rack->r_ctl.fsb.hoplimit;
 		if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) {
 			tp->t_flags2 |= TF2_PLPMTU_PMTUD;
 			if (tp->t_port == 0 || len < V_tcp_minmss) {
 				ip->ip_off |= htons(IP_DF);
 			}
 		} else {
 			tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
 		}
 	}
 #endif
 	/* Time to copy in our header */
 	cpto = mtod(m, uint8_t *);
 	memcpy(cpto, rack->r_ctl.fsb.tcp_ip_hdr, rack->r_ctl.fsb.tcp_ip_hdr_len);
 	th = (struct tcphdr *)(cpto + ((uint8_t *)rack->r_ctl.fsb.th - rack->r_ctl.fsb.tcp_ip_hdr));
 	if (optlen) {
 		bcopy(opt, th + 1, optlen);
 		th->th_off = (sizeof(struct tcphdr) + optlen) >> 2;
 	} else {
 		th->th_off = sizeof(struct tcphdr) >> 2;
 	}
 	if (tp->t_logstate != TCP_LOG_STATE_OFF) {
 		union tcp_log_stackspecific log;
 
 		memset(&log.u_bbr, 0, sizeof(log.u_bbr));
 		log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
 		if (rack->rack_no_prr)
 			log.u_bbr.flex1 = 0;
 		else
 			log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt;
 		log.u_bbr.flex2 = rack->r_ctl.rc_pace_min_segs;
 		log.u_bbr.flex3 = rack->r_ctl.rc_pace_max_segs;
 		log.u_bbr.flex4 = max_val;
 		log.u_bbr.flex5 = 0;
 		/* Save off the early/late values */
 		log.u_bbr.flex6 = rack->r_ctl.rc_agg_early;
 		log.u_bbr.applimited = rack->r_ctl.rc_agg_delayed;
 		log.u_bbr.bw_inuse = rack_get_bw(rack);
 		log.u_bbr.flex8 = 0;
 		log.u_bbr.pacing_gain = rack_get_output_gain(rack, NULL);
 		log.u_bbr.flex7 = 44;
 		log.u_bbr.pkts_out = tp->t_maxseg;
 		log.u_bbr.timeStamp = cts;
 		log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
 		log.u_bbr.lt_epoch = rack->r_ctl.cwnd_to_use;
 		log.u_bbr.delivered = 0;
 		lgb = tcp_log_event_(tp, th, NULL, NULL, TCP_LOG_OUT, ERRNO_UNK,
 				     len, &log, false, NULL, NULL, 0, tv);
 	} else
 		lgb = NULL;
 #ifdef INET6
 	if (rack->r_is_v6) {
 		error = ip6_output(m, NULL,
 				   &inp->inp_route6,
 				   0, NULL, NULL, inp);
 	}
 #endif
 #if defined(INET) && defined(INET6)
 	else
 #endif
 #ifdef INET
 	{
 		error = ip_output(m, NULL,
 				  &inp->inp_route,
 				  0, 0, inp);
 	}
 #endif
 	if (lgb) {
 		lgb->tlb_errno = error;
 		lgb = NULL;
 	}
 	if (error) {
 		*send_err = error;
 		m = NULL;
 		goto failed;
 	}
 	rack_log_output(tp, &to, len, tp->snd_max, flags, error, rack_to_usec_ts(tv),
 			NULL, add_flag, s_mb, s_soff, rack->r_ctl.fsb.hw_tls);
 	m = NULL;
 	if (tp->snd_una == tp->snd_max) {
 		rack->r_ctl.rc_tlp_rxt_last_time = cts;
 		rack_log_progress_event(rack, tp, ticks, PROGRESS_START, __LINE__);
 		tp->t_acktime = ticks;
 	}
 	if (error == 0)
 		tcp_account_for_send(tp, len, 0, 0, rack->r_ctl.fsb.hw_tls);
 
 	rack->forced_ack = 0;	/* If we send something zap the FA flag */
 	tot_len += len;
 	if ((tp->t_flags & TF_GPUTINPROG) == 0)
 		rack_start_gp_measurement(tp, rack, tp->snd_max, sb_offset);
 	tp->snd_max += len;
 	tp->snd_nxt = tp->snd_max;
 	{
 		int idx;
 
 		idx = (len / segsiz) + 3;
 		if (idx >= TCP_MSS_ACCT_ATIMER)
 			counter_u64_add(rack_out_size[(TCP_MSS_ACCT_ATIMER-1)], 1);
 		else
 			counter_u64_add(rack_out_size[idx], 1);
 	}
 	if (len <= rack->r_ctl.fsb.left_to_send)
 		rack->r_ctl.fsb.left_to_send -= len;
 	else
 		rack->r_ctl.fsb.left_to_send = 0;
 	if (rack->r_ctl.fsb.left_to_send < segsiz) {
 		rack->r_fast_output = 0;
 		rack->r_ctl.fsb.left_to_send = 0;
 		/* At the end of fast_output scale up the sb */
 		SOCKBUF_LOCK(&rack->rc_inp->inp_socket->so_snd);
 		rack_sndbuf_autoscale(rack);
 		SOCKBUF_UNLOCK(&rack->rc_inp->inp_socket->so_snd);
 	}
 	if (tp->t_rtttime == 0) {
 		tp->t_rtttime = ticks;
 		tp->t_rtseq = startseq;
 		KMOD_TCPSTAT_INC(tcps_segstimed);
 	}
 	if ((rack->r_ctl.fsb.left_to_send >= segsiz) &&
 	    (max_val > len) &&
 	    (tso == 0)) {
 		max_val -= len;
 		len = segsiz;
 		th = rack->r_ctl.fsb.th;
 #ifdef TCP_ACCOUNTING
 		cnt_thru++;
 #endif
 		goto again;
 	}
 	tp->t_flags &= ~(TF_ACKNOW | TF_DELACK);
 	counter_u64_add(rack_fto_send, 1);
 	slot = rack_get_pacing_delay(rack, tp, tot_len, NULL, segsiz);
 	rack_start_hpts_timer(rack, tp, cts, slot, tot_len, 0);
 #ifdef TCP_ACCOUNTING
 	crtsc = get_cyclecount();
 	if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
 		tp->tcp_cnt_counters[SND_OUT_DATA] += cnt_thru;
 	}
 	counter_u64_add(tcp_cnt_counters[SND_OUT_DATA], cnt_thru);
 	if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
 		tp->tcp_proc_time[SND_OUT_DATA] += (crtsc - ts_val);
 	}
 	counter_u64_add(tcp_proc_time[SND_OUT_DATA], (crtsc - ts_val));
 	if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
 		tp->tcp_cnt_counters[CNT_OF_MSS_OUT] += ((tot_len + segsiz - 1) / segsiz);
 	}
 	counter_u64_add(tcp_cnt_counters[CNT_OF_MSS_OUT], ((tot_len + segsiz - 1) / segsiz));
 	sched_unpin();
 #endif
 	return (0);
 failed:
 	if (m)
 		m_free(m);
 	rack->r_fast_output = 0;
 	return (-1);
 }
 
 /*
  * Decide whether the sendmap entry at the current collapse point may be
  * retransmitted now.
  *
  * The entry starting at rack->r_ctl.last_collapse_point is looked up in
  * the sendmap tree.  It is returned only when all of the following hold:
  *  - it exists and carries RACK_RWND_COLLAPSED,
  *  - its end does not lie beyond snd_una + snd_wnd,
  *  - it is not marked RACK_ACKED, and
  *  - the time since its last transmission exceeds the threshold
  *    computed by rack_calc_thresh_rack().
  *
  * Parameters:
  *  rack - the rack control block for the connection.
  *  cts  - current timestamp; it is compared against the entry's
  *         r_tim_lastsent value and passed to rack_calc_thresh_rack().
  *
  * Returns the entry to retransmit, or NULL when nothing can be sent yet.
  *
  * Side effects: entries marked RACK_ACKED are skipped by advancing
  * last_collapse_point to their end, and r_collapse_point_valid is cleared
  * when no collapsed entry is found at the collapse point or when the
  * collapse point reaches high_collapse_point.
  */
 static struct rack_sendmap *
 rack_check_collapsed(struct tcp_rack *rack, uint32_t cts)
 {
 	struct rack_sendmap *rsm = NULL;
 	struct rack_sendmap fe;
 	uint32_t elapsed;
 	int thresh;
 
 restart:
 	/* Only r_start of the stack key is filled in for the lookup. */
 	fe.r_start = rack->r_ctl.last_collapse_point;
 	rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe);
 	if ((rsm == NULL) || ((rsm->r_flags & RACK_RWND_COLLAPSED) == 0)) {
 		/* Nothing there, which is strange; turn off validity. */
 		rack->r_collapse_point_valid = 0;
 		return (NULL);
 	}
 	/*
 	 * Can we send it yet?  These are TCP sequence numbers, so the
 	 * comparison must use SEQ_GT() to stay correct when
 	 * snd_una + snd_wnd wraps around the 32-bit sequence space.
 	 */
 	if (SEQ_GT(rsm->r_end,
 		   rack->rc_tp->snd_una + rack->rc_tp->snd_wnd)) {
 		/*
 		 * Receiver window has not grown enough for
 		 * the segment to be put on the wire.
 		 */
 		return (NULL);
 	}
 	if (rsm->r_flags & RACK_ACKED) {
 		/*
 		 * It has been sacked, let's move to the
 		 * next one if possible.
 		 */
 		rack->r_ctl.last_collapse_point = rsm->r_end;
 		/* Are we done? */
 		if (SEQ_GEQ(rack->r_ctl.last_collapse_point,
 			    rack->r_ctl.high_collapse_point)) {
 			rack->r_collapse_point_valid = 0;
 			return (NULL);
 		}
 		goto restart;
 	}
 	/* Now has it been long enough since the last transmission? */
 	thresh = rack_calc_thresh_rack(rack, rack_grab_rtt(rack->rc_tp, rack), cts);
 	elapsed = cts - ((uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)]);
 	if (elapsed > thresh) {
 		rack_log_collapse(rack, rsm->r_start, elapsed,
 				  thresh, __LINE__, 6, rsm->r_flags, rsm);
 		return (rsm);
 	}
 	/* Not enough time */
 	rack_log_collapse(rack, rsm->r_start, elapsed,
 			  thresh, __LINE__, 7, rsm->r_flags, rsm);
 	return (NULL);
 }
 
 static int
 rack_output(struct tcpcb *tp)
 {
 	struct socket *so;
 	uint32_t recwin;
 	uint32_t sb_offset, s_moff = 0;
 	int32_t len, error = 0;
 	uint16_t flags;
 	struct mbuf *m, *s_mb = NULL;
 	struct mbuf *mb;
 	uint32_t if_hw_tsomaxsegcount = 0;
 	uint32_t if_hw_tsomaxsegsize;
 	int32_t segsiz, minseg;
 	long tot_len_this_send = 0;
 #ifdef INET
 	struct ip *ip = NULL;
 #endif
 	struct udphdr *udp = NULL;
 	struct tcp_rack *rack;
 	struct tcphdr *th;
 	uint8_t pass = 0;
 	uint8_t mark = 0;
 	uint8_t wanted_cookie = 0;
 	u_char opt[TCP_MAXOLEN];
 	unsigned ipoptlen, optlen, hdrlen, ulen=0;
 	uint32_t rack_seq;
 
 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
 	unsigned ipsec_optlen = 0;
 
 #endif
 	int32_t idle, sendalot;
 	int32_t sub_from_prr = 0;
 	volatile int32_t sack_rxmit;
 	struct rack_sendmap *rsm = NULL;
 	int32_t tso, mtu;
 	struct tcpopt to;
 	int32_t slot = 0;
 	int32_t sup_rack = 0;
 	uint32_t cts, ms_cts, delayed, early;
 	uint16_t add_flag = RACK_SENT_SP;
 	/* The doing_tlp flag will be set by the actual rack_timeout_tlp() */
 	uint8_t hpts_calling,  doing_tlp = 0;
 	uint32_t cwnd_to_use, pace_max_seg;
 	int32_t do_a_prefetch = 0;
 	int32_t prefetch_rsm = 0;
 	int32_t orig_len = 0;
 	struct timeval tv;
 	int32_t prefetch_so_done = 0;
 	struct tcp_log_buffer *lgb;
 	struct inpcb *inp;
 	struct sockbuf *sb;
 	uint64_t ts_val = 0;
 #ifdef TCP_ACCOUNTING
 	uint64_t crtsc;
 #endif
 #ifdef INET6
 	struct ip6_hdr *ip6 = NULL;
 	int32_t isipv6;
 #endif
 	bool hw_tls = false;
 
 	/* setup and take the cache hits here */
 	rack = (struct tcp_rack *)tp->t_fb_ptr;
 #ifdef TCP_ACCOUNTING
 	sched_pin();
 	ts_val = get_cyclecount();
 #endif
 	hpts_calling = rack->rc_inp->inp_hpts_calls;
 	NET_EPOCH_ASSERT();
 	INP_WLOCK_ASSERT(rack->rc_inp);
 #ifdef TCP_OFFLOAD
 	if (tp->t_flags & TF_TOE) {
 #ifdef TCP_ACCOUNTING
 		sched_unpin();
 #endif
 		return (tcp_offload_output(tp));
 	}
 #endif
 	/*
 	 * For TFO connections in SYN_RECEIVED, only allow the initial
 	 * SYN|ACK and those sent by the retransmit timer.
 	 */
 	if (IS_FASTOPEN(tp->t_flags) &&
 	    (tp->t_state == TCPS_SYN_RECEIVED) &&
 	    SEQ_GT(tp->snd_max, tp->snd_una) &&    /* initial SYN|ACK sent */
 	    (rack->r_ctl.rc_resend == NULL)) {         /* not a retransmit */
 #ifdef TCP_ACCOUNTING
 		sched_unpin();
 #endif
 		return (0);
 	}
 #ifdef INET6
 	if (rack->r_state) {
 		/* Use the cache line loaded if possible */
 		isipv6 = rack->r_is_v6;
 	} else {
 		isipv6 = (rack->rc_inp->inp_vflag & INP_IPV6) != 0;
 	}
 #endif
 	early = 0;
 	cts = tcp_get_usecs(&tv);
 	ms_cts = tcp_tv_to_mssectick(&tv);
 	if (((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0) &&
 	    tcp_in_hpts(rack->rc_inp)) {
 		/*
 		 * We are on the hpts for some timer but not hptsi output.
 		 * Remove from the hpts unconditionally.
 		 */
 		rack_timer_cancel(tp, rack, cts, __LINE__);
 	}
 	/* Are we pacing and late? */
 	if ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) &&
 	    TSTMP_GEQ(cts, rack->r_ctl.rc_last_output_to)) {
 		/* We are delayed */
 		delayed = cts - rack->r_ctl.rc_last_output_to;
 	} else {
 		delayed = 0;
 	}
 	/* Do the timers, which may override the pacer */
 	if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) {
 		int retval;
 
 		retval = rack_process_timers(tp, rack, cts, hpts_calling,
 		    &doing_tlp);
 		if (retval != 0) {
 			counter_u64_add(rack_out_size[TCP_MSS_ACCT_ATIMER], 1);
 #ifdef TCP_ACCOUNTING
 			sched_unpin();
 #endif
 			/*
 			 * If timers want tcp_drop(), then pass error out,
 			 * otherwise suppress it.
 			 */
 			return (retval < 0 ? retval : 0);
 		}
 	}
 	if (rack->rc_in_persist) {
 		if (tcp_in_hpts(rack->rc_inp) == 0) {
 			/* Timer is not running */
 			rack_start_hpts_timer(rack, tp, cts, 0, 0, 0);
 		}
 #ifdef TCP_ACCOUNTING
 		sched_unpin();
 #endif
 		return (0);
 	}
 	if ((rack->rc_ack_required == 1) &&
 	    (rack->r_timer_override == 0)){
 		/* A timeout occurred and no ack has arrived */
 		if (tcp_in_hpts(rack->rc_inp) == 0) {
 			/* Timer is not running */
 			rack_start_hpts_timer(rack, tp, cts, 0, 0, 0);
 		}
 #ifdef TCP_ACCOUNTING
 		sched_unpin();
 #endif
 		return (0);
 	}
 	if ((rack->r_timer_override) ||
 	    (rack->rc_ack_can_sendout_data) ||
 	    (delayed) ||
 	    (tp->t_state < TCPS_ESTABLISHED)) {
 		rack->rc_ack_can_sendout_data = 0;
 		if (tcp_in_hpts(rack->rc_inp))
 			tcp_hpts_remove(rack->rc_inp);
 	} else if (tcp_in_hpts(rack->rc_inp)) {
 		/*
 		 * On the hpts you can't pass even if ACKNOW is on, we will
 		 * when the hpts fires.
 		 */
 #ifdef TCP_ACCOUNTING
 		crtsc = get_cyclecount();
 		if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
 			tp->tcp_proc_time[SND_BLOCKED] += (crtsc - ts_val);
 		}
 		counter_u64_add(tcp_proc_time[SND_BLOCKED], (crtsc - ts_val));
 		if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
 			tp->tcp_cnt_counters[SND_BLOCKED]++;
 		}
 		counter_u64_add(tcp_cnt_counters[SND_BLOCKED], 1);
 		sched_unpin();
 #endif
 		counter_u64_add(rack_out_size[TCP_MSS_ACCT_INPACE], 1);
 		return (0);
 	}
 	rack->rc_inp->inp_hpts_calls = 0;
 	/* Finish out both pacing early and late accounting */
 	if ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) &&
 	    TSTMP_GT(rack->r_ctl.rc_last_output_to, cts)) {
 		early = rack->r_ctl.rc_last_output_to - cts;
 	} else
 		early = 0;
 	if (delayed) {
 		rack->r_ctl.rc_agg_delayed += delayed;
 		rack->r_late = 1;
 	} else if (early) {
 		rack->r_ctl.rc_agg_early += early;
 		rack->r_early = 1;
 	}
 	/* Now that early/late accounting is done turn off the flag */
 	rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT;
 	rack->r_wanted_output = 0;
 	rack->r_timer_override = 0;
 	if ((tp->t_state != rack->r_state) &&
 	    TCPS_HAVEESTABLISHED(tp->t_state)) {
 		rack_set_state(tp, rack);
 	}
 	if ((rack->r_fast_output) &&
 	    (doing_tlp == 0) &&
 	    (tp->rcv_numsacks == 0)) {
 		int ret;
 
 		error = 0;
 		ret = rack_fast_output(tp, rack, ts_val, cts, ms_cts, &tv, tot_len_this_send, &error);
 		if (ret >= 0)
 			return(ret);
 		else if (error) {
 			inp = rack->rc_inp;
 			so = inp->inp_socket;
 			sb = &so->so_snd;
 			goto nomore;
 		}
 	}
 	inp = rack->rc_inp;
 	/*
 	 * For TFO connections in SYN_SENT or SYN_RECEIVED,
 	 * only allow the initial SYN or SYN|ACK and those sent
 	 * by the retransmit timer.
 	 */
 	if (IS_FASTOPEN(tp->t_flags) &&
 	    ((tp->t_state == TCPS_SYN_RECEIVED) ||
 	     (tp->t_state == TCPS_SYN_SENT)) &&
 	    SEQ_GT(tp->snd_max, tp->snd_una) && /* initial SYN or SYN|ACK sent */
 	    (tp->t_rxtshift == 0)) {              /* not a retransmit */
 		cwnd_to_use = rack->r_ctl.cwnd_to_use = tp->snd_cwnd;
 		so = inp->inp_socket;
 		sb = &so->so_snd;
 		goto just_return_nolock;
 	}
 	/*
 	 * Determine length of data that should be transmitted, and flags
 	 * that will be used. If there is some data or critical controls
 	 * (SYN, RST) to send, then transmit; otherwise, investigate
 	 * further.
 	 */
 	idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una);
 	if (tp->t_idle_reduce) {
 		if (idle && (TICKS_2_USEC(ticks - tp->t_rcvtime) >= tp->t_rxtcur))
 			rack_cc_after_idle(rack, tp);
 	}
 	tp->t_flags &= ~TF_LASTIDLE;
 	if (idle) {
 		if (tp->t_flags & TF_MORETOCOME) {
 			tp->t_flags |= TF_LASTIDLE;
 			idle = 0;
 		}
 	}
 	if ((tp->snd_una == tp->snd_max) &&
 	    rack->r_ctl.rc_went_idle_time &&
 	    TSTMP_GT(cts, rack->r_ctl.rc_went_idle_time)) {
 		idle = cts - rack->r_ctl.rc_went_idle_time;
 		if (idle > rack_min_probertt_hold) {
 			/* Count as a probe rtt */
 			if (rack->in_probe_rtt == 0) {
 				rack->r_ctl.rc_lower_rtt_us_cts = cts;
 				rack->r_ctl.rc_time_probertt_entered = rack->r_ctl.rc_lower_rtt_us_cts;
 				rack->r_ctl.rc_time_probertt_starts = rack->r_ctl.rc_lower_rtt_us_cts;
 				rack->r_ctl.rc_time_of_last_probertt = rack->r_ctl.rc_lower_rtt_us_cts;
 			} else {
 				rack_exit_probertt(rack, cts);
 			}
 		}
 		idle = 0;
 	}
 	if (rack_use_fsb && (rack->r_fsb_inited == 0) && (rack->r_state != TCPS_CLOSED))
 		rack_init_fsb_block(tp, rack);
 again:
 	/*
 	 * If we've recently taken a timeout, snd_max will be greater than
 	 * snd_nxt.  There may be SACK information that allows us to avoid
 	 * resending already delivered data.  Adjust snd_nxt accordingly.
 	 */
 	sendalot = 0;
 	cts = tcp_get_usecs(&tv);
 	ms_cts = tcp_tv_to_mssectick(&tv);
 	tso = 0;
 	mtu = 0;
 	segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs);
 	minseg = segsiz;
 	if (rack->r_ctl.rc_pace_max_segs == 0)
 		pace_max_seg = rack->rc_user_set_max_segs * segsiz;
 	else
 		pace_max_seg = rack->r_ctl.rc_pace_max_segs;
 	sb_offset = tp->snd_max - tp->snd_una;
 	cwnd_to_use = rack->r_ctl.cwnd_to_use = tp->snd_cwnd;
 	flags = tcp_outflags[tp->t_state];
 	while (rack->rc_free_cnt < rack_free_cache) {
 		rsm = rack_alloc(rack);
 		if (rsm == NULL) {
 			if (inp->inp_hpts_calls)
 				/* Retry in a ms */
 				slot = (1 * HPTS_USEC_IN_MSEC);
 			so = inp->inp_socket;
 			sb = &so->so_snd;
 			goto just_return_nolock;
 		}
 		TAILQ_INSERT_TAIL(&rack->r_ctl.rc_free, rsm, r_tnext);
 		rack->rc_free_cnt++;
 		rsm = NULL;
 	}
 	if (inp->inp_hpts_calls)
 		inp->inp_hpts_calls = 0;
 	sack_rxmit = 0;
 	len = 0;
 	rsm = NULL;
 	if (flags & TH_RST) {
 		SOCKBUF_LOCK(&inp->inp_socket->so_snd);
 		so = inp->inp_socket;
 		sb = &so->so_snd;
 		goto send;
 	}
 	if (rack->r_ctl.rc_resend) {
 		/* Retransmit timer */
 		rsm = rack->r_ctl.rc_resend;
 		rack->r_ctl.rc_resend = NULL;
 		len = rsm->r_end - rsm->r_start;
 		sack_rxmit = 1;
 		sendalot = 0;
 		KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start),
 			("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p",
 			 __func__, __LINE__,
 			 rsm->r_start, tp->snd_una, tp, rack, rsm));
 		sb_offset = rsm->r_start - tp->snd_una;
 		if (len >= segsiz)
 			len = segsiz;
 	} else if (rack->r_collapse_point_valid &&
 		   ((rsm = rack_check_collapsed(rack, cts)) != NULL)) {
 		/*
 		 * If an RSM is returned then enough time has passed
 		 * for us to retransmit it. Move up the collapse point,
 		 * since this rsm has its chance to retransmit now.
 		 */
 		rack_trace_point(rack, RACK_TP_COLLAPSED_RXT);
 		rack->r_ctl.last_collapse_point = rsm->r_end;
 		/* Are we done? */
 		if (SEQ_GEQ(rack->r_ctl.last_collapse_point,
 			    rack->r_ctl.high_collapse_point))
 			rack->r_collapse_point_valid = 0;
 		sack_rxmit = 1;
 		/* We are not doing a TLP */
 		doing_tlp = 0;
 		len = rsm->r_end - rsm->r_start;
 		sb_offset = rsm->r_start - tp->snd_una;
 		sendalot = 0;
 		if ((rack->full_size_rxt == 0) &&
 		    (rack->shape_rxt_to_pacing_min == 0) &&
 		    (len >= segsiz))
 			len = segsiz;
 	} else if ((rsm = tcp_rack_output(tp, rack, cts)) != NULL) {
 		/* We have a retransmit that takes precedence */
 		if ((!IN_FASTRECOVERY(tp->t_flags)) &&
 		    ((rsm->r_flags & RACK_MUST_RXT) == 0) &&
 		    ((tp->t_flags & TF_WASFRECOVERY) == 0)) {
 			/* Enter recovery if not induced by a time-out */
 			rack_cong_signal(tp, CC_NDUPACK, tp->snd_una, __LINE__);
 		}
 #ifdef INVARIANTS
 		if (SEQ_LT(rsm->r_start, tp->snd_una)) {
 			panic("Huh, tp:%p rack:%p rsm:%p start:%u < snd_una:%u\n",
 			      tp, rack, rsm, rsm->r_start, tp->snd_una);
 		}
 #endif
 		len = rsm->r_end - rsm->r_start;
 		KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start),
 			("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p",
 			 __func__, __LINE__,
 			 rsm->r_start, tp->snd_una, tp, rack, rsm));
 		sb_offset = rsm->r_start - tp->snd_una;
 		sendalot = 0;
 		if (len >= segsiz)
 			len = segsiz;
 		if (len > 0) {
 			sack_rxmit = 1;
 			KMOD_TCPSTAT_INC(tcps_sack_rexmits);
 			KMOD_TCPSTAT_ADD(tcps_sack_rexmit_bytes,
 			    min(len, segsiz));
 		}
 	} else if (rack->r_ctl.rc_tlpsend) {
 		/* Tail loss probe */
 		long cwin;
 		long tlen;
 
 		/*
 		 * Check if we can do a TLP with a RACK'd packet
 		 * this can happen if we are not doing the rack
 		 * cheat and we skipped to a TLP and it
 		 * went off.
 		 */
 		rsm = rack->r_ctl.rc_tlpsend;
 		/* We are doing a TLP make sure the flag is present */
 		rsm->r_flags |= RACK_TLP;
 		rack->r_ctl.rc_tlpsend = NULL;
 		sack_rxmit = 1;
 		tlen = rsm->r_end - rsm->r_start;
 		if (tlen > segsiz)
 			tlen = segsiz;
 		KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start),
 			("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p",
 			 __func__, __LINE__,
 			 rsm->r_start, tp->snd_una, tp, rack, rsm));
 		sb_offset = rsm->r_start - tp->snd_una;
 		cwin = min(tp->snd_wnd, tlen);
 		len = cwin;
 	}
 	if (rack->r_must_retran &&
 	    (doing_tlp == 0) &&
 	    (SEQ_GT(tp->snd_max, tp->snd_una)) &&
 	    (rsm == NULL)) {
 		/*
 		 * There are two different ways that we
 		 * can get into this block:
 		 * a) This is a non-sack connection, we had a time-out
 		 *    and thus r_must_retran was set and everything
 		 *    left outstanding has been marked for retransmit.
 		 * b) The MTU of the path shrank, so that everything
 		 *    was marked to be retransmitted with the smaller
 		 *    mtu and r_must_retran was set.
 		 *
 		 * This means that we expect the sendmap (outstanding)
 		 * to all be marked must. We can use the tmap to
 		 * look at them.
 		 *
 		 */
 		int sendwin, flight;
 
 		sendwin = min(tp->snd_wnd, tp->snd_cwnd);
 		flight = ctf_flight_size(tp, rack->r_ctl.rc_out_at_rto);
 		if (flight >= sendwin) {
 			/*
 			 * We can't send yet.
 			 */
 			so = inp->inp_socket;
 			sb = &so->so_snd;
 			goto just_return_nolock;
 		}
 		/*
 		 * This is the case a/b mentioned above. All
 		 * outstanding/not-acked should be marked.
 		 * We can use the tmap to find them.
 		 */
 		rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
 		if (rsm == NULL) {
 			/* TSNH */
 			rack->r_must_retran = 0;
 			rack->r_ctl.rc_out_at_rto = 0;
 			so = inp->inp_socket;
 			sb = &so->so_snd;
 			goto just_return_nolock;
 		}
 		if ((rsm->r_flags & RACK_MUST_RXT) == 0) {
 			/*
 			 * The first one does not have the flag, did we collapse
 			 * further up in our list?
 			 */
 			rack->r_must_retran = 0;
 			rack->r_ctl.rc_out_at_rto = 0;
 			rsm = NULL;
 			sack_rxmit = 0;
 		} else {
 			sack_rxmit = 1;
 			len = rsm->r_end - rsm->r_start;
 			sb_offset = rsm->r_start - tp->snd_una;
 			sendalot = 0;
 			if ((rack->full_size_rxt == 0) &&
 			    (rack->shape_rxt_to_pacing_min == 0) &&
 			    (len >= segsiz))
 				len = segsiz;
 			/*
 			 * Delay removing the flag RACK_MUST_RXT so
 			 * that the fastpath for retransmit will
 			 * work with this rsm.
 			 */
 		}
 	}
 	/*
 	 * Enforce a connection sendmap count limit if set
 	 * as long as we are not retransmitting.
 	 */
 	if ((rsm == NULL) &&
 	    (rack->do_detection == 0) &&
 	    (V_tcp_map_entries_limit > 0) &&
 	    (rack->r_ctl.rc_num_maps_alloced >= V_tcp_map_entries_limit)) {
 		counter_u64_add(rack_to_alloc_limited, 1);
 		if (!rack->alloc_limit_reported) {
 			rack->alloc_limit_reported = 1;
 			counter_u64_add(rack_alloc_limited_conns, 1);
 		}
 		so = inp->inp_socket;
 		sb = &so->so_snd;
 		goto just_return_nolock;
 	}
 	if (rsm && (rsm->r_flags & RACK_HAS_FIN)) {
 		/* we are retransmitting the fin */
 		len--;
 		if (len) {
 			/*
 			 * When retransmitting data do *not* include the
 			 * FIN. This could happen from a TLP probe.
 			 */
 			flags &= ~TH_FIN;
 		}
 	}
 	if (rsm && rack->r_fsb_inited && rack_use_rsm_rfo &&
 	    ((rsm->r_flags & RACK_HAS_FIN) == 0)) {
 		int ret;
 
 		ret = rack_fast_rsm_output(tp, rack, rsm, ts_val, cts, ms_cts, &tv, len, doing_tlp);
 		if (ret == 0)
 			return (0);
 	}
 	so = inp->inp_socket;
 	sb = &so->so_snd;
 	if (do_a_prefetch == 0) {
 		kern_prefetch(sb, &do_a_prefetch);
 		do_a_prefetch = 1;
 	}
 #ifdef NETFLIX_SHARED_CWND
 	if ((tp->t_flags2 & TF2_TCP_SCWND_ALLOWED) &&
 	    rack->rack_enable_scwnd) {
 		/* We are doing cwnd sharing */
 		if (rack->gp_ready &&
 		    (rack->rack_attempted_scwnd == 0) &&
 		    (rack->r_ctl.rc_scw == NULL) &&
 		    tp->t_lib) {
 			/* The pcbid is in, lets make an attempt */
 			counter_u64_add(rack_try_scwnd, 1);
 			rack->rack_attempted_scwnd = 1;
 			rack->r_ctl.rc_scw = tcp_shared_cwnd_alloc(tp,
 								   &rack->r_ctl.rc_scw_index,
 								   segsiz);
 		}
 		if (rack->r_ctl.rc_scw &&
 		    (rack->rack_scwnd_is_idle == 1) &&
 		    sbavail(&so->so_snd)) {
 			/* we are no longer out of data */
 			tcp_shared_cwnd_active(rack->r_ctl.rc_scw, rack->r_ctl.rc_scw_index);
 			rack->rack_scwnd_is_idle = 0;
 		}
 		if (rack->r_ctl.rc_scw) {
 			/* First lets update and get the cwnd */
 			rack->r_ctl.cwnd_to_use = cwnd_to_use = tcp_shared_cwnd_update(rack->r_ctl.rc_scw,
 								    rack->r_ctl.rc_scw_index,
 								    tp->snd_cwnd, tp->snd_wnd, segsiz);
 		}
 	}
 #endif
 	/*
 	 * Get standard flags, and add SYN or FIN if requested by 'hidden'
 	 * state flags.
 	 */
 	if (tp->t_flags & TF_NEEDFIN)
 		flags |= TH_FIN;
 	if (tp->t_flags & TF_NEEDSYN)
 		flags |= TH_SYN;
 	if ((sack_rxmit == 0) && (prefetch_rsm == 0)) {
 		void *end_rsm;
 		end_rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_tmap, rack_sendmap, r_tnext);
 		if (end_rsm)
 			kern_prefetch(end_rsm, &prefetch_rsm);
 		prefetch_rsm = 1;
 	}
 	SOCKBUF_LOCK(sb);
 	/*
 	 * If snd_nxt == snd_max and we have transmitted a FIN, the
 	 * sb_offset will be > 0 even if so_snd.sb_cc is 0, resulting in a
 	 * negative length.  This can also occur when TCP opens up its
 	 * congestion window while receiving additional duplicate acks after
 	 * fast-retransmit because TCP will reset snd_nxt to snd_max after
 	 * the fast-retransmit.
 	 *
 	 * In the normal retransmit-FIN-only case, however, snd_nxt will be
 	 * set to snd_una, the sb_offset will be 0, and the length may wind
 	 * up 0.
 	 *
 	 * If sack_rxmit is true we are retransmitting from the scoreboard
 	 * in which case len is already set.
 	 */
 	if ((sack_rxmit == 0) &&
 	    (TCPS_HAVEESTABLISHED(tp->t_state) || IS_FASTOPEN(tp->t_flags))) {
 		uint32_t avail;
 
 		avail = sbavail(sb);
 		if (SEQ_GT(tp->snd_nxt, tp->snd_una) && avail)
 			sb_offset = tp->snd_nxt - tp->snd_una;
 		else
 			sb_offset = 0;
 		if ((IN_FASTRECOVERY(tp->t_flags) == 0) || rack->rack_no_prr) {
 			if (rack->r_ctl.rc_tlp_new_data) {
 				/* TLP is forcing out new data */
 				if (rack->r_ctl.rc_tlp_new_data > (uint32_t) (avail - sb_offset)) {
 					rack->r_ctl.rc_tlp_new_data = (uint32_t) (avail - sb_offset);
 				}
 				if ((rack->r_ctl.rc_tlp_new_data + sb_offset) > tp->snd_wnd) {
 					if (tp->snd_wnd > sb_offset)
 						len = tp->snd_wnd - sb_offset;
 					else
 						len = 0;
 				} else {
 					len = rack->r_ctl.rc_tlp_new_data;
 				}
 				rack->r_ctl.rc_tlp_new_data = 0;
 			}  else {
 				len = rack_what_can_we_send(tp, rack, cwnd_to_use, avail, sb_offset);
 			}
 			if ((rack->r_ctl.crte == NULL) && IN_FASTRECOVERY(tp->t_flags) && (len > segsiz)) {
 				/*
 				 * For prr=off, we need to send only 1 MSS
 				 * at a time. We do this because another sack could
 				 * be arriving that causes us to send retransmits and
 				 * we don't want to be on a long pace due to a larger send
 				 * that keeps us from sending out the retransmit.
 				 */
 				len = segsiz;
 			}
 		} else {
 			uint32_t outstanding;
 			/*
 			 * We are inside of a Fast recovery episode, this
 			 * is caused by a SACK or 3 dup acks. At this point
 			 * we have sent all the retransmissions and we rely
 			 * on PRR to dictate what we will send in the form of
 			 * new data.
 			 */
 
 			outstanding = tp->snd_max - tp->snd_una;
 			if ((rack->r_ctl.rc_prr_sndcnt + outstanding) > tp->snd_wnd) {
 				if (tp->snd_wnd > outstanding) {
 					len = tp->snd_wnd - outstanding;
 					/* Check to see if we have the data */
 					if ((sb_offset + len) > avail) {
 						/* It does not all fit */
 						if (avail > sb_offset)
 							len = avail - sb_offset;
 						else
 							len = 0;
 					}
 				} else {
 					len = 0;
 				}
 			} else if (avail > sb_offset) {
 				len = avail - sb_offset;
 			} else {
 				len = 0;
 			}
 			if (len > 0) {
 				if (len > rack->r_ctl.rc_prr_sndcnt) {
 					len = rack->r_ctl.rc_prr_sndcnt;
 				}
 				if (len > 0) {
 					sub_from_prr = 1;
 				}
 			}
 			if (len > segsiz) {
 				/*
 				 * We should never send more than a MSS when
 				 * retransmitting or sending new data in prr
 				 * mode unless the override flag is on. Most
 				 * likely the PRR algorithm is not going to
 				 * let us send a lot as well :-)
 				 */
 				if (rack->r_ctl.rc_prr_sendalot == 0) {
 					len = segsiz;
 				}
 			} else if (len < segsiz) {
 				/*
 				 * Do we send any? The idea here is if the
 				 * send empty's the socket buffer we want to
 				 * do it. However if not then lets just wait
 				 * for our prr_sndcnt to get bigger.
 				 */
 				long leftinsb;
 
 				leftinsb = sbavail(sb) - sb_offset;
 				if (leftinsb > len) {
 					/* This send does not empty the sb */
 					len = 0;
 				}
 			}
 		}
 	} else if (!TCPS_HAVEESTABLISHED(tp->t_state)) {
 		/*
 		 * If you have not established
 		 * and are not doing FAST OPEN
 		 * no data please.
 		 */
 		if ((sack_rxmit == 0) &&
 		    (!IS_FASTOPEN(tp->t_flags))){
 			len = 0;
 			sb_offset = 0;
 		}
 	}
 	if (prefetch_so_done == 0) {
 		kern_prefetch(so, &prefetch_so_done);
 		prefetch_so_done = 1;
 	}
 	/*
 	 * Lop off SYN bit if it has already been sent.  However, if this is
 	 * SYN-SENT state and if segment contains data and if we don't know
 	 * that foreign host supports TAO, suppress sending segment.
 	 */
 	if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una) &&
 	    ((sack_rxmit == 0) && (tp->t_rxtshift == 0))) {
 		/*
 		 * When sending additional segments following a TFO SYN|ACK,
 		 * do not include the SYN bit.
 		 */
 		if (IS_FASTOPEN(tp->t_flags) &&
 		    (tp->t_state == TCPS_SYN_RECEIVED))
 			flags &= ~TH_SYN;
 	}
 	/*
 	 * Be careful not to send data and/or FIN on SYN segments. This
 	 * measure is needed to prevent interoperability problems with not
 	 * fully conformant TCP implementations.
 	 */
 	if ((flags & TH_SYN) && (tp->t_flags & TF_NOOPT)) {
 		len = 0;
 		flags &= ~TH_FIN;
 	}
 	/*
 	 * On TFO sockets, ensure no data is sent in the following cases:
 	 *
 	 *  - When retransmitting SYN|ACK on a passively-created socket
 	 *
 	 *  - When retransmitting SYN on an actively created socket
 	 *
 	 *  - When sending a zero-length cookie (cookie request) on an
 	 *    actively created socket
 	 *
 	 *  - When the socket is in the CLOSED state (RST is being sent)
 	 */
 	if (IS_FASTOPEN(tp->t_flags) &&
 	    (((flags & TH_SYN) && (tp->t_rxtshift > 0)) ||
 	     ((tp->t_state == TCPS_SYN_SENT) &&
 	      (tp->t_tfo_client_cookie_len == 0)) ||
 	     (flags & TH_RST))) {
 		sack_rxmit = 0;
 		len = 0;
 	}
 	/* Without fast-open there should never be data sent on a SYN */
 	if ((flags & TH_SYN) && (!IS_FASTOPEN(tp->t_flags))) {
 		tp->snd_nxt = tp->iss;
 		len = 0;
 	}
 	if ((len > segsiz) && (tcp_dsack_block_exists(tp))) {
 		/* We only send 1 MSS if we have a DSACK block */
 		add_flag |= RACK_SENT_W_DSACK;
 		len = segsiz;
 	}
 	orig_len = len;
 	if (len <= 0) {
 		/*
 		 * If FIN has been sent but not acked, but we haven't been
 		 * called to retransmit, len will be < 0.  Otherwise, window
 		 * shrank after we sent into it.  If window shrank to 0,
 		 * cancel pending retransmit, pull snd_nxt back to (closed)
 		 * window, and set the persist timer if it isn't already
 		 * going.  If the window didn't close completely, just wait
 		 * for an ACK.
 		 *
 		 * We also do a general check here to ensure that we will
 		 * set the persist timer when we have data to send, but a
 		 * 0-byte window. This makes sure the persist timer is set
 		 * even if the packet hits one of the "goto send" lines
 		 * below.
 		 */
 		len = 0;
 		if ((tp->snd_wnd == 0) &&
 		    (TCPS_HAVEESTABLISHED(tp->t_state)) &&
 		    (tp->snd_una == tp->snd_max) &&
 		    (sb_offset < (int)sbavail(sb))) {
 			rack_enter_persist(tp, rack, cts);
 		}
 	} else if ((rsm == NULL) &&
 		   (doing_tlp == 0) &&
 		   (len < pace_max_seg)) {
 		/*
 		 * We are not sending a maximum sized segment for
 		 * some reason. Should we not send anything (think
 		 * sws or persists)?
 		 */
 		if ((tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), minseg)) &&
 		    (TCPS_HAVEESTABLISHED(tp->t_state)) &&
 		    (len < minseg) &&
 		    (len < (int)(sbavail(sb) - sb_offset))) {
 			/*
 			 * Here the rwnd is less than the minimum pacing
 			 * size, this is not a retransmit, we are
 			 * established, and the send is not the last in
 			 * the socket buffer. We send nothing, and we may
 			 * enter persists if nothing is outstanding.
 			 */
 			len = 0;
 			if (tp->snd_max == tp->snd_una) {
 				/*
 				 * Nothing out we can
 				 * go into persists.
 				 */
 				rack_enter_persist(tp, rack, cts);
 			}
 		     } else if ((cwnd_to_use >= max(minseg, (segsiz * 4))) &&
 			   (ctf_flight_size(tp, rack->r_ctl.rc_sacked) > (2 * segsiz)) &&
 			   (len < (int)(sbavail(sb) - sb_offset)) &&
 			   (len < minseg)) {
 			/*
 			 * Here we are not retransmitting, and
 			 * the cwnd is not so small that we could
 			 * not send at least a min size (rxt timer
 			 * not having gone off), We have 2 segments or
 			 * more already in flight, it's not the tail end
 			 * of the socket buffer, and the cwnd is blocking
 			 * us from sending out a minimum pacing segment size.
 			 * Let's not send anything.
 			 */
 			len = 0;
 		} else if (((tp->snd_wnd - ctf_outstanding(tp)) <
 			    min((rack->r_ctl.rc_high_rwnd/2), minseg)) &&
 			   (ctf_flight_size(tp, rack->r_ctl.rc_sacked) > (2 * segsiz)) &&
 			   (len < (int)(sbavail(sb) - sb_offset)) &&
 			   (TCPS_HAVEESTABLISHED(tp->t_state))) {
 			/*
 			 * Here we have a send window but we have
 			 * filled it up and we can't send another pacing segment.
 			 * We also have in flight more than 2 segments
 			 * and we are not completing the sb i.e. we allow
 			 * the last bytes of the sb to go out even if
 			 * its not a full pacing segment.
 			 */
 			len = 0;
 		} else if ((rack->r_ctl.crte != NULL) &&
 			   (tp->snd_wnd >= (pace_max_seg * max(1, rack_hw_rwnd_factor))) &&
 			   (cwnd_to_use >= (pace_max_seg + (4 * segsiz))) &&
 			   (ctf_flight_size(tp, rack->r_ctl.rc_sacked) >= (2 * segsiz)) &&
 			   (len < (int)(sbavail(sb) - sb_offset))) {
 			/*
 			 * Here we are doing hardware pacing, this is not a TLP,
 			 * we are not sending a pace max segment size, there is rwnd
 			 * room to send at least N pace_max_seg, the cwnd is greater
 			 * than or equal to a full pacing segments plus 4 mss and we have 2 or
 			 * more segments in flight and its not the tail of the socket buffer.
 			 *
 			 * We don't want to send instead we need to get more ack's in to
 			 * allow us to send a full pacing segment. Normally, if we are pacing
 			 * about the right speed, we should have finished our pacing
 			 * send as most of the acks have come back if we are at the
 			 * right rate. This is a bit fuzzy since return path delay
 			 * can delay the acks, which is why we want to make sure we
 			 * have cwnd space to have a bit more than a max pace segments in flight.
 			 *
 			 * If we have not gotten our acks back we are pacing at too high a
 			 * rate delaying will not hurt and will bring our GP estimate down by
 			 * injecting the delay. If we don't do this we will send
 			 * 2 MSS out in response to the acks being clocked in which
 			 * defeats the point of hw-pacing (i.e. to help us get
 			 * larger TSO's out).
 			 */
 			len = 0;
 
 		}
 
 	}
 	/* len will be >= 0 after this point. */
 	KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__));
 	rack_sndbuf_autoscale(rack);
 	/*
 	 * Decide if we can use TCP Segmentation Offloading (if supported by
 	 * hardware).
 	 *
 	 * TSO may only be used if we are in a pure bulk sending state.  The
 	 * presence of TCP-MD5, SACK retransmits, SACK advertizements and IP
 	 * options prevent using TSO.  With TSO the TCP header is the same
 	 * (except for the sequence number) for all generated packets.  This
 	 * makes it impossible to transmit any options which vary per
 	 * generated segment or packet.
 	 *
 	 * IPv4 handling has a clear separation of ip options and ip header
 	 * flags while IPv6 combines both in in6p_outputopts. ip6_optlen() does
 	 * the right thing below to provide length of just ip options and thus
 	 * checking for ipoptlen is enough to decide if ip options are present.
 	 */
 	ipoptlen = 0;
 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
 	/*
 	 * Pre-calculate here as we save another lookup into the darknesses
 	 * of IPsec that way and can actually decide if TSO is ok.
 	 */
 #ifdef INET6
 	if (isipv6 && IPSEC_ENABLED(ipv6))
 		ipsec_optlen = IPSEC_HDRSIZE(ipv6, tp->t_inpcb);
 #ifdef INET
 	else
 #endif
 #endif				/* INET6 */
 #ifdef INET
 		if (IPSEC_ENABLED(ipv4))
 			ipsec_optlen = IPSEC_HDRSIZE(ipv4, tp->t_inpcb);
 #endif				/* INET */
 #endif
 
 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
 	ipoptlen += ipsec_optlen;
 #endif
 	if ((tp->t_flags & TF_TSO) && V_tcp_do_tso && len > segsiz &&
 	    (tp->t_port == 0) &&
 	    ((tp->t_flags & TF_SIGNATURE) == 0) &&
 	    tp->rcv_numsacks == 0 && sack_rxmit == 0 &&
 	    ipoptlen == 0)
 		tso = 1;
 	{
 		uint32_t outstanding __unused;
 
 		outstanding = tp->snd_max - tp->snd_una;
 		if (tp->t_flags & TF_SENTFIN) {
 			/*
 			 * If we sent a fin, snd_max is 1 higher than
 			 * snd_una
 			 */
 			outstanding--;
 		}
 		if (sack_rxmit) {
 			if ((rsm->r_flags & RACK_HAS_FIN) == 0)
 				flags &= ~TH_FIN;
 		} else {
 			if (SEQ_LT(tp->snd_nxt + len, tp->snd_una +
 				   sbused(sb)))
 				flags &= ~TH_FIN;
 		}
 	}
 	recwin = lmin(lmax(sbspace(&so->so_rcv), 0),
 	    (long)TCP_MAXWIN << tp->rcv_scale);
 
 	/*
 	 * Sender silly window avoidance.   We transmit under the following
 	 * conditions when len is non-zero:
 	 *
 	 * - We have a full segment (or more with TSO)
 	 * - This is the last buffer in a write()/send() and we are either
 	 *   idle or running NODELAY
 	 * - we've timed out (e.g. persist timer)
 	 * - we have more than 1/2 the maximum send window's worth of data
 	 *   (receiver may be limiting the window size)
 	 * - we need to retransmit
 	 */
 	if (len) {
 		if (len >= segsiz) {
 			goto send;
 		}
 		/*
 		 * NOTE! on localhost connections an 'ack' from the remote
 		 * end may occur synchronously with the output and cause us
 		 * to flush a buffer queued with moretocome.  XXX
 		 *
 		 */
 		if (!(tp->t_flags & TF_MORETOCOME) &&	/* normal case */
 		    (idle || (tp->t_flags & TF_NODELAY)) &&
 		    ((uint32_t)len + (uint32_t)sb_offset >= sbavail(sb)) &&
 		    (tp->t_flags & TF_NOPUSH) == 0) {
 			pass = 2;
 			goto send;
 		}
 		if ((tp->snd_una == tp->snd_max) && len) {	/* Nothing outstanding */
 			pass = 22;
 			goto send;
 		}
 		if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0) {
 			pass = 4;
 			goto send;
 		}
 		if (SEQ_LT(tp->snd_nxt, tp->snd_max)) {	/* retransmit case */
 			pass = 5;
 			goto send;
 		}
 		if (sack_rxmit) {
 			pass = 6;
 			goto send;
 		}
 		if (((tp->snd_wnd - ctf_outstanding(tp)) < segsiz) &&
 		    (ctf_outstanding(tp) < (segsiz * 2))) {
 			/*
 			 * We have less than two MSS outstanding (delayed ack)
 			 * and our rwnd will not let us send a full sized
 			 * MSS. Lets go ahead and let this small segment
 			 * out because we want to try to have at least two
 			 * packets inflight to not be caught by delayed ack.
 			 */
 			pass = 12;
 			goto send;
 		}
 	}
 	/*
 	 * Sending of standalone window updates.
 	 *
 	 * Window updates are important when we close our window due to a
 	 * full socket buffer and are opening it again after the application
 	 * reads data from it.  Once the window has opened again and the
 	 * remote end starts to send again the ACK clock takes over and
 	 * provides the most current window information.
 	 *
 	 * We must avoid the silly window syndrome whereas every read from
 	 * the receive buffer, no matter how small, causes a window update
 	 * to be sent.  We also should avoid sending a flurry of window
 	 * updates when the socket buffer had queued a lot of data and the
 	 * application is doing small reads.
 	 *
 	 * Prevent a flurry of pointless window updates by only sending an
 	 * update when we can increase the advertized window by more than
 	 * 1/4th of the socket buffer capacity.  When the buffer is getting
 	 * full or is very small be more aggressive and send an update
 	 * whenever we can increase by two mss sized segments. In all other
 	 * situations the ACK's to new incoming data will carry further
 	 * window increases.
 	 *
 	 * Don't send an independent window update if a delayed ACK is
 	 * pending (it will get piggy-backed on it) or the remote side
 	 * already has done a half-close and won't send more data.  Skip
 	 * this if the connection is in T/TCP half-open state.
 	 */
 	if (recwin > 0 && !(tp->t_flags & TF_NEEDSYN) &&
 	    !(tp->t_flags & TF_DELACK) &&
 	    !TCPS_HAVERCVDFIN(tp->t_state)) {
 		/*
 		 * "adv" is the amount we could increase the window, taking
 		 * into account that we are limited by TCP_MAXWIN <<
 		 * tp->rcv_scale.
 		 */
 		int32_t adv;
 		int oldwin;
 
 		adv = recwin;
 		if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt)) {
 			oldwin = (tp->rcv_adv - tp->rcv_nxt);
 			if (adv > oldwin)
 			    adv -= oldwin;
 			else {
 				/* We can't increase the window */
 				adv = 0;
 			}
 		} else
 			oldwin = 0;
 
 		/*
 		 * If the new window size ends up being the same as or less
 		 * than the old size when it is scaled, then don't force
 		 * a window update.
 		 */
 		if (oldwin >> tp->rcv_scale >= (adv + oldwin) >> tp->rcv_scale)
 			goto dontupdate;
 
 		if (adv >= (int32_t)(2 * segsiz) &&
 		    (adv >= (int32_t)(so->so_rcv.sb_hiwat / 4) ||
 		     recwin <= (int32_t)(so->so_rcv.sb_hiwat / 8) ||
 		     so->so_rcv.sb_hiwat <= 8 * segsiz)) {
 			pass = 7;
 			goto send;
 		}
 		if (2 * adv >= (int32_t) so->so_rcv.sb_hiwat) {
 			pass = 23;
 			goto send;
 		}
 	}
 dontupdate:
 
 	/*
 	 * Send if we owe the peer an ACK, RST, SYN, or urgent data.  ACKNOW
 	 * is also a catch-all for the retransmit timer timeout case.
 	 */
 	if (tp->t_flags & TF_ACKNOW) {
 		pass = 8;
 		goto send;
 	}
 	if (((flags & TH_SYN) && (tp->t_flags & TF_NEEDSYN) == 0)) {
 		pass = 9;
 		goto send;
 	}
 	/*
 	 * If our state indicates that FIN should be sent and we have not
 	 * yet done so, then we need to send.
 	 */
 	if ((flags & TH_FIN) &&
 	    (tp->snd_nxt == tp->snd_una)) {
 		pass = 11;
 		goto send;
 	}
 	/*
 	 * No reason to send a segment, just return.
 	 */
 just_return:
 	SOCKBUF_UNLOCK(sb);
 just_return_nolock:
 	{
 		int app_limited = CTF_JR_SENT_DATA;
 
 		if (tot_len_this_send > 0) {
 			/* Make sure snd_nxt is up to max */
 			rack->r_ctl.fsb.recwin = recwin;
 			slot = rack_get_pacing_delay(rack, tp, tot_len_this_send, NULL, segsiz);
 			if ((error == 0) &&
 			    rack_use_rfo &&
 			    ((flags & (TH_SYN|TH_FIN)) == 0) &&
 			    (ipoptlen == 0) &&
 			    (tp->snd_nxt == tp->snd_max) &&
 			    (tp->rcv_numsacks == 0) &&
 			    rack->r_fsb_inited &&
 			    TCPS_HAVEESTABLISHED(tp->t_state) &&
 			    (rack->r_must_retran == 0) &&
 			    ((tp->t_flags & TF_NEEDFIN) == 0) &&
 			    (len > 0) && (orig_len > 0) &&
 			    (orig_len > len) &&
 			    ((orig_len - len) >= segsiz) &&
 			    ((optlen == 0) ||
 			     ((optlen == TCPOLEN_TSTAMP_APPA) && (to.to_flags & TOF_TS)))) {
 				/* We can send at least one more MSS using our fsb */
 
 				rack->r_fast_output = 1;
 				rack->r_ctl.fsb.m = sbsndmbuf(sb, (tp->snd_max - tp->snd_una), &rack->r_ctl.fsb.off);
 				rack->r_ctl.fsb.o_m_len = rack->r_ctl.fsb.m->m_len;
 				rack->r_ctl.fsb.tcp_flags = flags;
 				rack->r_ctl.fsb.left_to_send = orig_len - len;
 				if (hw_tls)
 					rack->r_ctl.fsb.hw_tls = 1;
 				else
 					rack->r_ctl.fsb.hw_tls = 0;
 				KASSERT((rack->r_ctl.fsb.left_to_send <= (sbavail(sb) - (tp->snd_max - tp->snd_una))),
 					("rack:%p left_to_send:%u sbavail:%u out:%u",
 					rack, rack->r_ctl.fsb.left_to_send, sbavail(sb),
 					 (tp->snd_max - tp->snd_una)));
 				if (rack->r_ctl.fsb.left_to_send < segsiz)
 					rack->r_fast_output = 0;
 				else {
 					if (rack->r_ctl.fsb.left_to_send == (sbavail(sb) - (tp->snd_max - tp->snd_una)))
 						rack->r_ctl.fsb.rfo_apply_push = 1;
 					else
 						rack->r_ctl.fsb.rfo_apply_push = 0;
 				}
 			} else
 				rack->r_fast_output = 0;
 
 
 			rack_log_fsb(rack, tp, so, flags,
 				     ipoptlen, orig_len, len, 0,
 				     1, optlen, __LINE__, 1);
 			if (SEQ_GT(tp->snd_max, tp->snd_nxt))
 				tp->snd_nxt = tp->snd_max;
 		} else {
 			int end_window = 0;
 			uint32_t seq = tp->gput_ack;
 
 			rsm = RB_MAX(rack_rb_tree_head, &rack->r_ctl.rc_mtree);
 			if (rsm) {
 				/*
 				 * Mark the last sent that we just-returned (hinting
 				 * that delayed ack may play a role in any rtt measurement).
 				 */
 				rsm->r_just_ret = 1;
 			}
 			counter_u64_add(rack_out_size[TCP_MSS_ACCT_JUSTRET], 1);
 			rack->r_ctl.rc_agg_delayed = 0;
 			rack->r_early = 0;
 			rack->r_late = 0;
 			rack->r_ctl.rc_agg_early = 0;
 			if ((ctf_outstanding(tp) +
 			     min(max(segsiz, (rack->r_ctl.rc_high_rwnd/2)),
 				 minseg)) >= tp->snd_wnd) {
 				/* We are limited by the rwnd */
 				app_limited = CTF_JR_RWND_LIMITED;
 				if (IN_FASTRECOVERY(tp->t_flags))
 				    rack->r_ctl.rc_prr_sndcnt = 0;
 			} else if (ctf_outstanding(tp) >= sbavail(sb)) {
 				/* We are limited by whats available -- app limited */
 				app_limited = CTF_JR_APP_LIMITED;
 				if (IN_FASTRECOVERY(tp->t_flags))
 				    rack->r_ctl.rc_prr_sndcnt = 0;
 			} else if ((idle == 0) &&
 				   ((tp->t_flags & TF_NODELAY) == 0) &&
 				   ((uint32_t)len + (uint32_t)sb_offset >= sbavail(sb)) &&
 				   (len < segsiz)) {
 				/*
 				 * No delay is not on and the
 				 * user is sending less than 1MSS. This
 				 * brings out SWS avoidance so we
 				 * don't send. Another app-limited case.
 				 */
 				app_limited = CTF_JR_APP_LIMITED;
 			} else if (tp->t_flags & TF_NOPUSH) {
 				/*
 				 * The user has requested no push of
 				 * the last segment and we are
 				 * at the last segment. Another app
 				 * limited case.
 				 */
 				app_limited = CTF_JR_APP_LIMITED;
 			} else if ((ctf_outstanding(tp) + minseg) > cwnd_to_use) {
 				/* Its the cwnd */
 				app_limited = CTF_JR_CWND_LIMITED;
 			} else if (IN_FASTRECOVERY(tp->t_flags) &&
 				   (rack->rack_no_prr == 0) &&
 				   (rack->r_ctl.rc_prr_sndcnt < segsiz)) {
 				app_limited = CTF_JR_PRR;
 			} else {
 				/* Now why here are we not sending? */
 #ifdef NOW
 #ifdef INVARIANTS
 				panic("rack:%p hit JR_ASSESSING case cwnd_to_use:%u?", rack, cwnd_to_use);
 #endif
 #endif
 				app_limited = CTF_JR_ASSESSING;
 			}
 			/*
 			 * App limited in some fashion, for our pacing GP
 			 * measurements we don't want any gap (even cwnd).
 			 * Close  down the measurement window.
 			 */
 			if (rack_cwnd_block_ends_measure &&
 			    ((app_limited == CTF_JR_CWND_LIMITED) ||
 			     (app_limited == CTF_JR_PRR))) {
 				/*
 				 * The reason we are not sending is
 				 * the cwnd (or prr). We have been configured
 				 * to end the measurement window in
 				 * this case.
 				 */
 				end_window = 1;
 			} else if (rack_rwnd_block_ends_measure &&
 				   (app_limited == CTF_JR_RWND_LIMITED)) {
 				/*
 				 * We are rwnd limited and have been
 				 * configured to end the measurement
 				 * window in this case.
 				 */
 				end_window = 1;
 			} else if (app_limited == CTF_JR_APP_LIMITED) {
 				/*
 				 * A true application limited period, we have
 				 * run out of data.
 				 */
 				end_window = 1;
 			} else if (app_limited == CTF_JR_ASSESSING) {
 				/*
 				 * In the assessing case we hit the end of
 				 * the if/else and had no known reason.
 				 * This will panic us under invariants
 				 * (only when NOW is also defined).
 				 *
 				 * If we get this out in logs we need to
 				 * investigate which reason we missed.
 				 */
 				end_window = 1;
 			}
 			if (end_window) {
 				uint8_t log = 0;
 
 				/* Adjust the Gput measurement */
 				if ((tp->t_flags & TF_GPUTINPROG) &&
 				    SEQ_GT(tp->gput_ack, tp->snd_max)) {
 					tp->gput_ack = tp->snd_max;
 					if ((tp->gput_ack - tp->gput_seq) < (MIN_GP_WIN * segsiz)) {
 						/*
 						 * There is not enough to measure.
 						 */
 						tp->t_flags &= ~TF_GPUTINPROG;
 						rack_log_pacing_delay_calc(rack, (tp->gput_ack - tp->gput_seq) /*flex2*/,
 									   rack->r_ctl.rc_gp_srtt /*flex1*/,
 									   tp->gput_seq,
 									   0, 0, 18, __LINE__, NULL, 0);
 					} else
 						log = 1;
 				}
 				/* Mark the last packet as app limited */
 				rsm = RB_MAX(rack_rb_tree_head, &rack->r_ctl.rc_mtree);
 				if (rsm && ((rsm->r_flags & RACK_APP_LIMITED) == 0)) {
 					if (rack->r_ctl.rc_app_limited_cnt == 0)
 						rack->r_ctl.rc_end_appl = rack->r_ctl.rc_first_appl = rsm;
 					else {
 						/*
 						 * Go out to the end app limited and mark
 						 * this new one as next and move the end_appl up
 						 * to this guy.
 						 */
 						if (rack->r_ctl.rc_end_appl)
 							rack->r_ctl.rc_end_appl->r_nseq_appl = rsm->r_start;
 						rack->r_ctl.rc_end_appl = rsm;
 					}
 					rsm->r_flags |= RACK_APP_LIMITED;
 					rack->r_ctl.rc_app_limited_cnt++;
 				}
 				if (log)
 					rack_log_pacing_delay_calc(rack,
 								   rack->r_ctl.rc_app_limited_cnt, seq,
 								   tp->gput_ack, 0, 0, 4, __LINE__, NULL, 0);
 			}
 		}
 		/* Check if we need to go into persists or not */
 		if ((tp->snd_max == tp->snd_una) &&
 		    TCPS_HAVEESTABLISHED(tp->t_state) &&
 		    sbavail(sb) &&
 		    (sbavail(sb) > tp->snd_wnd) &&
 		    (tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), minseg))) {
 			/* Yes lets make sure to move to persist before timer-start */
 			rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime);
 		}
 		rack_start_hpts_timer(rack, tp, cts, slot, tot_len_this_send, sup_rack);
 		rack_log_type_just_return(rack, cts, tot_len_this_send, slot, hpts_calling, app_limited, cwnd_to_use);
 	}
 #ifdef NETFLIX_SHARED_CWND
 	if ((sbavail(sb) == 0) &&
 	    rack->r_ctl.rc_scw) {
 		tcp_shared_cwnd_idle(rack->r_ctl.rc_scw, rack->r_ctl.rc_scw_index);
 		rack->rack_scwnd_is_idle = 1;
 	}
 #endif
 #ifdef TCP_ACCOUNTING
 	if (tot_len_this_send > 0) {
 		crtsc = get_cyclecount();
 		if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
 			tp->tcp_cnt_counters[SND_OUT_DATA]++;
 		}
 		counter_u64_add(tcp_cnt_counters[SND_OUT_DATA], 1);
 		if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
 			tp->tcp_proc_time[SND_OUT_DATA] += (crtsc - ts_val);
 		}
 		counter_u64_add(tcp_proc_time[SND_OUT_DATA], (crtsc - ts_val));
 		if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
 			tp->tcp_cnt_counters[CNT_OF_MSS_OUT] += ((tot_len_this_send + segsiz - 1) / segsiz);
 		}
 		counter_u64_add(tcp_cnt_counters[CNT_OF_MSS_OUT], ((tot_len_this_send + segsiz - 1) / segsiz));
 	} else {
 		crtsc = get_cyclecount();
 		if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
 			tp->tcp_cnt_counters[SND_LIMITED]++;
 		}
 		counter_u64_add(tcp_cnt_counters[SND_LIMITED], 1);
 		if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
 			tp->tcp_proc_time[SND_LIMITED] += (crtsc - ts_val);
 		}
 		counter_u64_add(tcp_proc_time[SND_LIMITED], (crtsc - ts_val));
 	}
 	sched_unpin();
 #endif
 	return (0);
 
 send:
 	if (rsm || sack_rxmit)
 		counter_u64_add(rack_nfto_resend, 1);
 	else
 		counter_u64_add(rack_non_fto_send, 1);
 	if ((flags & TH_FIN) &&
 	    sbavail(sb)) {
 		/*
 		 * We do not transmit a FIN
 		 * with data outstanding. We
 		 * need to make it so all data
 		 * is acked first.
 		 */
 		flags &= ~TH_FIN;
 	}
 	/* Enforce stack imposed max seg size if we have one */
 	if (rack->r_ctl.rc_pace_max_segs &&
 	    (len > rack->r_ctl.rc_pace_max_segs)) {
 		mark = 1;
 		len = rack->r_ctl.rc_pace_max_segs;
 	}
 	SOCKBUF_LOCK_ASSERT(sb);
 	if (len > 0) {
 		if (len >= segsiz)
 			tp->t_flags2 |= TF2_PLPMTU_MAXSEGSNT;
 		else
 			tp->t_flags2 &= ~TF2_PLPMTU_MAXSEGSNT;
 	}
 	/*
 	 * Before ESTABLISHED, force sending of initial options unless TCP
 	 * set not to do any options. NOTE: we assume that the IP/TCP header
 	 * plus TCP options always fit in a single mbuf, leaving room for a
 	 * maximum link header, i.e. max_linkhdr + sizeof (struct tcpiphdr)
 	 * + optlen <= MCLBYTES
 	 */
 	optlen = 0;
 #ifdef INET6
 	if (isipv6)
 		hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
 	else
 #endif
 		hdrlen = sizeof(struct tcpiphdr);
 
 	/*
 	 * Compute options for segment. We only have to care about SYN and
 	 * established connection segments.  Options for SYN-ACK segments
 	 * are handled in TCP syncache.
 	 */
 	to.to_flags = 0;
 	if ((tp->t_flags & TF_NOOPT) == 0) {
 		/* Maximum segment size. */
 		if (flags & TH_SYN) {
 			tp->snd_nxt = tp->iss;
 			to.to_mss = tcp_mssopt(&inp->inp_inc);
 			if (tp->t_port)
 				to.to_mss -= V_tcp_udp_tunneling_overhead;
 			to.to_flags |= TOF_MSS;
 
 			/*
 			 * On SYN or SYN|ACK transmits on TFO connections,
 			 * only include the TFO option if it is not a
 			 * retransmit, as the presence of the TFO option may
 			 * have caused the original SYN or SYN|ACK to have
 			 * been dropped by a middlebox.
 			 */
 			if (IS_FASTOPEN(tp->t_flags) &&
 			    (tp->t_rxtshift == 0)) {
 				if (tp->t_state == TCPS_SYN_RECEIVED) {
 					to.to_tfo_len = TCP_FASTOPEN_COOKIE_LEN;
 					to.to_tfo_cookie =
 						(u_int8_t *)&tp->t_tfo_cookie.server;
 					to.to_flags |= TOF_FASTOPEN;
 					wanted_cookie = 1;
 				} else if (tp->t_state == TCPS_SYN_SENT) {
 					to.to_tfo_len =
 						tp->t_tfo_client_cookie_len;
 					to.to_tfo_cookie =
 						tp->t_tfo_cookie.client;
 					to.to_flags |= TOF_FASTOPEN;
 					wanted_cookie = 1;
 					/*
 					 * If we wind up having more data to
 					 * send with the SYN than can fit in
 					 * one segment, don't send any more
 					 * until the SYN|ACK comes back from
 					 * the other end.
 					 */
 					sendalot = 0;
 				}
 			}
 		}
 		/* Window scaling. */
 		if ((flags & TH_SYN) && (tp->t_flags & TF_REQ_SCALE)) {
 			to.to_wscale = tp->request_r_scale;
 			to.to_flags |= TOF_SCALE;
 		}
 		/* Timestamps. */
 		if ((tp->t_flags & TF_RCVD_TSTMP) ||
 		    ((flags & TH_SYN) && (tp->t_flags & TF_REQ_TSTMP))) {
 			to.to_tsval = ms_cts + tp->ts_offset;
 			to.to_tsecr = tp->ts_recent;
 			to.to_flags |= TOF_TS;
 		}
 		/* Set receive buffer autosizing timestamp. */
 		if (tp->rfbuf_ts == 0 &&
 		    (so->so_rcv.sb_flags & SB_AUTOSIZE))
 			tp->rfbuf_ts = tcp_ts_getticks();
 		/* Selective ACK's. */
 		if (tp->t_flags & TF_SACK_PERMIT) {
 			if (flags & TH_SYN)
 				to.to_flags |= TOF_SACKPERM;
 			else if (TCPS_HAVEESTABLISHED(tp->t_state) &&
 				 tp->rcv_numsacks > 0) {
 				to.to_flags |= TOF_SACK;
 				to.to_nsacks = tp->rcv_numsacks;
 				to.to_sacks = (u_char *)tp->sackblks;
 			}
 		}
 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
 		/* TCP-MD5 (RFC2385). */
 		if (tp->t_flags & TF_SIGNATURE)
 			to.to_flags |= TOF_SIGNATURE;
 #endif				/* TCP_SIGNATURE */
 
 		/* Processing the options. */
 		hdrlen += optlen = tcp_addoptions(&to, opt);
 		/*
 		 * If we wanted a TFO option to be added, but it was unable
 		 * to fit, ensure no data is sent.
 		 */
 		if (IS_FASTOPEN(tp->t_flags) && wanted_cookie &&
 		    !(to.to_flags & TOF_FASTOPEN))
 			len = 0;
 	}
 	if (tp->t_port) {
 		if (V_tcp_udp_tunneling_port == 0) {
 			/* The port was removed?? */
 			SOCKBUF_UNLOCK(&so->so_snd);
 #ifdef TCP_ACCOUNTING
 			crtsc = get_cyclecount();
 			if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
 				tp->tcp_cnt_counters[SND_OUT_FAIL]++;
 			}
 			counter_u64_add(tcp_cnt_counters[SND_OUT_FAIL], 1);
 			if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
 				tp->tcp_proc_time[SND_OUT_FAIL] += (crtsc - ts_val);
 			}
 			counter_u64_add(tcp_proc_time[SND_OUT_FAIL], (crtsc - ts_val));
 			sched_unpin();
 #endif
 			return (EHOSTUNREACH);
 		}
 		hdrlen += sizeof(struct udphdr);
 	}
 #ifdef INET6
 	if (isipv6)
 		ipoptlen = ip6_optlen(tp->t_inpcb);
 	else
 #endif
 		if (tp->t_inpcb->inp_options)
 			ipoptlen = tp->t_inpcb->inp_options->m_len -
 				offsetof(struct ipoption, ipopt_list);
 		else
 			ipoptlen = 0;
 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
 	ipoptlen += ipsec_optlen;
 #endif
 
 	/*
 	 * Adjust data length if insertion of options will bump the packet
 	 * length beyond the t_maxseg length. Clear the FIN bit because we
 	 * cut off the tail of the segment.
 	 */
 	if (len + optlen + ipoptlen > tp->t_maxseg) {
 		if (tso) {
 			uint32_t if_hw_tsomax;
 			uint32_t moff;
 			int32_t max_len;
 
 			/* extract TSO information */
 			if_hw_tsomax = tp->t_tsomax;
 			if_hw_tsomaxsegcount = tp->t_tsomaxsegcount;
 			if_hw_tsomaxsegsize = tp->t_tsomaxsegsize;
 			KASSERT(ipoptlen == 0,
 				("%s: TSO can't do IP options", __func__));
 
 			/*
 			 * Check if we should limit by maximum payload
 			 * length:
 			 */
 			if (if_hw_tsomax != 0) {
 				/* compute maximum TSO length */
 				max_len = (if_hw_tsomax - hdrlen -
 					   max_linkhdr);
 				if (max_len <= 0) {
 					len = 0;
 				} else if (len > max_len) {
 					sendalot = 1;
 					len = max_len;
 					mark = 2;
 				}
 			}
 			/*
 			 * Prevent the last segment from being fractional
 			 * unless the send sockbuf can be emptied:
 			 */
 			max_len = (tp->t_maxseg - optlen);
 			if ((sb_offset + len) < sbavail(sb)) {
 				moff = len % (u_int)max_len;
 				if (moff != 0) {
 					mark = 3;
 					len -= moff;
 				}
 			}
 			/*
 			 * In case there are too many small fragments don't
 			 * use TSO:
 			 */
 			if (len <= segsiz) {
 				mark = 4;
 				tso = 0;
 			}
 			/*
 			 * Send the FIN in a separate segment after the bulk
 			 * sending is done. We don't trust the TSO
 			 * implementations to clear the FIN flag on all but
 			 * the last segment.
 			 */
 			if (tp->t_flags & TF_NEEDFIN) {
 				sendalot = 4;
 			}
 		} else {
 			mark = 5;
 			if (optlen + ipoptlen >= tp->t_maxseg) {
 				/*
 				 * Since we don't have enough space to put
 				 * the IP header chain and the TCP header in
 				 * one packet as required by RFC 7112, don't
 				 * send it. Also ensure that at least one
 				 * byte of the payload can be put into the
 				 * TCP segment.
 				 */
 				SOCKBUF_UNLOCK(&so->so_snd);
 				error = EMSGSIZE;
 				sack_rxmit = 0;
 				goto out;
 			}
 			len = tp->t_maxseg - optlen - ipoptlen;
 			sendalot = 5;
 		}
 	} else {
 		tso = 0;
 		mark = 6;
 	}
 	KASSERT(len + hdrlen + ipoptlen <= IP_MAXPACKET,
 		("%s: len > IP_MAXPACKET", __func__));
 #ifdef DIAGNOSTIC
 #ifdef INET6
 	if (max_linkhdr + hdrlen > MCLBYTES)
 #else
 		if (max_linkhdr + hdrlen > MHLEN)
 #endif
 			panic("tcphdr too big");
 #endif
 
 	/*
 	 * This KASSERT is here to catch edge cases at a well defined place.
 	 * Before, those had triggered (random) panic conditions further
 	 * down.
 	 */
 	KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__));
 	if ((len == 0) &&
 	    (flags & TH_FIN) &&
 	    (sbused(sb))) {
 		/*
 		 * We have outstanding data, don't send a fin by itself!
 		 */
 		goto just_return;
 	}
 	/*
 	 * Grab a header mbuf, attaching a copy of data to be transmitted,
 	 * and initialize the header from the template for sends on this
 	 * connection.
 	 */
 	hw_tls = (sb->sb_flags & SB_TLS_IFNET) != 0;
 	if (len) {
 		uint32_t max_val;
 		uint32_t moff;
 
 		if (rack->r_ctl.rc_pace_max_segs)
 			max_val = rack->r_ctl.rc_pace_max_segs;
 		else if (rack->rc_user_set_max_segs)
 			max_val = rack->rc_user_set_max_segs * segsiz;
 		else
 			max_val = len;
 		/*
 		 * We allow a limit on sending with hptsi.
 		 */
 		if (len > max_val) {
 			mark = 7;
 			len = max_val;
 		}
 #ifdef INET6
 		if (MHLEN < hdrlen + max_linkhdr)
 			m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
 		else
 #endif
 			m = m_gethdr(M_NOWAIT, MT_DATA);
 
 		if (m == NULL) {
 			SOCKBUF_UNLOCK(sb);
 			error = ENOBUFS;
 			sack_rxmit = 0;
 			goto out;
 		}
 		m->m_data += max_linkhdr;
 		m->m_len = hdrlen;
 
 		/*
 		 * Start the m_copy functions from the closest mbuf to the
 		 * sb_offset in the socket buffer chain.
 		 */
 		mb = sbsndptr_noadv(sb, sb_offset, &moff);
 		s_mb = mb;
 		s_moff = moff;
 		if (len <= MHLEN - hdrlen - max_linkhdr && !hw_tls) {
 			m_copydata(mb, moff, (int)len,
 				   mtod(m, caddr_t)+hdrlen);
 			if (SEQ_LT(tp->snd_nxt, tp->snd_max))
 				sbsndptr_adv(sb, mb, len);
 			m->m_len += len;
 		} else {
 			struct sockbuf *msb;
 
 			if (SEQ_LT(tp->snd_nxt, tp->snd_max))
 				msb = NULL;
 			else
 				msb = sb;
 			m->m_next = tcp_m_copym(
 				mb, moff, &len,
 				if_hw_tsomaxsegcount, if_hw_tsomaxsegsize, msb,
 				((rsm == NULL) ? hw_tls : 0)
 #ifdef NETFLIX_COPY_ARGS
 				, &s_mb, &s_moff
 #endif
 				);
 			if (len <= (tp->t_maxseg - optlen)) {
 				/*
 				 * Must have run out of mbufs for the copy;
 				 * it was shortened so it no longer needs TSO.
 				 * Let's not set sendalot since we are low
 				 * on mbufs.
 				 */
 				tso = 0;
 			}
 			if (m->m_next == NULL) {
 				SOCKBUF_UNLOCK(sb);
 				(void)m_free(m);
 				error = ENOBUFS;
 				sack_rxmit = 0;
 				goto out;
 			}
 		}
 		if (SEQ_LT(tp->snd_nxt, tp->snd_max) || sack_rxmit) {
 			if (rsm && (rsm->r_flags & RACK_TLP)) {
 				/*
 				 * TLP should not count in retran count, but
 				 * in its own bin
 				 */
 				counter_u64_add(rack_tlp_retran, 1);
 				counter_u64_add(rack_tlp_retran_bytes, len);
 			} else {
 				tp->t_sndrexmitpack++;
 				KMOD_TCPSTAT_INC(tcps_sndrexmitpack);
 				KMOD_TCPSTAT_ADD(tcps_sndrexmitbyte, len);
 			}
 #ifdef STATS
 			stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RETXPB,
 						 len);
 #endif
 		} else {
 			KMOD_TCPSTAT_INC(tcps_sndpack);
 			KMOD_TCPSTAT_ADD(tcps_sndbyte, len);
 #ifdef STATS
 			stats_voi_update_abs_u64(tp->t_stats, VOI_TCP_TXPB,
 						 len);
 #endif
 		}
 		/*
 		 * If we're sending everything we've got, set PUSH. (This
 		 * will keep happy those implementations which only give
 		 * data to the user when a buffer fills or a PUSH comes in.)
 		 */
 		if (sb_offset + len == sbused(sb) &&
 		    sbused(sb) &&
 		    !(flags & TH_SYN)) {
 			flags |= TH_PUSH;
 			add_flag |= RACK_HAD_PUSH;
 		}
 
 		SOCKBUF_UNLOCK(sb);
 	} else {
 		SOCKBUF_UNLOCK(sb);
 		if (tp->t_flags & TF_ACKNOW)
 			KMOD_TCPSTAT_INC(tcps_sndacks);
 		else if (flags & (TH_SYN | TH_FIN | TH_RST))
 			KMOD_TCPSTAT_INC(tcps_sndctrl);
 		else
 			KMOD_TCPSTAT_INC(tcps_sndwinup);
 
 		m = m_gethdr(M_NOWAIT, MT_DATA);
 		if (m == NULL) {
 			error = ENOBUFS;
 			sack_rxmit = 0;
 			goto out;
 		}
 #ifdef INET6
 		if (isipv6 && (MHLEN < hdrlen + max_linkhdr) &&
 		    MHLEN >= hdrlen) {
 			M_ALIGN(m, hdrlen);
 		} else
 #endif
 			m->m_data += max_linkhdr;
 		m->m_len = hdrlen;
 	}
 	SOCKBUF_UNLOCK_ASSERT(sb);
 	m->m_pkthdr.rcvif = (struct ifnet *)0;
 #ifdef MAC
 	mac_inpcb_create_mbuf(inp, m);
 #endif
 	if ((ipoptlen == 0) && (rack->r_ctl.fsb.tcp_ip_hdr) &&  rack->r_fsb_inited) {
 #ifdef INET6
 		if (isipv6)
 			ip6 = (struct ip6_hdr *)rack->r_ctl.fsb.tcp_ip_hdr;
 		else
 #endif				/* INET6 */
 			ip = (struct ip *)rack->r_ctl.fsb.tcp_ip_hdr;
 		th = rack->r_ctl.fsb.th;
 		udp = rack->r_ctl.fsb.udp;
 		if (udp) {
 #ifdef INET6
 			if (isipv6)
 				ulen = hdrlen + len - sizeof(struct ip6_hdr);
 			else
 #endif				/* INET6 */
 				ulen = hdrlen + len - sizeof(struct ip);
 			udp->uh_ulen = htons(ulen);
 		}
 	} else {
 #ifdef INET6
 		if (isipv6) {
 			ip6 = mtod(m, struct ip6_hdr *);
 			if (tp->t_port) {
 				udp = (struct udphdr *)((caddr_t)ip6 + sizeof(struct ip6_hdr));
 				udp->uh_sport = htons(V_tcp_udp_tunneling_port);
 				udp->uh_dport = tp->t_port;
 				ulen = hdrlen + len - sizeof(struct ip6_hdr);
 				udp->uh_ulen = htons(ulen);
 				th = (struct tcphdr *)(udp + 1);
 			} else
 				th = (struct tcphdr *)(ip6 + 1);
 			tcpip_fillheaders(inp, tp->t_port, ip6, th);
 		} else
 #endif				/* INET6 */
 		{
 			ip = mtod(m, struct ip *);
 			if (tp->t_port) {
 				udp = (struct udphdr *)((caddr_t)ip + sizeof(struct ip));
 				udp->uh_sport = htons(V_tcp_udp_tunneling_port);
 				udp->uh_dport = tp->t_port;
 				ulen = hdrlen + len - sizeof(struct ip);
 				udp->uh_ulen = htons(ulen);
 				th = (struct tcphdr *)(udp + 1);
 			} else
 				th = (struct tcphdr *)(ip + 1);
 			tcpip_fillheaders(inp, tp->t_port, ip, th);
 		}
 	}
 	/*
 	 * Fill in fields, remembering maximum advertised window for use in
 	 * delaying messages about window sizes. If resending a FIN, be sure
 	 * not to use a new sequence number.
 	 */
 	if (flags & TH_FIN && tp->t_flags & TF_SENTFIN &&
 	    tp->snd_nxt == tp->snd_max)
 		tp->snd_nxt--;
 	/*
 	 * If we are starting a connection, send ECN setup SYN packet. If we
 	 * are on a retransmit, we may resend those bits a number of times
 	 * as per RFC 3168.
 	 */
 	if (tp->t_state == TCPS_SYN_SENT && V_tcp_do_ecn) {
 		flags |= tcp_ecn_output_syn_sent(tp);
 	}
 	/* Also handle parallel SYN for ECN */
 	if (TCPS_HAVERCVDSYN(tp->t_state) &&
 	    (tp->t_flags2 & (TF2_ECN_PERMIT | TF2_ACE_PERMIT))) {
 		int ect = tcp_ecn_output_established(tp, &flags, len, sack_rxmit);
 		if ((tp->t_state == TCPS_SYN_RECEIVED) &&
 		    (tp->t_flags2 & TF2_ECN_SND_ECE))
 			tp->t_flags2 &= ~TF2_ECN_SND_ECE;
 #ifdef INET6
 		if (isipv6) {
 			ip6->ip6_flow &= ~htonl(IPTOS_ECN_MASK << 20);
 			ip6->ip6_flow |= htonl(ect << 20);
 		}
 		else
 #endif
 		{
 			ip->ip_tos &= ~IPTOS_ECN_MASK;
 			ip->ip_tos |= ect;
 		}
 	}
 	/*
 	 * If we are doing retransmissions, then snd_nxt will not reflect
 	 * the first unsent octet.  For ACK only packets, we do not want the
 	 * sequence number of the retransmitted packet, we want the sequence
 	 * number of the next unsent octet.  So, if there is no data (and no
 	 * SYN or FIN), use snd_max instead of snd_nxt when filling in
 	 * ti_seq.  But if we are in persist state, snd_max might reflect
 	 * one byte beyond the right edge of the window, so use snd_nxt in
 	 * that case, since we know we aren't doing a retransmission.
 	 * (retransmit and persist are mutually exclusive...)
 	 */
 	if (sack_rxmit == 0) {
 		if (len || (flags & (TH_SYN | TH_FIN))) {
 			th->th_seq = htonl(tp->snd_nxt);
 			rack_seq = tp->snd_nxt;
 		} else {
 			th->th_seq = htonl(tp->snd_max);
 			rack_seq = tp->snd_max;
 		}
 	} else {
 		th->th_seq = htonl(rsm->r_start);
 		rack_seq = rsm->r_start;
 	}
 	th->th_ack = htonl(tp->rcv_nxt);
 	tcp_set_flags(th, flags);
 	/*
 	 * Calculate receive window.  Don't shrink window, but avoid silly
 	 * window syndrome.
 	 * If a RST segment is sent, advertise a window of zero.
 	 */
 	if (flags & TH_RST) {
 		recwin = 0;
 	} else {
 		if (recwin < (long)(so->so_rcv.sb_hiwat / 4) &&
 		    recwin < (long)segsiz) {
 			recwin = 0;
 		}
 		if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt) &&
 		    recwin < (long)(tp->rcv_adv - tp->rcv_nxt))
 			recwin = (long)(tp->rcv_adv - tp->rcv_nxt);
 	}
 
 	/*
 	 * According to RFC1323 the window field in a SYN (i.e., a <SYN> or
 	 * <SYN,ACK>) segment itself is never scaled.  The <SYN,ACK> case is
 	 * handled in syncache.
 	 */
 	if (flags & TH_SYN)
 		th->th_win = htons((u_short)
 				   (min(sbspace(&so->so_rcv), TCP_MAXWIN)));
 	else {
 		/* Avoid shrinking window with window scaling. */
 		recwin = roundup2(recwin, 1 << tp->rcv_scale);
 		th->th_win = htons((u_short)(recwin >> tp->rcv_scale));
 	}
 	/*
 	 * Adjust the RXWIN0SENT flag - indicate that we have advertised a 0
 	 * window.  This may cause the remote transmitter to stall.  This
 	 * flag tells soreceive() to disable delayed acknowledgements when
 	 * draining the buffer.  This can occur if the receiver is
 	 * attempting to read more data than can be buffered prior to
 	 * transmitting on the connection.
 	 */
 	if (th->th_win == 0) {
 		tp->t_sndzerowin++;
 		tp->t_flags |= TF_RXWIN0SENT;
 	} else
 		tp->t_flags &= ~TF_RXWIN0SENT;
 	tp->snd_up = tp->snd_una;	/* drag it along, its deprecated */
 	/* Now are we using fsb?, if so copy the template data to the mbuf */
 	if ((ipoptlen == 0) && (rack->r_ctl.fsb.tcp_ip_hdr) && rack->r_fsb_inited) {
 		uint8_t *cpto;
 
 		cpto = mtod(m, uint8_t *);
 		memcpy(cpto, rack->r_ctl.fsb.tcp_ip_hdr, rack->r_ctl.fsb.tcp_ip_hdr_len);
 		/*
 		 * We have just copied in:
 		 * IP/IP6
 		 * <optional udphdr>
 		 * tcphdr (no options)
 		 *
 		 * We need to grab the correct pointers into the mbuf
 		 * for both the tcp header, and possibly the udp header (if tunneling).
 		 * We do this by using the offset in the copy buffer and adding it
 		 * to the mbuf base pointer (cpto).
 		 */
 #ifdef INET6
 		if (isipv6)
 			ip6 = mtod(m, struct ip6_hdr *);
 		else
 #endif				/* INET6 */
 			ip = mtod(m, struct ip *);
 		th = (struct tcphdr *)(cpto + ((uint8_t *)rack->r_ctl.fsb.th - rack->r_ctl.fsb.tcp_ip_hdr));
 		/* If we have a udp header lets set it into the mbuf as well */
 		if (udp)
 			udp = (struct udphdr *)(cpto + ((uint8_t *)rack->r_ctl.fsb.udp - rack->r_ctl.fsb.tcp_ip_hdr));
 	}
 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
 	if (to.to_flags & TOF_SIGNATURE) {
 		/*
 		 * Calculate MD5 signature and put it into the place
 		 * determined before.
 		 * NOTE: since TCP options buffer doesn't point into
 		 * mbuf's data, calculate offset and use it.
 		 */
 		if (!TCPMD5_ENABLED() || TCPMD5_OUTPUT(m, th,
 						       (u_char *)(th + 1) + (to.to_signature - opt)) != 0) {
 			/*
 			 * Do not send segment if the calculation of MD5
 			 * digest has failed.
 			 */
 			goto out;
 		}
 	}
 #endif
 	if (optlen) {
 		bcopy(opt, th + 1, optlen);
 		th->th_off = (sizeof(struct tcphdr) + optlen) >> 2;
 	}
 	/*
 	 * Put TCP length in extended header, and then checksum extended
 	 * header and data.
 	 */
 	m->m_pkthdr.len = hdrlen + len;	/* in6_cksum() need this */
 #ifdef INET6
 	if (isipv6) {
 		/*
 		 * ip6_plen is not need to be filled now, and will be filled
 		 * in ip6_output.
 		 */
 		if (tp->t_port) {
 			m->m_pkthdr.csum_flags = CSUM_UDP_IPV6;
 			m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
 			udp->uh_sum = in6_cksum_pseudo(ip6, ulen, IPPROTO_UDP, 0);
 			th->th_sum = htons(0);
 			UDPSTAT_INC(udps_opackets);
 		} else {
 			m->m_pkthdr.csum_flags = CSUM_TCP_IPV6;
 			m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
 			th->th_sum = in6_cksum_pseudo(ip6,
 						      sizeof(struct tcphdr) + optlen + len, IPPROTO_TCP,
 						      0);
 		}
 	}
 #endif
 #if defined(INET6) && defined(INET)
 	else
 #endif
 #ifdef INET
 	{
 		if (tp->t_port) {
 			m->m_pkthdr.csum_flags = CSUM_UDP;
 			m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
 			udp->uh_sum = in_pseudo(ip->ip_src.s_addr,
 						ip->ip_dst.s_addr, htons(ulen + IPPROTO_UDP));
 			th->th_sum = htons(0);
 			UDPSTAT_INC(udps_opackets);
 		} else {
 			m->m_pkthdr.csum_flags = CSUM_TCP;
 			m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
 			th->th_sum = in_pseudo(ip->ip_src.s_addr,
 					       ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) +
 									IPPROTO_TCP + len + optlen));
 		}
 		/* IP version must be set here for ipv4/ipv6 checking later */
 		KASSERT(ip->ip_v == IPVERSION,
 			("%s: IP version incorrect: %d", __func__, ip->ip_v));
 	}
 #endif
 	/*
 	 * Enable TSO and specify the size of the segments. The TCP pseudo
 	 * header checksum is always provided. XXX: Fixme: This is currently
 	 * not the case for IPv6.
 	 */
 	if (tso) {
 		KASSERT(len > tp->t_maxseg - optlen,
 			("%s: len <= tso_segsz", __func__));
 		m->m_pkthdr.csum_flags |= CSUM_TSO;
 		m->m_pkthdr.tso_segsz = tp->t_maxseg - optlen;
 	}
 	KASSERT(len + hdrlen == m_length(m, NULL),
 		("%s: mbuf chain different than expected: %d + %u != %u",
 		 __func__, len, hdrlen, m_length(m, NULL)));
 
 #ifdef TCP_HHOOK
 	/* Run HHOOK_TCP_ESTABLISHED_OUT helper hooks. */
 	hhook_run_tcp_est_out(tp, th, &to, len, tso);
 #endif
 	/* We're getting ready to send; log now. */
 	if (tp->t_logstate != TCP_LOG_STATE_OFF) {
 		union tcp_log_stackspecific log;
 
 		memset(&log.u_bbr, 0, sizeof(log.u_bbr));
 		log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
 		if (rack->rack_no_prr)
 			log.u_bbr.flex1 = 0;
 		else
 			log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt;
 		log.u_bbr.flex2 = rack->r_ctl.rc_pace_min_segs;
 		log.u_bbr.flex3 = rack->r_ctl.rc_pace_max_segs;
 		log.u_bbr.flex4 = orig_len;
 		/* Save off the early/late values */
 		log.u_bbr.flex6 = rack->r_ctl.rc_agg_early;
 		log.u_bbr.applimited = rack->r_ctl.rc_agg_delayed;
 		log.u_bbr.bw_inuse = rack_get_bw(rack);
 		log.u_bbr.flex8 = 0;
 		if (rsm) {
 			if (rsm->r_flags & RACK_RWND_COLLAPSED) {
 				rack_log_collapse(rack, rsm->r_start, rsm->r_end, 0, __LINE__, 5, rsm->r_flags, rsm);
 				counter_u64_add(rack_collapsed_win_rxt, 1);
 				counter_u64_add(rack_collapsed_win_rxt_bytes, (rsm->r_end - rsm->r_start));
 			}
 			if (doing_tlp)
 				log.u_bbr.flex8 = 2;
 			else
 				log.u_bbr.flex8 = 1;
 		} else {
 			if (doing_tlp)
 				log.u_bbr.flex8 = 3;
 			else
 				log.u_bbr.flex8 = 0;
 		}
 		log.u_bbr.pacing_gain = rack_get_output_gain(rack, rsm);
 		log.u_bbr.flex7 = mark;
 		log.u_bbr.flex7 <<= 8;
 		log.u_bbr.flex7 |= pass;
 		log.u_bbr.pkts_out = tp->t_maxseg;
 		log.u_bbr.timeStamp = cts;
 		log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
 		log.u_bbr.lt_epoch = cwnd_to_use;
 		log.u_bbr.delivered = sendalot;
 		lgb = tcp_log_event_(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_OUT, ERRNO_UNK,
 				     len, &log, false, NULL, NULL, 0, &tv);
 	} else
 		lgb = NULL;
 
 	/*
 	 * Fill in IP length and desired time to live and send to IP level.
 	 * There should be a better way to handle ttl and tos; we could keep
 	 * them in the template, but need a way to checksum without them.
 	 */
 	/*
 	 * m->m_pkthdr.len should have been set before cksum calculation,
 	 * because in6_cksum() needs it.
 	 */
 #ifdef INET6
 	if (isipv6) {
 		/*
 		 * we separately set hoplimit for every segment, since the
 		 * user might want to change the value via setsockopt. Also,
 		 * desired default hop limit might be changed via Neighbor
 		 * Discovery.
 		 */
 		rack->r_ctl.fsb.hoplimit = ip6->ip6_hlim = in6_selecthlim(inp, NULL);
 
 		/*
 		 * Set the packet size here for the benefit of DTrace
 		 * probes. ip6_output() will set it properly; it's supposed
 		 * to include the option header lengths as well.
 		 */
 		ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(*ip6));
 
 		if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss)
 			tp->t_flags2 |= TF2_PLPMTU_PMTUD;
 		else
 			tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
 
 		if (tp->t_state == TCPS_SYN_SENT)
 			TCP_PROBE5(connect__request, NULL, tp, ip6, tp, th);
 
 		TCP_PROBE5(send, NULL, tp, ip6, tp, th);
 		/* TODO: IPv6 IP6TOS_ECT bit on */
 		error = ip6_output(m,
 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
 				   inp->in6p_outputopts,
 #else
 				   NULL,
 #endif
 				   &inp->inp_route6,
 				   ((rsm || sack_rxmit) ? IP_NO_SND_TAG_RL : 0),
 				   NULL, NULL, inp);
 
 		if (error == EMSGSIZE && inp->inp_route6.ro_nh != NULL)
 			mtu = inp->inp_route6.ro_nh->nh_mtu;
 	}
 #endif				/* INET6 */
 #if defined(INET) && defined(INET6)
 	else
 #endif
 #ifdef INET
 	{
 		ip->ip_len = htons(m->m_pkthdr.len);
 #ifdef INET6
 		if (inp->inp_vflag & INP_IPV6PROTO)
 			ip->ip_ttl = in6_selecthlim(inp, NULL);
 #endif				/* INET6 */
 		rack->r_ctl.fsb.hoplimit = ip->ip_ttl;
 		/*
 		 * If we do path MTU discovery, then we set DF on every
 		 * packet. This might not be the best thing to do according
 		 * to RFC3390 Section 2. However the tcp hostcache mitigates
 		 * the problem so it affects only the first tcp connection
 		 * with a host.
 		 *
 		 * NB: Don't set DF on small MTU/MSS to have a safe
 		 * fallback.
 		 */
 		if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) {
 			tp->t_flags2 |= TF2_PLPMTU_PMTUD;
 			if (tp->t_port == 0 || len < V_tcp_minmss) {
 				ip->ip_off |= htons(IP_DF);
 			}
 		} else {
 			tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
 		}
 
 		if (tp->t_state == TCPS_SYN_SENT)
 			TCP_PROBE5(connect__request, NULL, tp, ip, tp, th);
 
 		TCP_PROBE5(send, NULL, tp, ip, tp, th);
 
 		error = ip_output(m,
 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
 				  inp->inp_options,
 #else
 				  NULL,
 #endif
 				  &inp->inp_route,
 				  ((rsm || sack_rxmit) ? IP_NO_SND_TAG_RL : 0), 0,
 				  inp);
 		if (error == EMSGSIZE && inp->inp_route.ro_nh != NULL)
 			mtu = inp->inp_route.ro_nh->nh_mtu;
 	}
 #endif				/* INET */
 
 out:
 	if (lgb) {
 		lgb->tlb_errno = error;
 		lgb = NULL;
 	}
 	/*
 	 * In transmit state, time the transmission and arrange for the
 	 * retransmit.  In persist state, just set snd_max.
 	 */
 	if (error == 0) {
 		tcp_account_for_send(tp, len, (rsm != NULL), doing_tlp, hw_tls);
 		if (rsm && doing_tlp) {
 			rack->rc_last_sent_tlp_past_cumack = 0;
 			rack->rc_last_sent_tlp_seq_valid = 1;
 			rack->r_ctl.last_sent_tlp_seq = rsm->r_start;
 			rack->r_ctl.last_sent_tlp_len = rsm->r_end - rsm->r_start;
 		}
 		rack->forced_ack = 0;	/* If we send something zap the FA flag */
 		if (rsm && (doing_tlp == 0)) {
 			/* Set we retransmitted */
 			rack->rc_gp_saw_rec = 1;
 		} else {
 			if (cwnd_to_use > tp->snd_ssthresh) {
 				/* Set we sent in CA */
 				rack->rc_gp_saw_ca = 1;
 			} else {
 				/* Set we sent in SS */
 				rack->rc_gp_saw_ss = 1;
 			}
 		}
 		if (TCPS_HAVEESTABLISHED(tp->t_state) &&
 		    (tp->t_flags & TF_SACK_PERMIT) &&
 		    tp->rcv_numsacks > 0)
 			tcp_clean_dsack_blocks(tp);
 		tot_len_this_send += len;
 		if (len == 0)
 			counter_u64_add(rack_out_size[TCP_MSS_ACCT_SNDACK], 1);
 		else if (len == 1) {
 			counter_u64_add(rack_out_size[TCP_MSS_ACCT_PERSIST], 1);
 		} else if (len > 1) {
 			int idx;
 
 			idx = (len / segsiz) + 3;
 			if (idx >= TCP_MSS_ACCT_ATIMER)
 				counter_u64_add(rack_out_size[(TCP_MSS_ACCT_ATIMER-1)], 1);
 			else
 				counter_u64_add(rack_out_size[idx], 1);
 		}
 	}
 	if ((rack->rack_no_prr == 0) &&
 	    sub_from_prr &&
 	    (error == 0)) {
 		if (rack->r_ctl.rc_prr_sndcnt >= len)
 			rack->r_ctl.rc_prr_sndcnt -= len;
 		else
 			rack->r_ctl.rc_prr_sndcnt = 0;
 	}
 	sub_from_prr = 0;
 	if (doing_tlp) {
 		/* Make sure the TLP is added */
 		add_flag |= RACK_TLP;
 	} else if (rsm) {
 		/* If its a resend without TLP then it must not have the flag */
 		rsm->r_flags &= ~RACK_TLP;
 	}
 	rack_log_output(tp, &to, len, rack_seq, (uint8_t) flags, error,
 			rack_to_usec_ts(&tv),
 			rsm, add_flag, s_mb, s_moff, hw_tls);
 
 
 	if ((error == 0) &&
 	    (len > 0) &&
 	    (tp->snd_una == tp->snd_max))
 		rack->r_ctl.rc_tlp_rxt_last_time = cts;
 	{
 		tcp_seq startseq = tp->snd_nxt;
 
 		/* Track our lost count */
 		if (rsm && (doing_tlp == 0))
 			rack->r_ctl.rc_loss_count += rsm->r_end - rsm->r_start;
 		/*
 		 * Advance snd_nxt over sequence space of this segment.
 		 */
 		if (error)
 			/* We don't log or do anything with errors */
 			goto nomore;
 		if (doing_tlp == 0) {
 			if (rsm == NULL) {
 				/*
 				 * Not a retransmission of some
 				 * sort, new data is going out so
 				 * clear our TLP count and flag.
 				 */
 				rack->rc_tlp_in_progress = 0;
 				rack->r_ctl.rc_tlp_cnt_out = 0;
 			}
 		} else {
 			/*
 			 * We have just sent a TLP, mark that it is true
 			 * and make sure our in progress is set so we
 			 * continue to check the count.
 			 */
 			rack->rc_tlp_in_progress = 1;
 			rack->r_ctl.rc_tlp_cnt_out++;
 		}
 		if (flags & (TH_SYN | TH_FIN)) {
 			if (flags & TH_SYN)
 				tp->snd_nxt++;
 			if (flags & TH_FIN) {
 				tp->snd_nxt++;
 				tp->t_flags |= TF_SENTFIN;
 			}
 		}
 		/* In the ENOBUFS case we do *not* update snd_max */
 		if (sack_rxmit)
 			goto nomore;
 
 		tp->snd_nxt += len;
 		if (SEQ_GT(tp->snd_nxt, tp->snd_max)) {
 			if (tp->snd_una == tp->snd_max) {
 				/*
 				 * Update the time we just added data since
 				 * none was outstanding.
 				 */
 				rack_log_progress_event(rack, tp, ticks, PROGRESS_START, __LINE__);
 				tp->t_acktime = ticks;
 			}
 			tp->snd_max = tp->snd_nxt;
 			/*
 			 * Time this transmission if not a retransmission and
 			 * not currently timing anything.
 			 * This is only relevant in case of switching back to
 			 * the base stack.
 			 */
 			if (tp->t_rtttime == 0) {
 				tp->t_rtttime = ticks;
 				tp->t_rtseq = startseq;
 				KMOD_TCPSTAT_INC(tcps_segstimed);
 			}
 			if (len &&
 			    ((tp->t_flags & TF_GPUTINPROG) == 0))
 				rack_start_gp_measurement(tp, rack, startseq, sb_offset);
 		}
 		/*
 		 * If we are doing FO we need to update the mbuf position and subtract
 		 * this happens when the peer sends us duplicate information and
 		 * we thus want to send a DSACK.
 		 *
 		 * XXXRRS: This brings to mind a ?, when we send a DSACK block is TSO
 		 * turned off? If not then we are going to echo multiple DSACK blocks
 		 * out (with the TSO), which we should not be doing.
 		 */
 		if (rack->r_fast_output && len) {
 			if (rack->r_ctl.fsb.left_to_send > len)
 				rack->r_ctl.fsb.left_to_send -= len;
 			else
 				rack->r_ctl.fsb.left_to_send = 0;
 			if (rack->r_ctl.fsb.left_to_send < segsiz)
 				rack->r_fast_output = 0;
 			if (rack->r_fast_output) {
 				rack->r_ctl.fsb.m = sbsndmbuf(sb, (tp->snd_max - tp->snd_una), &rack->r_ctl.fsb.off);
 				rack->r_ctl.fsb.o_m_len = rack->r_ctl.fsb.m->m_len;
 			}
 		}
 	}
 nomore:
 	if (error) {
 		rack->r_ctl.rc_agg_delayed = 0;
 		rack->r_early = 0;
 		rack->r_late = 0;
 		rack->r_ctl.rc_agg_early = 0;
 		SOCKBUF_UNLOCK_ASSERT(sb);	/* Check gotos. */
 		/*
 		 * Failures do not advance the seq counter above. For the
 		 * case of ENOBUFS we will fall out and retry in 1ms with
 		 * the hpts. Everything else will just have to retransmit
 		 * with the timer.
 		 *
 		 * In any case, we do not want to loop around for another
 		 * send without a good reason.
 		 */
 		sendalot = 0;
 		switch (error) {
 		case EPERM:
 			tp->t_softerror = error;
 #ifdef TCP_ACCOUNTING
 			crtsc = get_cyclecount();
 			if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
 				tp->tcp_cnt_counters[SND_OUT_FAIL]++;
 			}
 			counter_u64_add(tcp_cnt_counters[SND_OUT_FAIL], 1);
 			if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
 				tp->tcp_proc_time[SND_OUT_FAIL] += (crtsc - ts_val);
 			}
 			counter_u64_add(tcp_proc_time[SND_OUT_FAIL], (crtsc - ts_val));
 			sched_unpin();
 #endif
 			return (error);
 		case ENOBUFS:
 			/*
 			 * Pace us right away to retry in a some
 			 * time
 			 */
 			if (rack->r_ctl.crte != NULL) {
 				rack_trace_point(rack, RACK_TP_HWENOBUF);
 			} else
 				rack_trace_point(rack, RACK_TP_ENOBUF);
 			slot = ((1 + rack->rc_enobuf) * HPTS_USEC_IN_MSEC);
 			if (rack->rc_enobuf < 0x7f)
 				rack->rc_enobuf++;
 			if (slot < (10 * HPTS_USEC_IN_MSEC))
 				slot = 10 * HPTS_USEC_IN_MSEC;
 			if (rack->r_ctl.crte != NULL) {
 				counter_u64_add(rack_saw_enobuf_hw, 1);
 				tcp_rl_log_enobuf(rack->r_ctl.crte);
 			}
 			counter_u64_add(rack_saw_enobuf, 1);
 			goto enobufs;
 		case EMSGSIZE:
 			/*
 			 * For some reason the interface we used initially
 			 * to send segments changed to another or lowered
 			 * its MTU. If TSO was active we either got an
 			 * interface without TSO capabilities or TSO was
 			 * turned off. If we obtained mtu from ip_output()
 			 * then update it and try again.
 			 */
 			if (tso)
 				tp->t_flags &= ~TF_TSO;
 			if (mtu != 0) {
 				tcp_mss_update(tp, -1, mtu, NULL, NULL);
 				goto again;
 			}
 			slot = 10 * HPTS_USEC_IN_MSEC;
 			rack_start_hpts_timer(rack, tp, cts, slot, 0, 0);
 #ifdef TCP_ACCOUNTING
 			crtsc = get_cyclecount();
 			if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
 				tp->tcp_cnt_counters[SND_OUT_FAIL]++;
 			}
 			counter_u64_add(tcp_cnt_counters[SND_OUT_FAIL], 1);
 			if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
 				tp->tcp_proc_time[SND_OUT_FAIL] += (crtsc - ts_val);
 			}
 			counter_u64_add(tcp_proc_time[SND_OUT_FAIL], (crtsc - ts_val));
 			sched_unpin();
 #endif
 			return (error);
 		case ENETUNREACH:
 			counter_u64_add(rack_saw_enetunreach, 1);
 		case EHOSTDOWN:
 		case EHOSTUNREACH:
 		case ENETDOWN:
 			if (TCPS_HAVERCVDSYN(tp->t_state)) {
 				tp->t_softerror = error;
 			}
 			/* FALLTHROUGH */
 		default:
 			slot = 10 * HPTS_USEC_IN_MSEC;
 			rack_start_hpts_timer(rack, tp, cts, slot, 0, 0);
 #ifdef TCP_ACCOUNTING
 			crtsc = get_cyclecount();
 			if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
 				tp->tcp_cnt_counters[SND_OUT_FAIL]++;
 			}
 			counter_u64_add(tcp_cnt_counters[SND_OUT_FAIL], 1);
 			if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
 				tp->tcp_proc_time[SND_OUT_FAIL] += (crtsc - ts_val);
 			}
 			counter_u64_add(tcp_proc_time[SND_OUT_FAIL], (crtsc - ts_val));
 			sched_unpin();
 #endif
 			return (error);
 		}
 	} else {
 		rack->rc_enobuf = 0;
 		if (IN_FASTRECOVERY(tp->t_flags) && rsm)
 			rack->r_ctl.retran_during_recovery += len;
 	}
 	KMOD_TCPSTAT_INC(tcps_sndtotal);
 
 	/*
 	 * Data sent (as far as we can tell). If this advertises a larger
 	 * window than any other segment, then remember the size of the
 	 * advertised window. Any pending ACK has now been sent.
 	 */
 	if (recwin > 0 && SEQ_GT(tp->rcv_nxt + recwin, tp->rcv_adv))
 		tp->rcv_adv = tp->rcv_nxt + recwin;
 
 	tp->last_ack_sent = tp->rcv_nxt;
 	tp->t_flags &= ~(TF_ACKNOW | TF_DELACK);
 enobufs:
 	if (sendalot) {
 		/* Do we need to turn off sendalot? */
 		if (rack->r_ctl.rc_pace_max_segs &&
 		    (tot_len_this_send >= rack->r_ctl.rc_pace_max_segs)) {
 			/* We hit our max. */
 			sendalot = 0;
 		} else if ((rack->rc_user_set_max_segs) &&
 			   (tot_len_this_send >= (rack->rc_user_set_max_segs * segsiz))) {
 			/* We hit the user defined max */
 			sendalot = 0;
 		}
 	}
 	if ((error == 0) && (flags & TH_FIN))
 		tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_FIN);
 	if (flags & TH_RST) {
 		/*
 		 * We don't send again after sending a RST.
 		 */
 		slot = 0;
 		sendalot = 0;
 		if (error == 0)
 			tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST);
 	} else if ((slot == 0) && (sendalot == 0) && tot_len_this_send) {
 		/*
 		 * Get our pacing rate, if an error
 		 * occurred in sending (ENOBUF) we would
 		 * hit the else if with slot preset. Other
 		 * errors return.
 		 */
 		slot = rack_get_pacing_delay(rack, tp, tot_len_this_send, rsm, segsiz);
 	}
 	if (rsm &&
 	    (rsm->r_flags & RACK_HAS_SYN) == 0 &&
 	    rack->use_rack_rr) {
 		/* Its a retransmit and we use the rack cheat? */
 		if ((slot == 0) ||
 		    (rack->rc_always_pace == 0) ||
 		    (rack->r_rr_config == 1)) {
 			/*
 			 * We have no pacing set or we
 			 * are using old-style rack or
 			 * we are overridden to use the old 1ms pacing.
 			 */
 			slot = rack->r_ctl.rc_min_to;
 		}
 	}
 	/* We have sent clear the flag */
 	rack->r_ent_rec_ns = 0;
 	if (rack->r_must_retran) {
 		if (rsm) {
 			rack->r_ctl.rc_out_at_rto -= (rsm->r_end - rsm->r_start);
 			if (SEQ_GEQ(rsm->r_end, rack->r_ctl.rc_snd_max_at_rto)) {
 				/*
 				 * We have retransmitted all.
 				 */
 				rack->r_must_retran = 0;
 				rack->r_ctl.rc_out_at_rto = 0;
 			}
 		} else if (SEQ_GEQ(tp->snd_max, rack->r_ctl.rc_snd_max_at_rto)) {
 			/*
 			 * Sending new data will also kill
 			 * the loop.
 			 */
 			rack->r_must_retran = 0;
 			rack->r_ctl.rc_out_at_rto = 0;
 		}
 	}
 	rack->r_ctl.fsb.recwin = recwin;
 	if ((tp->t_flags & (TF_WASCRECOVERY|TF_WASFRECOVERY)) &&
 	    SEQ_GT(tp->snd_max, rack->r_ctl.rc_snd_max_at_rto)) {
 		/*
 		 * We hit an RTO and now have past snd_max at the RTO
 		 * clear all the WAS flags.
 		 */
 		tp->t_flags &= ~(TF_WASCRECOVERY|TF_WASFRECOVERY);
 	}
 	if (slot) {
 		/* set the rack tcb into the slot N */
 		if ((error == 0) &&
 		    rack_use_rfo &&
 		    ((flags & (TH_SYN|TH_FIN)) == 0) &&
 		    (rsm == NULL) &&
 		    (tp->snd_nxt == tp->snd_max) &&
 		    (ipoptlen == 0) &&
 		    (tp->rcv_numsacks == 0) &&
 		    rack->r_fsb_inited &&
 		    TCPS_HAVEESTABLISHED(tp->t_state) &&
 		    (rack->r_must_retran == 0) &&
 		    ((tp->t_flags & TF_NEEDFIN) == 0) &&
 		    (len > 0) && (orig_len > 0) &&
 		    (orig_len > len) &&
 		    ((orig_len - len) >= segsiz) &&
 		    ((optlen == 0) ||
 		     ((optlen == TCPOLEN_TSTAMP_APPA) && (to.to_flags & TOF_TS)))) {
 			/* We can send at least one more MSS using our fsb */
 
 			rack->r_fast_output = 1;
 			rack->r_ctl.fsb.m = sbsndmbuf(sb, (tp->snd_max - tp->snd_una), &rack->r_ctl.fsb.off);
 			rack->r_ctl.fsb.o_m_len = rack->r_ctl.fsb.m->m_len;
 			rack->r_ctl.fsb.tcp_flags = flags;
 			rack->r_ctl.fsb.left_to_send = orig_len - len;
 			if (hw_tls)
 				rack->r_ctl.fsb.hw_tls = 1;
 			else
 				rack->r_ctl.fsb.hw_tls = 0;
 			KASSERT((rack->r_ctl.fsb.left_to_send <= (sbavail(sb) - (tp->snd_max - tp->snd_una))),
 				("rack:%p left_to_send:%u sbavail:%u out:%u",
 				 rack, rack->r_ctl.fsb.left_to_send, sbavail(sb),
 				 (tp->snd_max - tp->snd_una)));
 			if (rack->r_ctl.fsb.left_to_send < segsiz)
 				rack->r_fast_output = 0;
 			else {
 				if (rack->r_ctl.fsb.left_to_send == (sbavail(sb) - (tp->snd_max - tp->snd_una)))
 					rack->r_ctl.fsb.rfo_apply_push = 1;
 				else
 					rack->r_ctl.fsb.rfo_apply_push = 0;
 			}
 		} else
 			rack->r_fast_output = 0;
 		rack_log_fsb(rack, tp, so, flags,
 			     ipoptlen, orig_len, len, error,
 			     (rsm == NULL), optlen, __LINE__, 2);
 	} else if (sendalot) {
 		int ret;
 
 		sack_rxmit = 0;
 		if ((error == 0) &&
 		    rack_use_rfo &&
 		    ((flags & (TH_SYN|TH_FIN)) == 0) &&
 		    (rsm == NULL) &&
 		    (ipoptlen == 0) &&
 		    (tp->rcv_numsacks == 0) &&
 		    (tp->snd_nxt == tp->snd_max) &&
 		    (rack->r_must_retran == 0) &&
 		    rack->r_fsb_inited &&
 		    TCPS_HAVEESTABLISHED(tp->t_state) &&
 		    ((tp->t_flags & TF_NEEDFIN) == 0) &&
 		    (len > 0) && (orig_len > 0) &&
 		    (orig_len > len) &&
 		    ((orig_len - len) >= segsiz) &&
 		    ((optlen == 0) ||
 		     ((optlen == TCPOLEN_TSTAMP_APPA) && (to.to_flags & TOF_TS)))) {
 			/* we can use fast_output for more */
 
 			rack->r_fast_output = 1;
 			rack->r_ctl.fsb.m = sbsndmbuf(sb, (tp->snd_max - tp->snd_una), &rack->r_ctl.fsb.off);
 			rack->r_ctl.fsb.o_m_len = rack->r_ctl.fsb.m->m_len;
 			rack->r_ctl.fsb.tcp_flags = flags;
 			rack->r_ctl.fsb.left_to_send = orig_len - len;
 			if (hw_tls)
 				rack->r_ctl.fsb.hw_tls = 1;
 			else
 				rack->r_ctl.fsb.hw_tls = 0;
 			KASSERT((rack->r_ctl.fsb.left_to_send <= (sbavail(sb) - (tp->snd_max - tp->snd_una))),
 				("rack:%p left_to_send:%u sbavail:%u out:%u",
 				 rack, rack->r_ctl.fsb.left_to_send, sbavail(sb),
 				 (tp->snd_max - tp->snd_una)));
 			if (rack->r_ctl.fsb.left_to_send < segsiz) {
 				rack->r_fast_output = 0;
 			}
 			if (rack->r_fast_output) {
 				if (rack->r_ctl.fsb.left_to_send == (sbavail(sb) - (tp->snd_max - tp->snd_una)))
 					rack->r_ctl.fsb.rfo_apply_push = 1;
 				else
 					rack->r_ctl.fsb.rfo_apply_push = 0;
 				rack_log_fsb(rack, tp, so, flags,
 					     ipoptlen, orig_len, len, error,
 					     (rsm == NULL), optlen, __LINE__, 3);
 				error = 0;
 				ret = rack_fast_output(tp, rack, ts_val, cts, ms_cts, &tv, tot_len_this_send, &error);
 				if (ret >= 0)
 					return (ret);
 			        else if (error)
 					goto nomore;
 
 			}
 		}
 		goto again;
 	}
 	/* Assure when we leave that snd_nxt will point to top */
 	if (SEQ_GT(tp->snd_max, tp->snd_nxt))
 		tp->snd_nxt = tp->snd_max;
 	rack_start_hpts_timer(rack, tp, cts, slot, tot_len_this_send, 0);
 #ifdef TCP_ACCOUNTING
 	crtsc = get_cyclecount() - ts_val;
 	if (tot_len_this_send) {
 		if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
 			tp->tcp_cnt_counters[SND_OUT_DATA]++;
 		}
 		counter_u64_add(tcp_cnt_counters[SND_OUT_DATA], 1);
 		if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
 			tp->tcp_proc_time[SND_OUT_DATA] += crtsc;
 		}
 		counter_u64_add(tcp_proc_time[SND_OUT_DATA], crtsc);
 		if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
 			tp->tcp_cnt_counters[CNT_OF_MSS_OUT] += ((tot_len_this_send + segsiz - 1) /segsiz);
 		}
 		counter_u64_add(tcp_cnt_counters[CNT_OF_MSS_OUT], ((tot_len_this_send + segsiz - 1) /segsiz));
 	} else {
 		if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
 			tp->tcp_cnt_counters[SND_OUT_ACK]++;
 		}
 		counter_u64_add(tcp_cnt_counters[SND_OUT_ACK], 1);
 		if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
 			tp->tcp_proc_time[SND_OUT_ACK] += crtsc;
 		}
 		counter_u64_add(tcp_proc_time[SND_OUT_ACK], crtsc);
 	}
 	sched_unpin();
 #endif
 	if (error == ENOBUFS)
 		error = 0;
 	return (error);
 }
 
 /*
  * Call rack_set_pace_segments() for this connection and, when
  * r_ctl.rc_pace_max_segs differs afterwards, emit a pacing-delay log
  * record that carries the value it had before the call.
  */
 static void
 rack_update_seg(struct tcp_rack *rack)
 {
 	uint32_t prev_max_segs;
 
 	/* Remember the current value so a change can be detected below. */
 	prev_max_segs = rack->r_ctl.rc_pace_max_segs;
 	rack_set_pace_segments(rack->rc_tp, rack, __LINE__, NULL);
 	if (rack->r_ctl.rc_pace_max_segs == prev_max_segs) {
 		/* Nothing changed, nothing to log. */
 		return;
 	}
 	/* NOTE(review): 15 is presumably the log method id -- confirm. */
 	rack_log_pacing_delay_calc(rack, 0, 0, prev_max_segs, 0, 0, 15, __LINE__, NULL, 0);
 }
 
 static void
 rack_mtu_change(struct tcpcb *tp)
 {
 	/*
 	 * The MSS may have changed
 	 */
 	struct tcp_rack *rack;
 	struct rack_sendmap *rsm;
 
 	rack = (struct tcp_rack *)tp->t_fb_ptr;
 	if (rack->r_ctl.rc_pace_min_segs != ctf_fixed_maxseg(tp)) {
 		/*
 		 * The MTU has changed we need to resend everything
 		 * since all we have sent is lost. We first fix
 		 * up the mtu though.
 		 */
 		rack_set_pace_segments(tp, rack, __LINE__, NULL);
 		/* We treat this like a full retransmit timeout without the cwnd adjustment */
 		rack_remxt_tmr(tp);
 		rack->r_fast_output = 0;
 		rack->r_ctl.rc_out_at_rto = ctf_flight_size(tp,
 						rack->r_ctl.rc_sacked);
 		rack->r_ctl.rc_snd_max_at_rto = tp->snd_max;
 		rack->r_must_retran = 1;
 		/* Mark all inflight to needing to be rxt'd */
 		TAILQ_FOREACH(rsm, &rack->r_ctl.rc_tmap, r_tnext) {
 			rsm->r_flags |= RACK_MUST_RXT;
 		}
 	}
 	sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una);
 	/* We don't use snd_nxt to retransmit */
 	tp->snd_nxt = tp->snd_max;
 }
 
 static int
 rack_set_profile(struct tcp_rack *rack, int prof)
 {
 	int err = EINVAL;
 	if (prof == 1) {
 		/* pace_always=1 */
 		if (rack->rc_always_pace == 0) {
 			if (tcp_can_enable_pacing() == 0)
 				return (EBUSY);
 		}
 		rack->rc_always_pace = 1;
 		if (rack->use_fixed_rate || rack->gp_ready)
 			rack_set_cc_pacing(rack);
 		rack->rc_inp->inp_flags2 |= INP_SUPPORTS_MBUFQ;
 		rack->rack_attempt_hdwr_pace = 0;
 		/* cmpack=1 */
 		if (rack_use_cmp_acks)
 			rack->r_use_cmp_ack = 1;
 		if (TCPS_HAVEESTABLISHED(rack->rc_tp->t_state) &&
 		    rack->r_use_cmp_ack)
 			rack->rc_inp->inp_flags2 |= INP_MBUF_ACKCMP;
 		/* scwnd=1 */
 		rack->rack_enable_scwnd = 1;
 		/* dynamic=100 */
 		rack->rc_gp_dyn_mul = 1;
 		/* gp_inc_ca */
 		rack->r_ctl.rack_per_of_gp_ca = 100;
 		/* rrr_conf=3 */
 		rack->r_rr_config = 3;
 		/* npush=2 */
 		rack->r_ctl.rc_no_push_at_mrtt = 2;
 		/* fillcw=1 */
 		rack->rc_pace_to_cwnd = 1;
 		rack->rc_pace_fill_if_rttin_range = 0;
 		rack->rtt_limit_mul = 0;
 		/* noprr=1 */
 		rack->rack_no_prr = 1;
 		/* lscwnd=1 */
 		rack->r_limit_scw = 1;
 		/* gp_inc_rec */
 		rack->r_ctl.rack_per_of_gp_rec = 90;
 		err = 0;
 
 	} else if (prof == 3) {
 		/* Same as profile one execept fill_cw becomes 2 (less aggressive set) */
 		/* pace_always=1 */
 		if (rack->rc_always_pace == 0) {
 			if (tcp_can_enable_pacing() == 0)
 				return (EBUSY);
 		}
 		rack->rc_always_pace = 1;
 		if (rack->use_fixed_rate || rack->gp_ready)
 			rack_set_cc_pacing(rack);
 		rack->rc_inp->inp_flags2 |= INP_SUPPORTS_MBUFQ;
 		rack->rack_attempt_hdwr_pace = 0;
 		/* cmpack=1 */
 		if (rack_use_cmp_acks)
 			rack->r_use_cmp_ack = 1;
 		if (TCPS_HAVEESTABLISHED(rack->rc_tp->t_state) &&
 		    rack->r_use_cmp_ack)
 			rack->rc_inp->inp_flags2 |= INP_MBUF_ACKCMP;
 		/* scwnd=1 */
 		rack->rack_enable_scwnd = 1;
 		/* dynamic=100 */
 		rack->rc_gp_dyn_mul = 1;
 		/* gp_inc_ca */
 		rack->r_ctl.rack_per_of_gp_ca = 100;
 		/* rrr_conf=3 */
 		rack->r_rr_config = 3;
 		/* npush=2 */
 		rack->r_ctl.rc_no_push_at_mrtt = 2;
 		/* fillcw=2 */
 		rack->rc_pace_to_cwnd = 1;
 		rack->r_fill_less_agg = 1;
 		rack->rc_pace_fill_if_rttin_range = 0;
 		rack->rtt_limit_mul = 0;
 		/* noprr=1 */
 		rack->rack_no_prr = 1;
 		/* lscwnd=1 */
 		rack->r_limit_scw = 1;
 		/* gp_inc_rec */
 		rack->r_ctl.rack_per_of_gp_rec = 90;
 		err = 0;
 
 
 	} else if (prof == 2) {
 		/* cmpack=1 */
 		if (rack->rc_always_pace == 0) {
 			if (tcp_can_enable_pacing() == 0)
 				return (EBUSY);
 		}
 		rack->rc_always_pace = 1;
 		if (rack->use_fixed_rate || rack->gp_ready)
 			rack_set_cc_pacing(rack);
 		rack->r_use_cmp_ack = 1;
 		if (TCPS_HAVEESTABLISHED(rack->rc_tp->t_state))
 			rack->rc_inp->inp_flags2 |= INP_MBUF_ACKCMP;
 		/* pace_always=1 */
 		rack->rc_inp->inp_flags2 |= INP_SUPPORTS_MBUFQ;
 		/* scwnd=1 */
 		rack->rack_enable_scwnd = 1;
 		/* dynamic=100 */
 		rack->rc_gp_dyn_mul = 1;
 		rack->r_ctl.rack_per_of_gp_ca = 100;
 		/* rrr_conf=3 */
 		rack->r_rr_config = 3;
 		/* npush=2 */
 		rack->r_ctl.rc_no_push_at_mrtt = 2;
 		/* fillcw=1 */
 		rack->rc_pace_to_cwnd = 1;
 		rack->rc_pace_fill_if_rttin_range = 0;
 		rack->rtt_limit_mul = 0;
 		/* noprr=1 */
 		rack->rack_no_prr = 1;
 		/* lscwnd=0 */
 		rack->r_limit_scw = 0;
 		err = 0;
 	} else if (prof == 0) {
 		/* This changes things back to the default settings */
 		err = 0;
 		if (rack->rc_always_pace) {
 			tcp_decrement_paced_conn();
 			rack_undo_cc_pacing(rack);
 			rack->rc_always_pace = 0;
 		}
 		if (rack_pace_every_seg && tcp_can_enable_pacing()) {
 			rack->rc_always_pace = 1;
 			if (rack->use_fixed_rate || rack->gp_ready)
 				rack_set_cc_pacing(rack);
 		} else
 			rack->rc_always_pace = 0;
 		if (rack_dsack_std_based & 0x1) {
 			/* Basically this means all rack timers are at least (srtt + 1/4 srtt) */
 			rack->rc_rack_tmr_std_based = 1;
 		}
 		if (rack_dsack_std_based & 0x2) {
 			/* Basically this means  rack timers are extended based on dsack by up to (2 * srtt) */
 			rack->rc_rack_use_dsack = 1;
 		}
 		if (rack_use_cmp_acks)
 			rack->r_use_cmp_ack = 1;
 		else
 			rack->r_use_cmp_ack = 0;
 		if (rack_disable_prr)
 			rack->rack_no_prr = 1;
 		else
 			rack->rack_no_prr = 0;
 		if (rack_gp_no_rec_chg)
 			rack->rc_gp_no_rec_chg = 1;
 		else
 			rack->rc_gp_no_rec_chg = 0;
 		if (rack_enable_mqueue_for_nonpaced || rack->r_use_cmp_ack) {
 			rack->r_mbuf_queue = 1;
 			if (TCPS_HAVEESTABLISHED(rack->rc_tp->t_state))
 				rack->rc_inp->inp_flags2 |= INP_MBUF_ACKCMP;
 			rack->rc_inp->inp_flags2 |= INP_SUPPORTS_MBUFQ;
 		} else {
 			rack->r_mbuf_queue = 0;
 			rack->rc_inp->inp_flags2 &= ~INP_SUPPORTS_MBUFQ;
 		}
 		if (rack_enable_shared_cwnd)
 			rack->rack_enable_scwnd = 1;
 		else
 			rack->rack_enable_scwnd = 0;
 		if (rack_do_dyn_mul) {
 			/* When dynamic adjustment is on CA needs to start at 100% */
 			rack->rc_gp_dyn_mul = 1;
 			if (rack_do_dyn_mul >= 100)
 				rack->r_ctl.rack_per_of_gp_ca = rack_do_dyn_mul;
 		} else {
 			rack->r_ctl.rack_per_of_gp_ca = rack_per_of_gp_ca;
 			rack->rc_gp_dyn_mul = 0;
 		}
 		rack->r_rr_config = 0;
 		rack->r_ctl.rc_no_push_at_mrtt = 0;
 		rack->rc_pace_to_cwnd = 0;
 		rack->rc_pace_fill_if_rttin_range = 0;
 		rack->rtt_limit_mul = 0;
 
 		if (rack_enable_hw_pacing)
 			rack->rack_hdw_pace_ena = 1;
 		else
 			rack->rack_hdw_pace_ena = 0;
 		if (rack_disable_prr)
 			rack->rack_no_prr = 1;
 		else
 			rack->rack_no_prr = 0;
 		if (rack_limits_scwnd)
 			rack->r_limit_scw  = 1;
 		else
 			rack->r_limit_scw  = 0;
 		err = 0;
 	}
 	return (err);
 }
 
 /*
  * Queue a socket option so that it can be applied later.
  *
  * A deferred_opt_list entry is allocated (non-sleeping, zero-filled),
  * filled in with the option name and its 64-bit value, and appended to
  * rack->r_ctl.opt_list.  The entry is allocated with the M_TCPDO malloc
  * type because that is the type rack_apply_deferred_options() hands to
  * free() when it drains the list; malloc(9) requires the two to match.
  *
  * Returns 1 when the option was queued and 0 when no memory was available.
  */
 static int
 rack_add_deferred_option(struct tcp_rack *rack, int sopt_name, uint64_t loptval)
 {
 	struct deferred_opt_list *dol;
 
 	dol = malloc(sizeof(*dol), M_TCPDO, M_NOWAIT | M_ZERO);
 	if (dol == NULL) {
 		/*
 		 * No space yikes -- fail out..
 		 */
 		return (0);
 	}
 	dol->optname = sopt_name;
 	dol->optval = loptval;
 	TAILQ_INSERT_TAIL(&rack->r_ctl.opt_list, dol, next);
 	return (1);
 }
 
 /*
  * Apply a single RACK socket option to a connection.
  *
  * tp, rack  - the connection and its RACK state.
  * sopt_name - option identifier; one of the TCP_* cases in the switch.
  * optval    - 32-bit value of the option.
  * loptval   - 64-bit value of the option; only TCP_PACING_RATE_CAP reads it.
  *
  * Returns 0 on success, otherwise an errno value: EINVAL, ENOENT,
  * EALREADY or ENOSPC set directly below, or whatever the CC module's
  * ctl_output hook or rack_set_profile() hands back.  An option name that
  * has no case in the switch is ignored and 0 is returned.
  */
 static int
 rack_process_option(struct tcpcb *tp, struct tcp_rack *rack, int sopt_name,
 		    uint32_t optval, uint64_t loptval)
 {
 	struct epoch_tracker et;
 	struct sockopt sopt;
 	struct cc_newreno_opts opt;
 	uint64_t val;
 	int error = 0;
 	uint16_t ca, ss;
 
 	switch (sopt_name) {
 
 	case TCP_RACK_DSACK_OPT:
 		RACK_OPTS_INC(tcp_rack_dsack_opt);
 		/* Bit 0 drives rc_rack_tmr_std_based, bit 1 drives rc_rack_use_dsack. */
 		if (optval & 0x1) {
 			rack->rc_rack_tmr_std_based = 1;
 		} else {
 			rack->rc_rack_tmr_std_based = 0;
 		}
 		if (optval & 0x2) {
 			rack->rc_rack_use_dsack = 1;
 		} else {
 			rack->rc_rack_use_dsack = 0;
 		}
 		rack_log_dsack_event(rack, 5, __LINE__, 0, 0);
 		break;
 	case TCP_RACK_PACING_BETA:
 		RACK_OPTS_INC(tcp_rack_beta);
 		if (strcmp(tp->cc_algo->name, CCALGONAME_NEWRENO) != 0) {
 			/* This only works for newreno. */
 			error = EINVAL;
 			break;
 		}
 		if (rack->rc_pacing_cc_set) {
 			/*
 			 * Set them into the real CC module
 			 * whats in the rack pcb is the old values
 			 * to be used on restoral/
 			 */
 			sopt.sopt_dir = SOPT_SET;
 			opt.name = CC_NEWRENO_BETA;
 			opt.val = optval;
 			if (CC_ALGO(tp)->ctl_output != NULL)
 				error = CC_ALGO(tp)->ctl_output(tp->ccv, &sopt, &opt);
 			else {
 				error = ENOENT;
 				break;
 			}
 		} else {
 			/*
 			 * Not pacing yet so set it into our local
 			 * rack pcb storage.
 			 */
 			rack->r_ctl.rc_saved_beta.beta = optval;
 		}
 		break;
 	case TCP_RACK_TIMER_SLOP:
 		RACK_OPTS_INC(tcp_rack_timer_slop);
 		rack->r_ctl.timer_slop = optval;
 		if (rack->rc_tp->t_srtt) {
 			/*
 			 * If we have an SRTT lets update t_rxtcur
 			 * to have the new slop.
 			 */
 			RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp),
 					   rack_rto_min, rack_rto_max,
 					   rack->r_ctl.timer_slop);
 		}
 		break;
 	case TCP_RACK_PACING_BETA_ECN:
 		RACK_OPTS_INC(tcp_rack_beta_ecn);
 		if (strcmp(tp->cc_algo->name, CCALGONAME_NEWRENO) != 0) {
 			/* This only works for newreno. */
 			error = EINVAL;
 			break;
 		}
 		if (rack->rc_pacing_cc_set) {
 			/*
 			 * Set them into the real CC module
 			 * whats in the rack pcb is the old values
 			 * to be used on restoral/
 			 */
 			sopt.sopt_dir = SOPT_SET;
 			opt.name = CC_NEWRENO_BETA_ECN;
 			opt.val = optval;
 			if (CC_ALGO(tp)->ctl_output != NULL)
 				error = CC_ALGO(tp)->ctl_output(tp->ccv, &sopt, &opt);
 			else
 				error = ENOENT;
 		} else {
 			/*
 			 * Not pacing yet so set it into our local
 			 * rack pcb storage.
 			 */
 			rack->r_ctl.rc_saved_beta.beta_ecn = optval;
 			rack->r_ctl.rc_saved_beta.newreno_flags = CC_NEWRENO_BETA_ECN_ENABLED;
 		}
 		break;
 	case TCP_DEFER_OPTIONS:
 		RACK_OPTS_INC(tcp_defer_opt);
 		if (optval) {
 			if (rack->gp_ready) {
 				/* Too late */
 				error = EINVAL;
 				break;
 			}
 			rack->defer_options = 1;
 		} else
 			rack->defer_options = 0;
 		break;
 	case TCP_RACK_MEASURE_CNT:
 		RACK_OPTS_INC(tcp_rack_measure_cnt);
 		/* Accepted range is 1..0xff. */
 		if (optval && (optval <= 0xff)) {
 			rack->r_ctl.req_measurements = optval;
 		} else
 			error = EINVAL;
 		break;
 	case TCP_REC_ABC_VAL:
 		RACK_OPTS_INC(tcp_rec_abc_val);
 		if (optval > 0)
 			rack->r_use_labc_for_rec = 1;
 		else
 			rack->r_use_labc_for_rec = 0;
 		break;
 	case TCP_RACK_ABC_VAL:
 		RACK_OPTS_INC(tcp_rack_abc_val);
 		/* Accepted range is 1..254. */
 		if ((optval > 0) && (optval < 255))
 			rack->rc_labc = optval;
 		else
 			error = EINVAL;
 		break;
 	case TCP_HDWR_UP_ONLY:
 		RACK_OPTS_INC(tcp_pacing_up_only);
 		if (optval)
 			rack->r_up_only = 1;
 		else
 			rack->r_up_only = 0;
 		break;
 	case TCP_PACING_RATE_CAP:
 		RACK_OPTS_INC(tcp_pacing_rate_cap);
 		/* The only option that consumes the 64-bit value. */
 		rack->r_ctl.bw_rate_cap = loptval;
 		break;
 	case TCP_RACK_PROFILE:
 		RACK_OPTS_INC(tcp_profile);
 		error = rack_set_profile(rack, optval);
 		break;
 	case TCP_USE_CMP_ACKS:
 		RACK_OPTS_INC(tcp_use_cmp_acks);
 		if ((optval == 0) && (rack->rc_inp->inp_flags2 & INP_MBUF_ACKCMP)) {
 			/* You can't turn it off once its on! */
 			error = EINVAL;
 		} else if ((optval == 1) && (rack->r_use_cmp_ack == 0)) {
 			rack->r_use_cmp_ack = 1;
 			rack->r_mbuf_queue = 1;
 			tp->t_inpcb->inp_flags2 |= INP_SUPPORTS_MBUFQ;
 		}
 		if (rack->r_use_cmp_ack && TCPS_HAVEESTABLISHED(tp->t_state))
 			rack->rc_inp->inp_flags2 |= INP_MBUF_ACKCMP;
 		break;
 	case TCP_SHARED_CWND_TIME_LIMIT:
 		RACK_OPTS_INC(tcp_lscwnd);
 		if (optval)
 			rack->r_limit_scw = 1;
 		else
 			rack->r_limit_scw = 0;
 		break;
  	case TCP_RACK_PACE_TO_FILL:
 		RACK_OPTS_INC(tcp_fillcw);
 		/* Note: this case sets r_fill_less_agg but never clears it. */
 		if (optval == 0)
 			rack->rc_pace_to_cwnd = 0;
 		else {
 			rack->rc_pace_to_cwnd = 1;
 			if (optval > 1)
 				rack->r_fill_less_agg = 1;
 		}
 		if ((optval >= rack_gp_rtt_maxmul) &&
 		    rack_gp_rtt_maxmul &&
 		    (optval < 0xf)) {
 			rack->rc_pace_fill_if_rttin_range = 1;
 			rack->rtt_limit_mul = optval;
 		} else {
 			rack->rc_pace_fill_if_rttin_range = 0;
 			rack->rtt_limit_mul = 0;
 		}
 		break;
 	case TCP_RACK_NO_PUSH_AT_MAX:
 		RACK_OPTS_INC(tcp_npush);
 		if (optval == 0)
 			rack->r_ctl.rc_no_push_at_mrtt = 0;
 		else if (optval < 0xff)
 			rack->r_ctl.rc_no_push_at_mrtt = optval;
 		else
 			error = EINVAL;
 		break;
 	case TCP_SHARED_CWND_ENABLE:
 		RACK_OPTS_INC(tcp_rack_scwnd);
 		if (optval == 0)
 			rack->rack_enable_scwnd = 0;
 		else
 			rack->rack_enable_scwnd = 1;
 		break;
 	case TCP_RACK_MBUF_QUEUE:
 		/* Now do we use the LRO mbuf-queue feature */
 		RACK_OPTS_INC(tcp_rack_mbufq);
 		if (optval || rack->r_use_cmp_ack)
 			rack->r_mbuf_queue = 1;
 		else
 			rack->r_mbuf_queue = 0;
 		if  (rack->r_mbuf_queue || rack->rc_always_pace || rack->r_use_cmp_ack)
 			tp->t_inpcb->inp_flags2 |= INP_SUPPORTS_MBUFQ;
 		else
 			tp->t_inpcb->inp_flags2 &= ~INP_SUPPORTS_MBUFQ;
 		break;
 	case TCP_RACK_NONRXT_CFG_RATE:
 		RACK_OPTS_INC(tcp_rack_cfg_rate);
 		if (optval == 0)
 			rack->rack_rec_nonrxt_use_cr = 0;
 		else
 			rack->rack_rec_nonrxt_use_cr = 1;
 		break;
 	case TCP_NO_PRR:
 		RACK_OPTS_INC(tcp_rack_noprr);
 		/* 0/1 toggle rack_no_prr; 2 only sets no_prr_addback. */
 		if (optval == 0)
 			rack->rack_no_prr = 0;
 		else if (optval == 1)
 			rack->rack_no_prr = 1;
 		else if (optval == 2)
 			rack->no_prr_addback = 1;
 		else
 			error = EINVAL;
 		break;
 	case TCP_TIMELY_DYN_ADJ:
 		RACK_OPTS_INC(tcp_timely_dyn);
 		if (optval == 0)
 			rack->rc_gp_dyn_mul = 0;
 		else {
 			rack->rc_gp_dyn_mul = 1;
 			if (optval >= 100) {
 				/*
 				 * If the user sets something 100 or more
 				 * its the gp_ca value.
 				 */
 				rack->r_ctl.rack_per_of_gp_ca  = optval;
 			}
 		}
 		break;
 	case TCP_RACK_DO_DETECTION:
 		RACK_OPTS_INC(tcp_rack_do_detection);
 		if (optval == 0)
 			rack->do_detection = 0;
 		else
 			rack->do_detection = 1;
 		break;
 	case TCP_RACK_TLP_USE:
 		/* Validated before the counter is bumped, unlike most cases. */
 		if ((optval < TLP_USE_ID) || (optval > TLP_USE_TWO_TWO)) {
 			error = EINVAL;
 			break;
 		}
 		RACK_OPTS_INC(tcp_tlp_use);
 		rack->rack_tlp_threshold_use = optval;
 		break;
 	case TCP_RACK_TLP_REDUCE:
 		/* RACK TLP cwnd reduction (bool) */
 		RACK_OPTS_INC(tcp_rack_tlp_reduce);
 		rack->r_ctl.rc_tlp_cwnd_reduce = optval;
 		break;
 	/*  Pacing related ones */
 	case TCP_RACK_PACE_ALWAYS:
 		/*
 		 * zero is old rack method, 1 is new
 		 * method using a pacing rate.
 		 */
 		RACK_OPTS_INC(tcp_rack_pace_always);
 		if (optval > 0) {
 			if (rack->rc_always_pace) {
 				error = EALREADY;
 				break;
 			} else if (tcp_can_enable_pacing()) {
 				rack->rc_always_pace = 1;
 				if (rack->use_fixed_rate || rack->gp_ready)
 					rack_set_cc_pacing(rack);
 			}
 			else {
 				error = ENOSPC;
 				break;
 			}
 		} else {
 			if (rack->rc_always_pace) {
 				tcp_decrement_paced_conn();
 				rack->rc_always_pace = 0;
 				rack_undo_cc_pacing(rack);
 			}
 		}
 		if  (rack->r_mbuf_queue || rack->rc_always_pace || rack->r_use_cmp_ack)
 			tp->t_inpcb->inp_flags2 |= INP_SUPPORTS_MBUFQ;
 		else
 			tp->t_inpcb->inp_flags2 &= ~INP_SUPPORTS_MBUFQ;
 		/* A rate may be set irate or other, if so set seg size */
 		rack_update_seg(rack);
 		break;
 	case TCP_BBR_RACK_INIT_RATE:
 		RACK_OPTS_INC(tcp_initial_rate);
 		val = optval;
 		/* Change from kbits per second to bytes per second */
 		val *= 1000;
 		val /= 8;
 		rack->r_ctl.init_rate = val;
 		if (rack->rc_init_win != rack_default_init_window) {
 			uint32_t win, snt;
 
 			/*
 			 * Options don't always get applied
 			 * in the order you think. So in order
 			 * to assure we update a cwnd we need
 			 * to check and see if we are still
 			 * where we should raise the cwnd.
 			 */
 			win = rc_init_window(rack);
 			if (SEQ_GT(tp->snd_max, tp->iss))
 				snt = tp->snd_max - tp->iss;
 			else
 				snt = 0;
 			if ((snt < win) &&
 			    (tp->snd_cwnd < win))
 				tp->snd_cwnd = win;
 		}
 		if (rack->rc_always_pace)
 			rack_update_seg(rack);
 		break;
 	case TCP_BBR_IWINTSO:
 		RACK_OPTS_INC(tcp_initial_win);
 		if (optval && (optval <= 0xff)) {
 			uint32_t win, snt;
 
 			rack->rc_init_win = optval;
 			win = rc_init_window(rack);
 			if (SEQ_GT(tp->snd_max, tp->iss))
 				snt = tp->snd_max - tp->iss;
 			else
 				snt = 0;
 			if ((snt < win) &&
 			    (tp->t_srtt |
 #ifdef NETFLIX_PEAKRATE
 			     tp->t_maxpeakrate |
 #endif
 			     rack->r_ctl.init_rate)) {
 				/*
 				 * We are not past the initial window
 				 * and we have some bases for pacing,
 				 * so we need to possibly adjust up
 				 * the cwnd. Note even if we don't set
 				 * the cwnd, its still ok to raise the rc_init_win
 				 * which can be used coming out of idle when we
 				 * would have a rate.
 				 */
 				if (tp->snd_cwnd < win)
 					tp->snd_cwnd = win;
 			}
 			if (rack->rc_always_pace)
 				rack_update_seg(rack);
 		} else
 			error = EINVAL;
 		break;
 	case TCP_RACK_FORCE_MSEG:
 		RACK_OPTS_INC(tcp_rack_force_max_seg);
 		if (optval)
 			rack->rc_force_max_seg = 1;
 		else
 			rack->rc_force_max_seg = 0;
 		break;
 	case TCP_RACK_PACE_MAX_SEG:
 		/* Max segments size in a pace in bytes */
 		RACK_OPTS_INC(tcp_rack_max_seg);
 		rack->rc_user_set_max_segs = optval;
 		rack_set_pace_segments(tp, rack, __LINE__, NULL);
 		break;
 	case TCP_RACK_PACE_RATE_REC:
 		/* Set the fixed pacing rate in Bytes per second ca */
 		RACK_OPTS_INC(tcp_rack_pace_rate_rec);
 		rack->r_ctl.rc_fixed_pacing_rate_rec = optval;
 		/* Seed the other two fixed rates if they are still unset. */
 		if (rack->r_ctl.rc_fixed_pacing_rate_ca == 0)
 			rack->r_ctl.rc_fixed_pacing_rate_ca = optval;
 		if (rack->r_ctl.rc_fixed_pacing_rate_ss == 0)
 			rack->r_ctl.rc_fixed_pacing_rate_ss = optval;
 		rack->use_fixed_rate = 1;
 		if (rack->rc_always_pace)
 			rack_set_cc_pacing(rack);
 		rack_log_pacing_delay_calc(rack,
 					   rack->r_ctl.rc_fixed_pacing_rate_ss,
 					   rack->r_ctl.rc_fixed_pacing_rate_ca,
 					   rack->r_ctl.rc_fixed_pacing_rate_rec, 0, 0, 8,
 					   __LINE__, NULL,0);
 		break;
 
 	case TCP_RACK_PACE_RATE_SS:
 		/* Set the fixed pacing rate in Bytes per second ca */
 		RACK_OPTS_INC(tcp_rack_pace_rate_ss);
 		rack->r_ctl.rc_fixed_pacing_rate_ss = optval;
 		/* Seed the other two fixed rates if they are still unset. */
 		if (rack->r_ctl.rc_fixed_pacing_rate_ca == 0)
 			rack->r_ctl.rc_fixed_pacing_rate_ca = optval;
 		if (rack->r_ctl.rc_fixed_pacing_rate_rec == 0)
 			rack->r_ctl.rc_fixed_pacing_rate_rec = optval;
 		rack->use_fixed_rate = 1;
 		if (rack->rc_always_pace)
 			rack_set_cc_pacing(rack);
 		rack_log_pacing_delay_calc(rack,
 					   rack->r_ctl.rc_fixed_pacing_rate_ss,
 					   rack->r_ctl.rc_fixed_pacing_rate_ca,
 					   rack->r_ctl.rc_fixed_pacing_rate_rec, 0, 0, 8,
 					   __LINE__, NULL, 0);
 		break;
 
 	case TCP_RACK_PACE_RATE_CA:
 		/* Set the fixed pacing rate in Bytes per second ca */
 		RACK_OPTS_INC(tcp_rack_pace_rate_ca);
 		rack->r_ctl.rc_fixed_pacing_rate_ca = optval;
 		/* Seed the other two fixed rates if they are still unset. */
 		if (rack->r_ctl.rc_fixed_pacing_rate_ss == 0)
 			rack->r_ctl.rc_fixed_pacing_rate_ss = optval;
 		if (rack->r_ctl.rc_fixed_pacing_rate_rec == 0)
 			rack->r_ctl.rc_fixed_pacing_rate_rec = optval;
 		rack->use_fixed_rate = 1;
 		if (rack->rc_always_pace)
 			rack_set_cc_pacing(rack);
 		rack_log_pacing_delay_calc(rack,
 					   rack->r_ctl.rc_fixed_pacing_rate_ss,
 					   rack->r_ctl.rc_fixed_pacing_rate_ca,
 					   rack->r_ctl.rc_fixed_pacing_rate_rec, 0, 0, 8,
 					   __LINE__, NULL, 0);
 		break;
 	case TCP_RACK_GP_INCREASE_REC:
 		RACK_OPTS_INC(tcp_gp_inc_rec);
 		/* Unlike the CA and SS variants no lower bound is enforced here. */
 		rack->r_ctl.rack_per_of_gp_rec = optval;
 		rack_log_pacing_delay_calc(rack,
 					   rack->r_ctl.rack_per_of_gp_ss,
 					   rack->r_ctl.rack_per_of_gp_ca,
 					   rack->r_ctl.rack_per_of_gp_rec, 0, 0, 1,
 					   __LINE__, NULL, 0);
 		break;
 	case TCP_RACK_GP_INCREASE_CA:
 		RACK_OPTS_INC(tcp_gp_inc_ca);
 		/* optval is narrowed to 16 bits before the range check. */
 		ca = optval;
 		if (ca < 100) {
 			/*
 			 * We don't allow any reduction
 			 * over the GP b/w.
 			 */
 			error = EINVAL;
 			break;
 		}
 		rack->r_ctl.rack_per_of_gp_ca = ca;
 		rack_log_pacing_delay_calc(rack,
 					   rack->r_ctl.rack_per_of_gp_ss,
 					   rack->r_ctl.rack_per_of_gp_ca,
 					   rack->r_ctl.rack_per_of_gp_rec, 0, 0, 1,
 					   __LINE__, NULL, 0);
 		break;
 	case TCP_RACK_GP_INCREASE_SS:
 		RACK_OPTS_INC(tcp_gp_inc_ss);
 		/* optval is narrowed to 16 bits before the range check. */
 		ss = optval;
 		if (ss < 100) {
 			/*
 			 * We don't allow any reduction
 			 * over the GP b/w.
 			 */
 			error = EINVAL;
 			break;
 		}
 		rack->r_ctl.rack_per_of_gp_ss = ss;
 		rack_log_pacing_delay_calc(rack,
 					   rack->r_ctl.rack_per_of_gp_ss,
 					   rack->r_ctl.rack_per_of_gp_ca,
 					   rack->r_ctl.rack_per_of_gp_rec, 0, 0, 1,
 					   __LINE__, NULL, 0);
 		break;
 	case TCP_RACK_RR_CONF:
 		RACK_OPTS_INC(tcp_rack_rrr_no_conf_rate);
 		/* Values outside 1..3 silently fall back to 0 (no error). */
 		if (optval && optval <= 3)
 			rack->r_rr_config = optval;
 		else
 			rack->r_rr_config = 0;
 		break;
 	case TCP_HDWR_RATE_CAP:
 		RACK_OPTS_INC(tcp_hdwr_rate_cap);
 		if (optval) {
 			if (rack->r_rack_hw_rate_caps == 0)
 				rack->r_rack_hw_rate_caps = 1;
 			else
 				error = EALREADY;
 		} else {
 			rack->r_rack_hw_rate_caps = 0;
 		}
 		break;
 	case TCP_BBR_HDWR_PACE:
 		RACK_OPTS_INC(tcp_hdwr_pacing);
 		if (optval){
 			if (rack->rack_hdrw_pacing == 0) {
 				rack->rack_hdw_pace_ena = 1;
 				rack->rack_attempt_hdwr_pace = 0;
 			} else
 				error = EALREADY;
 		} else {
 			rack->rack_hdw_pace_ena = 0;
 #ifdef RATELIMIT
 			/* Release the pacing rate entry we were holding, if any. */
 			if (rack->r_ctl.crte != NULL) {
 				rack->rack_hdrw_pacing = 0;
 				rack->rack_attempt_hdwr_pace = 0;
 				tcp_rel_pacing_rate(rack->r_ctl.crte, tp);
 				rack->r_ctl.crte = NULL;
 			}
 #endif
 		}
 		break;
 	/*  End Pacing related ones */
 	case TCP_RACK_PRR_SENDALOT:
 		/* Allow PRR to send more than one seg */
 		RACK_OPTS_INC(tcp_rack_prr_sendalot);
 		rack->r_ctl.rc_prr_sendalot = optval;
 		break;
 	case TCP_RACK_MIN_TO:
 		/* Minimum time between rack t-o's in ms */
 		RACK_OPTS_INC(tcp_rack_min_to);
 		rack->r_ctl.rc_min_to = optval;
 		break;
 	case TCP_RACK_EARLY_SEG:
 		/* If early recovery max segments */
 		RACK_OPTS_INC(tcp_rack_early_seg);
 		rack->r_ctl.rc_early_recovery_segs = optval;
 		break;
 	case TCP_RACK_ENABLE_HYSTART:
 	{
 		/*
 		 * Non-zero enables hystart; which extra CCF_HYSTART_* flags are
 		 * added depends on the global rack_do_hystart, not on optval.
 		 * NOTE(review): no RACK_OPTS_INC counter is bumped for this
 		 * option, unlike the other cases -- confirm that is intended.
 		 */
 		if (optval) {
 			tp->ccv->flags |= CCF_HYSTART_ALLOWED;
 			if (rack_do_hystart > RACK_HYSTART_ON)
 				tp->ccv->flags |= CCF_HYSTART_CAN_SH_CWND;
 			if (rack_do_hystart > RACK_HYSTART_ON_W_SC)
 				tp->ccv->flags |= CCF_HYSTART_CONS_SSTH;
 		} else {
 			tp->ccv->flags &= ~(CCF_HYSTART_ALLOWED|CCF_HYSTART_CAN_SH_CWND|CCF_HYSTART_CONS_SSTH);
 		}
 	}
 	break;
 	case TCP_RACK_REORD_THRESH:
 		/* RACK reorder threshold (shift amount) */
 		RACK_OPTS_INC(tcp_rack_reord_thresh);
 		if ((optval > 0) && (optval < 31))
 			rack->r_ctl.rc_reorder_shift = optval;
 		else
 			error = EINVAL;
 		break;
 	case TCP_RACK_REORD_FADE:
 		/* Does reordering fade after ms time */
 		RACK_OPTS_INC(tcp_rack_reord_fade);
 		rack->r_ctl.rc_reorder_fade = optval;
 		break;
 	case TCP_RACK_TLP_THRESH:
 		/* RACK TLP threshold i.e. srtt+(srtt/N) */
 		RACK_OPTS_INC(tcp_rack_tlp_thresh);
 		if (optval)
 			rack->r_ctl.rc_tlp_threshold = optval;
 		else
 			error = EINVAL;
 		break;
 	case TCP_BBR_USE_RACK_RR:
 		RACK_OPTS_INC(tcp_rack_rr);
 		if (optval)
 			rack->use_rack_rr = 1;
 		else
 			rack->use_rack_rr = 0;
 		break;
 	case TCP_FAST_RSM_HACK:
 		RACK_OPTS_INC(tcp_rack_fastrsm_hack);
 		if (optval)
 			rack->fast_rsm_hack = 1;
 		else
 			rack->fast_rsm_hack = 0;
 		break;
 	case TCP_RACK_PKT_DELAY:
 		/* RACK added ms i.e. rack-rtt + reord + N */
 		RACK_OPTS_INC(tcp_rack_pkt_delay);
 		rack->r_ctl.rc_pkt_delay = optval;
 		break;
 	case TCP_DELACK:
 		RACK_OPTS_INC(tcp_rack_delayed_ack);
 		if (optval == 0)
 			tp->t_delayed_ack = 0;
 		else
 			tp->t_delayed_ack = 1;
 		/* If an ack is currently being delayed, send it right away. */
 		if (tp->t_flags & TF_DELACK) {
 			tp->t_flags &= ~TF_DELACK;
 			tp->t_flags |= TF_ACKNOW;
 			NET_EPOCH_ENTER(et);
 			rack_output(tp);
 			NET_EPOCH_EXIT(et);
 		}
 		break;
 
 	case TCP_BBR_RACK_RTT_USE:
 		RACK_OPTS_INC(tcp_rack_rtt_use);
 		if ((optval != USE_RTT_HIGH) &&
 		    (optval != USE_RTT_LOW) &&
 		    (optval != USE_RTT_AVG))
 			error = EINVAL;
 		else
 			rack->r_ctl.rc_rate_sample_method = optval;
 		break;
 	case TCP_DATA_AFTER_CLOSE:
 		RACK_OPTS_INC(tcp_data_after_close);
 		if (optval)
 			rack->rc_allow_data_af_clo = 1;
 		else
 			rack->rc_allow_data_af_clo = 0;
 		break;
 	default:
 		/* Unhandled option names are ignored; error stays 0. */
 		break;
 	}
 #ifdef NETFLIX_STATS
 	tcp_log_socket_option(tp, sopt_name, optval, error);
 #endif
 	return (error);
 }
 
 
 static void
 rack_apply_deferred_options(struct tcp_rack *rack)
 {
 	struct deferred_opt_list *dol, *sdol;
 	uint32_t s_optval;
 
 	TAILQ_FOREACH_SAFE(dol, &rack->r_ctl.opt_list, next, sdol) {
 		TAILQ_REMOVE(&rack->r_ctl.opt_list, dol, next);
 		/* Disadvantage of deferal is you loose the error return */
 		s_optval = (uint32_t)dol->optval;
 		(void)rack_process_option(rack->rc_tp, rack, dol->optname, s_optval, dol->optval);
 		free(dol, M_TCPDO);
 	}
 }
 
 /*
  * The HW TLS state of the connection has changed.  Stamp the new state
  * (1 when chg is non-zero, otherwise 0) on every rsm in the rc_mtree
  * tree and on rack->r_ctl.fsb.
  */
 static void
 rack_hw_tls_change(struct tcpcb *tp, int chg)
 {
 	struct tcp_rack *rack = (struct tcp_rack *)tp->t_fb_ptr;
 	struct rack_sendmap *rsm;
 
 	RB_FOREACH(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree) {
 		rsm->r_hw_tls = chg ? 1 : 0;
 	}
 	rack->r_ctl.fsb.hw_tls = chg ? 1 : 0;
 }
 
 /*
  * Check pru flags for this stack: requests carrying PRUS_OOB are
  * rejected with EOPNOTSUPP, everything else is accepted (0).
  * The tp argument is not used.
  */
 static int
 rack_pru_options(struct tcpcb *tp, int flags)
 {
 	return ((flags & PRUS_OOB) ? EOPNOTSUPP : 0);
 }
 
 static struct tcp_function_block __tcp_rack = {
 	.tfb_tcp_block_name = __XSTRING(STACKNAME),
 	.tfb_tcp_output = rack_output,
 	.tfb_do_queued_segments = ctf_do_queued_segments,
 	.tfb_do_segment_nounlock = rack_do_segment_nounlock,
 	.tfb_tcp_do_segment = rack_do_segment,
 	.tfb_tcp_ctloutput = rack_ctloutput,
 	.tfb_tcp_fb_init = rack_init,
 	.tfb_tcp_fb_fini = rack_fini,
 	.tfb_tcp_timer_stop_all = rack_stopall,
 	.tfb_tcp_timer_activate = rack_timer_activate,
 	.tfb_tcp_timer_active = rack_timer_active,
 	.tfb_tcp_timer_stop = rack_timer_stop,
 	.tfb_tcp_rexmit_tmr = rack_remxt_tmr,
 	.tfb_tcp_handoff_ok = rack_handoff_ok,
 	.tfb_tcp_mtu_chg = rack_mtu_change,
 	.tfb_pru_options = rack_pru_options,
 	.tfb_hwtls_change = rack_hw_tls_change,
 	.tfb_compute_pipe = rack_compute_pipe,
 	.tfb_flags = TCP_FUNC_OUTPUT_CANDROP,
 };
 
 /*
  * rack_ctloutput() must drop the inpcb lock before performing copyin on
  * socket option arguments.  When it re-acquires the lock after the copy, it
  * has to revalidate that the connection is still valid for the socket
  * option.
  *
  * Handles a "set" socket option for a RACK connection:
  *  - IPPROTO_IPV6 / IPPROTO_IP level options only refresh the cached IP
  *    header in rack->r_ctl.fsb and return 0;
  *  - option names not in the list below are passed to
  *    tcp_default_ctloutput();
  *  - the listed options have their value copied in with the lock dropped,
  *    then are either queued (when options are being deferred) or applied
  *    through rack_process_option().
  *
  * Every return path here releases the inpcb write lock itself, except the
  * hand-off to tcp_default_ctloutput(), which is called with inp still
  * locked (presumably that routine unlocks it -- confirm).
  *
  * Returns 0 or an errno value: EINVAL when there is no RACK state, the
  * sooptcopyin() error, ECONNRESET, ENOPROTOOPT, ENOMEM, or the result of
  * rack_process_option() / tcp_default_ctloutput().
  */
 static int
 rack_set_sockopt(struct inpcb *inp, struct sockopt *sopt)
 {
 #ifdef INET6
 	struct ip6_hdr *ip6;
 #endif
 #ifdef INET
 	struct ip *ip;
 #endif
 	struct tcpcb *tp;
 	struct tcp_rack *rack;
 	uint64_t loptval;
 	int32_t error = 0, optval;
 
 	tp = intotcpcb(inp);
 	rack = (struct tcp_rack *)tp->t_fb_ptr;
 	if (rack == NULL) {
 		INP_WUNLOCK(inp);
 		return (EINVAL);
 	}
 	/* Both views alias the same cached header buffer in the fsb. */
 #ifdef INET6
 	ip6 = (struct ip6_hdr *)rack->r_ctl.fsb.tcp_ip_hdr;
 #endif
 #ifdef INET
 	ip = (struct ip *)rack->r_ctl.fsb.tcp_ip_hdr;
 #endif
 
 	switch (sopt->sopt_level) {
 #ifdef INET6
 	case IPPROTO_IPV6:
 		MPASS(inp->inp_vflag & INP_IPV6PROTO);
 		switch (sopt->sopt_name) {
 		case IPV6_USE_MIN_MTU:
 			tcp6_use_min_mtu(tp);
 			break;
 		case IPV6_TCLASS:
 			/*
 			 * The DSCP codepoint has changed, update the fsb.
 			 */
 			ip6->ip6_flow = (ip6->ip6_flow & ~IPV6_FLOWINFO_MASK) |
 			    (rack->rc_inp->inp_flow & IPV6_FLOWINFO_MASK);
 			break;
 		}
 		INP_WUNLOCK(inp);
 		return (0);
 #endif
 #ifdef INET
 	case IPPROTO_IP:
 		switch (sopt->sopt_name) {
 		case IP_TOS:
 			/*
 			 * The DSCP codepoint has changed, update the fsb.
 			 */
 			ip->ip_tos = rack->rc_inp->inp_ip_tos;
 			break;
 		case IP_TTL:
 			/*
 			 * The TTL has changed, update the fsb.
 			 */
 			ip->ip_ttl = rack->rc_inp->inp_ip_ttl;
 			break;
 		}
 		INP_WUNLOCK(inp);
 		return (0);
 #endif
 	}
 
 	/*
 	 * Filter on the option name: the options listed here are the ones
 	 * handled by rack_process_option(); anything else goes to the
 	 * default handler.
 	 */
 	switch (sopt->sopt_name) {
 	case TCP_RACK_TLP_REDUCE:		/*  URL:tlp_reduce */
 	/*  Pacing related ones */
 	case TCP_RACK_PACE_ALWAYS:		/*  URL:pace_always */
 	case TCP_BBR_RACK_INIT_RATE:		/*  URL:irate */
 	case TCP_BBR_IWINTSO:			/*  URL:tso_iwin */
 	case TCP_RACK_PACE_MAX_SEG:		/*  URL:pace_max_seg */
 	case TCP_RACK_FORCE_MSEG:		/*  URL:force_max_seg */
 	case TCP_RACK_PACE_RATE_CA:		/*  URL:pr_ca */
 	case TCP_RACK_PACE_RATE_SS:		/*  URL:pr_ss*/
 	case TCP_RACK_PACE_RATE_REC:		/*  URL:pr_rec */
 	case TCP_RACK_GP_INCREASE_CA:		/*  URL:gp_inc_ca */
 	case TCP_RACK_GP_INCREASE_SS:		/*  URL:gp_inc_ss */
 	case TCP_RACK_GP_INCREASE_REC:		/*  URL:gp_inc_rec */
 	case TCP_RACK_RR_CONF:			/*  URL:rrr_conf */
 	case TCP_BBR_HDWR_PACE:			/*  URL:hdwrpace */
 	case TCP_HDWR_RATE_CAP:			/*  URL:hdwrcap boolean */
 	case TCP_PACING_RATE_CAP:		/*  URL:cap  -- used by side-channel */
 	case TCP_HDWR_UP_ONLY:			/*  URL:uponly -- hardware pacing  boolean */
        /* End pacing related */
 	case TCP_FAST_RSM_HACK:			/*  URL:frsm_hack */
 	case TCP_DELACK:			/*  URL:delack (in base TCP i.e. tcp_hints along with cc etc ) */
 	case TCP_RACK_PRR_SENDALOT:		/*  URL:prr_sendalot */
 	case TCP_RACK_MIN_TO:			/*  URL:min_to */
 	case TCP_RACK_EARLY_SEG:		/*  URL:early_seg */
 	case TCP_RACK_REORD_THRESH:		/*  URL:reord_thresh */
 	case TCP_RACK_REORD_FADE:		/*  URL:reord_fade */
 	case TCP_RACK_TLP_THRESH:		/*  URL:tlp_thresh */
 	case TCP_RACK_PKT_DELAY:		/*  URL:pkt_delay */
 	case TCP_RACK_TLP_USE:			/*  URL:tlp_use */
 	case TCP_BBR_RACK_RTT_USE:		/*  URL:rttuse */
 	case TCP_BBR_USE_RACK_RR:		/*  URL:rackrr */
 	case TCP_RACK_DO_DETECTION:		/*  URL:detect */
 	case TCP_NO_PRR:			/*  URL:noprr */
 	case TCP_TIMELY_DYN_ADJ:      		/*  URL:dynamic */
 	case TCP_DATA_AFTER_CLOSE:		/*  no URL */
 	case TCP_RACK_NONRXT_CFG_RATE:		/*  URL:nonrxtcr */
 	case TCP_SHARED_CWND_ENABLE:		/*  URL:scwnd */
 	case TCP_RACK_MBUF_QUEUE:		/*  URL:mqueue */
 	case TCP_RACK_NO_PUSH_AT_MAX:		/*  URL:npush */
 	case TCP_RACK_PACE_TO_FILL:		/*  URL:fillcw */
 	case TCP_SHARED_CWND_TIME_LIMIT:	/*  URL:lscwnd */
 	case TCP_RACK_PROFILE:			/*  URL:profile */
 	case TCP_USE_CMP_ACKS:			/*  URL:cmpack */
 	case TCP_RACK_ABC_VAL:			/*  URL:labc */
 	case TCP_REC_ABC_VAL:			/*  URL:reclabc */
 	case TCP_RACK_MEASURE_CNT:		/*  URL:measurecnt */
 	case TCP_DEFER_OPTIONS:			/*  URL:defer */
 	case TCP_RACK_DSACK_OPT:		/*  URL:dsack */
 	case TCP_RACK_PACING_BETA:		/*  URL:pacing_beta */
 	case TCP_RACK_PACING_BETA_ECN:		/*  URL:pacing_beta_ecn */
 	case TCP_RACK_TIMER_SLOP:		/*  URL:timer_slop */
 	case TCP_RACK_ENABLE_HYSTART:		/*  URL:hystart */
 		break;
 	default:
 		/* Filter off all unknown options to the base stack */
 		return (tcp_default_ctloutput(inp, sopt));
 		break;
 	}
 	/* Drop the lock for the copyin; it is re-taken and re-checked below. */
 	INP_WUNLOCK(inp);
 	if (sopt->sopt_name == TCP_PACING_RATE_CAP) {
 		/* The only option whose argument is 64 bits wide. */
 		error = sooptcopyin(sopt, &loptval, sizeof(loptval), sizeof(loptval));
 		/*
 		 * We truncate it down to 32 bits for the socket-option trace this
 		 * means rates > 34Gbps won't show right, but thats probably ok.
 		 */
 		optval = (uint32_t)loptval;
 	} else {
 		error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval));
 		/* Save it in 64 bit form too */
 		loptval = optval;
 	}
 	if (error)
 		return (error);
 	INP_WLOCK(inp);
 	/* Revalidate: the connection may have changed while unlocked. */
-	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
+	if (inp->inp_flags & INP_DROPPED) {
 		INP_WUNLOCK(inp);
 		return (ECONNRESET);
 	}
 	/* The connection may have switched to another stack meanwhile. */
 	if (tp->t_fb != &__tcp_rack) {
 		INP_WUNLOCK(inp);
 		return (ENOPROTOOPT);
 	}
 	/*
 	 * While deferral is on and gp_ready is still 0, everything except
 	 * the four options named here is queued instead of applied.
 	 */
 	if (rack->defer_options && (rack->gp_ready == 0) &&
 	    (sopt->sopt_name != TCP_DEFER_OPTIONS) &&
 	    (sopt->sopt_name != TCP_RACK_PACING_BETA) &&
 	    (sopt->sopt_name != TCP_RACK_PACING_BETA_ECN) &&
 	    (sopt->sopt_name != TCP_RACK_MEASURE_CNT)) {
 		/* Options are being deferred */
 		if (rack_add_deferred_option(rack, sopt->sopt_name, loptval)) {
 			INP_WUNLOCK(inp);
 			return (0);
 		} else {
 			/* No memory to defer, fail */
 			INP_WUNLOCK(inp);
 			return (ENOMEM);
 		}
 	}
 	error = rack_process_option(tp, rack, sopt->sopt_name, optval, loptval);
 	INP_WUNLOCK(inp);
 	return (error);
 }
 
 /*
  * Fill a tcp_info structure from the tcpcb.  The inpcb write lock must
  * be held (asserted).  *ti is zeroed first, so any field not assigned
  * below reads as 0.
  */
 static void
 rack_fill_info(struct tcpcb *tp, struct tcp_info *ti)
 {
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 	bzero(ti, sizeof(*ti));
 
 	/* Connection state and negotiated options. */
 	ti->tcpi_state = tp->t_state;
 	if (tp->t_flags & TF_REQ_TSTMP) {
 		if (tp->t_flags & TF_RCVD_TSTMP)
 			ti->tcpi_options |= TCPI_OPT_TIMESTAMPS;
 	}
 	if (tp->t_flags & TF_SACK_PERMIT)
 		ti->tcpi_options |= TCPI_OPT_SACK;
 	if (tp->t_flags & TF_REQ_SCALE) {
 		if (tp->t_flags & TF_RCVD_SCALE) {
 			ti->tcpi_options |= TCPI_OPT_WSCALE;
 			ti->tcpi_snd_wscale = tp->snd_scale;
 			ti->tcpi_rcv_wscale = tp->rcv_scale;
 		}
 	}
 	if (tp->t_flags2 & (TF2_ECN_PERMIT | TF2_ACE_PERMIT))
 		ti->tcpi_options |= TCPI_OPT_ECN;
 	if (tp->t_flags & TF_FASTOPEN)
 		ti->tcpi_options |= TCPI_OPT_TFO;
 
 	/* Timing: t_rcvtime is still kept in ticks ... */
 	ti->tcpi_last_data_recv = ((uint32_t)ticks - tp->t_rcvtime) * tick;
 	/* ... the rest is held in precise useconds and copied as is. */
 	ti->tcpi_rtt = tp->t_srtt;
 	ti->tcpi_rttvar = tp->t_rttvar;
 	ti->tcpi_rto = tp->t_rxtcur;
 
 	/* Congestion state. */
 	ti->tcpi_snd_cwnd = tp->snd_cwnd;
 	ti->tcpi_snd_ssthresh = tp->snd_ssthresh;
 
 	/*
 	 * FreeBSD-specific extension fields for tcp_info.
 	 */
 	ti->tcpi_snd_wnd = tp->snd_wnd;
 	ti->tcpi_snd_bwnd = 0;		/* Unused, kept for compat. */
 	ti->tcpi_snd_nxt = tp->snd_nxt;
 	ti->tcpi_snd_mss = tp->t_maxseg;
 	ti->tcpi_rcv_space = tp->rcv_wnd;
 	ti->tcpi_rcv_nxt = tp->rcv_nxt;
 	ti->tcpi_rcv_mss = tp->t_maxseg;
 	ti->tcpi_snd_rexmitpack = tp->t_sndrexmitpack;
 	ti->tcpi_snd_zerowin = tp->t_sndzerowin;
 	ti->tcpi_rcv_ooopack = tp->t_rcvoopack;
 #ifdef NETFLIX_STATS
 	ti->tcpi_total_tlp = tp->t_sndtlppack;
 	ti->tcpi_total_tlp_bytes = tp->t_sndtlpbyte;
 	memcpy(&ti->tcpi_rxsyninfo, &tp->t_rxsyninfo, sizeof(struct tcpsyninfo));
 #endif
 #ifdef TCP_OFFLOAD
 	/* Must stay last: the offload hook is handed the filled-in ti. */
 	if (tp->t_flags & TF_TOE) {
 		ti->tcpi_options |= TCPI_OPT_TOE;
 		tcp_offload_tcp_info(tp, ti);
 	}
 #endif
 }
 
 static int
 rack_get_sockopt(struct inpcb *inp, struct sockopt *sopt)
 {
 	struct tcpcb *tp;
 	struct tcp_rack *rack;
 	int32_t error, optval;
 	uint64_t val, loptval;
 	struct	tcp_info ti;
 	/*
 	 * Because all our options are either boolean or an int, we can just
 	 * pull everything into optval and then unlock and copy. If we ever
 	 * add a option that is not a int, then this will have quite an
 	 * impact to this routine.
 	 */
 	error = 0;
 	tp = intotcpcb(inp);
 	rack = (struct tcp_rack *)tp->t_fb_ptr;
 	if (rack == NULL) {
 		INP_WUNLOCK(inp);
 		return (EINVAL);
 	}
 	switch (sopt->sopt_name) {
 	case TCP_INFO:
 		/* First get the info filled */
 		rack_fill_info(tp, &ti);
 		/* Fix up the rtt related fields if needed */
 		INP_WUNLOCK(inp);
 		error = sooptcopyout(sopt, &ti, sizeof ti);
 		return (error);
 	/*
 	 * Beta is the congestion control value for NewReno that influences how
 	 * much of a backoff happens when loss is detected. It is normally set
 	 * to 50 for 50% i.e. the cwnd is reduced to 50% of its previous value
 	 * when you exit recovery.
 	 */
 	case TCP_RACK_PACING_BETA:
 		if (strcmp(tp->cc_algo->name, CCALGONAME_NEWRENO) != 0)
 			error = EINVAL;
 		else if (rack->rc_pacing_cc_set == 0)
 			optval = rack->r_ctl.rc_saved_beta.beta;
 		else {
 			/*
 			 * Reach out into the CC data and report back what
 			 * I have previously set. Yeah it looks hackish but
 			 * we don't want to report the saved values.
 			 */
 			if (tp->ccv->cc_data)
 				optval = ((struct newreno *)tp->ccv->cc_data)->beta;
 			else
 				error = EINVAL;
 		}
 		break;
 		/*
 		 * Beta_ecn is the congestion control value for NewReno that influences how
 		 * much of a backoff happens when a ECN mark is detected. It is normally set
 		 * to 80 for 80% i.e. the cwnd is reduced by 20% of its previous value when
 		 * you exit recovery. Note that classic ECN has a beta of 50, it is only
 		 * ABE Ecn that uses this "less" value, but we do too with pacing :)
 		 */
 
 	case TCP_RACK_PACING_BETA_ECN:
 		if (strcmp(tp->cc_algo->name, CCALGONAME_NEWRENO) != 0)
 			error = EINVAL;
 		else if (rack->rc_pacing_cc_set == 0)
 			optval = rack->r_ctl.rc_saved_beta.beta_ecn;
 		else {
 			/*
 			 * Reach out into the CC data and report back what
 			 * I have previously set. Yeah it looks hackish but
 			 * we don't want to report the saved values.
 			 */
 			if (tp->ccv->cc_data)
 				optval = ((struct newreno *)tp->ccv->cc_data)->beta_ecn;
 			else
 				error = EINVAL;
 		}
 		break;
 	case TCP_RACK_DSACK_OPT:
 		optval = 0;
 		if (rack->rc_rack_tmr_std_based) {
 			optval |= 1;
 		}
 		if (rack->rc_rack_use_dsack) {
 			optval |= 2;
 		}
 		break;
  	case TCP_RACK_ENABLE_HYSTART:
 	{
 		if (tp->ccv->flags & CCF_HYSTART_ALLOWED) {
 			optval = RACK_HYSTART_ON;
 			if (tp->ccv->flags & CCF_HYSTART_CAN_SH_CWND)
 				optval = RACK_HYSTART_ON_W_SC;
 			if (tp->ccv->flags & CCF_HYSTART_CONS_SSTH)
 				optval = RACK_HYSTART_ON_W_SC_C;
 		} else {
 			optval = RACK_HYSTART_OFF;
 		}
 	}
 	break;
 	case TCP_FAST_RSM_HACK:
 		optval = rack->fast_rsm_hack;
 		break;
 	case TCP_DEFER_OPTIONS:
 		optval = rack->defer_options;
 		break;
 	case TCP_RACK_MEASURE_CNT:
 		optval = rack->r_ctl.req_measurements;
 		break;
 	case TCP_REC_ABC_VAL:
 		optval = rack->r_use_labc_for_rec;
 		break;
 	case TCP_RACK_ABC_VAL:
 		optval = rack->rc_labc;
 		break;
 	case TCP_HDWR_UP_ONLY:
 		optval= rack->r_up_only;
 		break;
 	case TCP_PACING_RATE_CAP:
 		loptval = rack->r_ctl.bw_rate_cap;
 		break;
 	case TCP_RACK_PROFILE:
 		/* You cannot retrieve a profile, its write only */
 		error = EINVAL;
 		break;
 	case TCP_USE_CMP_ACKS:
 		optval = rack->r_use_cmp_ack;
 		break;
 	case TCP_RACK_PACE_TO_FILL:
 		optval = rack->rc_pace_to_cwnd;
 		if (optval && rack->r_fill_less_agg)
 			optval++;
 		break;
 	case TCP_RACK_NO_PUSH_AT_MAX:
 		optval = rack->r_ctl.rc_no_push_at_mrtt;
 		break;
 	case TCP_SHARED_CWND_ENABLE:
 		optval = rack->rack_enable_scwnd;
 		break;
 	case TCP_RACK_NONRXT_CFG_RATE:
 		optval = rack->rack_rec_nonrxt_use_cr;
 		break;
 	case TCP_NO_PRR:
 		if (rack->rack_no_prr  == 1)
 			optval = 1;
 		else if (rack->no_prr_addback == 1)
 			optval = 2;
 		else
 			optval = 0;
 		break;
 	case TCP_RACK_DO_DETECTION:
 		optval = rack->do_detection;
 		break;
 	case TCP_RACK_MBUF_QUEUE:
 		/* Now do we use the LRO mbuf-queue feature */
 		optval = rack->r_mbuf_queue;
 		break;
 	case TCP_TIMELY_DYN_ADJ:
 		optval = rack->rc_gp_dyn_mul;
 		break;
 	case TCP_BBR_IWINTSO:
 		optval = rack->rc_init_win;
 		break;
 	case TCP_RACK_TLP_REDUCE:
 		/* RACK TLP cwnd reduction (bool) */
 		optval = rack->r_ctl.rc_tlp_cwnd_reduce;
 		break;
 	case TCP_BBR_RACK_INIT_RATE:
 		val = rack->r_ctl.init_rate;
 		/* convert to kbits per sec */
 		val *= 8;
 		val /= 1000;
 		optval = (uint32_t)val;
 		break;
 	case TCP_RACK_FORCE_MSEG:
 		optval = rack->rc_force_max_seg;
 		break;
 	case TCP_RACK_PACE_MAX_SEG:
 		/* Max segments in a pace */
 		optval = rack->rc_user_set_max_segs;
 		break;
 	case TCP_RACK_PACE_ALWAYS:
 		/* Use the always pace method */
 		optval = rack->rc_always_pace;
 		break;
 	case TCP_RACK_PRR_SENDALOT:
 		/* Allow PRR to send more than one seg */
 		optval = rack->r_ctl.rc_prr_sendalot;
 		break;
 	case TCP_RACK_MIN_TO:
 		/* Minimum time between rack t-o's in ms */
 		optval = rack->r_ctl.rc_min_to;
 		break;
 	case TCP_RACK_EARLY_SEG:
 		/* If early recovery max segments */
 		optval = rack->r_ctl.rc_early_recovery_segs;
 		break;
 	case TCP_RACK_REORD_THRESH:
 		/* RACK reorder threshold (shift amount) */
 		optval = rack->r_ctl.rc_reorder_shift;
 		break;
 	case TCP_RACK_REORD_FADE:
 		/* Does reordering fade after ms time */
 		optval = rack->r_ctl.rc_reorder_fade;
 		break;
 	case TCP_BBR_USE_RACK_RR:
 		/* Do we use the rack cheat for rxt */
 		optval = rack->use_rack_rr;
 		break;
 	case TCP_RACK_RR_CONF:
 		optval = rack->r_rr_config;
 		break;
 	case TCP_HDWR_RATE_CAP:
 		optval = rack->r_rack_hw_rate_caps;
 		break;
 	case TCP_BBR_HDWR_PACE:
 		optval = rack->rack_hdw_pace_ena;
 		break;
 	case TCP_RACK_TLP_THRESH:
 		/* RACK TLP theshold i.e. srtt+(srtt/N) */
 		optval = rack->r_ctl.rc_tlp_threshold;
 		break;
 	case TCP_RACK_PKT_DELAY:
 		/* RACK added ms i.e. rack-rtt + reord + N */
 		optval = rack->r_ctl.rc_pkt_delay;
 		break;
 	case TCP_RACK_TLP_USE:
 		optval = rack->rack_tlp_threshold_use;
 		break;
 	case TCP_RACK_PACE_RATE_CA:
 		optval = rack->r_ctl.rc_fixed_pacing_rate_ca;
 		break;
 	case TCP_RACK_PACE_RATE_SS:
 		optval = rack->r_ctl.rc_fixed_pacing_rate_ss;
 		break;
 	case TCP_RACK_PACE_RATE_REC:
 		optval = rack->r_ctl.rc_fixed_pacing_rate_rec;
 		break;
 	case TCP_RACK_GP_INCREASE_SS:
 		optval = rack->r_ctl.rack_per_of_gp_ca;
 		break;
 	case TCP_RACK_GP_INCREASE_CA:
 		optval = rack->r_ctl.rack_per_of_gp_ss;
 		break;
 	case TCP_BBR_RACK_RTT_USE:
 		optval = rack->r_ctl.rc_rate_sample_method;
 		break;
 	case TCP_DELACK:
 		optval = tp->t_delayed_ack;
 		break;
 	case TCP_DATA_AFTER_CLOSE:
 		optval = rack->rc_allow_data_af_clo;
 		break;
 	case TCP_SHARED_CWND_TIME_LIMIT:
 		optval = rack->r_limit_scw;
 		break;
 	case TCP_RACK_TIMER_SLOP:
 		optval = rack->r_ctl.timer_slop;
 		break;
 	default:
 		return (tcp_default_ctloutput(inp, sopt));
 		break;
 	}
 	INP_WUNLOCK(inp);
 	if (error == 0) {
 		if (TCP_PACING_RATE_CAP)
 			error = sooptcopyout(sopt, &loptval, sizeof loptval);
 		else
 			error = sooptcopyout(sopt, &optval, sizeof optval);
 	}
 	return (error);
 }
 
 /*
  * Socket option entry point: dispatch on the direction of the request.
  * SOPT_SET goes to rack_set_sockopt(), SOPT_GET to rack_get_sockopt();
  * any other direction panics.
  */
 static int
 rack_ctloutput(struct inpcb *inp, struct sockopt *sopt)
 {
 	switch (sopt->sopt_dir) {
 	case SOPT_SET:
 		return (rack_set_sockopt(inp, sopt));
 	case SOPT_GET:
 		return (rack_get_sockopt(inp, sopt));
 	default:
 		panic("%s: sopt_dir $%d", __func__, sopt->sopt_dir);
 	}
 }
 
 /*
  * Names handed to register_tcp_functions_as_names() at module load:
  * always STACKNAME, plus STACKALIAS when the build defines one.
  */
 static const char *rack_stack_names[] = {
 	__XSTRING(STACKNAME),
 #ifdef STACKALIAS
 	__XSTRING(STACKALIAS),
 #endif
 };
 
 /*
  * Zone constructor: zero the first `size' bytes at `mem' and report
  * success.
  */
 static int
 rack_ctor(void *mem, int32_t size, void *arg, int32_t how)
 {
 	/* Neither the constructor argument nor the flags are consulted. */
 	(void)arg;
 	(void)how;
 
 	(void)memset(mem, 0, size);
 	return (0);
 }
 
 /*
  * Zone destructor given to uma_zcreate() for the rack_sendmap zone.
  * It performs no work.
  */
 static void
 rack_dtor(void *mem, int32_t size, void *arg)
 {
 
 }
 
 /* Set once MOD_LOAD has completed; gates the teardown done by MOD_UNLOAD. */
 static bool rack_mod_inited = false;
 
 /*
  * Module event handler (installed as .evhand of the moduledata below).
  *
  * MOD_LOAD     creates the two UMA zones, the sysctl tree and registers
  *              the stack under every name in rack_stack_names.
  * MOD_QUIESCE  asks deregister_tcp_functions() with (true, false).
  * MOD_UNLOAD   deregisters with (false, true); unless that reports EBUSY
  *              the resources created by MOD_LOAD are released.
  *
  * Returns 0 or an errno value; unknown event types yield EOPNOTSUPP.
  * The mod and data arguments are not used.
  */
 static int
 tcp_addrack(module_t mod, int32_t type, void *data)
 {
 	int32_t err = 0;
 	int num_stacks;
 
 	switch (type) {
 	case MOD_LOAD:
 		/* Zone for struct rack_sendmap items (zeroed by rack_ctor). */
 		rack_zone = uma_zcreate(__XSTRING(MODNAME) "_map",
 		    sizeof(struct rack_sendmap),
 		    rack_ctor, rack_dtor, NULL, NULL, UMA_ALIGN_PTR, 0);
 
 		/* Zone for struct tcp_rack items; no destructor is given. */
 		rack_pcb_zone = uma_zcreate(__XSTRING(MODNAME) "_pcb",
 		    sizeof(struct tcp_rack),
 		    rack_ctor, NULL, NULL, NULL, UMA_ALIGN_CACHE, 0);
 
 		/* The sysctl node is named after the alias when one exists. */
 		sysctl_ctx_init(&rack_sysctl_ctx);
 		rack_sysctl_root = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
 		    SYSCTL_STATIC_CHILDREN(_net_inet_tcp),
 		    OID_AUTO,
 #ifdef STACKALIAS
 		    __XSTRING(STACKALIAS),
 #else
 		    __XSTRING(STACKNAME),
 #endif
 		    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
 		    "");
 		if (rack_sysctl_root == NULL) {
 			printf("Failed to add sysctl node\n");
 			err = EFAULT;
 			/* Jumps into the failure branch below. */
 			goto free_uma;
 		}
 		rack_init_sysctls();
 		num_stacks = nitems(rack_stack_names);
 		err = register_tcp_functions_as_names(&__tcp_rack, M_WAITOK,
 		    rack_stack_names, &num_stacks);
 		if (err) {
 			/*
 			 * NOTE(review): assumes the callee leaves in
 			 * num_stacks the index of the name that failed --
 			 * confirm, otherwise this indexes past the array.
 			 */
 			printf("Failed to register %s stack name for "
 			    "%s module\n", rack_stack_names[num_stacks],
 			    __XSTRING(MODNAME));
 			sysctl_ctx_free(&rack_sysctl_ctx);
 			/*
 			 * Shared teardown.  The sysctl failure path enters
 			 * here, after the sysctl_ctx_free() above, so that
 			 * call is skipped for it.
 			 */
 free_uma:
 			uma_zdestroy(rack_zone);
 			uma_zdestroy(rack_pcb_zone);
 			rack_counter_destroy();
 			printf("Failed to register rack module -- err:%d\n", err);
 			return (err);
 		}
 		tcp_lro_reg_mbufq();
 		rack_mod_inited = true;
 		break;
 	case MOD_QUIESCE:
 		err = deregister_tcp_functions(&__tcp_rack, true, false);
 		break;
 	case MOD_UNLOAD:
 		err = deregister_tcp_functions(&__tcp_rack, false, true);
 		/* EBUSY is returned to the caller with nothing torn down. */
 		if (err == EBUSY)
 			break;
 		if (rack_mod_inited) {
 			uma_zdestroy(rack_zone);
 			uma_zdestroy(rack_pcb_zone);
 			sysctl_ctx_free(&rack_sysctl_ctx);
 			rack_counter_destroy();
 			rack_mod_inited = false;
 		}
 		tcp_lro_dereg_mbufq();
 		/* Any deregistration error other than EBUSY is discarded. */
 		err = 0;
 		break;
 	default:
 		return (EOPNOTSUPP);
 	}
 	return (err);
 }
 
 /* Module description; load/quiesce/unload events go to tcp_addrack(). */
 static moduledata_t tcp_rack = {
 	.name = __XSTRING(MODNAME),
 	.evhand = tcp_addrack,
 	.priv = 0
 };
 
 MODULE_VERSION(MODNAME, 1);
 DECLARE_MODULE(MODNAME, tcp_rack, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY);
 /* Declares a dependency on the tcphpts module (version arguments 1, 1, 1). */
 MODULE_DEPEND(MODNAME, tcphpts, 1, 1, 1);
diff --git a/sys/netinet/tcp_subr.c b/sys/netinet/tcp_subr.c
index c50f416351c3..8fdaab35fb19 100644
--- a/sys/netinet/tcp_subr.c
+++ b/sys/netinet/tcp_subr.c
@@ -1,4111 +1,4102 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)tcp_subr.c	8.2 (Berkeley) 5/24/95
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_ipsec.h"
 #include "opt_kern_tls.h"
 #include "opt_tcpdebug.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/arb.h>
 #include <sys/callout.h>
 #include <sys/eventhandler.h>
 #ifdef TCP_HHOOK
 #include <sys/hhook.h>
 #endif
 #include <sys/kernel.h>
 #ifdef TCP_HHOOK
 #include <sys/khelp.h>
 #endif
 #ifdef KERN_TLS
 #include <sys/ktls.h>
 #endif
 #include <sys/qmath.h>
 #include <sys/stats.h>
 #include <sys/sysctl.h>
 #include <sys/jail.h>
 #include <sys/malloc.h>
 #include <sys/refcount.h>
 #include <sys/mbuf.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/sdt.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/protosw.h>
 #include <sys/random.h>
 
 #include <vm/uma.h>
 
 #include <net/route.h>
 #include <net/route/nhop.h>
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_fib.h>
 #include <netinet/in_kdtrace.h>
 #include <netinet/in_pcb.h>
 #include <netinet/in_systm.h>
 #include <netinet/in_var.h>
 #include <netinet/ip.h>
 #include <netinet/ip_icmp.h>
 #include <netinet/ip_var.h>
 #ifdef INET6
 #include <netinet/icmp6.h>
 #include <netinet/ip6.h>
 #include <netinet6/in6_fib.h>
 #include <netinet6/in6_pcb.h>
 #include <netinet6/ip6_var.h>
 #include <netinet6/scope6_var.h>
 #include <netinet6/nd6.h>
 #endif
 
 #include <netinet/tcp.h>
 #ifdef INVARIANTS
 #define TCPSTATES
 #endif
 #include <netinet/tcp_fsm.h>
 #include <netinet/tcp_seq.h>
 #include <netinet/tcp_timer.h>
 #include <netinet/tcp_var.h>
 #include <netinet/tcp_log_buf.h>
 #include <netinet/tcp_syncache.h>
 #include <netinet/tcp_hpts.h>
 #include <netinet/cc/cc.h>
 #include <netinet/tcpip.h>
 #include <netinet/tcp_fastopen.h>
 #ifdef TCPPCAP
 #include <netinet/tcp_pcap.h>
 #endif
 #ifdef TCPDEBUG
 #include <netinet/tcp_debug.h>
 #endif
 #ifdef TCP_OFFLOAD
 #include <netinet/tcp_offload.h>
 #endif
 #include <netinet/udp.h>
 #include <netinet/udp_var.h>
 #ifdef INET6
 #include <netinet6/tcp6_var.h>
 #endif
 
 #include <netipsec/ipsec_support.h>
 
 #include <machine/in_cksum.h>
 #include <crypto/siphash/siphash.h>
 
 #include <security/mac/mac_framework.h>
 
 /* Forward declarations of the IPv6 control-input handlers. */
 #ifdef INET6
 static ip6proto_ctlinput_t tcp6_ctlinput;
 static udp_tun_icmp_t tcp6_ctlinput_viaudp;
 #endif
 
 /*
  * Per-VNET default MSS values, exported through the mssdflt and
  * v6mssdflt sysctl handlers further down in this file.
  */
 VNET_DEFINE(int, tcp_mssdflt) = TCP_MSS;
 #ifdef INET6
 VNET_DEFINE(int, tcp_v6mssdflt) = TCP6_MSS;
 #endif
 
 #ifdef NETFLIX_EXP_DETECTION
 /*
  * Sack attack detection thresholds and such.
  * Per the description strings, the percentage knobs are expressed in
  * tenths of a percent (10.1 percent is written as 101).
  */
 SYSCTL_NODE(_net_inet_tcp, OID_AUTO, sack_attack,
     CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "Sack Attack detection thresholds");
 int32_t tcp_force_detection = 0;
 SYSCTL_INT(_net_inet_tcp_sack_attack, OID_AUTO, force_detection,
     CTLFLAG_RW,
     &tcp_force_detection, 0,
     "Do we force detection even if the INP has it off?");
 int32_t tcp_sack_to_ack_thresh = 700;	/* 70 % */
 SYSCTL_INT(_net_inet_tcp_sack_attack, OID_AUTO, sack_to_ack_thresh,
     CTLFLAG_RW,
     &tcp_sack_to_ack_thresh, 700,
     "Percentage of sacks to acks we must see above (10.1 percent is 101)?");
 int32_t tcp_sack_to_move_thresh = 600;	/* 60 % */
 SYSCTL_INT(_net_inet_tcp_sack_attack, OID_AUTO, move_thresh,
     CTLFLAG_RW,
     &tcp_sack_to_move_thresh, 600,
     "Percentage of sack moves we must see above (10.1 percent is 101)");
 /*
  * NOTE(review): the variable is initialised to 650 but the SYSCTL_INT
  * value argument below says 550 -- confirm which one is intended.
  */
 int32_t tcp_restoral_thresh = 650;	/* 65 % (sack:2:ack -5%) */
 SYSCTL_INT(_net_inet_tcp_sack_attack, OID_AUTO, restore_thresh,
     CTLFLAG_RW,
     &tcp_restoral_thresh, 550,
     "Percentage of sack to ack percentage we must see below to restore(10.1 percent is 101)");
 int32_t tcp_sad_decay_val = 800;
 SYSCTL_INT(_net_inet_tcp_sack_attack, OID_AUTO, decay_per,
     CTLFLAG_RW,
     &tcp_sad_decay_val, 800,
     "The decay percentage (10.1 percent equals 101 )");
 int32_t tcp_map_minimum = 500;
 SYSCTL_INT(_net_inet_tcp_sack_attack, OID_AUTO, nummaps,
     CTLFLAG_RW,
     &tcp_map_minimum, 500,
     "Number of Map enteries before we start detection");
 int32_t tcp_attack_on_turns_on_logging = 0;
 SYSCTL_INT(_net_inet_tcp_sack_attack, OID_AUTO, attacks_logged,
     CTLFLAG_RW,
     &tcp_attack_on_turns_on_logging, 0,
    "When we have a positive hit on attack, do we turn on logging?");
 int32_t tcp_sad_pacing_interval = 2000;
 SYSCTL_INT(_net_inet_tcp_sack_attack, OID_AUTO, sad_pacing_int,
     CTLFLAG_RW,
     &tcp_sad_pacing_interval, 2000,
     "What is the minimum pacing interval for a classified attacker?");
 
 int32_t tcp_sad_low_pps = 100;
 SYSCTL_INT(_net_inet_tcp_sack_attack, OID_AUTO, sad_low_pps,
     CTLFLAG_RW,
     &tcp_sad_low_pps, 100,
     "What is the input pps that below which we do not decay?");
 #endif
 /*
  * Ack-war prevention knobs for stacks that implement it: the length of
  * the time window in milliseconds and how many acks may be sent in it.
  */
 uint32_t tcp_ack_war_time_window = 1000;
 SYSCTL_UINT(_net_inet_tcp, OID_AUTO, ack_war_timewindow,
     CTLFLAG_RW,
     &tcp_ack_war_time_window, 1000,
    "If the tcp_stack does ack-war prevention how many milliseconds are in its time window?");
 uint32_t tcp_ack_war_cnt = 5;
 SYSCTL_UINT(_net_inet_tcp, OID_AUTO, ack_war_cnt,
     CTLFLAG_RW,
     &tcp_ack_war_cnt, 5,
    "If the tcp_stack does ack-war prevention how many acks can be sent in its time window?");
 
 /* Taken around accesses to t_functions and tcp_func_set_ptr below. */
 struct rwlock tcp_function_lock;
 
 static int
 sysctl_net_inet_tcp_mss_check(SYSCTL_HANDLER_ARGS)
 {
 	int error, new;
 
 	new = V_tcp_mssdflt;
 	error = sysctl_handle_int(oidp, &new, 0, req);
 	if (error == 0 && req->newptr) {
 		if (new < TCP_MINMSS)
 			error = EINVAL;
 		else
 			V_tcp_mssdflt = new;
 	}
 	return (error);
 }
 
 SYSCTL_PROC(_net_inet_tcp, TCPCTL_MSSDFLT, mssdflt,
     CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
     &VNET_NAME(tcp_mssdflt), 0, &sysctl_net_inet_tcp_mss_check, "I",
     "Default TCP Maximum Segment Size");
 
 #ifdef INET6
 static int
 sysctl_net_inet_tcp_mss_v6_check(SYSCTL_HANDLER_ARGS)
 {
 	int error, new;
 
 	new = V_tcp_v6mssdflt;
 	error = sysctl_handle_int(oidp, &new, 0, req);
 	if (error == 0 && req->newptr) {
 		if (new < TCP_MINMSS)
 			error = EINVAL;
 		else
 			V_tcp_v6mssdflt = new;
 	}
 	return (error);
 }
 
 SYSCTL_PROC(_net_inet_tcp, TCPCTL_V6MSSDFLT, v6mssdflt,
     CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
     &VNET_NAME(tcp_v6mssdflt), 0, &sysctl_net_inet_tcp_mss_v6_check, "I",
    "Default TCP Maximum Segment Size for IPv6");
 #endif /* INET6 */
 
 /*
  * Minimum MSS we accept and use. This prevents DoS attacks where
  * we are forced to a ridiculously low MSS like 20 and send hundreds
  * of packets instead of one. The effect scales with the available
  * bandwidth and quickly saturates the CPU and network interface
  * with packet generation and sending. Set to zero to disable MINMSS
  * checking. This setting prevents us from sending too small packets.
  */
 VNET_DEFINE(int, tcp_minmss) = TCP_MINMSS;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, minmss, CTLFLAG_VNET | CTLFLAG_RW,
      &VNET_NAME(tcp_minmss), 0,
     "Minimum TCP Maximum Segment Size");
 
 /* RFC 1323 extensions are enabled by default. */
 VNET_DEFINE(int, tcp_do_rfc1323) = 1;
 SYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1323, rfc1323, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(tcp_do_rfc1323), 0,
     "Enable rfc1323 (high performance TCP) extensions");
 
 /*
  * As of June 2021, several TCP stacks violate RFC 7323 from September 2014.
  * Some stacks negotiate TS, but never send them after connection setup. Some
  * stacks negotiate TS, but don't send them when sending keep-alive segments.
  * These include modern widely deployed TCP stacks.
  * Therefore tolerating violations for now...
  */
 VNET_DEFINE(int, tcp_tolerate_missing_ts) = 1;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, tolerate_missing_ts, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(tcp_tolerate_missing_ts), 0,
     "Tolerate missing TCP timestamps");
 
 /* Timestamp offsets are per connection (not per host pair) by default. */
 VNET_DEFINE(int, tcp_ts_offset_per_conn) = 1;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, ts_offset_per_conn, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(tcp_ts_offset_per_conn), 0,
     "Initialize TCP timestamps per connection instead of per host pair");
 
 /* How many connections are pacing */
 static volatile uint32_t number_of_tcp_connections_pacing = 0;
 /* Copy exported read-only through the pacing_count sysctl below. */
 static uint32_t shadow_num_connections = 0;
 
 /*
  * NOTE(review): the variable starts at 10000 while the SYSCTL_INT value
  * argument below says 1000 -- confirm which default is intended.
  */
 static int tcp_pacing_limit = 10000;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, pacing_limit, CTLFLAG_RW,
     &tcp_pacing_limit, 1000,
     "If the TCP stack does pacing, is there a limit (-1 = no, 0 = no pacing N = number of connections)");
 
 SYSCTL_UINT(_net_inet_tcp, OID_AUTO, pacing_count, CTLFLAG_RD,
     &shadow_num_connections, 0, "Number of TCP connections being paced");
 
 static int	tcp_log_debug = 0;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_debug, CTLFLAG_RW,
     &tcp_log_debug, 0, "Log errors caused by incoming TCP segments");
 
 /* Read-only tunable (CTLFLAG_RDTUN); not fetched automatically (NOFETCH). */
 static int	tcp_tcbhashsize;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcbhashsize, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
     &tcp_tcbhashsize, 0, "Size of TCP control-block hashtable");
 
 static int	do_tcpdrain = 1;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_tcpdrain, CTLFLAG_RW, &do_tcpdrain, 0,
     "Enable tcp_drain routine for extra help when low on mbufs");
 
 SYSCTL_UINT(_net_inet_tcp, OID_AUTO, pcbcount, CTLFLAG_VNET | CTLFLAG_RD,
     &VNET_NAME(tcbinfo.ipi_count), 0, "Number of active PCBs");
 
 VNET_DEFINE_STATIC(int, icmp_may_rst) = 1;
 #define	V_icmp_may_rst			VNET(icmp_may_rst)
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, icmp_may_rst, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(icmp_may_rst), 0,
     "Certain ICMP unreachable messages may abort connections in SYN_SENT");
 
 VNET_DEFINE_STATIC(int, tcp_isn_reseed_interval) = 0;
 #define	V_tcp_isn_reseed_interval	VNET(tcp_isn_reseed_interval)
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, isn_reseed_interval, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(tcp_isn_reseed_interval), 0,
     "Seconds between reseeding of ISN secret");
 
 static int	tcp_soreceive_stream;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, soreceive_stream, CTLFLAG_RDTUN,
     &tcp_soreceive_stream, 0, "Using soreceive_stream for TCP sockets");
 
 VNET_DEFINE(uma_zone_t, sack_hole_zone);
 #define	V_sack_hole_zone		VNET(sack_hole_zone)
 /* Validated on write by sysctl_net_inet_tcp_map_limit_check() below. */
 VNET_DEFINE(uint32_t, tcp_map_entries_limit) = 0;	/* unlimited */
 static int
 sysctl_net_inet_tcp_map_limit_check(SYSCTL_HANDLER_ARGS)
 {
 	int error;
 	uint32_t new;
 
 	new = V_tcp_map_entries_limit;
 	error = sysctl_handle_int(oidp, &new, 0, req);
 	if (error == 0 && req->newptr) {
 		/* only allow "0" and value > minimum */
 		if (new > 0 && new < TCP_MIN_MAP_ENTRIES_LIMIT)
 			error = EINVAL;
 		else
 			V_tcp_map_entries_limit = new;
 	}
 	return (error);
 }
 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, map_limit,
     CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
     &VNET_NAME(tcp_map_entries_limit), 0,
     &sysctl_net_inet_tcp_map_limit_check, "IU",
     "Total sendmap entries limit");
 
 VNET_DEFINE(uint32_t, tcp_map_split_limit) = 0;	/* unlimited */
 SYSCTL_UINT(_net_inet_tcp, OID_AUTO, split_limit, CTLFLAG_VNET | CTLFLAG_RW,
      &VNET_NAME(tcp_map_split_limit), 0,
     "Total sendmap split entries limit");
 
 #ifdef TCP_HHOOK
 VNET_DEFINE(struct hhook_head *, tcp_hhh[HHOOK_TCP_LAST+1]);
 #endif
 
 /* Per-VNET secret, one SipHash key in length, for timestamp offsets. */
 #define TS_OFFSET_SECRET_LENGTH SIPHASH_KEY_LENGTH
 VNET_DEFINE_STATIC(u_char, ts_offset_secret[TS_OFFSET_SECRET_LENGTH]);
 #define	V_ts_offset_secret	VNET(ts_offset_secret)
 
 /* Forward declarations for functions private to this file. */
 static int	tcp_default_fb_init(struct tcpcb *tp);
 static void	tcp_default_fb_fini(struct tcpcb *tp, int tcb_is_purged);
 static int	tcp_default_handoff_ok(struct tcpcb *tp);
 static struct inpcb *tcp_notify(struct inpcb *, int);
 static struct inpcb *tcp_mtudisc_notify(struct inpcb *, int);
 static struct inpcb *tcp_mtudisc(struct inpcb *, int);
 static struct inpcb *tcp_drop_syn_sent(struct inpcb *, int);
 static char *	tcp_log_addr(struct in_conninfo *inc, struct tcphdr *th,
 		    const void *ip4hdr, const void *ip6hdr);
 static ipproto_ctlinput_t	tcp_ctlinput;
 static udp_tun_icmp_t		tcp_ctlinput_viaudp;
 
 /* The built-in stack, registered under the name "freebsd". */
 static struct tcp_function_block tcp_def_funcblk = {
 	.tfb_tcp_block_name = "freebsd",
 	.tfb_tcp_output = tcp_default_output,
 	.tfb_tcp_do_segment = tcp_do_segment,
 	.tfb_tcp_ctloutput = tcp_default_ctloutput,
 	.tfb_tcp_handoff_ok = tcp_default_handoff_ok,
 	.tfb_tcp_fb_init = tcp_default_fb_init,
 	.tfb_tcp_fb_fini = tcp_default_fb_fini,
 };
 
 static int tcp_fb_cnt = 0;
 /* List of registered stack names, walked with TAILQ_FOREACH below. */
 struct tcp_funchead t_functions;
 /* Currently selected default stack; starts out as the built-in one. */
 static struct tcp_function_block *tcp_func_set_ptr = &tcp_def_funcblk;
 
 /*
  * Account for a reported D-SACK block.
  *
  * tp          connection whose per-connection counters are updated
  * start, end  sequence bounds of the block; they may arrive in either
  *             order, the distance between them is what gets counted
  * tlp         zero selects the plain D-SACK byte counters, non-zero the
  *             TLP D-SACK byte counters
  *
  * The global and the per-connection byte counters are now selected by
  * the same criterion (tlp).  Previously the per-connection counter was
  * picked by the ordering of start and end instead, so it could disagree
  * with the global statistic updated in the same branch.
  */
 void
 tcp_record_dsack(struct tcpcb *tp, tcp_seq start, tcp_seq end, int tlp)
 {
 	uint32_t len;
 
 	TCPSTAT_INC(tcps_dsack_count);
 	tp->t_dsack_pack++;
 
 	/* Size of the block regardless of which bound came first. */
 	len = SEQ_GT(end, start) ? (end - start) : (start - end);
 	if (tlp == 0) {
 		tp->t_dsack_bytes += len;
 		TCPSTAT_ADD(tcps_dsack_bytes, len);
 	} else {
 		tp->t_dsack_tlp_bytes += len;
 		TCPSTAT_ADD(tcps_dsack_tlp_bytes, len);
 	}
 }
 
 /*
  * Look up a registered stack by name.  Walks t_functions and returns
  * the function block of the first entry whose name equals
  * fs->function_set_name, or NULL when no entry matches.
  */
 static struct tcp_function_block *
 find_tcp_functions_locked(struct tcp_function_set *fs)
 {
 	struct tcp_function *entry;
 
 	TAILQ_FOREACH(entry, &t_functions, tf_next) {
 		if (strcmp(entry->tf_name, fs->function_set_name) == 0)
 			return (entry->tf_fb);
 	}
 	return (NULL);
 }
 
 /*
  * Check whether a function block is present in t_functions.  Returns
  * blk when some entry refers to it, NULL otherwise.  When s is not NULL
  * the first matching list entry is stored through it.
  */
 static struct tcp_function_block *
 find_tcp_fb_locked(struct tcp_function_block *blk, struct tcp_function **s)
 {
 	struct tcp_function *entry;
 
 	TAILQ_FOREACH(entry, &t_functions, tf_next) {
 		if (entry->tf_fb != blk)
 			continue;
 		if (s != NULL)
 			*s = entry;
 		return (blk);
 	}
 	return (NULL);
 }
 
 struct tcp_function_block *
 find_and_ref_tcp_functions(struct tcp_function_set *fs)
 {
 	struct tcp_function_block *blk;
 
 	rw_rlock(&tcp_function_lock);
 	blk = find_tcp_functions_locked(fs);
 	if (blk)
 		refcount_acquire(&blk->tfb_refcnt);
 	rw_runlock(&tcp_function_lock);
 	return(blk);
 }
 
 /*
  * Confirm under the read lock that blk is still listed in t_functions
  * and, when it is, take a reference on it.  Returns blk or NULL.
  */
 struct tcp_function_block *
 find_and_ref_tcp_fb(struct tcp_function_block *blk)
 {
 	struct tcp_function_block *found;
 
 	rw_rlock(&tcp_function_lock);
 	found = find_tcp_fb_locked(blk, NULL);
 	if (found != NULL)
 		refcount_acquire(&found->tfb_refcnt);
 	rw_runlock(&tcp_function_lock);
 	return (found);
 }
 
 /*
  * Find an alias of a function block: a list entry that refers to blk
  * but whose name differs from the block's own name.  On success the
  * alias is copied, NUL terminated, into fs->function_set_name and 1 is
  * returned.  Otherwise the name is set to the empty string and 0 is
  * returned.
  */
 int
 find_tcp_function_alias(struct tcp_function_block *blk,
     struct tcp_function_set *fs)
 {
 	struct tcp_function *entry;
 	int matched;
 
 	matched = 0;
 	rw_rlock(&tcp_function_lock);
 	TAILQ_FOREACH(entry, &t_functions, tf_next) {
 		if (entry->tf_fb != blk)
 			continue;
 		if (strncmp(entry->tf_name, blk->tfb_tcp_block_name,
 		    TCP_FUNCTION_NAME_LEN_MAX) == 0)
 			continue;
 		/* Same block under a different name: this is an alias. */
 		strncpy(fs->function_set_name, entry->tf_name,
 		    TCP_FUNCTION_NAME_LEN_MAX);
 		/* strncpy() does not terminate a maximum-length name. */
 		fs->function_set_name[TCP_FUNCTION_NAME_LEN_MAX - 1] = '\0';
 		matched = 1;
 		break;
 	}
 	if (matched == 0)
 		fs->function_set_name[0] = '\0';
 	rw_runlock(&tcp_function_lock);
 	return (matched);
 }
 
 /*
  * Return the currently selected default stack (tcp_func_set_ptr) with
  * an additional reference taken under the read lock.
  */
 static struct tcp_function_block *
 find_and_ref_tcp_default_fb(void)
 {
 	struct tcp_function_block *dflt;
 
 	rw_rlock(&tcp_function_lock);
 	dflt = tcp_func_set_ptr;
 	refcount_acquire(&dflt->tfb_refcnt);
 	rw_runlock(&tcp_function_lock);
 
 	return (dflt);
 }
 
 /*
  * Move a connection off its current (non built-in) stack.
  *
  * The old stack's fini hook is run and its reference dropped.  The
  * user-selected default is tried first, unless it is the stack being
  * left or it refuses the connection.  If that does not work out, the
  * built-in stack (tcp_def_funcblk) is installed; any failure on that
  * path panics.  On return tp->t_fb refers to the stack now in use.
  */
 void
 tcp_switch_back_to_default(struct tcpcb *tp)
 {
 	struct tcp_function_block *tfb;
 
 	KASSERT(tp->t_fb != &tcp_def_funcblk,
 	    ("%s: called by the built-in default stack", __func__));
 
 	/*
 	 * Release the old stack. This function will either find a new one
 	 * or panic.
 	 */
 	if (tp->t_fb->tfb_tcp_fb_fini != NULL)
 		(*tp->t_fb->tfb_tcp_fb_fini)(tp, 0);
 	refcount_release(&tp->t_fb->tfb_refcnt);
 
 	/*
 	 * Now, we'll find a new function block to use.
 	 * Start by trying the current user-selected
 	 * default, unless this stack is the user-selected
 	 * default.
 	 */
 	tfb = find_and_ref_tcp_default_fb();
 	/* tp->t_fb still names the old stack here; only the pointer is compared. */
 	if (tfb == tp->t_fb) {
 		refcount_release(&tfb->tfb_refcnt);
 		tfb = NULL;
 	}
 	/* Does the stack accept this connection? */
 	/* A non-zero return from the handoff_ok hook means "refused". */
 	if (tfb != NULL && tfb->tfb_tcp_handoff_ok != NULL &&
 	    (*tfb->tfb_tcp_handoff_ok)(tp)) {
 		refcount_release(&tfb->tfb_refcnt);
 		tfb = NULL;
 	}
 	/* Try to use that stack. */
 	if (tfb != NULL) {
 		/* Initialize the new stack. If it succeeds, we are done. */
 		tp->t_fb = tfb;
 		if (tp->t_fb->tfb_tcp_fb_init == NULL ||
 		    (*tp->t_fb->tfb_tcp_fb_init)(tp) == 0)
 			return;
 
 		/*
 		 * Initialization failed. Release the reference count on
 		 * the stack.
 		 */
 		refcount_release(&tfb->tfb_refcnt);
 	}
 
 	/*
 	 * If that wasn't feasible, use the built-in default
 	 * stack which is not allowed to reject anyone.
 	 */
 	tfb = find_and_ref_tcp_fb(&tcp_def_funcblk);
 	if (tfb == NULL) {
 		/* there always should be a default */
 		panic("Can't refer to tcp_def_funcblk");
 	}
 	if (tfb->tfb_tcp_handoff_ok != NULL) {
 		if ((*tfb->tfb_tcp_handoff_ok) (tp)) {
 			/* The default stack cannot say no */
 			panic("Default stack rejects a new session?");
 		}
 	}
 	tp->t_fb = tfb;
 	if (tp->t_fb->tfb_tcp_fb_init != NULL &&
 	    (*tp->t_fb->tfb_tcp_fb_init)(tp)) {
 		/* The default stack cannot fail */
 		panic("Default stack initialization failed");
 	}
 }
 
 /*
  * Receive a TCP segment that arrived encapsulated in UDP.
  *
  * m    packet; must carry a packet header (M_PKTHDR)
  * off  offset from the start of the IP header to the UDP header
  *
  * The UDP header is removed by sliding the data that follows it down
  * over it, the IP/IPv6 length fields are reduced accordingly and the
  * packet is passed to tcp_input_with_port() or tcp6_input_with_port()
  * together with the UDP source port.  Packets that cannot be handled
  * are freed.  The function returns true on every path.  The inp, sa and
  * ctx arguments are not used.
  */
 static bool
 tcp_recv_udp_tunneled_packet(struct mbuf *m, int off, struct inpcb *inp,
     const struct sockaddr *sa, void *ctx)
 {
 	struct ip *iph;
 #ifdef INET6
 	struct ip6_hdr *ip6;
 #endif
 	struct udphdr *uh;
 	struct tcphdr *th;
 	int thlen;
 	uint16_t port;
 
 	TCPSTAT_INC(tcps_tunneled_pkts);
 	if ((m->m_flags & M_PKTHDR) == 0) {
 		/* Can't handle one that is not a pkt hdr */
 		TCPSTAT_INC(tcps_tunneled_errs);
 		goto out;
 	}
 	/* First make the UDP header and a minimal TCP header contiguous. */
 	thlen = sizeof(struct tcphdr);
 	if (m->m_len < off + sizeof(struct udphdr) + thlen &&
 	    (m =  m_pullup(m, off + sizeof(struct udphdr) + thlen)) == NULL) {
 		TCPSTAT_INC(tcps_tunneled_errs);
 		goto out;
 	}
 	iph = mtod(m, struct ip *);
 	uh = (struct udphdr *)((caddr_t)iph + off);
 	th = (struct tcphdr *)(uh + 1);
 	/* Now that th_off is readable, pull up the full TCP header. */
 	thlen = th->th_off << 2;
 	if (m->m_len < off + sizeof(struct udphdr) + thlen) {
 		m =  m_pullup(m, off + sizeof(struct udphdr) + thlen);
 		if (m == NULL) {
 			TCPSTAT_INC(tcps_tunneled_errs);
 			goto out;
 		} else {
 			/* m may have changed; recompute the header pointers. */
 			iph = mtod(m, struct ip *);
 			uh = (struct udphdr *)((caddr_t)iph + off);
 			th = (struct tcphdr *)(uh + 1);
 		}
 	}
 	/* Remember the tunnel port (as found in the header) before it is overwritten. */
 	m->m_pkthdr.tcp_tun_port = port = uh->uh_sport;
 	/* Slide the TCP header and following data down over the UDP header. */
 	bcopy(th, uh, m->m_len - off);
 	m->m_len -= sizeof(struct udphdr);
 	m->m_pkthdr.len -= sizeof(struct udphdr);
 	/*
 	 * We use the same algorithm for
 	 * both UDP and TCP for c-sum. So
 	 * the code in tcp_input will skip
 	 * the checksum. So we do nothing
 	 * with the flag (m->m_pkthdr.csum_flags).
 	 */
 	switch (iph->ip_v) {
 #ifdef INET
 	case IPVERSION:
 		iph->ip_len = htons(ntohs(iph->ip_len) - sizeof(struct udphdr));
 		tcp_input_with_port(&m, &off, IPPROTO_TCP, port);
 		break;
 #endif
 #ifdef INET6
 	case IPV6_VERSION >> 4:
 		ip6 = mtod(m, struct ip6_hdr *);
 		ip6->ip6_plen = htons(ntohs(ip6->ip6_plen) - sizeof(struct udphdr));
 		tcp6_input_with_port(&m, &off, IPPROTO_TCP, port);
 		break;
 #endif
 	default:
 		goto out;
 		break;
 	}
 	return (true);
 out:
 	/* m is NULL here when one of the m_pullup() calls above failed. */
 	m_freem(m);
 
 	return (true);
 }
 
 static int
 sysctl_net_inet_default_tcp_functions(SYSCTL_HANDLER_ARGS)
 {
 	int error=ENOENT;
 	struct tcp_function_set fs;
 	struct tcp_function_block *blk;
 
 	memset(&fs, 0, sizeof(fs));
 	rw_rlock(&tcp_function_lock);
 	blk = find_tcp_fb_locked(tcp_func_set_ptr, NULL);
 	if (blk) {
 		/* Found him */
 		strcpy(fs.function_set_name, blk->tfb_tcp_block_name);
 		fs.pcbcnt = blk->tfb_refcnt;
 	}
 	rw_runlock(&tcp_function_lock);
 	error = sysctl_handle_string(oidp, fs.function_set_name,
 				     sizeof(fs.function_set_name), req);
 
 	/* Check for error or no change */
 	if (error != 0 || req->newptr == NULL)
 		return(error);
 
 	rw_wlock(&tcp_function_lock);
 	blk = find_tcp_functions_locked(&fs);
 	if ((blk == NULL) ||
 	    (blk->tfb_flags & TCP_FUNC_BEING_REMOVED)) {
 		error = ENOENT;
 		goto done;
 	}
 	tcp_func_set_ptr = blk;
 done:
 	rw_wunlock(&tcp_function_lock);
 	return (error);
 }
 
 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, functions_default,
     CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
     NULL, 0, sysctl_net_inet_default_tcp_functions, "A",
     "Set/get the default TCP functions");
 
 /*
  * Sysctl handler for net.inet.tcp.functions_available.
  *
  * Produces a text table with one row per registered name: the name of the
  * function block, a '*' marker if that block is the current default, the
  * alias under which it is registered ("-" when the registered name is the
  * block's own name), and the block's reference count.
  *
  * Returns 0 on success, EOVERFLOW if the list grew between sizing the
  * buffer and filling it, or the error from sysctl_handle_string().
  */
 static int
 sysctl_net_inet_list_available(SYSCTL_HANDLER_ARGS)
 {
 	int error, cnt, linesz;
 	struct tcp_function *f;
 	char *buffer, *cp;
 	size_t bufsz, outsz;
 	bool alias;
 
 	/* First pass: count the entries so the buffer can be sized. */
 	cnt = 0;
 	rw_rlock(&tcp_function_lock);
 	TAILQ_FOREACH(f, &t_functions, tf_next) {
 		cnt++;
 	}
 	rw_runlock(&tcp_function_lock);
 
 	/* Two names plus fixed decoration per row, header and slack rows. */
 	bufsz = (cnt+2) * ((TCP_FUNCTION_NAME_LEN_MAX * 2) + 13) + 1;
 	buffer = malloc(bufsz, M_TEMP, M_WAITOK);
 
 	error = 0;
 	cp = buffer;
 
 	linesz = snprintf(cp, bufsz, "\n%-32s%c %-32s %s\n", "Stack", 'D',
 	    "Alias", "PCB count");
 	cp += linesz;
 	bufsz -= linesz;
 	outsz = linesz;
 
 	/*
 	 * Second pass: the lock was dropped for the M_WAITOK allocation, so
 	 * the list may have changed; the size check below copes with growth.
 	 */
 	rw_rlock(&tcp_function_lock);
 	TAILQ_FOREACH(f, &t_functions, tf_next) {
 		/*
 		 * Compare the strings, not their addresses: tf_name is
 		 * storage embedded in this list entry, so a pointer
 		 * comparison against the block's name would always report
 		 * an alias.
 		 */
 		alias = (strncmp(f->tf_name, f->tf_fb->tfb_tcp_block_name,
 		    TCP_FUNCTION_NAME_LEN_MAX) != 0);
 		linesz = snprintf(cp, bufsz, "%-32s%c %-32s %u\n",
 		    f->tf_fb->tfb_tcp_block_name,
 		    (f->tf_fb == tcp_func_set_ptr) ? '*' : ' ',
 		    alias ? f->tf_name : "-",
 		    f->tf_fb->tfb_refcnt);
 		if (linesz >= bufsz) {
 			/* Row was truncated: the buffer is too small. */
 			error = EOVERFLOW;
 			break;
 		}
 		cp += linesz;
 		bufsz -= linesz;
 		outsz += linesz;
 	}
 	rw_runlock(&tcp_function_lock);
 	if (error == 0)
 		error = sysctl_handle_string(oidp, buffer, outsz + 1, req);
 	free(buffer, M_TEMP);
 	return (error);
 }
 
 /*
  * net.inet.tcp.functions_available: read-only string sysctl served by
  * sysctl_net_inet_list_available().
  */
 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, functions_available,
     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_NEEDGIANT,
     NULL, 0, sysctl_net_inet_list_available, "A",
     "list available TCP Function sets");
 
 /* Per-vnet UDP port used for TCP-over-UDP tunneling. */
 VNET_DEFINE(int, tcp_udp_tunneling_port) = TCP_TUNNELING_PORT_DEFAULT;
 
 /*
  * Per-vnet kernel sockets bound to the tunneling port, one per address
  * family.  NULL while tunneling is not running; created by
  * tcp_over_udp_start() and closed by tcp_over_udp_stop().
  */
 #ifdef INET
 VNET_DEFINE(struct socket *, udp4_tun_socket) = NULL;
 #define	V_udp4_tun_socket	VNET(udp4_tun_socket)
 #endif
 #ifdef INET6
 VNET_DEFINE(struct socket *, udp6_tun_socket) = NULL;
 #define	V_udp6_tun_socket	VNET(udp6_tun_socket)
 #endif
 
 /*
  * Tear down TCP-over-UDP tunneling: close whichever per-family tunneling
  * sockets exist and reset their pointers to NULL.  Safe to call when
  * nothing is running.
  *
  * The original author's note says the sysctl caller is assumed to hold
  * the inp info lock for writing; nothing in this function asserts it.
  */
 static void
 tcp_over_udp_stop(void)
 {
 	struct socket *so;
 
 #ifdef INET
 	so = V_udp4_tun_socket;
 	if (so != NULL) {
 		soclose(so);
 		V_udp4_tun_socket = NULL;
 	}
 #endif
 #ifdef INET6
 	so = V_udp6_tun_socket;
 	if (so != NULL) {
 		soclose(so);
 		V_udp6_tun_socket = NULL;
 	}
 #endif
 }
 
 /*
  * Bring up TCP-over-UDP tunneling on V_tcp_udp_tunneling_port.
  *
  * For each compiled-in address family a kernel UDP socket is created,
  * hooked up to the tunneled-packet input and control-input callbacks via
  * udp_set_kernel_tunneling(), and bound to the tunneling port.  If any
  * step fails, everything set up so far is torn down again.
  *
  * Returns 0 on success, EINVAL if no port is configured, EALREADY if a
  * tunneling socket already exists, or the error from the failing socket
  * operation.
  *
  * The original author's note says the sysctl caller is assumed to hold
  * the inp info lock for writing; nothing in this function asserts it.
  */
 static int
 tcp_over_udp_start(void)
 {
 #ifdef INET
 	struct sockaddr_in sin;
 #endif
 #ifdef INET6
 	struct sockaddr_in6 sin6;
 #endif
 	uint16_t port;
 	int ret;
 
 	/* A port must have been configured. */
 	port = V_tcp_udp_tunneling_port;
 	if (ntohs(port) == 0)
 		return (EINVAL);
 
 	/* Refuse to start twice; the caller has to stop first. */
 #ifdef INET
 	if (V_udp4_tun_socket != NULL)
 		return (EALREADY);
 #endif
 #ifdef INET6
 	if (V_udp6_tun_socket != NULL)
 		return (EALREADY);
 #endif
 
 #ifdef INET
 	ret = socreate(PF_INET, &V_udp4_tun_socket, SOCK_DGRAM, IPPROTO_UDP,
 	    curthread->td_ucred, curthread);
 	if (ret != 0)
 		goto fail;
 	/* Install the special UDP hook. */
 	ret = udp_set_kernel_tunneling(V_udp4_tun_socket,
 	    tcp_recv_udp_tunneled_packet, tcp_ctlinput_viaudp, NULL);
 	if (ret != 0)
 		goto fail;
 	/* The socket exists; bind it to the tunneling port. */
 	memset(&sin, 0, sizeof(sin));
 	sin.sin_len = sizeof(sin);
 	sin.sin_family = AF_INET;
 	sin.sin_port = htons(port);
 	ret = sobind(V_udp4_tun_socket, (struct sockaddr *)&sin, curthread);
 	if (ret != 0)
 		goto fail;
 #endif
 #ifdef INET6
 	ret = socreate(PF_INET6, &V_udp6_tun_socket, SOCK_DGRAM, IPPROTO_UDP,
 	    curthread->td_ucred, curthread);
 	if (ret != 0)
 		goto fail;
 	/* Install the special UDP hook. */
 	ret = udp_set_kernel_tunneling(V_udp6_tun_socket,
 	    tcp_recv_udp_tunneled_packet, tcp6_ctlinput_viaudp, NULL);
 	if (ret != 0)
 		goto fail;
 	/* The socket exists; bind it to the tunneling port. */
 	memset(&sin6, 0, sizeof(sin6));
 	sin6.sin6_len = sizeof(sin6);
 	sin6.sin6_family = AF_INET6;
 	sin6.sin6_port = htons(port);
 	ret = sobind(V_udp6_tun_socket, (struct sockaddr *)&sin6, curthread);
 	if (ret != 0)
 		goto fail;
 #endif
 	return (0);
 
 fail:
 	/* Undo whatever was set up, for both address families. */
 	tcp_over_udp_stop();
 	return (ret);
 }
 
 static int
 sysctl_net_inet_tcp_udp_tunneling_port_check(SYSCTL_HANDLER_ARGS)
 {
 	int error;
 	uint32_t old, new;
 
 	old = V_tcp_udp_tunneling_port;
 	new = old;
 	error = sysctl_handle_int(oidp, &new, 0, req);
 	if ((error == 0) &&
 	    (req->newptr != NULL)) {
 		if ((new < TCP_TUNNELING_PORT_MIN) ||
 		    (new > TCP_TUNNELING_PORT_MAX)) {
 			error = EINVAL;
 		} else {
 			V_tcp_udp_tunneling_port = new;
 			if (old != 0) {
 				tcp_over_udp_stop();
 			}
 			if (new != 0) {
 				error = tcp_over_udp_start();
 			}
 		}
 	}
 	return (error);
 }
 
 /*
  * net.inet.tcp.udp_tunneling_port: per-vnet read/write integer sysctl
  * served by sysctl_net_inet_tcp_udp_tunneling_port_check().
  */
 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, udp_tunneling_port,
     CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
     &VNET_NAME(tcp_udp_tunneling_port),
     0, &sysctl_net_inet_tcp_udp_tunneling_port_check, "IU",
     "Tunneling port for tcp over udp");
 
 /* Per-vnet MSS reduction applied when TCP is tunneled over UDP. */
 VNET_DEFINE(int, tcp_udp_tunneling_overhead) = TCP_TUNNELING_OVERHEAD_DEFAULT;
 
 static int
 sysctl_net_inet_tcp_udp_tunneling_overhead_check(SYSCTL_HANDLER_ARGS)
 {
 	int error, new;
 
 	new = V_tcp_udp_tunneling_overhead;
 	error = sysctl_handle_int(oidp, &new, 0, req);
 	if (error == 0 && req->newptr) {
 		if ((new < TCP_TUNNELING_OVERHEAD_MIN) ||
 		    (new > TCP_TUNNELING_OVERHEAD_MAX))
 			error = EINVAL;
 		else
 			V_tcp_udp_tunneling_overhead = new;
 	}
 	return (error);
 }
 
 /*
  * net.inet.tcp.udp_tunneling_overhead: per-vnet read/write integer sysctl
  * served by sysctl_net_inet_tcp_udp_tunneling_overhead_check().
  */
 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, udp_tunneling_overhead,
     CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
     &VNET_NAME(tcp_udp_tunneling_overhead),
     0, &sysctl_net_inet_tcp_udp_tunneling_overhead_check, "IU",
     "MSS reduction when using tcp over udp");
 
 /*
  * Exports one (struct tcp_function_info) for each alias/name.
  *
  * Read-only: a write request is rejected with EINVAL.  When the caller
  * supplies no output buffer, only a size estimate is reported.  Otherwise
  * one record per registered name is copied out while tcp_function_lock is
  * held for reading.
  *
  * Returns 0 on success, or the error from sysctl_wire_old_buffer() or
  * from the last SYSCTL_OUT() call.
  */
 static int
 sysctl_net_inet_list_func_info(SYSCTL_HANDLER_ARGS)
 {
 	int cnt, error;
 	struct tcp_function *f;
 	struct tcp_function_info tfi;
 
 	/*
 	 * We don't allow writes.
 	 */
 	if (req->newptr != NULL)
 		return (EINVAL);
 
 	/*
 	 * Wire the old buffer so we can directly copy the functions to
 	 * user space without dropping the lock.
 	 */
 	if (req->oldptr != NULL) {
 		error = sysctl_wire_old_buffer(req, 0);
 		if (error)
 			return (error);
 	}
 
 	/*
 	 * Walk the list and copy out matching entries. If INVARIANTS
 	 * is compiled in, also walk the list to verify the length of
 	 * the list matches what we have recorded.
 	 */
 	rw_rlock(&tcp_function_lock);
 
 	cnt = 0;
 #ifndef INVARIANTS
 	/* Size-only query: use the recorded count and skip the walk. */
 	if (req->oldptr == NULL) {
 		cnt = tcp_fb_cnt;
 		goto skip_loop;
 	}
 #endif
 	TAILQ_FOREACH(f, &t_functions, tf_next) {
 #ifdef INVARIANTS
 		cnt++;
 #endif
 		if (req->oldptr != NULL) {
 			/* Zero first so no stale stack bytes are exported. */
 			bzero(&tfi, sizeof(tfi));
 			tfi.tfi_refcnt = f->tf_fb->tfb_refcnt;
 			tfi.tfi_id = f->tf_fb->tfb_id;
 			(void)strlcpy(tfi.tfi_alias, f->tf_name,
 			    sizeof(tfi.tfi_alias));
 			(void)strlcpy(tfi.tfi_name,
 			    f->tf_fb->tfb_tcp_block_name, sizeof(tfi.tfi_name));
 			error = SYSCTL_OUT(req, &tfi, sizeof(tfi));
 			/*
 			 * Don't stop on error, as that is the
 			 * mechanism we use to accumulate length
 			 * information if the buffer was too short.
 			 */
 		}
 	}
 	KASSERT(cnt == tcp_fb_cnt,
 	    ("%s: cnt (%d) != tcp_fb_cnt (%d)", __func__, cnt, tcp_fb_cnt));
 #ifndef INVARIANTS
 skip_loop:
 #endif
 	rw_runlock(&tcp_function_lock);
 	/* Size-only query: report room for the entries plus one spare. */
 	if (req->oldptr == NULL)
 		error = SYSCTL_OUT(req, NULL,
 		    (cnt + 1) * sizeof(struct tcp_function_info));
 
 	return (error);
 }
 
 /*
  * net.inet.tcp.function_info: read-only opaque sysctl served by
  * sysctl_net_inet_list_func_info(); registered with CTLFLAG_SKIP.
  */
 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, function_info,
 	    CTLTYPE_OPAQUE | CTLFLAG_SKIP | CTLFLAG_RD | CTLFLAG_MPSAFE,
 	    NULL, 0, sysctl_net_inet_list_func_info, "S,tcp_function_info",
 	    "List TCP function block name-to-ID mappings");
 
 /*
  * tfb_tcp_handoff_ok() function for the default stack.
  * Note that we'll basically try to take all comers.
  *
  * The tp argument is not examined; the function always returns 0.
  */
 static int
 tcp_default_handoff_ok(struct tcpcb *tp)
 {
 
 	return (0);
 }
 
 /*
  * tfb_tcp_fb_init() function for the default stack.
  *
  * This handles making sure we have appropriate timers set if you are
  * transitioning a socket that has some amount of setup done.
  *
  * The init() function from the default can *never* return non-zero i.e.
  * it is required to always succeed since it is the stack of last resort!
  *
  * The caller must hold the inpcb write lock (asserted below).
  */
 static int
 tcp_default_fb_init(struct tcpcb *tp)
 {
 
 	struct socket *so;
 
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 
 	KASSERT(tp->t_state >= 0 && tp->t_state < TCPS_TIME_WAIT,
 	    ("%s: connection %p in unexpected state %d", __func__, tp,
 	    tp->t_state));
 
 	/*
 	 * Nothing to do for states at or below LISTEN. And, we don't
 	 * know what to do for unexpected states (which includes TIME_WAIT);
 	 * the upper-bound test only matters when the KASSERT above is
 	 * compiled out.
 	 */
 	if (tp->t_state <= TCPS_LISTEN || tp->t_state >= TCPS_TIME_WAIT)
 		return (0);
 
 	/*
 	 * Make sure some kind of transmission timer is set if there is
 	 * outstanding data.
 	 */
 	so = tp->t_inpcb->inp_socket;
 	if ((!TCPS_HAVEESTABLISHED(tp->t_state) || sbavail(&so->so_snd) ||
 	    tp->snd_una != tp->snd_max) && !(tcp_timer_active(tp, TT_REXMT) ||
 	    tcp_timer_active(tp, TT_PERSIST))) {
 		/*
 		 * If the session has established and it looks like it should
 		 * be in the persist state, set the persist timer. Otherwise,
 		 * set the retransmit timer.
 		 */
 		if (TCPS_HAVEESTABLISHED(tp->t_state) && tp->snd_wnd == 0 &&
 		    (int32_t)(tp->snd_nxt - tp->snd_una) <
 		    (int32_t)sbavail(&so->so_snd))
 			tcp_setpersist(tp);
 		else
 			tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur);
 	}
 
 	/*
 	 * Arm the keepalive timer if it is not already running: idle
 	 * interval once established, init interval before that.
 	 */
 	if (!tcp_timer_active(tp, TT_KEEP))
 		tcp_timer_activate(tp, TT_KEEP,
 		    TCPS_HAVEESTABLISHED(tp->t_state) ? TP_KEEPIDLE(tp) :
 		    TP_KEEPINIT(tp));
 
 	/*
 	 * Make sure critical variables are initialized
 	 * if transitioning while in Recovery.
 	 *
 	 * The condition is parenthesized explicitly rather than relying on
 	 * the parentheses that the macro expansion happens to provide.
 	 */
 	if (IN_FASTRECOVERY(tp->t_flags)) {
 		if (tp->sackhint.recover_fs == 0)
 			tp->sackhint.recover_fs = max(1,
 			    tp->snd_nxt - tp->snd_una);
 	}
 
 	return (0);
 }
 
 /*
  * tfb_tcp_fb_fini() function for the default stack.
  *
  * Hook invoked when another stack is about to take over the connection.
  * The default stack has nothing to hand over, so this only checks that
  * the inpcb write lock is held; tcb_is_purged is ignored.
  */
 static void
 tcp_default_fb_fini(struct tcpcb *tp, int tcb_is_purged)
 {
 
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 }
 
 /*
  * Target size of TCP PCB hash tables. Must be a power of two.
  *
  * Note that this can be overridden by the kernel environment
  * variable net.inet.tcp.tcbhashsize
  *
  * The default of 0 makes tcp_init() auto-tune the size from maxsockets.
  */
 #ifndef TCBHASHSIZE
 #define TCBHASHSIZE	0
 #endif
 
 /*
  * Allocation unit of the tcpcb zone (V_tcpcb_zone is created with
  * sizeof(struct tcpcb_mem)): the tcpcb itself plus companion structures
  * that are allocated together with it.
  *
  * XXX
  * Callouts should be moved into struct tcp directly.  They are currently
  * separate because the tcpcb structure is exported to userland for sysctl
  * parsing purposes, which do not know about callouts.
  */
 struct tcpcb_mem {
 	struct	tcpcb		tcb;
 	struct	tcp_timer	tt;
 	struct	cc_var		ccv;
 #ifdef TCP_HHOOK
 	struct	osd		osd;
 #endif
 };
 
 /* Per-vnet UMA zone for tcpcb allocations; created in tcp_vnet_init(). */
 VNET_DEFINE_STATIC(uma_zone_t, tcpcb_zone);
 #define	V_tcpcb_zone			VNET(tcpcb_zone)
 
 /* malloc(9) types used by the TCP code. */
 MALLOC_DEFINE(M_TCPLOG, "tcplog", "TCP address and flags print buffers");
 MALLOC_DEFINE(M_TCPFUNCTIONS, "tcpfunc", "TCP function set memory");
 
 /* Mutex behind the ISN_LOCK*() macros; initialized in tcp_init(). */
 static struct mtx isn_mtx;
 
 #define	ISN_LOCK_INIT()	mtx_init(&isn_mtx, "isn_mtx", NULL, MTX_DEF)
 #define	ISN_LOCK()	mtx_lock(&isn_mtx)
 #define	ISN_UNLOCK()	mtx_unlock(&isn_mtx)
 
 /* inpcb storage descriptor handed to in_pcbinfo_init() for V_tcbinfo. */
 INPCBSTORAGE_DEFINE(tcpcbstor, "tcpinp", "tcp_inpcb", "tcp", "tcphash");
 
 /*
  * Take a value and get the next power of 2 that doesn't overflow.
  * Used to size the tcp_inpcb hash buckets.
  *
  * Returns the smallest power of two strictly greater than size, capped
  * at the largest power of two an int can hold.  A size of zero or less
  * yields 1.
  */
 static int
 maketcp_hashsize(int size)
 {
 	/* Largest shift for which (1 << n) is still a positive int. */
 	const int maxshift = (int)(sizeof(int) * 8) - 2;
 	int shift;
 
 	if (size <= 0)
 		return (1);
 
 	/*
 	 * auto tune.
 	 * get the next power of 2 higher than maxsockets.
 	 */
 	shift = fls(size);
 	/*
 	 * Clamp before shifting: shifting a 1 into or past the sign bit of
 	 * an int is undefined, so go one power of 2 smaller instead of
 	 * detecting the overflow afterwards.
 	 */
 	if (shift > maxshift)
 		shift = maxshift;
 	return (1 << shift);
 }
 
 /*
  * Next function-block ID to hand out; bumped atomically in
  * register_tcp_functions_as_names().
  */
 static volatile int next_tcp_stack_id = 1;
 
 /*
  * Register a TCP function block with the name provided in the names
  * array.  (Note that this function does NOT automatically register
  * blk->tfb_tcp_block_name as a stack name.  Therefore, you should
  * explicitly include blk->tfb_tcp_block_name in the list of names if
  * you wish to register the stack with that name.)
  *
  * Either all name registrations will succeed or all will fail.  If
  * a name registration fails, the function will update the num_names
  * argument to point to the array index of the name that encountered
  * the failure.
  *
  * The wait argument is passed through to malloc() as its flags.
  *
  * Returns 0 on success, or an error code on failure: EINVAL if the block
  * lacks a required method or name, defines only some of the timer
  * methods, or is being removed (num_names is set to 0 in these cases);
  * ENOMEM if an allocation fails; EALREADY if a name is already registered.
  */
 int
 register_tcp_functions_as_names(struct tcp_function_block *blk, int wait,
     const char *names[], int *num_names)
 {
 	struct tcp_function *n;
 	struct tcp_function_set fs;
 	int error, i;
 
 	KASSERT(names != NULL && *num_names > 0,
 	    ("%s: Called with 0-length name list", __func__));
 	KASSERT(names != NULL, ("%s: Called with NULL name list", __func__));
 	KASSERT(rw_initialized(&tcp_function_lock),
 	    ("%s: called too early", __func__));
 
 	if ((blk->tfb_tcp_output == NULL) ||
 	    (blk->tfb_tcp_do_segment == NULL) ||
 	    (blk->tfb_tcp_ctloutput == NULL) ||
 	    (strlen(blk->tfb_tcp_block_name) == 0)) {
 		/*
 		 * These functions are required and you
 		 * need a name.
 		 */
 		*num_names = 0;
 		return (EINVAL);
 	}
 	if (blk->tfb_tcp_timer_stop_all ||
 	    blk->tfb_tcp_timer_activate ||
 	    blk->tfb_tcp_timer_active ||
 	    blk->tfb_tcp_timer_stop) {
 		/*
 		 * If you define one timer function you
 		 * must have them all.
 		 */
 		if ((blk->tfb_tcp_timer_stop_all == NULL) ||
 		    (blk->tfb_tcp_timer_activate == NULL) ||
 		    (blk->tfb_tcp_timer_active == NULL) ||
 		    (blk->tfb_tcp_timer_stop == NULL)) {
 			*num_names = 0;
 			return (EINVAL);
 		}
 	}
 
 	/* A block that is on its way out cannot be registered again. */
 	if (blk->tfb_flags & TCP_FUNC_BEING_REMOVED) {
 		*num_names = 0;
 		return (EINVAL);
 	}
 
 	/* Start with no users and a fresh ID. */
 	refcount_init(&blk->tfb_refcnt, 0);
 	blk->tfb_id = atomic_fetchadd_int(&next_tcp_stack_id, 1);
 	for (i = 0; i < *num_names; i++) {
 		/* Allocate before taking the lock. */
 		n = malloc(sizeof(struct tcp_function), M_TCPFUNCTIONS, wait);
 		if (n == NULL) {
 			error = ENOMEM;
 			goto cleanup;
 		}
 		n->tf_fb = blk;
 
 		(void)strlcpy(fs.function_set_name, names[i],
 		    sizeof(fs.function_set_name));
 		rw_wlock(&tcp_function_lock);
 		if (find_tcp_functions_locked(&fs) != NULL) {
 			/* Duplicate name space not allowed */
 			rw_wunlock(&tcp_function_lock);
 			free(n, M_TCPFUNCTIONS);
 			error = EALREADY;
 			goto cleanup;
 		}
 		(void)strlcpy(n->tf_name, names[i], sizeof(n->tf_name));
 		TAILQ_INSERT_TAIL(&t_functions, n, tf_next);
 		tcp_fb_cnt++;
 		rw_wunlock(&tcp_function_lock);
 	}
 	return(0);
 
 cleanup:
 	/*
 	 * Deregister the names we just added. Because registration failed
 	 * for names[i], we don't need to deregister that name.
 	 */
 	*num_names = i;
 	rw_wlock(&tcp_function_lock);
 	while (--i >= 0) {
 		/* Find the entry for names[i] by name and unlink it. */
 		TAILQ_FOREACH(n, &t_functions, tf_next) {
 			if (!strncmp(n->tf_name, names[i],
 			    TCP_FUNCTION_NAME_LEN_MAX)) {
 				TAILQ_REMOVE(&t_functions, n, tf_next);
 				tcp_fb_cnt--;
 				n->tf_fb = NULL;
 				free(n, M_TCPFUNCTIONS);
 				break;
 			}
 		}
 	}
 	rw_wunlock(&tcp_function_lock);
 	return (error);
 }
 
 /*
  * Register a TCP function block under a single name.
  *
  * If name is NULL the block's own tfb_tcp_block_name is used.  The wait
  * argument is forwarded unchanged.
  *
  * Returns 0 on success, or an error code on failure.
  */
 int
 register_tcp_functions_as_name(struct tcp_function_block *blk, const char *name,
     int wait)
 {
 	const char *name_list[1];
 	int num_names = 1;
 
 	name_list[0] = (name != NULL) ? name : blk->tfb_tcp_block_name;
 	return (register_tcp_functions_as_names(blk, wait, name_list,
 	    &num_names));
 }
 
 /*
  * Register a TCP function block using the name defined in
  * blk->tfb_tcp_block_name.
  *
  * Thin wrapper: passes a NULL name to register_tcp_functions_as_name(),
  * which then falls back to the block's own name.
  *
  * Returns 0 on success, or an error code on failure.
  */
 int
 register_tcp_functions(struct tcp_function_block *blk, int wait)
 {
 
 	return (register_tcp_functions_as_name(blk, NULL, wait));
 }
 
 /*
  * Deregister all names associated with a function block. This
  * functionally removes the function block from use within the system.
  *
  * When called with a true quiesce argument, mark the function block
  * as being removed so no more stacks will use it and determine
  * whether the removal would succeed.
  *
  * When called with a false quiesce argument, actually attempt the
  * removal.
  *
  * When called with a force argument, attempt to switch all TCBs to
  * use the default stack instead of returning EBUSY.
  *
  * Returns 0 on success (or if the removal would succeed), or an error
  * code on failure: EPERM for the built-in default block, EBUSY if the
  * block is the current default or still has TCBs attached.
  */
 int
 deregister_tcp_functions(struct tcp_function_block *blk, bool quiesce,
     bool force)
 {
 	struct tcp_function *f;
 
 	if (blk == &tcp_def_funcblk) {
 		/* You can't un-register the default */
 		return (EPERM);
 	}
 	rw_wlock(&tcp_function_lock);
 	if (blk == tcp_func_set_ptr) {
 		/* You can't free the current default */
 		rw_wunlock(&tcp_function_lock);
 		return (EBUSY);
 	}
 	/* Mark the block so no more stacks can use it. */
 	blk->tfb_flags |= TCP_FUNC_BEING_REMOVED;
 	/*
 	 * If TCBs are still attached to the stack, attempt to switch them
 	 * to the default stack.
 	 */
 	if (force && blk->tfb_refcnt) {
 		/*
 		 * NOTE(review): inpi is initialized once, before the
 		 * VNET_FOREACH loop and before CURVNET_SET(), so it appears
 		 * to refer only to the V_tcbinfo of the vnet that was
 		 * current on entry -- confirm that every vnet's PCB list is
 		 * really walked.
 		 */
 		struct inpcb_iterator inpi = INP_ALL_ITERATOR(&V_tcbinfo,
 		    INPLOOKUP_WLOCKPCB);
 		struct inpcb *inp;
 		struct tcpcb *tp;
 		VNET_ITERATOR_DECL(vnet_iter);
 
 		/* The function lock is dropped while the PCBs are walked. */
 		rw_wunlock(&tcp_function_lock);
 
 		VNET_LIST_RLOCK();
 		VNET_FOREACH(vnet_iter) {
 			CURVNET_SET(vnet_iter);
 			while ((inp = inp_next(&inpi)) != NULL) {
-				if (inp->inp_flags & INP_TIMEWAIT)
-					continue;
 				tp = intotcpcb(inp);
 				/* Only connections using this block matter. */
 				if (tp == NULL || tp->t_fb != blk)
 					continue;
 				tcp_switch_back_to_default(tp);
 			}
 			CURVNET_RESTORE();
 		}
 		VNET_LIST_RUNLOCK();
 
 		rw_wlock(&tcp_function_lock);
 	}
 	if (blk->tfb_refcnt) {
 		/* TCBs still attached. */
 		rw_wunlock(&tcp_function_lock);
 		return (EBUSY);
 	}
 	if (quiesce) {
 		/* Skip removal. */
 		rw_wunlock(&tcp_function_lock);
 		return (0);
 	}
 	/* Remove any function names that map to this function block. */
 	while (find_tcp_fb_locked(blk, &f) != NULL) {
 		TAILQ_REMOVE(&t_functions, f, tf_next);
 		tcp_fb_cnt--;
 		f->tf_fb = NULL;
 		free(f, M_TCPFUNCTIONS);
 	}
 	rw_wunlock(&tcp_function_lock);
 	return (0);
 }
 
 /*
  * Low-memory handler (registered for vm_lowmem and mbuf_lowmem in
  * tcp_init()).  Does nothing unless do_tcpdrain is set; otherwise walks
  * the TCP PCBs of every vnet and releases per-connection memory that can
  * be given up.
  */
 static void
 tcp_drain(void)
 {
 	struct epoch_tracker et;
 	VNET_ITERATOR_DECL(vnet_iter);
 
 	if (!do_tcpdrain)
 		return;
 
 	NET_EPOCH_ENTER(et);
 	VNET_LIST_RLOCK_NOSLEEP();
 	VNET_FOREACH(vnet_iter) {
 		CURVNET_SET(vnet_iter);
 		/* Fresh iterator for each vnet's PCB list. */
 		struct inpcb_iterator inpi = INP_ALL_ITERATOR(&V_tcbinfo,
 		    INPLOOKUP_WLOCKPCB);
 		struct inpcb *inpb;
 		struct tcpcb *tcpb;
 
 	/*
 	 * Walk the tcpbs, if existing, and flush the reassembly queue,
 	 * if there is one...
 	 * XXX: The "Net/3" implementation doesn't imply that the TCP
 	 *      reassembly queue should be flushed, but in a situation
 	 *	where we're really low on mbufs, this is potentially
 	 *	useful.
 	 */
 		while ((inpb = inp_next(&inpi)) != NULL) {
-			if (inpb->inp_flags & INP_TIMEWAIT)
-				continue;
 			if ((tcpb = intotcpcb(inpb)) != NULL) {
 				tcp_reass_flush(tcpb);
 				tcp_clean_sackreport(tcpb);
 #ifdef TCP_BLACKBOX
 				tcp_log_drain(tcpb);
 #endif
 #ifdef TCPPCAP
 				if (tcp_pcap_aggressive_free) {
 					/* Free the TCP PCAP queues. */
 					tcp_pcap_drain(&(tcpb->t_inpkts));
 					tcp_pcap_drain(&(tcpb->t_outpkts));
 				}
 #endif
 			}
 		}
 		CURVNET_RESTORE();
 	}
 	VNET_LIST_RUNLOCK_NOSLEEP();
 	NET_EPOCH_EXIT(et);
 }
 
 /*
  * Per-vnet TCP initialization, run through VNET_SYSINIT below: registers
  * the helper hooks and stats (when compiled in), sets up the PCB info
  * with the hash size chosen by tcp_init(), creates the tcpcb and sackhole
  * zones, and initializes the syncache, host cache, fastopen and the
  * per-vnet counters.
  */
 static void
 tcp_vnet_init(void *arg __unused)
 {
 
 #ifdef TCP_HHOOK
 	/* Failure to register a hook is only reported, not fatal. */
 	if (hhook_head_register(HHOOK_TYPE_TCP, HHOOK_TCP_EST_IN,
 	    &V_tcp_hhh[HHOOK_TCP_EST_IN], HHOOK_NOWAIT|HHOOK_HEADISINVNET) != 0)
 		printf("%s: WARNING: unable to register helper hook\n", __func__);
 	if (hhook_head_register(HHOOK_TYPE_TCP, HHOOK_TCP_EST_OUT,
 	    &V_tcp_hhh[HHOOK_TCP_EST_OUT], HHOOK_NOWAIT|HHOOK_HEADISINVNET) != 0)
 		printf("%s: WARNING: unable to register helper hook\n", __func__);
 #endif
 #ifdef STATS
 	if (tcp_stats_init())
 		printf("%s: WARNING: unable to initialise TCP stats\n",
 		    __func__);
 #endif
 	in_pcbinfo_init(&V_tcbinfo, &tcpcbstor, tcp_tcbhashsize,
 	    tcp_tcbhashsize);
 
 	/*
 	 * These have to be type stable for the benefit of the timers.
 	 */
 	V_tcpcb_zone = uma_zcreate("tcpcb", sizeof(struct tcpcb_mem),
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
 	/* Cap the zone at maxsockets and warn when the cap is hit. */
 	uma_zone_set_max(V_tcpcb_zone, maxsockets);
 	uma_zone_set_warning(V_tcpcb_zone, "kern.ipc.maxsockets limit reached");
 
 	syncache_init();
 	tcp_hc_init();
 
 	TUNABLE_INT_FETCH("net.inet.tcp.sack.enable", &V_tcp_do_sack);
 	V_sack_hole_zone = uma_zcreate("sackhole", sizeof(struct sackhole),
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
 
 	tcp_fastopen_init();
 
 	COUNTER_ARRAY_ALLOC(V_tcps_states, TCP_NSTATES, M_WAITOK);
 	VNET_PCPUSTAT_ALLOC(tcpstat, M_WAITOK);
 
 	V_tcp_msl = TCPTV_MSL;
 }
 VNET_SYSINIT(tcp_vnet_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_FOURTH,
     tcp_vnet_init, NULL);
 
 /*
  * Global (not per-vnet) TCP initialization, run through SYSINIT below:
  * sets the timer defaults, creates the function-block list and registers
  * the default stack, hooks up event handlers, allocates the LRO counters,
  * chooses the PCB hash size and finally registers the protocol input
  * routines.
  */
 static void
 tcp_init(void *arg __unused)
 {
 	const char *tcbhash_tuneable;
 	int hashsize;
 
 	tcp_reass_global_init();
 
 	/* XXX virtualize those below? */
 	tcp_delacktime = TCPTV_DELACK;
 	tcp_keepinit = TCPTV_KEEP_INIT;
 	tcp_keepidle = TCPTV_KEEP_IDLE;
 	tcp_keepintvl = TCPTV_KEEPINTVL;
 	tcp_maxpersistidle = TCPTV_KEEP_IDLE;
 	tcp_rexmit_initial = TCPTV_RTOBASE;
 	/* Never let the retransmit timers drop below 1. */
 	if (tcp_rexmit_initial < 1)
 		tcp_rexmit_initial = 1;
 	tcp_rexmit_min = TCPTV_MIN;
 	if (tcp_rexmit_min < 1)
 		tcp_rexmit_min = 1;
 	tcp_persmin = TCPTV_PERSMIN;
 	tcp_persmax = TCPTV_PERSMAX;
 	tcp_rexmit_slop = TCPTV_CPU_VAR;
 	tcp_finwait2_timeout = TCPTV_FINWAIT2_TIMEOUT;
 
 	/* Setup the tcp function block list */
 	TAILQ_INIT(&t_functions);
 	rw_init(&tcp_function_lock, "tcp_func_lock");
 	register_tcp_functions(&tcp_def_funcblk, M_WAITOK);
 #ifdef TCP_BLACKBOX
 	/* Initialize the TCP logging data. */
 	tcp_log_init();
 #endif
 	arc4rand(&V_ts_offset_secret, sizeof(V_ts_offset_secret), 0);
 
 	/* Optionally switch the receive path to soreceive_stream(). */
 	if (tcp_soreceive_stream) {
 #ifdef INET
 		tcp_protosw.pr_soreceive = soreceive_stream;
 #endif
 #ifdef INET6
 		tcp6_protosw.pr_soreceive = soreceive_stream;
 #endif /* INET6 */
 	}
 
 #ifdef INET6
 	max_protohdr_grow(sizeof(struct ip6_hdr) + sizeof(struct tcphdr));
 #else /* INET6 */
 	max_protohdr_grow(sizeof(struct tcpiphdr));
 #endif /* INET6 */
 
 	ISN_LOCK_INIT();
 	EVENTHANDLER_REGISTER(shutdown_pre_sync, tcp_fini, NULL,
 		SHUTDOWN_PRI_DEFAULT);
 	EVENTHANDLER_REGISTER(vm_lowmem, tcp_drain, NULL, LOWMEM_PRI_DEFAULT);
 	EVENTHANDLER_REGISTER(mbuf_lowmem, tcp_drain, NULL, LOWMEM_PRI_DEFAULT);
 
 	tcp_inp_lro_direct_queue = counter_u64_alloc(M_WAITOK);
 	tcp_inp_lro_wokeup_queue = counter_u64_alloc(M_WAITOK);
 	tcp_inp_lro_compressed = counter_u64_alloc(M_WAITOK);
 	tcp_inp_lro_locks_taken = counter_u64_alloc(M_WAITOK);
 	tcp_extra_mbuf = counter_u64_alloc(M_WAITOK);
 	tcp_would_have_but = counter_u64_alloc(M_WAITOK);
 	tcp_comp_total = counter_u64_alloc(M_WAITOK);
 	tcp_uncomp_total = counter_u64_alloc(M_WAITOK);
 	tcp_bad_csums = counter_u64_alloc(M_WAITOK);
 #ifdef TCPPCAP
 	tcp_pcap_init();
 #endif
 
 	/* Compile-time default, possibly overridden by the tunable. */
 	hashsize = TCBHASHSIZE;
 	tcbhash_tuneable = "net.inet.tcp.tcbhashsize";
 	TUNABLE_INT_FETCH(tcbhash_tuneable, &hashsize);
 	if (hashsize == 0) {
 		/*
 		 * Auto tune the hash size based on maxsockets.
 		 * A perfect hash would have a 1:1 mapping
 		 * (hashsize = maxsockets) however it's been
 		 * suggested that O(2) average is better.
 		 */
 		hashsize = maketcp_hashsize(maxsockets / 4);
 		/*
 		 * Our historical default is 512,
 		 * do not autotune lower than this.
 		 */
 		if (hashsize < 512)
 			hashsize = 512;
 		if (bootverbose)
 			printf("%s: %s auto tuned to %d\n", __func__,
 			    tcbhash_tuneable, hashsize);
 	}
 	/*
 	 * We require a hashsize to be a power of two.
 	 * Previously if it was not a power of two we would just reset it
 	 * back to 512, which could be a nasty surprise if you did not notice
 	 * the error message.
 	 * Instead what we do is clip it to the closest power of two lower
 	 * than the specified hash value.
 	 *
 	 * NOTE(review): maketcp_hashsize() computes 1 << fls(size), which
 	 * for a non-power-of-two input is the next HIGHER power of two, not
 	 * a lower one -- confirm which behavior is intended.
 	 */
 	if (!powerof2(hashsize)) {
 		int oldhashsize = hashsize;
 
 		hashsize = maketcp_hashsize(hashsize);
 		/* prevent absurdly low value */
 		if (hashsize < 16)
 			hashsize = 16;
 		printf("%s: WARNING: TCB hash size not a power of 2, "
 		    "clipped from %d to %d.\n", __func__, oldhashsize,
 		    hashsize);
 	}
 	tcp_tcbhashsize = hashsize;
 
 #ifdef INET
 	IPPROTO_REGISTER(IPPROTO_TCP, tcp_input, tcp_ctlinput);
 #endif
 #ifdef INET6
 	IP6PROTO_REGISTER(IPPROTO_TCP, tcp6_input, tcp6_ctlinput);
 #endif
 }
 SYSINIT(tcp_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, tcp_init, NULL);
 
 #ifdef VIMAGE
 /*
  * Per-vnet TCP teardown, run through VNET_SYSUNINIT below.  Waits until
  * the vnet's PCB count has dropped to zero, then destroys the host cache,
  * syncache, PCB info, zones, fastopen state, counters and (when compiled
  * in) the helper hooks.
  */
 static void
 tcp_destroy(void *unused __unused)
 {
 	int n;
 #ifdef TCP_HHOOK
 	int error;
 #endif
 
 	/*
 	 * All our processes are gone, all our sockets should be cleaned
 	 * up, which means, we should be past the tcp_discardcb() calls.
 	 * Sleep to let all tcpcb timers really disappear and cleanup.
 	 */
 	for (;;) {
 		/* Poll the PCB count, sleeping hz / 10 ticks between checks. */
 		INP_INFO_WLOCK(&V_tcbinfo);
 		n = V_tcbinfo.ipi_count;
 		INP_INFO_WUNLOCK(&V_tcbinfo);
 		if (n == 0)
 			break;
 		pause("tcpdes", hz / 10);
 	}
 	tcp_hc_destroy();
 	syncache_destroy();
 	in_pcbinfo_destroy(&V_tcbinfo);
 	/* tcp_discardcb() clears the sack_holes up. */
 	uma_zdestroy(V_sack_hole_zone);
 	uma_zdestroy(V_tcpcb_zone);
 
 	/*
 	 * Cannot free the zone until all tcpcbs are released as we attach
 	 * the allocations to them.
 	 *
 	 * NOTE(review): the zones are already destroyed above; presumably
 	 * this remark refers to state released by tcp_fastopen_destroy() --
 	 * confirm.
 	 */
 	tcp_fastopen_destroy();
 
 	COUNTER_ARRAY_FREE(V_tcps_states, TCP_NSTATES);
 	VNET_PCPUSTAT_FREE(tcpstat);
 
 #ifdef TCP_HHOOK
 	/* Failure to deregister a hook is only reported. */
 	error = hhook_head_deregister(V_tcp_hhh[HHOOK_TCP_EST_IN]);
 	if (error != 0) {
 		printf("%s: WARNING: unable to deregister helper hook "
 		    "type=%d, id=%d: error %d returned\n", __func__,
 		    HHOOK_TYPE_TCP, HHOOK_TCP_EST_IN, error);
 	}
 	error = hhook_head_deregister(V_tcp_hhh[HHOOK_TCP_EST_OUT]);
 	if (error != 0) {
 		printf("%s: WARNING: unable to deregister helper hook "
 		    "type=%d, id=%d: error %d returned\n", __func__,
 		    HHOOK_TYPE_TCP, HHOOK_TCP_EST_OUT, error);
 	}
 #endif
 }
 VNET_SYSUNINIT(tcp, SI_SUB_PROTO_DOMAIN, SI_ORDER_FOURTH, tcp_destroy, NULL);
 #endif
 
 /*
  * shutdown_pre_sync event handler (registered in tcp_init()).
  * Intentionally empty: nothing is done at shutdown.
  */
 void
 tcp_fini(void *xtp)
 {
 
 }
 
 /*
  * Write skeleton IP (v4 or v6) and TCP headers for an outgoing packet
  * from the connection's inpcb.
  *
  * ip_ptr and tcp_ptr point at the caller's storage for the IP and TCP
  * header respectively.  A non-zero port selects IPPROTO_UDP as the next
  * protocol (TCP tunneled over UDP); zero selects IPPROTO_TCP.  Sequence
  * numbers, flags, window and checksums are left zeroed for the caller to
  * fill in.
  *
  * tcp_template used to store this data in mbufs, but we now recopy it out
  * of the tcpcb each time to conserve mbufs.
  *
  * The inpcb write lock must be held (asserted).
  */
 void
 tcpip_fillheaders(struct inpcb *inp, uint16_t port, void *ip_ptr, void *tcp_ptr)
 {
 	struct tcphdr *th;
 	int proto;
 
 	INP_WLOCK_ASSERT(inp);
 
 	th = (struct tcphdr *)tcp_ptr;
 	/* Next-protocol value shared by the IPv4 and IPv6 paths. */
 	proto = (port == 0) ? IPPROTO_TCP : IPPROTO_UDP;
 
 #ifdef INET6
 	if ((inp->inp_vflag & INP_IPV6) != 0) {
 		struct ip6_hdr *ip6 = (struct ip6_hdr *)ip_ptr;
 
 		/* Flow label first, then the version on top of it. */
 		ip6->ip6_flow = (ip6->ip6_flow & ~IPV6_FLOWINFO_MASK) |
 		    (inp->inp_flow & IPV6_FLOWINFO_MASK);
 		ip6->ip6_vfc = (ip6->ip6_vfc & ~IPV6_VERSION_MASK) |
 		    (IPV6_VERSION & IPV6_VERSION_MASK);
 		ip6->ip6_nxt = proto;
 		ip6->ip6_plen = htons(sizeof(struct tcphdr));
 		ip6->ip6_src = inp->in6p_laddr;
 		ip6->ip6_dst = inp->in6p_faddr;
 	}
 #endif /* INET6 */
 #if defined(INET6) && defined(INET)
 	else
 #endif
 #ifdef INET
 	{
 		struct ip *ip = (struct ip *)ip_ptr;
 
 		ip->ip_v = IPVERSION;
 		ip->ip_hl = 5;
 		ip->ip_tos = inp->inp_ip_tos;
 		ip->ip_len = 0;
 		ip->ip_id = 0;
 		ip->ip_off = 0;
 		ip->ip_ttl = inp->inp_ip_ttl;
 		ip->ip_sum = 0;
 		ip->ip_p = proto;
 		ip->ip_src = inp->inp_laddr;
 		ip->ip_dst = inp->inp_faddr;
 	}
 #endif /* INET */
 
 	/* TCP header: ports from the pcb, everything else cleared. */
 	th->th_sport = inp->inp_lport;
 	th->th_dport = inp->inp_fport;
 	th->th_seq = 0;
 	th->th_ack = 0;
 	th->th_off = 5;
 	tcp_set_flags(th, 0);
 	th->th_win = 0;
 	th->th_urp = 0;
 	th->th_sum = 0;		/* in_pseudo() is called later for ipv4 */
 }
 
 /*
  * Build a header template for sending TCP packets on a connection.
  *
  * Allocates a struct tcptemp from M_TEMP without sleeping and fills in
  * skeletal IP and TCP headers via tcpip_fillheaders() with port 0.  The
  * only use for this function is in keepalives, which use tcp_respond.
  *
  * Returns the template, or NULL if the allocation fails.
  */
 struct tcptemp *
 tcpip_maketemplate(struct inpcb *inp)
 {
 	struct tcptemp *tmpl;
 
 	tmpl = malloc(sizeof(*tmpl), M_TEMP, M_NOWAIT);
 	if (tmpl != NULL)
 		tcpip_fillheaders(inp, 0, (void *)&tmpl->tt_ipgen,
 		    (void *)&tmpl->tt_t);
 	return (tmpl);
 }
 
 /*
  * Send a single message to the TCP at address specified by
  * the given TCP/IP header.  If m == NULL, then we make a copy
  * of the tcpiphdr at th and send directly to the addressed host.
  * This is used to force keep alive messages out using the TCP
  * template for a connection.  If flags are given then we send
  * a message back to the TCP which originated the segment th,
  * and discard the mbuf containing it and any other attached mbufs.
  *
  * In any case the ack and sequence number of the transmitted
  * segment are as specified by the parameters.
  *
  * NOTE: If m != NULL, then th must point to *inside* the mbuf.
  */
 void
 tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m,
     tcp_seq ack, tcp_seq seq, int flags)
 {
 	struct tcpopt to;
 	struct inpcb *inp;
 	struct ip *ip;
 	struct mbuf *optm;
 	struct udphdr *uh = NULL;
 	struct tcphdr *nth;
 	struct tcp_log_buffer *lgb;
 	u_char *optp;
 #ifdef INET6
 	struct ip6_hdr *ip6;
 	int isipv6;
 #endif /* INET6 */
 	int optlen, tlen, win, ulen;
 	bool incl_opts;
 	uint16_t port;
 	int output_ret;
 #ifdef INVARIANTS
 	int thflags = tcp_get_flags(th);
 #endif
 
 	KASSERT(tp != NULL || m != NULL, ("tcp_respond: tp and m both NULL"));
 	NET_EPOCH_ASSERT();
 
 #ifdef INET6
 	isipv6 = ((struct ip *)ipgen)->ip_v == (IPV6_VERSION >> 4);
 	ip6 = ipgen;
 #endif /* INET6 */
 	ip = ipgen;
 
 	if (tp != NULL) {
 		inp = tp->t_inpcb;
 		KASSERT(inp != NULL, ("tcp control block w/o inpcb"));
 		INP_LOCK_ASSERT(inp);
 	} else
 		inp = NULL;
 
 	if (m != NULL) {
 #ifdef INET6
 		if (isipv6 && ip6 && (ip6->ip6_nxt == IPPROTO_UDP))
 			port = m->m_pkthdr.tcp_tun_port;
 		else
 #endif
 		if (ip && (ip->ip_p == IPPROTO_UDP))
 			port = m->m_pkthdr.tcp_tun_port;
 		else
 			port = 0;
 	} else
 		port = tp->t_port;
 
 	incl_opts = false;
 	win = 0;
 	if (tp != NULL) {
 		if (!(flags & TH_RST)) {
 			win = sbspace(&inp->inp_socket->so_rcv);
 			if (win > TCP_MAXWIN << tp->rcv_scale)
 				win = TCP_MAXWIN << tp->rcv_scale;
 		}
 		if ((tp->t_flags & TF_NOOPT) == 0)
 			incl_opts = true;
 	}
 	if (m == NULL) {
 		m = m_gethdr(M_NOWAIT, MT_DATA);
 		if (m == NULL)
 			return;
 		m->m_data += max_linkhdr;
 #ifdef INET6
 		if (isipv6) {
 			bcopy((caddr_t)ip6, mtod(m, caddr_t),
 			      sizeof(struct ip6_hdr));
 			ip6 = mtod(m, struct ip6_hdr *);
 			nth = (struct tcphdr *)(ip6 + 1);
 			if (port) {
 				/* Insert a UDP header */
 				uh = (struct udphdr *)nth;
 				uh->uh_sport = htons(V_tcp_udp_tunneling_port);
 				uh->uh_dport = port;
 				nth = (struct tcphdr *)(uh + 1);
 			}
 		} else
 #endif /* INET6 */
 		{
 			bcopy((caddr_t)ip, mtod(m, caddr_t), sizeof(struct ip));
 			ip = mtod(m, struct ip *);
 			nth = (struct tcphdr *)(ip + 1);
 			if (port) {
 				/* Insert a UDP header */
 				uh = (struct udphdr *)nth;
 				uh->uh_sport = htons(V_tcp_udp_tunneling_port);
 				uh->uh_dport = port;
 				nth = (struct tcphdr *)(uh + 1);
 			}
 		}
 		bcopy((caddr_t)th, (caddr_t)nth, sizeof(struct tcphdr));
 		flags = TH_ACK;
 	} else if ((!M_WRITABLE(m)) || (port != 0)) {
 		struct mbuf *n;
 
 		/* Can't reuse 'm', allocate a new mbuf. */
 		n = m_gethdr(M_NOWAIT, MT_DATA);
 		if (n == NULL) {
 			m_freem(m);
 			return;
 		}
 
 		if (!m_dup_pkthdr(n, m, M_NOWAIT)) {
 			m_freem(m);
 			m_freem(n);
 			return;
 		}
 
 		n->m_data += max_linkhdr;
 		/* m_len is set later */
 #define xchg(a,b,type) { type t; t=a; a=b; b=t; }
 #ifdef INET6
 		if (isipv6) {
 			bcopy((caddr_t)ip6, mtod(n, caddr_t),
 			      sizeof(struct ip6_hdr));
 			ip6 = mtod(n, struct ip6_hdr *);
 			xchg(ip6->ip6_dst, ip6->ip6_src, struct in6_addr);
 			nth = (struct tcphdr *)(ip6 + 1);
 			if (port) {
 				/* Insert a UDP header */
 				uh = (struct udphdr *)nth;
 				uh->uh_sport = htons(V_tcp_udp_tunneling_port);
 				uh->uh_dport = port;
 				nth = (struct tcphdr *)(uh + 1);
 			}
 		} else
 #endif /* INET6 */
 		{
 			bcopy((caddr_t)ip, mtod(n, caddr_t), sizeof(struct ip));
 			ip = mtod(n, struct ip *);
 			xchg(ip->ip_dst.s_addr, ip->ip_src.s_addr, uint32_t);
 			nth = (struct tcphdr *)(ip + 1);
 			if (port) {
 				/* Insert a UDP header */
 				uh = (struct udphdr *)nth;
 				uh->uh_sport = htons(V_tcp_udp_tunneling_port);
 				uh->uh_dport = port;
 				nth = (struct tcphdr *)(uh + 1);
 			}
 		}
 		bcopy((caddr_t)th, (caddr_t)nth, sizeof(struct tcphdr));
 		xchg(nth->th_dport, nth->th_sport, uint16_t);
 		th = nth;
 		m_freem(m);
 		m = n;
 	} else {
 		/*
 		 *  reuse the mbuf.
 		 * XXX MRT We inherit the FIB, which is lucky.
 		 */
 		m_freem(m->m_next);
 		m->m_next = NULL;
 		m->m_data = (caddr_t)ipgen;
 		/* m_len is set later */
 #ifdef INET6
 		if (isipv6) {
 			xchg(ip6->ip6_dst, ip6->ip6_src, struct in6_addr);
 			nth = (struct tcphdr *)(ip6 + 1);
 		} else
 #endif /* INET6 */
 		{
 			xchg(ip->ip_dst.s_addr, ip->ip_src.s_addr, uint32_t);
 			nth = (struct tcphdr *)(ip + 1);
 		}
 		if (th != nth) {
 			/*
 			 * this is usually a case when an extension header
 			 * exists between the IPv6 header and the
 			 * TCP header.
 			 */
 			nth->th_sport = th->th_sport;
 			nth->th_dport = th->th_dport;
 		}
 		xchg(nth->th_dport, nth->th_sport, uint16_t);
 #undef xchg
 	}
 	tlen = 0;
 #ifdef INET6
 	if (isipv6)
 		tlen = sizeof (struct ip6_hdr) + sizeof (struct tcphdr);
 #endif
 #if defined(INET) && defined(INET6)
 	else
 #endif
 #ifdef INET
 		tlen = sizeof (struct tcpiphdr);
 #endif
 	if (port)
 		tlen += sizeof (struct udphdr);
 #ifdef INVARIANTS
 	m->m_len = 0;
 	KASSERT(M_TRAILINGSPACE(m) >= tlen,
 	    ("Not enough trailing space for message (m=%p, need=%d, have=%ld)",
 	    m, tlen, (long)M_TRAILINGSPACE(m)));
 #endif
 	m->m_len = tlen;
 	to.to_flags = 0;
 	if (incl_opts) {
 		/* Make sure we have room. */
 		if (M_TRAILINGSPACE(m) < TCP_MAXOLEN) {
 			m->m_next = m_get(M_NOWAIT, MT_DATA);
 			if (m->m_next) {
 				optp = mtod(m->m_next, u_char *);
 				optm = m->m_next;
 			} else
 				incl_opts = false;
 		} else {
 			optp = (u_char *) (nth + 1);
 			optm = m;
 		}
 	}
 	if (incl_opts) {
 		/* Timestamps. */
 		if (tp->t_flags & TF_RCVD_TSTMP) {
 			to.to_tsval = tcp_ts_getticks() + tp->ts_offset;
 			to.to_tsecr = tp->ts_recent;
 			to.to_flags |= TOF_TS;
 		}
 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
 		/* TCP-MD5 (RFC2385). */
 		if (tp->t_flags & TF_SIGNATURE)
 			to.to_flags |= TOF_SIGNATURE;
 #endif
 		/* Add the options. */
 		tlen += optlen = tcp_addoptions(&to, optp);
 
 		/* Update m_len in the correct mbuf. */
 		optm->m_len += optlen;
 	} else
 		optlen = 0;
 #ifdef INET6
 	if (isipv6) {
 		if (uh) {
 			ulen = tlen - sizeof(struct ip6_hdr);
 			uh->uh_ulen = htons(ulen);
 		}
 		ip6->ip6_flow = 0;
 		ip6->ip6_vfc = IPV6_VERSION;
 		if (port)
 			ip6->ip6_nxt = IPPROTO_UDP;
 		else
 			ip6->ip6_nxt = IPPROTO_TCP;
 		ip6->ip6_plen = htons(tlen - sizeof(*ip6));
 	}
 #endif
 #if defined(INET) && defined(INET6)
 	else
 #endif
 #ifdef INET
 	{
 		if (uh) {
 			ulen = tlen - sizeof(struct ip);
 			uh->uh_ulen = htons(ulen);
 		}
 		ip->ip_len = htons(tlen);
 		ip->ip_ttl = V_ip_defttl;
 		if (port) {
 			ip->ip_p = IPPROTO_UDP;
 		} else {
 			ip->ip_p = IPPROTO_TCP;
 		}
 		if (V_path_mtu_discovery)
 			ip->ip_off |= htons(IP_DF);
 	}
 #endif
 	m->m_pkthdr.len = tlen;
 	m->m_pkthdr.rcvif = NULL;
 #ifdef MAC
 	if (inp != NULL) {
 		/*
 		 * Packet is associated with a socket, so allow the
 		 * label of the response to reflect the socket label.
 		 */
 		INP_LOCK_ASSERT(inp);
 		mac_inpcb_create_mbuf(inp, m);
 	} else {
 		/*
 		 * Packet is not associated with a socket, so possibly
 		 * update the label in place.
 		 */
 		mac_netinet_tcp_reply(m);
 	}
 #endif
 	nth->th_seq = htonl(seq);
 	nth->th_ack = htonl(ack);
 	nth->th_off = (sizeof (struct tcphdr) + optlen) >> 2;
 	tcp_set_flags(nth, flags);
 	if (tp != NULL)
 		nth->th_win = htons((u_short) (win >> tp->rcv_scale));
 	else
 		nth->th_win = htons((u_short)win);
 	nth->th_urp = 0;
 
 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
 	if (to.to_flags & TOF_SIGNATURE) {
 		if (!TCPMD5_ENABLED() ||
 		    TCPMD5_OUTPUT(m, nth, to.to_signature) != 0) {
 			m_freem(m);
 			return;
 		}
 	}
 #endif
 
 #ifdef INET6
 	if (isipv6) {
 		if (port) {
 			m->m_pkthdr.csum_flags = CSUM_UDP_IPV6;
 			m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
 			uh->uh_sum = in6_cksum_pseudo(ip6, ulen, IPPROTO_UDP, 0);
 			nth->th_sum = 0;
 		} else {
 			m->m_pkthdr.csum_flags = CSUM_TCP_IPV6;
 			m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
 			nth->th_sum = in6_cksum_pseudo(ip6,
 			    tlen - sizeof(struct ip6_hdr), IPPROTO_TCP, 0);
 		}
 		ip6->ip6_hlim = in6_selecthlim(tp != NULL ? tp->t_inpcb :
 		    NULL, NULL);
 	}
 #endif /* INET6 */
 #if defined(INET6) && defined(INET)
 	else
 #endif
 #ifdef INET
 	{
 		if (port) {
 			uh->uh_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
 			    htons(ulen + IPPROTO_UDP));
 			m->m_pkthdr.csum_flags = CSUM_UDP;
 			m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
 			nth->th_sum = 0;
 		} else {
 			m->m_pkthdr.csum_flags = CSUM_TCP;
 			m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
 			nth->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
 			    htons((u_short)(tlen - sizeof(struct ip) + ip->ip_p)));
 		}
 	}
 #endif /* INET */
 #ifdef TCPDEBUG
 	if (tp == NULL || (inp->inp_socket->so_options & SO_DEBUG))
 		tcp_trace(TA_OUTPUT, 0, tp, mtod(m, void *), th, 0);
 #endif
 	TCP_PROBE3(debug__output, tp, th, m);
 	if (flags & TH_RST)
 		TCP_PROBE5(accept__refused, NULL, NULL, m, tp, nth);
 	lgb = NULL;
 	if ((tp != NULL) && (tp->t_logstate != TCP_LOG_STATE_OFF)) {
 		if (INP_WLOCKED(inp)) {
 			union tcp_log_stackspecific log;
 			struct timeval tv;
 
 			memset(&log.u_bbr, 0, sizeof(log.u_bbr));
 			log.u_bbr.inhpts = tp->t_inpcb->inp_in_hpts;
 			log.u_bbr.flex8 = 4;
 			log.u_bbr.pkts_out = tp->t_maxseg;
 			log.u_bbr.timeStamp = tcp_get_usecs(&tv);
 			log.u_bbr.delivered = 0;
 			lgb = tcp_log_event_(tp, nth, NULL, NULL, TCP_LOG_OUT,
 			    ERRNO_UNK, 0, &log, false, NULL, NULL, 0, &tv);
 		} else {
 			/*
 			 * We can not log the packet, since we only own the
 			 * read lock, but a write lock is needed. The read lock
 			 * is not upgraded to a write lock, since only getting
 			 * the read lock was done intentionally to improve the
 			 * handling of SYN flooding attacks.
 			 * This happens only for pure SYN segments received in
 			 * the initial CLOSED state, or received in a more
 			 * advanced state than listen and the UDP encapsulation
 			 * port is unexpected.
 			 * The incoming SYN segments do not really belong to
 			 * the TCP connection and the handling does not change
 			 * the state of the TCP connection. Therefore, the
 			 * sending of the RST segments is not logged. Please
 			 * note that also the incoming SYN segments are not
 			 * logged.
 			 *
 			 * The following code ensures that the above description
 			 * is and stays correct.
 			 */
 			KASSERT((thflags & (TH_ACK|TH_SYN)) == TH_SYN &&
 			    (tp->t_state == TCPS_CLOSED ||
 			    (tp->t_state > TCPS_LISTEN && tp->t_port != port)),
 			    ("%s: Logging of TCP segment with flags 0x%b and "
 			    "UDP encapsulation port %u skipped in state %s",
 			    __func__, thflags, PRINT_TH_FLAGS,
 			    ntohs(port), tcpstates[tp->t_state]));
 		}
 	}
 
 	if (flags & TH_ACK)
 		TCPSTAT_INC(tcps_sndacks);
 	else if (flags & (TH_SYN|TH_FIN|TH_RST))
 		TCPSTAT_INC(tcps_sndctrl);
 	TCPSTAT_INC(tcps_sndtotal);
 
 #ifdef INET6
 	if (isipv6) {
 		TCP_PROBE5(send, NULL, tp, ip6, tp, nth);
 		output_ret = ip6_output(m, NULL, NULL, 0, NULL, NULL, inp);
 	}
 #endif /* INET6 */
 #if defined(INET) && defined(INET6)
 	else
 #endif
 #ifdef INET
 	{
 		TCP_PROBE5(send, NULL, tp, ip, tp, nth);
 		output_ret = ip_output(m, NULL, NULL, 0, NULL, inp);
 	}
 #endif
 	if (lgb != NULL)
 		lgb->tlb_errno = output_ret;
 }
 
 /*
  * Create a new TCP control block, making an
  * empty reassembly queue and hooking it to the argument
  * protocol control block.  The `inp' parameter must have
  * come from the zone allocator set up in tcp_init().
  *
  * Returns the new tcpcb, or NULL if the zone allocation, the CC
  * algorithm's cb_init, khelp_init_osd() (TCP_HHOOK only) or the stack's
  * tfb_tcp_fb_init hook fails.  Every failure path taken after in_pcbref()
  * drops that reference again with in_pcbrele_wlocked().
  */
 struct tcpcb *
 tcp_newtcpcb(struct inpcb *inp)
 {
 	struct tcpcb_mem *tm;
 	struct tcpcb *tp;
 #ifdef INET6
 	int isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
 #endif /* INET6 */
 
 	/* One zeroed allocation backs the tcpcb and its ccv/osd/timer state. */
 	tm = uma_zalloc(V_tcpcb_zone, M_NOWAIT | M_ZERO);
 	if (tm == NULL)
 		return (NULL);
 	tp = &tm->tcb;
 
 	/* Initialise cc_var struct for this tcpcb. */
 	tp->ccv = &tm->ccv;
 	tp->ccv->type = IPPROTO_TCP;
 	tp->ccv->ccvc.tcp = tp;
 	/* Take a reference on the current default function block. */
 	rw_rlock(&tcp_function_lock);
 	tp->t_fb = tcp_func_set_ptr;
 	refcount_acquire(&tp->t_fb->tfb_refcnt);
 	rw_runlock(&tcp_function_lock);
 	/*
 	 * Use the current system default CC algorithm.
 	 */
 	cc_attach(tp, CC_DEFAULT_ALGO());
 
 	/*
 	 * The tcpcb will hold a reference on its inpcb until tcp_discardcb()
 	 * is called.
 	 */
 	in_pcbref(inp);	/* Reference for tcpcb */
 	tp->t_inpcb = inp;
 
 	/* A positive return from cb_init is treated as failure. */
 	if (CC_ALGO(tp)->cb_init != NULL)
 		if (CC_ALGO(tp)->cb_init(tp->ccv, NULL) > 0) {
 			cc_detach(tp);
 			if (tp->t_fb->tfb_tcp_fb_fini)
 				(*tp->t_fb->tfb_tcp_fb_fini)(tp, 1);
 			in_pcbrele_wlocked(inp);
 			refcount_release(&tp->t_fb->tfb_refcnt);
 			uma_zfree(V_tcpcb_zone, tm);
 			return (NULL);
 		}
 
 #ifdef TCP_HHOOK
 	tp->osd = &tm->osd;
 	/*
 	 * NOTE(review): unlike the cb_init failure path above, this path
 	 * does not call cc_detach() -- confirm whether that is intended.
 	 */
 	if (khelp_init_osd(HELPER_CLASS_TCP, tp->osd)) {
 		if (tp->t_fb->tfb_tcp_fb_fini)
 			(*tp->t_fb->tfb_tcp_fb_fini)(tp, 1);
 		in_pcbrele_wlocked(inp);
 		refcount_release(&tp->t_fb->tfb_refcnt);
 		uma_zfree(V_tcpcb_zone, tm);
 		return (NULL);
 	}
 #endif
 
 #ifdef VIMAGE
 	tp->t_vnet = inp->inp_vnet;
 #endif
 	tp->t_timers = &tm->tt;
 	TAILQ_INIT(&tp->t_segq);
 	/* Start from the address-family specific default MSS. */
 	tp->t_maxseg =
 #ifdef INET6
 		isipv6 ? V_tcp_v6mssdflt :
 #endif /* INET6 */
 		V_tcp_mssdflt;
 
 	/* Set up our timeouts. */
 	callout_init(&tp->t_timers->tt_rexmt, 1);
 	callout_init(&tp->t_timers->tt_persist, 1);
 	callout_init(&tp->t_timers->tt_keep, 1);
 	callout_init(&tp->t_timers->tt_2msl, 1);
 	callout_init(&tp->t_timers->tt_delack, 1);
 
 	/*
 	 * V_tcp_do_rfc1323 selects which options to request: 0 none,
 	 * 2 window scaling only, 3 timestamps only, 1 or any other value both.
 	 */
 	switch (V_tcp_do_rfc1323) {
 		case 0:
 			break;
 		default:
 		case 1:
 			tp->t_flags = (TF_REQ_SCALE|TF_REQ_TSTMP);
 			break;
 		case 2:
 			tp->t_flags = TF_REQ_SCALE;
 			break;
 		case 3:
 			tp->t_flags = TF_REQ_TSTMP;
 			break;
 	}
 	if (V_tcp_do_sack)
 		tp->t_flags |= TF_SACK_PERMIT;
 	TAILQ_INIT(&tp->snd_holes);
 
 	/*
 	 * Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no
 	 * rtt estimate.  Set rttvar so that srtt + 4 * rttvar gives
 	 * reasonable initial retransmit time.
 	 */
 	tp->t_srtt = TCPTV_SRTTBASE;
 	tp->t_rttvar = ((tcp_rexmit_initial - TCPTV_SRTTBASE) << TCP_RTTVAR_SHIFT) / 4;
 	tp->t_rttmin = tcp_rexmit_min;
 	tp->t_rxtcur = tcp_rexmit_initial;
 	tp->snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
 	tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT;
 	tp->t_rcvtime = ticks;
 	/*
 	 * IPv4 TTL initialization is necessary for an IPv6 socket as well,
 	 * because the socket may be bound to an IPv6 wildcard address,
 	 * which may match an IPv4-mapped IPv6 address.
 	 */
 	inp->inp_ip_ttl = V_ip_defttl;
 	inp->inp_ppcb = tp;
 #ifdef TCPPCAP
 	/*
 	 * Init the TCP PCAP queues.
 	 */
 	tcp_pcap_tcpcb_init(tp);
 #endif
 #ifdef TCP_BLACKBOX
 	/* Initialize the per-TCPCB log data. */
 	tcp_log_tcpcbinit(tp);
 #endif
 	tp->t_pacing_rate = -1;
 	if (tp->t_fb->tfb_tcp_fb_init) {
 		/*
 		 * NOTE(review): on this failure path inp->inp_ppcb (set
 		 * above) still points at the tcpcb that is freed here, and
 		 * neither the CC attachment nor the khelp OSD set up earlier
 		 * is undone -- confirm callers never follow inp_ppcb after a
 		 * NULL return and that nothing is leaked.
 		 */
 		if ((*tp->t_fb->tfb_tcp_fb_init)(tp)) {
 			refcount_release(&tp->t_fb->tfb_refcnt);
 			in_pcbrele_wlocked(inp);
 			uma_zfree(V_tcpcb_zone, tm);
 			return (NULL);
 		}
 	}
 #ifdef STATS
 	if (V_tcp_perconn_stats_enable == 1)
 		tp->t_stats = stats_blob_alloc(V_tcp_perconn_stats_dflt_tpl, 0);
 #endif
 	if (V_tcp_do_lrd)
 		tp->t_flags |= TF_LRD;
 	return (tp);		/* XXX */
 }
 
 /*
  * Drop a TCP connection, reporting
  * the specified error.  If connection is synchronized,
  * then send a RST to peer.
  */
 struct tcpcb *
 tcp_drop(struct tcpcb *tp, int errno)
 {
 	struct socket *so = tp->t_inpcb->inp_socket;
 
 	NET_EPOCH_ASSERT();
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 
 	if (TCPS_HAVERCVDSYN(tp->t_state)) {
 		tcp_state_change(tp, TCPS_CLOSED);
 		/* Don't use tcp_output() here due to possible recursion. */
 		(void)tcp_output_nodrop(tp);
 		TCPSTAT_INC(tcps_drops);
 	} else
 		TCPSTAT_INC(tcps_conndrops);
 	if (errno == ETIMEDOUT && tp->t_softerror)
 		errno = tp->t_softerror;
 	so->so_error = errno;
 	return (tcp_close(tp));
 }
 
 /*
  * Tear down the per-connection state hanging off a tcpcb: stop its
  * timers, flush the reassembly queue and SACK holes, detach any offload
  * device, release congestion-control, khelp and stats state, and unhook
  * the tcpcb from its inpcb.  The tcpcb itself is freed here through
  * tcp_freecb() only when tt_draincnt is still zero after the timers were
  * stopped.  The inpcb must be write-locked.
  */
 void
 tcp_discardcb(struct tcpcb *tp)
 {
 	struct inpcb *inp = tp->t_inpcb;
 
 	INP_WLOCK_ASSERT(inp);
 
 	/*
 	 * Make sure that all of our timers are stopped before we delete the
 	 * PCB.
 	 *
 	 * If stopping a timer fails, we schedule a discard function in same
 	 * callout, and the last discard function called will take care of
 	 * deleting the tcpcb.
 	 */
 	tp->t_timers->tt_draincnt = 0;
 	tcp_timer_stop(tp, TT_REXMT);
 	tcp_timer_stop(tp, TT_PERSIST);
 	tcp_timer_stop(tp, TT_KEEP);
 	tcp_timer_stop(tp, TT_2MSL);
 	tcp_timer_stop(tp, TT_DELACK);
 	if (tp->t_fb->tfb_tcp_timer_stop_all) {
 		/*
 		 * Call the stop-all function of the methods,
 		 * this function should call the tcp_timer_stop()
 		 * method with each of the function specific timeouts.
 		 * That stop will be called via the tfb_tcp_timer_stop()
 		 * which should use the async drain function of the
 		 * callout system (see tcp_var.h).
 		 */
 		tp->t_fb->tfb_tcp_timer_stop_all(tp);
 	}
 
 	/* free the reassembly queue, if any */
 	tcp_reass_flush(tp);
 
 #ifdef TCP_OFFLOAD
 	/* Disconnect offload device, if any. */
 	if (tp->t_flags & TF_TOE)
 		tcp_offload_detach(tp);
 #endif
 
 	tcp_free_sackholes(tp);
 
 #ifdef TCPPCAP
 	/* Free the TCP PCAP queues. */
 	tcp_pcap_drain(&(tp->t_inpkts));
 	tcp_pcap_drain(&(tp->t_outpkts));
 #endif
 
 	/* Allow the CC algorithm to clean up after itself. */
 	if (CC_ALGO(tp)->cb_destroy != NULL)
 		CC_ALGO(tp)->cb_destroy(tp->ccv);
 	CC_DATA(tp) = NULL;
 	/* Detach from the CC algorithm */
 	cc_detach(tp);
 
 #ifdef TCP_HHOOK
 	khelp_destroy_osd(tp->osd);
 #endif
 #ifdef STATS
 	stats_blob_destroy(tp->t_stats);
 #endif
 
 	CC_ALGO(tp) = NULL;
 	inp->inp_ppcb = NULL;
 	/*
 	 * No timer drain is outstanding, so free the tcpcb now.  The
 	 * KASSERT below expects tcp_freecb() to report that the inpcb was
 	 * not released by this call.
 	 */
 	if (tp->t_timers->tt_draincnt == 0) {
 		bool released __diagused;
 
 		released = tcp_freecb(tp);
 		KASSERT(!released, ("%s: inp %p should not have been released "
 		    "here", __func__, inp));
 	}
 }
 
 /*
  * Free a tcpcb whose timers have all drained (tt_draincnt == 0).
  *
  * Runs the stack's fini hook, saves RTT/ssthresh/cwnd metrics through
  * tcp_hc_update() when at least four RTT samples were taken and the
  * socket is still attached, releases the function-block reference,
  * returns the tcpcb to its zone and drops the tcpcb's inpcb reference.
  * The return value is that of in_pcbrele_wlocked().  The inpcb must be
  * write-locked.
  */
 bool
 tcp_freecb(struct tcpcb *tp)
 {
 	struct inpcb *inp = tp->t_inpcb;
 	struct socket *so = inp->inp_socket;
 #ifdef INET6
 	bool isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
 #endif
 
 	INP_WLOCK_ASSERT(inp);
 	MPASS(tp->t_timers->tt_draincnt == 0);
 
 	/* We own the last reference on tcpcb, let's free it. */
 #ifdef TCP_BLACKBOX
 	tcp_log_tcpcbfini(tp);
 #endif
 	TCPSTATES_DEC(tp->t_state);
 	if (tp->t_fb->tfb_tcp_fb_fini)
 		(*tp->t_fb->tfb_tcp_fb_fini)(tp, 1);
 
 	/*
 	 * If we got enough samples through the srtt filter,
 	 * save the rtt and rttvar in the routing entry.
 	 * 'Enough' is arbitrarily defined as 4 rtt samples.
 	 * 4 samples is enough for the srtt filter to converge
 	 * to within enough % of the correct value; fewer samples
 	 * and we could save a bogus rtt. The danger is not high
 	 * as tcp quickly recovers from everything.
 	 * XXX: Works very well but needs some more statistics!
 	 *
 	 * XXXRRS: Updating must be after the stack fini() since
 	 * that may be converting some internal representation of
 	 * say srtt etc into the general one used by other stacks.
 	 * Lets also at least protect against the so being NULL
 	 * as RW stated below.
 	 */
 	if ((tp->t_rttupdated >= 4) && (so != NULL)) {
 		struct hc_metrics_lite metrics;
 		uint32_t ssthresh;
 
 		bzero(&metrics, sizeof(metrics));
 		/*
 		 * Update the ssthresh always when the conditions below
 		 * are satisfied. This gives us better new start value
 		 * for the congestion avoidance for new connections.
 		 * ssthresh is only set if packet loss occurred on a session.
 		 *
 		 * XXXRW: 'so' may be NULL here, and/or socket buffer may be
 		 * being torn down.  Ideally this code would not use 'so'.
 		 */
 		ssthresh = tp->snd_ssthresh;
 		if (ssthresh != 0 && ssthresh < so->so_snd.sb_hiwat / 2) {
 			/*
 			 * convert the limit from user data bytes to
 			 * packets then to packet data bytes.
 			 */
 			/* Round to the nearest whole segment, at least two. */
 			ssthresh = (ssthresh + tp->t_maxseg / 2) / tp->t_maxseg;
 			if (ssthresh < 2)
 				ssthresh = 2;
 			/* Scale by segment size plus IP and TCP header size. */
 			ssthresh *= (tp->t_maxseg +
 #ifdef INET6
 			    (isipv6 ? sizeof (struct ip6_hdr) +
 			    sizeof (struct tcphdr) :
 #endif
 			    sizeof (struct tcpiphdr)
 #ifdef INET6
 			    )
 #endif
 			    );
 		} else
 			ssthresh = 0;
 		metrics.rmx_ssthresh = ssthresh;
 
 		metrics.rmx_rtt = tp->t_srtt;
 		metrics.rmx_rttvar = tp->t_rttvar;
 		metrics.rmx_cwnd = tp->snd_cwnd;
 		metrics.rmx_sendpipe = 0;
 		metrics.rmx_recvpipe = 0;
 
 		tcp_hc_update(&inp->inp_inc, &metrics);
 	}
 
 	refcount_release(&tp->t_fb->tfb_refcnt);
 	/*
 	 * NOTE(review): tcp_newtcpcb() allocates a struct tcpcb_mem and uses
 	 * &tm->tcb as the tcpcb; freeing tp here presumably relies on tcb
 	 * being the first member of that structure -- confirm.
 	 */
 	uma_zfree(V_tcpcb_zone, tp);
 
 	return (in_pcbrele_wlocked(inp));
 }
 
 /*
  * Attempt to close a TCP control block, marking it as dropped, and freeing
  * the socket if we hold the only reference.
  *
  * The inpcb must be write-locked on entry.  Returns NULL when the inpcb
  * carried INP_SOCKREF: in that case the flag is cleared, the inpcb is
  * unlocked and the socket reference is released with sorele().  Otherwise
  * returns tp with the inpcb still locked.
  */
 struct tcpcb *
 tcp_close(struct tcpcb *tp)
 {
 	struct inpcb *inp = tp->t_inpcb;
 	struct socket *so;
 
 	INP_WLOCK_ASSERT(inp);
 
 #ifdef TCP_OFFLOAD
 	if (tp->t_state == TCPS_LISTEN)
 		tcp_offload_listen_stop(tp);
 #endif
 	/*
 	 * This releases the TFO pending counter resource for TFO listen
 	 * sockets as well as passively-created TFO sockets that transition
 	 * from SYN_RECEIVED to CLOSED.
 	 */
 	if (tp->t_tfo_pending) {
 		tcp_fastopen_decrement_counter(tp->t_tfo_pending);
 		tp->t_tfo_pending = NULL;
 	}
 #ifdef TCPHPTS
 	tcp_hpts_remove(inp);
 #endif
 	in_pcbdrop(inp);
 	TCPSTAT_INC(tcps_closed);
 	if (tp->t_state != TCPS_CLOSED)
 		tcp_state_change(tp, TCPS_CLOSED);
 	KASSERT(inp->inp_socket != NULL, ("tcp_close: inp_socket NULL"));
 	so = inp->inp_socket;
 	soisdisconnected(so);
 	/* The inpcb held a reference on the socket; give it up now. */
 	if (inp->inp_flags & INP_SOCKREF) {
 		inp->inp_flags &= ~INP_SOCKREF;
 		INP_WUNLOCK(inp);
 		sorele(so);
 		return (NULL);
 	}
 	return (tp);
 }
 
 /*
  * Notify a tcp user of an asynchronous error by recording it as the
  * connection's soft error.
  *
  * Do not wake up user since there currently is no mechanism for
  * reporting soft errors (yet - a kqueue filter may be added).
  *
  * Returns inp when the inpcb is still locked and usable by the caller,
  * or NULL when the connection was dropped and tcp_drop() returned NULL.
  */
 static struct inpcb *
 tcp_notify(struct inpcb *inp, int error)
 {
 	struct tcpcb *tp;
 
 	INP_WLOCK_ASSERT(inp);
 
-	if ((inp->inp_flags & INP_TIMEWAIT) ||
-	    (inp->inp_flags & INP_DROPPED))
+	if (inp->inp_flags & INP_DROPPED)
 		return (inp);
 
 	tp = intotcpcb(inp);
 	KASSERT(tp != NULL, ("tcp_notify: tp == NULL"));
 
 	/*
 	 * Ignore some errors if we are hooked up.
 	 * If connection hasn't completed, has retransmitted several times,
 	 * and receives a second error, give up now.  This is better
 	 * than waiting a long time to establish a connection that
 	 * can never complete.
 	 */
 	if (tp->t_state == TCPS_ESTABLISHED &&
 	    (error == EHOSTUNREACH || error == ENETUNREACH ||
 	     error == EHOSTDOWN)) {
 		/* Only discard the cached next hop; the error is not saved. */
 		if (inp->inp_route.ro_nh) {
 			NH_FREE(inp->inp_route.ro_nh);
 			inp->inp_route.ro_nh = (struct nhop_object *)NULL;
 		}
 		return (inp);
 	} else if (tp->t_state < TCPS_ESTABLISHED && tp->t_rxtshift > 3 &&
 	    tp->t_softerror) {
 		tp = tcp_drop(tp, error);
 		if (tp != NULL)
 			return (inp);
 		else
 			return (NULL);
 	} else {
 		tp->t_softerror = error;
 		return (inp);
 	}
 	/* Every branch above returns; the disabled wakeups are kept for reference. */
 #if 0
 	wakeup( &so->so_timeo);
 	sorwakeup(so);
 	sowwakeup(so);
 #endif
 }
 
 /*
  * Read-only sysctl handler that exports the TCP connection list.
  *
  * The output is a leading struct xinpgen, the syncache entries written
  * by syncache_pcblist(), one struct xtcpcb per visible inpcb, and a
  * trailing struct xinpgen refreshed with the current generation counts.
  * Returns EPERM on a write attempt, otherwise 0 or the first error from
  * the sysctl copy-out helpers.
  */
 static int
 tcp_pcblist(SYSCTL_HANDLER_ARGS)
 {
 	struct inpcb_iterator inpi = INP_ALL_ITERATOR(&V_tcbinfo,
 	    INPLOOKUP_RLOCKPCB);
 	struct xinpgen xig;
 	struct inpcb *inp;
 	int error;
 
 	if (req->newptr != NULL)
 		return (EPERM);
 
 	/*
 	 * Size probe: report an estimate padded by n/8 (at least 10)
 	 * extra entries plus room for the two xinpgen records.
 	 */
 	if (req->oldptr == NULL) {
 		int n;
 
 		n = V_tcbinfo.ipi_count +
 		    counter_u64_fetch(V_tcps_states[TCPS_SYN_RECEIVED]);
 		n += imax(n / 8, 10);
 		req->oldidx = 2 * (sizeof xig) + n * sizeof(struct xtcpcb);
 		return (0);
 	}
 
 	if ((error = sysctl_wire_old_buffer(req, 0)) != 0)
 		return (error);
 
 	/* Leading header with the counts and generations at start. */
 	bzero(&xig, sizeof(xig));
 	xig.xig_len = sizeof xig;
 	xig.xig_count = V_tcbinfo.ipi_count +
 	    counter_u64_fetch(V_tcps_states[TCPS_SYN_RECEIVED]);
 	xig.xig_gen = V_tcbinfo.ipi_gencnt;
 	xig.xig_sogen = so_gencnt;
 	error = SYSCTL_OUT(req, &xig, sizeof xig);
 	if (error)
 		return (error);
 
 	error = syncache_pcblist(req);
 	if (error)
 		return (error);
 
 	/*
 	 * Skip inpcbs newer than the generation recorded above and those
 	 * the requesting credential may not see.
 	 */
 	while ((inp = inp_next(&inpi)) != NULL) {
 		if (inp->inp_gencnt <= xig.xig_gen &&
 		    cr_canseeinpcb(req->td->td_ucred, inp) == 0) {
 			struct xtcpcb xt;
 
 			tcp_inptoxtp(inp, &xt);
 			error = SYSCTL_OUT(req, &xt, sizeof xt);
 			if (error) {
 				/*
 				 * Leaving the loop early: drop the read
 				 * lock on the current inpcb ourselves
 				 * (presumably inp_next() does so when it
 				 * advances -- confirm).
 				 */
 				INP_RUNLOCK(inp);
 				break;
 			} else
 				continue;
 		}
 	}
 
 	if (!error) {
 		/*
 		 * Give the user an updated idea of our state.
 		 * If the generation differs from what we told
 		 * her before, she knows that something happened
 		 * while we were processing this request, and it
 		 * might be necessary to retry.
 		 */
 		xig.xig_gen = V_tcbinfo.ipi_gencnt;
 		xig.xig_sogen = so_gencnt;
 		xig.xig_count = V_tcbinfo.ipi_count +
 		    counter_u64_fetch(V_tcps_states[TCPS_SYN_RECEIVED]);
 		error = SYSCTL_OUT(req, &xig, sizeof xig);
 	}
 
 	return (error);
 }
 
 /* Registers the handler above as the TCPCTL_PCBLIST node "pcblist". */
 SYSCTL_PROC(_net_inet_tcp, TCPCTL_PCBLIST, pcblist,
     CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_NEEDGIANT,
     NULL, 0, tcp_pcblist, "S,xtcpcb",
     "List of active TCP connections");
 
 #ifdef INET
 static int
 tcp_getcred(SYSCTL_HANDLER_ARGS)
 {
 	struct xucred xuc;
 	struct sockaddr_in addrs[2];
 	struct epoch_tracker et;
 	struct inpcb *inp;
 	int error;
 
 	error = priv_check(req->td, PRIV_NETINET_GETCRED);
 	if (error)
 		return (error);
 	error = SYSCTL_IN(req, addrs, sizeof(addrs));
 	if (error)
 		return (error);
 	NET_EPOCH_ENTER(et);
 	inp = in_pcblookup(&V_tcbinfo, addrs[1].sin_addr, addrs[1].sin_port,
 	    addrs[0].sin_addr, addrs[0].sin_port, INPLOOKUP_RLOCKPCB, NULL);
 	NET_EPOCH_EXIT(et);
 	if (inp != NULL) {
 		if (inp->inp_socket == NULL)
 			error = ENOENT;
 		if (error == 0)
 			error = cr_canseeinpcb(req->td->td_ucred, inp);
 		if (error == 0)
 			cru2x(inp->inp_cred, &xuc);
 		INP_RUNLOCK(inp);
 	} else
 		error = ENOENT;
 	if (error == 0)
 		error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred));
 	return (error);
 }
 
 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, getcred,
     CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_NEEDGIANT,
     0, 0, tcp_getcred, "S,xucred",
     "Get the xucred of a TCP connection");
 #endif /* INET */
 
 #ifdef INET6
 /*
  * Sysctl handler returning the credentials (struct xucred) of the TCP
  * connection described by the two sockaddr_in6 structures written by the
  * caller.  addrs[1] supplies the first address/port pair given to the
  * lookup and addrs[0] the second.
  *
  * Requires PRIV_NETINET_GETCRED.  When both addresses are IPv4-mapped
  * and INET is compiled in, the lookup is done in the IPv4 tables;
  * a mapped addrs[0] without a mapped addrs[1] (or without INET) yields
  * EINVAL.  Returns ENOENT when no matching inpcb exists or it has no
  * socket, the cr_canseeinpcb() error when the caller may not see the
  * connection, or the result of the copy-out.
  */
 static int
 tcp6_getcred(SYSCTL_HANDLER_ARGS)
 {
 	struct epoch_tracker et;
 	struct xucred xuc;
 	struct sockaddr_in6 addrs[2];
 	struct inpcb *inp;
 	int error;
 #ifdef INET
 	int mapped = 0;
 #endif
 
 	error = priv_check(req->td, PRIV_NETINET_GETCRED);
 	if (error)
 		return (error);
 	error = SYSCTL_IN(req, addrs, sizeof(addrs));
 	if (error)
 		return (error);
 	/* Fold the scope of both addresses in before the lookup. */
 	if ((error = sa6_embedscope(&addrs[0], V_ip6_use_defzone)) != 0 ||
 	    (error = sa6_embedscope(&addrs[1], V_ip6_use_defzone)) != 0) {
 		return (error);
 	}
 	if (IN6_IS_ADDR_V4MAPPED(&addrs[0].sin6_addr)) {
 #ifdef INET
 		if (IN6_IS_ADDR_V4MAPPED(&addrs[1].sin6_addr))
 			mapped = 1;
 		else
 #endif
 			return (EINVAL);
 	}
 
 	NET_EPOCH_ENTER(et);
 #ifdef INET
 	/* The IPv4 address is the last four bytes of a mapped address. */
 	if (mapped == 1)
 		inp = in_pcblookup(&V_tcbinfo,
 			*(struct in_addr *)&addrs[1].sin6_addr.s6_addr[12],
 			addrs[1].sin6_port,
 			*(struct in_addr *)&addrs[0].sin6_addr.s6_addr[12],
 			addrs[0].sin6_port, INPLOOKUP_RLOCKPCB, NULL);
 	else
 #endif
 		inp = in6_pcblookup(&V_tcbinfo,
 			&addrs[1].sin6_addr, addrs[1].sin6_port,
 			&addrs[0].sin6_addr, addrs[0].sin6_port,
 			INPLOOKUP_RLOCKPCB, NULL);
 	NET_EPOCH_EXIT(et);
 	if (inp != NULL) {
 		if (inp->inp_socket == NULL)
 			error = ENOENT;
 		if (error == 0)
 			error = cr_canseeinpcb(req->td->td_ucred, inp);
 		if (error == 0)
 			cru2x(inp->inp_cred, &xuc);
 		INP_RUNLOCK(inp);
 	} else
 		error = ENOENT;
 	/* xuc is only filled in, and only copied out, on success. */
 	if (error == 0)
 		error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred));
 	return (error);
 }
 
 SYSCTL_PROC(_net_inet6_tcp6, OID_AUTO, getcred,
     CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_NEEDGIANT,
     0, 0, tcp6_getcred, "S,xucred",
     "Get the xucred of a TCP6 connection");
 #endif /* INET6 */
 
 #ifdef INET
 /* Path MTU to try next when a fragmentation-needed message is received. */
 static inline int
 tcp_next_pmtu(const struct icmp *icp, const struct ip *ip)
 {
 	int mtu = ntohs(icp->icmp_nextmtu);
 
 	/* If no alternative MTU was proposed, try the next smaller one. */
 	if (!mtu)
 		mtu = ip_next_mtu(ntohs(ip->ip_len), 1);
 	if (mtu < V_tcp_minmss + sizeof(struct tcpiphdr))
 		mtu = V_tcp_minmss + sizeof(struct tcpiphdr);
 
 	return (mtu);
 }
 
 /*
  * Process an ICMP error that quotes one of our IPv4 TCP segments.
  *
  * icp is the ICMP message; port is compared against the connection's
  * t_port and passed to syncache_unreach() (tcp_ctlinput() passes zero,
  * the UDP-tunnel path passes the quoted UDP destination port).
  *
  * The ICMP type/code is mapped to an errno with icmp_errmap() and a
  * notify routine is chosen from it.  The quoted header is used to look
  * up the inpcb (write-locked).  The error is acted on only when the
  * quoted sequence number lies in [snd_una, snd_max).  When no inpcb
  * matches, the event is handed to syncache_unreach().
  */
 static void
 tcp_ctlinput_with_port(struct icmp *icp, uint16_t port)
 {
 	struct ip *ip;
 	struct tcphdr *th;
 	struct inpcb *inp;
 	struct tcpcb *tp;
 	struct inpcb *(*notify)(struct inpcb *, int);
 	struct in_conninfo inc;
 	tcp_seq icmp_tcp_seq;
 	int errno, mtu;
 
 	errno = icmp_errmap(icp);
 	switch (errno) {
 	case 0:
 		return;
 	case EMSGSIZE:
 		notify = tcp_mtudisc_notify;
 		break;
 	case ECONNREFUSED:
 		if (V_icmp_may_rst)
 			notify = tcp_drop_syn_sent;
 		else
 			notify = tcp_notify;
 		break;
 	case EHOSTUNREACH:
 		if (V_icmp_may_rst && icp->icmp_type == ICMP_TIMXCEED)
 			notify = tcp_drop_syn_sent;
 		else
 			notify = tcp_notify;
 		break;
 	default:
 		notify = tcp_notify;
 	}
 
 	/* The quoted TCP header follows the quoted IP header and options. */
 	ip = &icp->icmp_ip;
 	th = (struct tcphdr *)((caddr_t)ip + (ip->ip_hl << 2));
 	icmp_tcp_seq = th->th_seq;
 	/* The quoted packet was sent by us, so its destination is foreign. */
 	inp = in_pcblookup(&V_tcbinfo, ip->ip_dst, th->th_dport, ip->ip_src,
 	    th->th_sport, INPLOOKUP_WLOCKPCB, NULL);
 	if (inp != NULL)  {
-		if (!(inp->inp_flags & INP_TIMEWAIT) &&
-		    !(inp->inp_flags & INP_DROPPED) &&
+		if (!(inp->inp_flags & INP_DROPPED) &&
 		    !(inp->inp_socket == NULL)) {
 			tp = intotcpcb(inp);
 #ifdef TCP_OFFLOAD
 			if (tp->t_flags & TF_TOE && errno == EMSGSIZE) {
 				/*
 				 * MTU discovery for offloaded connections.  Let
 				 * the TOE driver verify seq# and process it.
 				 */
 				mtu = tcp_next_pmtu(icp, ip);
 				tcp_offload_pmtu_update(tp, icmp_tcp_seq, mtu);
 				goto out;
 			}
 #endif
 			/* Ignore errors whose port does not match t_port. */
 			if (tp->t_port != port) {
 				goto out;
 			}
 			if (SEQ_GEQ(ntohl(icmp_tcp_seq), tp->snd_una) &&
 			    SEQ_LT(ntohl(icmp_tcp_seq), tp->snd_max)) {
 				if (errno == EMSGSIZE) {
 					/*
 					 * MTU discovery: we got a needfrag and
 					 * will potentially try a lower MTU.
 					 */
 					mtu = tcp_next_pmtu(icp, ip);
 
 					/*
 					 * Only process the offered MTU if it
 					 * is smaller than the current one.
 					 */
 					if (mtu < tp->t_maxseg +
 					    sizeof(struct tcpiphdr)) {
 						bzero(&inc, sizeof(inc));
 						inc.inc_faddr = ip->ip_dst;
 						inc.inc_fibnum =
 						    inp->inp_inc.inc_fibnum;
 						tcp_hc_updatemtu(&inc, mtu);
 						inp = tcp_mtudisc(inp, mtu);
 					}
 				} else
 					inp = (*notify)(inp, errno);
 			}
 		}
 	} else {
 		bzero(&inc, sizeof(inc));
 		inc.inc_fport = th->th_dport;
 		inc.inc_lport = th->th_sport;
 		inc.inc_faddr = ip->ip_dst;
 		inc.inc_laddr = ip->ip_src;
 		syncache_unreach(&inc, icmp_tcp_seq, port);
 	}
 out:
 	/* inp is NULL here when nothing matched or a callee returned NULL. */
 	if (inp != NULL)
 		INP_WUNLOCK(inp);
 }
 
 static void
 tcp_ctlinput(struct icmp *icmp)
 {
 	tcp_ctlinput_with_port(icmp, htons(0));
 }
 
 /*
  * ICMP error entry point for TCP tunneled over UDP (IPv4).
  *
  * Checks that the quoted UDP source port is our tunneling port, then
  * removes the quoted UDP header in place by copying the quoted TCP
  * header over it, shortens the outer IP length to match and passes
  * the message to tcp_ctlinput_with_port() with the quoted UDP
  * destination port.
  */
 static void
 tcp_ctlinput_viaudp(udp_tun_icmp_param_t param)
 {
 	/* Its a tunneled TCP over UDP icmp */
 	struct icmp *icmp = param.icmp;
 	struct ip *outer_ip, *inner_ip;
 	struct udphdr *udp;
 	struct tcphdr *th, ttemp;
 	int i_hlen, o_len;
 	uint16_t port;
 
 	/*
 	 * Assumes the outer IP header sits immediately before the ICMP
 	 * header and carries no options -- TODO confirm with the caller.
 	 */
 	outer_ip = (struct ip *)((caddr_t)icmp - sizeof(struct ip));
 	inner_ip = &icmp->icmp_ip;
 	i_hlen = inner_ip->ip_hl << 2;
 	o_len = ntohs(outer_ip->ip_len);
 	/*
 	 * Need the outer IP header, 8 more bytes (presumably the ICMP
 	 * header), the quoted IP and UDP headers, and the TCP header up to
 	 * but excluding th_ack.
 	 */
 	if (o_len <
 	    (sizeof(struct ip) + 8 + i_hlen + sizeof(struct udphdr) + offsetof(struct tcphdr, th_ack))) {
 		/* Not enough data present */
 		return;
 	}
 	/* Ok lets strip out the inner udphdr header by copying up on top of it the tcp hdr */
 	udp = (struct udphdr *)(((caddr_t)inner_ip) + i_hlen);
 	if (ntohs(udp->uh_sport) != V_tcp_udp_tunneling_port) {
 		return;
 	}
 	port = udp->uh_dport;
 	th = (struct tcphdr *)(udp + 1);
 	/*
 	 * Source and destination overlap, hence the bounce through ttemp.
 	 * NOTE(review): a full struct tcphdr is copied although the length
 	 * check above only guarantees the bytes before th_ack -- confirm
 	 * the buffer always extends that far.
 	 */
 	memcpy(&ttemp, th, sizeof(struct tcphdr));
 	memcpy(udp, &ttemp, sizeof(struct tcphdr));
 	/* Now adjust down the size of the outer IP header */
 	o_len -= sizeof(struct udphdr);
 	outer_ip->ip_len = htons(o_len);
 	/* Now call in to the normal handling code */
 	tcp_ctlinput_with_port(icmp, port);
 }
 #endif /* INET */
 
 #ifdef INET6
 /*
  * Pick the path MTU to try after an ICMPv6 packet-too-big message.
  *
  * Returns the MTU carried in the ICMPv6 header, unless it is below
  * IPV6_MMTU (which includes "not given"), in which case IPV6_MMTU - 8
  * is returned.
  */
 static inline int
 tcp6_next_pmtu(const struct icmp6_hdr *icmp6)
 {
 	const int offered = ntohl(icmp6->icmp6_mtu);
 
 	/* XXXNP: what is the adjustment for? */
 	return (offered < IPV6_MMTU ? IPV6_MMTU - 8 : offered);
 }
 
 /*
  * Process an ICMPv6 error that quotes one of our IPv6 TCP segments.
  *
  * ip6cp describes the ICMPv6 message and the quoted packet; port is
  * compared against the connection's t_port and passed to
  * syncache_unreach() (tcp6_ctlinput() passes zero, the UDP-tunnel path
  * passes the quoted UDP destination port).
  *
  * The ICMPv6 type/code is mapped to an errno with icmp6_errmap() and a
  * notify routine is chosen from it.  Ports and sequence number are
  * copied out of the mbuf chain at ip6c_off, the inpcb is looked up
  * write-locked, and the error is acted on only when the quoted sequence
  * number lies in [snd_una, snd_max).  When no inpcb matches, the event
  * is handed to syncache_unreach().
  */
 static void
 tcp6_ctlinput_with_port(struct ip6ctlparam *ip6cp, uint16_t port)
 {
 	struct in6_addr *dst;
 	struct inpcb *(*notify)(struct inpcb *, int);
 	struct ip6_hdr *ip6;
 	struct mbuf *m;
 	struct inpcb *inp;
 	struct tcpcb *tp;
 	struct icmp6_hdr *icmp6;
 	struct in_conninfo inc;
 	struct tcp_ports {
 		uint16_t th_sport;
 		uint16_t th_dport;
 	} t_ports;
 	tcp_seq icmp_tcp_seq;
 	unsigned int mtu;
 	unsigned int off;
 	int errno;
 
 	icmp6 = ip6cp->ip6c_icmp6;
 	m = ip6cp->ip6c_m;
 	ip6 = ip6cp->ip6c_ip6;
 	off = ip6cp->ip6c_off;
 	dst = &ip6cp->ip6c_finaldst->sin6_addr;
 
 	errno = icmp6_errmap(icmp6);
 	switch (errno) {
 	case 0:
 		return;
 	case EMSGSIZE:
 		notify = tcp_mtudisc_notify;
 		break;
 	case ECONNREFUSED:
 		if (V_icmp_may_rst)
 			notify = tcp_drop_syn_sent;
 		else
 			notify = tcp_notify;
 		break;
 	case EHOSTUNREACH:
 		/*
 		 * There are only four ICMPs that may reset connection:
 		 * - administratively prohibited
 		 * - port unreachable
 		 * - time exceeded in transit
 		 * - unknown next header
 		 */
 		if (V_icmp_may_rst &&
 		    ((icmp6->icmp6_type == ICMP6_DST_UNREACH &&
 		     (icmp6->icmp6_code == ICMP6_DST_UNREACH_ADMIN ||
 		      icmp6->icmp6_code == ICMP6_DST_UNREACH_NOPORT)) ||
 		    (icmp6->icmp6_type == ICMP6_TIME_EXCEEDED &&
 		      icmp6->icmp6_code == ICMP6_TIME_EXCEED_TRANSIT) ||
 		    (icmp6->icmp6_type == ICMP6_PARAM_PROB &&
 		      icmp6->icmp6_code == ICMP6_PARAMPROB_NEXTHEADER)))
 			notify = tcp_drop_syn_sent;
 		else
 			notify = tcp_notify;
 		break;
 	default:
 		notify = tcp_notify;
 	}
 
 	/* Check if we can safely get the ports from the tcp hdr */
 	if (m == NULL ||
 	    (m->m_pkthdr.len <
 		(int32_t) (off + sizeof(struct tcp_ports)))) {
 		return;
 	}
 	bzero(&t_ports, sizeof(struct tcp_ports));
 	m_copydata(m, off, sizeof(struct tcp_ports), (caddr_t)&t_ports);
 	/* The quoted packet was sent by us, so its destination is foreign. */
 	inp = in6_pcblookup(&V_tcbinfo, &ip6->ip6_dst, t_ports.th_dport,
 	    &ip6->ip6_src, t_ports.th_sport, INPLOOKUP_WLOCKPCB, NULL);
 	off += sizeof(struct tcp_ports);
 	/* The sequence number must be present too; unlock inp if it is not. */
 	if (m->m_pkthdr.len < (int32_t) (off + sizeof(tcp_seq))) {
 		goto out;
 	}
 	m_copydata(m, off, sizeof(tcp_seq), (caddr_t)&icmp_tcp_seq);
 	if (inp != NULL)  {
-		if (!(inp->inp_flags & INP_TIMEWAIT) &&
-		    !(inp->inp_flags & INP_DROPPED) &&
+		if (!(inp->inp_flags & INP_DROPPED) &&
 		    !(inp->inp_socket == NULL)) {
 			tp = intotcpcb(inp);
 #ifdef TCP_OFFLOAD
 			if (tp->t_flags & TF_TOE && errno == EMSGSIZE) {
 				/* MTU discovery for offloaded connections. */
 				mtu = tcp6_next_pmtu(icmp6);
 				tcp_offload_pmtu_update(tp, icmp_tcp_seq, mtu);
 				goto out;
 			}
 #endif
 			/* Ignore errors whose port does not match t_port. */
 			if (tp->t_port != port) {
 				goto out;
 			}
 			if (SEQ_GEQ(ntohl(icmp_tcp_seq), tp->snd_una) &&
 			    SEQ_LT(ntohl(icmp_tcp_seq), tp->snd_max)) {
 				if (errno == EMSGSIZE) {
 					/*
 					 * MTU discovery:
 					 * If we got a needfrag set the MTU
 					 * in the route to the suggested new
 					 * value (if given) and then notify.
 					 */
 					mtu = tcp6_next_pmtu(icmp6);
 
 					bzero(&inc, sizeof(inc));
 					inc.inc_fibnum = M_GETFIB(m);
 					inc.inc_flags |= INC_ISIPV6;
 					inc.inc6_faddr = *dst;
 					if (in6_setscope(&inc.inc6_faddr,
 						m->m_pkthdr.rcvif, NULL))
 						goto out;
 					/*
 					 * Only process the offered MTU if it
 					 * is smaller than the current one.
 					 */
 					if (mtu < tp->t_maxseg +
 					    sizeof (struct tcphdr) +
 					    sizeof (struct ip6_hdr)) {
 						tcp_hc_updatemtu(&inc, mtu);
 						/*
 						 * Keep the inpcb handed back,
 						 * as the IPv4 handler does, so
 						 * the unlock at "out" is
 						 * skipped if tcp_mtudisc()
 						 * returns NULL.
 						 */
 						inp = tcp_mtudisc(inp, mtu);
 						ICMP6STAT_INC(icp6s_pmtuchg);
 					}
 				} else
 					inp = (*notify)(inp, errno);
 			}
 		}
 	} else {
 		bzero(&inc, sizeof(inc));
 		inc.inc_fibnum = M_GETFIB(m);
 		inc.inc_flags |= INC_ISIPV6;
 		inc.inc_fport = t_ports.th_dport;
 		inc.inc_lport = t_ports.th_sport;
 		inc.inc6_faddr = *dst;
 		inc.inc6_laddr = ip6->ip6_src;
 		syncache_unreach(&inc, icmp_tcp_seq, port);
 	}
 out:
 	/* inp is NULL here when nothing matched or a callee returned NULL. */
 	if (inp != NULL)
 		INP_WUNLOCK(inp);
 }
 
 static void
 tcp6_ctlinput(struct ip6ctlparam *ctl)
 {
 	tcp6_ctlinput_with_port(ctl, htons(0));
 }
 
 /*
  * Control-input handler for TCP tunneled over UDP on IPv6.  The UDP header
  * found at ip6c_off must have the configured tunneling port as its source
  * port; it is then stripped and the common handler is called with the UDP
  * destination port.
  */
 static void
 tcp6_ctlinput_viaudp(udp_tun_icmp_param_t param)
 {
 	struct ip6ctlparam *ctl;
 	struct udphdr *uh;
 	struct mbuf *n;
 	uint16_t encaps_port;
 
 	ctl = param.ip6cp;
 	n = m_pulldown(ctl->ip6c_m, ctl->ip6c_off, sizeof(struct udphdr),
 	    NULL);
 	if (n == NULL)
 		return;
 
 	uh = mtod(n, struct udphdr *);
 	if (ntohs(uh->uh_sport) != V_tcp_udp_tunneling_port)
 		return;
 	encaps_port = uh->uh_dport;
 
 	/* Trim the UDP header off the front of the mbuf. */
 	m_adj(n, sizeof(struct udphdr));
 	if (!(n->m_flags & M_PKTHDR)) {
 		/*
 		 * n is not the packet-header mbuf, so adjust the packet
 		 * length recorded on the head of the chain by hand.
 		 */
 		ctl->ip6c_m->m_pkthdr.len -= sizeof(struct udphdr);
 	}
 
 	/* Now call in to the normal handling code */
 	tcp6_ctlinput_with_port(ctl, encaps_port);
 }
 
 #endif /* INET6 */
 
 /*
  * Hash the connection identifiers in 'inc' (both ports, then the foreign
  * and local address of the relevant family) with SipHash under 'key', and
  * fold the two 32-bit result words into one.  'len' is the size of the key
  * buffer and is asserted to be at least SIPHASH_KEY_LENGTH.
  */
 static uint32_t
 tcp_keyed_hash(struct in_conninfo *inc, u_char *key, u_int len)
 {
 	SIPHASH_CTX sip;
 	uint32_t digest[2];
 
 	KASSERT(len >= SIPHASH_KEY_LENGTH,
 	    ("%s: keylen %u too short ", __func__, len));
 
 	SipHash24_Init(&sip);
 	SipHash_SetKey(&sip, (uint8_t *)key);
 
 	/* Ports first, foreign before local. */
 	SipHash_Update(&sip, &inc->inc_fport, sizeof(uint16_t));
 	SipHash_Update(&sip, &inc->inc_lport, sizeof(uint16_t));
 
 	/* Then the address pair of whichever family this is. */
 	switch (inc->inc_flags & INC_ISIPV6) {
 #ifdef INET
 	case 0:
 		SipHash_Update(&sip, &inc->inc_faddr, sizeof(struct in_addr));
 		SipHash_Update(&sip, &inc->inc_laddr, sizeof(struct in_addr));
 		break;
 #endif
 #ifdef INET6
 	case INC_ISIPV6:
 		SipHash_Update(&sip, &inc->inc6_faddr, sizeof(struct in6_addr));
 		SipHash_Update(&sip, &inc->inc6_laddr, sizeof(struct in6_addr));
 		break;
 #endif
 	}
 
 	SipHash_Final((uint8_t *)digest, &sip);
 	return (digest[0] ^ digest[1]);
 }
 
 uint32_t
 tcp_new_ts_offset(struct in_conninfo *inc)
 {
 	struct in_conninfo inc_store, *local_inc;
 
 	if (!V_tcp_ts_offset_per_conn) {
 		memcpy(&inc_store, inc, sizeof(struct in_conninfo));
 		inc_store.inc_lport = 0;
 		inc_store.inc_fport = 0;
 		local_inc = &inc_store;
 	} else {
 		local_inc = inc;
 	}
 	return (tcp_keyed_hash(local_inc, V_ts_offset_secret,
 	    sizeof(V_ts_offset_secret)));
 }
 
 /*
  * Following is where TCP initial sequence number generation occurs.
  *
  * There are two places where we must use initial sequence numbers:
  * 1.  In SYN-ACK packets.
  * 2.  In SYN packets.
  *
  * All ISNs for SYN-ACK packets are generated by the syncache.  See
  * tcp_syncache.c for details.
  *
  * The ISNs in SYN packets must be monotonic; TIME_WAIT recycling
  * depends on this property.  In addition, these ISNs should be
  * unguessable so as to prevent connection hijacking.  To satisfy
  * the requirements of this situation, the algorithm outlined in
  * RFC 1948 is used, with only small modifications.
  *
  * Implementation details:
  *
  * Time is based off the system timer, and is corrected so that it
  * increases by one megabyte per second.  This allows for proper
  * recycling on high speed LANs while still leaving over an hour
  * before rollover.
  *
  * As reading the *exact* system time is too expensive to be done
  * whenever setting up a TCP connection, we increment the time
  * offset in two ways.  First, a small random positive increment
  * is added to isn_offset for each connection that is set up.
  * Second, the function tcp_isn_tick fires once per clock tick
  * and increments isn_offset as necessary so that sequence numbers
  * are incremented at approximately ISN_BYTES_PER_SECOND.  The
  * random positive increments serve only to ensure that the same
  * exact sequence number is never sent out twice (as could otherwise
  * happen when a port is recycled in less than the system tick
  * interval.)
  *
  * net.inet.tcp.isn_reseed_interval controls the number of seconds
  * between seeding of isn_secret.  This is normally set to zero,
  * as reseeding should not be necessary.
  *
  * Locking of the global variables isn_secret, isn_last_reseed, isn_offset,
  * isn_offset_old, and isn_ctx is performed using the ISN lock.  In
  * general, this means holding an exclusive (write) lock.
  */
 
 /* Bytes per second by which tcp_new_isn() advances the ISN offset over time. */
 #define ISN_BYTES_PER_SECOND 1048576
 /* Fixed amount added to the ISN offset for every ISN handed out. */
 #define ISN_STATIC_INCREMENT 4096
 /* Bit mask applied to arc4random() for the extra per-ISN random increment. */
 #define ISN_RANDOM_INCREMENT (4096 - 1)
 /* The ISN secret is passed to tcp_keyed_hash() as its SipHash key. */
 #define ISN_SECRET_LENGTH    SIPHASH_KEY_LENGTH
 
 /*
  * Per-VNET ISN generator state: the hash key, the ticks value of the last
  * time-based offset update, the ticks value of the last (re)seed, the
  * running offset, and the offset as of the last time-based update.
  */
 VNET_DEFINE_STATIC(u_char, isn_secret[ISN_SECRET_LENGTH]);
 VNET_DEFINE_STATIC(int, isn_last);
 VNET_DEFINE_STATIC(int, isn_last_reseed);
 VNET_DEFINE_STATIC(u_int32_t, isn_offset);
 VNET_DEFINE_STATIC(u_int32_t, isn_offset_old);
 
 #define	V_isn_secret			VNET(isn_secret)
 #define	V_isn_last			VNET(isn_last)
 #define	V_isn_last_reseed		VNET(isn_last_reseed)
 #define	V_isn_offset			VNET(isn_offset)
 #define	V_isn_offset_old		VNET(isn_offset_old)
 
 tcp_seq
 tcp_new_isn(struct in_conninfo *inc)
 {
 	tcp_seq new_isn;
 	u_int32_t projected_offset;
 
 	ISN_LOCK();
 	/* Seed if this is the first use, reseed if requested. */
 	if ((V_isn_last_reseed == 0) || ((V_tcp_isn_reseed_interval > 0) &&
 	     (((u_int)V_isn_last_reseed + (u_int)V_tcp_isn_reseed_interval*hz)
 		< (u_int)ticks))) {
 		arc4rand(&V_isn_secret, sizeof(V_isn_secret), 0);
 		V_isn_last_reseed = ticks;
 	}
 
 	/* Compute the hash and return the ISN. */
 	new_isn = (tcp_seq)tcp_keyed_hash(inc, V_isn_secret,
 	    sizeof(V_isn_secret));
 	V_isn_offset += ISN_STATIC_INCREMENT +
 		(arc4random() & ISN_RANDOM_INCREMENT);
 	if (ticks != V_isn_last) {
 		projected_offset = V_isn_offset_old +
 		    ISN_BYTES_PER_SECOND / hz * (ticks - V_isn_last);
 		if (SEQ_GT(projected_offset, V_isn_offset))
 			V_isn_offset = projected_offset;
 		V_isn_offset_old = V_isn_offset;
 		V_isn_last = ticks;
 	}
 	new_isn += V_isn_offset;
 	ISN_UNLOCK();
 	return (new_isn);
 }
 
 /*
  * When a specific ICMP unreachable message is received and the
  * connection state is SYN-SENT, drop the connection.  This behavior
  * is controlled by the icmp_may_rst sysctl.
  *
  * The caller must be in the network epoch and hold the inpcb write lock
  * (both are asserted).  Returns inp when the connection was left alone or
  * tcp_drop() returned a tcpcb, and NULL when tcp_drop() returned NULL.
  */
 static struct inpcb *
 tcp_drop_syn_sent(struct inpcb *inp, int errno)
 {
 	struct tcpcb *tp;
 
 	NET_EPOCH_ASSERT();
 	INP_WLOCK_ASSERT(inp);
 
-	if ((inp->inp_flags & INP_TIMEWAIT) ||
-	    (inp->inp_flags & INP_DROPPED))
+	if (inp->inp_flags & INP_DROPPED)
 		return (inp);
 
 	tp = intotcpcb(inp);
 	/* Only connections still in SYN-SENT are affected. */
 	if (tp->t_state != TCPS_SYN_SENT)
 		return (inp);
 
 	if (IS_FASTOPEN(tp->t_flags))
 		tcp_fastopen_disable_path(tp);
 
 	/* Map tcp_drop()'s tcpcb-or-NULL result onto inp-or-NULL. */
 	tp = tcp_drop(tp, errno);
 	if (tp != NULL)
 		return (inp);
 	else
 		return (NULL);
 }
 
 /*
  * Notification-callback wrapper around tcp_mtudisc().
  *
  * When `need fragmentation' ICMP is received, update our idea of the MSS
  * based on the new value. Also nudge TCP to send something, since we
  * know the packet we just sent was dropped.
  * This duplicates some code in the tcp_mss() function in tcp_input.c.
  *
  * The error argument is ignored and -1 is passed as the MTU offer.
  */
 static struct inpcb *
 tcp_mtudisc_notify(struct inpcb *inp, int error)
 {
 	const int mtuoffer = -1;
 
 	return (tcp_mtudisc(inp, mtuoffer));
 }
 
 /*
  * Re-evaluate the MSS of the connection behind 'inp' given 'mtuoffer'
  * (passed through to tcp_mss_update()), cap it at the send buffer's high
  * water mark, rewind snd_nxt to snd_una and call tcp_output().
  * The inpcb write lock must be held (asserted).  Returns inp, or NULL when
  * tcp_output() returned a negative value.
  */
 static struct inpcb *
 tcp_mtudisc(struct inpcb *inp, int mtuoffer)
 {
 	struct tcpcb *tp;
 	struct socket *so;
 
 	INP_WLOCK_ASSERT(inp);
-	if ((inp->inp_flags & INP_TIMEWAIT) ||
-	    (inp->inp_flags & INP_DROPPED))
+	if (inp->inp_flags & INP_DROPPED)
 		return (inp);
 
 	tp = intotcpcb(inp);
 	KASSERT(tp != NULL, ("tcp_mtudisc: tp == NULL"));
 
 	tcp_mss_update(tp, -1, mtuoffer, NULL, NULL);
 
 	so = inp->inp_socket;
 	SOCKBUF_LOCK(&so->so_snd);
 	/* If the mss is larger than the socket buffer, decrease the mss. */
 	if (so->so_snd.sb_hiwat < tp->t_maxseg)
 		tp->t_maxseg = so->so_snd.sb_hiwat;
 	SOCKBUF_UNLOCK(&so->so_snd);
 
 	TCPSTAT_INC(tcps_mturesent);
 	/* Reset send state so output restarts from the oldest unacked byte. */
 	tp->t_rtttime = 0;
 	tp->snd_nxt = tp->snd_una;
 	tcp_free_sackholes(tp);
 	tp->snd_recover = tp->snd_max;
 	if (tp->t_flags & TF_SACK_PERMIT)
 		EXIT_FASTRECOVERY(tp->t_flags);
 	if (tp->t_fb->tfb_tcp_mtu_chg != NULL) {
 		/*
 		 * Conceptually the snd_nxt setting
 		 * and freeing sack holes should
 		 * be done by the default stacks
 		 * own tfb_tcp_mtu_chg().
 		 */
 		tp->t_fb->tfb_tcp_mtu_chg(tp);
 	}
 	/* A negative tcp_output() result is reported to the caller as NULL. */
 	if (tcp_output(tp) < 0)
 		return (NULL);
 	else
 		return (inp);
 }
 
 #ifdef INET
 /*
  * Look up the routing entry to the IPv4 peer described by 'inc' and
  * return its MTU.  Returns 0 when the foreign address is INADDR_ANY or no
  * route is found.  When 'cap' is non-NULL and the outgoing interface has
  * TSO4 enabled, the interface's TSO limits are reported through 'cap'.
  * This routine is called by TCP routines that access the rmx structure
  * and by tcp_mss_update to get the peer/interface MTU.
  */
 uint32_t
 tcp_maxmtu(struct in_conninfo *inc, struct tcp_ifcap *cap)
 {
 	struct nhop_object *nhop;
 	struct ifnet *oifp;
 	uint32_t mtu;
 
 	KASSERT(inc != NULL, ("tcp_maxmtu with NULL in_conninfo pointer"));
 
 	/* Without a foreign address there is nothing to look up. */
 	if (inc->inc_faddr.s_addr == INADDR_ANY)
 		return (0);
 
 	nhop = fib4_lookup(inc->inc_fibnum, inc->inc_faddr, 0, NHR_NONE, 0);
 	if (nhop == NULL)
 		return (0);
 
 	oifp = nhop->nh_ifp;
 	mtu = nhop->nh_mtu;
 
 	/* Report additional interface capabilities. */
 	if (cap != NULL && (oifp->if_capenable & IFCAP_TSO4) &&
 	    (oifp->if_hwassist & CSUM_TSO)) {
 		cap->ifcap |= CSUM_TSO;
 		cap->tsomax = oifp->if_hw_tsomax;
 		cap->tsomaxsegcount = oifp->if_hw_tsomaxsegcount;
 		cap->tsomaxsegsize = oifp->if_hw_tsomaxsegsize;
 	}
 
 	return (mtu);
 }
 #endif /* INET */
 
 #ifdef INET6
 uint32_t
 tcp_maxmtu6(struct in_conninfo *inc, struct tcp_ifcap *cap)
 {
 	struct nhop_object *nh;
 	struct in6_addr dst6;
 	uint32_t scopeid;
 	struct ifnet *ifp;
 	uint32_t maxmtu = 0;
 
 	KASSERT(inc != NULL, ("tcp_maxmtu6 with NULL in_conninfo pointer"));
 
 	if (inc->inc_flags & INC_IPV6MINMTU)
 		return (IPV6_MMTU);
 
 	if (!IN6_IS_ADDR_UNSPECIFIED(&inc->inc6_faddr)) {
 		in6_splitscope(&inc->inc6_faddr, &dst6, &scopeid);
 		nh = fib6_lookup(inc->inc_fibnum, &dst6, scopeid, NHR_NONE, 0);
 		if (nh == NULL)
 			return (0);
 
 		ifp = nh->nh_ifp;
 		maxmtu = nh->nh_mtu;
 
 		/* Report additional interface capabilities. */
 		if (cap != NULL) {
 			if (ifp->if_capenable & IFCAP_TSO6 &&
 			    ifp->if_hwassist & CSUM_TSO) {
 				cap->ifcap |= CSUM_TSO;
 				cap->tsomax = ifp->if_hw_tsomax;
 				cap->tsomaxsegcount = ifp->if_hw_tsomaxsegcount;
 				cap->tsomaxsegsize = ifp->if_hw_tsomaxsegsize;
 			}
 		}
 	}
 
 	return (maxmtu);
 }
 
 /*
  * Handle setsockopt(IPV6_USE_MIN_MTU) by a TCP stack.
  *
  * XXXGL: we are updating inpcb here with INC_IPV6MINMTU flag.
  * The right place to do that is ip6_setpktopt() that has just been
  * executed.  By the way it just filled ip6po_minmtu for us.
  *
  * The inpcb write lock must be held (asserted).
  */
 void
 tcp6_use_min_mtu(struct tcpcb *tp)
 {
 	struct inpcb *inp = tp->t_inpcb;
 	struct ip6_pktopts *pktopts;
 
 	INP_WLOCK_ASSERT(inp);
 
 	/*
 	 * Always set INC_IPV6MINMTU so that a corresponding MSS is
 	 * announced during the initial handshake.
 	 */
 	inp->inp_inc.inc_flags |= INC_IPV6MINMTU;
 
 	/*
 	 * If the connection is not in the front states, additionally
 	 * reduce the MSS in use.  This avoids sending TCP segments
 	 * which will be fragmented at the IPv6 layer.
 	 */
 	if (tp->t_state < TCPS_SYN_SENT ||
 	    (inp->inp_inc.inc_flags & INC_ISIPV6) == 0)
 		return;
 
 	pktopts = inp->in6p_outputopts;
 	if (pktopts == NULL || pktopts->ip6po_minmtu != IP6PO_MINMTU_ALL)
 		return;
 
 	if (tp->t_maxseg > TCP6_MSS)
 		tp->t_maxseg = TCP6_MSS;
 }
 #endif /* INET6 */
 
 /*
  * Calculate effective SMSS per RFC5681 definition for a given TCP
  * connection at its current state, taking into account SACK and etc.
  *
  * Returns t_maxseg unchanged when TF_NOOPT is set; otherwise t_maxseg
  * minus an estimate of the TCP option bytes, with the estimate capped at
  * TCP_MAXOLEN.
  */
 u_int
 tcp_maxseg(const struct tcpcb *tp)
 {
 	u_int optlen;
 
 	if (tp->t_flags & TF_NOOPT)
 		return (tp->t_maxseg);
 
 	/*
 	 * Here we have a simplified code from tcp_addoptions(),
 	 * without a proper loop, and having most of paddings hardcoded.
 	 * We might make mistakes with padding here in some edge cases,
 	 * but this is harmless, since result of tcp_maxseg() is used
 	 * only in cwnd and ssthresh estimations.
 	 */
 	if (TCPS_HAVEESTABLISHED(tp->t_state)) {
 		/* Established: timestamps, signature and current SACK blocks. */
 		if (tp->t_flags & TF_RCVD_TSTMP)
 			optlen = TCPOLEN_TSTAMP_APPA;
 		else
 			optlen = 0;
 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
 		if (tp->t_flags & TF_SIGNATURE)
 			optlen += PADTCPOLEN(TCPOLEN_SIGNATURE);
 #endif
 		if ((tp->t_flags & TF_SACK_PERMIT) && tp->rcv_numsacks > 0) {
 			optlen += TCPOLEN_SACKHDR;
 			optlen += tp->rcv_numsacks * TCPOLEN_SACK;
 			optlen = PADTCPOLEN(optlen);
 		}
 	} else {
 		/* Not yet established: count the options requested for the SYN. */
 		if (tp->t_flags & TF_REQ_TSTMP)
 			optlen = TCPOLEN_TSTAMP_APPA;
 		else
 			optlen = PADTCPOLEN(TCPOLEN_MAXSEG);
 		if (tp->t_flags & TF_REQ_SCALE)
 			optlen += PADTCPOLEN(TCPOLEN_WINDOW);
 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
 		if (tp->t_flags & TF_SIGNATURE)
 			optlen += PADTCPOLEN(TCPOLEN_SIGNATURE);
 #endif
 		if (tp->t_flags & TF_SACK_PERMIT)
 			optlen += PADTCPOLEN(TCPOLEN_SACK_PERMITTED);
 	}
 	optlen = min(optlen, TCP_MAXOLEN);
 	return (tp->t_maxseg - optlen);
 }
 
 
 /*
  * Variant of tcp_maxseg() that accounts only for options sent on every
  * segment.  Returns t_maxseg unchanged when TF_NOOPT is set; otherwise
  * t_maxseg minus the estimated option bytes, capped at TCP_MAXOLEN.
  */
 u_int
 tcp_fixed_maxseg(const struct tcpcb *tp)
 {
 	int optlen;
 
 	if (tp->t_flags & TF_NOOPT)
 		return (tp->t_maxseg);
 
 	/*
 	 * Here we have a simplified code from tcp_addoptions(),
 	 * without a proper loop, and having most of paddings hardcoded.
 	 * We only consider fixed options that we would send every
 	 * time I.e. SACK is not considered. This is important
 	 * for cc modules to figure out what the modulo of the
 	 * cwnd should be.
 	 */
 #define	PAD(len)	((((len) / 4) + !!((len) % 4)) * 4)
 	if (!TCPS_HAVEESTABLISHED(tp->t_state)) {
 		/* Not yet established: count the options requested. */
 		optlen = (tp->t_flags & TF_REQ_TSTMP) ?
 		    TCPOLEN_TSTAMP_APPA : PAD(TCPOLEN_MAXSEG);
 		if (tp->t_flags & TF_REQ_SCALE)
 			optlen += PAD(TCPOLEN_WINDOW);
 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
 		if (tp->t_flags & TF_SIGNATURE)
 			optlen += PAD(TCPOLEN_SIGNATURE);
 #endif
 		if (tp->t_flags & TF_SACK_PERMIT)
 			optlen += PAD(TCPOLEN_SACK_PERMITTED);
 	} else {
 		/* Established: only timestamps and the signature are fixed. */
 		optlen = (tp->t_flags & TF_RCVD_TSTMP) ?
 		    TCPOLEN_TSTAMP_APPA : 0;
 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
 		if (tp->t_flags & TF_SIGNATURE)
 			optlen += PAD(TCPOLEN_SIGNATURE);
 #endif
 	}
 #undef PAD
 	optlen = min(optlen, TCP_MAXOLEN);
 	return (tp->t_maxseg - optlen);
 }
 
 
 
 /*
  * Write-only sysctl handler (registered below as "drop" under
  * _net_inet_tcp) that drops one TCP connection.  The new value must hold
  * two struct sockaddr_storage: the foreign address, then the local one.
  *
  * Returns EINVAL when an old-value buffer is supplied, the address lengths
  * are wrong, a v4-mapped foreign address is paired with a non-mapped
  * local one, or the family is unsupported; EPERM when no new value is
  * given; ENOMEM when the new value is too short; ESRCH when no matching
  * inpcb is found; otherwise 0 or the error from SYSCTL_IN() /
  * sa6_embedscope().
  */
 static int
 sysctl_drop(SYSCTL_HANDLER_ARGS)
 {
 	/* addrs[0] is a foreign socket, addrs[1] is a local one. */
 	struct sockaddr_storage addrs[2];
 	struct inpcb *inp;
 	struct tcpcb *tp;
 #ifdef INET
 	struct sockaddr_in *fin = NULL, *lin = NULL;
 #endif
 	struct epoch_tracker et;
 #ifdef INET6
 	struct sockaddr_in6 *fin6, *lin6;
 #endif
 	int error;
 
 	inp = NULL;
 #ifdef INET6
 	fin6 = lin6 = NULL;
 #endif
 	error = 0;
 
 	if (req->oldptr != NULL || req->oldlen != 0)
 		return (EINVAL);
 	if (req->newptr == NULL)
 		return (EPERM);
 	if (req->newlen < sizeof(addrs))
 		return (ENOMEM);
 	error = SYSCTL_IN(req, &addrs, sizeof(addrs));
 	if (error)
 		return (error);
 
 	/* First pass: validate the addresses and set up the typed pointers. */
 	switch (addrs[0].ss_family) {
 #ifdef INET6
 	case AF_INET6:
 		fin6 = (struct sockaddr_in6 *)&addrs[0];
 		lin6 = (struct sockaddr_in6 *)&addrs[1];
 		if (fin6->sin6_len != sizeof(struct sockaddr_in6) ||
 		    lin6->sin6_len != sizeof(struct sockaddr_in6))
 			return (EINVAL);
 		if (IN6_IS_ADDR_V4MAPPED(&fin6->sin6_addr)) {
 			/* Both ends must be v4-mapped; convert them in place. */
 			if (!IN6_IS_ADDR_V4MAPPED(&lin6->sin6_addr))
 				return (EINVAL);
 			in6_sin6_2_sin_in_sock((struct sockaddr *)&addrs[0]);
 			in6_sin6_2_sin_in_sock((struct sockaddr *)&addrs[1]);
 #ifdef INET
 			fin = (struct sockaddr_in *)&addrs[0];
 			lin = (struct sockaddr_in *)&addrs[1];
 #endif
 			break;
 		}
 		error = sa6_embedscope(fin6, V_ip6_use_defzone);
 		if (error)
 			return (error);
 		error = sa6_embedscope(lin6, V_ip6_use_defzone);
 		if (error)
 			return (error);
 		break;
 #endif
 #ifdef INET
 	case AF_INET:
 		fin = (struct sockaddr_in *)&addrs[0];
 		lin = (struct sockaddr_in *)&addrs[1];
 		if (fin->sin_len != sizeof(struct sockaddr_in) ||
 		    lin->sin_len != sizeof(struct sockaddr_in))
 			return (EINVAL);
 		break;
 #endif
 	default:
 		return (EINVAL);
 	}
 	NET_EPOCH_ENTER(et);
 	/*
 	 * Second pass: look the connection up.  ss_family is read again
 	 * here, after the in-place conversion of v4-mapped addresses above.
 	 */
 	switch (addrs[0].ss_family) {
 #ifdef INET6
 	case AF_INET6:
 		inp = in6_pcblookup(&V_tcbinfo, &fin6->sin6_addr,
 		    fin6->sin6_port, &lin6->sin6_addr, lin6->sin6_port,
 		    INPLOOKUP_WLOCKPCB, NULL);
 		break;
 #endif
 #ifdef INET
 	case AF_INET:
 		inp = in_pcblookup(&V_tcbinfo, fin->sin_addr, fin->sin_port,
 		    lin->sin_addr, lin->sin_port, INPLOOKUP_WLOCKPCB, NULL);
 		break;
 #endif
 	}
 	if (inp != NULL) {
 		if ((inp->inp_flags & INP_DROPPED) == 0 &&
 		    !SOLISTENING(inp->inp_socket)) {
 			tp = intotcpcb(inp);
 			tp = tcp_drop(tp, ECONNABORTED);
 			/*
 			 * The inpcb is only unlocked here when tcp_drop()
 			 * hands a tcpcb back; presumably a NULL return means
 			 * the lock has already been released -- TODO confirm.
 			 */
 			if (tp != NULL)
 				INP_WUNLOCK(inp);
 		} else
 			INP_WUNLOCK(inp);
 	} else
 		error = ESRCH;
 	NET_EPOCH_EXIT(et);
 	return (error);
 }
 
 SYSCTL_PROC(_net_inet_tcp, TCPCTL_DROP, drop,
     CTLFLAG_VNET | CTLTYPE_STRUCT | CTLFLAG_WR | CTLFLAG_SKIP |
     CTLFLAG_NEEDGIANT, NULL, 0, sysctl_drop, "",
     "Drop TCP connection");
 
 static int
 tcp_sysctl_setsockopt(SYSCTL_HANDLER_ARGS)
 {
 	return (sysctl_setsockopt(oidp, arg1, arg2, req, &V_tcbinfo,
 	    &tcp_ctloutput_set));
 }
 
 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, setsockopt,
     CTLFLAG_VNET | CTLTYPE_STRUCT | CTLFLAG_WR | CTLFLAG_SKIP |
     CTLFLAG_MPSAFE, NULL, 0, tcp_sysctl_setsockopt, "",
     "Set socket option for TCP endpoint");
 
 #ifdef KERN_TLS
 /*
  * Write-only sysctl handler shared by the switch_to_sw_tls (arg2 == 0) and
  * switch_to_ifnet_tls (arg2 == 1) nodes registered below.  The new value
  * must hold two struct sockaddr_storage: the foreign address, then the
  * local one.  The matching connection's socket is passed to
  * ktls_set_tx_mode() with TCP_TLS_MODE_SW or TCP_TLS_MODE_IFNET.
  *
  * Returns EINVAL, EPERM or ENOMEM for malformed requests, ESRCH when no
  * matching inpcb is found, ECONNRESET when the inpcb is dropped or has no
  * socket, and otherwise the result of ktls_set_tx_mode().
  */
 static int
 sysctl_switch_tls(SYSCTL_HANDLER_ARGS)
 {
 	/* addrs[0] is a foreign socket, addrs[1] is a local one. */
 	struct sockaddr_storage addrs[2];
 	struct inpcb *inp;
 #ifdef INET
 	struct sockaddr_in *fin = NULL, *lin = NULL;
 #endif
 	struct epoch_tracker et;
 #ifdef INET6
 	struct sockaddr_in6 *fin6, *lin6;
 #endif
 	int error;
 
 	inp = NULL;
 #ifdef INET6
 	fin6 = lin6 = NULL;
 #endif
 	error = 0;
 
 	if (req->oldptr != NULL || req->oldlen != 0)
 		return (EINVAL);
 	if (req->newptr == NULL)
 		return (EPERM);
 	if (req->newlen < sizeof(addrs))
 		return (ENOMEM);
 	error = SYSCTL_IN(req, &addrs, sizeof(addrs));
 	if (error)
 		return (error);
 
 	/* First pass: validate the addresses and set up the typed pointers. */
 	switch (addrs[0].ss_family) {
 #ifdef INET6
 	case AF_INET6:
 		fin6 = (struct sockaddr_in6 *)&addrs[0];
 		lin6 = (struct sockaddr_in6 *)&addrs[1];
 		if (fin6->sin6_len != sizeof(struct sockaddr_in6) ||
 		    lin6->sin6_len != sizeof(struct sockaddr_in6))
 			return (EINVAL);
 		if (IN6_IS_ADDR_V4MAPPED(&fin6->sin6_addr)) {
 			/* Both ends must be v4-mapped; convert them in place. */
 			if (!IN6_IS_ADDR_V4MAPPED(&lin6->sin6_addr))
 				return (EINVAL);
 			in6_sin6_2_sin_in_sock((struct sockaddr *)&addrs[0]);
 			in6_sin6_2_sin_in_sock((struct sockaddr *)&addrs[1]);
 #ifdef INET
 			fin = (struct sockaddr_in *)&addrs[0];
 			lin = (struct sockaddr_in *)&addrs[1];
 #endif
 			break;
 		}
 		error = sa6_embedscope(fin6, V_ip6_use_defzone);
 		if (error)
 			return (error);
 		error = sa6_embedscope(lin6, V_ip6_use_defzone);
 		if (error)
 			return (error);
 		break;
 #endif
 #ifdef INET
 	case AF_INET:
 		fin = (struct sockaddr_in *)&addrs[0];
 		lin = (struct sockaddr_in *)&addrs[1];
 		if (fin->sin_len != sizeof(struct sockaddr_in) ||
 		    lin->sin_len != sizeof(struct sockaddr_in))
 			return (EINVAL);
 		break;
 #endif
 	default:
 		return (EINVAL);
 	}
 	NET_EPOCH_ENTER(et);
 	/*
 	 * Second pass: look the connection up.  ss_family is read again
 	 * here, after the in-place conversion of v4-mapped addresses above.
 	 */
 	switch (addrs[0].ss_family) {
 #ifdef INET6
 	case AF_INET6:
 		inp = in6_pcblookup(&V_tcbinfo, &fin6->sin6_addr,
 		    fin6->sin6_port, &lin6->sin6_addr, lin6->sin6_port,
 		    INPLOOKUP_WLOCKPCB, NULL);
 		break;
 #endif
 #ifdef INET
 	case AF_INET:
 		inp = in_pcblookup(&V_tcbinfo, fin->sin_addr, fin->sin_port,
 		    lin->sin_addr, lin->sin_port, INPLOOKUP_WLOCKPCB, NULL);
 		break;
 #endif
 	}
 	/* The epoch is left here; a non-NULL inp is still write-locked. */
 	NET_EPOCH_EXIT(et);
 	if (inp != NULL) {
-		if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) != 0 ||
+		if ((inp->inp_flags & INP_DROPPED) != 0 ||
 		    inp->inp_socket == NULL) {
 			error = ECONNRESET;
 			INP_WUNLOCK(inp);
 		} else {
 			struct socket *so;
 
 			/*
 			 * Hold a socket reference across the call and drop
 			 * it only after the inpcb has been unlocked.
 			 */
 			so = inp->inp_socket;
 			soref(so);
 			error = ktls_set_tx_mode(so,
 			    arg2 == 0 ? TCP_TLS_MODE_SW : TCP_TLS_MODE_IFNET);
 			INP_WUNLOCK(inp);
 			sorele(so);
 		}
 	} else
 		error = ESRCH;
 	return (error);
 }
 
 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, switch_to_sw_tls,
     CTLFLAG_VNET | CTLTYPE_STRUCT | CTLFLAG_WR | CTLFLAG_SKIP |
     CTLFLAG_NEEDGIANT, NULL, 0, sysctl_switch_tls, "",
     "Switch TCP connection to SW TLS");
 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, switch_to_ifnet_tls,
     CTLFLAG_VNET | CTLTYPE_STRUCT | CTLFLAG_WR | CTLFLAG_SKIP |
     CTLFLAG_NEEDGIANT, NULL, 1, sysctl_switch_tls, "",
     "Switch TCP connection to ifnet TLS");
 #endif
 
 /*
  * Generate a standardized TCP log line for use throughout the
  * tcp subsystem.  Memory allocation is done with M_NOWAIT to
  * allow use in the interrupt context.
  *
  * NB: The caller MUST free(s, M_TCPLOG) the returned string.
  * NB: The function may return NULL if memory allocation failed.
  *
  * Due to header inclusion and ordering limitations the struct ip
  * and ip6_hdr pointers have to be passed as void pointers.
  */
 char *
 tcp_log_vain(struct in_conninfo *inc, struct tcphdr *th, const void *ip4hdr,
     const void *ip6hdr)
 {
 
 	/* Nothing is formatted unless V_tcp_log_in_vain is non-zero. */
 	return (V_tcp_log_in_vain == 0 ? NULL :
 	    tcp_log_addr(inc, th, ip4hdr, ip6hdr));
 }
 
 /*
  * Same as tcp_log_vain(), but gated on tcp_log_debug instead.  Returns
  * NULL when that knob is zero.
  */
 char *
 tcp_log_addrs(struct in_conninfo *inc, struct tcphdr *th, const void *ip4hdr,
     const void *ip6hdr)
 {
 
 	/* Is logging enabled? */
 	if (tcp_log_debug != 0)
 		return (tcp_log_addr(inc, th, ip4hdr, ip6hdr));
 
 	return (NULL);
 }
 
 /*
  * Build the log line shared by tcp_log_vain() and tcp_log_addrs().
  * The endpoints are taken from 'inc' when it is non-NULL, otherwise from
  * the IPv6 or IPv4 header together with 'th'.  Returns a string allocated
  * from M_TCPLOG, or NULL when the M_NOWAIT allocation fails or no usable
  * endpoint source was supplied.
  */
 static char *
 tcp_log_addr(struct in_conninfo *inc, struct tcphdr *th, const void *ip4hdr,
     const void *ip6hdr)
 {
 	char *s, *sp;
 	size_t size;
 #ifdef INET
 	const struct ip *ip = (const struct ip *)ip4hdr;
 #endif
 #ifdef INET6
 	const struct ip6_hdr *ip6 = (const struct ip6_hdr *)ip6hdr;
 #endif /* INET6 */
 
 	/*
 	 * The log line looks like this:
 	 * "TCP: [1.2.3.4]:50332 to [1.2.3.4]:80 tcpflags 0x2<SYN>"
 	 */
 	size = sizeof("TCP: []:12345 to []:12345 tcpflags 0x2<>") +
 	    sizeof(PRINT_TH_FLAGS) + 1 +
 #ifdef INET6
 	    2 * INET6_ADDRSTRLEN;
 #else
 	    2 * INET_ADDRSTRLEN;
 #endif /* INET6 */
 
 	/* M_ZERO matters: strcat() below and the final check rely on it. */
 	s = malloc(size, M_TCPLOG, M_ZERO|M_NOWAIT);
 	if (s == NULL)
 		return (NULL);
 
 	strcat(s, "TCP: [");
 	sp = s + strlen(s);
 
 	/* After each piece, sp is moved to the current end of the string. */
 	if (inc && ((inc->inc_flags & INC_ISIPV6) == 0)) {
 		inet_ntoa_r(inc->inc_faddr, sp);
 		sp = s + strlen(s);
 		sprintf(sp, "]:%i to [", ntohs(inc->inc_fport));
 		sp = s + strlen(s);
 		inet_ntoa_r(inc->inc_laddr, sp);
 		sp = s + strlen(s);
 		sprintf(sp, "]:%i", ntohs(inc->inc_lport));
 #ifdef INET6
 	} else if (inc) {
 		ip6_sprintf(sp, &inc->inc6_faddr);
 		sp = s + strlen(s);
 		sprintf(sp, "]:%i to [", ntohs(inc->inc_fport));
 		sp = s + strlen(s);
 		ip6_sprintf(sp, &inc->inc6_laddr);
 		sp = s + strlen(s);
 		sprintf(sp, "]:%i", ntohs(inc->inc_lport));
 	} else if (ip6 && th) {
 		ip6_sprintf(sp, &ip6->ip6_src);
 		sp = s + strlen(s);
 		sprintf(sp, "]:%i to [", ntohs(th->th_sport));
 		sp = s + strlen(s);
 		ip6_sprintf(sp, &ip6->ip6_dst);
 		sp = s + strlen(s);
 		sprintf(sp, "]:%i", ntohs(th->th_dport));
 #endif /* INET6 */
 #ifdef INET
 	} else if (ip && th) {
 		inet_ntoa_r(ip->ip_src, sp);
 		sp = s + strlen(s);
 		sprintf(sp, "]:%i to [", ntohs(th->th_sport));
 		sp = s + strlen(s);
 		inet_ntoa_r(ip->ip_dst, sp);
 		sp = s + strlen(s);
 		sprintf(sp, "]:%i", ntohs(th->th_dport));
 #endif /* INET */
 	} else {
 		/* No usable source for the endpoints. */
 		free(s, M_TCPLOG);
 		return (NULL);
 	}
 	sp = s + strlen(s);
 	if (th)
 		sprintf(sp, " tcpflags 0x%b", tcp_get_flags(th), PRINT_TH_FLAGS);
 	/*
 	 * The buffer started out zeroed, so a non-NUL last byte means the
 	 * formatting above reached the end of the allocation.
 	 */
 	if (*(s + size - 1) != '\0')
 		panic("%s: string too long", __func__);
 	return (s);
 }
 
 /*
  * A subroutine which makes it easy to track TCP state changes with DTrace.
  * This function shouldn't be called for t_state initializations that don't
  * correspond to actual TCP state transitions.
  */
 void
 tcp_state_change(struct tcpcb *tp, int newstate)
 {
 #if defined(KDTRACE_HOOKS)
 	int pstate = tp->t_state;
 #endif
 
 	TCPSTATES_DEC(tp->t_state);
 	TCPSTATES_INC(newstate);
 	tp->t_state = newstate;
 	TCP_PROBE6(state__change, NULL, tp, NULL, tp, NULL, pstate);
 }
 
 /*
  * Create an external-format (``xtcpcb'') structure using the information in
  * the kernel-format tcpcb structure pointed to by tp.  This is done to
  * reduce the spew of irrelevant information over this interface, to isolate
  * user code from changes in the kernel structure, and potentially to provide
  * information-hiding if we decide that some of this information should be
  * hidden from users.
  *
  * 'xt' is zeroed first and then filled from the tcpcb behind 'inp'; timer
  * values are exported in milliseconds relative to now, 0 when inactive.
  */
 void
 tcp_inptoxtp(const struct inpcb *inp, struct xtcpcb *xt)
 {
 	struct tcpcb *tp = intotcpcb(inp);
 	sbintime_t now;
 
 	bzero(xt, sizeof(*xt));
 	xt->t_state = tp->t_state;
 	xt->t_logstate = tp->t_logstate;
 	xt->t_flags = tp->t_flags;
 	xt->t_sndzerowin = tp->t_sndzerowin;
 	xt->t_sndrexmitpack = tp->t_sndrexmitpack;
 	xt->t_rcvoopack = tp->t_rcvoopack;
 	xt->t_rcv_wnd = tp->rcv_wnd;
 	xt->t_snd_wnd = tp->snd_wnd;
 	xt->t_snd_cwnd = tp->snd_cwnd;
 	xt->t_snd_ssthresh = tp->snd_ssthresh;
 	xt->t_dsack_bytes = tp->t_dsack_bytes;
 	xt->t_dsack_tlp_bytes = tp->t_dsack_tlp_bytes;
 	xt->t_dsack_pack = tp->t_dsack_pack;
 	xt->t_maxseg = tp->t_maxseg;
 	/*
 	 * Bit 0 reports TF2_ECN_PERMIT, bit 1 reports TF2_ACE_PERMIT.  Each
 	 * conditional needs its own parentheses: ?: binds looser than +, so
 	 * without them the ACE term is swallowed by the first else-branch.
 	 */
 	xt->xt_ecn = ((tp->t_flags2 & TF2_ECN_PERMIT) ? 1 : 0) +
 		     ((tp->t_flags2 & TF2_ACE_PERMIT) ? 2 : 0);
 
 	now = getsbinuptime();
 	/* Export the time left on an active callout, in milliseconds. */
 #define	COPYTIMER(ttt)	do {					\
 	if (callout_active(&tp->t_timers->ttt))			\
 		xt->ttt = (tp->t_timers->ttt.c_time - now) /	\
 		    SBT_1MS;					\
 	else							\
 		xt->ttt = 0;					\
 } while (0)
 	COPYTIMER(tt_delack);
 	COPYTIMER(tt_rexmt);
 	COPYTIMER(tt_persist);
 	COPYTIMER(tt_keep);
 	COPYTIMER(tt_2msl);
 #undef COPYTIMER
 	/* Ticks since t_rcvtime, converted to milliseconds. */
 	xt->t_rcvtime = 1000 * (ticks - tp->t_rcvtime) / hz;
 
 	xt->xt_encaps_port = tp->t_port;
 	bcopy(tp->t_fb->tfb_tcp_block_name, xt->xt_stack,
 	    TCP_FUNCTION_NAME_LEN_MAX);
 	bcopy(CC_ALGO(tp)->name, xt->xt_cc, TCP_CA_NAME_MAX);
 #ifdef TCP_BLACKBOX
 	(void)tcp_log_get_id(tp, xt->xt_logid);
 #endif
 
 	xt->xt_len = sizeof(struct xtcpcb);
 	in_pcbtoxinpcb(inp, &xt->xt_inp);
 	/* With no socket attached, the protocol is filled in here instead. */
 	if (inp->inp_socket == NULL)
 		xt->xt_inp.xi_socket.xso_protocol = IPPROTO_TCP;
 }
 
 /*
  * Record end-status code 'status' on the connection: store it in the
  * first empty t_end_info_bytes slot and set its bit in t_end_info_status.
  * Invalid codes, codes already recorded, and a full slot array are all
  * silently ignored.
  */
 void
 tcp_log_end_status(struct tcpcb *tp, uint8_t status)
 {
 	uint32_t mask, slot;
 
 	/* Invalid */
 	if (tp == NULL || status == 0 || status > TCP_EI_STATUS_MAX_VALUE)
 		return;
 
 	/* Should this be a KASSERT? */
 	if (status > (sizeof(uint32_t) * 8))
 		return;
 
 	mask = 1U << (status - 1);
 	if (tp->t_end_info_status & mask) {
 		/* already logged */
 		return;
 	}
 
 	for (slot = 0; slot < TCP_END_BYTE_INFO; slot++) {
 		if (tp->t_end_info_bytes[slot] != TCP_EI_EMPTY_SLOT)
 			continue;
 		tp->t_end_info_bytes[slot] = status;
 		tp->t_end_info_status |= mask;
 		break;
 	}
 }
 
 /*
  * Ask whether one more connection may use pacing.  Returns 1 and bumps
  * the paced-connection count when tcp_pacing_limit is -1 or still above
  * the current count; returns 0 otherwise.
  */
 int
 tcp_can_enable_pacing(void)
 {
 
 	/* A limit other than -1 that has been reached refuses the request. */
 	if ((tcp_pacing_limit != -1) &&
 	    (tcp_pacing_limit <= number_of_tcp_connections_pacing))
 		return (0);
 
 	atomic_fetchadd_int(&number_of_tcp_connections_pacing, 1);
 	shadow_num_connections = number_of_tcp_connections_pacing;
 	return (1);
 }
 
 /* Set once the invalid-decrement warning has been printed. */
 static uint8_t tcp_pacing_warning = 0;
 
 /*
  * Drop one connection from the paced-connection count.  A decrement from
  * zero trips the KASSERT; where that does not stop execution, a warning
  * is printed and, if a limit is configured, the limit is forced to 0.
  */
 void
 tcp_decrement_paced_conn(void)
 {
 	uint32_t prev;
 
 	prev = atomic_fetchadd_int(&number_of_tcp_connections_pacing, -1);
 	shadow_num_connections = number_of_tcp_connections_pacing;
 	KASSERT(prev != 0, ("tcp_paced_connection_exits -1 would cause wrap?"));
 	if (prev != 0)
 		return;
 
 	/* The count was already zero before this decrement. */
 	if (tcp_pacing_limit != -1) {
 		printf("Warning all pacing is now disabled, count decrements invalidly!\n");
 		tcp_pacing_limit = 0;
 	} else if (tcp_pacing_warning == 0) {
 		printf("Warning pacing count is invalid, invalid decrement\n");
 		tcp_pacing_warning = 1;
 	}
 }
diff --git a/sys/netinet/tcp_timer.c b/sys/netinet/tcp_timer.c
index f4915da6e77c..391b9dfdbc05 100644
--- a/sys/netinet/tcp_timer.c
+++ b/sys/netinet/tcp_timer.c
@@ -1,1148 +1,1148 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)tcp_timer.c	8.2 (Berkeley) 5/24/95
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_tcpdebug.h"
 #include "opt_rss.h"
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/mbuf.h>
 #include <sys/mutex.h>
 #include <sys/protosw.h>
 #include <sys/smp.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sysctl.h>
 #include <sys/systm.h>
 
 #include <net/if.h>
 #include <net/route.h>
 #include <net/rss_config.h>
 #include <net/vnet.h>
 #include <net/netisr.h>
 
 #include <netinet/in.h>
 #include <netinet/in_kdtrace.h>
 #include <netinet/in_pcb.h>
 #include <netinet/in_rss.h>
 #include <netinet/in_systm.h>
 #ifdef INET6
 #include <netinet6/in6_pcb.h>
 #endif
 #include <netinet/ip_var.h>
 #include <netinet/tcp.h>
 #include <netinet/tcp_fsm.h>
 #include <netinet/tcp_log_buf.h>
 #include <netinet/tcp_timer.h>
 #include <netinet/tcp_var.h>
 #include <netinet/tcp_seq.h>
 #include <netinet/cc/cc.h>
 #ifdef INET6
 #include <netinet6/tcp6_var.h>
 #endif
 #include <netinet/tcpip.h>
 #include <netinet/tcp_debug.h>
 
 /*
  * Tunables for the TCP timers, published under the net.inet.tcp sysctl
  * node.  The entries registered through SYSCTL_PROC all share the
  * sysctl_msec_to_ticks handler; judging by its name it converts between
  * milliseconds (userland view) and ticks (stored value) -- confirm
  * against the handler itself, which is not part of this section.
  */
 int    tcp_persmin;
 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, persmin,
     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
     &tcp_persmin, 0, sysctl_msec_to_ticks, "I",
     "minimum persistence interval");
 
 int    tcp_persmax;
 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, persmax,
     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
     &tcp_persmax, 0, sysctl_msec_to_ticks, "I",
     "maximum persistence interval");
 
 int	tcp_keepinit;
 SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINIT, keepinit,
     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
     &tcp_keepinit, 0, sysctl_msec_to_ticks, "I",
     "time to establish connection");
 
 int	tcp_keepidle;
 SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPIDLE, keepidle,
     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
     &tcp_keepidle, 0, sysctl_msec_to_ticks, "I",
     "time before keepalive probes begin");
 
 int	tcp_keepintvl;
 SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINTVL, keepintvl,
     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
     &tcp_keepintvl, 0, sysctl_msec_to_ticks, "I",
     "time between keepalive probes");
 
 int	tcp_delacktime;
 SYSCTL_PROC(_net_inet_tcp, TCPCTL_DELACKTIME, delacktime,
     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
     &tcp_delacktime, 0, sysctl_msec_to_ticks, "I",
     "Time before a delayed ACK is sent");
 
 /* Per-vnet (VNET_DEFINE / CTLFLAG_VNET), unlike the globals above. */
 VNET_DEFINE(int, tcp_msl);
 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, msl,
     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_VNET,
     &VNET_NAME(tcp_msl), 0, sysctl_msec_to_ticks, "I",
     "Maximum segment lifetime");
 
 int	tcp_rexmit_initial;
 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_initial,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
     &tcp_rexmit_initial, 0, sysctl_msec_to_ticks, "I",
     "Initial Retransmission Timeout");
 
 int	tcp_rexmit_min;
 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_min,
     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
     &tcp_rexmit_min, 0, sysctl_msec_to_ticks, "I",
     "Minimum Retransmission Timeout");
 
 int	tcp_rexmit_slop;
 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_slop,
     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
     &tcp_rexmit_slop, 0, sysctl_msec_to_ticks, "I",
     "Retransmission Timer Slop");
 
 VNET_DEFINE(int, tcp_always_keepalive) = 1;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, always_keepalive, CTLFLAG_VNET|CTLFLAG_RW,
     &VNET_NAME(tcp_always_keepalive) , 0,
     "Assume SO_KEEPALIVE on all TCP connections");
 
 int    tcp_fast_finwait2_recycle = 0;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, fast_finwait2_recycle, CTLFLAG_RW,
     &tcp_fast_finwait2_recycle, 0,
     "Recycle closed FIN_WAIT_2 connections faster");
 
 int    tcp_finwait2_timeout;
 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, finwait2_timeout,
     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
     &tcp_finwait2_timeout, 0, sysctl_msec_to_ticks, "I",
     "FIN-WAIT2 timeout");
 
 int	tcp_keepcnt = TCPTV_KEEPCNT;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, keepcnt, CTLFLAG_RW, &tcp_keepcnt, 0,
     "Number of keepalive probes to send");
 
 	/* max idle probes */
 int	tcp_maxpersistidle;
 
 int	tcp_rexmit_drop_options = 0;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, rexmit_drop_options, CTLFLAG_RW,
     &tcp_rexmit_drop_options, 0,
     "Drop TCP options from 3rd and later retransmitted SYN");
 
 int	tcp_maxunacktime = TCPTV_MAXUNACKTIME;
 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, maxunacktime,
     CTLTYPE_INT|CTLFLAG_RW | CTLFLAG_NEEDGIANT,
     &tcp_maxunacktime, 0, sysctl_msec_to_ticks, "I",
     "Maximum time (in ms) that a session can linger without making progress");
 
 /*
  * Path MTU black hole detection.  tcp_timer_rexmt() acts on the values
  * 1 (all connections), 2 (IPv4 only) and 3 (IPv6 only); any other value
  * leaves detection off there.
  */
 VNET_DEFINE(int, tcp_pmtud_blackhole_detect);
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_detection,
     CTLFLAG_RW|CTLFLAG_VNET,
     &VNET_NAME(tcp_pmtud_blackhole_detect), 0,
     "Path MTU Discovery Black Hole Detection Enabled");
 
 #ifdef INET
 VNET_DEFINE(int, tcp_pmtud_blackhole_mss) = 1200;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_mss,
     CTLFLAG_RW|CTLFLAG_VNET,
     &VNET_NAME(tcp_pmtud_blackhole_mss), 0,
     "Path MTU Discovery Black Hole Detection lowered MSS");
 #endif
 
 #ifdef INET6
 VNET_DEFINE(int, tcp_v6pmtud_blackhole_mss) = 1220;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, v6pmtud_blackhole_mss,
     CTLFLAG_RW|CTLFLAG_VNET,
     &VNET_NAME(tcp_v6pmtud_blackhole_mss), 0,
     "Path MTU Discovery IPv6 Black Hole Detection lowered MSS");
 #endif
 
 /*
  * Whether inp_to_cpuid() spreads timers over CPUs (non-zero) or pins
  * them all to CPU 0.  The default is on when the kernel is built with
  * RSS and off otherwise.
  */
 #ifdef	RSS
 static int	per_cpu_timers = 1;
 #else
 static int	per_cpu_timers = 0;
 #endif
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, per_cpu_timers, CTLFLAG_RW,
     &per_cpu_timers , 0, "run tcp timers on all cpus");
 
 /*
  * Pick the CPU on which the timers of the given inp should run.
  *
  * With per_cpu_timers disabled the answer is always CPU 0.  Otherwise,
  * when RSS is compiled in, the flowid/flowtype pair is handed to
  * rss_hash2cpuid() and curcpu is used if that yields
  * NETISR_CPUID_NONE.  Without RSS the flowid is reduced modulo
  * (mp_maxid + 1) and curcpu is used if that CPU is absent.
  */
 inline int
 inp_to_cpuid(struct inpcb *inp)
 {
 	u_int cpu;
 
 	if (!per_cpu_timers)
 		return (0);
 
 #ifdef	RSS
 	cpu = rss_hash2cpuid(inp->inp_flowid, inp->inp_flowtype);
 	if (cpu != NETISR_CPUID_NONE)
 		return (cpu);
 	return (curcpu);	/* XXX */
 #endif
 	/*
 	 * No flowid -> cpuid mapping is available, so derive a candidate
 	 * from the flowid and settle for curcpu when that CPU does not
 	 * exist.  Not the best, but apparently better than defaulting to
 	 * swi 0.
 	 */
 	cpu = inp->inp_flowid % (mp_maxid + 1);
 	if (CPU_ABSENT(cpu))
 		return (curcpu);
 	return (cpu);
 }
 
 /*
  * Retransmit backoff multipliers, indexed by t_rxtshift: tcp_timer_rexmt()
  * scales the retransmit timeout by tcp_backoff[tp->t_rxtshift].
  */
 int	tcp_backoff[TCP_MAXRXTSHIFT + 1] =
     { 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 512, 512, 512 };
 
 /* Used by tcp_timer_persist() as the multiplier for its idle-time limit. */
 int tcp_totbackoff = 2559;	/* sum of tcp_backoff[] */
 
 /*
  * TCP timer processing.
  */
 
 /*
  * Delayed-ACK callout handler; xtp is the tcpcb.
  *
  * Takes the inpcb write lock, bails out if the callout was rescheduled
  * or stopped in the meantime or if the inpcb has been dropped, and
  * otherwise sets TF_ACKNOW and pushes output inside the network epoch.
  */
 void
 tcp_timer_delack(void *xtp)
 {
 	struct epoch_tracker et;
 	struct tcpcb *tp = xtp;
 	struct inpcb *inp;
 	struct callout *co;
 	CURVNET_SET(tp->t_vnet);
 
 	inp = tp->t_inpcb;
 	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
 	INP_WLOCK(inp);
 	co = &tp->t_timers->tt_delack;
 	if (callout_pending(co) || !callout_active(co))
 		goto unlock;
 	callout_deactivate(co);
 	if ((inp->inp_flags & INP_DROPPED) != 0)
 		goto unlock;
 
 	tp->t_flags |= TF_ACKNOW;
 	TCPSTAT_INC(tcps_delack);
 	NET_EPOCH_ENTER(et);
 	/*
 	 * No explicit unlock on this path: going by its name,
 	 * tcp_output_unlock() is expected to release the inpcb lock
 	 * itself -- confirm against its definition.
 	 */
 	(void) tcp_output_unlock(tp);
 	NET_EPOCH_EXIT(et);
 	CURVNET_RESTORE();
 	return;
 
 unlock:
 	INP_WUNLOCK(inp);
 	CURVNET_RESTORE();
 }
 
 /*
  * Close a connection from callout context.
  *
  * The inpcb write lock must be held on entry (asserted).  tcp_close()
  * runs inside the network epoch; the lock is released here only when
  * tcp_close() hands back a non-NULL tcpcb.
  */
 static void
 tcp_timer_close(struct tcpcb *tp)
 {
 	struct epoch_tracker et;
 	struct inpcb *inp;
 	struct tcpcb *left;
 
 	inp = tp->t_inpcb;
 	INP_WLOCK_ASSERT(inp);
 
 	NET_EPOCH_ENTER(et);
 	left = tcp_close(tp);
 	NET_EPOCH_EXIT(et);
 	if (left == NULL)
 		return;
 	INP_WUNLOCK(inp);
 }
 
 /*
  * Drop a connection with ETIMEDOUT from callout context.
  *
  * The inpcb write lock must be held on entry (asserted).  tcp_drop()
  * runs inside the network epoch; the lock is released here only when
  * tcp_drop() hands back a non-NULL tcpcb.
  */
 static void
 tcp_timer_drop(struct tcpcb *tp)
 {
 	struct epoch_tracker et;
 	struct inpcb *inp;
 	struct tcpcb *left;
 
 	inp = tp->t_inpcb;
 	INP_WLOCK_ASSERT(inp);
 
 	NET_EPOCH_ENTER(et);
 	left = tcp_drop(tp, ETIMEDOUT);
 	NET_EPOCH_EXIT(et);
 	if (left == NULL)
 		return;
 	INP_WUNLOCK(inp);
 }
 
 /*
  * 2MSL callout handler; xtp is the tcpcb.
  *
  * After the stale-callout and INP_DROPPED checks, the connection is
  * closed when it is in TIME_WAIT, when it is a FIN_WAIT_2 connection
  * eligible for fast recycling, or when nothing has been received for
  * longer than TP_MAXIDLE().  Otherwise the callout re-arms itself after
  * TP_KEEPINTVL().
  */
 void
 tcp_timer_2msl(void *xtp)
 {
 	struct tcpcb *tp = xtp;
 	struct inpcb *inp;
 	CURVNET_SET(tp->t_vnet);
 #ifdef TCPDEBUG
 	int ostate;
 
 	ostate = tp->t_state;
 #endif
 	inp = tp->t_inpcb;
 	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
 	INP_WLOCK(inp);
 	/*
 	 * NOTE(review): the next two calls run before the stale-callout
 	 * check below, i.e. even when this invocation turns out to be a
 	 * no-op.
 	 */
 	tcp_log_end_status(tp, TCP_EI_STATUS_2MSL);
 	tcp_free_sackholes(tp);
 	/* Rescheduled or stopped while we waited for the lock: nothing to do. */
 	if (callout_pending(&tp->t_timers->tt_2msl) ||
 	    !callout_active(&tp->t_timers->tt_2msl)) {
 		INP_WUNLOCK(tp->t_inpcb);
 		CURVNET_RESTORE();
 		return;
 	}
 	callout_deactivate(&tp->t_timers->tt_2msl);
-	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
+	if (inp->inp_flags & INP_DROPPED) {
 		INP_WUNLOCK(inp);
 		CURVNET_RESTORE();
 		return;
 	}
 	KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
 		("%s: tp %p tcpcb can't be stopped here", __func__, tp));
 	/*
 	 * 2 MSL timeout in shutdown went off.  If we're closed but
 	 * still waiting for peer to close and connection has been idle
 	 * too long delete connection control block.  Otherwise, check
 	 * again in a bit.
 	 *
 	 * If fastrecycle of FIN_WAIT_2, in FIN_WAIT_2 and receiver has closed,
 	 * there's no point in hanging onto FIN_WAIT_2 socket. Just close it.
 	 * Ignore fact that there were recent incoming segments.
 	 */
 	/*
 	 * On every tcp_timer_close() path below the inpcb lock is left to
 	 * tcp_timer_close(); only the vnet context is restored here.
 	 */
 	if (tp->t_state == TCPS_TIME_WAIT) {
 		tcp_timer_close(tp);
 		CURVNET_RESTORE();
 		return;
 	} else if (tp->t_state == TCPS_FIN_WAIT_2 &&
 	    tcp_fast_finwait2_recycle && tp->t_inpcb->inp_socket &&
 	    (tp->t_inpcb->inp_socket->so_rcv.sb_state & SBS_CANTRCVMORE)) {
 		TCPSTAT_INC(tcps_finwait2_drops);
 		tcp_timer_close(tp);
 		CURVNET_RESTORE();
 		return;
 	} else {
 		/* Still within the idle allowance: look again later. */
 		if (ticks - tp->t_rcvtime <= TP_MAXIDLE(tp)) {
 			callout_reset(&tp->t_timers->tt_2msl,
 				      TP_KEEPINTVL(tp), tcp_timer_2msl, tp);
 		} else {
 			tcp_timer_close(tp);
 			CURVNET_RESTORE();
 			return;
 		}
 	}
 
 #ifdef TCPDEBUG
 	if (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)
 		tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
 			  PRU_SLOWTIMO);
 #endif
 	TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
 
 	INP_WUNLOCK(inp);
 	CURVNET_RESTORE();
 }
 
 /*
  * Keepalive callout handler; xtp is the tcpcb.
  *
  * For a connection that has reached ESTABLISHED and has been idle for
  * less than TP_KEEPIDLE() the callout is simply re-armed for the
  * remaining time.  Otherwise a connection that has not yet reached
  * ESTABLISHED is dropped, and one that is subject to keepalives
  * (V_tcp_always_keepalive or SO_KEEPALIVE, state <= CLOSING) is either
  * probed through tcp_respond() or, once idle for TP_KEEPIDLE() +
  * TP_MAXIDLE(), dropped with ETIMEDOUT.  Connections not subject to
  * keepalives just get the callout re-armed with TP_KEEPIDLE().
  */
 void
 tcp_timer_keep(void *xtp)
 {
 	struct tcpcb *tp = xtp;
 	struct tcptemp *t_template;
 	struct inpcb *inp;
 	struct epoch_tracker et;
 	CURVNET_SET(tp->t_vnet);
 #ifdef TCPDEBUG
 	int ostate;
 
 	ostate = tp->t_state;
 #endif
 	inp = tp->t_inpcb;
 	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
 	INP_WLOCK(inp);
 	/* Rescheduled or stopped while we waited for the lock: nothing to do. */
 	if (callout_pending(&tp->t_timers->tt_keep) ||
 	    !callout_active(&tp->t_timers->tt_keep)) {
 		INP_WUNLOCK(inp);
 		CURVNET_RESTORE();
 		return;
 	}
 	callout_deactivate(&tp->t_timers->tt_keep);
-	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
+	if (inp->inp_flags & INP_DROPPED) {
 		INP_WUNLOCK(inp);
 		CURVNET_RESTORE();
 		return;
 	}
 	KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
 		("%s: tp %p tcpcb can't be stopped here", __func__, tp));
 
 	/*
 	 * Because we don't regularly reset the keepalive callout in
 	 * the ESTABLISHED state, it may be that we don't actually need
 	 * to send a keepalive yet. If that occurs, schedule another
 	 * call for the next time the keepalive timer might expire.
 	 */
 	if (TCPS_HAVEESTABLISHED(tp->t_state)) {
 		u_int idletime;
 
 		idletime = ticks - tp->t_rcvtime;
 		if (idletime < TP_KEEPIDLE(tp)) {
 			callout_reset(&tp->t_timers->tt_keep,
 			    TP_KEEPIDLE(tp) - idletime, tcp_timer_keep, tp);
 			INP_WUNLOCK(inp);
 			CURVNET_RESTORE();
 			return;
 		}
 	}
 
 	/*
 	 * Keep-alive timer went off; send something
 	 * or drop connection if idle for too long.
 	 */
 	TCPSTAT_INC(tcps_keeptimeo);
 	if (tp->t_state < TCPS_ESTABLISHED)
 		goto dropit;
 	if ((V_tcp_always_keepalive ||
 	    inp->inp_socket->so_options & SO_KEEPALIVE) &&
 	    tp->t_state <= TCPS_CLOSING) {
 		if (ticks - tp->t_rcvtime >= TP_KEEPIDLE(tp) + TP_MAXIDLE(tp))
 			goto dropit;
 		/*
 		 * Send a packet designed to force a response
 		 * if the peer is up and reachable:
 		 * either an ACK if the connection is still alive,
 		 * or an RST if the peer has closed the connection
 		 * due to timeout or reboot.
 		 * Using sequence number tp->snd_una-1
 		 * causes the transmitted zero-length segment
 		 * to lie outside the receive window;
 		 * by the protocol spec, this requires the
 		 * correspondent TCP to respond.
 		 */
 		TCPSTAT_INC(tcps_keepprobe);
 		t_template = tcpip_maketemplate(inp);
 		/* No template means no probe this round; the timer is still re-armed. */
 		if (t_template) {
 			NET_EPOCH_ENTER(et);
 			tcp_respond(tp, t_template->tt_ipgen,
 				    &t_template->tt_t, (struct mbuf *)NULL,
 				    tp->rcv_nxt, tp->snd_una - 1, 0);
 			NET_EPOCH_EXIT(et);
 			free(t_template, M_TEMP);
 		}
 		callout_reset(&tp->t_timers->tt_keep, TP_KEEPINTVL(tp),
 			      tcp_timer_keep, tp);
 	} else
 		callout_reset(&tp->t_timers->tt_keep, TP_KEEPIDLE(tp),
 			      tcp_timer_keep, tp);
 
 #ifdef TCPDEBUG
 	if (inp->inp_socket->so_options & SO_DEBUG)
 		tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
 			  PRU_SLOWTIMO);
 #endif
 	TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
 	INP_WUNLOCK(inp);
 	CURVNET_RESTORE();
 	return;
 
 	/*
 	 * Drop path: entered with the inpcb lock held.  The lock is released
 	 * below only if tcp_drop() returns a non-NULL tcpcb.
 	 */
 dropit:
 	TCPSTAT_INC(tcps_keepdrops);
 	NET_EPOCH_ENTER(et);
 	tcp_log_end_status(tp, TCP_EI_STATUS_KEEP_MAX);
 	tp = tcp_drop(tp, ETIMEDOUT);
 
 #ifdef TCPDEBUG
 	if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
 		tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
 			  PRU_SLOWTIMO);
 #endif
 	TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
 	NET_EPOCH_EXIT(et);
 	if (tp != NULL)
 		INP_WUNLOCK(inp);
 	CURVNET_RESTORE();
 }
 
 /*
  * Decide whether a session has gone too long without progress.
  *
  * Returns false when the limit is disabled (TP_MAXUNACKTIME() == 0),
  * when no measurement is running (t_acktime == 0), or when
  * t_acktime + TP_MAXUNACKTIME() still lies ahead of the current tick
  * count.  Otherwise bumps the tcps_progdrops counter and returns true.
  */
 static bool
 tcp_maxunacktime_check(struct tcpcb *tp)
 {
 
 	if (TP_MAXUNACKTIME(tp) == 0 ||
 	    tp->t_acktime == 0 ||
 	    TSTMP_GT(TP_MAXUNACKTIME(tp) + tp->t_acktime, (u_int)ticks))
 		return (false);
 
 	/* The limit has been exceeded. */
 	TCPSTAT_INC(tcps_progdrops);
 	return (true);
 }
 
 /*
  * Persist callout handler; xtp is the tcpcb.
  *
  * Drops the connection through tcp_timer_drop() when
  * tcp_maxunacktime_check() reports no progress, when the backoff is
  * exhausted and nothing has been received for too long, or when the
  * connection is past CLOSE_WAIT and has been idle for TCPTV_PERSMAX.
  * Otherwise calls tcp_setpersist() and forces output with TF_FORCEDATA
  * set for the duration of the tcp_output_nodrop() call.
  */
 void
 tcp_timer_persist(void *xtp)
 {
 	struct tcpcb *tp = xtp;
 	struct inpcb *inp;
 	struct epoch_tracker et;
 	bool progdrop;
 	int outrv;
 	CURVNET_SET(tp->t_vnet);
 #ifdef TCPDEBUG
 	int ostate;
 
 	ostate = tp->t_state;
 #endif
 	inp = tp->t_inpcb;
 	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
 	INP_WLOCK(inp);
 	/* Rescheduled or stopped while we waited for the lock: nothing to do. */
 	if (callout_pending(&tp->t_timers->tt_persist) ||
 	    !callout_active(&tp->t_timers->tt_persist)) {
 		INP_WUNLOCK(inp);
 		CURVNET_RESTORE();
 		return;
 	}
 	callout_deactivate(&tp->t_timers->tt_persist);
-	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
+	if (inp->inp_flags & INP_DROPPED) {
 		INP_WUNLOCK(inp);
 		CURVNET_RESTORE();
 		return;
 	}
 	KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
 		("%s: tp %p tcpcb can't be stopped here", __func__, tp));
 	/*
 	 * Persistence timer into zero window.
 	 * Force a byte to be output, if possible.
 	 */
 	TCPSTAT_INC(tcps_persisttimeo);
 	/*
 	 * Hack: if the peer is dead/unreachable, we do not
 	 * time out if the window is closed.  After a full
 	 * backoff, drop the connection if the idle time
 	 * (no responses to probes) reaches the maximum
 	 * backoff that we would use if retransmitting.
 	 * Also, drop the connection if we haven't been making
 	 * progress.
 	 */
 	progdrop = tcp_maxunacktime_check(tp);
 	if (progdrop || (tp->t_rxtshift == TCP_MAXRXTSHIFT &&
 	    (ticks - tp->t_rcvtime >= tcp_maxpersistidle ||
 	     ticks - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff))) {
 		/* tcp_maxunacktime_check() already counted a progress drop. */
 		if (!progdrop)
 			TCPSTAT_INC(tcps_persistdrop);
 		tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX);
 		tcp_timer_drop(tp);
 		CURVNET_RESTORE();
 		return;
 	}
 	/*
 	 * If the user has closed the socket then drop a persisting
 	 * connection after a much reduced timeout.
 	 */
 	if (tp->t_state > TCPS_CLOSE_WAIT &&
 	    (ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) {
 		TCPSTAT_INC(tcps_persistdrop);
 		tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX);
 		tcp_timer_drop(tp);
 		CURVNET_RESTORE();
 		return;
 	}
 	tcp_setpersist(tp);
 	tp->t_flags |= TF_FORCEDATA;
 	NET_EPOCH_ENTER(et);
 	outrv = tcp_output_nodrop(tp);
 	tp->t_flags &= ~TF_FORCEDATA;
 
 #ifdef TCPDEBUG
 	if (tp != NULL && tp->t_inpcb->inp_socket->so_options & SO_DEBUG)
 		tcp_trace(TA_USER, ostate, tp, NULL, NULL, PRU_SLOWTIMO);
 #endif
 	TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
 	/*
 	 * The inpcb lock is not released explicitly past this point; the
 	 * output result is handed to tcp_unlock_or_drop().
 	 */
 	(void) tcp_unlock_or_drop(tp, outrv);
 	NET_EPOCH_EXIT(et);
 	CURVNET_RESTORE();
 }
 
 /*
  * Retransmit callout handler; xtp is the tcpcb.
  *
  * Bumps t_rxtshift and drops the connection once it exceeds
  * TCP_MAXRXTSHIFT or tcp_maxunacktime_check() fires.  Otherwise it saves
  * the congestion state on the first retransmit, recomputes t_rxtcur from
  * tcp_backoff[], runs path-MTU black hole detection for ESTABLISHED and
  * FIN_WAIT_1 connections, rewinds snd_nxt to snd_una, signals CC_RTO to
  * the congestion control module and calls tcp_output_nodrop().
  */
 void
 tcp_timer_rexmt(void * xtp)
 {
 	struct tcpcb *tp = xtp;
 	CURVNET_SET(tp->t_vnet);
 	int rexmt, outrv;
 	struct inpcb *inp;
 	struct epoch_tracker et;
 	bool isipv6;
 #ifdef TCPDEBUG
 	int ostate;
 
 	ostate = tp->t_state;
 #endif
 	inp = tp->t_inpcb;
 	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
 	INP_WLOCK(inp);
 	/* Rescheduled or stopped while we waited for the lock: nothing to do. */
 	if (callout_pending(&tp->t_timers->tt_rexmt) ||
 	    !callout_active(&tp->t_timers->tt_rexmt)) {
 		INP_WUNLOCK(inp);
 		CURVNET_RESTORE();
 		return;
 	}
 	callout_deactivate(&tp->t_timers->tt_rexmt);
-	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
+	if (inp->inp_flags & INP_DROPPED) {
 		INP_WUNLOCK(inp);
 		CURVNET_RESTORE();
 		return;
 	}
 	KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
 		("%s: tp %p tcpcb can't be stopped here", __func__, tp));
 	tcp_free_sackholes(tp);
 	TCP_LOG_EVENT(tp, NULL, NULL, NULL, TCP_LOG_RTO, 0, 0, NULL, false);
 	if (tp->t_fb->tfb_tcp_rexmit_tmr) {
 		/* The stack has a timer action too. */
 		(*tp->t_fb->tfb_tcp_rexmit_tmr)(tp);
 	}
 	/*
 	 * Retransmission timer went off.  Message has not
 	 * been acked within retransmit interval.  Back off
 	 * to a longer retransmit interval and retransmit one segment.
 	 *
 	 * If we've either exceeded the maximum number of retransmissions,
 	 * or we've gone long enough without making progress, then drop
 	 * the session.
 	 */
 	if (++tp->t_rxtshift > TCP_MAXRXTSHIFT || tcp_maxunacktime_check(tp)) {
 		if (tp->t_rxtshift > TCP_MAXRXTSHIFT)
 			TCPSTAT_INC(tcps_timeoutdrop);
 		/* Clamp the shift back into the range of tcp_backoff[]. */
 		tp->t_rxtshift = TCP_MAXRXTSHIFT;
 		tcp_log_end_status(tp, TCP_EI_STATUS_RETRAN);
 		tcp_timer_drop(tp);
 		CURVNET_RESTORE();
 		return;
 	}
 	if (tp->t_state == TCPS_SYN_SENT) {
 		/*
 		 * If the SYN was retransmitted, indicate CWND to be
 		 * limited to 1 segment in cc_conn_init().
 		 */
 		tp->snd_cwnd = 1;
 	} else if (tp->t_rxtshift == 1) {
 		/*
 		 * first retransmit; record ssthresh and cwnd so they can
 		 * be recovered if this turns out to be a "bad" retransmit.
 		 * A retransmit is considered "bad" if an ACK for this
 		 * segment is received within RTT/2 interval; the assumption
 		 * here is that the ACK was already in flight.  See
 		 * "On Estimating End-to-End Network Path Properties" by
 		 * Allman and Paxson for more details.
 		 */
 		tp->snd_cwnd_prev = tp->snd_cwnd;
 		tp->snd_ssthresh_prev = tp->snd_ssthresh;
 		tp->snd_recover_prev = tp->snd_recover;
 		if (IN_FASTRECOVERY(tp->t_flags))
 			tp->t_flags |= TF_WASFRECOVERY;
 		else
 			tp->t_flags &= ~TF_WASFRECOVERY;
 		if (IN_CONGRECOVERY(tp->t_flags))
 			tp->t_flags |= TF_WASCRECOVERY;
 		else
 			tp->t_flags &= ~TF_WASCRECOVERY;
 		if ((tp->t_flags & TF_RCVD_TSTMP) == 0)
 			tp->t_badrxtwin = ticks + (tp->t_srtt >> (TCP_RTT_SHIFT + 1));
 		/* In the event that we've negotiated timestamps
 		 * badrxtwin will be set to the value that we set
 		 * the retransmitted packet's to_tsval to by tcp_output
 		 */
 		tp->t_flags |= TF_PREVVALID;
 	} else
 		tp->t_flags &= ~TF_PREVVALID;
 	TCPSTAT_INC(tcps_rexmttimeo);
 	/*
 	 * Handshake states scale the configured initial timeout; all other
 	 * states scale TCP_REXMTVAL().  The result is clamped to
 	 * [t_rttmin, TCPTV_REXMTMAX].
 	 */
 	if ((tp->t_state == TCPS_SYN_SENT) ||
 	    (tp->t_state == TCPS_SYN_RECEIVED))
 		rexmt = tcp_rexmit_initial * tcp_backoff[tp->t_rxtshift];
 	else
 		rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift];
 	TCPT_RANGESET(tp->t_rxtcur, rexmt,
 		      tp->t_rttmin, TCPTV_REXMTMAX);
 
 	/*
 	 * We enter the path for PLMTUD if connection is established or, if
 	 * connection is FIN_WAIT_1 status, reason for the last is that if
 	 * amount of data we send is very small, we could send it in couple of
 	 * packets and process straight to FIN. In that case we won't catch
 	 * ESTABLISHED state.
 	 */
 #ifdef INET6
 	isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) ? true : false;
 #else
 	isipv6 = false;
 #endif
 	/* Detection mode: 1 = always, 2 = IPv4 only, 3 = IPv6 only. */
 	if (((V_tcp_pmtud_blackhole_detect == 1) ||
 	    (V_tcp_pmtud_blackhole_detect == 2 && !isipv6) ||
 	    (V_tcp_pmtud_blackhole_detect == 3 && isipv6)) &&
 	    ((tp->t_state == TCPS_ESTABLISHED) ||
 	    (tp->t_state == TCPS_FIN_WAIT_1))) {
 		if (tp->t_rxtshift == 1) {
 			/*
 			 * We enter blackhole detection after the first
 			 * unsuccessful timer based retransmission.
 			 * Then we reduce up to two times the MSS, each
 			 * candidate giving two tries of retransmissions.
 			 * But we give a candidate only two tries, if it
 			 * actually reduces the MSS.
 			 */
 			tp->t_blackhole_enter = 2;
 			tp->t_blackhole_exit = tp->t_blackhole_enter;
 			if (isipv6) {
 #ifdef INET6
 				if (tp->t_maxseg > V_tcp_v6pmtud_blackhole_mss)
 					tp->t_blackhole_exit += 2;
 				if (tp->t_maxseg > V_tcp_v6mssdflt &&
 				    V_tcp_v6pmtud_blackhole_mss > V_tcp_v6mssdflt)
 					tp->t_blackhole_exit += 2;
 #endif
 			} else {
 #ifdef INET
 				if (tp->t_maxseg > V_tcp_pmtud_blackhole_mss)
 					tp->t_blackhole_exit += 2;
 				if (tp->t_maxseg > V_tcp_mssdflt &&
 				    V_tcp_pmtud_blackhole_mss > V_tcp_mssdflt)
 					tp->t_blackhole_exit += 2;
 #endif
 			}
 		}
 		if (((tp->t_flags2 & (TF2_PLPMTU_PMTUD|TF2_PLPMTU_MAXSEGSNT)) ==
 		    (TF2_PLPMTU_PMTUD|TF2_PLPMTU_MAXSEGSNT)) &&
 		    (tp->t_rxtshift >= tp->t_blackhole_enter &&
 		    tp->t_rxtshift < tp->t_blackhole_exit &&
 		    (tp->t_rxtshift - tp->t_blackhole_enter) % 2 == 0)) {
 			/*
 			 * Enter Path MTU Black-hole Detection mechanism:
 			 * - Disable Path MTU Discovery (IP "DF" bit).
 			 * - Reduce MTU to lower value than what we
 			 *   negotiated with peer.
 			 */
 			if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) == 0) {
 				/* Record that we may have found a black hole. */
 				tp->t_flags2 |= TF2_PLPMTU_BLACKHOLE;
 				/* Keep track of previous MSS. */
 				tp->t_pmtud_saved_maxseg = tp->t_maxseg;
 			}
 
 			/*
 			 * Reduce the MSS to blackhole value or to the default
 			 * in an attempt to retransmit.
 			 */
 #ifdef INET6
 			if (isipv6 &&
 			    tp->t_maxseg > V_tcp_v6pmtud_blackhole_mss &&
 			    V_tcp_v6pmtud_blackhole_mss > V_tcp_v6mssdflt) {
 				/* Use the sysctl tuneable blackhole MSS. */
 				tp->t_maxseg = V_tcp_v6pmtud_blackhole_mss;
 				TCPSTAT_INC(tcps_pmtud_blackhole_activated);
 			} else if (isipv6) {
 				/* Use the default MSS. */
 				tp->t_maxseg = V_tcp_v6mssdflt;
 				/*
 				 * Disable Path MTU Discovery when we switch to
 				 * minmss.
 				 */
 				tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
 				TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss);
 			}
 #endif
 			/*
 			 * When both address families are compiled in, this
 			 * "else" chains the IPv4 branch below onto the IPv6
 			 * if/else-if above.
 			 */
 #if defined(INET6) && defined(INET)
 			else
 #endif
 #ifdef INET
 			if (tp->t_maxseg > V_tcp_pmtud_blackhole_mss &&
 			    V_tcp_pmtud_blackhole_mss > V_tcp_mssdflt) {
 				/* Use the sysctl tuneable blackhole MSS. */
 				tp->t_maxseg = V_tcp_pmtud_blackhole_mss;
 				TCPSTAT_INC(tcps_pmtud_blackhole_activated);
 			} else {
 				/* Use the default MSS. */
 				tp->t_maxseg = V_tcp_mssdflt;
 				/*
 				 * Disable Path MTU Discovery when we switch to
 				 * minmss.
 				 */
 				tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
 				TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss);
 			}
 #endif
 			/*
 			 * Reset the slow-start flight size
 			 * as it may depend on the new MSS.
 			 */
 			if (CC_ALGO(tp)->conn_init != NULL)
 				CC_ALGO(tp)->conn_init(tp->ccv);
 		} else {
 			/*
 			 * If further retransmissions are still unsuccessful
 			 * with a lowered MTU, maybe this isn't a blackhole and
 			 * we restore the previous MSS and blackhole detection
 			 * flags.
 			 */
 			if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) &&
 			    (tp->t_rxtshift >= tp->t_blackhole_exit)) {
 				tp->t_flags2 |= TF2_PLPMTU_PMTUD;
 				tp->t_flags2 &= ~TF2_PLPMTU_BLACKHOLE;
 				tp->t_maxseg = tp->t_pmtud_saved_maxseg;
 				TCPSTAT_INC(tcps_pmtud_blackhole_failed);
 				/*
 				 * Reset the slow-start flight size as it
 				 * may depend on the new MSS.
 				 */
 				if (CC_ALGO(tp)->conn_init != NULL)
 					CC_ALGO(tp)->conn_init(tp->ccv);
 			}
 		}
 	}
 
 	/*
 	 * Disable RFC1323 and SACK if we haven't got any response to
 	 * our third SYN to work-around some broken terminal servers
 	 * (most of which have hopefully been retired) that have bad VJ
 	 * header compression code which trashes TCP segments containing
 	 * unknown-to-them TCP options.
 	 */
 	if (tcp_rexmit_drop_options && (tp->t_state == TCPS_SYN_SENT) &&
 	    (tp->t_rxtshift == 3))
 		tp->t_flags &= ~(TF_REQ_SCALE|TF_REQ_TSTMP|TF_SACK_PERMIT);
 	/*
 	 * If we backed off this far, notify the L3 protocol that we're having
 	 * connection problems.
 	 */
 	if (tp->t_rxtshift > TCP_RTT_INVALIDATE) {
 #ifdef INET6
 		if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0)
 			in6_losing(tp->t_inpcb);
 		else
 #endif
 			in_losing(tp->t_inpcb);
 	}
 	/*
 	 * Rewind the send point to the oldest unacknowledged sequence
 	 * number and record snd_max in snd_recover.
 	 */
 	tp->snd_nxt = tp->snd_una;
 	tp->snd_recover = tp->snd_max;
 	/*
 	 * Force a segment to be sent.
 	 */
 	tp->t_flags |= TF_ACKNOW;
 	/*
 	 * If timing a segment in this window, stop the timer.
 	 */
 	tp->t_rtttime = 0;
 
 	cc_cong_signal(tp, NULL, CC_RTO);
 	NET_EPOCH_ENTER(et);
 	outrv = tcp_output_nodrop(tp);
 #ifdef TCPDEBUG
 	if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
 		tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
 			  PRU_SLOWTIMO);
 #endif
 	TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
 	/*
 	 * The inpcb lock is not released explicitly past this point; the
 	 * output result is handed to tcp_unlock_or_drop().
 	 */
 	(void) tcp_unlock_or_drop(tp, outrv);
 	NET_EPOCH_EXIT(et);
 	CURVNET_RESTORE();
 }
 
 /*
  * Arm or disarm one of the TCP timers.
  *
  * A delta of 0 stops the callout; any other value (re)schedules it that
  * many ticks ahead on the CPU chosen by inp_to_cpuid().  Nothing is done
  * for TOE connections (when TCP_OFFLOAD is compiled in) or once the
  * timers have been marked TT_STOPPED.  Timer types other than the five
  * base ones are passed to the stack's tfb_tcp_timer_activate hook if it
  * exists; otherwise the function panics.
  */
 void
 tcp_timer_activate(struct tcpcb *tp, uint32_t timer_type, u_int delta)
 {
 	struct callout *co;
 	callout_func_t *handler;
 	struct inpcb *inp = tp->t_inpcb;
 	int target_cpu = inp_to_cpuid(inp);
 
 #ifdef TCP_OFFLOAD
 	if (tp->t_flags & TF_TOE)
 		return;
 #endif
 
 	if (tp->t_timers->tt_flags & TT_STOPPED)
 		return;
 
 	switch (timer_type) {
 	case TT_DELACK:
 		co = &tp->t_timers->tt_delack;
 		handler = tcp_timer_delack;
 		break;
 	case TT_REXMT:
 		co = &tp->t_timers->tt_rexmt;
 		handler = tcp_timer_rexmt;
 		break;
 	case TT_PERSIST:
 		co = &tp->t_timers->tt_persist;
 		handler = tcp_timer_persist;
 		break;
 	case TT_KEEP:
 		co = &tp->t_timers->tt_keep;
 		handler = tcp_timer_keep;
 		break;
 	case TT_2MSL:
 		co = &tp->t_timers->tt_2msl;
 		handler = tcp_timer_2msl;
 		break;
 	default:
 		/* Not a base timer: let the TCP stack handle it, if it can. */
 		if (tp->t_fb->tfb_tcp_timer_activate) {
 			tp->t_fb->tfb_tcp_timer_activate(tp, timer_type, delta);
 			return;
 		}
 		panic("tp %p bad timer_type %#x", tp, timer_type);
 	}
 
 	if (delta != 0)
 		callout_reset_on(co, delta, handler, tp, target_cpu);
 	else
 		callout_stop(co);
 }
 
 /*
  * Report whether one of the TCP timers is active.
  *
  * For the five base timers this is callout_active() on the matching
  * callout.  Other timer types are passed to the stack's
  * tfb_tcp_timer_active hook if it exists; otherwise the function panics.
  */
 int
 tcp_timer_active(struct tcpcb *tp, uint32_t timer_type)
 {
 	struct callout *co;
 
 	switch (timer_type) {
 	case TT_DELACK:
 		co = &tp->t_timers->tt_delack;
 		break;
 	case TT_REXMT:
 		co = &tp->t_timers->tt_rexmt;
 		break;
 	case TT_PERSIST:
 		co = &tp->t_timers->tt_persist;
 		break;
 	case TT_KEEP:
 		co = &tp->t_timers->tt_keep;
 		break;
 	case TT_2MSL:
 		co = &tp->t_timers->tt_2msl;
 		break;
 	default:
 		if (tp->t_fb->tfb_tcp_timer_active)
 			return (tp->t_fb->tfb_tcp_timer_active(tp, timer_type));
 		panic("tp %p bad timer_type %#x", tp, timer_type);
 	}
 	return (callout_active(co));
 }
 
 /*
  * Suspend one of the five base TCP timers.
  *
  * The matching TT_*_SUS bit is set in tt_flags and the callout is then
  * stopped.  The stated purpose of the flag is to keep a timer that races
  * with the stop (keep and persist re-arm themselves) from starting up
  * again; the code that honours the flag is not part of this function.
  *
  * Returns the result of callout_stop().  An unknown timer_type panics.
  */
 int
 tcp_timer_suspend(struct tcpcb *tp, uint32_t timer_type)
 {
 	struct callout *co;
 	uint32_t sus_bit;
 
 	switch (timer_type) {
 	case TT_DELACK:
 		co = &tp->t_timers->tt_delack;
 		sus_bit = TT_DELACK_SUS;
 		break;
 	case TT_REXMT:
 		co = &tp->t_timers->tt_rexmt;
 		sus_bit = TT_REXMT_SUS;
 		break;
 	case TT_PERSIST:
 		co = &tp->t_timers->tt_persist;
 		sus_bit = TT_PERSIST_SUS;
 		break;
 	case TT_KEEP:
 		co = &tp->t_timers->tt_keep;
 		sus_bit = TT_KEEP_SUS;
 		break;
 	case TT_2MSL:
 		co = &tp->t_timers->tt_2msl;
 		sus_bit = TT_2MSL_SUS;
 		break;
 	default:
 		panic("tp:%p bad timer_type 0x%x", tp, timer_type);
 	}
 	tp->t_timers->tt_flags |= sus_bit;
 	return (callout_stop(co));
 }
 
 /*
  * Lift the suspension of one of the five base TCP timers and, where the
  * connection state calls for it, start the timer again.
  *
  * Nothing happens unless the matching TT_*_SUS bit is set in tt_flags.
  * When it is, the bit is cleared and:
  *   TT_DELACK   - if TF_DELACK is set, it is cleared and the delayed-ACK
  *                 timer is armed with tcp_delacktime;
  *   TT_REXMT    - the retransmit timer is armed with t_rxtcur if data is
  *                 outstanding, the persist timer is not active and the
  *                 send window is non-zero;
  *   TT_PERSIST  - with a zero send window, t_rxtshift is reset and
  *                 tcp_setpersist() is called;
  *   TT_KEEP     - the keepalive timer is armed with TP_KEEPIDLE() or
  *                 TP_KEEPINIT(), depending on the connection state;
  *   TT_2MSL     - in FIN_WAIT_2 with no socket, or with a socket that
  *                 cannot receive any more, the 2MSL timer is armed.
  *
  * An unknown timer_type panics.
  */
 void
 tcp_timers_unsuspend(struct tcpcb *tp, uint32_t timer_type)
 {
 	switch (timer_type) {
 		case TT_DELACK:
 			if (tp->t_timers->tt_flags & TT_DELACK_SUS) {
 				tp->t_timers->tt_flags &= ~TT_DELACK_SUS;
 				if (tp->t_flags & TF_DELACK) {
 					/* Delayed ack timer should be up activate a timer */
 					tp->t_flags &= ~TF_DELACK;
 					tcp_timer_activate(tp, TT_DELACK,
 					    tcp_delacktime);
 				}
 			}
 			break;
 		case TT_REXMT:
 			if (tp->t_timers->tt_flags & TT_REXMT_SUS) {
 				tp->t_timers->tt_flags &= ~TT_REXMT_SUS;
 				if (SEQ_GT(tp->snd_max, tp->snd_una) &&
 				    (tcp_timer_active((tp), TT_PERSIST) == 0) &&
 				    tp->snd_wnd) {
 					/* We have outstanding data activate a timer */
 					tcp_timer_activate(tp, TT_REXMT,
 					    tp->t_rxtcur);
 				}
 			}
 			break;
 		case TT_PERSIST:
 			if (tp->t_timers->tt_flags & TT_PERSIST_SUS) {
 				tp->t_timers->tt_flags &= ~TT_PERSIST_SUS;
 				if (tp->snd_wnd == 0) {
 					/* Activate the persists timer */
 					tp->t_rxtshift = 0;
 					tcp_setpersist(tp);
 				}
 			}
 			break;
 		case TT_KEEP:
 			if (tp->t_timers->tt_flags & TT_KEEP_SUS) {
 				tp->t_timers->tt_flags &= ~TT_KEEP_SUS;
 				tcp_timer_activate(tp, TT_KEEP,
 					    TCPS_HAVEESTABLISHED(tp->t_state) ?
 					    TP_KEEPIDLE(tp) : TP_KEEPINIT(tp));
 			}
 			break;
 		case TT_2MSL:
 			/*
 			 * Test the bit with '&'.  A compound '&=' here would
 			 * wipe every other bit in tt_flags (TT_STOPPED and
 			 * the remaining suspend bits included) as a side
 			 * effect of the check.
 			 */
 			if (tp->t_timers->tt_flags & TT_2MSL_SUS) {
 				tp->t_timers->tt_flags &= ~TT_2MSL_SUS;
 				if ((tp->t_state == TCPS_FIN_WAIT_2) &&
 				    ((tp->t_inpcb->inp_socket == NULL) ||
 				     (tp->t_inpcb->inp_socket->so_rcv.sb_state & SBS_CANTRCVMORE))) {
 					/* Start the 2MSL timer */
 					tcp_timer_activate(tp, TT_2MSL,
 					    (tcp_fast_finwait2_recycle) ?
 					    tcp_finwait2_timeout : TP_MAXIDLE(tp));
 				}
 			}
 			break;
 		default:
 			panic("tp:%p bad timer_type 0x%x", tp, timer_type);
 	}
 }
 
 /*
  * Drain-completion callback handed to callout_async_drain() by
  * tcp_timer_stop(); ptp is the tcpcb.
  *
  * Under the inpcb write lock and inside the network epoch, decrements
  * tt_draincnt.  Once the count is no longer positive tcp_freecb() is
  * called.  The lock is released here while drains are still outstanding
  * or when tcp_freecb() returns false.
  */
 static void
 tcp_timer_discard(void *ptp)
 {
 	struct tcpcb *tp = ptp;
 	struct inpcb *inp;
 	struct epoch_tracker et;
 
 	CURVNET_SET(tp->t_vnet);
 	NET_EPOCH_ENTER(et);
 	inp = tp->t_inpcb;
 	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL",
 		__func__, tp));
 	INP_WLOCK(inp);
 	KASSERT((tp->t_timers->tt_flags & TT_STOPPED) != 0,
 		("%s: tcpcb has to be stopped here", __func__));
 	tp->t_timers->tt_draincnt--;
 	if (tp->t_timers->tt_draincnt > 0 || !tcp_freecb(tp))
 		INP_WUNLOCK(inp);
 	NET_EPOCH_EXIT(et);
 	CURVNET_RESTORE();
 }
 
 /*
  * Permanently stop one of the TCP timers.
  *
  * TT_STOPPED is set in tt_flags first, for every timer type.  For the
  * five base timers the callout is then drained asynchronously with
  * tcp_timer_discard() as the completion callback.  Other timer types are
  * passed to the stack's tfb_tcp_timer_stop hook if it exists; otherwise
  * the function panics.
  */
 void
 tcp_timer_stop(struct tcpcb *tp, uint32_t timer_type)
 {
 	struct callout *co;
 
 	tp->t_timers->tt_flags |= TT_STOPPED;
 	switch (timer_type) {
 	case TT_DELACK:
 		co = &tp->t_timers->tt_delack;
 		break;
 	case TT_REXMT:
 		co = &tp->t_timers->tt_rexmt;
 		break;
 	case TT_PERSIST:
 		co = &tp->t_timers->tt_persist;
 		break;
 	case TT_KEEP:
 		co = &tp->t_timers->tt_keep;
 		break;
 	case TT_2MSL:
 		co = &tp->t_timers->tt_2msl;
 		break;
 	default:
 		if (tp->t_fb->tfb_tcp_timer_stop) {
 			/*
 			 * XXXrrs we need to look at this with the
 			 * stop case below (flags).
 			 */
 			tp->t_fb->tfb_tcp_timer_stop(tp, timer_type);
 			return;
 		}
 		panic("tp %p bad timer_type %#x", tp, timer_type);
 	}
 
 	/*
 	 * A zero return means the callout could not be stopped right away.
 	 * Count it in tt_draincnt so that tcp_timer_discard(), which runs
 	 * once per drained callout, defers the actual tcpcb deletion to the
 	 * last one.
 	 */
 	if (callout_async_drain(co, tcp_timer_discard) == 0)
 		tp->t_timers->tt_draincnt++;
 }
diff --git a/sys/netinet/tcp_usrreq.c b/sys/netinet/tcp_usrreq.c
index 43acc0ad1719..d069c804bcc4 100644
--- a/sys/netinet/tcp_usrreq.c
+++ b/sys/netinet/tcp_usrreq.c
@@ -1,3190 +1,3178 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1982, 1986, 1988, 1993
  *	The Regents of the University of California.
  * Copyright (c) 2006-2007 Robert N. M. Watson
  * Copyright (c) 2010-2011 Juniper Networks, Inc.
  * All rights reserved.
  *
  * Portions of this software were developed by Robert N. M. Watson under
  * contract to Juniper Networks, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	From: @(#)tcp_usrreq.c	8.2 (Berkeley) 1/3/94
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_ddb.h"
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_ipsec.h"
 #include "opt_kern_tls.h"
 #include "opt_tcpdebug.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/arb.h>
 #include <sys/limits.h>
 #include <sys/malloc.h>
 #include <sys/refcount.h>
 #include <sys/kernel.h>
 #include <sys/ktls.h>
 #include <sys/qmath.h>
 #include <sys/sysctl.h>
 #include <sys/mbuf.h>
 #ifdef INET6
 #include <sys/domain.h>
 #endif /* INET6 */
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/protosw.h>
 #include <sys/proc.h>
 #include <sys/jail.h>
 #include <sys/stats.h>
 
 #ifdef DDB
 #include <ddb/ddb.h>
 #endif
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/route.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_kdtrace.h>
 #include <netinet/in_pcb.h>
 #include <netinet/in_systm.h>
 #include <netinet/in_var.h>
 #include <netinet/ip.h>
 #include <netinet/ip_var.h>
 #ifdef INET6
 #include <netinet/ip6.h>
 #include <netinet6/in6_pcb.h>
 #include <netinet6/ip6_var.h>
 #include <netinet6/scope6_var.h>
 #endif
 #include <netinet/tcp.h>
 #include <netinet/tcp_fsm.h>
 #include <netinet/tcp_seq.h>
 #include <netinet/tcp_timer.h>
 #include <netinet/tcp_var.h>
 #include <netinet/tcp_log_buf.h>
 #include <netinet/tcpip.h>
 #include <netinet/cc/cc.h>
 #include <netinet/tcp_fastopen.h>
 #include <netinet/tcp_hpts.h>
 #ifdef TCPPCAP
 #include <netinet/tcp_pcap.h>
 #endif
 #include <netinet/tcp_debug.h>
 #ifdef TCP_OFFLOAD
 #include <netinet/tcp_offload.h>
 #endif
 #include <netipsec/ipsec_support.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/pmap.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_map.h>
 #include <vm/vm_page.h>
 
 /*
  * TCP protocol interface to socket abstraction.
  */
 #ifdef INET
 static int	tcp_connect(struct tcpcb *, struct sockaddr *,
 		    struct thread *td);
 #endif /* INET */
 #ifdef INET6
 static int	tcp6_connect(struct tcpcb *, struct sockaddr *,
 		    struct thread *td);
 #endif /* INET6 */
 static void	tcp_disconnect(struct tcpcb *);
 static void	tcp_usrclosed(struct tcpcb *);
 static void	tcp_fill_info(struct tcpcb *, struct tcp_info *);
 
 static int	tcp_pru_options_support(struct tcpcb *tp, int flags);
 
 /*
  * Debug tracing helpers for the pru_*() handlers in this file.  They rely
  * on locals named 'tp' and 'so' being in scope where they are used.
  *
  * TCPDEBUG0    declares the local 'ostate'.
  * TCPDEBUG1()  records the current t_state of 'tp' (0 when tp is NULL).
  * TCPDEBUG2()  calls tcp_trace() when 'tp' is set and SO_DEBUG is enabled
  *              on the socket.
  *
  * All three expand to nothing when TCPDEBUG is not defined.
  */
 #ifdef TCPDEBUG
 #define	TCPDEBUG0	int ostate = 0
 #define	TCPDEBUG1()	ostate = tp ? tp->t_state : 0
 #define	TCPDEBUG2(req)	if (tp && (so->so_options & SO_DEBUG)) \
 				tcp_trace(TA_USER, ostate, tp, 0, 0, req)
 #else
 #define	TCPDEBUG0
 #define	TCPDEBUG1()
 #define	TCPDEBUG2(req)
 #endif
 
 /*
  * tcp_require_unique_port requires a globally-unique source port for each
  * outgoing connection.  The default (0) is to require only the 4-tuple to
  * be unique.  The variable is per-VNET and exported read-write as the
  * require_unique_port sysctl under _net_inet_tcp.
  */
 VNET_DEFINE(int, tcp_require_unique_port) = 0;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, require_unique_port,
     CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_require_unique_port), 0,
     "Require globally-unique ephemeral port for outgoing connections");
 #define	V_tcp_require_unique_port	VNET(tcp_require_unique_port)
 
 /*
  * TCP attaches to socket via pru_attach(), reserving space,
  * and an internet control block.
  *
  * 'proto' and 'td' are not used by this function.
  *
  * Returns 0 on success, the error from soreserve() or in_pcballoc(), or
  * ENOBUFS when no tcpcb could be allocated.
  */
 static int
 tcp_usr_attach(struct socket *so, int proto, struct thread *td)
 {
 	struct inpcb *inp;
 	struct tcpcb *tp = NULL;
 	int error;
 	TCPDEBUG0;
 
 	inp = sotoinpcb(so);
 	KASSERT(inp == NULL, ("tcp_usr_attach: inp != NULL"));
 	TCPDEBUG1();
 
 	/* Reserve send/receive buffer space using the per-VNET defaults. */
 	error = soreserve(so, V_tcp_sendspace, V_tcp_recvspace);
 	if (error)
 		goto out;
 
 	so->so_rcv.sb_flags |= SB_AUTOSIZE;
 	so->so_snd.sb_flags |= SB_AUTOSIZE;
 	/*
 	 * NOTE(review): presumably in_pcballoc() returns with the new inpcb
 	 * write-locked, which the INP_WUNLOCK() below releases -- confirm.
 	 */
 	error = in_pcballoc(so, &V_tcbinfo);
 	if (error)
 		goto out;
 	inp = sotoinpcb(so);
 	tp = tcp_newtcpcb(inp);
 	if (tp == NULL) {
 		/* Undo the inpcb allocation; there is no tcpcb to go with it. */
 		error = ENOBUFS;
 		in_pcbdetach(inp);
 		in_pcbfree(inp);
 		goto out;
 	}
 	tp->t_state = TCPS_CLOSED;
 	INP_WUNLOCK(inp);
 	TCPSTATES_INC(TCPS_CLOSED);
 out:
 	TCPDEBUG2(PRU_ATTACH);
 	TCP_PROBE2(debug__user, tp, PRU_ATTACH);
 	return (error);
 }
 
 /*
  * tcp_usr_detach is called when the socket layer loses its final reference
  * to the socket, be it a file descriptor reference, a reference from TCP,
  * etc.  At this point, there is only one case in which we will keep around
  * inpcb state: time wait.
  *
  * The inpcb must either be dropped already or still be in a state before
  * TCPS_SYN_SENT; this is asserted below.
  */
 static void
 tcp_usr_detach(struct socket *so)
 {
 	struct inpcb *inp;
 	struct tcpcb *tp;
 
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("%s: inp == NULL", __func__));
 	INP_WLOCK(inp);
 	KASSERT(so->so_pcb == inp && inp->inp_socket == so,
 		("%s: socket %p inp %p mismatch", __func__, so, inp));
 
 	tp = intotcpcb(inp);
 
 	KASSERT(inp->inp_flags & INP_DROPPED ||
 	    tp->t_state < TCPS_SYN_SENT,
 	    ("%s: inp %p not dropped or embryonic", __func__, inp));
 
 	/*
 	 * Tear down the tcpcb, then the inpcb.  There is no INP_WUNLOCK()
 	 * here.  NOTE(review): presumably in_pcbfree() consumes the write
 	 * lock taken above -- confirm.
 	 */
 	tcp_discardcb(tp);
 	in_pcbdetach(inp);
 	in_pcbfree(inp);
 }
 
 #ifdef INET
 /*
  * Give the socket an address.
  *
  * Accepts an AF_INET sockaddr_in; for compatibility an AF_UNSPEC address
  * carrying INADDR_ANY is rewritten in place to AF_INET.  Multicast
  * addresses are rejected.
  *
  * Returns 0 on success, EAFNOSUPPORT for an unsupported family or a
  * multicast address, EINVAL for a wrong sa_len or a dropped inpcb, or
  * the error from in_pcbbind().
  */
 static int
 tcp_usr_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
 {
 	int error = 0;
 	struct inpcb *inp;
 	/*
 	 * NOTE(review): tp is declared only under KDTRACE_HOOKS, yet the
 	 * TCPDEBUG1()/TCPDEBUG2() macros reference it when TCPDEBUG is
 	 * defined -- confirm that combination is never built.
 	 */
 #ifdef KDTRACE_HOOKS
 	struct tcpcb *tp = NULL;
 #endif
 	struct sockaddr_in *sinp;
 
 	sinp = (struct sockaddr_in *)nam;
 	if (nam->sa_family != AF_INET) {
 		/*
 		 * Preserve compatibility with old programs.
 		 */
 		if (nam->sa_family != AF_UNSPEC ||
 		    nam->sa_len < offsetof(struct sockaddr_in, sin_zero) ||
 		    sinp->sin_addr.s_addr != INADDR_ANY)
 			return (EAFNOSUPPORT);
 		nam->sa_family = AF_INET;
 	}
 	if (nam->sa_len != sizeof(*sinp))
 		return (EINVAL);
 
 	/*
 	 * Must check for multicast addresses and disallow binding
 	 * to them.
 	 */
 	if (IN_MULTICAST(ntohl(sinp->sin_addr.s_addr)))
 		return (EAFNOSUPPORT);
 
 	TCPDEBUG0;
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("tcp_usr_bind: inp == NULL"));
 	INP_WLOCK(inp);
 	/* Refuse to bind an inpcb that has already been dropped. */
-	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
+	if (inp->inp_flags & INP_DROPPED) {
 		error = EINVAL;
 		goto out;
 	}
 #ifdef KDTRACE_HOOKS
 	tp = intotcpcb(inp);
 #endif
 	TCPDEBUG1();
 	/* The bind runs under both the inpcb and the pcbinfo hash lock. */
 	INP_HASH_WLOCK(&V_tcbinfo);
 	error = in_pcbbind(inp, nam, td->td_ucred);
 	INP_HASH_WUNLOCK(&V_tcbinfo);
 out:
 	TCPDEBUG2(PRU_BIND);
 	TCP_PROBE2(debug__user, tp, PRU_BIND);
 	INP_WUNLOCK(inp);
 
 	return (error);
 }
 #endif /* INET */
 
 #ifdef INET6
 /*
  * IPv6 counterpart of tcp_usr_bind(): give the socket an address.
  *
  * Requires a full-size AF_INET6 sockaddr_in6 and rejects multicast
  * addresses.  When INET is compiled in and the socket is not
  * IPV6_V6ONLY, the unspecified address additionally enables IPv4 on the
  * inpcb, and a v4-mapped address is converted and bound through
  * in_pcbbind() instead of in6_pcbbind().
  *
  * Returns 0 on success or an errno value; on any failure the inp_vflag
  * value saved on entry is restored.
  */
 static int
 tcp6_usr_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
 {
 	int error = 0;
 	struct inpcb *inp;
 	/*
 	 * NOTE(review): tp is declared only under KDTRACE_HOOKS, yet the
 	 * TCPDEBUG1()/TCPDEBUG2() macros reference it when TCPDEBUG is
 	 * defined -- confirm that combination is never built.
 	 */
 #ifdef KDTRACE_HOOKS
 	struct tcpcb *tp = NULL;
 #endif
 	struct sockaddr_in6 *sin6;
 	u_char vflagsav;
 
 	sin6 = (struct sockaddr_in6 *)nam;
 	if (nam->sa_family != AF_INET6)
 		return (EAFNOSUPPORT);
 	if (nam->sa_len != sizeof(*sin6))
 		return (EINVAL);
 
 	/*
 	 * Must check for multicast addresses and disallow binding
 	 * to them.
 	 */
 	if (IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr))
 		return (EAFNOSUPPORT);
 
 	TCPDEBUG0;
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("tcp6_usr_bind: inp == NULL"));
 	INP_WLOCK(inp);
 	/* Remember the address-family flags so a failed bind can undo them. */
 	vflagsav = inp->inp_vflag;
-	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
+	if (inp->inp_flags & INP_DROPPED) {
 		error = EINVAL;
 		goto out;
 	}
 #ifdef KDTRACE_HOOKS
 	tp = intotcpcb(inp);
 #endif
 	TCPDEBUG1();
 	INP_HASH_WLOCK(&V_tcbinfo);
 	/* Start out IPv6-only; the INET block below may widen this. */
 	inp->inp_vflag &= ~INP_IPV4;
 	inp->inp_vflag |= INP_IPV6;
 #ifdef INET
 	if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) {
 		if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr))
 			inp->inp_vflag |= INP_IPV4;
 		else if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
 			struct sockaddr_in sin;
 
 			/* v4-mapped: bind as a plain IPv4 address. */
 			in6_sin6_2_sin(&sin, sin6);
 			if (IN_MULTICAST(ntohl(sin.sin_addr.s_addr))) {
 				error = EAFNOSUPPORT;
 				INP_HASH_WUNLOCK(&V_tcbinfo);
 				goto out;
 			}
 			inp->inp_vflag |= INP_IPV4;
 			inp->inp_vflag &= ~INP_IPV6;
 			error = in_pcbbind(inp, (struct sockaddr *)&sin,
 			    td->td_ucred);
 			INP_HASH_WUNLOCK(&V_tcbinfo);
 			goto out;
 		}
 	}
 #endif
 	error = in6_pcbbind(inp, nam, td->td_ucred);
 	INP_HASH_WUNLOCK(&V_tcbinfo);
 out:
 	if (error != 0)
 		inp->inp_vflag = vflagsav;
 	TCPDEBUG2(PRU_BIND);
 	TCP_PROBE2(debug__user, tp, PRU_BIND);
 	INP_WUNLOCK(inp);
 	return (error);
 }
 #endif /* INET6 */
 
 #ifdef INET
 /*
  * Prepare to accept connections.
  *
  * If no local port is set, one is bound via in_pcbbind(inp, NULL, ...).
  * On success the tcpcb moves to TCPS_LISTEN and solisten_proto() finishes
  * the socket-layer transition; on bind failure solisten_proto_abort() is
  * called instead.
  *
  * Returns 0 on success, EINVAL for a dropped inpcb, or the error from
  * solisten_proto_check() or in_pcbbind().
  */
 static int
 tcp_usr_listen(struct socket *so, int backlog, struct thread *td)
 {
 	int error = 0;
 	struct inpcb *inp;
 	struct tcpcb *tp = NULL;
 
 	TCPDEBUG0;
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("tcp_usr_listen: inp == NULL"));
 	INP_WLOCK(inp);
 	/* A dropped inpcb cannot start listening. */
-	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
+	if (inp->inp_flags & INP_DROPPED) {
 		error = EINVAL;
 		goto out;
 	}
 	tp = intotcpcb(inp);
 	TCPDEBUG1();
 	SOCK_LOCK(so);
 	error = solisten_proto_check(so);
 	if (error != 0) {
 		SOCK_UNLOCK(so);
 		goto out;
 	}
 	if (inp->inp_lport == 0) {
 		INP_HASH_WLOCK(&V_tcbinfo);
 		error = in_pcbbind(inp, NULL, td->td_ucred);
 		INP_HASH_WUNLOCK(&V_tcbinfo);
 	}
 	if (error == 0) {
 		tcp_state_change(tp, TCPS_LISTEN);
 		solisten_proto(so, backlog);
 #ifdef TCP_OFFLOAD
 		if ((so->so_options & SO_NO_OFFLOAD) == 0)
 			tcp_offload_listen_start(tp);
 #endif
 	} else {
 		solisten_proto_abort(so);
 	}
 	SOCK_UNLOCK(so);
 
 	/* Reached on both the success and the bind-failure path. */
 	if (IS_FASTOPEN(tp->t_flags))
 		tp->t_tfo_pending = tcp_fastopen_alloc_counter();
 
 out:
 	TCPDEBUG2(PRU_LISTEN);
 	TCP_PROBE2(debug__user, tp, PRU_LISTEN);
 	INP_WUNLOCK(inp);
 	return (error);
 }
 #endif /* INET */
 
 #ifdef INET6
 /*
  * IPv6 counterpart of tcp_usr_listen(): prepare to accept connections.
  *
  * If no local port is set, the inpcb's IPv4 flag is recomputed from
  * IPV6_V6ONLY and a port is bound via in6_pcbbind(inp, NULL, ...).  On
  * success the tcpcb moves to TCPS_LISTEN; on bind failure the listen is
  * aborted and the saved inp_vflag is restored.
  *
  * Returns 0 on success, EINVAL for a dropped inpcb, or the error from
  * solisten_proto_check() or in6_pcbbind().
  */
 static int
 tcp6_usr_listen(struct socket *so, int backlog, struct thread *td)
 {
 	int error = 0;
 	struct inpcb *inp;
 	struct tcpcb *tp = NULL;
 	u_char vflagsav;
 
 	TCPDEBUG0;
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("tcp6_usr_listen: inp == NULL"));
 	INP_WLOCK(inp);
 	/* A dropped inpcb cannot start listening. */
-	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
+	if (inp->inp_flags & INP_DROPPED) {
 		error = EINVAL;
 		goto out;
 	}
 	/* Saved so a failed implicit bind can restore the flags below. */
 	vflagsav = inp->inp_vflag;
 	tp = intotcpcb(inp);
 	TCPDEBUG1();
 	SOCK_LOCK(so);
 	error = solisten_proto_check(so);
 	if (error != 0) {
 		SOCK_UNLOCK(so);
 		goto out;
 	}
 	INP_HASH_WLOCK(&V_tcbinfo);
 	if (inp->inp_lport == 0) {
 		inp->inp_vflag &= ~INP_IPV4;
 		if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0)
 			inp->inp_vflag |= INP_IPV4;
 		error = in6_pcbbind(inp, NULL, td->td_ucred);
 	}
 	INP_HASH_WUNLOCK(&V_tcbinfo);
 	if (error == 0) {
 		tcp_state_change(tp, TCPS_LISTEN);
 		solisten_proto(so, backlog);
 #ifdef TCP_OFFLOAD
 		if ((so->so_options & SO_NO_OFFLOAD) == 0)
 			tcp_offload_listen_start(tp);
 #endif
 	} else {
 		solisten_proto_abort(so);
 	}
 	SOCK_UNLOCK(so);
 
 	/* Reached on both the success and the bind-failure path. */
 	if (IS_FASTOPEN(tp->t_flags))
 		tp->t_tfo_pending = tcp_fastopen_alloc_counter();
 
 	if (error != 0)
 		inp->inp_vflag = vflagsav;
 
 out:
 	TCPDEBUG2(PRU_LISTEN);
 	TCP_PROBE2(debug__user, tp, PRU_LISTEN);
 	INP_WUNLOCK(inp);
 	return (error);
 }
 #endif /* INET6 */
 
 #ifdef INET
 /*
  * Initiate connection to peer.
  * Create a template for use in transmissions on this connection.
  * Enter SYN_SENT state, and mark socket as connecting.
  * Start keep-alive timer, and seed output sequence space.
  * Send initial segment on connection.
  *
  * Returns 0 on success or an errno value: EAFNOSUPPORT for a non-AF_INET
  * or multicast destination, EINVAL for a wrong sa_len, EACCES for the
  * broadcast address, ECONNREFUSED for a dropped inpcb, EOPNOTSUPP for a
  * listening socket, or the error from prison_remote_ip4(), tcp_connect(),
  * tcp_offload_connect() or tcp_output().
  */
 static int
 tcp_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
 {
 	struct epoch_tracker et;
 	int error = 0;
 	struct inpcb *inp;
 	struct tcpcb *tp = NULL;
 	struct sockaddr_in *sinp;
 
 	sinp = (struct sockaddr_in *)nam;
 	if (nam->sa_family != AF_INET)
 		return (EAFNOSUPPORT);
 	if (nam->sa_len != sizeof (*sinp))
 		return (EINVAL);
 
 	/*
 	 * Must disallow TCP ``connections'' to multicast addresses.
 	 */
 	if (IN_MULTICAST(ntohl(sinp->sin_addr.s_addr)))
 		return (EAFNOSUPPORT);
 	if (ntohl(sinp->sin_addr.s_addr) == INADDR_BROADCAST)
 		return (EACCES);
 	if ((error = prison_remote_ip4(td->td_ucred, &sinp->sin_addr)) != 0)
 		return (error);
 
 	TCPDEBUG0;
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("tcp_usr_connect: inp == NULL"));
 	INP_WLOCK(inp);
-	if (inp->inp_flags & INP_TIMEWAIT) {
-		error = EADDRINUSE;
-		goto out;
-	}
 	if (inp->inp_flags & INP_DROPPED) {
 		error = ECONNREFUSED;
 		goto out;
 	}
 	if (SOLISTENING(so)) {
 		error = EOPNOTSUPP;
 		goto out;
 	}
 	tp = intotcpcb(inp);
 	TCPDEBUG1();
 	NET_EPOCH_ENTER(et);
 	if ((error = tcp_connect(tp, nam, td)) != 0)
 		goto out_in_epoch;
 #ifdef TCP_OFFLOAD
 	/*
 	 * When tcp_offload_connect() succeeds, the keep timer and the
 	 * tcp_output() call below are skipped.
 	 */
 	if (registered_toedevs > 0 &&
 	    (so->so_options & SO_NO_OFFLOAD) == 0 &&
 	    (error = tcp_offload_connect(so, nam)) == 0)
 		goto out_in_epoch;
 #endif
 	tcp_timer_activate(tp, TT_KEEP, TP_KEEPINIT(tp));
 	error = tcp_output(tp);
 	KASSERT(error >= 0, ("TCP stack %s requested tcp_drop(%p) at connect()"
 	    ", error code %d", tp->t_fb->tfb_tcp_block_name, tp, -error));
 out_in_epoch:
 	NET_EPOCH_EXIT(et);
 out:
 	TCPDEBUG2(PRU_CONNECT);
 	TCP_PROBE2(debug__user, tp, PRU_CONNECT);
 	INP_WUNLOCK(inp);
 	return (error);
 }
 #endif /* INET */
 
 #ifdef INET6
 /*
  * IPv6 counterpart of tcp_usr_connect(): initiate a connection to a peer.
  *
  * Requires a full-size AF_INET6 sockaddr_in6 and rejects multicast
  * destinations.  When INET is compiled in, a v4-mapped destination is
  * converted to a sockaddr_in and connected through tcp_connect();
  * otherwise the connection is set up with tcp6_connect().
  *
  * Returns 0 on success or an errno value.  If the call fails while the
  * inpcb still has no local port, the inp_vflag and inc_flags values
  * saved on entry are restored.
  */
 static int
 tcp6_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
 {
 	struct epoch_tracker et;
 	int error = 0;
 	struct inpcb *inp;
 	struct tcpcb *tp = NULL;
 	struct sockaddr_in6 *sin6;
 	u_int8_t incflagsav;
 	u_char vflagsav;
 
 	TCPDEBUG0;
 
 	sin6 = (struct sockaddr_in6 *)nam;
 	if (nam->sa_family != AF_INET6)
 		return (EAFNOSUPPORT);
 	if (nam->sa_len != sizeof (*sin6))
 		return (EINVAL);
 
 	/*
 	 * Must disallow TCP ``connections'' to multicast addresses.
 	 */
 	if (IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr))
 		return (EAFNOSUPPORT);
 
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("tcp6_usr_connect: inp == NULL"));
 	INP_WLOCK(inp);
 	/* Snapshot the flags this function may modify. */
 	vflagsav = inp->inp_vflag;
 	incflagsav = inp->inp_inc.inc_flags;
-	if (inp->inp_flags & INP_TIMEWAIT) {
-		error = EADDRINUSE;
-		goto out;
-	}
 	if (inp->inp_flags & INP_DROPPED) {
 		error = ECONNREFUSED;
 		goto out;
 	}
 	if (SOLISTENING(so)) {
 		error = EINVAL;
 		goto out;
 	}
 	tp = intotcpcb(inp);
 	TCPDEBUG1();
 #ifdef INET
 	/*
 	 * XXXRW: Some confusion: V4/V6 flags relate to binding, and
 	 * therefore probably require the hash lock, which isn't held here.
 	 * Is this a significant problem?
 	 */
 	if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
 		struct sockaddr_in sin;
 
 		if ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0) {
 			error = EINVAL;
 			goto out;
 		}
 		if ((inp->inp_vflag & INP_IPV4) == 0) {
 			error = EAFNOSUPPORT;
 			goto out;
 		}
 
 		/* Apply the same destination checks as tcp_usr_connect(). */
 		in6_sin6_2_sin(&sin, sin6);
 		if (IN_MULTICAST(ntohl(sin.sin_addr.s_addr))) {
 			error = EAFNOSUPPORT;
 			goto out;
 		}
 		if (ntohl(sin.sin_addr.s_addr) == INADDR_BROADCAST) {
 			error = EACCES;
 			goto out;
 		}
 		if ((error = prison_remote_ip4(td->td_ucred,
 		    &sin.sin_addr)) != 0)
 			goto out;
 		inp->inp_vflag |= INP_IPV4;
 		inp->inp_vflag &= ~INP_IPV6;
 		NET_EPOCH_ENTER(et);
 		if ((error = tcp_connect(tp, (struct sockaddr *)&sin, td)) != 0)
 			goto out_in_epoch;
 #ifdef TCP_OFFLOAD
 		if (registered_toedevs > 0 &&
 		    (so->so_options & SO_NO_OFFLOAD) == 0 &&
 		    (error = tcp_offload_connect(so, nam)) == 0)
 			goto out_in_epoch;
 #endif
 		/*
 		 * NOTE(review): unlike the native IPv6 path below, this
 		 * path does not arm TT_KEEP before tcp_output() -- confirm
 		 * whether that is intentional.
 		 */
 		error = tcp_output(tp);
 		goto out_in_epoch;
 	} else {
 		if ((inp->inp_vflag & INP_IPV6) == 0) {
 			error = EAFNOSUPPORT;
 			goto out;
 		}
 	}
 #endif
 	if ((error = prison_remote_ip6(td->td_ucred, &sin6->sin6_addr)) != 0)
 		goto out;
 	inp->inp_vflag &= ~INP_IPV4;
 	inp->inp_vflag |= INP_IPV6;
 	inp->inp_inc.inc_flags |= INC_ISIPV6;
 	NET_EPOCH_ENTER(et);
 	if ((error = tcp6_connect(tp, nam, td)) != 0)
 		goto out_in_epoch;
 #ifdef TCP_OFFLOAD
 	if (registered_toedevs > 0 &&
 	    (so->so_options & SO_NO_OFFLOAD) == 0 &&
 	    (error = tcp_offload_connect(so, nam)) == 0)
 		goto out_in_epoch;
 #endif
 	tcp_timer_activate(tp, TT_KEEP, TP_KEEPINIT(tp));
 	error = tcp_output(tp);
 out_in_epoch:
 	NET_EPOCH_EXIT(et);
 out:
 	KASSERT(error >= 0, ("TCP stack %s requested tcp_drop(%p) at connect()"
 	    ", error code %d", tp->t_fb->tfb_tcp_block_name, tp, -error));
 	/*
 	 * If the implicit bind in the connect call fails, restore
 	 * the flags we modified.
 	 */
 	if (error != 0 && inp->inp_lport == 0) {
 		inp->inp_vflag = vflagsav;
 		inp->inp_inc.inc_flags = incflagsav;
 	}
 
 	TCPDEBUG2(PRU_CONNECT);
 	TCP_PROBE2(debug__user, tp, PRU_CONNECT);
 	INP_WUNLOCK(inp);
 	return (error);
 }
 #endif /* INET6 */
 
 /*
  * Initiate disconnect from peer.
  * If connection never passed embryonic stage, just drop;
  * else if don't need to let data drain, then can just drop anyways,
  * else have to begin TCP shutdown process: mark socket disconnecting,
  * drain unread data, state switch to reflect user close, and
  * send segment (e.g. FIN) to peer.  Socket will be really disconnected
  * when peer sends FIN and acks ours.
  *
  * SHOULD IMPLEMENT LATER PRU_CONNECT VIA REALLOC TCPCB.
  *
  * Returns 0, or ECONNRESET when the inpcb has already been dropped.
  */
 static int
 tcp_usr_disconnect(struct socket *so)
 {
 	struct inpcb *inp;
 	struct tcpcb *tp = NULL;
 	struct epoch_tracker et;
 	int error = 0;
 
 	TCPDEBUG0;
 	/* The network epoch is entered before the inpcb lock is taken. */
 	NET_EPOCH_ENTER(et);
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("tcp_usr_disconnect: inp == NULL"));
 	INP_WLOCK(inp);
-	if (inp->inp_flags & INP_TIMEWAIT)
-		goto out;
 	if (inp->inp_flags & INP_DROPPED) {
 		error = ECONNRESET;
 		goto out;
 	}
 	tp = intotcpcb(inp);
 	TCPDEBUG1();
 	tcp_disconnect(tp);
 out:
 	TCPDEBUG2(PRU_DISCONNECT);
 	TCP_PROBE2(debug__user, tp, PRU_DISCONNECT);
 	INP_WUNLOCK(inp);
 	NET_EPOCH_EXIT(et);
 	return (error);
 }
 
 #ifdef INET
 /*
  * Accept a connection.  Essentially all the work is done at higher levels;
  * just return the address of the peer, storing through addr.
  *
  * On success *nam receives a sockaddr built by in_sockaddr() from the
  * inpcb's foreign port and address.  Returns 0, or ECONNABORTED when the
  * socket is already disconnected or the inpcb has been dropped.
  */
 static int
 tcp_usr_accept(struct socket *so, struct sockaddr **nam)
 {
 	int error = 0;
 	struct inpcb *inp = NULL;
 	/*
 	 * NOTE(review): tp is declared only under KDTRACE_HOOKS, yet the
 	 * TCPDEBUG1()/TCPDEBUG2() macros reference it when TCPDEBUG is
 	 * defined -- confirm that combination is never built.
 	 */
 #ifdef KDTRACE_HOOKS
 	struct tcpcb *tp = NULL;
 #endif
 	struct in_addr addr;
 	in_port_t port = 0;
 	TCPDEBUG0;
 
 	if (so->so_state & SS_ISDISCONNECTED)
 		return (ECONNABORTED);
 
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("tcp_usr_accept: inp == NULL"));
 	INP_WLOCK(inp);
-	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
+	if (inp->inp_flags & INP_DROPPED) {
 		error = ECONNABORTED;
 		goto out;
 	}
 #ifdef KDTRACE_HOOKS
 	tp = intotcpcb(inp);
 #endif
 	TCPDEBUG1();
 
 	/*
 	 * We inline in_getpeeraddr and COMMON_END here, so that we can
 	 * copy the data of interest and defer the malloc until after we
 	 * release the lock.
 	 */
 	port = inp->inp_fport;
 	addr = inp->inp_faddr;
 
 out:
 	TCPDEBUG2(PRU_ACCEPT);
 	TCP_PROBE2(debug__user, tp, PRU_ACCEPT);
 	INP_WUNLOCK(inp);
 	/* addr is only read when it was filled in above (error == 0). */
 	if (error == 0)
 		*nam = in_sockaddr(port, &addr);
 	return error;
 }
 #endif /* INET */
 
 #ifdef INET6
 /*
  * IPv6 counterpart of tcp_usr_accept(): return the peer's address.
  *
  * On success *nam receives either a v4-mapped sockaddr (when the inpcb
  * has INP_IPV4 set) or a native IPv6 sockaddr for the foreign endpoint.
  * Returns 0, or ECONNABORTED when the socket is already disconnected or
  * the inpcb has been dropped.
  */
 static int
 tcp6_usr_accept(struct socket *so, struct sockaddr **nam)
 {
 	struct inpcb *inp = NULL;
 	int error = 0;
 	/*
 	 * NOTE(review): tp is declared only under KDTRACE_HOOKS, yet the
 	 * TCPDEBUG1()/TCPDEBUG2() macros reference it when TCPDEBUG is
 	 * defined -- confirm that combination is never built.
 	 */
 #ifdef KDTRACE_HOOKS
 	struct tcpcb *tp = NULL;
 #endif
 	struct in_addr addr;
 	struct in6_addr addr6;
 	struct epoch_tracker et;
 	in_port_t port = 0;
 	int v4 = 0;
 	TCPDEBUG0;
 
 	if (so->so_state & SS_ISDISCONNECTED)
 		return (ECONNABORTED);
 
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("tcp6_usr_accept: inp == NULL"));
 	NET_EPOCH_ENTER(et);
 	INP_WLOCK(inp);
-	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
+	if (inp->inp_flags & INP_DROPPED) {
 		error = ECONNABORTED;
 		goto out;
 	}
 #ifdef KDTRACE_HOOKS
 	tp = intotcpcb(inp);
 #endif
 	TCPDEBUG1();
 
 	/*
 	 * We inline in6_mapped_peeraddr and COMMON_END here, so that we can
 	 * copy the data of interest and defer the malloc until after we
 	 * release the lock.
 	 */
 	if (inp->inp_vflag & INP_IPV4) {
 		v4 = 1;
 		port = inp->inp_fport;
 		addr = inp->inp_faddr;
 	} else {
 		port = inp->inp_fport;
 		addr6 = inp->in6p_faddr;
 	}
 
 out:
 	TCPDEBUG2(PRU_ACCEPT);
 	TCP_PROBE2(debug__user, tp, PRU_ACCEPT);
 	INP_WUNLOCK(inp);
 	NET_EPOCH_EXIT(et);
 	/* Build the result from the copies taken while the lock was held. */
 	if (error == 0) {
 		if (v4)
 			*nam = in6_v4mapsin6_sockaddr(port, &addr);
 		else
 			*nam = in6_sockaddr(port, &addr6);
 	}
 	return error;
 }
 #endif /* INET6 */
 
 /*
  * Mark the connection as being incapable of further output.
  *
  * Returns ECONNRESET when the inpcb has already been dropped, otherwise
  * the value produced by tcp_unlock_or_drop().
  */
 static int
 tcp_usr_shutdown(struct socket *so)
 {
 	int error = 0;
 	struct inpcb *inp;
 	struct tcpcb *tp = NULL;
 	struct epoch_tracker et;
 
 	TCPDEBUG0;
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("inp == NULL"));
 	INP_WLOCK(inp);
-	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
+	if (inp->inp_flags & INP_DROPPED) {
 		INP_WUNLOCK(inp);
 		return (ECONNRESET);
 	}
 	tp = intotcpcb(inp);
 	NET_EPOCH_ENTER(et);
 	TCPDEBUG1();
 	socantsendmore(so);
 	tcp_usrclosed(tp);
 	/* Re-check INP_DROPPED after tcp_usrclosed() before sending. */
 	if (!(inp->inp_flags & INP_DROPPED))
 		error = tcp_output_nodrop(tp);
 	TCPDEBUG2(PRU_SHUTDOWN);
 	TCP_PROBE2(debug__user, tp, PRU_SHUTDOWN);
 	/*
 	 * There is no explicit INP_WUNLOCK() on this path.
 	 * NOTE(review): presumably tcp_unlock_or_drop() releases the inpcb
 	 * lock -- confirm against its definition.
 	 */
 	error = tcp_unlock_or_drop(tp, error);
 	NET_EPOCH_EXIT(et);
 
 	return (error);
 }
 
 /*
  * After a receive, possibly send window update to peer.
  *
  * 'flags' is not used.  Returns ECONNRESET when the inpcb has already
  * been dropped; otherwise 0, because 'error' is never modified and the
  * result of the output attempt (outrv) is only handed to
  * tcp_unlock_or_drop(), whose return value is discarded.
  */
 static int
 tcp_usr_rcvd(struct socket *so, int flags)
 {
 	struct epoch_tracker et;
 	struct inpcb *inp;
 	struct tcpcb *tp = NULL;
 	int outrv = 0, error = 0;
 
 	TCPDEBUG0;
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("tcp_usr_rcvd: inp == NULL"));
 	INP_WLOCK(inp);
-	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
+	if (inp->inp_flags & INP_DROPPED) {
 		INP_WUNLOCK(inp);
 		return (ECONNRESET);
 	}
 	tp = intotcpcb(inp);
 	NET_EPOCH_ENTER(et);
 	TCPDEBUG1();
 	/*
 	 * For passively-created TFO connections, don't attempt a window
 	 * update while still in SYN_RECEIVED as this may trigger an early
 	 * SYN|ACK.  It is preferable to have the SYN|ACK be sent along with
 	 * application response data, or failing that, when the DELACK timer
 	 * expires.
 	 */
 	if (IS_FASTOPEN(tp->t_flags) &&
 	    (tp->t_state == TCPS_SYN_RECEIVED))
 		goto out;
 #ifdef TCP_OFFLOAD
 	if (tp->t_flags & TF_TOE)
 		tcp_offload_rcvd(tp);
 	else
 #endif
 		outrv = tcp_output_nodrop(tp);
 out:
 	TCPDEBUG2(PRU_RCVD);
 	TCP_PROBE2(debug__user, tp, PRU_RCVD);
 	/*
 	 * NOTE(review): presumably tcp_unlock_or_drop() releases the inpcb
 	 * lock, as there is no explicit unlock on this path -- confirm.
 	 */
 	(void) tcp_unlock_or_drop(tp, outrv);
 	NET_EPOCH_EXIT(et);
 	return (error);
 }
 
 /*
  * Do a send by putting data in output queue and updating urgent
  * marker if URG set.  Possibly send more data.  Unlike the other
  * pru_*() routines, the mbuf chains are our responsibility.  We
  * must either enqueue them or free them.  The other pru_* routines
  * generally are caller-frees.
  *
  * so:      socket being written to.
  * flags:   PRUS_* bits; PRUS_OOB, PRUS_EOF, PRUS_MORETOCOME and
  *          PRUS_NOTREADY are examined here.
  * m:       data to append to the send buffer.
  * nam:     optional destination; when set and the tcpcb is in a state
  *          before TCPS_SYN_SENT, an implied connect is performed.
  * control: must be empty; non-empty control data yields EINVAL.
  * td:      calling thread, used for the jail checks and the connect.
  *
  * Returns the value produced by tcp_unlock_or_drop() for the accumulated
  * error, or EINVAL / ECONNRESET from the early-exit paths.
  */
 static int
 tcp_usr_send(struct socket *so, int flags, struct mbuf *m,
     struct sockaddr *nam, struct mbuf *control, struct thread *td)
 {
 	struct epoch_tracker et;
 	int error = 0;
 	struct inpcb *inp;
 	struct tcpcb *tp = NULL;
 #ifdef INET
 #ifdef INET6
 	struct sockaddr_in sin;
 #endif
 	struct sockaddr_in *sinp;
 #endif
 #ifdef INET6
 	int isipv6;
 #endif
 	u_int8_t incflagsav;
 	u_char vflagsav;
 	bool restoreflags;
 	TCPDEBUG0;
 
 	if (control != NULL) {
 		/* TCP doesn't do control messages (rights, creds, etc) */
 		if (control->m_len) {
 			m_freem(control);
 			return (EINVAL);
 		}
 		m_freem(control);	/* empty control, just free it */
 	}
 
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("tcp_usr_send: inp == NULL"));
 	INP_WLOCK(inp);
 	/*
 	 * On a dropped inpcb the data is freed here unless PRUS_NOTREADY is
 	 * set, in which case freeing is left to the caller.
 	 */
-	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
+	if (inp->inp_flags & INP_DROPPED) {
 		if (m != NULL && (flags & PRUS_NOTREADY) == 0)
 			m_freem(m);
 		INP_WUNLOCK(inp);
 		return (ECONNRESET);
 	}
 
 	/* Snapshot the flags an implied connect may change. */
 	vflagsav = inp->inp_vflag;
 	incflagsav = inp->inp_inc.inc_flags;
 	restoreflags = false;
 	tp = intotcpcb(inp);
 
 	NET_EPOCH_ENTER(et);
 	if ((flags & PRUS_OOB) != 0 &&
 	    (error = tcp_pru_options_support(tp, PRUS_OOB)) != 0)
 		goto out;
 
 	TCPDEBUG1();
 	/*
 	 * Validate the destination of an implied connect.  isipv6 (and sinp
 	 * for IPv4) set here select the connect routine used further down.
 	 */
 	if (nam != NULL && tp->t_state < TCPS_SYN_SENT) {
 		if (tp->t_state == TCPS_LISTEN) {
 			error = EINVAL;
 			goto out;
 		}
 		switch (nam->sa_family) {
 #ifdef INET
 		case AF_INET:
 			sinp = (struct sockaddr_in *)nam;
 			if (sinp->sin_len != sizeof(struct sockaddr_in)) {
 				error = EINVAL;
 				goto out;
 			}
 			if ((inp->inp_vflag & INP_IPV6) != 0) {
 				error = EAFNOSUPPORT;
 				goto out;
 			}
 			if (IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) {
 				error = EAFNOSUPPORT;
 				goto out;
 			}
 			if (ntohl(sinp->sin_addr.s_addr) == INADDR_BROADCAST) {
 				error = EACCES;
 				goto out;
 			}
 			if ((error = prison_remote_ip4(td->td_ucred,
 			    &sinp->sin_addr)))
 				goto out;
 #ifdef INET6
 			isipv6 = 0;
 #endif
 			break;
 #endif /* INET */
 #ifdef INET6
 		case AF_INET6:
 		{
 			struct sockaddr_in6 *sin6;
 
 			sin6 = (struct sockaddr_in6 *)nam;
 			if (sin6->sin6_len != sizeof(*sin6)) {
 				error = EINVAL;
 				goto out;
 			}
 			if ((inp->inp_vflag & INP_IPV6PROTO) == 0) {
 				error = EAFNOSUPPORT;
 				goto out;
 			}
 			if (IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) {
 				error = EAFNOSUPPORT;
 				goto out;
 			}
 			if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
 #ifdef INET
 				if ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0) {
 					error = EINVAL;
 					goto out;
 				}
 				if ((inp->inp_vflag & INP_IPV4) == 0) {
 					error = EAFNOSUPPORT;
 					goto out;
 				}
 				/* v4-mapped: connect as IPv4 via 'sin'. */
 				restoreflags = true;
 				inp->inp_vflag &= ~INP_IPV6;
 				sinp = &sin;
 				in6_sin6_2_sin(sinp, sin6);
 				if (IN_MULTICAST(
 				    ntohl(sinp->sin_addr.s_addr))) {
 					error = EAFNOSUPPORT;
 					goto out;
 				}
 				if ((error = prison_remote_ip4(td->td_ucred,
 				    &sinp->sin_addr)))
 					goto out;
 				isipv6 = 0;
 #else /* !INET */
 				error = EAFNOSUPPORT;
 				goto out;
 #endif /* INET */
 			} else {
 				if ((inp->inp_vflag & INP_IPV6) == 0) {
 					error = EAFNOSUPPORT;
 					goto out;
 				}
 				restoreflags = true;
 				inp->inp_vflag &= ~INP_IPV4;
 				inp->inp_inc.inc_flags |= INC_ISIPV6;
 				if ((error = prison_remote_ip6(td->td_ucred,
 				    &sin6->sin6_addr)))
 					goto out;
 				isipv6 = 1;
 			}
 			break;
 		}
 #endif /* INET6 */
 		default:
 			error = EAFNOSUPPORT;
 			goto out;
 		}
 	}
 	if (!(flags & PRUS_OOB)) {
 		/* Normal data: queue it; m is no longer ours to free. */
 		if (tp->t_acktime == 0)
 			tp->t_acktime = ticks;
 		sbappendstream(&so->so_snd, m, flags);
 		m = NULL;
 		if (nam && tp->t_state < TCPS_SYN_SENT) {
 			KASSERT(tp->t_state == TCPS_CLOSED,
 			    ("%s: tp %p is listening", __func__, tp));
 
 			/*
 			 * Do implied connect if not yet connected,
 			 * initialize window to default value, and
 			 * initialize maxseg using peer's cached MSS.
 			 */
 #ifdef INET6
 			if (isipv6)
 				error = tcp6_connect(tp, nam, td);
 #endif /* INET6 */
 #if defined(INET6) && defined(INET)
 			else
 #endif
 #ifdef INET
 				error = tcp_connect(tp,
 				    (struct sockaddr *)sinp, td);
 #endif
 			/*
 			 * The bind operation in tcp_connect succeeded. We
 			 * no longer want to restore the flags if later
 			 * operations fail.
 			 */
 			if (error == 0 || inp->inp_lport != 0)
 				restoreflags = false;
 
 			if (error) {
 				/* m is freed if PRUS_NOTREADY is unset. */
 				sbflush(&so->so_snd);
 				goto out;
 			}
 			if (IS_FASTOPEN(tp->t_flags))
 				tcp_fastopen_connect(tp);
 			else {
 				tp->snd_wnd = TTCP_CLIENT_SND_WND;
 				tcp_mss(tp, -1);
 			}
 		}
 		if (flags & PRUS_EOF) {
 			/*
 			 * Close the send side of the connection after
 			 * the data is sent.
 			 */
 			socantsendmore(so);
 			tcp_usrclosed(tp);
 		}
 		/*
 		 * Record the tick of the first queued byte once established;
 		 * 0 means "unset", so a tick value of 0 is stored as 1.
 		 */
 		if (TCPS_HAVEESTABLISHED(tp->t_state) &&
 		    ((tp->t_flags2 & TF2_FBYTES_COMPLETE) == 0) &&
 		    (tp->t_fbyte_out == 0) &&
 		    (so->so_snd.sb_ccc > 0)) {
 			tp->t_fbyte_out = ticks;
 			if (tp->t_fbyte_out == 0)
 				tp->t_fbyte_out = 1;
 			if (tp->t_fbyte_out && tp->t_fbyte_in)
 				tp->t_flags2 |= TF2_FBYTES_COMPLETE;
 		}
 		if (!(inp->inp_flags & INP_DROPPED) &&
 		    !(flags & PRUS_NOTREADY)) {
 			/* TF_MORETOCOME is set only around this one call. */
 			if (flags & PRUS_MORETOCOME)
 				tp->t_flags |= TF_MORETOCOME;
 			error = tcp_output_nodrop(tp);
 			if (flags & PRUS_MORETOCOME)
 				tp->t_flags &= ~TF_MORETOCOME;
 		}
 	} else {
 		/*
 		 * XXXRW: PRUS_EOF not implemented with PRUS_OOB?
 		 */
 		SOCKBUF_LOCK(&so->so_snd);
 		if (sbspace(&so->so_snd) < -512) {
 			SOCKBUF_UNLOCK(&so->so_snd);
 			error = ENOBUFS;
 			goto out;
 		}
 		/*
 		 * According to RFC961 (Assigned Protocols),
 		 * the urgent pointer points to the last octet
 		 * of urgent data.  We continue, however,
 		 * to consider it to indicate the first octet
 		 * of data past the urgent section.
 		 * Otherwise, snd_up should be one lower.
 		 */
 		if (tp->t_acktime == 0)
 			tp->t_acktime = ticks;
 		sbappendstream_locked(&so->so_snd, m, flags);
 		SOCKBUF_UNLOCK(&so->so_snd);
 		m = NULL;
 		if (nam && tp->t_state < TCPS_SYN_SENT) {
 			/*
 			 * Do implied connect if not yet connected,
 			 * initialize window to default value, and
 			 * initialize maxseg using peer's cached MSS.
 			 */
 
 			/*
 			 * Not going to contemplate SYN|URG
 			 */
 			if (IS_FASTOPEN(tp->t_flags))
 				tp->t_flags &= ~TF_FASTOPEN;
 #ifdef INET6
 			if (isipv6)
 				error = tcp6_connect(tp, nam, td);
 #endif /* INET6 */
 #if defined(INET6) && defined(INET)
 			else
 #endif
 #ifdef INET
 				error = tcp_connect(tp,
 				    (struct sockaddr *)sinp, td);
 #endif
 			/*
 			 * The bind operation in tcp_connect succeeded. We
 			 * no longer want to restore the flags if later
 			 * operations fail.
 			 */
 			if (error == 0 || inp->inp_lport != 0)
 				restoreflags = false;
 
 			if (error != 0) {
 				/* m is freed if PRUS_NOTREADY is unset. */
 				sbflush(&so->so_snd);
 				goto out;
 			}
 			tp->snd_wnd = TTCP_CLIENT_SND_WND;
 			tcp_mss(tp, -1);
 		}
 		/* Urgent pointer: one past everything currently queued. */
 		tp->snd_up = tp->snd_una + sbavail(&so->so_snd);
 		if ((flags & PRUS_NOTREADY) == 0) {
 			/* TF_FORCEDATA is set only around this one call. */
 			tp->t_flags |= TF_FORCEDATA;
 			error = tcp_output_nodrop(tp);
 			tp->t_flags &= ~TF_FORCEDATA;
 		}
 	}
 	TCP_LOG_EVENT(tp, NULL,
 	    &inp->inp_socket->so_rcv,
 	    &inp->inp_socket->so_snd,
 	    TCP_LOG_USERSEND, error,
 	    0, NULL, false);
 
 out:
 	/*
 	 * In case of PRUS_NOTREADY, the caller or tcp_usr_ready() is
 	 * responsible for freeing memory.
 	 */
 	if (m != NULL && (flags & PRUS_NOTREADY) == 0)
 		m_freem(m);
 
 	/*
 	 * If the request was unsuccessful and we changed flags,
 	 * restore the original flags.
 	 */
 	if (error != 0 && restoreflags) {
 		inp->inp_vflag = vflagsav;
 		inp->inp_inc.inc_flags = incflagsav;
 	}
 	TCPDEBUG2((flags & PRUS_OOB) ? PRU_SENDOOB :
 		  ((flags & PRUS_EOF) ? PRU_SEND_EOF : PRU_SEND));
 	TCP_PROBE2(debug__user, tp, (flags & PRUS_OOB) ? PRU_SENDOOB :
 		   ((flags & PRUS_EOF) ? PRU_SEND_EOF : PRU_SEND));
 	/*
 	 * NOTE(review): presumably tcp_unlock_or_drop() releases the inpcb
 	 * lock, as there is no explicit unlock on this path -- confirm.
 	 */
 	error = tcp_unlock_or_drop(tp, error);
 	NET_EPOCH_EXIT(et);
 	return (error);
 }
 
 /*
  * Hand 'count' not-ready mbufs starting at 'm' to sbready() on the
  * socket's send buffer and, on success, call tcp_output_unlock().
  *
  * Returns ECONNRESET when the inpcb is flagged INP_DROPPED (the mbufs
  * are then passed to mb_free_notready()), the sbready() error when that
  * call fails, and otherwise the result of tcp_output_unlock().
  */
 static int
 tcp_usr_ready(struct socket *so, struct mbuf *m, int count)
 {
 	struct epoch_tracker et;
 	struct inpcb *inp;
 	struct tcpcb *tp;
 	int error;
 
 	inp = sotoinpcb(so);
 	INP_WLOCK(inp);
 	/* A dropped connection cannot send: give the mbufs back and bail. */
-	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
+	if (inp->inp_flags & INP_DROPPED) {
 		INP_WUNLOCK(inp);
 		mb_free_notready(m, count);
 		return (ECONNRESET);
 	}
 	tp = intotcpcb(inp);
 
 	/* sbready() is called with the send buffer lock held. */
 	SOCKBUF_LOCK(&so->so_snd);
 	error = sbready(&so->so_snd, m, count);
 	SOCKBUF_UNLOCK(&so->so_snd);
 	if (error) {
 		INP_WUNLOCK(inp);
 		return (error);
 	}
 	/*
 	 * NOTE(review): no INP_WUNLOCK follows this call, so
 	 * tcp_output_unlock() is presumably responsible for releasing the
 	 * inpcb lock -- confirm against its definition.
 	 */
 	NET_EPOCH_ENTER(et);
 	error = tcp_output_unlock(tp);
 	NET_EPOCH_EXIT(et);
 
 	return (error);
 }
 
 /*
  * Abort the TCP.  Drop the connection abruptly.
  *
  * Runs inside the network epoch with the inpcb write-locked.  If the
  * inpcb is not yet INP_DROPPED the connection is dropped with
  * ECONNABORTED.  If the inpcb is still not INP_DROPPED afterwards, a
  * socket reference is taken and recorded with INP_SOCKREF.
  */
 static void
 tcp_usr_abort(struct socket *so)
 {
 	struct inpcb *inp;
 	struct tcpcb *tp = NULL;
 	struct epoch_tracker et;
 	TCPDEBUG0;
 
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("tcp_usr_abort: inp == NULL"));
 
 	NET_EPOCH_ENTER(et);
 	INP_WLOCK(inp);
 	KASSERT(inp->inp_socket != NULL,
 	    ("tcp_usr_abort: inp_socket == NULL"));
 
 	/*
 	 * If we still have full TCP state, and we're not dropped, drop.
 	 */
-	if (!(inp->inp_flags & INP_TIMEWAIT) &&
-	    !(inp->inp_flags & INP_DROPPED)) {
+	if (!(inp->inp_flags & INP_DROPPED)) {
 		tp = intotcpcb(inp);
 		TCPDEBUG1();
 		tp = tcp_drop(tp, ECONNABORTED);
 		/*
 		 * NOTE(review): the NULL path jumps past INP_WUNLOCK, so
 		 * tcp_drop() presumably released (or freed) the inpcb when
 		 * it returns NULL -- confirm against tcp_drop().
 		 */
 		if (tp == NULL)
 			goto dropped;
 		TCPDEBUG2(PRU_ABORT);
 		TCP_PROBE2(debug__user, tp, PRU_ABORT);
 	}
 	/*
 	 * INP_DROPPED is tested again because the block above may have
 	 * changed it; only a still-live pcb keeps a socket reference.
 	 */
 	if (!(inp->inp_flags & INP_DROPPED)) {
 		soref(so);
 		inp->inp_flags |= INP_SOCKREF;
 	}
 	INP_WUNLOCK(inp);
 dropped:
 	NET_EPOCH_EXIT(et);
 }
 
 /*
  * TCP socket is closed.  Start friendly disconnect.
  *
  * Runs inside the network epoch with the inpcb write-locked.  If the
  * inpcb is not yet INP_DROPPED, TF_CLOSED is set on the tcpcb and
  * tcp_disconnect() is called.  If the inpcb is still not INP_DROPPED
  * afterwards, a socket reference is taken and recorded with INP_SOCKREF.
  */
 static void
 tcp_usr_close(struct socket *so)
 {
 	struct inpcb *inp;
 	struct tcpcb *tp = NULL;
 	struct epoch_tracker et;
 	TCPDEBUG0;
 
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("tcp_usr_close: inp == NULL"));
 
 	NET_EPOCH_ENTER(et);
 	INP_WLOCK(inp);
 	KASSERT(inp->inp_socket != NULL,
 	    ("tcp_usr_close: inp_socket == NULL"));
 
 	/*
 	 * If we still have full TCP state, and we're not dropped, initiate
 	 * a disconnect.
 	 */
-	if (!(inp->inp_flags & INP_TIMEWAIT) &&
-	    !(inp->inp_flags & INP_DROPPED)) {
+	if (!(inp->inp_flags & INP_DROPPED)) {
 		tp = intotcpcb(inp);
 		tp->t_flags |= TF_CLOSED;
 		TCPDEBUG1();
 		/*
 		 * The return value of tcp_disconnect() is ignored; the
 		 * inpcb is used and unlocked below regardless.
 		 */
 		tcp_disconnect(tp);
 		TCPDEBUG2(PRU_CLOSE);
 		TCP_PROBE2(debug__user, tp, PRU_CLOSE);
 	}
 	/*
 	 * INP_DROPPED is tested again because the block above may have
 	 * changed it; only a still-live pcb keeps a socket reference.
 	 */
 	if (!(inp->inp_flags & INP_DROPPED)) {
 		soref(so);
 		inp->inp_flags |= INP_SOCKREF;
 	}
 	INP_WUNLOCK(inp);
 	NET_EPOCH_EXIT(et);
 }
 
 /*
  * Ask the connection's TCP stack whether it supports the PRUS_* bits in
  * 'flags'.  Returns whatever the stack's tfb_pru_options hook returns,
  * or 0 when the stack does not provide that hook.
  */
 static int
 tcp_pru_options_support(struct tcpcb *tp, int flags)
 {
 	/*
 	 * A stack without a tfb_pru_options hook is taken to support every
 	 * PRU_XX option, so there is nothing to ask.
 	 */
 	if (!tp->t_fb->tfb_pru_options)
 		return (0);
 
 	/* Otherwise the stack may be selective; let it decide. */
 	return ((*tp->t_fb->tfb_pru_options)(tp, flags));
 }
 
 /*
  * Receive out-of-band data.
  *
  * On success the saved urgent byte (tp->t_iobc) is stored as the single
  * byte of 'm' (m_len is set to 1).  Unless MSG_PEEK is set in 'flags',
  * TCPOOB_HAVEDATA and TCPOOB_HADDATA are both toggled, so a later call
  * sees TCPOOB_HADDATA and fails with EINVAL.
  *
  * Errors: ECONNRESET if the inpcb is INP_DROPPED; the stack's error if
  * tcp_pru_options_support() rejects PRUS_OOB; EINVAL if there is no OOB
  * mark, SO_OOBINLINE is set, or the byte was already consumed;
  * EWOULDBLOCK if no OOB byte is currently held.
  */
 static int
 tcp_usr_rcvoob(struct socket *so, struct mbuf *m, int flags)
 {
 	int error = 0;
 	struct inpcb *inp;
 	struct tcpcb *tp = NULL;
 
 	TCPDEBUG0;
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("tcp_usr_rcvoob: inp == NULL"));
 	INP_WLOCK(inp);
-	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
+	if (inp->inp_flags & INP_DROPPED) {
 		error = ECONNRESET;
 		goto out;
 	}
 	tp = intotcpcb(inp);
 	error = tcp_pru_options_support(tp, PRUS_OOB);
 	if (error) {
 		goto out;
 	}
 	TCPDEBUG1();
 	/*
 	 * Reject when there is no mark at all, when OOB data is delivered
 	 * inline, or when the OOB byte has already been read.
 	 */
 	if ((so->so_oobmark == 0 &&
 	     (so->so_rcv.sb_state & SBS_RCVATMARK) == 0) ||
 	    so->so_options & SO_OOBINLINE ||
 	    tp->t_oobflags & TCPOOB_HADDATA) {
 		error = EINVAL;
 		goto out;
 	}
 	if ((tp->t_oobflags & TCPOOB_HAVEDATA) == 0) {
 		error = EWOULDBLOCK;
 		goto out;
 	}
 	m->m_len = 1;
 	*mtod(m, caddr_t) = tp->t_iobc;
 	/* HAVEDATA is set and HADDATA clear here, so XOR swaps them. */
 	if ((flags & MSG_PEEK) == 0)
 		tp->t_oobflags ^= (TCPOOB_HAVEDATA | TCPOOB_HADDATA);
 
 out:
 	/* tp is still NULL here if the inpcb was already dropped. */
 	TCPDEBUG2(PRU_RCVOOB);
 	TCP_PROBE2(debug__user, tp, PRU_RCVOOB);
 	INP_WUNLOCK(inp);
 	return (error);
 }
 
 #ifdef INET
 /*
  * Protocol switch for TCP stream sockets, compiled when INET is defined.
  * Handlers are grouped by role; designated initializers make the order
  * irrelevant to the resulting object.
  */
 struct protosw tcp_protosw = {
 	/* Identity and behaviour flags. */
 	.pr_type =		SOCK_STREAM,
 	.pr_protocol =		IPPROTO_TCP,
 	.pr_flags =		PR_CONNREQUIRED | PR_IMPLOPCL | PR_WANTRCVD |
 				    PR_CAPATTACH,
 
 	/* Socket setup and teardown. */
 	.pr_attach =		tcp_usr_attach,
 	.pr_detach =		tcp_usr_detach,
 	.pr_close =		tcp_usr_close,
 	.pr_abort =		tcp_usr_abort,
 
 	/* Connection management. */
 	.pr_bind =		tcp_usr_bind,
 	.pr_listen =		tcp_usr_listen,
 	.pr_accept =		tcp_usr_accept,
 	.pr_connect =		tcp_usr_connect,
 	.pr_disconnect =	tcp_usr_disconnect,
 	.pr_shutdown =		tcp_usr_shutdown,
 
 	/* Data transfer. */
 	.pr_send =		tcp_usr_send,
 	.pr_ready =		tcp_usr_ready,
 	.pr_rcvd =		tcp_usr_rcvd,
 	.pr_rcvoob =		tcp_usr_rcvoob,
 
 	/* Options, addresses and control. */
 	.pr_ctloutput =		tcp_ctloutput,
 	.pr_control =		in_control,
 	.pr_peeraddr =		in_getpeeraddr,
 	.pr_sockaddr =		in_getsockaddr,
 	.pr_sosetlabel =	in_pcbsosetlabel,
 };
 #endif /* INET */
 
 #ifdef INET6
 /*
  * Protocol switch for TCP stream sockets, compiled when INET6 is defined.
  * Handlers are grouped by role; designated initializers make the order
  * irrelevant to the resulting object.
  */
 struct protosw tcp6_protosw = {
 	/* Identity and behaviour flags. */
 	.pr_type =		SOCK_STREAM,
 	.pr_protocol =		IPPROTO_TCP,
 	.pr_flags =		PR_CONNREQUIRED | PR_IMPLOPCL | PR_WANTRCVD |
 				    PR_CAPATTACH,
 
 	/* Socket setup and teardown. */
 	.pr_attach =		tcp_usr_attach,
 	.pr_detach =		tcp_usr_detach,
 	.pr_close =		tcp_usr_close,
 	.pr_abort =		tcp_usr_abort,
 
 	/* Connection management. */
 	.pr_bind =		tcp6_usr_bind,
 	.pr_listen =		tcp6_usr_listen,
 	.pr_accept =		tcp6_usr_accept,
 	.pr_connect =		tcp6_usr_connect,
 	.pr_disconnect =	tcp_usr_disconnect,
 	.pr_shutdown =		tcp_usr_shutdown,
 
 	/* Data transfer. */
 	.pr_send =		tcp_usr_send,
 	.pr_ready =		tcp_usr_ready,
 	.pr_rcvd =		tcp_usr_rcvd,
 	.pr_rcvoob =		tcp_usr_rcvoob,
 
 	/* Options, addresses and control. */
 	.pr_ctloutput =		tcp_ctloutput,
 	.pr_control =		in6_control,
 	.pr_peeraddr =		in6_mapped_peeraddr,
 	.pr_sockaddr =		in6_mapped_sockaddr,
 	.pr_sosetlabel =	in_pcbsosetlabel,
 };
 #endif /* INET6 */
 
 #ifdef INET
 /*
  * Common subroutine to open a TCP connection to remote host specified
  * by struct sockaddr_in in mbuf *nam.  Call in_pcbbind to assign a local
  * port number if needed.  Call in_pcbconnect_setup to do the routing and
  * to choose a local host address (interface).  If there is an existing
  * incarnation of the same connection in TIME-WAIT state and if the remote
  * host was sending CC options and if the connection duration was < MSL, then
  * truncate the previous TIME-WAIT state and proceed.
  * Initialize connection parameters and enter SYN-SENT state.
  */
 static int
 tcp_connect(struct tcpcb *tp, struct sockaddr *nam, struct thread *td)
 {
 	struct inpcb *inp = tp->t_inpcb, *oinp;
 	struct socket *so = inp->inp_socket;
 	struct in_addr laddr;
 	u_short lport;
 	int error;
 
 	NET_EPOCH_ASSERT();
 	INP_WLOCK_ASSERT(inp);
 	INP_HASH_WLOCK(&V_tcbinfo);
 
 	if (V_tcp_require_unique_port && inp->inp_lport == 0) {
 		error = in_pcbbind(inp, (struct sockaddr *)0, td->td_ucred);
 		if (error)
 			goto out;
 	}
 
 	/*
 	 * Cannot simply call in_pcbconnect, because there might be an
 	 * earlier incarnation of this same connection still in
 	 * TIME_WAIT state, creating an ADDRINUSE error.
 	 */
 	laddr = inp->inp_laddr;
 	lport = inp->inp_lport;
 	error = in_pcbconnect_setup(inp, nam, &laddr.s_addr, &lport,
 	    &inp->inp_faddr.s_addr, &inp->inp_fport, &oinp, td->td_ucred);
 	if (error && oinp == NULL)
 		goto out;
 	if (oinp) {
 		error = EADDRINUSE;
 		goto out;
 	}
 	/* Handle initial bind if it hadn't been done in advance. */
 	if (inp->inp_lport == 0) {
 		inp->inp_lport = lport;
 		if (in_pcbinshash(inp) != 0) {
 			inp->inp_lport = 0;
 			error = EAGAIN;
 			goto out;
 		}
 	}
 	inp->inp_laddr = laddr;
 	in_pcbrehash(inp);
 	INP_HASH_WUNLOCK(&V_tcbinfo);
 
 	/*
 	 * Compute window scaling to request:
 	 * Scale to fit into sweet spot.  See tcp_syncache.c.
 	 * XXX: This should move to tcp_output().
 	 */
 	while (tp->request_r_scale < TCP_MAX_WINSHIFT &&
 	    (TCP_MAXWIN << tp->request_r_scale) < sb_max)
 		tp->request_r_scale++;
 
 	soisconnecting(so);
 	TCPSTAT_INC(tcps_connattempt);
 	tcp_state_change(tp, TCPS_SYN_SENT);
 	tp->iss = tcp_new_isn(&inp->inp_inc);
 	if (tp->t_flags & TF_REQ_TSTMP)
 		tp->ts_offset = tcp_new_ts_offset(&inp->inp_inc);
 	tcp_sendseqinit(tp);
 
 	return 0;
 
 out:
 	INP_HASH_WUNLOCK(&V_tcbinfo);
 	return (error);
 }
 #endif /* INET */
 
 #ifdef INET6
 /*
  * IPv6 counterpart of tcp_connect(): optionally bind a local port,
  * connect the inpcb with in6_pcbconnect(), choose the window scale to
  * request, and move the tcpcb to SYN-SENT.  Returns 0 or the error from
  * in6_pcbbind()/in6_pcbconnect().  The caller holds the inpcb write lock.
  */
 static int
 tcp6_connect(struct tcpcb *tp, struct sockaddr *nam, struct thread *td)
 {
 	struct inpcb *inp = tp->t_inpcb;
 	int rv;
 
 	INP_WLOCK_ASSERT(inp);
 	INP_HASH_WLOCK(&V_tcbinfo);
 
 	/* Bind a local port up front when unique ports are required. */
 	if (V_tcp_require_unique_port && inp->inp_lport == 0) {
 		rv = in6_pcbbind(inp, (struct sockaddr *)0, td->td_ucred);
 		if (rv != 0)
 			goto fail;
 	}
 	rv = in6_pcbconnect(inp, nam, td->td_ucred);
 	if (rv != 0)
 		goto fail;
 	INP_HASH_WUNLOCK(&V_tcbinfo);
 
 	/* Compute window scaling to request.  */
 	for (; tp->request_r_scale < TCP_MAX_WINSHIFT &&
 	    (TCP_MAXWIN << tp->request_r_scale) < sb_max;
 	    tp->request_r_scale++)
 		continue;
 
 	/* Enter SYN-SENT and initialize the send sequence space. */
 	soisconnecting(inp->inp_socket);
 	TCPSTAT_INC(tcps_connattempt);
 	tcp_state_change(tp, TCPS_SYN_SENT);
 	tp->iss = tcp_new_isn(&inp->inp_inc);
 	if ((tp->t_flags & TF_REQ_TSTMP) != 0)
 		tp->ts_offset = tcp_new_ts_offset(&inp->inp_inc);
 	tcp_sendseqinit(tp);
 
 	return (0);
 
 fail:
 	/* Every failure above still holds the pcb hash lock. */
 	INP_HASH_WUNLOCK(&V_tcbinfo);
 	return (rv);
 }
 #endif /* INET6 */
 
 /*
  * Export TCP internal state information via a struct tcp_info, based on the
  * Linux 2.6 API.  Not ABI compatible as our constants are mapped differently
  * (TCP state machine, etc).  We export all information using FreeBSD-native
  * constants -- for example, the numeric values for tcpi_state will differ
  * from Linux.
  */
 static void
 tcp_fill_info(struct tcpcb *tp, struct tcp_info *ti)
 {
 
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 	bzero(ti, sizeof(*ti));
 
 	ti->tcpi_state = tp->t_state;
 	if ((tp->t_flags & TF_REQ_TSTMP) && (tp->t_flags & TF_RCVD_TSTMP))
 		ti->tcpi_options |= TCPI_OPT_TIMESTAMPS;
 	if (tp->t_flags & TF_SACK_PERMIT)
 		ti->tcpi_options |= TCPI_OPT_SACK;
 	if ((tp->t_flags & TF_REQ_SCALE) && (tp->t_flags & TF_RCVD_SCALE)) {
 		ti->tcpi_options |= TCPI_OPT_WSCALE;
 		ti->tcpi_snd_wscale = tp->snd_scale;
 		ti->tcpi_rcv_wscale = tp->rcv_scale;
 	}
 	if (tp->t_flags2 & (TF2_ECN_PERMIT | TF2_ACE_PERMIT))
 		ti->tcpi_options |= TCPI_OPT_ECN;
 
 	ti->tcpi_rto = tp->t_rxtcur * tick;
 	ti->tcpi_last_data_recv = ((uint32_t)ticks - tp->t_rcvtime) * tick;
 	ti->tcpi_rtt = ((u_int64_t)tp->t_srtt * tick) >> TCP_RTT_SHIFT;
 	ti->tcpi_rttvar = ((u_int64_t)tp->t_rttvar * tick) >> TCP_RTTVAR_SHIFT;
 
 	ti->tcpi_snd_ssthresh = tp->snd_ssthresh;
 	ti->tcpi_snd_cwnd = tp->snd_cwnd;
 
 	/*
 	 * FreeBSD-specific extension fields for tcp_info.
 	 */
 	ti->tcpi_rcv_space = tp->rcv_wnd;
 	ti->tcpi_rcv_nxt = tp->rcv_nxt;
 	ti->tcpi_snd_wnd = tp->snd_wnd;
 	ti->tcpi_snd_bwnd = 0;		/* Unused, kept for compat. */
 	ti->tcpi_snd_nxt = tp->snd_nxt;
 	ti->tcpi_snd_mss = tp->t_maxseg;
 	ti->tcpi_rcv_mss = tp->t_maxseg;
 	ti->tcpi_snd_rexmitpack = tp->t_sndrexmitpack;
 	ti->tcpi_rcv_ooopack = tp->t_rcvoopack;
 	ti->tcpi_snd_zerowin = tp->t_sndzerowin;
 #ifdef TCP_OFFLOAD
 	if (tp->t_flags & TF_TOE) {
 		ti->tcpi_options |= TCPI_OPT_TOE;
 		tcp_offload_tcp_info(tp, ti);
 	}
 #endif
 }
 
 /*
  * tcp_ctloutput() must drop the inpcb lock before performing copyin on
  * socket option arguments.  When it re-acquires the lock after the copy, it
  * has to revalidate that the connection is still valid for the socket
  * option.
  *
  * INP_WLOCK_RECHECK_CLEANUP(inp, cleanup) write-locks 'inp'.  If the inpcb
  * has been dropped in the meantime it unlocks again, executes 'cleanup',
  * and RETURNS ECONNRESET FROM THE ENCLOSING FUNCTION.  Otherwise it leaves
  * the lock held and reloads the caller's local variable 'tp' from 'inp'.
  * The macro therefore requires a variable named 'tp' in scope and may only
  * be used in functions returning int.
  *
  * INP_WLOCK_RECHECK(inp) is the same with no cleanup statement.
  */
 #define INP_WLOCK_RECHECK_CLEANUP(inp, cleanup) do {			\
 	INP_WLOCK(inp);							\
-	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {		\
+	if (inp->inp_flags & INP_DROPPED) {				\
 		INP_WUNLOCK(inp);					\
 		cleanup;						\
 		return (ECONNRESET);					\
 	}								\
 	tp = intotcpcb(inp);						\
 } while(0)
 #define INP_WLOCK_RECHECK(inp) INP_WLOCK_RECHECK_CLEANUP((inp), /* noop */)
 
 /*
  * Handle the SOPT_SET direction of a TCP socket option request.
  *
  * Entered with the inpcb write-locked and not dropped (both asserted).
  * Every return path here has released the lock; the final path hands the
  * still-locked inpcb to the stack's tfb_tcp_ctloutput method, which must
  * unlock it.
  *
  * Three cases:
  *  - level != IPPROTO_TCP: forward to ip_ctloutput()/ip6_ctloutput(),
  *    then, only for IP_TOS, IP_TTL, IPV6_TCLASS and IPV6_USE_MIN_MTU,
  *    re-lock and also pass the request to the TCP stack;
  *  - TCP_FUNCTION_BLK: switch the connection to another TCP stack,
  *    handled here so that a stack can never override it;
  *  - anything else: pass straight to the TCP stack.
  */
 int
 tcp_ctloutput_set(struct inpcb *inp, struct sockopt *sopt)
 {
 	struct socket *so = inp->inp_socket;
 	struct tcpcb *tp = intotcpcb(inp);
 	int error = 0;
 
 	MPASS(sopt->sopt_dir == SOPT_SET);
 	INP_WLOCK_ASSERT(inp);
-	KASSERT((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) == 0,
+	KASSERT((inp->inp_flags & INP_DROPPED) == 0,
 	    ("inp_flags == %x", inp->inp_flags));
 	KASSERT(so != NULL, ("inp_socket == NULL"));
 
 	if (sopt->sopt_level != IPPROTO_TCP) {
 		/* The IP-level handlers are called without the inpcb lock. */
 		INP_WUNLOCK(inp);
 #ifdef INET6
 		if (inp->inp_vflag & INP_IPV6PROTO)
 			error = ip6_ctloutput(so, sopt);
 #endif
 #if defined(INET6) && defined(INET)
 		else
 #endif
 #ifdef INET
 			error = ip_ctloutput(so, sopt);
 #endif
 		/*
 		 * When an IP-level socket option affects TCP, pass control
 		 * down to stack tfb_tcp_ctloutput, otherwise return what
 		 * IP level returned.
 		 */
 		switch (sopt->sopt_level) {
 #ifdef INET6
 		case IPPROTO_IPV6:
 			if ((inp->inp_vflag & INP_IPV6PROTO) == 0)
 				return (error);
 			switch (sopt->sopt_name) {
 			case IPV6_TCLASS:
 				/* Notify tcp stacks that care (e.g. RACK). */
 				break;
 			case IPV6_USE_MIN_MTU:
 				/* Update t_maxseg accordingly. */
 				break;
 			default:
 				return (error);
 			}
 			break;
 #endif
 #ifdef INET
 		case IPPROTO_IP:
 			switch (sopt->sopt_name) {
 			case IP_TOS:
 				/*
 				 * NOTE(review): inp_ip_tos is modified here
 				 * while the inpcb lock is not held (it is
 				 * re-taken only after this switch) --
 				 * confirm this is intended.
 				 */
 				inp->inp_ip_tos &= ~IPTOS_ECN_MASK;
 				break;
 			case IP_TTL:
 				/* Notify tcp stacks that care (e.g. RACK). */
 				break;
 			default:
 				return (error);
 			}
 			break;
 #endif
 		default:
 			return (error);
 		}
 		/*
 		 * Re-lock and revalidate before falling through to the
 		 * stack's handler at the bottom.  Note that 'error' from the
 		 * IP level is not checked on this path.
 		 */
 		INP_WLOCK(inp);
-		if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
+		if (inp->inp_flags & INP_DROPPED) {
 			INP_WUNLOCK(inp);
 			return (ECONNRESET);
 		}
 	} else if (sopt->sopt_name == TCP_FUNCTION_BLK) {
 		/*
 		 * Protect the TCP option TCP_FUNCTION_BLK so
 		 * that a sub-function can *never* overwrite this.
 		 */
 		struct tcp_function_set fsn;
 		struct tcp_function_block *blk;
 
 		/* Copy in unlocked, then re-lock and revalidate. */
 		INP_WUNLOCK(inp);
 		error = sooptcopyin(sopt, &fsn, sizeof fsn, sizeof fsn);
 		if (error)
 			return (error);
 
 		INP_WLOCK(inp);
-		if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
+		if (inp->inp_flags & INP_DROPPED) {
 			INP_WUNLOCK(inp);
 			return (ECONNRESET);
 		}
 		tp = intotcpcb(inp);
 
 		/*
 		 * From here on 'blk' carries a reference that must be
 		 * released on every path that does not install it.
 		 */
 		blk = find_and_ref_tcp_functions(&fsn);
 		if (blk == NULL) {
 			INP_WUNLOCK(inp);
 			return (ENOENT);
 		}
 		if (tp->t_fb == blk) {
 			/* You already have this */
 			refcount_release(&blk->tfb_refcnt);
 			INP_WUNLOCK(inp);
 			return (0);
 		}
 		if (tp->t_state != TCPS_CLOSED) {
 			/*
 			 * The user has advanced the state
 			 * past the initial point, we may not
 			 * be able to switch.
 			 */
 			if (blk->tfb_tcp_handoff_ok != NULL) {
 				/*
 				 * Does the stack provide a
 				 * query mechanism, if so it may
 				 * still be possible?
 				 */
 				error = (*blk->tfb_tcp_handoff_ok)(tp);
 			} else
 				error = EINVAL;
 			if (error) {
 				refcount_release(&blk->tfb_refcnt);
 				INP_WUNLOCK(inp);
 				return(error);
 			}
 		}
 		if (blk->tfb_flags & TCP_FUNC_BEING_REMOVED) {
 			refcount_release(&blk->tfb_refcnt);
 			INP_WUNLOCK(inp);
 			return (ENOENT);
 		}
 		/*
 		 * Release the old refcnt, the
 		 * lookup acquired a ref on the
 		 * new one already.
 		 */
 		if (tp->t_fb->tfb_tcp_fb_fini) {
 			struct epoch_tracker et;
 			/*
 			 * Tell the stack to cleanup with 0 i.e.
 			 * the tcb is not going away.
 			 */
 			NET_EPOCH_ENTER(et);
 			(*tp->t_fb->tfb_tcp_fb_fini)(tp, 0);
 			NET_EPOCH_EXIT(et);
 		}
 #ifdef TCPHPTS
 		/* Assure that we are not on any hpts */
 		tcp_hpts_remove(tp->t_inpcb);
 #endif
 		if (blk->tfb_tcp_fb_init) {
 			error = (*blk->tfb_tcp_fb_init)(tp);
 			if (error) {
 				/*
 				 * The new stack refused the connection: drop
 				 * its reference and re-initialize the old
 				 * stack (tp->t_fb is still the old one).
 				 */
 				refcount_release(&blk->tfb_refcnt);
 				if (tp->t_fb->tfb_tcp_fb_init) {
 					if((*tp->t_fb->tfb_tcp_fb_init)(tp) != 0)  {
 						/* Fall back failed, drop the connection */
 						INP_WUNLOCK(inp);
 						soabort(so);
 						return (error);
 					}
 				}
 				goto err_out;
 			}
 		}
 		/* Commit: drop the old stack's reference, install the new. */
 		refcount_release(&tp->t_fb->tfb_refcnt);
 		tp->t_fb = blk;
 #ifdef TCP_OFFLOAD
 		if (tp->t_flags & TF_TOE) {
 			tcp_offload_ctloutput(tp, sopt->sopt_dir,
 			     sopt->sopt_name);
 		}
 #endif
 err_out:
 		/* Reached on success (error == 0) and on fall-back. */
 		INP_WUNLOCK(inp);
 		return (error);
 	}
 
 	/* Pass in the INP locked, callee must unlock it. */
 	return (tp->t_fb->tfb_tcp_ctloutput(inp, sopt));
 }
 
 /*
  * Handle the SOPT_GET direction of a TCP socket option request.
  *
  * Entered with the inpcb write-locked and not dropped (both asserted).
  * Non-TCP levels are forwarded to ip_ctloutput()/ip6_ctloutput() after
  * unlocking.  TCP_FUNCTION_BLK and TCP_FUNCTION_ALIAS are answered here
  * from the connection's current stack (tp->t_fb).  Everything else is
  * passed, still locked, to the stack's tfb_tcp_ctloutput method, which
  * must unlock the inpcb.
  */
 static int
 tcp_ctloutput_get(struct inpcb *inp, struct sockopt *sopt)
 {
 	struct socket *so = inp->inp_socket;
 	struct tcpcb *tp = intotcpcb(inp);
 	int error = 0;
 
 	MPASS(sopt->sopt_dir == SOPT_GET);
 	INP_WLOCK_ASSERT(inp);
-	KASSERT((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) == 0,
+	KASSERT((inp->inp_flags & INP_DROPPED) == 0,
 	    ("inp_flags == %x", inp->inp_flags));
 	KASSERT(so != NULL, ("inp_socket == NULL"));
 
 	if (sopt->sopt_level != IPPROTO_TCP) {
 		/* The IP-level handlers are called without the inpcb lock. */
 		INP_WUNLOCK(inp);
 #ifdef INET6
 		if (inp->inp_vflag & INP_IPV6PROTO)
 			error = ip6_ctloutput(so, sopt);
 #endif /* INET6 */
 #if defined(INET6) && defined(INET)
 		else
 #endif
 #ifdef INET
 			error = ip_ctloutput(so, sopt);
 #endif
 		return (error);
 	}
 	if (((sopt->sopt_name == TCP_FUNCTION_BLK) ||
 	     (sopt->sopt_name == TCP_FUNCTION_ALIAS))) {
 		struct tcp_function_set fsn;
 
 		/* Fill 'fsn' under the lock; copy it out after unlocking. */
 		if (sopt->sopt_name == TCP_FUNCTION_ALIAS) {
 			memset(&fsn, 0, sizeof(fsn));
 			find_tcp_function_alias(tp->t_fb, &fsn);
 		} else {
 			/*
 			 * strncpy() may leave the name unterminated, so the
 			 * last byte is forced to NUL afterwards.
 			 */
 			strncpy(fsn.function_set_name,
 			    tp->t_fb->tfb_tcp_block_name,
 			    TCP_FUNCTION_NAME_LEN_MAX);
 			fsn.function_set_name[TCP_FUNCTION_NAME_LEN_MAX - 1] = '\0';
 		}
 		fsn.pcbcnt = tp->t_fb->tfb_refcnt;
 		INP_WUNLOCK(inp);
 		error = sooptcopyout(sopt, &fsn, sizeof fsn);
 		return (error);
 	}
 
 	/* Pass in the INP locked, callee must unlock it. */
 	return (tp->t_fb->tfb_tcp_ctloutput(inp, sopt));
 }
 
 /*
  * Socket option entry point for TCP (installed as pr_ctloutput).
  *
  * Write-locks the inpcb, returns ECONNRESET if it is already dropped,
  * and otherwise dispatches on sopt->sopt_dir to tcp_ctloutput_set() or
  * tcp_ctloutput_get().  Those are entered with the lock held and are
  * responsible for releasing it.  Any other direction panics.
  */
 int
 tcp_ctloutput(struct socket *so, struct sockopt *sopt)
 {
 	struct	inpcb *inp;
 
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("tcp_ctloutput: inp == NULL"));
 
 	INP_WLOCK(inp);
-	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
+	if (inp->inp_flags & INP_DROPPED) {
 		INP_WUNLOCK(inp);
 		return (ECONNRESET);
 	}
 	if (sopt->sopt_dir == SOPT_SET)
 		return (tcp_ctloutput_set(inp, sopt));
 	else if (sopt->sopt_dir == SOPT_GET)
 		return (tcp_ctloutput_get(inp, sopt));
 	else
 		panic("%s: sopt_dir $%d", __func__, sopt->sopt_dir);
 }
 
 /*
  * If this assert becomes untrue, we need to change the size of the buf
  * variable in tcp_default_ctloutput().
  */
 #ifdef CTASSERT
 CTASSERT(TCP_CA_NAME_MAX <= TCP_LOG_ID_LEN);
 CTASSERT(TCP_LOG_REASON_LEN <= TCP_LOG_ID_LEN);
 #endif
 
 #ifdef KERN_TLS
 static int
 copyin_tls_enable(struct sockopt *sopt, struct tls_enable *tls)
 {
 	struct tls_enable_v0 tls_v0;
 	int error;
 
 	if (sopt->sopt_valsize == sizeof(tls_v0)) {
 		error = sooptcopyin(sopt, &tls_v0, sizeof(tls_v0),
 		    sizeof(tls_v0));
 		if (error)
 			return (error);
 		memset(tls, 0, sizeof(*tls));
 		tls->cipher_key = tls_v0.cipher_key;
 		tls->iv = tls_v0.iv;
 		tls->auth_key = tls_v0.auth_key;
 		tls->cipher_algorithm = tls_v0.cipher_algorithm;
 		tls->cipher_key_len = tls_v0.cipher_key_len;
 		tls->iv_len = tls_v0.iv_len;
 		tls->auth_algorithm = tls_v0.auth_algorithm;
 		tls->auth_key_len = tls_v0.auth_key_len;
 		tls->flags = tls_v0.flags;
 		tls->tls_vmajor = tls_v0.tls_vmajor;
 		tls->tls_vminor = tls_v0.tls_vminor;
 		return (0);
 	}
 
 	return (sooptcopyin(sopt, tls, sizeof(*tls), sizeof(*tls)));
 }
 #endif
 
 extern struct cc_algo newreno_cc_algo;
 
 /*
  * Switch the connection's congestion control algorithm to the one named
  * in the socket option value (called for TCP_CONGESTION).
  *
  * Entered with the inpcb write-locked; the lock is dropped for the
  * copyin, the algorithm lookup and the M_WAITOK allocation, re-taken to
  * perform the switch, and released again before returning.
  *
  * Returns the sooptcopyin() error, ESRCH if no usable algorithm has that
  * name, ECONNRESET if the inpcb was dropped while unlocked, the new
  * algorithm's cb_init() error, or 0 on success.
  */
 static int
 tcp_set_cc_mod(struct inpcb *inp, struct sockopt *sopt)
 {
 	struct cc_algo *algo;
 	void *ptr = NULL;
 	struct tcpcb *tp;
 	struct cc_var cc_mem;
 	char	buf[TCP_CA_NAME_MAX];
 	size_t mem_sz;
 	int error;
 
 	INP_WUNLOCK(inp);
 	error = sooptcopyin(sopt, buf, TCP_CA_NAME_MAX - 1, 1);
 	if (error)
 		return(error);
 	/*
 	 * NOTE(review): indexing by sopt_valsize assumes sooptcopyin() has
 	 * clamped it to at most TCP_CA_NAME_MAX - 1 -- confirm.
 	 */
 	buf[sopt->sopt_valsize] = '\0';
 	/*
 	 * Look the name up in cc_list.  A matching entry that is being
 	 * removed is skipped; 'algo' ends up NULL when nothing usable
 	 * was found.
 	 */
 	CC_LIST_RLOCK();
 	STAILQ_FOREACH(algo, &cc_list, entries) {
 		if (strncmp(buf, algo->name,
 			    TCP_CA_NAME_MAX) == 0) {
 			if (algo->flags & CC_MODULE_BEING_REMOVED) {
 				/* We can't "see" modules being unloaded */
 				continue;
 			}
 			break;
 		}
 	}
 	if (algo == NULL) {
 		CC_LIST_RUNLOCK();
 		return(ESRCH);
 	}
 	/*
 	 * With a reference the algorithm cannot be removed
 	 * so we hold a reference through the change process.
 	 */
 	cc_refer(algo);
 	CC_LIST_RUNLOCK();
 	if (algo->cb_init != NULL) {
 		/* We can now pre-get the memory for the CC */
 		mem_sz = (*algo->cc_data_sz)();
 		if (mem_sz == 0) {
 			goto no_mem_needed;
 		}
 		ptr = malloc(mem_sz, M_CC_MEM, M_WAITOK);
 	} else {
 		/* Also entered by goto when cc_data_sz() reports 0 bytes. */
 no_mem_needed:
 		mem_sz = 0;
 		ptr = NULL;
 	}
 	/*
 	 * Make sure its all clean and zero and also get
 	 * back the inplock.
 	 */
 	memset(&cc_mem, 0, sizeof(cc_mem));
 	INP_WLOCK(inp);
-	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
+	if (inp->inp_flags & INP_DROPPED) {
 		/* Dropped while unlocked: undo the allocation and reference. */
 		INP_WUNLOCK(inp);
 		if (ptr)
 			free(ptr, M_CC_MEM);
 		/* Release our temp reference */
 		CC_LIST_RLOCK();
 		cc_release(algo);
 		CC_LIST_RUNLOCK();
 		return (ECONNRESET);
 	}
 	tp = intotcpcb(inp);
 	if (ptr != NULL)
 		memset(ptr, 0, mem_sz);
 	cc_mem.ccvc.tcp = tp;
 	/*
 	 * We once again hold a write lock over the tcb so it's
 	 * safe to do these things without ordering concerns.
 	 * Note here we init into stack memory.
 	 */
 	if (algo->cb_init != NULL)
 		error = algo->cb_init(&cc_mem, ptr);
 	else
 		error = 0;
 	/*
 	 * The CC algorithms, when given their memory
 	 * should not fail we could in theory have a
 	 * KASSERT here.
 	 */
 	if (error == 0) {
 		/*
 		 * Touchdown, lets go ahead and move the
 		 * connection to the new CC module by
 		 * copying in the cc_mem after we call
 		 * the old ones cleanup (if any).
 		 */
 		if (CC_ALGO(tp)->cb_destroy != NULL)
 			CC_ALGO(tp)->cb_destroy(tp->ccv);
 		/* Detach the old CC from the tcpcb  */
 		cc_detach(tp);
 		/* Copy in our temp memory that was inited */
 		memcpy(tp->ccv, &cc_mem, sizeof(struct cc_var));
 		/* Now attach the new, which takes a reference */
 		cc_attach(tp, algo);
 		/* Ok now are we where we have gotten past any conn_init? */
 		if (TCPS_HAVEESTABLISHED(tp->t_state) && (CC_ALGO(tp)->conn_init != NULL)) {
 			/* Yep run the connection init for the new CC */
 			CC_ALGO(tp)->conn_init(tp->ccv);
 		}
 	} else if (ptr)
 		/* cb_init() failed: the old CC stays; free the unused memory. */
 		free(ptr, M_CC_MEM);
 	INP_WUNLOCK(inp);
 	/* Now lets release our temp reference */
 	CC_LIST_RLOCK();
 	cc_release(algo);
 	CC_LIST_RUNLOCK();
 	return (error);
 }
 
 int
 tcp_default_ctloutput(struct inpcb *inp, struct sockopt *sopt)
 {
 	struct tcpcb *tp = intotcpcb(inp);
 	int	error, opt, optval;
 	u_int	ui;
 	struct	tcp_info ti;
 #ifdef KERN_TLS
 	struct tls_enable tls;
 	struct socket *so = inp->inp_socket;
 #endif
 	char	*pbuf, buf[TCP_LOG_ID_LEN];
 #ifdef STATS
 	struct statsblob *sbp;
 #endif
 	size_t	len;
 
 	INP_WLOCK_ASSERT(inp);
-	KASSERT((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) == 0,
+	KASSERT((inp->inp_flags & INP_DROPPED) == 0,
 	    ("inp_flags == %x", inp->inp_flags));
 	KASSERT(inp->inp_socket != NULL, ("inp_socket == NULL"));
 
 	switch (sopt->sopt_level) {
 #ifdef INET6
 	case IPPROTO_IPV6:
 		MPASS(inp->inp_vflag & INP_IPV6PROTO);
 		switch (sopt->sopt_name) {
 		case IPV6_USE_MIN_MTU:
 			tcp6_use_min_mtu(tp);
 			/* FALLTHROUGH */
 		}
 		INP_WUNLOCK(inp);
 		return (0);
 #endif
 #ifdef INET
 	case IPPROTO_IP:
 		INP_WUNLOCK(inp);
 		return (0);
 #endif
 	}
 
 	/*
 	 * For TCP_CCALGOOPT forward the control to CC module, for both
 	 * SOPT_SET and SOPT_GET.
 	 */
 	switch (sopt->sopt_name) {
 	case TCP_CCALGOOPT:
 		INP_WUNLOCK(inp);
 		if (sopt->sopt_valsize > CC_ALGOOPT_LIMIT)
 			return (EINVAL);
 		pbuf = malloc(sopt->sopt_valsize, M_TEMP, M_WAITOK | M_ZERO);
 		error = sooptcopyin(sopt, pbuf, sopt->sopt_valsize,
 		    sopt->sopt_valsize);
 		if (error) {
 			free(pbuf, M_TEMP);
 			return (error);
 		}
 		INP_WLOCK_RECHECK_CLEANUP(inp, free(pbuf, M_TEMP));
 		if (CC_ALGO(tp)->ctl_output != NULL)
 			error = CC_ALGO(tp)->ctl_output(tp->ccv, sopt, pbuf);
 		else
 			error = ENOENT;
 		INP_WUNLOCK(inp);
 		if (error == 0 && sopt->sopt_dir == SOPT_GET)
 			error = sooptcopyout(sopt, pbuf, sopt->sopt_valsize);
 		free(pbuf, M_TEMP);
 		return (error);
 	}
 
 	switch (sopt->sopt_dir) {
 	case SOPT_SET:
 		switch (sopt->sopt_name) {
 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
 		case TCP_MD5SIG:
 			INP_WUNLOCK(inp);
 			if (!TCPMD5_ENABLED())
 				return (ENOPROTOOPT);
 			error = TCPMD5_PCBCTL(inp, sopt);
 			if (error)
 				return (error);
 			INP_WLOCK_RECHECK(inp);
 			goto unlock_and_done;
 #endif /* IPSEC */
 
 		case TCP_NODELAY:
 		case TCP_NOOPT:
 		case TCP_LRD:
 			INP_WUNLOCK(inp);
 			error = sooptcopyin(sopt, &optval, sizeof optval,
 			    sizeof optval);
 			if (error)
 				return (error);
 
 			INP_WLOCK_RECHECK(inp);
 			switch (sopt->sopt_name) {
 			case TCP_NODELAY:
 				opt = TF_NODELAY;
 				break;
 			case TCP_NOOPT:
 				opt = TF_NOOPT;
 				break;
 			case TCP_LRD:
 				opt = TF_LRD;
 				break;
 			default:
 				opt = 0; /* dead code to fool gcc */
 				break;
 			}
 
 			if (optval)
 				tp->t_flags |= opt;
 			else
 				tp->t_flags &= ~opt;
 unlock_and_done:
 #ifdef TCP_OFFLOAD
 			if (tp->t_flags & TF_TOE) {
 				tcp_offload_ctloutput(tp, sopt->sopt_dir,
 				    sopt->sopt_name);
 			}
 #endif
 			INP_WUNLOCK(inp);
 			break;
 
 		case TCP_NOPUSH:
 			INP_WUNLOCK(inp);
 			error = sooptcopyin(sopt, &optval, sizeof optval,
 			    sizeof optval);
 			if (error)
 				return (error);
 
 			INP_WLOCK_RECHECK(inp);
 			if (optval)
 				tp->t_flags |= TF_NOPUSH;
 			else if (tp->t_flags & TF_NOPUSH) {
 				tp->t_flags &= ~TF_NOPUSH;
 				if (TCPS_HAVEESTABLISHED(tp->t_state)) {
 					struct epoch_tracker et;
 
 					NET_EPOCH_ENTER(et);
 					error = tcp_output_nodrop(tp);
 					NET_EPOCH_EXIT(et);
 				}
 			}
 			goto unlock_and_done;
 
 		case TCP_REMOTE_UDP_ENCAPS_PORT:
 			INP_WUNLOCK(inp);
 			error = sooptcopyin(sopt, &optval, sizeof optval,
 			    sizeof optval);
 			if (error)
 				return (error);
 			if ((optval < TCP_TUNNELING_PORT_MIN) ||
 			    (optval > TCP_TUNNELING_PORT_MAX)) {
 				/* Its got to be in range */
 				return (EINVAL);
 			}
 			if ((V_tcp_udp_tunneling_port == 0) && (optval != 0)) {
 				/* You have to have enabled a UDP tunneling port first */
 				return (EINVAL);
 			}
 			INP_WLOCK_RECHECK(inp);
 			if (tp->t_state != TCPS_CLOSED) {
 				/* You can't change after you are connected */
 				error = EINVAL;
 			} else {
 				/* Ok we are all good set the port */
 				tp->t_port = htons(optval);
 			}
 			goto unlock_and_done;
 
 		case TCP_MAXSEG:
 			INP_WUNLOCK(inp);
 			error = sooptcopyin(sopt, &optval, sizeof optval,
 			    sizeof optval);
 			if (error)
 				return (error);
 
 			INP_WLOCK_RECHECK(inp);
 			if (optval > 0 && optval <= tp->t_maxseg &&
 			    optval + 40 >= V_tcp_minmss)
 				tp->t_maxseg = optval;
 			else
 				error = EINVAL;
 			goto unlock_and_done;
 
 		case TCP_INFO:
 			INP_WUNLOCK(inp);
 			error = EINVAL;
 			break;
 
 		case TCP_STATS:
 			INP_WUNLOCK(inp);
 #ifdef STATS
 			error = sooptcopyin(sopt, &optval, sizeof optval,
 			    sizeof optval);
 			if (error)
 				return (error);
 
 			if (optval > 0)
 				sbp = stats_blob_alloc(
 				    V_tcp_perconn_stats_dflt_tpl, 0);
 			else
 				sbp = NULL;
 
 			INP_WLOCK_RECHECK(inp);
 			if ((tp->t_stats != NULL && sbp == NULL) ||
 			    (tp->t_stats == NULL && sbp != NULL)) {
 				struct statsblob *t = tp->t_stats;
 				tp->t_stats = sbp;
 				sbp = t;
 			}
 			INP_WUNLOCK(inp);
 
 			stats_blob_destroy(sbp);
 #else
 			return (EOPNOTSUPP);
 #endif /* !STATS */
 			break;
 
 		case TCP_CONGESTION:
 			error = tcp_set_cc_mod(inp, sopt);
 			break;
 
 		case TCP_REUSPORT_LB_NUMA:
 			INP_WUNLOCK(inp);
 			error = sooptcopyin(sopt, &optval, sizeof(optval),
 			    sizeof(optval));
 			INP_WLOCK_RECHECK(inp);
 			if (!error)
 				error = in_pcblbgroup_numa(inp, optval);
 			INP_WUNLOCK(inp);
 			break;
 
 #ifdef KERN_TLS
 		case TCP_TXTLS_ENABLE:
 			INP_WUNLOCK(inp);
 			error = copyin_tls_enable(sopt, &tls);
 			if (error)
 				break;
 			error = ktls_enable_tx(so, &tls);
 			break;
 		case TCP_TXTLS_MODE:
 			INP_WUNLOCK(inp);
 			error = sooptcopyin(sopt, &ui, sizeof(ui), sizeof(ui));
 			if (error)
 				return (error);
 
 			INP_WLOCK_RECHECK(inp);
 			error = ktls_set_tx_mode(so, ui);
 			INP_WUNLOCK(inp);
 			break;
 		case TCP_RXTLS_ENABLE:
 			INP_WUNLOCK(inp);
 			error = sooptcopyin(sopt, &tls, sizeof(tls),
 			    sizeof(tls));
 			if (error)
 				break;
 			error = ktls_enable_rx(so, &tls);
 			break;
 #endif
 		case TCP_MAXUNACKTIME:
 		case TCP_KEEPIDLE:
 		case TCP_KEEPINTVL:
 		case TCP_KEEPINIT:
 			INP_WUNLOCK(inp);
 			error = sooptcopyin(sopt, &ui, sizeof(ui), sizeof(ui));
 			if (error)
 				return (error);
 
 			if (ui > (UINT_MAX / hz)) {
 				error = EINVAL;
 				break;
 			}
 			ui *= hz;
 
 			INP_WLOCK_RECHECK(inp);
 			switch (sopt->sopt_name) {
 			case TCP_MAXUNACKTIME:
 				tp->t_maxunacktime = ui;
 				break;
 
 			case TCP_KEEPIDLE:
 				tp->t_keepidle = ui;
 				/*
 				 * XXX: better check current remaining
 				 * timeout and "merge" it with new value.
 				 */
 				if ((tp->t_state > TCPS_LISTEN) &&
 				    (tp->t_state <= TCPS_CLOSING))
 					tcp_timer_activate(tp, TT_KEEP,
 					    TP_KEEPIDLE(tp));
 				break;
 			case TCP_KEEPINTVL:
 				tp->t_keepintvl = ui;
 				if ((tp->t_state == TCPS_FIN_WAIT_2) &&
 				    (TP_MAXIDLE(tp) > 0))
 					tcp_timer_activate(tp, TT_2MSL,
 					    TP_MAXIDLE(tp));
 				break;
 			case TCP_KEEPINIT:
 				tp->t_keepinit = ui;
 				if (tp->t_state == TCPS_SYN_RECEIVED ||
 				    tp->t_state == TCPS_SYN_SENT)
 					tcp_timer_activate(tp, TT_KEEP,
 					    TP_KEEPINIT(tp));
 				break;
 			}
 			goto unlock_and_done;
 
 		case TCP_KEEPCNT:
 			INP_WUNLOCK(inp);
 			error = sooptcopyin(sopt, &ui, sizeof(ui), sizeof(ui));
 			if (error)
 				return (error);
 
 			INP_WLOCK_RECHECK(inp);
 			tp->t_keepcnt = ui;
 			if ((tp->t_state == TCPS_FIN_WAIT_2) &&
 			    (TP_MAXIDLE(tp) > 0))
 				tcp_timer_activate(tp, TT_2MSL,
 				    TP_MAXIDLE(tp));
 			goto unlock_and_done;
 
 #ifdef TCPPCAP
 		case TCP_PCAP_OUT:
 		case TCP_PCAP_IN:
 			INP_WUNLOCK(inp);
 			error = sooptcopyin(sopt, &optval, sizeof optval,
 			    sizeof optval);
 			if (error)
 				return (error);
 
 			INP_WLOCK_RECHECK(inp);
 			if (optval >= 0)
 				tcp_pcap_set_sock_max(TCP_PCAP_OUT ?
 					&(tp->t_outpkts) : &(tp->t_inpkts),
 					optval);
 			else
 				error = EINVAL;
 			goto unlock_and_done;
 #endif
 
 		case TCP_FASTOPEN: {
 			struct tcp_fastopen tfo_optval;
 
 			INP_WUNLOCK(inp);
 			if (!V_tcp_fastopen_client_enable &&
 			    !V_tcp_fastopen_server_enable)
 				return (EPERM);
 
 			error = sooptcopyin(sopt, &tfo_optval,
 				    sizeof(tfo_optval), sizeof(int));
 			if (error)
 				return (error);
 
 			INP_WLOCK_RECHECK(inp);
 			if ((tp->t_state != TCPS_CLOSED) &&
 			    (tp->t_state != TCPS_LISTEN)) {
 				error = EINVAL;
 				goto unlock_and_done;
 			}
 			if (tfo_optval.enable) {
 				if (tp->t_state == TCPS_LISTEN) {
 					if (!V_tcp_fastopen_server_enable) {
 						error = EPERM;
 						goto unlock_and_done;
 					}
 
 					if (tp->t_tfo_pending == NULL)
 						tp->t_tfo_pending =
 						    tcp_fastopen_alloc_counter();
 				} else {
 					/*
 					 * If a pre-shared key was provided,
 					 * stash it in the client cookie
 					 * field of the tcpcb for use during
 					 * connect.
 					 */
 					if (sopt->sopt_valsize ==
 					    sizeof(tfo_optval)) {
 						memcpy(tp->t_tfo_cookie.client,
 						       tfo_optval.psk,
 						       TCP_FASTOPEN_PSK_LEN);
 						tp->t_tfo_client_cookie_len =
 						    TCP_FASTOPEN_PSK_LEN;
 					}
 				}
 				tp->t_flags |= TF_FASTOPEN;
 			} else
 				tp->t_flags &= ~TF_FASTOPEN;
 			goto unlock_and_done;
 		}
 
 #ifdef TCP_BLACKBOX
 		case TCP_LOG:
 			INP_WUNLOCK(inp);
 			error = sooptcopyin(sopt, &optval, sizeof optval,
 			    sizeof optval);
 			if (error)
 				return (error);
 
 			INP_WLOCK_RECHECK(inp);
 			error = tcp_log_state_change(tp, optval);
 			goto unlock_and_done;
 
 		case TCP_LOGBUF:
 			INP_WUNLOCK(inp);
 			error = EINVAL;
 			break;
 
 		case TCP_LOGID:
 			INP_WUNLOCK(inp);
 			error = sooptcopyin(sopt, buf, TCP_LOG_ID_LEN - 1, 0);
 			if (error)
 				break;
 			buf[sopt->sopt_valsize] = '\0';
 			INP_WLOCK_RECHECK(inp);
 			error = tcp_log_set_id(tp, buf);
 			/* tcp_log_set_id() unlocks the INP. */
 			break;
 
 		case TCP_LOGDUMP:
 		case TCP_LOGDUMPID:
 			INP_WUNLOCK(inp);
 			error =
 			    sooptcopyin(sopt, buf, TCP_LOG_REASON_LEN - 1, 0);
 			if (error)
 				break;
 			buf[sopt->sopt_valsize] = '\0';
 			INP_WLOCK_RECHECK(inp);
 			if (sopt->sopt_name == TCP_LOGDUMP) {
 				error = tcp_log_dump_tp_logbuf(tp, buf,
 				    M_WAITOK, true);
 				INP_WUNLOCK(inp);
 			} else {
 				tcp_log_dump_tp_bucket_logbufs(tp, buf);
 				/*
 				 * tcp_log_dump_tp_bucket_logbufs() drops the
 				 * INP lock.
 				 */
 			}
 			break;
 #endif
 
 		default:
 			INP_WUNLOCK(inp);
 			error = ENOPROTOOPT;
 			break;
 		}
 		break;
 
 	case SOPT_GET:
 		tp = intotcpcb(inp);
 		switch (sopt->sopt_name) {
 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
 		case TCP_MD5SIG:
 			INP_WUNLOCK(inp);
 			if (!TCPMD5_ENABLED())
 				return (ENOPROTOOPT);
 			error = TCPMD5_PCBCTL(inp, sopt);
 			break;
 #endif
 
 		case TCP_NODELAY:
 			optval = tp->t_flags & TF_NODELAY;
 			INP_WUNLOCK(inp);
 			error = sooptcopyout(sopt, &optval, sizeof optval);
 			break;
 		case TCP_MAXSEG:
 			optval = tp->t_maxseg;
 			INP_WUNLOCK(inp);
 			error = sooptcopyout(sopt, &optval, sizeof optval);
 			break;
 		case TCP_REMOTE_UDP_ENCAPS_PORT:
 			optval = ntohs(tp->t_port);
 			INP_WUNLOCK(inp);
 			error = sooptcopyout(sopt, &optval, sizeof optval);
 			break;
 		case TCP_NOOPT:
 			optval = tp->t_flags & TF_NOOPT;
 			INP_WUNLOCK(inp);
 			error = sooptcopyout(sopt, &optval, sizeof optval);
 			break;
 		case TCP_NOPUSH:
 			optval = tp->t_flags & TF_NOPUSH;
 			INP_WUNLOCK(inp);
 			error = sooptcopyout(sopt, &optval, sizeof optval);
 			break;
 		case TCP_INFO:
 			tcp_fill_info(tp, &ti);
 			INP_WUNLOCK(inp);
 			error = sooptcopyout(sopt, &ti, sizeof ti);
 			break;
 		case TCP_STATS:
 			{
 #ifdef STATS
 			int nheld;
 			TYPEOF_MEMBER(struct statsblob, flags) sbflags = 0;
 
 			error = 0;
 			socklen_t outsbsz = sopt->sopt_valsize;
 			if (tp->t_stats == NULL)
 				error = ENOENT;
 			else if (outsbsz >= tp->t_stats->cursz)
 				outsbsz = tp->t_stats->cursz;
 			else if (outsbsz >= sizeof(struct statsblob))
 				outsbsz = sizeof(struct statsblob);
 			else
 				error = EINVAL;
 			INP_WUNLOCK(inp);
 			if (error)
 				break;
 
 			sbp = sopt->sopt_val;
 			nheld = atop(round_page(((vm_offset_t)sbp) +
 			    (vm_size_t)outsbsz) - trunc_page((vm_offset_t)sbp));
 			vm_page_t ma[nheld];
 			if (vm_fault_quick_hold_pages(
 			    &curproc->p_vmspace->vm_map, (vm_offset_t)sbp,
 			    outsbsz, VM_PROT_READ | VM_PROT_WRITE, ma,
 			    nheld) < 0) {
 				error = EFAULT;
 				break;
 			}
 
 			if ((error = copyin_nofault(&(sbp->flags), &sbflags,
 			    SIZEOF_MEMBER(struct statsblob, flags))))
 				goto unhold;
 
 			INP_WLOCK_RECHECK(inp);
 			error = stats_blob_snapshot(&sbp, outsbsz, tp->t_stats,
 			    sbflags | SB_CLONE_USRDSTNOFAULT);
 			INP_WUNLOCK(inp);
 			sopt->sopt_valsize = outsbsz;
 unhold:
 			vm_page_unhold_pages(ma, nheld);
 #else
 			INP_WUNLOCK(inp);
 			error = EOPNOTSUPP;
 #endif /* !STATS */
 			break;
 			}
 		case TCP_CONGESTION:
 			len = strlcpy(buf, CC_ALGO(tp)->name, TCP_CA_NAME_MAX);
 			INP_WUNLOCK(inp);
 			error = sooptcopyout(sopt, buf, len + 1);
 			break;
 		case TCP_MAXUNACKTIME:
 		case TCP_KEEPIDLE:
 		case TCP_KEEPINTVL:
 		case TCP_KEEPINIT:
 		case TCP_KEEPCNT:
 			switch (sopt->sopt_name) {
 			case TCP_MAXUNACKTIME:
 				ui = TP_MAXUNACKTIME(tp) / hz;
 				break;
 			case TCP_KEEPIDLE:
 				ui = TP_KEEPIDLE(tp) / hz;
 				break;
 			case TCP_KEEPINTVL:
 				ui = TP_KEEPINTVL(tp) / hz;
 				break;
 			case TCP_KEEPINIT:
 				ui = TP_KEEPINIT(tp) / hz;
 				break;
 			case TCP_KEEPCNT:
 				ui = TP_KEEPCNT(tp);
 				break;
 			}
 			INP_WUNLOCK(inp);
 			error = sooptcopyout(sopt, &ui, sizeof(ui));
 			break;
 #ifdef TCPPCAP
 		case TCP_PCAP_OUT:
 		case TCP_PCAP_IN:
 			optval = tcp_pcap_get_sock_max(TCP_PCAP_OUT ?
 					&(tp->t_outpkts) : &(tp->t_inpkts));
 			INP_WUNLOCK(inp);
 			error = sooptcopyout(sopt, &optval, sizeof optval);
 			break;
 #endif
 		case TCP_FASTOPEN:
 			optval = tp->t_flags & TF_FASTOPEN;
 			INP_WUNLOCK(inp);
 			error = sooptcopyout(sopt, &optval, sizeof optval);
 			break;
 #ifdef TCP_BLACKBOX
 		case TCP_LOG:
 			optval = tp->t_logstate;
 			INP_WUNLOCK(inp);
 			error = sooptcopyout(sopt, &optval, sizeof(optval));
 			break;
 		case TCP_LOGBUF:
 			/* tcp_log_getlogbuf() does INP_WUNLOCK(inp) */
 			error = tcp_log_getlogbuf(sopt, tp);
 			break;
 		case TCP_LOGID:
 			len = tcp_log_get_id(tp, buf);
 			INP_WUNLOCK(inp);
 			error = sooptcopyout(sopt, buf, len + 1);
 			break;
 		case TCP_LOGDUMP:
 		case TCP_LOGDUMPID:
 			INP_WUNLOCK(inp);
 			error = EINVAL;
 			break;
 #endif
 #ifdef KERN_TLS
 		case TCP_TXTLS_MODE:
 			error = ktls_get_tx_mode(so, &optval);
 			INP_WUNLOCK(inp);
 			if (error == 0)
 				error = sooptcopyout(sopt, &optval,
 				    sizeof(optval));
 			break;
 		case TCP_RXTLS_MODE:
 			error = ktls_get_rx_mode(so, &optval);
 			INP_WUNLOCK(inp);
 			if (error == 0)
 				error = sooptcopyout(sopt, &optval,
 				    sizeof(optval));
 			break;
 #endif
 		case TCP_LRD:
 			optval = tp->t_flags & TF_LRD;
 			INP_WUNLOCK(inp);
 			error = sooptcopyout(sopt, &optval, sizeof optval);
 			break;
 		default:
 			INP_WUNLOCK(inp);
 			error = ENOPROTOOPT;
 			break;
 		}
 		break;
 	}
 	return (error);
 }
 #undef INP_WLOCK_RECHECK
 #undef INP_WLOCK_RECHECK_CLEANUP
 
 /*
  * Initiate (or continue) disconnect.
  * If embryonic state, just send reset (once).
  * If in ``let data drain'' option and linger null, just drop.
  * Otherwise (hard), mark socket disconnecting and drop
  * current input data; switch states based on user close, and
  * send segment to peer (with FIN).
  */
 static void
 tcp_disconnect(struct tcpcb *tp)
 {
 	struct inpcb *inp;
 	struct socket *so;
 
 	inp = tp->t_inpcb;
 	so = inp->inp_socket;
 
 	NET_EPOCH_ASSERT();
 	INP_WLOCK_ASSERT(inp);
 
 	/*
 	 * The socket is still open, so tcp_close() and tcp_drop() are not
 	 * expected to return NULL in either branch below.
 	 */
 
 	/* Not yet established (and not a fast-open connection in progress). */
 	if (tp->t_state < TCPS_ESTABLISHED &&
 	    (tp->t_state <= TCPS_LISTEN || !IS_FASTOPEN(tp->t_flags))) {
 		tp = tcp_close(tp);
 		KASSERT(tp != NULL,
 		    ("tcp_disconnect: tcp_close() returned NULL"));
 		return;
 	}
 
 	/* SO_LINGER with a zero linger time: drop the connection outright. */
 	if ((so->so_options & SO_LINGER) && so->so_linger == 0) {
 		tp = tcp_drop(tp, 0);
 		KASSERT(tp != NULL,
 		    ("tcp_disconnect: tcp_drop() returned NULL"));
 		return;
 	}
 
 	/* Orderly close: discard pending input and walk the close states. */
 	soisdisconnecting(so);
 	sbflush(&so->so_rcv);
 	tcp_usrclosed(tp);
 	if ((inp->inp_flags & INP_DROPPED) == 0) {
 		/* The stack's drop request is ignored; we are closing anyway. */
 		(void)tcp_output_nodrop(tp);
 	}
 }
 
 /*
  * User issued close, and wish to trail through shutdown states:
  * if never received SYN, just forget it.  If got a SYN from peer,
  * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN.
  * If already got a FIN from peer, then almost done; go to LAST_ACK
  * state.  In all other cases, have already sent FIN to peer (e.g.
  * after PRU_SHUTDOWN), and just have to play tedious game waiting
  * for peer to send FIN or not respond to keep-alives, etc.
  * We can let the user exit from the close as soon as the FIN is acked.
  */
 static void
 tcp_usrclosed(struct tcpcb *tp)
 {
 
 	/* Caller must be in the network epoch and hold the inpcb write lock. */
 	NET_EPOCH_ASSERT();
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 
 	/* Advance the connection state according to where we are now. */
 	switch (tp->t_state) {
 	case TCPS_LISTEN:
 #ifdef TCP_OFFLOAD
 		tcp_offload_listen_stop(tp);
 #endif
 		tcp_state_change(tp, TCPS_CLOSED);
 		/* FALLTHROUGH */
 	case TCPS_CLOSED:
 		tp = tcp_close(tp);
 		/*
 		 * tcp_close() should never return NULL here as the socket is
 		 * still open.
 		 */
 		KASSERT(tp != NULL,
 		    ("tcp_usrclosed: tcp_close() returned NULL"));
 		break;
 
 	case TCPS_SYN_SENT:
 	case TCPS_SYN_RECEIVED:
 		/* Handshake not finished: only note that a FIN is needed. */
 		tp->t_flags |= TF_NEEDFIN;
 		break;
 
 	case TCPS_ESTABLISHED:
 		tcp_state_change(tp, TCPS_FIN_WAIT_1);
 		break;
 
 	case TCPS_CLOSE_WAIT:
 		tcp_state_change(tp, TCPS_LAST_ACK);
 		break;
 	}
 	/* Stamp t_acktime with the current tick count if it is still unset. */
 	if (tp->t_acktime == 0)
 		tp->t_acktime = ticks;
 	/* From FIN_WAIT_2 onwards the socket is reported as disconnected. */
 	if (tp->t_state >= TCPS_FIN_WAIT_2) {
 		soisdisconnected(tp->t_inpcb->inp_socket);
 		/* Prevent the connection hanging in FIN_WAIT_2 forever. */
 		if (tp->t_state == TCPS_FIN_WAIT_2) {
 			int timeout;
 
 			/*
 			 * Arm the 2MSL timer with tcp_finwait2_timeout when
 			 * tcp_fast_finwait2_recycle is set, otherwise with
 			 * TP_MAXIDLE(tp).
 			 */
 			timeout = (tcp_fast_finwait2_recycle) ?
 			    tcp_finwait2_timeout : TP_MAXIDLE(tp);
 			tcp_timer_activate(tp, TT_2MSL, timeout);
 		}
 	}
 }
 
 #ifdef DDB
 /* DDB helper: emit 'indent' space characters (nothing if indent <= 0). */
 static void
 db_print_indent(int indent)
 {
 	int remaining;
 
 	remaining = indent;
 	while (remaining > 0) {
 		db_printf(" ");
 		remaining--;
 	}
 }
 
 /*
  * DDB helper: print the symbolic name of a TCP state, or "unknown" for a
  * value that matches none of the TCPS_* constants listed here.
  */
 static void
 db_print_tstate(int t_state)
 {
 	switch (t_state) {
 	case TCPS_CLOSED:
 		db_printf("TCPS_CLOSED");
 		break;
 	case TCPS_LISTEN:
 		db_printf("TCPS_LISTEN");
 		break;
 	case TCPS_SYN_SENT:
 		db_printf("TCPS_SYN_SENT");
 		break;
 	case TCPS_SYN_RECEIVED:
 		db_printf("TCPS_SYN_RECEIVED");
 		break;
 	case TCPS_ESTABLISHED:
 		db_printf("TCPS_ESTABLISHED");
 		break;
 	case TCPS_CLOSE_WAIT:
 		db_printf("TCPS_CLOSE_WAIT");
 		break;
 	case TCPS_FIN_WAIT_1:
 		db_printf("TCPS_FIN_WAIT_1");
 		break;
 	case TCPS_CLOSING:
 		db_printf("TCPS_CLOSING");
 		break;
 	case TCPS_LAST_ACK:
 		db_printf("TCPS_LAST_ACK");
 		break;
 	case TCPS_FIN_WAIT_2:
 		db_printf("TCPS_FIN_WAIT_2");
 		break;
 	case TCPS_TIME_WAIT:
 		db_printf("TCPS_TIME_WAIT");
 		break;
 	default:
 		db_printf("unknown");
 		break;
 	}
 }
 
 /*
  * DDB helper: print the names of the TF_* bits set in t_flags as a
  * ", "-separated list.  Names are emitted in table order; nothing is
  * printed when no listed bit is set.
  */
 static void
 db_print_tflags(u_int t_flags)
 {
 	static const struct {
 		u_int		 flag;
 		const char	*name;
 	} tflag_names[] = {
 		{ TF_ACKNOW,		"TF_ACKNOW" },
 		{ TF_DELACK,		"TF_DELACK" },
 		{ TF_NODELAY,		"TF_NODELAY" },
 		{ TF_NOOPT,		"TF_NOOPT" },
 		{ TF_SENTFIN,		"TF_SENTFIN" },
 		{ TF_REQ_SCALE,		"TF_REQ_SCALE" },
 		/* Previously printed as "TF_RECVD_SCALE", which is no flag. */
 		{ TF_RCVD_SCALE,	"TF_RCVD_SCALE" },
 		{ TF_REQ_TSTMP,		"TF_REQ_TSTMP" },
 		{ TF_RCVD_TSTMP,	"TF_RCVD_TSTMP" },
 		{ TF_SACK_PERMIT,	"TF_SACK_PERMIT" },
 		{ TF_NEEDSYN,		"TF_NEEDSYN" },
 		{ TF_NEEDFIN,		"TF_NEEDFIN" },
 		{ TF_NOPUSH,		"TF_NOPUSH" },
 		{ TF_PREVVALID,		"TF_PREVVALID" },
 		{ TF_MORETOCOME,	"TF_MORETOCOME" },
 		{ TF_SONOTCONN,		"TF_SONOTCONN" },
 		{ TF_LASTIDLE,		"TF_LASTIDLE" },
 		{ TF_RXWIN0SENT,	"TF_RXWIN0SENT" },
 		{ TF_FASTRECOVERY,	"TF_FASTRECOVERY" },
 		{ TF_CONGRECOVERY,	"TF_CONGRECOVERY" },
 		{ TF_WASFRECOVERY,	"TF_WASFRECOVERY" },
 		{ TF_WASCRECOVERY,	"TF_WASCRECOVERY" },
 		{ TF_SIGNATURE,		"TF_SIGNATURE" },
 		{ TF_FORCEDATA,		"TF_FORCEDATA" },
 		{ TF_TSO,		"TF_TSO" },
 		{ TF_FASTOPEN,		"TF_FASTOPEN" },
 	};
 	const char *sep;
 	size_t i;
 
 	/* The separator stays empty until the first name has been printed. */
 	sep = "";
 	for (i = 0; i < sizeof(tflag_names) / sizeof(tflag_names[0]); i++) {
 		if ((t_flags & tflag_names[i].flag) == 0)
 			continue;
 		db_printf("%s%s", sep, tflag_names[i].name);
 		sep = ", ";
 	}
 }
 
 /*
  * DDB helper: print the names of the TF2_* bits set in t_flags2 as a
  * ", "-separated list.
  */
 static void
 db_print_tflags2(u_int t_flags2)
 {
 	const char *sep;
 
 	/* Empty before the first name, ", " once something was printed. */
 	sep = "";
 	if (t_flags2 & TF2_PLPMTU_BLACKHOLE) {
 		db_printf("%sTF2_PLPMTU_BLACKHOLE", sep);
 		sep = ", ";
 	}
 	if (t_flags2 & TF2_PLPMTU_PMTUD) {
 		db_printf("%sTF2_PLPMTU_PMTUD", sep);
 		sep = ", ";
 	}
 	if (t_flags2 & TF2_PLPMTU_MAXSEGSNT) {
 		db_printf("%sTF2_PLPMTU_MAXSEGSNT", sep);
 		sep = ", ";
 	}
 	if (t_flags2 & TF2_LOG_AUTO) {
 		db_printf("%sTF2_LOG_AUTO", sep);
 		sep = ", ";
 	}
 	if (t_flags2 & TF2_DROP_AF_DATA) {
 		db_printf("%sTF2_DROP_AF_DATA", sep);
 		sep = ", ";
 	}
 	if (t_flags2 & TF2_ECN_PERMIT) {
 		db_printf("%sTF2_ECN_PERMIT", sep);
 		sep = ", ";
 	}
 	if (t_flags2 & TF2_ECN_SND_CWR) {
 		db_printf("%sTF2_ECN_SND_CWR", sep);
 		sep = ", ";
 	}
 	if (t_flags2 & TF2_ECN_SND_ECE) {
 		db_printf("%sTF2_ECN_SND_ECE", sep);
 		sep = ", ";
 	}
 	if (t_flags2 & TF2_ACE_PERMIT) {
 		db_printf("%sTF2_ACE_PERMIT", sep);
 		sep = ", ";
 	}
 	if (t_flags2 & TF2_FBYTES_COMPLETE) {
 		db_printf("%sTF2_FBYTES_COMPLETE", sep);
 		sep = ", ";
 	}
 }
 
 /*
  * DDB helper: print the names of the TCPOOB_* bits set in t_oobflags as a
  * ", "-separated list.
  */
 static void
 db_print_toobflags(char t_oobflags)
 {
 	const char *sep;
 
 	sep = "";
 	if (t_oobflags & TCPOOB_HAVEDATA) {
 		db_printf("%sTCPOOB_HAVEDATA", sep);
 		sep = ", ";
 	}
 	if (t_oobflags & TCPOOB_HADDATA) {
 		db_printf("%sTCPOOB_HADDATA", sep);
 		sep = ", ";
 	}
 }
 
 /*
  * DDB helper: dump selected fields of a tcpcb.
  *
  * 'name' labels the first line, which is printed at column 'indent'; every
  * following line is indented two columns further.  State and flag words
  * are shown both numerically and decoded via the db_print_t*() helpers.
  */
 static void
 db_print_tcpcb(struct tcpcb *tp, const char *name, int indent)
 {
 
 	db_print_indent(indent);
 	db_printf("%s at %p\n", name, tp);
 
 	indent += 2;
 
 	db_print_indent(indent);
 	db_printf("t_segq first: %p   t_segqlen: %d   t_dupacks: %d\n",
 	    TAILQ_FIRST(&tp->t_segq), tp->t_segqlen, tp->t_dupacks);
 
 	/* Timers are reported by the address of each callout structure. */
 	db_print_indent(indent);
 	db_printf("tt_rexmt: %p   tt_persist: %p   tt_keep: %p\n",
 	    &tp->t_timers->tt_rexmt, &tp->t_timers->tt_persist,
 	    &tp->t_timers->tt_keep);
 
 	db_print_indent(indent);
 	db_printf("tt_2msl: %p   tt_delack: %p   t_inpcb: %p\n",
 	    &tp->t_timers->tt_2msl, &tp->t_timers->tt_delack, tp->t_inpcb);
 
 	db_print_indent(indent);
 	db_printf("t_state: %d (", tp->t_state);
 	db_print_tstate(tp->t_state);
 	db_printf(")\n");
 
 	db_print_indent(indent);
 	db_printf("t_flags: 0x%x (", tp->t_flags);
 	db_print_tflags(tp->t_flags);
 	db_printf(")\n");
 
 	db_print_indent(indent);
 	db_printf("t_flags2: 0x%x (", tp->t_flags2);
 	db_print_tflags2(tp->t_flags2);
 	db_printf(")\n");
 
 	/* snd_nxt used to be printed with a malformed "x0" hex prefix. */
 	db_print_indent(indent);
 	db_printf("snd_una: 0x%08x   snd_max: 0x%08x   snd_nxt: 0x%08x\n",
 	    tp->snd_una, tp->snd_max, tp->snd_nxt);
 
 	db_print_indent(indent);
 	db_printf("snd_up: 0x%08x   snd_wl1: 0x%08x   snd_wl2: 0x%08x\n",
 	    tp->snd_up, tp->snd_wl1, tp->snd_wl2);
 
 	db_print_indent(indent);
 	db_printf("iss: 0x%08x   irs: 0x%08x   rcv_nxt: 0x%08x\n",
 	    tp->iss, tp->irs, tp->rcv_nxt);
 
 	db_print_indent(indent);
 	db_printf("rcv_adv: 0x%08x   rcv_wnd: %u   rcv_up: 0x%08x\n",
 	    tp->rcv_adv, tp->rcv_wnd, tp->rcv_up);
 
 	db_print_indent(indent);
 	db_printf("snd_wnd: %u   snd_cwnd: %u\n",
 	    tp->snd_wnd, tp->snd_cwnd);
 
 	db_print_indent(indent);
 	db_printf("snd_ssthresh: %u   snd_recover: "
 	    "0x%08x\n", tp->snd_ssthresh, tp->snd_recover);
 
 	/* Labels below now match the field names they describe. */
 	db_print_indent(indent);
 	db_printf("t_rcvtime: %u   t_starttime: %u\n",
 	    tp->t_rcvtime, tp->t_starttime);
 
 	db_print_indent(indent);
 	db_printf("t_rtttime: %u   t_rtseq: 0x%08x\n",
 	    tp->t_rtttime, tp->t_rtseq);
 
 	db_print_indent(indent);
 	db_printf("t_rxtcur: %d   t_maxseg: %u   t_srtt: %d\n",
 	    tp->t_rxtcur, tp->t_maxseg, tp->t_srtt);
 
 	db_print_indent(indent);
 	db_printf("t_rttvar: %d   t_rxtshift: %d   t_rttmin: %u   "
 	    "t_rttbest: %u\n", tp->t_rttvar, tp->t_rxtshift, tp->t_rttmin,
 	    tp->t_rttbest);
 
 	db_print_indent(indent);
 	db_printf("t_rttupdated: %lu   max_sndwnd: %u   t_softerror: %d\n",
 	    tp->t_rttupdated, tp->max_sndwnd, tp->t_softerror);
 
 	db_print_indent(indent);
 	db_printf("t_oobflags: 0x%x (", tp->t_oobflags);
 	db_print_toobflags(tp->t_oobflags);
 	db_printf(")   t_iobc: 0x%02x\n", tp->t_iobc);
 
 	db_print_indent(indent);
 	db_printf("snd_scale: %u   rcv_scale: %u   request_r_scale: %u\n",
 	    tp->snd_scale, tp->rcv_scale, tp->request_r_scale);
 
 	db_print_indent(indent);
 	db_printf("ts_recent: %u   ts_recent_age: %u\n",
 	    tp->ts_recent, tp->ts_recent_age);
 
 	db_print_indent(indent);
 	db_printf("ts_offset: %u   last_ack_sent: 0x%08x   snd_cwnd_prev: "
 	    "%u\n", tp->ts_offset, tp->last_ack_sent, tp->snd_cwnd_prev);
 
 	db_print_indent(indent);
 	db_printf("snd_ssthresh_prev: %u   snd_recover_prev: 0x%08x   "
 	    "t_badrxtwin: %u\n", tp->snd_ssthresh_prev,
 	    tp->snd_recover_prev, tp->t_badrxtwin);
 
 	db_print_indent(indent);
 	db_printf("snd_numholes: %d  snd_holes first: %p\n",
 	    tp->snd_numholes, TAILQ_FIRST(&tp->snd_holes));
 
 	db_print_indent(indent);
 	db_printf("snd_fack: 0x%08x   rcv_numsacks: %d\n",
 	    tp->snd_fack, tp->rcv_numsacks);
 
 	/* Skip sackblks, sackhint. */
 
 	db_print_indent(indent);
 	db_printf("t_rttlow: %d   rfbuf_ts: %u   rfbuf_cnt: %d\n",
 	    tp->t_rttlow, tp->rfbuf_ts, tp->rfbuf_cnt);
 }
 
 /*
  * DDB "show tcpcb <addr>": treat the given address as a struct tcpcb and
  * dump it; print a usage line when no address was supplied.
  */
 DB_SHOW_COMMAND(tcpcb, db_show_tcpcb)
 {
 	if (have_addr) {
 		db_print_tcpcb((struct tcpcb *)addr, "tcpcb", 0);
 		return;
 	}
 	db_printf("usage: show tcpcb <addr>\n");
 }
 #endif
diff --git a/sys/netinet/toecore.c b/sys/netinet/toecore.c
index a185103df3a7..a078d9cea51f 100644
--- a/sys/netinet/toecore.c
+++ b/sys/netinet/toecore.c
@@ -1,605 +1,605 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2012 Chelsio Communications, Inc.
  * All rights reserved.
  * Written by: Navdeep Parhar <np@FreeBSD.org>
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 
 #include <sys/param.h>
 #include <sys/eventhandler.h>
 #include <sys/kernel.h>
 #include <sys/systm.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/module.h>
 #include <sys/types.h>
 #include <sys/sockopt.h>
 #include <sys/sysctl.h>
 #include <sys/socket.h>
 
 #include <net/ethernet.h>
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_types.h>
 #include <net/if_vlan_var.h>
 #include <net/if_llatbl.h>
 #include <net/route.h>
 
 #include <netinet/if_ether.h>
 #include <netinet/in.h>
 #include <netinet/in_pcb.h>
 #include <netinet/in_var.h>
 #include <netinet6/in6_var.h>
 #include <netinet6/in6_pcb.h>
 #include <netinet6/nd6.h>
 #define TCPSTATES
 #include <netinet/tcp.h>
 #include <netinet/tcp_fsm.h>
 #include <netinet/tcp_timer.h>
 #include <netinet/tcp_var.h>
 #include <netinet/tcp_syncache.h>
 #include <netinet/tcp_offload.h>
 #include <netinet/toecore.h>
 
 /* Mutex held around every traversal or modification of toedev_list. */
 static struct mtx toedev_lock;
 /* List of TOE devices added by register_toedev(). */
 static TAILQ_HEAD(, toedev) toedev_list;
 /* Event handler tags saved by toecore_load() for toecore_unload(). */
 static eventhandler_tag listen_start_eh;
 static eventhandler_tag listen_stop_eh;
 static eventhandler_tag lle_event_eh;
 
 /* Default tod_connect installed by init_toedev(): always ENOTSUP. */
 static int
 toedev_connect(struct toedev *tod __unused, struct socket *so __unused,
     struct nhop_object *nh __unused, struct sockaddr *nam __unused)
 {
 	return (ENOTSUP);
 }
 
 /* Default tod_listen_start installed by init_toedev(): always ENOTSUP. */
 static int
 toedev_listen_start(struct toedev *tod __unused, struct tcpcb *tp __unused)
 {
 	return (ENOTSUP);
 }
 
 /* Default tod_listen_stop installed by init_toedev(): always ENOTSUP. */
 static int
 toedev_listen_stop(struct toedev *tod __unused, struct tcpcb *tp __unused)
 {
 	return (ENOTSUP);
 }
 
 /* Default tod_input installed by init_toedev(): free the mbuf, do nothing. */
 static void
 toedev_input(struct toedev *tod __unused, struct tcpcb *tp __unused,
     struct mbuf *m)
 {
 	m_freem(m);
 }
 
 /* Default tod_rcvd installed by init_toedev(): no-op. */
 static void
 toedev_rcvd(struct toedev *tod __unused, struct tcpcb *tp __unused)
 {
 }
 
 /*
  * Default handler shared by tod_output, tod_send_rst and tod_send_fin
  * (see init_toedev()): always ENOTSUP.
  */
 static int
 toedev_output(struct toedev *tod __unused, struct tcpcb *tp __unused)
 {
 	return (ENOTSUP);
 }
 
 /* Default tod_pcb_detach installed by init_toedev(): no-op. */
 static void
 toedev_pcb_detach(struct toedev *tod __unused, struct tcpcb *tp __unused)
 {
 }
 
 /* Default tod_l2_update installed by init_toedev(): no-op. */
 static void
 toedev_l2_update(struct toedev *tod __unused, struct ifnet *ifp __unused,
     struct sockaddr *sa __unused, uint8_t *lladdr __unused,
     uint16_t vtag __unused)
 {
 }
 
 /* Default tod_route_redirect installed by init_toedev(): no-op. */
 static void
 toedev_route_redirect(struct toedev *tod __unused, struct ifnet *ifp __unused,
     struct nhop_object *nh0 __unused, struct nhop_object *nh1 __unused)
 {
 }
 
 /* Default tod_syncache_added installed by init_toedev(): no-op. */
 static void
 toedev_syncache_added(struct toedev *tod __unused, void *ctx __unused)
 {
 }
 
 /* Default tod_syncache_removed installed by init_toedev(): no-op. */
 static void
 toedev_syncache_removed(struct toedev *tod __unused, void *ctx __unused)
 {
 }
 
 /*
  * Default tod_syncache_respond installed by init_toedev(): free the mbuf
  * and report success.
  */
 static int
 toedev_syncache_respond(struct toedev *tod __unused, void *ctx __unused,
     struct mbuf *m)
 {
 	m_freem(m);
 	return (0);
 }
 
 /* Default tod_offload_socket installed by init_toedev(): no-op. */
 static void
 toedev_offload_socket(struct toedev *tod __unused, void *ctx __unused,
     struct socket *so __unused)
 {
 }
 
 /* Default tod_ctloutput installed by init_toedev(): no-op. */
 static void
 toedev_ctloutput(struct toedev *tod __unused, struct tcpcb *tp __unused,
     int sopt_dir __unused, int sopt_name __unused)
 {
 }
 
 /* Default tod_tcp_info installed by init_toedev(): leaves *ti untouched. */
 static void
 toedev_tcp_info(struct toedev *tod __unused, struct tcpcb *tp __unused,
     struct tcp_info *ti __unused)
 {
 }
 
 /* Default tod_alloc_tls_session installed by init_toedev(): always EINVAL. */
 static int
 toedev_alloc_tls_session(struct toedev *tod __unused, struct tcpcb *tp __unused,
     struct ktls_session *tls __unused, int direction __unused)
 {
 	return (EINVAL);
 }
 
 /* Default tod_pmtu_update installed by init_toedev(): no-op. */
 static void
 toedev_pmtu_update(struct toedev *tod __unused, struct tcpcb *tp __unused,
     tcp_seq seq __unused, int mtu __unused)
 {
 }
 
 /*
  * Inform one or more TOE devices about a listening socket.
  */
 /*
  * 'arg' selects the target: NULL notifies every registered TOE device,
  * otherwise only the toedev whose pointer equals 'arg' is notified.
  * The inpcb must be write-locked and belong to V_tcbinfo.
  */
 static void
 toe_listen_start(struct inpcb *inp, void *arg)
 {
 	struct toedev *t, *tod;
 	struct tcpcb *tp;
 
 	INP_WLOCK_ASSERT(inp);
 	KASSERT(inp->inp_pcbinfo == &V_tcbinfo,
 	    ("%s: inp is not a TCP inp", __func__));
 
 	/* Dropped PCBs are skipped. */
-	if (inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT))
+	if (inp->inp_flags & INP_DROPPED)
 		return;
 
 	/* Only sockets currently in the LISTEN state are of interest. */
 	tp = intotcpcb(inp);
 	if (tp->t_state != TCPS_LISTEN)
 		return;
 
 	/* Walk the device list under toedev_lock and call the matching ones. */
 	t = arg;
 	mtx_lock(&toedev_lock);
 	TAILQ_FOREACH(tod, &toedev_list, link) {
 		if (t == NULL || t == tod)
 			tod->tod_listen_start(tod, tp);
 	}
 	mtx_unlock(&toedev_lock);
 }
 
 /*
  * Handler registered for the tcp_offload_listen_start event: announce the
  * listening tcpcb to all TOE devices.
  */
 static void
 toe_listen_start_event(void *arg __unused, struct tcpcb *tp)
 {
 	struct inpcb *inp;
 
 	inp = tp->t_inpcb;
 	INP_WLOCK_ASSERT(inp);
 	KASSERT(tp->t_state == TCPS_LISTEN,
 	    ("%s: t_state %s", __func__, tcpstates[tp->t_state]));
 
 	/* A NULL device argument means "every registered toedev". */
 	toe_listen_start(inp, NULL);
 }
 
 /*
  * Handler registered for the tcp_offload_listen_stop event: call
  * tod_listen_stop on every registered TOE device for this tcpcb.
  */
 static void
 toe_listen_stop_event(void *arg __unused, struct tcpcb *tp)
 {
 	struct toedev *dev;
 
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 	KASSERT(tp->t_state == TCPS_LISTEN,
 	    ("%s: t_state %s", __func__, tcpstates[tp->t_state]));
 
 	mtx_lock(&toedev_lock);
 	TAILQ_FOREACH(dev, &toedev_list, link) {
 		dev->tod_listen_stop(dev, tp);
 	}
 	mtx_unlock(&toedev_lock);
 }
 
 /*
  * Fill up a freshly allocated toedev struct with reasonable defaults.
  */
 void
 init_toedev(struct toedev *tod)
 {
 	tod->tod_softc = NULL;
 
 	/*
 	 * Every hook gets a harmless default so that callers never have to
 	 * test for a missing method; a TOE driver overrides what it supports.
 	 */
 
 	/* Connection setup and listeners. */
 	tod->tod_connect = toedev_connect;
 	tod->tod_listen_start = toedev_listen_start;
 	tod->tod_listen_stop = toedev_listen_stop;
 	tod->tod_offload_socket = toedev_offload_socket;
 	tod->tod_pcb_detach = toedev_pcb_detach;
 
 	/* Data path; RST and FIN transmission share the output default. */
 	tod->tod_input = toedev_input;
 	tod->tod_rcvd = toedev_rcvd;
 	tod->tod_output = toedev_output;
 	tod->tod_send_rst = toedev_output;
 	tod->tod_send_fin = toedev_output;
 
 	/* L2, routing and path MTU notifications. */
 	tod->tod_l2_update = toedev_l2_update;
 	tod->tod_route_redirect = toedev_route_redirect;
 	tod->tod_pmtu_update = toedev_pmtu_update;
 
 	/* Syncache hooks. */
 	tod->tod_syncache_added = toedev_syncache_added;
 	tod->tod_syncache_removed = toedev_syncache_removed;
 	tod->tod_syncache_respond = toedev_syncache_respond;
 
 	/* Socket options, tcp_info and TLS. */
 	tod->tod_ctloutput = toedev_ctloutput;
 	tod->tod_tcp_info = toedev_tcp_info;
 	tod->tod_alloc_tls_session = toedev_alloc_tls_session;
 }
 
 /*
  * Register an active TOE device with the system.  This allows it to receive
  * notifications from the kernel.
  */
 int
 register_toedev(struct toedev *tod)
 {
 	struct toedev *cur;
 
 	mtx_lock(&toedev_lock);
 
 	/* TAILQ_FOREACH leaves 'cur' NULL when no entry matched. */
 	TAILQ_FOREACH(cur, &toedev_list, link) {
 		if (cur == tod)
 			break;
 	}
 	if (cur != NULL) {
 		/* Already on the list: refuse the duplicate registration. */
 		mtx_unlock(&toedev_lock);
 		return (EEXIST);
 	}
 
 	TAILQ_INSERT_TAIL(&toedev_list, tod, link);
 	registered_toedevs++;
 	mtx_unlock(&toedev_lock);
 
 	/* Tell the new device about listening sockets that already exist. */
 	inp_apply_all(toe_listen_start, tod);
 
 	return (0);
 }
 
 /*
  * Remove the TOE device from the global list of active TOE devices.  It is the
  * caller's responsibility to ensure that the TOE device is quiesced prior to
  * this call.
  */
 int
 unregister_toedev(struct toedev *tod)
 {
 	struct toedev *cur;
 	int rc;
 
 	mtx_lock(&toedev_lock);
 
 	/* Locate the device; 'cur' is NULL if it was never registered. */
 	TAILQ_FOREACH(cur, &toedev_list, link) {
 		if (cur == tod)
 			break;
 	}
 	if (cur != NULL) {
 		TAILQ_REMOVE(&toedev_list, tod, link);
 		registered_toedevs--;
 		rc = 0;
 	} else {
 		rc = ENODEV;
 	}
 	KASSERT(registered_toedevs >= 0,
 	    ("%s: registered_toedevs (%d) < 0", __func__, registered_toedevs));
 	mtx_unlock(&toedev_lock);
 
 	return (rc);
 }
 
 /*
  * Wrapper around syncache_add() for TOE drivers: passes the listening
  * socket from 'inp', a NULL mbuf argument and port 0, and discards the
  * return value.  The inpcb must be read-locked.
  */
 void
 toe_syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th,
     struct inpcb *inp, void *tod, void *todctx, uint8_t iptos)
 {
 	INP_RLOCK_ASSERT(inp);
 
 	(void)syncache_add(inc, to, th, inp, inp->inp_socket, NULL, tod, todctx,
 	    iptos, htons(0));
 }
 
 /*
  * Wrapper around syncache_expand() for TOE drivers: passes a NULL mbuf
  * argument and port 0 and returns its result unchanged.  Must be called
  * inside the network epoch.
  */
 int
 toe_syncache_expand(struct in_conninfo *inc, struct tcpopt *to,
     struct tcphdr *th, struct socket **lsop)
 {
 	int rc;
 
 	NET_EPOCH_ASSERT();
 
 	rc = syncache_expand(inc, to, th, lsop, NULL, htons(0));
 	return (rc);
 }
 
 /*
  * General purpose check to see if a 4-tuple is in use by the kernel.  If a TCP
  * header (presumably for an incoming SYN) is also provided, an existing 4-tuple
  * in TIME_WAIT may be assassinated freeing it up for re-use.
  *
  * Note that the TCP header must have been run through tcp_fields_to_host() or
  * equivalent.
  */
 int
 toe_4tuple_check(struct in_conninfo *inc, struct tcphdr *th, struct ifnet *ifp)
 {
 	struct inpcb *inp;
 	struct tcpcb *tp;
 
 	/* Look the 4-tuple up; a hit is returned read-locked. */
 	if (inc->inc_flags & INC_ISIPV6) {
 		inp = in6_pcblookup(&V_tcbinfo, &inc->inc6_faddr,
 		    inc->inc_fport, &inc->inc6_laddr, inc->inc_lport,
 		    INPLOOKUP_RLOCKPCB, ifp);
 	} else {
 		inp = in_pcblookup(&V_tcbinfo, inc->inc_faddr, inc->inc_fport,
 		    inc->inc_laddr, inc->inc_lport, INPLOOKUP_RLOCKPCB, ifp);
 	}
 
 	/* No matching PCB: the 4-tuple is free. */
 	if (inp == NULL)
 		return (0);
 
 	INP_RLOCK_ASSERT(inp);
 	tp = intotcpcb(inp);
 
 	/* In use unless it is in TIME_WAIT and we have a header to test. */
 	if (tp->t_state != TCPS_TIME_WAIT || th == NULL) {
 		INP_RUNLOCK(inp);
 		return (EADDRINUSE);
 	}
 
 	/*
 	 * NOTE(review): no unlock follows tcp_twcheck() on either outcome,
 	 * so it presumably releases the inpcb lock itself -- confirm.
 	 */
 	if (!tcp_twcheck(inp, NULL, th, NULL, 0))
 		return (EADDRINUSE);
 
 	return (0);
 }
 
 /*
  * Handler registered for lle_event: forward a link-layer entry change to
  * the TOE device of the entry's interface via tod_l2_update.
  *
  * Entries that are neither IPv4 nor IPv6, interfaces without the matching
  * IFCAP_TOE4/IFCAP_TOE6 capability enabled, and interfaces without a TOE
  * device are ignored.  For any event other than LLENTRY_RESOLVED the
  * driver is handed a NULL lladdr.  The lle must be write-locked.
  */
 static void
 toe_lle_event(void *arg __unused, struct llentry *lle, int evt)
 {
 	struct toedev *tod;
 	struct ifnet *ifp;
 	struct sockaddr *sa;
 	uint8_t *lladdr;
 	uint16_t vid, pcp;
 	int family;
 	struct sockaddr_in6 sin6;
 
 	LLE_WLOCK_ASSERT(lle);
 
 	ifp = lltable_get_ifp(lle->lle_tbl);
 	family = lltable_get_af(lle->lle_tbl);
 
 	if (family != AF_INET && family != AF_INET6)
 		return;
 	/*
 	 * Not interested if the interface's TOE capability is not enabled.
 	 */
 	if ((family == AF_INET && !(ifp->if_capenable & IFCAP_TOE4)) ||
 	    (family == AF_INET6 && !(ifp->if_capenable & IFCAP_TOE6)))
 		return;
 
 	tod = TOEDEV(ifp);
 	if (tod == NULL)
 		return;
 
 	/* sin6 is used as storage for the sockaddr of either family. */
 	sa = (struct sockaddr *)&sin6;
 	lltable_fill_sa_entry(lle, sa);
 
 	vid = 0xfff;
 	pcp = 0;
 	if (evt != LLENTRY_RESOLVED) {
 		/*
 		 * LLENTRY_TIMEDOUT, LLENTRY_DELETED, LLENTRY_EXPIRED all mean
 		 * this entry is going to be deleted.
 		 */
 
 		lladdr = NULL;
 	} else {
 		KASSERT(lle->la_flags & LLE_VALID,
 		    ("%s: %p resolved but not valid?", __func__, lle));
 
 		lladdr = (uint8_t *)lle->ll_addr;
 		/*
 		 * Build the tag exactly as toe_l2_resolve() does, so the
 		 * driver sees the same vtag whichever path resolved the
 		 * entry: VLAN interfaces report their own tag and PCP, other
 		 * interfaces with a configured PCP use VID 0 with that PCP.
 		 */
 		if (ifp->if_type == IFT_L2VLAN) {
 			VLAN_TAG(ifp, &vid);
 			VLAN_PCP(ifp, &pcp);
 		} else if (ifp->if_pcp != IFNET_PCP_NONE) {
 			vid = 0;
 			pcp = ifp->if_pcp;
 		}
 	}
 
 	tod->tod_l2_update(tod, ifp, sa, lladdr, EVL_MAKETAG(vid, pcp, 0));
 }
 
 /*
  * Returns 0 or EWOULDBLOCK on success (any other value is an error).  0 means
  * lladdr and vtag are valid on return, EWOULDBLOCK means the TOE driver's
  * tod_l2_update will be called later, when the entry is resolved or times out.
  */
 int
 toe_l2_resolve(struct toedev *tod, struct ifnet *ifp, struct sockaddr *sa,
     uint8_t *lladdr, uint16_t *vtag)
 {
 	uint16_t vid, pcp;
 	int rc;
 
 	/* Resolve the link-layer address with the family's resolver. */
 	switch (sa->sa_family) {
 #ifdef INET
 	case AF_INET:
 		rc = arpresolve(ifp, 0, NULL, sa, lladdr, NULL, NULL);
 		break;
 #endif
 #ifdef INET6
 	case AF_INET6:
 		rc = nd6_resolve(ifp, LLE_SF(AF_INET6, 0), NULL, sa, lladdr,
 		    NULL, NULL);
 		break;
 #endif
 	default:
 		return (EPROTONOSUPPORT);
 	}
 
 	/* *vtag is only written when the address was resolved. */
 	if (rc != 0)
 		return (rc);
 
 	/*
 	 * Defaults are VID 0xfff and PCP 0.  A VLAN interface supplies its
 	 * own tag and PCP; any other interface with a PCP configured uses
 	 * VID 0 together with that PCP.
 	 */
 	vid = 0xfff;
 	pcp = 0;
 	if (ifp->if_type == IFT_L2VLAN) {
 		VLAN_TAG(ifp, &vid);
 		VLAN_PCP(ifp, &pcp);
 	} else if (ifp->if_pcp != IFNET_PCP_NONE) {
 		vid = 0;
 		pcp = ifp->if_pcp;
 	}
 	*vtag = EVL_MAKETAG(vid, pcp, 0);
 
 	return (0);
 }
 
 /*
  * Handle a failed offloaded connect on 'inp'.
  *
  * Called inside the network epoch with the inpcb write-locked; the final
  * assertion shows it is write-locked again on return.  Nothing is done if
  * the PCB was already dropped.  With err == EAGAIN the connection is taken
  * back from the TOE driver and continued in software; any other error
  * drops the connection with that error.
  */
 void
 toe_connect_failed(struct toedev *tod, struct inpcb *inp, int err)
 {
 
 	NET_EPOCH_ASSERT();
 	INP_WLOCK_ASSERT(inp);
 
 	if (!(inp->inp_flags & INP_DROPPED)) {
 		struct tcpcb *tp = intotcpcb(inp);
 
 		KASSERT(tp->t_flags & TF_TOE,
 		    ("%s: tp %p not offloaded.", __func__, tp));
 
 		if (err == EAGAIN) {
 			/*
 			 * Temporary failure during offload, take this PCB back.
 			 * Detach from the TOE driver and do the rest of what
 			 * TCP's pru_connect would have done if the connection
 			 * wasn't offloaded.
 			 */
 
 			tod->tod_pcb_detach(tod, tp);
 			KASSERT(!(tp->t_flags & TF_TOE),
 			    ("%s: tp %p still offloaded.", __func__, tp));
 			tcp_timer_activate(tp, TT_KEEP, TP_KEEPINIT(tp));
 			/*
 			 * NOTE(review): the lock is retaken on a negative
 			 * return, so tcp_output() presumably released it in
 			 * that case -- confirm.
 			 */
 			if (tcp_output(tp) < 0)
 				INP_WLOCK(inp);	/* re-acquire */
 		} else {
 			/*
 			 * NOTE(review): likewise, a NULL return from
 			 * tcp_drop() is treated as "lock was released".
 			 */
 			tp = tcp_drop(tp, err);
 			if (tp == NULL)
 				INP_WLOCK(inp);	/* re-acquire */
 		}
 	}
 	/* Whatever happened above, the caller gets the lock back. */
 	INP_WLOCK_ASSERT(inp);
 }
 
 /*
  * MOD_LOAD handler: set up the device list and its lock, then register
  * the listen start/stop and lle event handlers, saving their tags.
  * Always returns 0.
  */
 static int
 toecore_load(void)
 {
 	mtx_init(&toedev_lock, "toedev lock", NULL, MTX_DEF);
 	TAILQ_INIT(&toedev_list);
 
 	/* Tags are kept so toecore_unload() can deregister the handlers. */
 	listen_start_eh = EVENTHANDLER_REGISTER(tcp_offload_listen_start,
 	    toe_listen_start_event, NULL, EVENTHANDLER_PRI_ANY);
 	listen_stop_eh = EVENTHANDLER_REGISTER(tcp_offload_listen_stop,
 	    toe_listen_stop_event, NULL, EVENTHANDLER_PRI_ANY);
 	lle_event_eh = EVENTHANDLER_REGISTER(lle_event, toe_lle_event, NULL,
 	    EVENTHANDLER_PRI_ANY);
 
 	return (0);
 }
 
 /*
  * MOD_UNLOAD handler: refuse with EBUSY while any TOE device is still
  * registered; otherwise deregister the event handlers, destroy the lock
  * and return 0.
  */
 static int
 toecore_unload(void)
 {
 	int busy;
 
 	mtx_lock(&toedev_lock);
 	busy = !TAILQ_EMPTY(&toedev_list);
 	if (!busy) {
 		EVENTHANDLER_DEREGISTER(tcp_offload_listen_start,
 		    listen_start_eh);
 		EVENTHANDLER_DEREGISTER(tcp_offload_listen_stop,
 		    listen_stop_eh);
 		EVENTHANDLER_DEREGISTER(lle_event, lle_event_eh);
 	}
 	mtx_unlock(&toedev_lock);
 
 	if (busy)
 		return (EBUSY);
 
 	mtx_destroy(&toedev_lock);
 	return (0);
 }
 
 /*
  * Module event handler: dispatch MOD_LOAD and MOD_UNLOAD, and answer
  * EOPNOTSUPP to every other command.
  */
 static int
 toecore_mod_handler(module_t mod, int cmd, void *arg)
 {
 	switch (cmd) {
 	case MOD_LOAD:
 		return (toecore_load());
 	case MOD_UNLOAD:
 		return (toecore_unload());
 	default:
 		return (EOPNOTSUPP);
 	}
 }
 
 /* Module description handed to DECLARE_MODULE() for "toecore". */
 static moduledata_t mod_data = {
 	"toecore",		/* module name */
 	toecore_mod_handler,	/* handles MOD_LOAD / MOD_UNLOAD */
 	0			/* third member unused here */
 };
 
 MODULE_VERSION(toecore, 1);
 DECLARE_MODULE(toecore, mod_data, SI_SUB_EXEC, SI_ORDER_ANY);
diff --git a/sys/netinet6/in6_pcb.c b/sys/netinet6/in6_pcb.c
index 516d83fa846c..176a5227c9a1 100644
--- a/sys/netinet6/in6_pcb.c
+++ b/sys/netinet6/in6_pcb.c
@@ -1,1142 +1,1139 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
  * Copyright (c) 2010-2011 Juniper Networks, Inc.
  * All rights reserved.
  *
  * Portions of this software were developed by Robert N. M. Watson under
  * contract to Juniper Networks, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the project nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	$KAME: in6_pcb.c,v 1.31 2001/05/21 05:45:10 jinmei Exp $
  */
 
 /*-
  * Copyright (c) 1982, 1986, 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)in_pcb.c	8.2 (Berkeley) 1/4/94
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_ipsec.h"
 #include "opt_route.h"
 #include "opt_rss.h"
 
 #include <sys/hash.h>
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/domain.h>
 #include <sys/protosw.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sockio.h>
 #include <sys/errno.h>
 #include <sys/time.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/jail.h>
 
 #include <vm/uma.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_llatbl.h>
 #include <net/if_types.h>
 #include <net/route.h>
 #include <net/route/nhop.h>
 
 #include <netinet/in.h>
 #include <netinet/in_var.h>
 #include <netinet/in_systm.h>
 #include <netinet/tcp_var.h>
 #include <netinet/ip6.h>
 #include <netinet/ip_var.h>
 
 #include <netinet6/ip6_var.h>
 #include <netinet6/nd6.h>
 #include <netinet/in_pcb.h>
 #include <netinet/in_pcb_var.h>
 #include <netinet6/in6_pcb.h>
 #include <netinet6/in6_fib.h>
 #include <netinet6/scope6_var.h>
 
 /*
  * Pick an anonymous local port for inp and insert the pcb into the hash.
  *
  * laddr is first checked with prison_local_ip6() against cred.  The pcb is
  * flagged INP_ANONPORT before the port is chosen via in_pcb_lport().
  * Returns 0 on success, the error from the jail check or port selection,
  * or EAGAIN if the hash insertion fails (in which case the local address
  * and port of the pcb are reset).
  *
  * The caller must hold the inpcb write lock and the pcbinfo hash write lock.
  */
 int
 in6_pcbsetport(struct in6_addr *laddr, struct inpcb *inp, struct ucred *cred)
 {
 #ifdef INVARIANTS
 	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
 #endif
 	struct socket *so;
 	u_int16_t port;
 	int flags, rc;
 
 	INP_WLOCK_ASSERT(inp);
 	INP_HASH_WLOCK_ASSERT(pcbinfo);
 
 	rc = prison_local_ip6(cred, laddr,
 	    (inp->inp_flags & IN6P_IPV6_V6ONLY) != 0);
 	if (rc != 0)
 		return (rc);
 
 	/* XXX: this is redundant when called from in6_pcbbind */
 	so = inp->inp_socket;
 	flags = 0;
 	if (!(so->so_options &
 	    (SO_REUSEADDR | SO_REUSEPORT | SO_REUSEPORT_LB)))
 		flags = INPLOOKUP_WILDCARD;
 
 	inp->inp_flags |= INP_ANONPORT;
 
 	port = 0;
 	rc = in_pcb_lport(inp, NULL, &port, cred, flags);
 	if (rc != 0)
 		return (rc);
 
 	inp->inp_lport = port;
 	if (in_pcbinshash(inp) == 0)
 		return (0);
 
 	/* Hash insertion failed: leave the pcb unbound again. */
 	inp->in6p_laddr = in6addr_any;
 	inp->inp_lport = 0;
 	return (EAGAIN);
 }
 
 /*
  * Bind inp to a local IPv6 address and/or port.
  *
  * With nam == NULL the pcb keeps its current local address and an
  * anonymous port is selected.  Otherwise nam must be a sockaddr_in6; its
  * scope is embedded in place, the address is checked against the jail of
  * cred and, unless multicast or unspecified, against the configured
  * interface addresses.  A non-zero port is subject to the reserved-port
  * privilege check and to the conflict checks below.
  *
  * Returns 0 on success, EINVAL if the pcb already has a local port or
  * address, EADDRNOTAVAIL, EACCES or EADDRINUSE from the checks, EAGAIN if
  * the hash insertion fails, or an error passed up from a helper.
  *
  * The caller must hold the inpcb write lock and the pcbinfo hash write lock.
  */
 int
 in6_pcbbind(struct inpcb *inp, struct sockaddr *nam,
     struct ucred *cred)
 {
 	struct socket *so = inp->inp_socket;
 	struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)NULL;
 	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
 	u_short	lport = 0;
 	int error, lookupflags = 0;
 	int reuseport = (so->so_options & SO_REUSEPORT);
 
 	/*
 	 * XXX: Maybe we could let SO_REUSEPORT_LB set SO_REUSEPORT bit here
 	 * so that we don't have to add to the (already messy) code below.
 	 */
 	int reuseport_lb = (so->so_options & SO_REUSEPORT_LB);
 
 	INP_WLOCK_ASSERT(inp);
 	INP_HASH_WLOCK_ASSERT(pcbinfo);
 
 	/* Refuse to bind a pcb that already has a local port or address. */
 	if (inp->inp_lport || !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr))
 		return (EINVAL);
 	if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT|SO_REUSEPORT_LB)) == 0)
 		lookupflags = INPLOOKUP_WILDCARD;
 	if (nam == NULL) {
 		if ((error = prison_local_ip6(cred, &inp->in6p_laddr,
 		    ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0))) != 0)
 			return (error);
 	} else {
 		sin6 = (struct sockaddr_in6 *)nam;
 		KASSERT(sin6->sin6_family == AF_INET6,
 		    ("%s: invalid address family for %p", __func__, sin6));
 		KASSERT(sin6->sin6_len == sizeof(*sin6),
 		    ("%s: invalid address length for %p", __func__, sin6));
 
 		if ((error = sa6_embedscope(sin6, V_ip6_use_defzone)) != 0)
 			return(error);
 
 		if ((error = prison_local_ip6(cred, &sin6->sin6_addr,
 		    ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0))) != 0)
 			return (error);
 
 		lport = sin6->sin6_port;
 		if (IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) {
 			/*
 			 * Treat SO_REUSEADDR as SO_REUSEPORT for multicast;
 			 * allow complete duplication of binding if
 			 * SO_REUSEPORT is set, or if SO_REUSEADDR is set
 			 * and a multicast address is bound on both
 			 * new and duplicated sockets.
 			 */
 			if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) != 0)
 				reuseport = SO_REUSEADDR|SO_REUSEPORT;
 			/*
 			 * XXX: How to deal with SO_REUSEPORT_LB here?
 			 * Treat same as SO_REUSEPORT for now.
 			 */
 			if ((so->so_options &
 			    (SO_REUSEADDR|SO_REUSEPORT_LB)) != 0)
 				reuseport_lb = SO_REUSEADDR|SO_REUSEPORT_LB;
 		} else if (!IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) {
 			struct epoch_tracker et;
 			struct ifaddr *ifa;
 
 			/*
 			 * The port is cleared in the caller's sockaddr for
 			 * the address lookup; lport already holds its value.
 			 */
 			sin6->sin6_port = 0;		/* yech... */
 			NET_EPOCH_ENTER(et);
 			if ((ifa = ifa_ifwithaddr((struct sockaddr *)sin6)) ==
 			    NULL &&
 			    (inp->inp_flags & INP_BINDANY) == 0) {
 				NET_EPOCH_EXIT(et);
 				return (EADDRNOTAVAIL);
 			}
 
 			/*
 			 * XXX: bind to an anycast address might accidentally
 			 * cause sending a packet with anycast source address.
 			 * We should allow to bind to a deprecated address, since
 			 * the application dares to use it.
 			 */
 			if (ifa != NULL &&
 			    ((struct in6_ifaddr *)ifa)->ia6_flags &
 			    (IN6_IFF_ANYCAST|IN6_IFF_NOTREADY|IN6_IFF_DETACHED)) {
 				NET_EPOCH_EXIT(et);
 				return (EADDRNOTAVAIL);
 			}
 			NET_EPOCH_EXIT(et);
 		}
 		if (lport) {
 			struct inpcb *t;
 
 			/* GROSS */
 			if (ntohs(lport) <= V_ipport_reservedhigh &&
 			    ntohs(lport) >= V_ipport_reservedlow &&
 			    priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT))
 				return (EACCES);
 			/*
 			 * Without PRIV_NETINET_REUSEPORT, a non-multicast bind
 			 * is additionally checked against pcbs owned by a
 			 * different uid.
 			 */
 			if (!IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr) &&
 			    priv_check_cred(inp->inp_cred, PRIV_NETINET_REUSEPORT) != 0) {
 				t = in6_pcblookup_local(pcbinfo,
 				    &sin6->sin6_addr, lport,
 				    INPLOOKUP_WILDCARD, cred);
 				if (t &&
 				    ((inp->inp_flags2 & INP_BINDMULTI) == 0) &&
-				    ((t->inp_flags & INP_TIMEWAIT) == 0) &&
 				    (so->so_type != SOCK_STREAM ||
 				     IN6_IS_ADDR_UNSPECIFIED(&t->in6p_faddr)) &&
 				    (!IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr) ||
 				     !IN6_IS_ADDR_UNSPECIFIED(&t->in6p_laddr) ||
 				     (t->inp_flags2 & INP_REUSEPORT) ||
 				     (t->inp_flags2 & INP_REUSEPORT_LB) == 0) &&
 				    (inp->inp_cred->cr_uid !=
 				     t->inp_cred->cr_uid))
 					return (EADDRINUSE);
 
 				/*
 				 * If the socket is a BINDMULTI socket, then
 				 * the credentials need to match and the
 				 * original socket also has to have been bound
 				 * with BINDMULTI.
 				 */
 				if (t && (! in_pcbbind_check_bindmulti(inp, t)))
 					return (EADDRINUSE);
 
 #ifdef INET
 				/*
 				 * A wildcard bind on a socket that is not
 				 * V6ONLY is also checked against the IPv4
 				 * pcbs.
 				 */
 				if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0 &&
 				    IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) {
 					struct sockaddr_in sin;
 
 					in6_sin6_2_sin(&sin, sin6);
 					t = in_pcblookup_local(pcbinfo,
 					    sin.sin_addr, lport,
 					    INPLOOKUP_WILDCARD, cred);
 					if (t &&
 					    ((inp->inp_flags2 & INP_BINDMULTI) == 0) &&
-					    ((t->inp_flags &
-					      INP_TIMEWAIT) == 0) &&
 					    (so->so_type != SOCK_STREAM ||
 					     ntohl(t->inp_faddr.s_addr) ==
 					      INADDR_ANY) &&
 					    (inp->inp_cred->cr_uid !=
 					     t->inp_cred->cr_uid))
 						return (EADDRINUSE);
 
 					if (t && (! in_pcbbind_check_bindmulti(inp, t)))
 						return (EADDRINUSE);
 				}
 #endif
 			}
 			/*
 			 * Conflict check proper: an existing pcb blocks the
 			 * bind unless it shares a reuse option with us.
 			 */
 			t = in6_pcblookup_local(pcbinfo, &sin6->sin6_addr,
 			    lport, lookupflags, cred);
 			if (t && (reuseport & inp_so_options(t)) == 0 &&
 			    (reuseport_lb & inp_so_options(t)) == 0) {
 				return (EADDRINUSE);
 			}
 #ifdef INET
 			if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0 &&
 			    IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) {
 				struct sockaddr_in sin;
 
 				in6_sin6_2_sin(&sin, sin6);
 				t = in_pcblookup_local(pcbinfo, sin.sin_addr,
 				   lport, lookupflags, cred);
 				if (t &&
 				    (reuseport & inp_so_options(t)) == 0 &&
 				    (reuseport_lb & inp_so_options(t)) == 0 &&
 				    (ntohl(t->inp_laddr.s_addr) != INADDR_ANY ||
 				        (t->inp_vflag & INP_IPV6PROTO) != 0)) {
 					return (EADDRINUSE);
 				}
 			}
 #endif
 		}
 		inp->in6p_laddr = sin6->sin6_addr;
 	}
 	if (lport == 0) {
 		/* No port requested: have one chosen and hashed for us. */
 		if ((error = in6_pcbsetport(&inp->in6p_laddr, inp, cred)) != 0) {
 			/* Undo an address bind that may have occurred. */
 			inp->in6p_laddr = in6addr_any;
 			return (error);
 		}
 	} else {
 		inp->inp_lport = lport;
 		if (in_pcbinshash(inp) != 0) {
 			/* Insertion failed: leave the pcb unbound. */
 			inp->in6p_laddr = in6addr_any;
 			inp->inp_lport = 0;
 			return (EAGAIN);
 		}
 	}
 	return (0);
 }
 
 /*
  *   Transform old in6_pcbconnect() into an inner subroutine for new
  *   in6_pcbconnect(): Do some validity-checking on the remote
  *   address (in mbuf 'nam') and then determine local host address
  *   (i.e., which interface) to use to access that remote host.
  *
  *   This preserves definition of in6_pcbconnect(), while supporting a
  *   slightly different version for T/TCP.  (This is more than
  *   a bit of a kludge, but cleaning up the internal interfaces would
  *   have forced minor changes in every protocol).
  */
 static int
 in6_pcbladdr(struct inpcb *inp, struct sockaddr_in6 *sin6,
     struct in6_addr *plocal_addr6)
 {
 	int error = 0;
 	int scope_ambiguous = 0;
 	struct in6_addr in6a;
 	struct epoch_tracker et;
 
 	INP_WLOCK_ASSERT(inp);
 	INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);	/* XXXRW: why? */
 
 	if (sin6->sin6_port == 0)
 		return (EADDRNOTAVAIL);
 
 	if (sin6->sin6_scope_id == 0 && !V_ip6_use_defzone)
 		scope_ambiguous = 1;
 	if ((error = sa6_embedscope(sin6, V_ip6_use_defzone)) != 0)
 		return(error);
 
 	if (!CK_STAILQ_EMPTY(&V_in6_ifaddrhead)) {
 		/*
 		 * If the destination address is UNSPECIFIED addr,
 		 * use the loopback addr, e.g ::1.
 		 */
 		if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr))
 			sin6->sin6_addr = in6addr_loopback;
 	}
 	if ((error = prison_remote_ip6(inp->inp_cred, &sin6->sin6_addr)) != 0)
 		return (error);
 
 	NET_EPOCH_ENTER(et);
 	error = in6_selectsrc_socket(sin6, inp->in6p_outputopts,
 	    inp, inp->inp_cred, scope_ambiguous, &in6a, NULL);
 	NET_EPOCH_EXIT(et);
 	if (error)
 		return (error);
 
 	/*
 	 * Do not update this earlier, in case we return with an error.
 	 *
 	 * XXX: this in6_selectsrc_socket result might replace the bound local
 	 * address with the address specified by setsockopt(IPV6_PKTINFO).
 	 * Is it the intended behavior?
 	 */
 	*plocal_addr6 = in6a;
 
 	/*
 	 * Don't do pcblookup call here; return interface in
 	 * plocal_addr6
 	 * and exit to caller, that will do the lookup.
 	 */
 
 	return (0);
 }
 
 /*
  * Outer subroutine:
  * Connect a socket's pcb to the address given in nam.
  * Both address and port must be specified in nam (a sockaddr_in6).
  * If the pcb does not have a local address yet, one is picked; if it
  * also has no local port, one is chosen with in_pcb_lport_dest().
  *
  * rehash selects between in_pcbrehash() and in_pcbinshash() at the end;
  * it must be true for a pcb without a local port (asserted below).
  * The mbuf argument m is not referenced in this function body.
  *
  * Returns 0 on success, EADDRINUSE if the resulting 4-tuple is already in
  * the hash, or an error from in6_pcbladdr() / in_pcb_lport_dest().
  *
  * The caller must hold the inpcb write lock and the pcbinfo hash write lock.
  */
 int
 in6_pcbconnect_mbuf(struct inpcb *inp, struct sockaddr *nam,
     struct ucred *cred, struct mbuf *m, bool rehash)
 {
 	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
 	struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)nam;
 	struct sockaddr_in6 laddr6;
 	int error;
 
 	KASSERT(sin6->sin6_family == AF_INET6,
 	    ("%s: invalid address family for %p", __func__, sin6));
 	KASSERT(sin6->sin6_len == sizeof(*sin6),
 	    ("%s: invalid address length for %p", __func__, sin6));
 
 	bzero(&laddr6, sizeof(laddr6));
 	laddr6.sin6_family = AF_INET6;
 
 	INP_WLOCK_ASSERT(inp);
 	INP_HASH_WLOCK_ASSERT(pcbinfo);
 
 #ifdef ROUTE_MPATH
 	/* Precompute a software flow hash for the outbound connection. */
 	if (CALC_FLOWID_OUTBOUND) {
 		uint32_t hash_type, hash_val;
 
 		hash_val = fib6_calc_software_hash(&inp->in6p_laddr,
 		    &sin6->sin6_addr, 0, sin6->sin6_port,
 		    inp->inp_socket->so_proto->pr_protocol, &hash_type);
 		inp->inp_flowid = hash_val;
 		inp->inp_flowtype = hash_type;
 	}
 #endif
 	/*
 	 * Call inner routine, to assign local interface address.
 	 * in6_pcbladdr() may automatically fill in sin6_scope_id.
 	 */
 	if ((error = in6_pcbladdr(inp, sin6, &laddr6.sin6_addr)) != 0)
 		return (error);
 
 	/*
 	 * Reject the connect if a pcb with the same 4-tuple already exists.
 	 * The local address used is the selected one when the pcb is not
 	 * bound to an address yet, the bound one otherwise.
 	 */
 	if (in6_pcblookup_hash_locked(pcbinfo, &sin6->sin6_addr,
 			       sin6->sin6_port,
 			      IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)
 			      ? &laddr6.sin6_addr : &inp->in6p_laddr,
 			      inp->inp_lport, 0, NULL, M_NODOM) != NULL) {
 		return (EADDRINUSE);
 	}
 	if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) {
 		if (inp->inp_lport == 0) {
 			/*
 			 * rehash was required to be true in the past for
 			 * this case; retain that convention.  However,
 			 * we now call in_pcb_lport_dest rather than
 			 * in6_pcbbind; the former does not insert into
 			 * the hash table, the latter does.  Change rehash
 			 * to false to do the in_pcbinshash below.
 			 */
 			KASSERT(rehash == true,
 			    ("Rehashing required for unbound inps"));
 			rehash = false;
 			error = in_pcb_lport_dest(inp,
 			    (struct sockaddr *) &laddr6, &inp->inp_lport,
 			    (struct sockaddr *) sin6, sin6->sin6_port, cred,
 			    INPLOOKUP_WILDCARD);
 			if (error)
 				return (error);
 		}
 		inp->in6p_laddr = laddr6.sin6_addr;
 	}
 	inp->in6p_faddr = sin6->sin6_addr;
 	inp->inp_fport = sin6->sin6_port;
 	/* update flowinfo - draft-itojun-ipv6-flowlabel-api-00 */
 	inp->inp_flow &= ~IPV6_FLOWLABEL_MASK;
 	if (inp->inp_flags & IN6P_AUTOFLOWLABEL)
 		inp->inp_flow |=
 		    (htonl(ip6_randomflowlabel()) & IPV6_FLOWLABEL_MASK);
 
 	if (rehash) {
 		in_pcbrehash(inp);
 	} else {
 		/*
 		 * NOTE(review): the return value of in_pcbinshash() is
 		 * ignored here, although other callers in this file treat a
 		 * non-zero return as failure -- confirm this is intended.
 		 */
 		in_pcbinshash(inp);
 	}
 
 	return (0);
 }
 
 /*
  * Connect inp to nam.  Shorthand for in6_pcbconnect_mbuf() with no mbuf
  * and with rehash set to true.
  */
 int
 in6_pcbconnect(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred)
 {
 	int rc;
 
 	rc = in6_pcbconnect_mbuf(inp, nam, cred, NULL, true);
 	return (rc);
 }
 
 /*
  * Disconnect inp: clear the foreign address, foreign port and flow label,
  * then rehash the pcb.  The caller must hold the inpcb write lock and the
  * pcbinfo hash write lock.
  */
 void
 in6_pcbdisconnect(struct inpcb *inp)
 {
 
 	INP_WLOCK_ASSERT(inp);
 	INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
 
 	memset(&inp->in6p_faddr, 0, sizeof(inp->in6p_faddr));
 	inp->inp_fport = 0;
 
 	/* clear flowinfo - draft-itojun-ipv6-flowlabel-api-00 */
 	inp->inp_flow &= ~IPV6_FLOWLABEL_MASK;
 
 	in_pcbrehash(inp);
 }
 
 /*
  * Allocate (M_SONAME, M_WAITOK) a zeroed sockaddr_in6 holding the given
  * port and address, with the scope recovered via sa6_recoverscope().
  * The allocation is returned to the caller as a generic sockaddr.
  */
 struct sockaddr *
 in6_sockaddr(in_port_t port, struct in6_addr *addr_p)
 {
 	struct sockaddr_in6 *sa;
 
 	sa = malloc(sizeof(*sa), M_SONAME, M_WAITOK | M_ZERO);
 	sa->sin6_len = sizeof(*sa);
 	sa->sin6_family = AF_INET6;
 	sa->sin6_port = port;
 	sa->sin6_addr = *addr_p;
 	(void)sa6_recoverscope(sa); /* XXX: should catch errors */
 
 	return ((struct sockaddr *)sa);
 }
 
 /*
  * Build a temporary sockaddr_in from the IPv4 port/address pair and convert
  * it with in6_sin_2_v4mapsin6() into a freshly allocated (M_SONAME,
  * M_WAITOK) sockaddr_in6, which is returned as a generic sockaddr.
  */
 struct sockaddr *
 in6_v4mapsin6_sockaddr(in_port_t port, struct in_addr *addr_p)
 {
 	struct sockaddr_in v4;
 	struct sockaddr_in6 *mapped;
 
 	memset(&v4, 0, sizeof(v4));
 	v4.sin_len = sizeof(v4);
 	v4.sin_family = AF_INET;
 	v4.sin_port = port;
 	v4.sin_addr = *addr_p;
 
 	mapped = malloc(sizeof(*mapped), M_SONAME, M_WAITOK);
 	in6_sin_2_v4mapsin6(&v4, mapped);
 
 	return ((struct sockaddr *)mapped);
 }
 
 int
 in6_getsockaddr(struct socket *so, struct sockaddr **nam)
 {
 	struct inpcb *inp;
 	struct in6_addr addr;
 	in_port_t port;
 
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("in6_getsockaddr: inp == NULL"));
 
 	INP_RLOCK(inp);
 	port = inp->inp_lport;
 	addr = inp->in6p_laddr;
 	INP_RUNLOCK(inp);
 
 	*nam = in6_sockaddr(port, &addr);
 	return 0;
 }
 
 int
 in6_getpeeraddr(struct socket *so, struct sockaddr **nam)
 {
 	struct inpcb *inp;
 	struct in6_addr addr;
 	in_port_t port;
 
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("in6_getpeeraddr: inp == NULL"));
 
 	INP_RLOCK(inp);
 	port = inp->inp_fport;
 	addr = inp->in6p_faddr;
 	INP_RUNLOCK(inp);
 
 	*nam = in6_sockaddr(port, &addr);
 	return 0;
 }
 
 /*
  * Fetch the local address of so.  When INET is compiled in and the pcb is
  * flagged INP_IPV4 but not INP_IPV6, the IPv4 address is fetched and then
  * converted in place to a v4-mapped sockaddr_in6; otherwise the request is
  * passed to in6_getsockaddr().
  */
 int
 in6_mapped_sockaddr(struct socket *so, struct sockaddr **nam)
 {
 	struct inpcb *inp;
 
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("in6_mapped_sockaddr: inp == NULL"));
 
 #ifdef INET
 	if ((inp->inp_vflag & (INP_IPV4 | INP_IPV6)) == INP_IPV4) {
 		int error;
 
 		error = in_getsockaddr(so, nam);
 		if (error == 0)
 			in6_sin_2_v4mapsin6_in_sock(nam);
 		return (error);
 	}
 #endif
 	/* scope issues will be handled in in6_getsockaddr(). */
 	return (in6_getsockaddr(so, nam));
 }
 
 /*
  * Fetch the peer address of so.  When INET is compiled in and the pcb is
  * flagged INP_IPV4 but not INP_IPV6, the IPv4 address is fetched and then
  * converted in place to a v4-mapped sockaddr_in6; otherwise the request is
  * passed to in6_getpeeraddr().
  */
 int
 in6_mapped_peeraddr(struct socket *so, struct sockaddr **nam)
 {
 	struct inpcb *inp;
 
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("in6_mapped_peeraddr: inp == NULL"));
 
 #ifdef INET
 	if ((inp->inp_vflag & (INP_IPV4 | INP_IPV6)) == INP_IPV4) {
 		int error;
 
 		error = in_getpeeraddr(so, nam);
 		if (error == 0)
 			in6_sin_2_v4mapsin6_in_sock(nam);
 		return (error);
 	}
 #endif
 	/* scope issues will be handled in in6_getpeeraddr(). */
 	return (in6_getpeeraddr(so, nam));
 }
 
 /*
  * Pass some notification to all connections of a protocol
  * associated with address dst.  The local address and/or port numbers
  * may be specified to limit the search.  The "usual action" will be
  * taken, depending on the ctlinput cmd.  The caller must filter any
  * cmds that are uninteresting (e.g., no error in the map).
  * Call the protocol specific routine (if any) to report
  * any errors for each matching socket.
  */
 /* Iterator filter: accept only pcbs that have INP_IPV6 set in inp_vflag. */
 static bool
 inp_match6(const struct inpcb *inp, void *v __unused)
 {
 
 	if (inp->inp_vflag & INP_IPV6)
 		return (true);
 	return (false);
 }
 
 /*
  * Walk every IPv6 pcb of pcbinfo (each visited write-locked) and call
  * notify(inp, errno) on those matching the given destination and,
  * optionally, source address and ports.
  *
  * sa6_dst   destination the error refers to; nothing is done if it is the
  *           unspecified address.
  * fport_arg foreign port to match, 0 for any.
  * src       source address to match, or NULL (treated as sa6_any).
  * lport_arg local port to match, 0 for any.
  * errno     error value passed to notify and used to detect EMSGSIZE.
  * cmdarg    for EMSGSIZE, points to the uint32_t handed to
  *           ip6_notify_pmtu(); may be NULL.
  * notify    per-pcb callback; may be NULL, in which case no callback runs.
  */
 void
 in6_pcbnotify(struct inpcbinfo *pcbinfo, struct sockaddr_in6 *sa6_dst,
     u_int fport_arg, const struct sockaddr_in6 *src, u_int lport_arg,
     int errno, void *cmdarg,
     struct inpcb *(*notify)(struct inpcb *, int))
 {
 	struct inpcb_iterator inpi = INP_ITERATOR(pcbinfo, INPLOOKUP_WLOCKPCB,
 	    inp_match6, NULL);
 	struct inpcb *inp;
 	struct sockaddr_in6 sa6_src;
 	u_short	fport = fport_arg, lport = lport_arg;
 	u_int32_t flowinfo;
 
 	if (IN6_IS_ADDR_UNSPECIFIED(&sa6_dst->sin6_addr))
 		return;
 
 	/*
 	 * note that src can be NULL when we get notify by local fragmentation.
 	 */
 	sa6_src = (src == NULL) ? sa6_any : *src;
 	flowinfo = sa6_src.sin6_flowinfo;
 
 	while ((inp = inp_next(&inpi)) != NULL) {
 		INP_WLOCK_ASSERT(inp);
 		/*
 		 * If the error designates a new path MTU for a destination
 		 * and the application (associated with this socket) wanted to
 		 * know the value, notify.
 		 * XXX: should we avoid to notify the value to TCP sockets?
 		 */
 		if (errno == EMSGSIZE && cmdarg != NULL)
 			ip6_notify_pmtu(inp, sa6_dst, *(uint32_t *)cmdarg);
 
 		/*
 		 * Detect if we should notify the error. If no source and
 		 * destination ports are specified, but non-zero flowinfo and
 		 * local address match, notify the error. This is the case
 		 * when the error is delivered with an encrypted buffer
 		 * by ESP. Otherwise, just compare addresses and ports
 		 * as usual.
 		 */
 		if (lport == 0 && fport == 0 && flowinfo &&
 		    inp->inp_socket != NULL &&
 		    flowinfo == (inp->inp_flow & IPV6_FLOWLABEL_MASK) &&
 		    IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, &sa6_src.sin6_addr))
 			goto do_notify;
 		/* Skip the pcb if any requested field fails to match. */
 		else if (!IN6_ARE_ADDR_EQUAL(&inp->in6p_faddr,
 					     &sa6_dst->sin6_addr) ||
 			 inp->inp_socket == 0 ||
 			 (lport && inp->inp_lport != lport) ||
 			 (!IN6_IS_ADDR_UNSPECIFIED(&sa6_src.sin6_addr) &&
 			  !IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr,
 					      &sa6_src.sin6_addr)) ||
 			 (fport && inp->inp_fport != fport)) {
 			continue;
 		}
 
 	  do_notify:
 		if (notify)
 			(*notify)(inp, errno);
 	}
 }
 
 /*
  * Lookup a PCB based on the local address and port.  Caller must hold the
  * hash lock.  No inpcb locks or references are acquired.
  *
  * Without INPLOOKUP_WILDCARD only an IPv6 pcb with an unspecified foreign
  * address and exactly this laddr/lport is returned.  With it, the pcbs on
  * the port are scored and the one needing the fewest wildcards is returned.
  * A non-NULL cred restricts the result to pcbs whose prison compares equal
  * under prison_equal_ip6().  Returns NULL if nothing matches.
  */
 struct inpcb *
 in6_pcblookup_local(struct inpcbinfo *pcbinfo, struct in6_addr *laddr,
     u_short lport, int lookupflags, struct ucred *cred)
 {
 	struct inpcb *inp;
 	/* 3 is above the highest wildcard score (2) a pcb can get below. */
 	int matchwild = 3, wildcard;
 
 	KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0,
 	    ("%s: invalid lookup flags %d", __func__, lookupflags));
 
 	INP_HASH_LOCK_ASSERT(pcbinfo);
 
 	if ((lookupflags & INPLOOKUP_WILDCARD) == 0) {
 		struct inpcbhead *head;
 		/*
 		 * Look for an unconnected (wildcard foreign addr) PCB that
 		 * matches the local address and port we're looking for.
 		 */
 		head = &pcbinfo->ipi_hashbase[INP_PCBHASH_WILD(lport,
 		    pcbinfo->ipi_hashmask)];
 		CK_LIST_FOREACH(inp, head, inp_hash) {
 			/* XXX inp locking */
 			if ((inp->inp_vflag & INP_IPV6) == 0)
 				continue;
 			if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr) &&
 			    IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, laddr) &&
 			    inp->inp_lport == lport) {
 				/* Found. */
 				if (cred == NULL ||
 				    prison_equal_ip6(cred->cr_prison,
 					inp->inp_cred->cr_prison))
 					return (inp);
 			}
 		}
 		/*
 		 * Not found.
 		 */
 		return (NULL);
 	} else {
 		struct inpcbporthead *porthash;
 		struct inpcbport *phd;
 		struct inpcb *match = NULL;
 		/*
 		 * Best fit PCB lookup.
 		 *
 		 * First see if this local port is in use by looking on the
 		 * port hash list.
 		 */
 		porthash = &pcbinfo->ipi_porthashbase[INP_PCBPORTHASH(lport,
 		    pcbinfo->ipi_porthashmask)];
 		CK_LIST_FOREACH(phd, porthash, phd_hash) {
 			if (phd->phd_port == lport)
 				break;
 		}
 		if (phd != NULL) {
 			/*
 			 * Port is in use by one or more PCBs. Look for best
 			 * fit.
 			 */
 			CK_LIST_FOREACH(inp, &phd->phd_pcblist, inp_portlist) {
 				/*
 				 * wildcard counts the mismatches that are
 				 * tolerated: a connected pcb, and exactly one
 				 * of the two local addresses being
 				 * unspecified.
 				 */
 				wildcard = 0;
 				if (cred != NULL &&
 				    !prison_equal_ip6(cred->cr_prison,
 					inp->inp_cred->cr_prison))
 					continue;
 				/* XXX inp locking */
 				if ((inp->inp_vflag & INP_IPV6) == 0)
 					continue;
 				if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr))
 					wildcard++;
 				if (!IN6_IS_ADDR_UNSPECIFIED(
 					&inp->in6p_laddr)) {
 					if (IN6_IS_ADDR_UNSPECIFIED(laddr))
 						wildcard++;
 					else if (!IN6_ARE_ADDR_EQUAL(
 					    &inp->in6p_laddr, laddr))
 						continue;
 				} else {
 					if (!IN6_IS_ADDR_UNSPECIFIED(laddr))
 						wildcard++;
 				}
 				/* Keep the best candidate; 0 is a perfect fit. */
 				if (wildcard < matchwild) {
 					match = inp;
 					matchwild = wildcard;
 					if (matchwild == 0)
 						break;
 				}
 			}
 		}
 		return (match);
 	}
 }
 
 /*
  * Iterator filter: accept only IPv6 pcbs that have multicast options
  * attached (in6p_moptions is not NULL).
  */
 static bool
 in6_multi_match(const struct inpcb *inp, void *v __unused)
 {
 
 	return ((inp->inp_vflag & INP_IPV6) != 0 &&
 	    inp->in6p_moptions != NULL);
 }
 
 /*
  * Remove every reference to ifp from the multicast options of the IPv6
  * pcbs of pcbinfo: the outgoing multicast interface is unselected and all
  * group memberships joined through ifp are dropped.  Only pcbs accepted by
  * in6_multi_match() are visited, each read-locked by the iterator.  The
  * IN6_MULTI lock must be held by the caller.
  */
 void
 in6_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp)
 {
 	struct inpcb_iterator inpi = INP_ITERATOR(pcbinfo, INPLOOKUP_RLOCKPCB,
 	    in6_multi_match, NULL);
 	struct inpcb *inp;
 	struct in6_multi *inm;
 	struct in6_mfilter *imf;
 	struct ip6_moptions *im6o;
 
 	IN6_MULTI_LOCK_ASSERT();
 
 	while ((inp = inp_next(&inpi)) != NULL) {
 		INP_RLOCK_ASSERT(inp);
 
 		/* Non-NULL here: the iterator filter checked in6p_moptions. */
 		im6o = inp->in6p_moptions;
 		/*
 		 * Unselect the outgoing ifp for multicast if it
 		 * is being detached.
 		 */
 		if (im6o->im6o_multicast_ifp == ifp)
 			im6o->im6o_multicast_ifp = NULL;
 		/*
 		 * Drop multicast group membership if we joined
 		 * through the interface being detached.
 		 */
 restart:
 		IP6_MFILTER_FOREACH(imf, &im6o->im6o_head) {
 			if ((inm = imf->im6f_in6m) == NULL)
 				continue;
 			if (inm->in6m_ifp != ifp)
 				continue;
 			ip6_mfilter_remove(&im6o->im6o_head, imf);
 			in6_leavegroup_locked(inm, NULL);
 			ip6_mfilter_free(imf);
 			/* The list was modified; rescan it from the head. */
 			goto restart;
 		}
 	}
 }
 
 /*
  * Check for alternatives when a higher level complains
  * about service problems.  For now, this only invalidates the
  * routing information cached in the pcb (inp_route6).  If the route
  * was created dynamically (by a redirect), it is time to try a
  * default gateway again.
  */
 void
 in6_losing(struct inpcb *inp)
 {
 
 	RO_INVALIDATE_CACHE(&inp->inp_route6);
 }
 
 /*
  * After a routing change, drop the route cached in the pcb so that a
  * (hopefully) better one gets allocated later.  The error argument is
  * unused.  Returns inp unchanged.
  */
 struct inpcb *
 in6_rtchange(struct inpcb *inp, int errno __unused)
 {
 
 	RO_INVALIDATE_CACHE(&inp->inp_route6);
 
 	return (inp);
 }
 
 /*
  * Pick a pcb from the load-balancing groups bound to lport.
  *
  * Within a group the member is chosen by hashing (faddr, lport, fport)
  * modulo the group's pcb count.  A group whose local address equals laddr
  * wins immediately when numa_domain is M_NODOM or equals the group's
  * domain; otherwise its pick is remembered as a fallback.  Groups bound to
  * the unspecified address are only considered with INPLOOKUP_WILDCARD and
  * rank below any exact-address pick.  Returns NULL if no group qualifies.
  * The caller must hold the pcbinfo hash lock.
  */
 static struct inpcb *
 in6_pcblookup_lbgroup(const struct inpcbinfo *pcbinfo,
     const struct in6_addr *laddr, uint16_t lport, const struct in6_addr *faddr,
     uint16_t fport, int lookupflags, uint8_t numa_domain)
 {
 	struct inpcb *local_wild, *numa_wild;
 	const struct inpcblbgrouphead *hdr;
 	struct inpcblbgroup *grp;
 	uint32_t idx;
 
 	INP_HASH_LOCK_ASSERT(pcbinfo);
 
 	hdr = &pcbinfo->ipi_lbgrouphashbase[
 	    INP_PCBPORTHASH(lport, pcbinfo->ipi_lbgrouphashmask)];
 
 	/*
 	 * Order of socket selection:
 	 * 1. non-wild.
 	 * 2. wild (if lookupflags contains INPLOOKUP_WILDCARD).
 	 *
 	 * NOTE:
 	 * - Load balanced group does not contain jailed sockets.
 	 * - Load balanced does not contain IPv4 mapped INET6 wild sockets.
 	 */
 	local_wild = NULL;
 	numa_wild = NULL;
 	CK_LIST_FOREACH(grp, hdr, il_list) {
 #ifdef INET
 		if (!(grp->il_vflag & INP_IPV6))
 			continue;
 #endif
 		if (grp->il_lport != lport)
 			continue;
 
 		/* Select the member of this group for the given flow. */
 		idx = INP6_PCBLBGROUP_PKTHASH(faddr, lport, fport) %
 		    grp->il_inpcnt;
 		if (IN6_ARE_ADDR_EQUAL(&grp->il6_laddr, laddr)) {
 			if (numa_domain == M_NODOM ||
 			    grp->il_numa_domain == numa_domain) {
 				return (grp->il_inp[idx]);
 			}
 			else
 				/* Exact address, different NUMA domain. */
 				numa_wild = grp->il_inp[idx];
 		}
 		/*
 		 * Wildcard-address group: take the first one seen, and let a
 		 * later one replace it when no domain was requested or its
 		 * domain matches.
 		 */
 		if (IN6_IS_ADDR_UNSPECIFIED(&grp->il6_laddr) &&
 		    (lookupflags & INPLOOKUP_WILDCARD) != 0 &&
 		    (local_wild == NULL || numa_domain == M_NODOM ||
 			grp->il_numa_domain == numa_domain)) {
 			local_wild = grp->il_inp[idx];
 		}
 	}
 	if (numa_wild != NULL)
 		return (numa_wild);
 	return (local_wild);
 }
 
 /*
  * Lookup PCB in hash list.  Used in in_pcb.c as well as here.
  *
  * The search runs in three stages: an exact 4-tuple match, then (with
  * INPLOOKUP_WILDCARD) the load-balancing groups, then (with
  * INPLOOKUP_WILDCARD) unconnected pcbs bound to lport.  Returns the pcb
  * found or NULL; no inpcb lock is taken here.  The caller must hold the
  * pcbinfo hash lock.  The ifp argument is not referenced in this body.
  */
 struct inpcb *
 in6_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in6_addr *faddr,
     u_int fport_arg, struct in6_addr *laddr, u_int lport_arg,
     int lookupflags, struct ifnet *ifp, uint8_t numa_domain)
 {
 	struct inpcbhead *head;
 	struct inpcb *inp, *tmpinp;
 	u_short fport = fport_arg, lport = lport_arg;
 
 	KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0,
 	    ("%s: invalid lookup flags %d", __func__, lookupflags));
 
 	INP_HASH_LOCK_ASSERT(pcbinfo);
 
 	/*
 	 * First look for an exact match.
 	 */
 	tmpinp = NULL;
 	head = &pcbinfo->ipi_hashbase[INP6_PCBHASH(faddr, lport, fport,
 	    pcbinfo->ipi_hashmask)];
 	CK_LIST_FOREACH(inp, head, inp_hash) {
 		/* XXX inp locking */
 		if ((inp->inp_vflag & INP_IPV6) == 0)
 			continue;
 		if (IN6_ARE_ADDR_EQUAL(&inp->in6p_faddr, faddr) &&
 		    IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, laddr) &&
 		    inp->inp_fport == fport &&
 		    inp->inp_lport == lport) {
 			/*
 			 * XXX We should be able to directly return
 			 * the inp here, without any checks.
 			 * Well unless both bound with SO_REUSEPORT?
 			 */
 			/*
 			 * A pcb whose credential has PR_IP6 set is returned
 			 * at once; otherwise the first match is remembered.
 			 */
 			if (prison_flag(inp->inp_cred, PR_IP6))
 				return (inp);
 			if (tmpinp == NULL)
 				tmpinp = inp;
 		}
 	}
 	if (tmpinp != NULL)
 		return (tmpinp);
 
 	/*
 	 * Then look in lb group (for wildcard match).
 	 */
 	if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
 		inp = in6_pcblookup_lbgroup(pcbinfo, laddr, lport, faddr,
 		    fport, lookupflags, numa_domain);
 		if (inp != NULL)
 			return (inp);
 	}
 
 	/*
 	 * Then look for a wildcard match, if requested.
 	 */
 	if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
 		struct inpcb *local_wild = NULL, *local_exact = NULL;
 		struct inpcb *jail_wild = NULL;
 		int injail;
 
 		/*
 		 * Order of socket selection - we always prefer jails.
 		 *      1. jailed, non-wild.
 		 *      2. jailed, wild.
 		 *      3. non-jailed, non-wild.
 		 *      4. non-jailed, wild.
 		 */
 		head = &pcbinfo->ipi_hashbase[INP_PCBHASH_WILD(lport,
 		    pcbinfo->ipi_hashmask)];
 		CK_LIST_FOREACH(inp, head, inp_hash) {
 			/* XXX inp locking */
 			if ((inp->inp_vflag & INP_IPV6) == 0)
 				continue;
 
 			if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr) ||
 			    inp->inp_lport != lport) {
 				continue;
 			}
 
 			injail = prison_flag(inp->inp_cred, PR_IP6);
 			if (injail) {
 				/* Skip jailed pcbs whose prison rejects laddr. */
 				if (prison_check_ip6_locked(
 				    inp->inp_cred->cr_prison, laddr) != 0)
 					continue;
 			} else {
 				/*
 				 * A non-jailed exact match was already found;
 				 * only jailed pcbs can still improve on it.
 				 */
 				if (local_exact != NULL)
 					continue;
 			}
 
 			if (IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, laddr)) {
 				if (injail)
 					return (inp);
 				else
 					local_exact = inp;
 			} else if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) {
 				if (injail)
 					jail_wild = inp;
 				else
 					local_wild = inp;
 			}
 		} /* LIST_FOREACH */
 
 		if (jail_wild != NULL)
 			return (jail_wild);
 		if (local_exact != NULL)
 			return (local_exact);
 		if (local_wild != NULL)
 			return (local_wild);
 	} /* if ((lookupflags & INPLOOKUP_WILDCARD) != 0) */
 
 	/*
 	 * Not found.
 	 */
 	return (NULL);
 }
 
 /*
  * Lookup PCB in hash list, using pcbinfo tables.  The lookup runs inside
  * the pcbinfo SMR section and the pcb found is handed to inp_smr_lock()
  * with the lock bits from lookupflags (i.e., INPLOOKUP_LOCKPCB is
  * required).  Returns NULL when nothing is found or the pcb could not be
  * locked.
  */
 static struct inpcb *
 in6_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in6_addr *faddr,
     u_int fport, struct in6_addr *laddr, u_int lport, int lookupflags,
     struct ifnet *ifp, uint8_t numa_domain)
 {
 	struct inpcb *found;
 
 	smr_enter(pcbinfo->ipi_smr);
 	found = in6_pcblookup_hash_locked(pcbinfo, faddr, fport, laddr, lport,
 	    lookupflags & INPLOOKUP_WILDCARD, ifp, numa_domain);
 	if (found == NULL) {
 		/* No match: leave the SMR section ourselves. */
 		smr_exit(pcbinfo->ipi_smr);
 		return (NULL);
 	}
 
 	if (__predict_false(!inp_smr_lock(found,
 	    (lookupflags & INPLOOKUP_LOCKMASK))))
 		return (NULL);
 
 	return (found);
 }
 
 /*
  * Public inpcb lookup routine accepting a 4-tuple.  lookupflags must stay
  * within INPLOOKUP_MASK and must request a read or write lock on the pcb.
  * No NUMA domain preference is applied (M_NODOM).
  */
 struct inpcb *
 in6_pcblookup(struct inpcbinfo *pcbinfo, struct in6_addr *faddr, u_int fport,
     struct in6_addr *laddr, u_int lport, int lookupflags, struct ifnet *ifp)
 {
 	struct inpcb *inp;
 
 	KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0,
 	    ("%s: invalid lookup flags %d", __func__, lookupflags));
 	KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0,
 	    ("%s: LOCKPCB not set", __func__));
 
 	inp = in6_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport,
 	    lookupflags, ifp, M_NODOM);
 	return (inp);
 }
 
 /*
  * Same as in6_pcblookup(), except that the NUMA domain recorded in the
  * packet header of m is passed to the hash lookup.  m is dereferenced
  * unconditionally.
  */
 struct inpcb *
 in6_pcblookup_mbuf(struct inpcbinfo *pcbinfo, struct in6_addr *faddr,
     u_int fport, struct in6_addr *laddr, u_int lport, int lookupflags,
     struct ifnet *ifp, struct mbuf *m)
 {
 	struct inpcb *inp;
 
 	KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0,
 	    ("%s: invalid lookup flags %d", __func__, lookupflags));
 	KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0,
 	    ("%s: LOCKPCB not set", __func__));
 
 	inp = in6_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport,
 	    lookupflags, ifp, m->m_pkthdr.numa_domain);
 	return (inp);
 }
 
 /*
  * Fill *sin6 from the IPv6 header at the start of m: the destination
  * address when srcordst is non-zero, the source address otherwise.  All
  * other fields are zeroed and the scope is recovered with
  * sa6_recoverscope().
  */
 void
 init_sin6(struct sockaddr_in6 *sin6, struct mbuf *m, int srcordst)
 {
 	struct ip6_hdr *hdr = mtod(m, struct ip6_hdr *);
 
 	memset(sin6, 0, sizeof(*sin6));
 	sin6->sin6_family = AF_INET6;
 	sin6->sin6_len = sizeof(*sin6);
 	if (srcordst)
 		sin6->sin6_addr = hdr->ip6_dst;
 	else
 		sin6->sin6_addr = hdr->ip6_src;
 
 	(void)sa6_recoverscope(sin6); /* XXX: should catch errors... */
 }
diff --git a/sys/netinet6/ip6_output.c b/sys/netinet6/ip6_output.c
index b976b0583515..f00f718445f5 100644
--- a/sys/netinet6/ip6_output.c
+++ b/sys/netinet6/ip6_output.c
@@ -1,3387 +1,3387 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the project nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	$KAME: ip6_output.c,v 1.279 2002/01/26 06:12:30 jinmei Exp $
  */
 
 /*-
  * Copyright (c) 1982, 1986, 1988, 1990, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)ip_output.c	8.3 (Berkeley) 1/21/94
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_ipsec.h"
 #include "opt_kern_tls.h"
 #include "opt_ratelimit.h"
 #include "opt_route.h"
 #include "opt_rss.h"
 #include "opt_sctp.h"
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/ktls.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/errno.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/protosw.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/syslog.h>
 #include <sys/ucred.h>
 
 #include <machine/in_cksum.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_vlan_var.h>
 #include <net/if_llatbl.h>
 #include <net/ethernet.h>
 #include <net/netisr.h>
 #include <net/route.h>
 #include <net/route/nhop.h>
 #include <net/pfil.h>
 #include <net/rss_config.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_var.h>
 #include <netinet/ip_var.h>
 #include <netinet6/in6_fib.h>
 #include <netinet6/in6_var.h>
 #include <netinet/ip6.h>
 #include <netinet/icmp6.h>
 #include <netinet6/ip6_var.h>
 #include <netinet/in_pcb.h>
 #include <netinet/tcp_var.h>
 #include <netinet6/nd6.h>
 #include <netinet6/in6_rss.h>
 
 #include <netipsec/ipsec_support.h>
 #if defined(SCTP) || defined(SCTP_SUPPORT)
 #include <netinet/sctp.h>
 #include <netinet/sctp_crc32.h>
 #endif
 
 #include <netinet6/scope6_var.h>
 
 extern int in6_mcast_loop;
 
 /*
  * Per-packet set of mbufs used by ip6_output() while it assembles the
  * extension header chain.  Each member is NULL when the corresponding
  * header is absent (the structure is bzero'ed before use).
  */
 struct ip6_exthdrs {
 	struct mbuf *ip6e_ip6;		/* mbuf holding the IPv6 header after ip6_splithdr() */
 	struct mbuf *ip6e_hbh;		/* hop-by-hop options header */
 	struct mbuf *ip6e_dest1;	/* destination options header, 1st part */
 	struct mbuf *ip6e_rthdr;	/* routing header */
 	struct mbuf *ip6e_dest2;	/* destination options header, 2nd part */
 };
 
 /* Malloc type for this file, described as "IPv6 options". */
 static MALLOC_DEFINE(M_IP6OPT, "ip6opt", "IPv6 options");
 
 /* Prototypes for file-local (static) helpers: packet option handling. */
 static int ip6_pcbopt(int, u_char *, int, struct ip6_pktopts **,
 			   struct ucred *, int);
 static int ip6_pcbopts(struct ip6_pktopts **, struct mbuf *,
 	struct socket *, struct sockopt *);
 static int ip6_getpcbopt(struct inpcb *, int, struct sockopt *);
 static int ip6_setpktopt(int, u_char *, int, struct ip6_pktopts *,
 	struct ucred *, int, int, int);
 
 /* Prototypes for file-local (static) helpers: header, MTU and copy routines. */
 static int ip6_copyexthdr(struct mbuf **, caddr_t, int);
 static int ip6_insertfraghdr(struct mbuf *, struct mbuf *, int,
 	struct ip6_frag **);
 static int ip6_insert_jumboopt(struct ip6_exthdrs *, u_int32_t);
 static int ip6_splithdr(struct mbuf *, struct ip6_exthdrs *);
 static int ip6_getpmtu(struct route_in6 *, int,
 	struct ifnet *, const struct in6_addr *, u_long *, int *, u_int,
 	u_int);
 static int ip6_calcmtu(struct ifnet *, const struct in6_addr *, u_long,
 	u_long *, int *, u_int);
 static int ip6_getpmtu_ctl(u_int, const struct in6_addr *, u_long *);
 static int copypktopts(struct ip6_pktopts *, struct ip6_pktopts *, int);
 
 /*
  * Make an extension header from option data.  hp is the source,
  * mp is the destination (a struct mbuf **), and _ol is the optlen
  * accumulator.
  *
  * Nothing happens when hp is NULL.  Otherwise the header length is derived
  * from the header's own ip6e_len field as (ip6e_len + 1) * 8 bytes and the
  * data is handed to ip6_copyexthdr(); on success the m_len of the mbuf
  * found at *mp is added to _ol.
  *
  * The macro is not self-contained: the expansion site must provide an
  * "error" variable, which receives the ip6_copyexthdr() result, and a
  * "freehdrs" label, which is jumped to when that result is non-zero.
  */
 #define	MAKE_EXTHDR(hp, mp, _ol)					\
     do {								\
 	if (hp) {							\
 		struct ip6_ext *eh = (struct ip6_ext *)(hp);		\
 		error = ip6_copyexthdr((mp), (caddr_t)(hp),		\
 		    ((eh)->ip6e_len + 1) << 3);				\
 		if (error)						\
 			goto freehdrs;					\
 		(_ol) += (*(mp))->m_len;				\
 	}								\
     } while (/*CONSTCOND*/ 0)
 
 /*
  * Form a chain of extension headers.
  * m is the extension header mbuf to insert (nothing is done if it is NULL)
  * mp is the previous mbuf in the chain; m is linked right after it and
  *    mp is then advanced to m
  * p points at the "next header" byte of the preceding header; its old
  *    value is moved into the first byte of m, it is overwritten with i,
  *    and p is then advanced to the first byte of m
  * i is the type of option, i.e. the protocol number identifying m.
  *
  * The macro also uses "hdrsplit" and "exthdrs" from the expansion site and
  * panics if a header is to be inserted while hdrsplit is false.  Both mp
  * and p are assigned to, so they must be modifiable lvalues.
  */
 #define MAKE_CHAIN(m, mp, p, i)\
     do {\
 	if (m) {\
 		if (!hdrsplit) \
 			panic("%s:%d: assumption failed: "\
 			    "hdr not split: hdrsplit %d exthdrs %p",\
 			    __func__, __LINE__, hdrsplit, &exthdrs);\
 		*mtod((m), u_char *) = *(p);\
 		*(p) = (i);\
 		p = mtod((m), u_char *);\
 		(m)->m_next = (mp)->m_next;\
 		(mp)->m_next = (m);\
 		(mp) = (m);\
 	}\
     } while (/*CONSTCOND*/ 0)
 
 /*
  * Compute a checksum that was deferred earlier and store it in the packet.
  *
  * The sum is taken with in_cksum_skip() over offset + plen bytes of m,
  * skipping the first offset bytes.  For packets flagged CSUM_UDP_IPV6 a
  * result of zero is replaced by 0xffff.  The value is written at
  * offset + m->m_pkthdr.csum_data: directly when both bytes lie within the
  * first mbuf, through m_copyback() otherwise.
  */
 void
 in6_delayed_cksum(struct mbuf *m, uint32_t plen, u_short offset)
 {
 	u_short sum;
 
 	sum = in_cksum_skip(m, offset + plen, offset);
 	if ((m->m_pkthdr.csum_flags & CSUM_UDP_IPV6) != 0 && sum == 0)
 		sum = 0xffff;
 
 	/* From here on, offset is the location of the checksum field. */
 	offset += m->m_pkthdr.csum_data;
 
 	if (offset + sizeof(sum) <= m->m_len) {
 		*(u_short *)mtodo(m, offset) = sum;
 		return;
 	}
 	m_copyback(m, offset, sizeof(sum), (caddr_t)&sum);
 }
 
 static void
 ip6_output_delayed_csum(struct mbuf *m, struct ifnet *ifp, int csum_flags,
     int plen, int optlen)
 {
 
 	KASSERT((plen >= optlen), ("%s:%d: plen %d < optlen %d, m %p, ifp %p "
 	    "csum_flags %#x",
 	    __func__, __LINE__, plen, optlen, m, ifp, csum_flags));
 
 	if (csum_flags & CSUM_DELAY_DATA_IPV6) {
 		in6_delayed_cksum(m, plen - optlen,
 		    sizeof(struct ip6_hdr) + optlen);
 		m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA_IPV6;
 	}
 #if defined(SCTP) || defined(SCTP_SUPPORT)
 	if (csum_flags & CSUM_SCTP_IPV6) {
 		sctp_delayed_cksum(m, sizeof(struct ip6_hdr) + optlen);
 		m->m_pkthdr.csum_flags &= ~CSUM_SCTP_IPV6;
 	}
 #endif
 }
 
 /*
  * Build the fragments of packet m0.
  *
  * ifp       - interface whose ifs6_out_fragcreat counter is bumped for
  *             every fragment created.
  * m0        - the original packet; the data from offset hlen up to
  *             m0->m_pkthdr.len is what gets split.
  * hlen      - offset in m0 at which the fragmentable data starts; it is
  *             also passed on to ip6_insertfraghdr().
  * nextproto - value stored in the ip6f_nxt field of every fragment header.
  * fraglen   - amount of data per fragment; must be a multiple of 8.  The
  *             last fragment carries whatever remains.
  * id        - value stored as is in ip6f_ident of every fragment header.
  *
  * Each fragment is a new packet linked through m_nextpkt, the first one
  * being hooked onto m0->m_nextpkt.  Returns 0 on success, ENOBUFS when an
  * mbuf allocation or copy fails, or the error from ip6_insertfraghdr().
  * On failure, fragments that were already linked stay on the m_nextpkt
  * chain of m0; nothing here frees them (presumably the caller's job --
  * verify against the call site).
  */
 int
 ip6_fragment(struct ifnet *ifp, struct mbuf *m0, int hlen, u_char nextproto,
     int fraglen , uint32_t id)
 {
 	struct mbuf *m, **mnext, *m_frgpart;
 	struct ip6_hdr *ip6, *mhip6;
 	struct ip6_frag *ip6f;
 	int off;
 	int error;
 	int tlen = m0->m_pkthdr.len;
 
 	KASSERT((fraglen % 8 == 0), ("Fragment length must be a multiple of 8"));
 
 	m = m0;
 	ip6 = mtod(m, struct ip6_hdr *);
 	/* New fragments are appended starting at m0->m_nextpkt. */
 	mnext = &m->m_nextpkt;
 
 	for (off = hlen; off < tlen; off += fraglen) {
 		m = m_gethdr(M_NOWAIT, MT_DATA);
 		if (!m) {
 			IP6STAT_INC(ip6s_odropped);
 			return (ENOBUFS);
 		}
 
 		/*
 		 * Make sure the complete packet header gets copied
 		 * from the originating mbuf to the newly created
 		 * mbuf. This also ensures that existing firewall
 		 * classification(s), VLAN tags and so on get copied
 		 * to the resulting fragmented packet(s):
 		 */
 		if (m_dup_pkthdr(m, m0, M_NOWAIT) == 0) {
 			/* Not linked into the chain yet, so free it here. */
 			m_free(m);
 			IP6STAT_INC(ip6s_odropped);
 			return (ENOBUFS);
 		}
 
 		/* Link the fragment in before filling it. */
 		*mnext = m;
 		mnext = &m->m_nextpkt;
 		/* Leave max_linkhdr bytes of room in front of the IPv6 header. */
 		m->m_data += max_linkhdr;
 		mhip6 = mtod(m, struct ip6_hdr *);
 		/* Start from a copy of the original IPv6 header. */
 		*mhip6 = *ip6;
 		m->m_len = sizeof(*mhip6);
 		error = ip6_insertfraghdr(m0, m, hlen, &ip6f);
 		if (error) {
 			IP6STAT_INC(ip6s_odropped);
 			return (error);
 		}
 		/* Fragment offset is relative to the start of the split data. */
 		ip6f->ip6f_offlg = htons((u_short)((off - hlen) & ~7));
 		if (off + fraglen >= tlen)
 			/* Last fragment: shrink to the remaining data. */
 			fraglen = tlen - off;
 		else
 			ip6f->ip6f_offlg |= IP6F_MORE_FRAG;
 		/* Payload length excludes the fixed IPv6 header itself. */
 		mhip6->ip6_plen = htons((u_short)(fraglen + hlen +
 		    sizeof(*ip6f) - sizeof(struct ip6_hdr)));
 		if ((m_frgpart = m_copym(m0, off, fraglen, M_NOWAIT)) == NULL) {
 			IP6STAT_INC(ip6s_odropped);
 			return (ENOBUFS);
 		}
 		m_cat(m, m_frgpart);
 		m->m_pkthdr.len = fraglen + hlen + sizeof(*ip6f);
 		ip6f->ip6f_reserved = 0;
 		ip6f->ip6f_ident = id;
 		ip6f->ip6f_nxt = nextproto;
 		IP6STAT_INC(ip6s_ofragments);
 		in6_ifstat_inc(ifp, ifs6_out_fragcreat);
 	}
 
 	return (0);
 }
 
 /*
  * Pass packet m to nd6_output_ifp() for transmission on ifp, attaching a
  * send tag first when one applies.
  *
  * inp       - owning PCB; tested against NULL before the rate limit
  *             lookup.
  * ifp       - transmit interface.
  * origifp   - passed through to nd6_output_ifp().
  * m         - packet to send; must not already carry CSUM_SND_TAG.
  * dst, ro   - passed through to nd6_output_ifp().
  * stamp_tag - whether a send tag found for the packet should be stamped
  *             on the mbuf; forced to true when the packet carries a TLS
  *             session.
  *
  * The send tag comes from the TLS session of m->m_next (KERN_TLS) or,
  * failing that, from inp->inp_snd_tag (RATELIMIT).  If the TLS session
  * has no tag, or the tag to be stamped belongs to an interface other
  * than ifp, the packet is freed and EAGAIN is the error.
  *
  * Returns the nd6_output_ifp() result, or EAGAIN as described above; for
  * TLS packets an EAGAIN is replaced by the ktls_output_eagain() result.
  */
 static int
 ip6_output_send(struct inpcb *inp, struct ifnet *ifp, struct ifnet *origifp,
     struct mbuf *m, struct sockaddr_in6 *dst, struct route_in6 *ro,
     bool stamp_tag)
 {
 #ifdef KERN_TLS
 	struct ktls_session *tls = NULL;
 #endif
 	struct m_snd_tag *mst;
 	int error;
 
 	MPASS((m->m_pkthdr.csum_flags & CSUM_SND_TAG) == 0);
 	mst = NULL;
 
 #ifdef KERN_TLS
 	/*
 	 * If this is an unencrypted TLS record, save a reference to
 	 * the record.  This local reference is used to call
 	 * ktls_output_eagain after the mbuf has been freed (thus
 	 * dropping the mbuf's reference) in if_output.
 	 */
 	if (m->m_next != NULL && mbuf_has_tls_session(m->m_next)) {
 		tls = ktls_hold(m->m_next->m_epg_tls);
 		mst = tls->snd_tag;
 
 		/*
 		 * If a TLS session doesn't have a valid tag, it must
 		 * have had an earlier ifp mismatch, so drop this
 		 * packet.
 		 */
 		if (mst == NULL) {
 			m_freem(m);
 			error = EAGAIN;
 			goto done;
 		}
 		/*
 		 * Always stamp tags that include NIC ktls.
 		 */
 		stamp_tag = true;
 	}
 #endif
 #ifdef RATELIMIT
 	/* Only consult the PCB's tag when TLS did not already supply one. */
 	if (inp != NULL && mst == NULL) {
 		if ((inp->inp_flags2 & INP_RATE_LIMIT_CHANGED) != 0 ||
 		    (inp->inp_snd_tag != NULL &&
 		    inp->inp_snd_tag->ifp != ifp))
 			in_pcboutput_txrtlmt(inp, ifp, m);
 
 		if (inp->inp_snd_tag != NULL)
 			mst = inp->inp_snd_tag;
 	}
 #endif
 	if (stamp_tag && mst != NULL) {
 		KASSERT(m->m_pkthdr.rcvif == NULL,
 		    ("trying to add a send tag to a forwarded packet"));
 		/* A tag bound to another interface cannot be used on ifp. */
 		if (mst->ifp != ifp) {
 			m_freem(m);
 			error = EAGAIN;
 			goto done;
 		}
 
 		/* stamp send tag on mbuf */
 		m->m_pkthdr.snd_tag = m_snd_tag_ref(mst);
 		m->m_pkthdr.csum_flags |= CSUM_SND_TAG;
 	}
 
 	error = nd6_output_ifp(ifp, origifp, m, dst, (struct route *)ro);
 
 done:
 	/* Check for route change invalidating send tags. */
 #ifdef KERN_TLS
 	if (tls != NULL) {
 		if (error == EAGAIN)
 			error = ktls_output_eagain(inp, tls);
 		/* Drop the local reference taken by ktls_hold() above. */
 		ktls_free(tls);
 	}
 #endif
 #ifdef RATELIMIT
 	/*
 	 * NOTE(review): inp is not checked against NULL here, unlike in
 	 * the rate limit block above -- confirm that EAGAIN cannot be
 	 * seen with a NULL inp, or that in_pcboutput_eagain() copes.
 	 */
 	if (error == EAGAIN)
 		in_pcboutput_eagain(inp);
 #endif
 	return (error);
 }
 
 /*
  * IP6 output.
  * The packet in mbuf chain m contains a skeletal IP6 header (with pri, len,
  * nxt, hlim, src, dst).
  * This function may modify ver and hlim only.
  * The mbuf chain containing the packet will be freed.
  * The mbuf opt, if present, will not be freed.
  * If route_in6 ro is present and has ro_nh initialized, route lookup would be
  * skipped and ro->ro_nh would be used. If ro is present but ro->ro_nh is NULL,
  * then result of route lookup is stored in ro->ro_nh.
  *
  * Type of "mtu": rt_mtu is u_long, ifnet.ifr_mtu is int, and nd_ifinfo.linkmtu
  * is uint32_t.  So we use u_long to hold largest one, which is rt_mtu.
  *
  * ifpp - XXX: just for statistics
  */
 int
 ip6_output(struct mbuf *m0, struct ip6_pktopts *opt,
     struct route_in6 *ro, int flags, struct ip6_moptions *im6o,
     struct ifnet **ifpp, struct inpcb *inp)
 {
 	struct ip6_hdr *ip6;
 	struct ifnet *ifp, *origifp;
 	struct mbuf *m = m0;
 	struct mbuf *mprev;
 	struct route_in6 *ro_pmtu;
 	struct nhop_object *nh;
 	struct sockaddr_in6 *dst, sin6, src_sa, dst_sa;
 	struct in6_addr odst;
 	u_char *nexthdrp;
 	int tlen, len;
 	int error = 0;
 	int vlan_pcp = -1;
 	struct in6_ifaddr *ia = NULL;
 	u_long mtu;
 	int alwaysfrag, dontfrag;
 	u_int32_t optlen, plen = 0, unfragpartlen;
 	struct ip6_exthdrs exthdrs;
 	struct in6_addr src0, dst0;
 	u_int32_t zone;
 	bool hdrsplit;
 	int sw_csum, tso;
 	int needfiblookup;
 	uint32_t fibnum;
 	struct m_tag *fwd_tag = NULL;
 	uint32_t id;
 
 	NET_EPOCH_ASSERT();
 
 	if (inp != NULL) {
 		INP_LOCK_ASSERT(inp);
 		M_SETFIB(m, inp->inp_inc.inc_fibnum);
 		if ((flags & IP_NODEFAULTFLOWID) == 0) {
 			/* Unconditionally set flowid. */
 			m->m_pkthdr.flowid = inp->inp_flowid;
 			M_HASHTYPE_SET(m, inp->inp_flowtype);
 		}
 		if ((inp->inp_flags2 & INP_2PCP_SET) != 0)
 			vlan_pcp = (inp->inp_flags2 & INP_2PCP_MASK) >>
 			    INP_2PCP_SHIFT;
 #ifdef NUMA
 		m->m_pkthdr.numa_domain = inp->inp_numa_domain;
 #endif
 	}
 
 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
 	/*
 	 * IPSec checking which handles several cases.
 	 * FAST IPSEC: We re-injected the packet.
 	 * XXX: need scope argument.
 	 */
 	if (IPSEC_ENABLED(ipv6)) {
 		if ((error = IPSEC_OUTPUT(ipv6, m, inp)) != 0) {
 			if (error == EINPROGRESS)
 				error = 0;
 			goto done;
 		}
 	}
 #endif /* IPSEC */
 
 	/* Source address validation. */
 	ip6 = mtod(m, struct ip6_hdr *);
 	if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src) &&
 	    (flags & IPV6_UNSPECSRC) == 0) {
 		error = EOPNOTSUPP;
 		IP6STAT_INC(ip6s_badscope);
 		goto bad;
 	}
 	if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) {
 		error = EOPNOTSUPP;
 		IP6STAT_INC(ip6s_badscope);
 		goto bad;
 	}
 
 	/*
 	 * If we are given packet options to add extension headers prepare them.
 	 * Calculate the total length of the extension header chain.
 	 * Keep the length of the unfragmentable part for fragmentation.
 	 */
 	bzero(&exthdrs, sizeof(exthdrs));
 	optlen = 0;
 	unfragpartlen = sizeof(struct ip6_hdr);
 	if (opt) {
 		/* Hop-by-Hop options header. */
 		MAKE_EXTHDR(opt->ip6po_hbh, &exthdrs.ip6e_hbh, optlen);
 
 		/* Destination options header (1st part). */
 		if (opt->ip6po_rthdr) {
 #ifndef RTHDR_SUPPORT_IMPLEMENTED
 			/*
 			 * If there is a routing header, discard the packet
 			 * right away here. RH0/1 are obsolete and we do not
 			 * currently support RH2/3/4.
 			 * People trying to use RH253/254 may want to disable
 			 * this check.
 			 * The moment we do support any routing header (again)
 			 * this block should check the routing type more
 			 * selectively.
 			 */
 			error = EINVAL;
 			goto bad;
 #endif
 
 			/*
 			 * Destination options header (1st part).
 			 * This only makes sense with a routing header.
 			 * See Section 9.2 of RFC 3542.
 			 * Disabling this part just for MIP6 convenience is
 			 * a bad idea.  We need to think carefully about a
 			 * way to make the advanced API coexist with MIP6
 			 * options, which might automatically be inserted in
 			 * the kernel.
 			 */
 			MAKE_EXTHDR(opt->ip6po_dest1, &exthdrs.ip6e_dest1,
 			    optlen);
 		}
 		/* Routing header. */
 		MAKE_EXTHDR(opt->ip6po_rthdr, &exthdrs.ip6e_rthdr, optlen);
 
 		unfragpartlen += optlen;
 
 		/*
 		 * NOTE: we don't add AH/ESP length here (done in
 		 * ip6_ipsec_output()).
 		 */
 
 		/* Destination options header (2nd part). */
 		MAKE_EXTHDR(opt->ip6po_dest2, &exthdrs.ip6e_dest2, optlen);
 	}
 
 	/*
 	 * If there is at least one extension header,
 	 * separate IP6 header from the payload.
 	 */
 	hdrsplit = false;
 	if (optlen) {
 		if ((error = ip6_splithdr(m, &exthdrs)) != 0) {
 			m = NULL;
 			goto freehdrs;
 		}
 		m = exthdrs.ip6e_ip6;
 		ip6 = mtod(m, struct ip6_hdr *);
 		hdrsplit = true;
 	}
 
 	/* Adjust mbuf packet header length. */
 	m->m_pkthdr.len += optlen;
 	plen = m->m_pkthdr.len - sizeof(*ip6);
 
 	/* If this is a jumbo payload, insert a jumbo payload option. */
 	if (plen > IPV6_MAXPACKET) {
 		if (!hdrsplit) {
 			if ((error = ip6_splithdr(m, &exthdrs)) != 0) {
 				m = NULL;
 				goto freehdrs;
 			}
 			m = exthdrs.ip6e_ip6;
 			ip6 = mtod(m, struct ip6_hdr *);
 			hdrsplit = true;
 		}
 		if ((error = ip6_insert_jumboopt(&exthdrs, plen)) != 0)
 			goto freehdrs;
 		ip6->ip6_plen = 0;
 	} else
 		ip6->ip6_plen = htons(plen);
 	nexthdrp = &ip6->ip6_nxt;
 
 	if (optlen) {
 		/*
 		 * Concatenate headers and fill in next header fields.
 		 * Here we have, on "m"
 		 *	IPv6 payload
 		 * and we insert headers accordingly.
 		 * Finally, we should be getting:
 		 *	IPv6 hbh dest1 rthdr ah* [esp* dest2 payload].
 		 *
 		 * During the header composing process "m" points to IPv6
 		 * header.  "mprev" points to an extension header prior to esp.
 		 */
 		mprev = m;
 
 		/*
 		 * We treat dest2 specially.  This makes IPsec processing
 		 * much easier.  The goal here is to make mprev point the
 		 * mbuf prior to dest2.
 		 *
 		 * Result: IPv6 dest2 payload.
 		 * m and mprev will point to IPv6 header.
 		 */
 		if (exthdrs.ip6e_dest2) {
 			if (!hdrsplit)
 				panic("%s:%d: assumption failed: "
 				    "hdr not split: hdrsplit %d exthdrs %p",
 				    __func__, __LINE__, hdrsplit, &exthdrs);
 			exthdrs.ip6e_dest2->m_next = m->m_next;
 			m->m_next = exthdrs.ip6e_dest2;
 			*mtod(exthdrs.ip6e_dest2, u_char *) = ip6->ip6_nxt;
 			ip6->ip6_nxt = IPPROTO_DSTOPTS;
 		}
 
 		/*
 		 * Result: IPv6 hbh dest1 rthdr dest2 payload.
 		 * m will point to IPv6 header.  mprev will point to the
 		 * extension header prior to dest2 (rthdr in the above case).
 		 */
 		MAKE_CHAIN(exthdrs.ip6e_hbh, mprev, nexthdrp, IPPROTO_HOPOPTS);
 		MAKE_CHAIN(exthdrs.ip6e_dest1, mprev, nexthdrp,
 			   IPPROTO_DSTOPTS);
 		MAKE_CHAIN(exthdrs.ip6e_rthdr, mprev, nexthdrp,
 			   IPPROTO_ROUTING);
 	}
 
 	IP6STAT_INC(ip6s_localout);
 
 	/* Route packet. */
 	ro_pmtu = ro;
 	if (opt && opt->ip6po_rthdr)
 		ro = &opt->ip6po_route;
 	if (ro != NULL)
 		dst = (struct sockaddr_in6 *)&ro->ro_dst;
 	else
 		dst = &sin6;
 	fibnum = (inp != NULL) ? inp->inp_inc.inc_fibnum : M_GETFIB(m);
 
 again:
 	/*
 	 * If specified, try to fill in the traffic class field.
 	 * Do not override if a non-zero value is already set.
 	 * We check the diffserv field and the ECN field separately.
 	 */
 	if (opt && opt->ip6po_tclass >= 0) {
 		int mask = 0;
 
 		if (IPV6_DSCP(ip6) == 0)
 			mask |= 0xfc;
 		if (IPV6_ECN(ip6) == 0)
 			mask |= 0x03;
 		if (mask != 0)
 			ip6->ip6_flow |= htonl((opt->ip6po_tclass & mask) << 20);
 	}
 
 	/* Fill in or override the hop limit field, if necessary. */
 	if (opt && opt->ip6po_hlim != -1)
 		ip6->ip6_hlim = opt->ip6po_hlim & 0xff;
 	else if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
 		if (im6o != NULL)
 			ip6->ip6_hlim = im6o->im6o_multicast_hlim;
 		else
 			ip6->ip6_hlim = V_ip6_defmcasthlim;
 	}
 
 	if (ro == NULL || ro->ro_nh == NULL) {
 		bzero(dst, sizeof(*dst));
 		dst->sin6_family = AF_INET6;
 		dst->sin6_len = sizeof(*dst);
 		dst->sin6_addr = ip6->ip6_dst;
 	} 
 	/*
 	 * Validate route against routing table changes.
 	 * Make sure that the address family is set in route.
 	 */
 	nh = NULL;
 	ifp = NULL;
 	mtu = 0;
 	if (ro != NULL) {
 		if (ro->ro_nh != NULL && inp != NULL) {
 			ro->ro_dst.sin6_family = AF_INET6; /* XXX KASSERT? */
 			NH_VALIDATE((struct route *)ro, &inp->inp_rt_cookie,
 			    fibnum);
 		}
 		if (ro->ro_nh != NULL && fwd_tag == NULL &&
 		    (!NH_IS_VALID(ro->ro_nh) ||
 		    ro->ro_dst.sin6_family != AF_INET6 ||
 		    !IN6_ARE_ADDR_EQUAL(&ro->ro_dst.sin6_addr, &ip6->ip6_dst)))
 			RO_INVALIDATE_CACHE(ro);
 
 		if (ro->ro_nh != NULL && fwd_tag == NULL &&
 		    ro->ro_dst.sin6_family == AF_INET6 &&
 		    IN6_ARE_ADDR_EQUAL(&ro->ro_dst.sin6_addr, &ip6->ip6_dst)) {
 			/* Nexthop is valid and contains valid ifp */
 			nh = ro->ro_nh;
 		} else {
 			if (ro->ro_lle)
 				LLE_FREE(ro->ro_lle);	/* zeros ro_lle */
 			ro->ro_lle = NULL;
 			if (fwd_tag == NULL) {
 				bzero(&dst_sa, sizeof(dst_sa));
 				dst_sa.sin6_family = AF_INET6;
 				dst_sa.sin6_len = sizeof(dst_sa);
 				dst_sa.sin6_addr = ip6->ip6_dst;
 			}
 			error = in6_selectroute(&dst_sa, opt, im6o, ro, &ifp,
 			    &nh, fibnum, m->m_pkthdr.flowid);
 			if (error != 0) {
 				IP6STAT_INC(ip6s_noroute);
 				if (ifp != NULL)
 					in6_ifstat_inc(ifp, ifs6_out_discard);
 				goto bad;
 			}
 			/*
 			 * At this point at least @ifp is not NULL
 			 * Can be the case when dst is multicast, link-local or
 			 * interface is explicitly specified by the caller.
 			 */
 		}
 		if (nh == NULL) {
 			/*
 			 * If in6_selectroute() does not return a nexthop
 			 * dst may not have been updated.
 			 */
 			*dst = dst_sa;	/* XXX */
 			origifp = ifp;
 			mtu = ifp->if_mtu;
 		} else {
 			ifp = nh->nh_ifp;
 			origifp = nh->nh_aifp;
 			ia = (struct in6_ifaddr *)(nh->nh_ifa);
 			counter_u64_add(nh->nh_pksent, 1);
 		}
 	} else {
 		struct nhop_object *nh;
 		struct in6_addr kdst;
 		uint32_t scopeid;
 
 		if (fwd_tag == NULL) {
 			bzero(&dst_sa, sizeof(dst_sa));
 			dst_sa.sin6_family = AF_INET6;
 			dst_sa.sin6_len = sizeof(dst_sa);
 			dst_sa.sin6_addr = ip6->ip6_dst;
 		}
 
 		if (IN6_IS_ADDR_MULTICAST(&dst_sa.sin6_addr) &&
 		    im6o != NULL &&
 		    (ifp = im6o->im6o_multicast_ifp) != NULL) {
 			/* We do not need a route lookup. */
 			*dst = dst_sa;	/* XXX */
 			origifp = ifp;
 			goto nonh6lookup;
 		}
 
 		in6_splitscope(&dst_sa.sin6_addr, &kdst, &scopeid);
 
 		if (IN6_IS_ADDR_MC_LINKLOCAL(&dst_sa.sin6_addr) ||
 		    IN6_IS_ADDR_MC_NODELOCAL(&dst_sa.sin6_addr)) {
 			if (scopeid > 0) {
 				ifp = in6_getlinkifnet(scopeid);
 				if (ifp == NULL) {
 					error = EHOSTUNREACH;
 					goto bad;
 				}
 				*dst = dst_sa;	/* XXX */
 				origifp = ifp;
 				goto nonh6lookup;
 			}
 		}
 
 		nh = fib6_lookup(fibnum, &kdst, scopeid, NHR_NONE,
 		    m->m_pkthdr.flowid);
 		if (nh == NULL) {
 			IP6STAT_INC(ip6s_noroute);
 			/* No ifp in6_ifstat_inc(ifp, ifs6_out_discard); */
 			error = EHOSTUNREACH;;
 			goto bad;
 		}
 
 		ifp = nh->nh_ifp;
 		origifp = nh->nh_aifp;
 		ia = ifatoia6(nh->nh_ifa);
 		if (nh->nh_flags & NHF_GATEWAY)
 			dst->sin6_addr = nh->gw6_sa.sin6_addr;
 		else if (fwd_tag != NULL)
 			dst->sin6_addr = dst_sa.sin6_addr;
 nonh6lookup:
 		;
 	}
 	/*
 	 * At this point ifp MUST be pointing to the valid transmit ifp.
 	 * origifp MUST be valid and pointing to either the same ifp or,
 	 * in case of loopback output, to the interface which ip6_src
 	 * belongs to.
 	 * Examples:
 	 *  fe80::1%em0 -> fe80::2%em0 -> ifp=em0, origifp=em0
 	 *  fe80::1%em0 -> fe80::1%em0 -> ifp=lo0, origifp=em0
 	 *  ::1 -> ::1 -> ifp=lo0, origifp=lo0
 	 *
 	 * mtu can be 0 and will be refined later.
 	 */
 	KASSERT((ifp != NULL), ("output interface must not be NULL"));
 	KASSERT((origifp != NULL), ("output address interface must not be NULL"));
 
 	if ((flags & IPV6_FORWARDING) == 0) {
 		/* XXX: the FORWARDING flag can be set for mrouting. */
 		in6_ifstat_inc(ifp, ifs6_out_request);
 	}
 
 	/* Setup data structures for scope ID checks. */
 	src0 = ip6->ip6_src;
 	bzero(&src_sa, sizeof(src_sa));
 	src_sa.sin6_family = AF_INET6;
 	src_sa.sin6_len = sizeof(src_sa);
 	src_sa.sin6_addr = ip6->ip6_src;
 
 	dst0 = ip6->ip6_dst;
 	/* Re-initialize to be sure. */
 	bzero(&dst_sa, sizeof(dst_sa));
 	dst_sa.sin6_family = AF_INET6;
 	dst_sa.sin6_len = sizeof(dst_sa);
 	dst_sa.sin6_addr = ip6->ip6_dst;
 
 	/* Check for valid scope ID. */
 	if (in6_setscope(&src0, origifp, &zone) == 0 &&
 	    sa6_recoverscope(&src_sa) == 0 && zone == src_sa.sin6_scope_id &&
 	    in6_setscope(&dst0, origifp, &zone) == 0 &&
 	    sa6_recoverscope(&dst_sa) == 0 && zone == dst_sa.sin6_scope_id) {
 		/*
 		 * The outgoing interface is in the zone of the source
 		 * and destination addresses.
 		 *
 		 */
 	} else if ((origifp->if_flags & IFF_LOOPBACK) == 0 ||
 	    sa6_recoverscope(&src_sa) != 0 ||
 	    sa6_recoverscope(&dst_sa) != 0 ||
 	    dst_sa.sin6_scope_id == 0 ||
 	    (src_sa.sin6_scope_id != 0 &&
 	    src_sa.sin6_scope_id != dst_sa.sin6_scope_id) ||
 	    ifnet_byindex(dst_sa.sin6_scope_id) == NULL) {
 		/*
 		 * If the destination network interface is not a
 		 * loopback interface, or the destination network
 		 * address has no scope ID, or the source address has
 		 * a scope ID set which is different from the
 		 * destination address one, or there is no network
 		 * interface representing this scope ID, the address
 		 * pair is considered invalid.
 		 */
 		IP6STAT_INC(ip6s_badscope);
 		in6_ifstat_inc(origifp, ifs6_out_discard);
 		if (error == 0)
 			error = EHOSTUNREACH; /* XXX */
 		goto bad;
 	}
 	/* All scope ID checks are successful. */
 
 	if (nh && !IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
 		if (opt && opt->ip6po_nextroute.ro_nh) {
 			/*
 			 * The nexthop is explicitly specified by the
 			 * application.  We assume the next hop is an IPv6
 			 * address.
 			 */
 			dst = (struct sockaddr_in6 *)opt->ip6po_nexthop;
 		}
 		else if ((nh->nh_flags & NHF_GATEWAY))
 			dst = &nh->gw6_sa;
 	}
 
 	if (!IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
 		m->m_flags &= ~(M_BCAST | M_MCAST); /* Just in case. */
 	} else {
 		m->m_flags = (m->m_flags & ~M_BCAST) | M_MCAST;
 		in6_ifstat_inc(ifp, ifs6_out_mcast);
 
 		/* Confirm that the outgoing interface supports multicast. */
 		if (!(ifp->if_flags & IFF_MULTICAST)) {
 			IP6STAT_INC(ip6s_noroute);
 			in6_ifstat_inc(ifp, ifs6_out_discard);
 			error = ENETUNREACH;
 			goto bad;
 		}
 		if ((im6o == NULL && in6_mcast_loop) ||
 		    (im6o && im6o->im6o_multicast_loop)) {
 			/*
 			 * Loop back multicast datagram if not expressly
 			 * forbidden to do so, even if we have not joined
 			 * the address; protocols will filter it later,
 			 * thus deferring a hash lookup and lock acquisition
 			 * at the expense of an m_copym().
 			 */
 			ip6_mloopback(ifp, m);
 		} else {
 			/*
 			 * If we are acting as a multicast router, perform
 			 * multicast forwarding as if the packet had just
 			 * arrived on the interface to which we are about
 			 * to send.  The multicast forwarding function
 			 * recursively calls this function, using the
 			 * IPV6_FORWARDING flag to prevent infinite recursion.
 			 *
 			 * Multicasts that are looped back by ip6_mloopback(),
 			 * above, will be forwarded by the ip6_input() routine,
 			 * if necessary.
 			 */
 			if (V_ip6_mrouter && (flags & IPV6_FORWARDING) == 0) {
 				/*
 				 * XXX: ip6_mforward expects that rcvif is NULL
 				 * when it is called from the originating path.
 				 * However, it may not always be the case.
 				 */
 				m->m_pkthdr.rcvif = NULL;
 				if (ip6_mforward(ip6, ifp, m) != 0) {
 					m_freem(m);
 					goto done;
 				}
 			}
 		}
 		/*
 		 * Multicasts with a hoplimit of zero may be looped back,
 		 * above, but must not be transmitted on a network.
 		 * Also, multicasts addressed to the loopback interface
 		 * are not sent -- the above call to ip6_mloopback() will
 		 * loop back a copy if this host actually belongs to the
 		 * destination group on the loopback interface.
 		 */
 		if (ip6->ip6_hlim == 0 || (ifp->if_flags & IFF_LOOPBACK) ||
 		    IN6_IS_ADDR_MC_INTFACELOCAL(&ip6->ip6_dst)) {
 			m_freem(m);
 			goto done;
 		}
 	}
 
 	/*
 	 * Fill the outgoing interface to tell the upper layer
 	 * to increment per-interface statistics.
 	 */
 	if (ifpp)
 		*ifpp = ifp;
 
 	/* Determine path MTU. */
 	if ((error = ip6_getpmtu(ro_pmtu, ro != ro_pmtu, ifp, &ip6->ip6_dst,
 		    &mtu, &alwaysfrag, fibnum, *nexthdrp)) != 0)
 		goto bad;
 	KASSERT(mtu > 0, ("%s:%d: mtu %ld, ro_pmtu %p ro %p ifp %p "
 	    "alwaysfrag %d fibnum %u\n", __func__, __LINE__, mtu, ro_pmtu, ro,
 	    ifp, alwaysfrag, fibnum));
 
 	/*
 	 * The caller of this function may specify to use the minimum MTU
 	 * in some cases.
 	 * An advanced API option (IPV6_USE_MIN_MTU) can also override MTU
 	 * setting.  The logic is a bit complicated; by default, unicast
 	 * packets will follow path MTU while multicast packets will be sent at
 	 * the minimum MTU.  If IP6PO_MINMTU_ALL is specified, all packets
 	 * including unicast ones will be sent at the minimum MTU.  Multicast
 	 * packets will always be sent at the minimum MTU unless
 	 * IP6PO_MINMTU_DISABLE is explicitly specified.
 	 * See RFC 3542 for more details.
 	 */
 	if (mtu > IPV6_MMTU) {
 		if ((flags & IPV6_MINMTU))
 			mtu = IPV6_MMTU;
 		else if (opt && opt->ip6po_minmtu == IP6PO_MINMTU_ALL)
 			mtu = IPV6_MMTU;
 		else if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) &&
 			 (opt == NULL ||
 			  opt->ip6po_minmtu != IP6PO_MINMTU_DISABLE)) {
 			mtu = IPV6_MMTU;
 		}
 	}
 
 	/*
 	 * Clear embedded scope identifiers if necessary.
 	 * in6_clearscope() will touch the addresses only when necessary.
 	 */
 	in6_clearscope(&ip6->ip6_src);
 	in6_clearscope(&ip6->ip6_dst);
 
 	/*
 	 * If the outgoing packet contains a hop-by-hop options header,
 	 * it must be examined and processed even by the source node.
 	 * (RFC 2460, section 4.)
 	 */
 	if (exthdrs.ip6e_hbh) {
 		struct ip6_hbh *hbh = mtod(exthdrs.ip6e_hbh, struct ip6_hbh *);
 		u_int32_t dummy; /* XXX unused */
 		u_int32_t plen = 0; /* XXX: ip6_process will check the value */
 
 #ifdef DIAGNOSTIC
 		if ((hbh->ip6h_len + 1) << 3 > exthdrs.ip6e_hbh->m_len)
 			panic("ip6e_hbh is not contiguous");
 #endif
 		/*
 		 *  XXX: if we have to send an ICMPv6 error to the sender,
 		 *       we need the M_LOOP flag since icmp6_error() expects
 		 *       the IPv6 and the hop-by-hop options header are
 		 *       contiguous unless the flag is set.
 		 */
 		m->m_flags |= M_LOOP;
 		m->m_pkthdr.rcvif = ifp;
 		if (ip6_process_hopopts(m, (u_int8_t *)(hbh + 1),
 		    ((hbh->ip6h_len + 1) << 3) - sizeof(struct ip6_hbh),
 		    &dummy, &plen) < 0) {
 			/* m was already freed at this point. */
 			error = EINVAL;/* better error? */
 			goto done;
 		}
 		m->m_flags &= ~M_LOOP; /* XXX */
 		m->m_pkthdr.rcvif = NULL;
 	}
 
 	/* Jump over all PFIL processing if hooks are not active. */
 	if (!PFIL_HOOKED_OUT(V_inet6_pfil_head))
 		goto passout;
 
 	odst = ip6->ip6_dst;
 	/* Run through list of hooks for output packets. */
 	switch (pfil_mbuf_out(V_inet6_pfil_head, &m, ifp, inp)) {
 	case PFIL_PASS:
 		ip6 = mtod(m, struct ip6_hdr *);
 		break;
 	case PFIL_DROPPED:
 		error = EACCES;
 		/* FALLTHROUGH */
 	case PFIL_CONSUMED:
 		goto done;
 	}
 
 	needfiblookup = 0;
 	/* See if destination IP address was changed by packet filter. */
 	if (!IN6_ARE_ADDR_EQUAL(&odst, &ip6->ip6_dst)) {
 		m->m_flags |= M_SKIP_FIREWALL;
 		/* If destination is now ourself drop to ip6_input(). */
 		if (in6_localip(&ip6->ip6_dst)) {
 			m->m_flags |= M_FASTFWD_OURS;
 			if (m->m_pkthdr.rcvif == NULL)
 				m->m_pkthdr.rcvif = V_loif;
 			if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA_IPV6) {
 				m->m_pkthdr.csum_flags |=
 				    CSUM_DATA_VALID_IPV6 | CSUM_PSEUDO_HDR;
 				m->m_pkthdr.csum_data = 0xffff;
 			}
 #if defined(SCTP) || defined(SCTP_SUPPORT)
 			if (m->m_pkthdr.csum_flags & CSUM_SCTP_IPV6)
 				m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID;
 #endif
 			error = netisr_queue(NETISR_IPV6, m);
 			goto done;
 		} else {
 			if (ro != NULL)
 				RO_INVALIDATE_CACHE(ro);
 			needfiblookup = 1; /* Redo the routing table lookup. */
 		}
 	}
 	/* See if fib was changed by packet filter. */
 	if (fibnum != M_GETFIB(m)) {
 		m->m_flags |= M_SKIP_FIREWALL;
 		fibnum = M_GETFIB(m);
 		if (ro != NULL)
 			RO_INVALIDATE_CACHE(ro);
 		needfiblookup = 1;
 	}
 	if (needfiblookup)
 		goto again;
 
 	/* See if local, if yes, send it to netisr. */
 	if (m->m_flags & M_FASTFWD_OURS) {
 		if (m->m_pkthdr.rcvif == NULL)
 			m->m_pkthdr.rcvif = V_loif;
 		if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA_IPV6) {
 			m->m_pkthdr.csum_flags |=
 			    CSUM_DATA_VALID_IPV6 | CSUM_PSEUDO_HDR;
 			m->m_pkthdr.csum_data = 0xffff;
 		}
 #if defined(SCTP) || defined(SCTP_SUPPORT)
 		if (m->m_pkthdr.csum_flags & CSUM_SCTP_IPV6)
 			m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID;
 #endif
 		error = netisr_queue(NETISR_IPV6, m);
 		goto done;
 	}
 	/* Or forward to some other address? */
 	if ((m->m_flags & M_IP6_NEXTHOP) &&
 	    (fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL)) != NULL) {
 		if (ro != NULL)
 			dst = (struct sockaddr_in6 *)&ro->ro_dst;
 		else
 			dst = &sin6;
 		bcopy((fwd_tag+1), &dst_sa, sizeof(struct sockaddr_in6));
 		m->m_flags |= M_SKIP_FIREWALL;
 		m->m_flags &= ~M_IP6_NEXTHOP;
 		m_tag_delete(m, fwd_tag);
 		goto again;
 	}
 
 passout:
 	if (vlan_pcp > -1)
 		EVL_APPLY_PRI(m, vlan_pcp);
 
 	/* Ensure the packet data is mapped if the interface requires it. */
 	if ((ifp->if_capenable & IFCAP_MEXTPG) == 0) {
 		m = mb_unmapped_to_ext(m);
 		if (m == NULL) {
 			IP6STAT_INC(ip6s_odropped);
 			return (ENOBUFS);
 		}
 	}
 
 	/*
 	 * Send the packet to the outgoing interface.
 	 * If necessary, do IPv6 fragmentation before sending.
 	 *
 	 * The logic here is rather complex:
 	 * 1: normal case (dontfrag == 0, alwaysfrag == 0)
 	 * 1-a:	send as is if tlen <= path mtu
 	 * 1-b:	fragment if tlen > path mtu
 	 *
 	 * 2: if user asks us not to fragment (dontfrag == 1)
 	 * 2-a:	send as is if tlen <= interface mtu
 	 * 2-b:	error if tlen > interface mtu
 	 *
 	 * 3: if we always need to attach fragment header (alwaysfrag == 1)
 	 *	always fragment
 	 *
 	 * 4: if dontfrag == 1 && alwaysfrag == 1
 	 *	error, as we cannot handle this conflicting request.
 	 */
 	sw_csum = m->m_pkthdr.csum_flags;
 	if (!hdrsplit) {
 		tso = ((sw_csum & ifp->if_hwassist &
 		    (CSUM_TSO | CSUM_INNER_TSO)) != 0) ? 1 : 0;
 		sw_csum &= ~ifp->if_hwassist;
 	} else
 		tso = 0;
 	/*
 	 * If we added extension headers, we will not do TSO and calculate the
 	 * checksums ourselves for now.
 	 * XXX-BZ  Need a framework to know when the NIC can handle it, even
 	 * with ext. hdrs.
 	 */
 	ip6_output_delayed_csum(m, ifp, sw_csum, plen, optlen);
 	/* XXX-BZ m->m_pkthdr.csum_flags &= ~ifp->if_hwassist; */
 	tlen = m->m_pkthdr.len;
 
 	if ((opt && (opt->ip6po_flags & IP6PO_DONTFRAG)) || tso)
 		dontfrag = 1;
 	else
 		dontfrag = 0;
 	if (dontfrag && alwaysfrag) {	/* Case 4. */
 		/* Conflicting request - can't transmit. */
 		error = EMSGSIZE;
 		goto bad;
 	}
 	if (dontfrag && tlen > IN6_LINKMTU(ifp) && !tso) {	/* Case 2-b. */
 		/*
 		 * Even if the DONTFRAG option is specified, we cannot send the
 		 * packet when the data length is larger than the MTU of the
 		 * outgoing interface.
 		 * Notify the error by sending IPV6_PATHMTU ancillary data if
 		 * application wanted to know the MTU value. Also return an
 		 * error code (this is not described in the API spec).
 		 */
 		if (inp != NULL)
 			ip6_notify_pmtu(inp, &dst_sa, (u_int32_t)mtu);
 		error = EMSGSIZE;
 		goto bad;
 	}
 
 	/* Transmit packet without fragmentation. */
 	if (dontfrag || (!alwaysfrag && tlen <= mtu)) {	/* Cases 1-a and 2-a. */
 		struct in6_ifaddr *ia6;
 
 		ip6 = mtod(m, struct ip6_hdr *);
 		ia6 = in6_ifawithifp(ifp, &ip6->ip6_src);
 		if (ia6) {
 			/* Record statistics for this interface address. */
 			counter_u64_add(ia6->ia_ifa.ifa_opackets, 1);
 			counter_u64_add(ia6->ia_ifa.ifa_obytes,
 			    m->m_pkthdr.len);
 		}
 		error = ip6_output_send(inp, ifp, origifp, m, dst, ro,
 		    (flags & IP_NO_SND_TAG_RL) ? false : true);
 		goto done;
 	}
 
 	/* Try to fragment the packet.  Cases 1-b and 3. */
 	if (mtu < IPV6_MMTU) {
 		/* Path MTU cannot be less than IPV6_MMTU. */
 		error = EMSGSIZE;
 		in6_ifstat_inc(ifp, ifs6_out_fragfail);
 		goto bad;
 	} else if (ip6->ip6_plen == 0) {
 		/* Jumbo payload cannot be fragmented. */
 		error = EMSGSIZE;
 		in6_ifstat_inc(ifp, ifs6_out_fragfail);
 		goto bad;
 	} else {
 		u_char nextproto;
 
 		/*
 		 * Too large for the destination or interface;
 		 * fragment if possible.
 		 * Must be able to put at least 8 bytes per fragment.
 		 */
 		if (mtu > IPV6_MAXPACKET)
 			mtu = IPV6_MAXPACKET;
 
 		len = (mtu - unfragpartlen - sizeof(struct ip6_frag)) & ~7;
 		if (len < 8) {
 			error = EMSGSIZE;
 			in6_ifstat_inc(ifp, ifs6_out_fragfail);
 			goto bad;
 		}
 
 		/*
 		 * If the interface will not calculate checksums on
 		 * fragmented packets, then do it here.
 		 * XXX-BZ handle the hw offloading case.  Need flags.
 		 */
 		ip6_output_delayed_csum(m, ifp, m->m_pkthdr.csum_flags, plen,
 		    optlen);
 
 		/*
 		 * Change the next header field of the last header in the
 		 * unfragmentable part.
 		 */
 		if (exthdrs.ip6e_rthdr) {
 			nextproto = *mtod(exthdrs.ip6e_rthdr, u_char *);
 			*mtod(exthdrs.ip6e_rthdr, u_char *) = IPPROTO_FRAGMENT;
 		} else if (exthdrs.ip6e_dest1) {
 			nextproto = *mtod(exthdrs.ip6e_dest1, u_char *);
 			*mtod(exthdrs.ip6e_dest1, u_char *) = IPPROTO_FRAGMENT;
 		} else if (exthdrs.ip6e_hbh) {
 			nextproto = *mtod(exthdrs.ip6e_hbh, u_char *);
 			*mtod(exthdrs.ip6e_hbh, u_char *) = IPPROTO_FRAGMENT;
 		} else {
 			ip6 = mtod(m, struct ip6_hdr *);
 			nextproto = ip6->ip6_nxt;
 			ip6->ip6_nxt = IPPROTO_FRAGMENT;
 		}
 
 		/*
 		 * Loop through length of segment after first fragment,
 		 * make new header and copy data of each part and link onto
 		 * chain.
 		 */
 		m0 = m;
 		id = htonl(ip6_randomid());
 		error = ip6_fragment(ifp, m, unfragpartlen, nextproto,len, id);
 		if (error != 0)
 			goto sendorfree;
 
 		in6_ifstat_inc(ifp, ifs6_out_fragok);
 	}
 
 	/* Remove leading garbage. */
 sendorfree:
 	m = m0->m_nextpkt;
 	m0->m_nextpkt = 0;
 	m_freem(m0);
 	for (; m; m = m0) {
 		m0 = m->m_nextpkt;
 		m->m_nextpkt = 0;
 		if (error == 0) {
 			/* Record statistics for this interface address. */
 			if (ia) {
 				counter_u64_add(ia->ia_ifa.ifa_opackets, 1);
 				counter_u64_add(ia->ia_ifa.ifa_obytes,
 				    m->m_pkthdr.len);
 			}
 			if (vlan_pcp > -1)
 				EVL_APPLY_PRI(m, vlan_pcp);
 			error = ip6_output_send(inp, ifp, origifp, m, dst, ro,
 			    true);
 		} else
 			m_freem(m);
 	}
 
 	if (error == 0)
 		IP6STAT_INC(ip6s_fragmented);
 
 done:
 	return (error);
 
 freehdrs:
 	m_freem(exthdrs.ip6e_hbh);	/* m_freem() checks if mbuf is NULL. */
 	m_freem(exthdrs.ip6e_dest1);
 	m_freem(exthdrs.ip6e_rthdr);
 	m_freem(exthdrs.ip6e_dest2);
 	/* FALLTHROUGH */
 bad:
 	if (m)
 		m_freem(m);
 	goto done;
 }
 
 /*
  * Place a copy of an extension header into a newly allocated mbuf.
  *
  * hlen bytes are reserved in the new mbuf; a plain mbuf is requested when
  * hlen fits in MLEN and a cluster otherwise.  Lengths above MCLBYTES are
  * refused.  When hdr is NULL the mbuf is still allocated and sized, but no
  * data is copied into it.
  *
  * Returns 0 and stores the mbuf in *mp on success; returns ENOBUFS (leaving
  * *mp untouched) when the header is too large or allocation fails.
  */
 static int
 ip6_copyexthdr(struct mbuf **mp, caddr_t hdr, int hlen)
 {
 	struct mbuf *n;
 
 	/* XXX: a header that does not fit in one cluster is not handled. */
 	if (hlen > MCLBYTES)
 		return (ENOBUFS);
 
 	n = (hlen > MLEN) ? m_getcl(M_NOWAIT, MT_DATA, 0) :
 	    m_get(M_NOWAIT, MT_DATA);
 	if (n == NULL)
 		return (ENOBUFS);
 
 	n->m_len = hlen;
 	if (hdr != NULL)
 		bcopy(hdr, mtod(n, caddr_t), hlen);
 
 	*mp = n;
 	return (0);
 }
 
 /*
  * Insert a jumbo payload option into the hop-by-hop options header
  * tracked by exthdrs, creating that header when none exists yet.
  *
  * plen is the payload length before the option is added; the value
  * written into the option is plen + JUMBOOPTLEN (in network byte order).
  * The packet header length of exthdrs->ip6e_ip6 is grown by JUMBOOPTLEN.
  *
  * Returns 0 on success, or ENOBUFS when an mbuf/cluster cannot be
  * obtained or the enlarged header would not fit in a single cluster.
  */
 static int
 ip6_insert_jumboopt(struct ip6_exthdrs *exthdrs, u_int32_t plen)
 {
 	struct mbuf *mopt;
 	u_char *optbuf;
 	u_int32_t v;
 
 #define JUMBOOPTLEN	8	/* length of jumbo payload option and padding */
 
 	/*
 	 * If there is no hop-by-hop options header, allocate new one.
 	 * If there is one but it doesn't have enough space to store the
 	 * jumbo payload option, allocate a cluster to store the whole options.
 	 * Otherwise, use it to store the options.
 	 */
 	if (exthdrs->ip6e_hbh == NULL) {
 		mopt = m_get(M_NOWAIT, MT_DATA);
 		if (mopt == NULL)
 			return (ENOBUFS);
 		mopt->m_len = JUMBOOPTLEN;
 		optbuf = mtod(mopt, u_char *);
 		/*
 		 * The first two octets are the hop-by-hop header itself, so
 		 * no padding option is needed here.  optbuf[0] (the next
 		 * header octet) is not written by this function.
 		 */
 		optbuf[1] = 0;	/* = ((JUMBOOPTLEN) >> 3) - 1 */
 		exthdrs->ip6e_hbh = mopt;
 	} else {
 		struct ip6_hbh *hbh;
 
 		mopt = exthdrs->ip6e_hbh;
 		if (M_TRAILINGSPACE(mopt) < JUMBOOPTLEN) {
 			/*
 			 * XXX assumption:
 			 * - exthdrs->ip6e_hbh is not referenced from places
 			 *   other than exthdrs.
 			 * - exthdrs->ip6e_hbh is not an mbuf chain.
 			 */
 			int oldoptlen = mopt->m_len;
 			struct mbuf *n;
 
 			/*
 			 * XXX: give up if the whole (new) hbh header does
 			 * not fit even in an mbuf cluster.
 			 */
 			if (oldoptlen + JUMBOOPTLEN > MCLBYTES)
 				return (ENOBUFS);
 
 			/*
 			 * As a consequence, we must always prepare a cluster
 			 * at this point.
 			 */
 			n = m_getcl(M_NOWAIT, MT_DATA, 0);
 			if (n == NULL)
 				return (ENOBUFS);
 			n->m_len = oldoptlen + JUMBOOPTLEN;
 			bcopy(mtod(mopt, caddr_t), mtod(n, caddr_t),
 			    oldoptlen);
 			optbuf = mtod(n, u_char *) + oldoptlen;
 			m_freem(mopt);
 			mopt = exthdrs->ip6e_hbh = n;
 		} else {
 			optbuf = mtod(mopt, u_char *) + mopt->m_len;
 			mopt->m_len += JUMBOOPTLEN;
 		}
 		/*
 		 * Two octets of padding precede the jumbo option, which
 		 * starts at optbuf[2].  A PadN option that is two octets
 		 * long in total carries no data octets, so its Opt Data Len
 		 * field must be 0; any larger value would make a parser
 		 * treat the jumbo option type octet as padding.
 		 */
 		optbuf[0] = IP6OPT_PADN;
 		optbuf[1] = 0;
 
 		/*
 		 * Adjust the header length according to the pad and
 		 * the jumbo payload option.
 		 */
 		hbh = mtod(mopt, struct ip6_hbh *);
 		hbh->ip6h_len += (JUMBOOPTLEN >> 3);
 	}
 
 	/* fill in the option: type, data length (4), 32-bit payload length */
 	optbuf[2] = IP6OPT_JUMBO;
 	optbuf[3] = 4;
 	v = (u_int32_t)htonl(plen + JUMBOOPTLEN);
 	bcopy(&v, &optbuf[4], sizeof(u_int32_t));
 
 	/* finally, adjust the packet header length */
 	exthdrs->ip6e_ip6->m_pkthdr.len += JUMBOOPTLEN;
 
 	return (0);
 #undef JUMBOOPTLEN
 }
 
 /*
  * Attach the unfragmentable extension headers and space for a fragment
  * header to the fragment whose first mbuf is m.
  *
  * When hlen exceeds the size of the basic IPv6 header, the bytes of m0
  * between the IPv6 header and offset hlen are copied and chained after m.
  * Room for a struct ip6_frag is then made at the end of that chain, either
  * in the trailing space of its last mbuf (when writable and large enough)
  * or in a separately allocated mbuf.  A pointer to that room is returned
  * through frghdrp.
  *
  * Returns 0 on success or ENOBUFS when a copy or allocation fails.
  */
 static int
 ip6_insertfraghdr(struct mbuf *m0, struct mbuf *m, int hlen,
     struct ip6_frag **frghdrp)
 {
 	struct mbuf *hdrs, *tail, *fragm;
 
 	/* Duplicate whatever follows the IPv6 header in the unfragmentable part. */
 	hdrs = m;
 	if (hlen > sizeof(struct ip6_hdr)) {
 		hdrs = m_copym(m0, sizeof(struct ip6_hdr),
 		    hlen - sizeof(struct ip6_hdr), M_NOWAIT);
 		if (hdrs == NULL)
 			return (ENOBUFS);
 		m->m_next = hdrs;
 	}
 
 	/* Walk to the final mbuf of the unfragmentable part. */
 	tail = hdrs;
 	while (tail->m_next != NULL)
 		tail = tail->m_next;
 
 	if (!M_WRITABLE(tail) ||
 	    M_TRAILINGSPACE(tail) < sizeof(struct ip6_frag)) {
 		/* No usable room in the tail: chain a dedicated mbuf. */
 		fragm = m_get(M_NOWAIT, MT_DATA);
 		if (fragm == NULL)
 			return (ENOBUFS);
 		fragm->m_len = sizeof(struct ip6_frag);
 		*frghdrp = mtod(fragm, struct ip6_frag *);
 		tail->m_next = fragm;
 		return (0);
 	}
 
 	/* The fragment header goes into the tail mbuf's trailing space. */
 	*frghdrp = (struct ip6_frag *)(mtod(tail, caddr_t) + tail->m_len);
 	tail->m_len += sizeof(struct ip6_frag);
 	m->m_pkthdr.len += sizeof(struct ip6_frag);
 	return (0);
 }
 
 /*
  * Look up the IPv6 path MTU towards @dst in FIB @fibnum.
  *
  * The route lookup and the MTU calculation are both done inside a
  * network epoch section that this function enters and leaves itself.
  * The result is written to @mtup.
  *
  * Returns 0 on success, EHOSTUNREACH when no route is found, or the
  * error reported by ip6_calcmtu().
  */
 static int
 ip6_getpmtu_ctl(u_int fibnum, const struct in6_addr *dst, u_long *mtup)
 {
 	struct epoch_tracker et;
 	struct nhop_object *nhop;
 	struct in6_addr addr;
 	uint32_t zone;
 	int rv;
 
 	/* Separate the address from its embedded scope for the lookup. */
 	in6_splitscope(dst, &addr, &zone);
 
 	rv = EHOSTUNREACH;
 	NET_EPOCH_ENTER(et);
 	nhop = fib6_lookup(fibnum, &addr, zone, NHR_NONE, 0);
 	if (nhop != NULL)
 		rv = ip6_calcmtu(nhop->nh_ifp, dst, nhop->nh_mtu, mtup,
 		    NULL, 0);
 	NET_EPOCH_EXIT(et);
 
 	return (rv);
 }
 
 /*
  * Work out the IPv6 path MTU for @dst when transmitting on @ifp, using
  * and refreshing the data cached in @ro_pmtu.
  *
  * When a lookup is requested (or no @ro_pmtu is supplied), the route MTU
  * is taken from @ro_pmtu if it was cached for the same destination, and
  * otherwise obtained with a FIB lookup; a successful lookup is recorded
  * in @ro_pmtu (destination and MTU) so later calls can skip it.  A next
  * hop already attached to @ro_pmtu overrides whatever was found.
  *
  * The caller must be inside a network epoch section (asserted).
  *
  * The MTU and the always-fragment indication are stored through @mtup
  * and @alwaysfragp by ip6_calcmtu(), whose return value is passed on
  * (0 on success).
  */
 static int
 ip6_getpmtu(struct route_in6 *ro_pmtu, int do_lookup,
     struct ifnet *ifp, const struct in6_addr *dst, u_long *mtup,
     int *alwaysfragp, u_int fibnum, u_int proto)
 {
 	struct sockaddr_in6 *cached, scratch;
 	struct nhop_object *nhop;
 	struct in6_addr addr;
 	uint32_t zone;
 	u_long mtu = 0;
 
 	NET_EPOCH_ASSERT();
 
 	if (ro_pmtu == NULL || do_lookup) {
 		int stale;
 
 		/*
 		 * ro_pmtu describes the final destination (ro may only
 		 * describe the immediate one), and the MTU can differ, so
 		 * ro_pmtu's destination is the one consulted here.
 		 */
 		if (ro_pmtu == NULL) {
 			cached = &scratch;
 			stale = 1;
 		} else {
 			cached = (struct sockaddr_in6 *)&ro_pmtu->ro_dst;
 			/* A cached MTU for another address is useless. */
 			if (!IN6_ARE_ADDR_EQUAL(&cached->sin6_addr, dst))
 				ro_pmtu->ro_mtu = 0;
 			stale = (ro_pmtu->ro_mtu == 0);
 		}
 
 		if (!stale) {
 			mtu = ro_pmtu->ro_mtu;
 		} else {
 			bzero(cached, sizeof(*cached));
 			cached->sin6_len = sizeof(struct sockaddr_in6);
 			cached->sin6_family = AF_INET6;
 			cached->sin6_addr = *dst;
 
 			in6_splitscope(dst, &addr, &zone);
 			nhop = fib6_lookup(fibnum, &addr, zone, NHR_NONE, 0);
 			if (nhop != NULL) {
 				mtu = nhop->nh_mtu;
 				if (ro_pmtu != NULL)
 					ro_pmtu->ro_mtu = mtu;
 			}
 		}
 	}
 
 	/* A next hop held by the route cache takes precedence. */
 	if (ro_pmtu != NULL && ro_pmtu->ro_nh != NULL)
 		mtu = ro_pmtu->ro_nh->nh_mtu;
 
 	return (ip6_calcmtu(ifp, dst, mtu, mtup, alwaysfragp, proto));
 }
 
 /*
  * Derive the MTU to use towards @dst from the transmit interface @ifp,
  * the route MTU @rt_mtu and, for non-TCP protocols, the host cache.
  *
  * With a non-zero @rt_mtu the result is the route MTU, lowered to the
  * host cache value when that is smaller, and raised to IPV6_MMTU (with
  * the always-fragment indication set) when it falls below IPV6_MMTU.
  * With a zero @rt_mtu the link MTU of @ifp is used; if there is no @ifp
  * either, EHOSTUNREACH is returned.
  *
  * The MTU is stored in @mtup and, when @alwaysfragp is not NULL, the
  * always-fragment indication in @alwaysfragp.
  *
  * Returns 0 on success.
  */
 static int
 ip6_calcmtu(struct ifnet *ifp, const struct in6_addr *dst, u_long rt_mtu,
     u_long *mtup, int *alwaysfragp, u_int proto)
 {
 	u_long pmtu = 0;
 	int forcefrag = 0;
 	int rv = 0;
 
 	if (rt_mtu == 0) {
 		/* No route MTU: fall back to the interface, if any. */
 		if (ifp != NULL)
 			pmtu = IN6_LINKMTU(ifp);
 		else
 			rv = EHOSTUNREACH; /* XXX */
 	} else {
 		struct in_conninfo inc;
 		u_int32_t linkmtu;
 
 		bzero(&inc, sizeof(inc));
 		inc.inc_flags |= INC_ISIPV6;
 		inc.inc6_faddr = *dst;
 
 		linkmtu = IN6_LINKMTU(ifp);
 
 		/* TCP reacts to path MTU changes itself; skip the host cache. */
 		if (proto != IPPROTO_TCP)
 			pmtu = tcp_hc_getmtu(&inc);
 
 		pmtu = (pmtu != 0) ? min(pmtu, rt_mtu) : rt_mtu;
 
 		if (pmtu == 0) {
 			pmtu = linkmtu;
 		} else if (pmtu < IPV6_MMTU) {
 			/*
 			 * RFC2460 section 5, last paragraph:
 			 * once an ICMPv6 too big message with an MTU below
 			 * IPV6_MMTU has been recorded, send packets of
 			 * IPV6_MMTU or less and always attach a fragment
 			 * header.  (The fragment header is needed whatever
 			 * the packet size, so that translators can identify
 			 * the packets.)
 			 */
 			forcefrag = 1;
 			pmtu = IPV6_MMTU;
 		}
 	}
 
 	*mtup = pmtu;
 	if (alwaysfragp != NULL)
 		*alwaysfragp = forcefrag;
 	return (rv);
 }
 
 /*
  * IP6 socket option processing.
  */
 int
 ip6_ctloutput(struct socket *so, struct sockopt *sopt)
 {
 	int optdatalen, uproto;
 	void *optdata;
 	struct inpcb *inp = sotoinpcb(so);
 	int error, optval;
 	int level, op, optname;
 	int optlen;
 	struct thread *td;
 #ifdef	RSS
 	uint32_t rss_bucket;
 	int retval;
 #endif
 
 /*
  * Don't use more than a quarter of mbuf clusters.  N.B.:
  * nmbclusters is an int, but nmbclusters * MCLBYTES may overflow
  * on LP64 architectures, so cast to u_long to avoid undefined
  * behavior.  ILP32 architectures cannot have nmbclusters
  * large enough to overflow for other reasons.
  */
 #define IPV6_PKTOPTIONS_MBUF_LIMIT	((u_long)nmbclusters * MCLBYTES / 4)
 
 	level = sopt->sopt_level;
 	op = sopt->sopt_dir;
 	optname = sopt->sopt_name;
 	optlen = sopt->sopt_valsize;
 	td = sopt->sopt_td;
 	error = 0;
 	optval = 0;
 	uproto = (int)so->so_proto->pr_protocol;
 
 	if (level != IPPROTO_IPV6) {
 		error = EINVAL;
 
 		if (sopt->sopt_level == SOL_SOCKET &&
 		    sopt->sopt_dir == SOPT_SET) {
 			switch (sopt->sopt_name) {
 			case SO_REUSEADDR:
 				INP_WLOCK(inp);
 				if ((so->so_options & SO_REUSEADDR) != 0)
 					inp->inp_flags2 |= INP_REUSEADDR;
 				else
 					inp->inp_flags2 &= ~INP_REUSEADDR;
 				INP_WUNLOCK(inp);
 				error = 0;
 				break;
 			case SO_REUSEPORT:
 				INP_WLOCK(inp);
 				if ((so->so_options & SO_REUSEPORT) != 0)
 					inp->inp_flags2 |= INP_REUSEPORT;
 				else
 					inp->inp_flags2 &= ~INP_REUSEPORT;
 				INP_WUNLOCK(inp);
 				error = 0;
 				break;
 			case SO_REUSEPORT_LB:
 				INP_WLOCK(inp);
 				if ((so->so_options & SO_REUSEPORT_LB) != 0)
 					inp->inp_flags2 |= INP_REUSEPORT_LB;
 				else
 					inp->inp_flags2 &= ~INP_REUSEPORT_LB;
 				INP_WUNLOCK(inp);
 				error = 0;
 				break;
 			case SO_SETFIB:
 				INP_WLOCK(inp);
 				inp->inp_inc.inc_fibnum = so->so_fibnum;
 				INP_WUNLOCK(inp);
 				error = 0;
 				break;
 			case SO_MAX_PACING_RATE:
 #ifdef RATELIMIT
 				INP_WLOCK(inp);
 				inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED;
 				INP_WUNLOCK(inp);
 				error = 0;
 #else
 				error = EOPNOTSUPP;
 #endif
 				break;
 			default:
 				break;
 			}
 		}
 	} else {		/* level == IPPROTO_IPV6 */
 		switch (op) {
 		case SOPT_SET:
 			switch (optname) {
 			case IPV6_2292PKTOPTIONS:
 #ifdef IPV6_PKTOPTIONS
 			case IPV6_PKTOPTIONS:
 #endif
 			{
 				struct mbuf *m;
 
 				if (optlen > IPV6_PKTOPTIONS_MBUF_LIMIT) {
 					printf("ip6_ctloutput: mbuf limit hit\n");
 					error = ENOBUFS;
 					break;
 				}
 
 				error = soopt_getm(sopt, &m); /* XXX */
 				if (error != 0)
 					break;
 				error = soopt_mcopyin(sopt, m); /* XXX */
 				if (error != 0)
 					break;
 				INP_WLOCK(inp);
 				error = ip6_pcbopts(&inp->in6p_outputopts, m,
 				    so, sopt);
 				INP_WUNLOCK(inp);
 				m_freem(m); /* XXX */
 				break;
 			}
 
 			/*
 			 * Use of some Hop-by-Hop options or some
 			 * Destination options, might require special
 			 * privilege.  That is, normal applications
 			 * (without special privilege) might be forbidden
 			 * from setting certain options in outgoing packets,
 			 * and might never see certain options in received
 			 * packets. [RFC 2292 Section 6]
 			 * KAME specific note:
 			 *  KAME prevents non-privileged users from sending or
 			 *  receiving ANY hbh/dst options in order to avoid
 			 *  overhead of parsing options in the kernel.
 			 */
 			case IPV6_RECVHOPOPTS:
 			case IPV6_RECVDSTOPTS:
 			case IPV6_RECVRTHDRDSTOPTS:
 				if (td != NULL) {
 					error = priv_check(td,
 					    PRIV_NETINET_SETHDROPTS);
 					if (error)
 						break;
 				}
 				/* FALLTHROUGH */
 			case IPV6_UNICAST_HOPS:
 			case IPV6_HOPLIMIT:
 
 			case IPV6_RECVPKTINFO:
 			case IPV6_RECVHOPLIMIT:
 			case IPV6_RECVRTHDR:
 			case IPV6_RECVPATHMTU:
 			case IPV6_RECVTCLASS:
 			case IPV6_RECVFLOWID:
 #ifdef	RSS
 			case IPV6_RECVRSSBUCKETID:
 #endif
 			case IPV6_V6ONLY:
 			case IPV6_AUTOFLOWLABEL:
 			case IPV6_ORIGDSTADDR:
 			case IPV6_BINDANY:
 			case IPV6_BINDMULTI:
 #ifdef	RSS
 			case IPV6_RSS_LISTEN_BUCKET:
 #endif
 			case IPV6_VLAN_PCP:
 				if (optname == IPV6_BINDANY && td != NULL) {
 					error = priv_check(td,
 					    PRIV_NETINET_BINDANY);
 					if (error)
 						break;
 				}
 
 				if (optlen != sizeof(int)) {
 					error = EINVAL;
 					break;
 				}
 				error = sooptcopyin(sopt, &optval,
 					sizeof optval, sizeof optval);
 				if (error)
 					break;
 				switch (optname) {
 				case IPV6_UNICAST_HOPS:
 					if (optval < -1 || optval >= 256)
 						error = EINVAL;
 					else {
 						/* -1 = kernel default */
 						inp->in6p_hops = optval;
 						if ((inp->inp_vflag &
 						     INP_IPV4) != 0)
 							inp->inp_ip_ttl = optval;
 					}
 					break;
 #define OPTSET(bit) \
 do { \
 	INP_WLOCK(inp); \
 	if (optval) \
 		inp->inp_flags |= (bit); \
 	else \
 		inp->inp_flags &= ~(bit); \
 	INP_WUNLOCK(inp); \
 } while (/*CONSTCOND*/ 0)
 #define OPTSET2292(bit) \
 do { \
 	INP_WLOCK(inp); \
 	inp->inp_flags |= IN6P_RFC2292; \
 	if (optval) \
 		inp->inp_flags |= (bit); \
 	else \
 		inp->inp_flags &= ~(bit); \
 	INP_WUNLOCK(inp); \
 } while (/*CONSTCOND*/ 0)
 #define OPTBIT(bit) (inp->inp_flags & (bit) ? 1 : 0)
 
 #define OPTSET2_N(bit, val) do {					\
 	if (val)							\
 		inp->inp_flags2 |= bit;					\
 	else								\
 		inp->inp_flags2 &= ~bit;				\
 } while (0)
 #define OPTSET2(bit, val) do {						\
 	INP_WLOCK(inp);							\
 	OPTSET2_N(bit, val);						\
 	INP_WUNLOCK(inp);						\
 } while (0)
 #define OPTBIT2(bit) (inp->inp_flags2 & (bit) ? 1 : 0)
 #define OPTSET2292_EXCLUSIVE(bit)					\
 do {									\
 	INP_WLOCK(inp);							\
 	if (OPTBIT(IN6P_RFC2292)) {					\
 		error = EINVAL;						\
 	} else {							\
 		if (optval)						\
 			inp->inp_flags |= (bit);			\
 		else							\
 			inp->inp_flags &= ~(bit);			\
 	}								\
 	INP_WUNLOCK(inp);						\
 } while (/*CONSTCOND*/ 0)
 
 				case IPV6_RECVPKTINFO:
 					OPTSET2292_EXCLUSIVE(IN6P_PKTINFO);
 					break;
 
 				case IPV6_HOPLIMIT:
 				{
 					struct ip6_pktopts **optp;
 
 					/* cannot mix with RFC2292 */
 					if (OPTBIT(IN6P_RFC2292)) {
 						error = EINVAL;
 						break;
 					}
 					INP_WLOCK(inp);
-					if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
+					if (inp->inp_flags & INP_DROPPED) {
 						INP_WUNLOCK(inp);
 						return (ECONNRESET);
 					}
 					optp = &inp->in6p_outputopts;
 					error = ip6_pcbopt(IPV6_HOPLIMIT,
 					    (u_char *)&optval, sizeof(optval),
 					    optp, (td != NULL) ? td->td_ucred :
 					    NULL, uproto);
 					INP_WUNLOCK(inp);
 					break;
 				}
 
 				case IPV6_RECVHOPLIMIT:
 					OPTSET2292_EXCLUSIVE(IN6P_HOPLIMIT);
 					break;
 
 				case IPV6_RECVHOPOPTS:
 					OPTSET2292_EXCLUSIVE(IN6P_HOPOPTS);
 					break;
 
 				case IPV6_RECVDSTOPTS:
 					OPTSET2292_EXCLUSIVE(IN6P_DSTOPTS);
 					break;
 
 				case IPV6_RECVRTHDRDSTOPTS:
 					OPTSET2292_EXCLUSIVE(IN6P_RTHDRDSTOPTS);
 					break;
 
 				case IPV6_RECVRTHDR:
 					OPTSET2292_EXCLUSIVE(IN6P_RTHDR);
 					break;
 
 				case IPV6_RECVPATHMTU:
 					/*
 					 * We ignore this option for TCP
 					 * sockets.
 					 * (RFC3542 leaves this case
 					 * unspecified.)
 					 */
 					if (uproto != IPPROTO_TCP)
 						OPTSET(IN6P_MTU);
 					break;
 
 				case IPV6_RECVFLOWID:
 					OPTSET2(INP_RECVFLOWID, optval);
 					break;
 
 #ifdef	RSS
 				case IPV6_RECVRSSBUCKETID:
 					OPTSET2(INP_RECVRSSBUCKETID, optval);
 					break;
 #endif
 
 				case IPV6_V6ONLY:
 					INP_WLOCK(inp);
 					if (inp->inp_lport ||
 					    !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) {
 						/*
 						 * The socket is already bound.
 						 */
 						INP_WUNLOCK(inp);
 						error = EINVAL;
 						break;
 					}
 					if (optval) {
 						inp->inp_flags |= IN6P_IPV6_V6ONLY;
 						inp->inp_vflag &= ~INP_IPV4;
 					} else {
 						inp->inp_flags &= ~IN6P_IPV6_V6ONLY;
 						inp->inp_vflag |= INP_IPV4;
 					}
 					INP_WUNLOCK(inp);
 					break;
 				case IPV6_RECVTCLASS:
 					/* cannot mix with RFC2292 XXX */
 					OPTSET2292_EXCLUSIVE(IN6P_TCLASS);
 					break;
 				case IPV6_AUTOFLOWLABEL:
 					OPTSET(IN6P_AUTOFLOWLABEL);
 					break;
 
 				case IPV6_ORIGDSTADDR:
 					OPTSET2(INP_ORIGDSTADDR, optval);
 					break;
 				case IPV6_BINDANY:
 					OPTSET(INP_BINDANY);
 					break;
 
 				case IPV6_BINDMULTI:
 					OPTSET2(INP_BINDMULTI, optval);
 					break;
 #ifdef	RSS
 				case IPV6_RSS_LISTEN_BUCKET:
 					if ((optval >= 0) &&
 					    (optval < rss_getnumbuckets())) {
 						INP_WLOCK(inp);
 						inp->inp_rss_listen_bucket = optval;
 						OPTSET2_N(INP_RSS_BUCKET_SET, 1);
 						INP_WUNLOCK(inp);
 					} else {
 						error = EINVAL;
 					}
 					break;
 #endif
 				case IPV6_VLAN_PCP:
 					if ((optval >= -1) && (optval <=
 					    (INP_2PCP_MASK >> INP_2PCP_SHIFT))) {
 						if (optval == -1) {
 							INP_WLOCK(inp);
 							inp->inp_flags2 &=
 							    ~(INP_2PCP_SET |
 							    INP_2PCP_MASK);
 							INP_WUNLOCK(inp);
 						} else {
 							INP_WLOCK(inp);
 							inp->inp_flags2 |=
 							    INP_2PCP_SET;
 							inp->inp_flags2 &=
 							    ~INP_2PCP_MASK;
 							inp->inp_flags2 |=
 							    optval <<
 							    INP_2PCP_SHIFT;
 							INP_WUNLOCK(inp);
 						}
 					} else
 						error = EINVAL;
 					break;
 				}
 				break;
 
 			case IPV6_TCLASS:
 			case IPV6_DONTFRAG:
 			case IPV6_USE_MIN_MTU:
 			case IPV6_PREFER_TEMPADDR:
 				if (optlen != sizeof(optval)) {
 					error = EINVAL;
 					break;
 				}
 				error = sooptcopyin(sopt, &optval,
 					sizeof optval, sizeof optval);
 				if (error)
 					break;
 				{
 					struct ip6_pktopts **optp;
 					INP_WLOCK(inp);
-					if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
+					if (inp->inp_flags & INP_DROPPED) {
 						INP_WUNLOCK(inp);
 						return (ECONNRESET);
 					}
 					optp = &inp->in6p_outputopts;
 					error = ip6_pcbopt(optname,
 					    (u_char *)&optval, sizeof(optval),
 					    optp, (td != NULL) ? td->td_ucred :
 					    NULL, uproto);
 					INP_WUNLOCK(inp);
 					break;
 				}
 
 			case IPV6_2292PKTINFO:
 			case IPV6_2292HOPLIMIT:
 			case IPV6_2292HOPOPTS:
 			case IPV6_2292DSTOPTS:
 			case IPV6_2292RTHDR:
 				/* RFC 2292 */
 				if (optlen != sizeof(int)) {
 					error = EINVAL;
 					break;
 				}
 				error = sooptcopyin(sopt, &optval,
 					sizeof optval, sizeof optval);
 				if (error)
 					break;
 				switch (optname) {
 				case IPV6_2292PKTINFO:
 					OPTSET2292(IN6P_PKTINFO);
 					break;
 				case IPV6_2292HOPLIMIT:
 					OPTSET2292(IN6P_HOPLIMIT);
 					break;
 				case IPV6_2292HOPOPTS:
 					/*
 					 * Check super-user privilege.
 					 * See comments for IPV6_RECVHOPOPTS.
 					 */
 					if (td != NULL) {
 						error = priv_check(td,
 						    PRIV_NETINET_SETHDROPTS);
 						if (error)
 							return (error);
 					}
 					OPTSET2292(IN6P_HOPOPTS);
 					break;
 				case IPV6_2292DSTOPTS:
 					if (td != NULL) {
 						error = priv_check(td,
 						    PRIV_NETINET_SETHDROPTS);
 						if (error)
 							return (error);
 					}
 					OPTSET2292(IN6P_DSTOPTS|IN6P_RTHDRDSTOPTS); /* XXX */
 					break;
 				case IPV6_2292RTHDR:
 					OPTSET2292(IN6P_RTHDR);
 					break;
 				}
 				break;
 			case IPV6_PKTINFO:
 			case IPV6_HOPOPTS:
 			case IPV6_RTHDR:
 			case IPV6_DSTOPTS:
 			case IPV6_RTHDRDSTOPTS:
 			case IPV6_NEXTHOP:
 			{
 				/* new advanced API (RFC3542) */
 				u_char *optbuf;
 				u_char optbuf_storage[MCLBYTES];
 				int optlen;
 				struct ip6_pktopts **optp;
 
 				/* cannot mix with RFC2292 */
 				if (OPTBIT(IN6P_RFC2292)) {
 					error = EINVAL;
 					break;
 				}
 
 				/*
 				 * We only ensure valsize is not too large
 				 * here.  Further validation will be done
 				 * later.
 				 */
 				error = sooptcopyin(sopt, optbuf_storage,
 				    sizeof(optbuf_storage), 0);
 				if (error)
 					break;
 				optlen = sopt->sopt_valsize;
 				optbuf = optbuf_storage;
 				INP_WLOCK(inp);
-				if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
+				if (inp->inp_flags & INP_DROPPED) {
 					INP_WUNLOCK(inp);
 					return (ECONNRESET);
 				}
 				optp = &inp->in6p_outputopts;
 				error = ip6_pcbopt(optname, optbuf, optlen,
 				    optp, (td != NULL) ? td->td_ucred : NULL,
 				    uproto);
 				INP_WUNLOCK(inp);
 				break;
 			}
 #undef OPTSET
 
 			case IPV6_MULTICAST_IF:
 			case IPV6_MULTICAST_HOPS:
 			case IPV6_MULTICAST_LOOP:
 			case IPV6_JOIN_GROUP:
 			case IPV6_LEAVE_GROUP:
 			case IPV6_MSFILTER:
 			case MCAST_BLOCK_SOURCE:
 			case MCAST_UNBLOCK_SOURCE:
 			case MCAST_JOIN_GROUP:
 			case MCAST_LEAVE_GROUP:
 			case MCAST_JOIN_SOURCE_GROUP:
 			case MCAST_LEAVE_SOURCE_GROUP:
 				error = ip6_setmoptions(inp, sopt);
 				break;
 
 			case IPV6_PORTRANGE:
 				error = sooptcopyin(sopt, &optval,
 				    sizeof optval, sizeof optval);
 				if (error)
 					break;
 
 				INP_WLOCK(inp);
 				switch (optval) {
 				case IPV6_PORTRANGE_DEFAULT:
 					inp->inp_flags &= ~(INP_LOWPORT);
 					inp->inp_flags &= ~(INP_HIGHPORT);
 					break;
 
 				case IPV6_PORTRANGE_HIGH:
 					inp->inp_flags &= ~(INP_LOWPORT);
 					inp->inp_flags |= INP_HIGHPORT;
 					break;
 
 				case IPV6_PORTRANGE_LOW:
 					inp->inp_flags &= ~(INP_HIGHPORT);
 					inp->inp_flags |= INP_LOWPORT;
 					break;
 
 				default:
 					error = EINVAL;
 					break;
 				}
 				INP_WUNLOCK(inp);
 				break;
 
 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
 			case IPV6_IPSEC_POLICY:
 				if (IPSEC_ENABLED(ipv6)) {
 					error = IPSEC_PCBCTL(ipv6, inp, sopt);
 					break;
 				}
 				/* FALLTHROUGH */
 #endif /* IPSEC */
 
 			default:
 				error = ENOPROTOOPT;
 				break;
 			}
 			break;
 
 		case SOPT_GET:
 			switch (optname) {
 			case IPV6_2292PKTOPTIONS:
 #ifdef IPV6_PKTOPTIONS
 			case IPV6_PKTOPTIONS:
 #endif
 				/*
 				 * RFC3542 (effectively) deprecated the
 				 * semantics of the 2292-style pktoptions.
 				 * Since it was not reliable in nature (i.e.,
 				 * applications had to expect the lack of some
 				 * information after all), it would make sense
 				 * to simplify this part by always returning
 				 * empty data.
 				 */
 				sopt->sopt_valsize = 0;
 				break;
 
 			case IPV6_RECVHOPOPTS:
 			case IPV6_RECVDSTOPTS:
 			case IPV6_RECVRTHDRDSTOPTS:
 			case IPV6_UNICAST_HOPS:
 			case IPV6_RECVPKTINFO:
 			case IPV6_RECVHOPLIMIT:
 			case IPV6_RECVRTHDR:
 			case IPV6_RECVPATHMTU:
 
 			case IPV6_V6ONLY:
 			case IPV6_PORTRANGE:
 			case IPV6_RECVTCLASS:
 			case IPV6_AUTOFLOWLABEL:
 			case IPV6_BINDANY:
 			case IPV6_FLOWID:
 			case IPV6_FLOWTYPE:
 			case IPV6_RECVFLOWID:
 #ifdef	RSS
 			case IPV6_RSSBUCKETID:
 			case IPV6_RECVRSSBUCKETID:
 #endif
 			case IPV6_BINDMULTI:
 			case IPV6_VLAN_PCP:
 				switch (optname) {
 				case IPV6_RECVHOPOPTS:
 					optval = OPTBIT(IN6P_HOPOPTS);
 					break;
 
 				case IPV6_RECVDSTOPTS:
 					optval = OPTBIT(IN6P_DSTOPTS);
 					break;
 
 				case IPV6_RECVRTHDRDSTOPTS:
 					optval = OPTBIT(IN6P_RTHDRDSTOPTS);
 					break;
 
 				case IPV6_UNICAST_HOPS:
 					optval = inp->in6p_hops;
 					break;
 
 				case IPV6_RECVPKTINFO:
 					optval = OPTBIT(IN6P_PKTINFO);
 					break;
 
 				case IPV6_RECVHOPLIMIT:
 					optval = OPTBIT(IN6P_HOPLIMIT);
 					break;
 
 				case IPV6_RECVRTHDR:
 					optval = OPTBIT(IN6P_RTHDR);
 					break;
 
 				case IPV6_RECVPATHMTU:
 					optval = OPTBIT(IN6P_MTU);
 					break;
 
 				case IPV6_V6ONLY:
 					optval = OPTBIT(IN6P_IPV6_V6ONLY);
 					break;
 
 				case IPV6_PORTRANGE:
 				    {
 					int flags;
 					flags = inp->inp_flags;
 					if (flags & INP_HIGHPORT)
 						optval = IPV6_PORTRANGE_HIGH;
 					else if (flags & INP_LOWPORT)
 						optval = IPV6_PORTRANGE_LOW;
 					else
 						optval = 0;
 					break;
 				    }
 				case IPV6_RECVTCLASS:
 					optval = OPTBIT(IN6P_TCLASS);
 					break;
 
 				case IPV6_AUTOFLOWLABEL:
 					optval = OPTBIT(IN6P_AUTOFLOWLABEL);
 					break;
 
 				case IPV6_ORIGDSTADDR:
 					optval = OPTBIT2(INP_ORIGDSTADDR);
 					break;
 
 				case IPV6_BINDANY:
 					optval = OPTBIT(INP_BINDANY);
 					break;
 
 				case IPV6_FLOWID:
 					optval = inp->inp_flowid;
 					break;
 
 				case IPV6_FLOWTYPE:
 					optval = inp->inp_flowtype;
 					break;
 
 				case IPV6_RECVFLOWID:
 					optval = OPTBIT2(INP_RECVFLOWID);
 					break;
 #ifdef	RSS
 				case IPV6_RSSBUCKETID:
 					retval =
 					    rss_hash2bucket(inp->inp_flowid,
 					    inp->inp_flowtype,
 					    &rss_bucket);
 					if (retval == 0)
 						optval = rss_bucket;
 					else
 						error = EINVAL;
 					break;
 
 				case IPV6_RECVRSSBUCKETID:
 					optval = OPTBIT2(INP_RECVRSSBUCKETID);
 					break;
 #endif
 
 				case IPV6_BINDMULTI:
 					optval = OPTBIT2(INP_BINDMULTI);
 					break;
 
 				case IPV6_VLAN_PCP:
 					if (OPTBIT2(INP_2PCP_SET)) {
 						optval = (inp->inp_flags2 &
 							    INP_2PCP_MASK) >>
 							    INP_2PCP_SHIFT;
 					} else {
 						optval = -1;
 					}
 					break;
 				}
 
 				if (error)
 					break;
 				error = sooptcopyout(sopt, &optval,
 					sizeof optval);
 				break;
 
 			case IPV6_PATHMTU:
 			{
 				u_long pmtu = 0;
 				struct ip6_mtuinfo mtuinfo;
 				struct in6_addr addr;
 
 				if (!(so->so_state & SS_ISCONNECTED))
 					return (ENOTCONN);
 				/*
 				 * XXX: we do not consider the case of source
 				 * routing, or optional information to specify
 				 * the outgoing interface.
 				 * Copy faddr out of inp to avoid holding lock
 				 * on inp during route lookup.
 				 */
 				INP_RLOCK(inp);
 				bcopy(&inp->in6p_faddr, &addr, sizeof(addr));
 				INP_RUNLOCK(inp);
 				error = ip6_getpmtu_ctl(so->so_fibnum,
 				    &addr, &pmtu);
 				if (error)
 					break;
 				if (pmtu > IPV6_MAXPACKET)
 					pmtu = IPV6_MAXPACKET;
 
 				bzero(&mtuinfo, sizeof(mtuinfo));
 				mtuinfo.ip6m_mtu = (u_int32_t)pmtu;
 				optdata = (void *)&mtuinfo;
 				optdatalen = sizeof(mtuinfo);
 				error = sooptcopyout(sopt, optdata,
 				    optdatalen);
 				break;
 			}
 
 			case IPV6_2292PKTINFO:
 			case IPV6_2292HOPLIMIT:
 			case IPV6_2292HOPOPTS:
 			case IPV6_2292RTHDR:
 			case IPV6_2292DSTOPTS:
 				switch (optname) {
 				case IPV6_2292PKTINFO:
 					optval = OPTBIT(IN6P_PKTINFO);
 					break;
 				case IPV6_2292HOPLIMIT:
 					optval = OPTBIT(IN6P_HOPLIMIT);
 					break;
 				case IPV6_2292HOPOPTS:
 					optval = OPTBIT(IN6P_HOPOPTS);
 					break;
 				case IPV6_2292RTHDR:
 					optval = OPTBIT(IN6P_RTHDR);
 					break;
 				case IPV6_2292DSTOPTS:
 					optval = OPTBIT(IN6P_DSTOPTS|IN6P_RTHDRDSTOPTS);
 					break;
 				}
 				error = sooptcopyout(sopt, &optval,
 				    sizeof optval);
 				break;
 			case IPV6_PKTINFO:
 			case IPV6_HOPOPTS:
 			case IPV6_RTHDR:
 			case IPV6_DSTOPTS:
 			case IPV6_RTHDRDSTOPTS:
 			case IPV6_NEXTHOP:
 			case IPV6_TCLASS:
 			case IPV6_DONTFRAG:
 			case IPV6_USE_MIN_MTU:
 			case IPV6_PREFER_TEMPADDR:
 				error = ip6_getpcbopt(inp, optname, sopt);
 				break;
 
 			case IPV6_MULTICAST_IF:
 			case IPV6_MULTICAST_HOPS:
 			case IPV6_MULTICAST_LOOP:
 			case IPV6_MSFILTER:
 				error = ip6_getmoptions(inp, sopt);
 				break;
 
 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
 			case IPV6_IPSEC_POLICY:
 				if (IPSEC_ENABLED(ipv6)) {
 					error = IPSEC_PCBCTL(ipv6, inp, sopt);
 					break;
 				}
 				/* FALLTHROUGH */
 #endif /* IPSEC */
 			default:
 				error = ENOPROTOOPT;
 				break;
 			}
 			break;
 		}
 	}
 	return (error);
 }
 
 /*
  * Socket option handler for raw IPv6 sockets.
  *
  * Only IPV6_CHECKSUM at level IPPROTO_IPV6 is processed here: any other
  * level yields EINVAL and any other option name yields ENOPROTOOPT.
  *
  * so	socket whose inpcb carries the checksum offset (in6p_cksum)
  * sopt	option request; sopt_dir selects SOPT_SET or SOPT_GET
  *
  * Returns 0 on success or an errno value.
  */
 int
 ip6_raw_ctloutput(struct socket *so, struct sockopt *sopt)
 {
 	int error = 0, optval, optlen;
 	const int icmp6off = offsetof(struct icmp6_hdr, icmp6_cksum);
 	struct inpcb *inp = sotoinpcb(so);
 	int level, op, optname;
 
 	level = sopt->sopt_level;
 	op = sopt->sopt_dir;
 	optname = sopt->sopt_name;
 	optlen = sopt->sopt_valsize;
 
 	if (level != IPPROTO_IPV6) {
 		return (EINVAL);
 	}
 
 	switch (optname) {
 	case IPV6_CHECKSUM:
 		/*
 		 * For ICMPv6 sockets, no modification allowed for checksum
 		 * offset, permit "no change" values to help existing apps.
 		 *
 		 * RFC3542 says: "An attempt to set IPV6_CHECKSUM
 		 * for an ICMPv6 socket will fail."
 		 * The current behavior does not meet RFC3542.
 		 */
 		switch (op) {
 		case SOPT_SET:
 			if (optlen != sizeof(int)) {
 				error = EINVAL;
 				break;
 			}
 			error = sooptcopyin(sopt, &optval, sizeof(optval),
 					    sizeof(optval));
 			if (error)
 				break;
 			if (optval < -1 ||
 			    (optval != -1 && (optval % 2) != 0)) {
 				/*
 				 * The API assumes non-negative even offset
 				 * values or -1 as a special value.  -1 is
 				 * itself odd, so it has to be exempted from
 				 * the parity test or it could never be set.
 				 */
 				error = EINVAL;
 			} else if (inp->inp_ip_p == IPPROTO_ICMPV6) {
 				/* Only the "no change" value is accepted. */
 				if (optval != icmp6off)
 					error = EINVAL;
 			} else
 				inp->in6p_cksum = optval;
 			break;
 
 		case SOPT_GET:
 			/* ICMPv6 sockets always report the fixed offset. */
 			if (inp->inp_ip_p == IPPROTO_ICMPV6)
 				optval = icmp6off;
 			else
 				optval = inp->in6p_cksum;
 
 			error = sooptcopyout(sopt, &optval, sizeof(optval));
 			break;
 
 		default:
 			error = EINVAL;
 			break;
 		}
 		break;
 
 	default:
 		error = ENOPROTOOPT;
 		break;
 	}
 
 	return (error);
 }
 
 /*
  * Replace the option set stored in *pktopt with the options described by
  * the control mbuf "m"; these options are later inserted into, or steer the
  * handling of, outgoing packets.
  *
  * Any previous options are discarded first.  An absent or empty "m" simply
  * leaves *pktopt set to NULL.  Returns 0 on success, ENOMEM when a fresh
  * option structure cannot be allocated, or the error reported by
  * ip6_setpktopts() (in which case *pktopt is NULL as well).
  */
 static int
 ip6_pcbopts(struct ip6_pktopts **pktopt, struct mbuf *m,
     struct socket *so, struct sockopt *sopt)
 {
 	struct epoch_tracker et;
 	struct ip6_pktopts *newopts;
 	struct ucred *cred;
 	int rc;
 
 	newopts = *pktopt;
 	if (newopts == NULL) {
 		newopts = malloc(sizeof(*newopts), M_IP6OPT, M_NOWAIT);
 		if (newopts == NULL)
 			return (ENOMEM);
 	} else {
 		/* Reuse the existing structure after dropping its contents. */
 #ifdef DIAGNOSTIC
 		if (newopts->ip6po_pktinfo || newopts->ip6po_nexthop ||
 		    newopts->ip6po_hbh || newopts->ip6po_dest1 ||
 		    newopts->ip6po_dest2 ||
 		    newopts->ip6po_rhinfo.ip6po_rhi_rthdr)
 			printf("ip6_pcbopts: all specified options are cleared.\n");
 #endif
 		ip6_clearpktopts(newopts, -1);
 	}
 	*pktopt = NULL;
 
 	/*
 	 * Nothing to install: the request only turns the old options off,
 	 * whether the structure was just allocated or handed in.
 	 */
 	if (m == NULL || m->m_len == 0) {
 		free(newopts, M_IP6OPT);
 		return (0);
 	}
 
 	cred = (sopt->sopt_td != NULL) ? sopt->sopt_td->td_ucred : NULL;
 
 	/* Parse the user-supplied options inside the network epoch. */
 	NET_EPOCH_ENTER(et);
 	rc = ip6_setpktopts(m, newopts, NULL, cred,
 	    so->so_proto->pr_protocol);
 	if (rc != 0) {
 		ip6_clearpktopts(newopts, -1); /* XXX: discard all options */
 		free(newopts, M_IP6OPT);
 	}
 	NET_EPOCH_EXIT(et);
 	if (rc != 0)
 		return (rc);
 
 	*pktopt = newopts;
 	return (0);
 }
 
 /*
  * Put an ip6_pktopts structure into its default state.  Zeroing alone is
  * not enough: several members use non-zero values to mean "default".
  */
 void
 ip6_initpktopts(struct ip6_pktopts *opt)
 {
 
 	bzero(opt, sizeof(struct ip6_pktopts));
 
 	/* Policy knobs start out at their system-wide settings. */
 	opt->ip6po_prefer_tempaddr = IP6PO_TEMPADDR_SYSTEM;
 	opt->ip6po_minmtu = IP6PO_MINMTU_MCASTONLY;
 
 	/* -1 selects the default traffic class and hop limit. */
 	opt->ip6po_tclass = -1;
 	opt->ip6po_hlim = -1;
 }
 
 /*
  * Apply a single sticky option (sticky=1, cmsg=0) to the option set that
  * *pktopt points to, allocating and initialising that set on first use.
  *
  * Returns ENOBUFS when the allocation fails, otherwise the result of
  * ip6_setpktopt(), which is invoked inside the network epoch.
  */
 static int
 ip6_pcbopt(int optname, u_char *buf, int len, struct ip6_pktopts **pktopt,
     struct ucred *cred, int uproto)
 {
 	struct epoch_tracker et;
 	struct ip6_pktopts *fresh;
 	int error;
 
 	if (*pktopt == NULL) {
 		fresh = malloc(sizeof(*fresh), M_IP6OPT, M_NOWAIT);
 		if (fresh == NULL)
 			return (ENOBUFS);
 		ip6_initpktopts(fresh);
 		*pktopt = fresh;
 	}
 
 	NET_EPOCH_ENTER(et);
 	error = ip6_setpktopt(optname, buf, len, *pktopt, cred, 1, 0, uproto);
 	NET_EPOCH_EXIT(et);
 
 	return (error);
 }
 
 /*
  * GET_PKTOPT_VAR(field, lenexpr): helper for ip6_getpcbopt() that copies the
  * variable-length option hanging off pktopt->field into a temporary buffer.
  *
  * The macro relies on the caller's locals inp, sopt, pktopt, optdata,
  * optdatalen and malloc_optdata, and is entered with the inp read lock
  * held.  The lock is released around the M_WAITOK allocation (presumably
  * because that allocation may sleep) and re-taken afterwards.  After
  * re-locking, the inp_flags test below may unlock, free the buffer and make
  * the ENCLOSING FUNCTION return ECONNRESET.  Otherwise pktopt is re-read
  * from inp->in6p_outputopts and checked again before anything is copied.
  *
  * lenexpr is evaluated only after that re-read; at most sopt->sopt_valsize
  * bytes are copied.  If the option is gone by then, the buffer is freed and
  * optdata/malloc_optdata are reset.  On exit the inp read lock is held.
  */
 #define GET_PKTOPT_VAR(field, lenexpr) do {					\
 	if (pktopt && pktopt->field) {						\
 		INP_RUNLOCK(inp);						\
 		optdata = malloc(sopt->sopt_valsize, M_TEMP, M_WAITOK);		\
 		malloc_optdata = true;						\
 		INP_RLOCK(inp);							\
-		if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {		\
+		if (inp->inp_flags & INP_DROPPED) {				\
 			INP_RUNLOCK(inp);					\
 			free(optdata, M_TEMP);					\
 			return (ECONNRESET);					\
 		}								\
 		pktopt = inp->in6p_outputopts;					\
 		if (pktopt && pktopt->field) {					\
 			optdatalen = min(lenexpr, sopt->sopt_valsize);		\
 			bcopy(pktopt->field, optdata, optdatalen);		\
 		} else {							\
 			free(optdata, M_TEMP);					\
 			optdata = NULL;						\
 			malloc_optdata = false;					\
 		}								\
 	}									\
 } while(0)
 
 /* Extension header variant: length is (ip6e_len + 1) * 8 bytes. */
 #define GET_PKTOPT_EXT_HDR(field) GET_PKTOPT_VAR(field,				\
 	(((struct ip6_ext *)pktopt->field)->ip6e_len + 1) << 3)
 
 /* Socket address variant: length is taken from the sockaddr's sa_len. */
 #define GET_PKTOPT_SOCKADDR(field) GET_PKTOPT_VAR(field,			\
 	pktopt->field->sa_len)
 
 /*
  * Fetch the current value of one sticky IPv6 packet option from the inpcb
  * and copy it out to the caller with sooptcopyout().
  *
  * Fixed-size options are staged in local variables; when the inpcb has no
  * option set, or the particular option is unset, the initial values of
  * those locals are what gets reported.  Variable-length options go through
  * the GET_PKTOPT_* macros, which may allocate a temporary buffer (tracked
  * by malloc_optdata), may drop and re-take the inp read lock, and may
  * return ECONNRESET directly from this function.  An unset variable-length
  * option results in a zero-length copyout.
  *
  * Returns 0 or an errno value; ENOPROTOOPT for an option not handled here.
  */
 static int
 ip6_getpcbopt(struct inpcb *inp, int optname, struct sockopt *sopt)
 {
 	void *optdata = NULL;
 	bool malloc_optdata = false;
 	int optdatalen = 0;
 	int error = 0;
 	struct in6_pktinfo null_pktinfo;
 	int deftclass = 0, on;
 	int defminmtu = IP6PO_MINMTU_MCASTONLY;
 	int defpreftemp = IP6PO_TEMPADDR_SYSTEM;
 	struct ip6_pktopts *pktopt;
 
 	INP_RLOCK(inp);
 	pktopt = inp->in6p_outputopts;
 
 	switch (optname) {
 	case IPV6_PKTINFO:
 		/*
 		 * Report a private copy with the embedded scope cleared, or
 		 * an all-zero structure when no pktinfo is set.
 		 */
 		optdata = (void *)&null_pktinfo;
 		if (pktopt && pktopt->ip6po_pktinfo) {
 			bcopy(pktopt->ip6po_pktinfo, &null_pktinfo,
 			    sizeof(null_pktinfo));
 			in6_clearscope(&null_pktinfo.ipi6_addr);
 		} else {
 			/* XXX: we don't have to do this every time... */
 			bzero(&null_pktinfo, sizeof(null_pktinfo));
 		}
 		optdatalen = sizeof(struct in6_pktinfo);
 		break;
 	case IPV6_TCLASS:
 		/* A negative stored value leaves the reported class at 0. */
 		if (pktopt && pktopt->ip6po_tclass >= 0)
 			deftclass = pktopt->ip6po_tclass;
 		optdata = (void *)&deftclass;
 		optdatalen = sizeof(int);
 		break;
 	case IPV6_HOPOPTS:
 		GET_PKTOPT_EXT_HDR(ip6po_hbh);
 		break;
 	case IPV6_RTHDR:
 		GET_PKTOPT_EXT_HDR(ip6po_rthdr);
 		break;
 	case IPV6_RTHDRDSTOPTS:
 		GET_PKTOPT_EXT_HDR(ip6po_dest1);
 		break;
 	case IPV6_DSTOPTS:
 		GET_PKTOPT_EXT_HDR(ip6po_dest2);
 		break;
 	case IPV6_NEXTHOP:
 		GET_PKTOPT_SOCKADDR(ip6po_nexthop);
 		break;
 	case IPV6_USE_MIN_MTU:
 		if (pktopt)
 			defminmtu = pktopt->ip6po_minmtu;
 		optdata = (void *)&defminmtu;
 		optdatalen = sizeof(int);
 		break;
 	case IPV6_DONTFRAG:
 		if (pktopt && ((pktopt->ip6po_flags) & IP6PO_DONTFRAG))
 			on = 1;
 		else
 			on = 0;
 		optdata = (void *)&on;
 		optdatalen = sizeof(on);
 		break;
 	case IPV6_PREFER_TEMPADDR:
 		if (pktopt)
 			defpreftemp = pktopt->ip6po_prefer_tempaddr;
 		optdata = (void *)&defpreftemp;
 		optdatalen = sizeof(int);
 		break;
 	default:		/* should not happen */
 #ifdef DIAGNOSTIC
 		panic("ip6_getpcbopt: unexpected option\n");
 #endif
 		INP_RUNLOCK(inp);
 		return (ENOPROTOOPT);
 	}
 	/* The value has been staged; copy it out without holding the lock. */
 	INP_RUNLOCK(inp);
 
 	error = sooptcopyout(sopt, optdata, optdatalen);
 	if (malloc_optdata)
 		free(optdata, M_TEMP);
 
 	return (error);
 }
 
 void
 ip6_clearpktopts(struct ip6_pktopts *pktopt, int optname)
 {
 	if (pktopt == NULL)
 		return;
 
 	if (optname == -1 || optname == IPV6_PKTINFO) {
 		if (pktopt->ip6po_pktinfo)
 			free(pktopt->ip6po_pktinfo, M_IP6OPT);
 		pktopt->ip6po_pktinfo = NULL;
 	}
 	if (optname == -1 || optname == IPV6_HOPLIMIT)
 		pktopt->ip6po_hlim = -1;
 	if (optname == -1 || optname == IPV6_TCLASS)
 		pktopt->ip6po_tclass = -1;
 	if (optname == -1 || optname == IPV6_NEXTHOP) {
 		if (pktopt->ip6po_nextroute.ro_nh) {
 			NH_FREE(pktopt->ip6po_nextroute.ro_nh);
 			pktopt->ip6po_nextroute.ro_nh = NULL;
 		}
 		if (pktopt->ip6po_nexthop)
 			free(pktopt->ip6po_nexthop, M_IP6OPT);
 		pktopt->ip6po_nexthop = NULL;
 	}
 	if (optname == -1 || optname == IPV6_HOPOPTS) {
 		if (pktopt->ip6po_hbh)
 			free(pktopt->ip6po_hbh, M_IP6OPT);
 		pktopt->ip6po_hbh = NULL;
 	}
 	if (optname == -1 || optname == IPV6_RTHDRDSTOPTS) {
 		if (pktopt->ip6po_dest1)
 			free(pktopt->ip6po_dest1, M_IP6OPT);
 		pktopt->ip6po_dest1 = NULL;
 	}
 	if (optname == -1 || optname == IPV6_RTHDR) {
 		if (pktopt->ip6po_rhinfo.ip6po_rhi_rthdr)
 			free(pktopt->ip6po_rhinfo.ip6po_rhi_rthdr, M_IP6OPT);
 		pktopt->ip6po_rhinfo.ip6po_rhi_rthdr = NULL;
 		if (pktopt->ip6po_route.ro_nh) {
 			NH_FREE(pktopt->ip6po_route.ro_nh);
 			pktopt->ip6po_route.ro_nh = NULL;
 		}
 	}
 	if (optname == -1 || optname == IPV6_DSTOPTS) {
 		if (pktopt->ip6po_dest2)
 			free(pktopt->ip6po_dest2, M_IP6OPT);
 		pktopt->ip6po_dest2 = NULL;
 	}
 }
 
 /*
  * Duplicate the extension header src->type into dst->type.  The length is
  * taken from the header itself: (ip6e_len + 1) * 8 bytes.  Jumps to the
  * enclosing function's "bad" label when the allocation fails.
  */
 #define PKTOPT_EXTHDRCPY(type) \
 do {\
 	if (src->type) {\
 		int hlen = (((struct ip6_ext *)src->type)->ip6e_len + 1) << 3;\
 		dst->type = malloc(hlen, M_IP6OPT, canwait);\
 		if (dst->type == NULL)\
 			goto bad;\
 		bcopy(src->type, dst->type, hlen);\
 	}\
 } while (/*CONSTCOND*/ 0)
 
 /*
  * Deep-copy the options in "src" into "dst".
  *
  * Scalar members are assigned; pktinfo, nexthop and the extension headers
  * are duplicated into fresh M_IP6OPT allocations made with the "canwait"
  * malloc flag.  Cached routes are not copied.  "dst" is expected to be
  * initialised already: members that are unset in "src" are left untouched
  * and the failure path hands "dst" to ip6_clearpktopts().
  *
  * Returns 0 on success, EINVAL when either argument is NULL, or ENOBUFS
  * when an allocation fails (in which case "dst" has been cleared).
  */
 static int
 copypktopts(struct ip6_pktopts *dst, struct ip6_pktopts *src, int canwait)
 {
 	if (dst == NULL || src == NULL)  {
 		/* Name this function, not ip6_clearpktopts, in the log. */
 		printf("copypktopts: invalid argument\n");
 		return (EINVAL);
 	}
 
 	dst->ip6po_hlim = src->ip6po_hlim;
 	dst->ip6po_tclass = src->ip6po_tclass;
 	dst->ip6po_flags = src->ip6po_flags;
 	dst->ip6po_minmtu = src->ip6po_minmtu;
 	dst->ip6po_prefer_tempaddr = src->ip6po_prefer_tempaddr;
 	if (src->ip6po_pktinfo) {
 		dst->ip6po_pktinfo = malloc(sizeof(*dst->ip6po_pktinfo),
 		    M_IP6OPT, canwait);
 		if (dst->ip6po_pktinfo == NULL)
 			goto bad;
 		*dst->ip6po_pktinfo = *src->ip6po_pktinfo;
 	}
 	if (src->ip6po_nexthop) {
 		/* The sockaddr is sized by its own sa_len. */
 		dst->ip6po_nexthop = malloc(src->ip6po_nexthop->sa_len,
 		    M_IP6OPT, canwait);
 		if (dst->ip6po_nexthop == NULL)
 			goto bad;
 		bcopy(src->ip6po_nexthop, dst->ip6po_nexthop,
 		    src->ip6po_nexthop->sa_len);
 	}
 	PKTOPT_EXTHDRCPY(ip6po_hbh);
 	PKTOPT_EXTHDRCPY(ip6po_dest1);
 	PKTOPT_EXTHDRCPY(ip6po_dest2);
 	PKTOPT_EXTHDRCPY(ip6po_rthdr); /* not copy the cached route */
 	return (0);
 
   bad:
 	/* Undo the partial copy so the caller is not left with fragments. */
 	ip6_clearpktopts(dst, -1);
 	return (ENOBUFS);
 }
 #undef PKTOPT_EXTHDRCPY
 
 /*
  * Allocate a new option set and fill it with a deep copy of "src".
  * "canwait" is passed through as the malloc flag.  Returns the copy, or
  * NULL when the allocation or the copy itself fails.
  */
 struct ip6_pktopts *
 ip6_copypktopts(struct ip6_pktopts *src, int canwait)
 {
 	struct ip6_pktopts *copy;
 
 	copy = malloc(sizeof(*copy), M_IP6OPT, canwait);
 	if (copy != NULL) {
 		ip6_initpktopts(copy);
 		if (copypktopts(copy, src, canwait) != 0) {
 			free(copy, M_IP6OPT);
 			copy = NULL;
 		}
 	}
 
 	return (copy);
 }
 
 /*
  * Destroy an option set: release every option it holds, then the structure
  * itself.  A NULL argument is ignored.
  */
 void
 ip6_freepcbopts(struct ip6_pktopts *pktopt)
 {
 	if (pktopt != NULL) {
 		ip6_clearpktopts(pktopt, -1);
 		free(pktopt, M_IP6OPT);
 	}
 }
 
 /*
  * Set IPv6 outgoing packet options based on advanced API.
  *
  * control	single mbuf holding a sequence of cmsghdr records; it is
  *		consumed in place (m_data/m_len are advanced)
  * opt		option set to fill; it is re-initialised first
  * stickyopt	optional sticky options copied into "opt" before the
  *		ancillary data overrides them
  * cred, uproto	passed through to ip6_setpktopt()
  *
  * Records whose level is not IPPROTO_IPV6 are skipped.  Must be called
  * inside the network epoch.  Returns 0 or an errno value.
  */
 int
 ip6_setpktopts(struct mbuf *control, struct ip6_pktopts *opt,
     struct ip6_pktopts *stickyopt, struct ucred *cred, int uproto)
 {
 	struct cmsghdr *cm = NULL;
 
 	if (control == NULL || opt == NULL)
 		return (EINVAL);
 
 	/*
 	 * ip6_setpktopt can call ifnet_byindex(), so it's imperative that we
 	 * are in the network epoch here.
 	 */
 	NET_EPOCH_ASSERT();
 
 	ip6_initpktopts(opt);
 	if (stickyopt) {
 		int error;
 
 		/*
 		 * If stickyopt is provided, make a local copy of the options
 		 * for this particular packet, then override them by ancillary
 		 * objects.
 		 * XXX: copypktopts() does not copy the cached route to a next
 		 * hop (if any).  This is not very good in terms of efficiency,
 		 * but we can allow this since this option should be rarely
 		 * used.
 		 */
 		if ((error = copypktopts(opt, stickyopt, M_NOWAIT)) != 0)
 			return (error);
 	}
 
 	/*
 	 * XXX: Currently, we assume all the optional information is stored
 	 * in a single mbuf.
 	 */
 	if (control->m_next)
 		return (EINVAL);
 
 	for (; control->m_len > 0; control->m_data += CMSG_ALIGN(cm->cmsg_len),
 	    control->m_len -= CMSG_ALIGN(cm->cmsg_len)) {
 		int error;
 
 		if (control->m_len < CMSG_LEN(0))
 			return (EINVAL);
 
 		cm = mtod(control, struct cmsghdr *);
 		/*
 		 * A record can be neither shorter than its own header nor
 		 * longer than what is left in the mbuf.  The lower bound
 		 * keeps the payload length computed below from going
 		 * negative and keeps the cursor from being advanced into
 		 * the middle of a header.
 		 */
 		if (cm->cmsg_len < CMSG_LEN(0) ||
 		    cm->cmsg_len > control->m_len)
 			return (EINVAL);
 		if (cm->cmsg_level != IPPROTO_IPV6)
 			continue;
 
 		error = ip6_setpktopt(cm->cmsg_type, CMSG_DATA(cm),
 		    cm->cmsg_len - CMSG_LEN(0), opt, cred, 0, 1, uproto);
 		if (error)
 			return (error);
 	}
 
 	return (0);
 }
 
 /*
  * Set a particular packet option, as a sticky option or an ancillary data
  * item.  "len" can be 0 only when it's a sticky option.
  * We have 4 cases of combination of "sticky" and "cmsg":
  * "sticky=0, cmsg=0": impossible
  * "sticky=0, cmsg=1": RFC2292 or RFC3542 ancillary data
  * "sticky=1, cmsg=0": RFC3542 socket option
  * "sticky=1, cmsg=1": RFC2292 socket option
  *
  * optname	option or ancillary data type
  * buf, len	option payload and its length in bytes
  * opt		option set that is updated in place
  * cred		when non-NULL, PRIV_NETINET_SETHDROPTS is checked against it
  *		for the next-hop, hop-by-hop and destination options
  * uproto	upper-layer protocol; IPPROTO_TCP changes the handling of
  *		IPV6_PKTINFO and IPV6_DONTFRAG
  *
  * Must be called inside the network epoch.  Returns 0 or an errno value.
  */
 static int
 ip6_setpktopt(int optname, u_char *buf, int len, struct ip6_pktopts *opt,
     struct ucred *cred, int sticky, int cmsg, int uproto)
 {
 	int minmtupolicy, preftemp;
 	int error;
 
 	NET_EPOCH_ASSERT();
 
 	if (!sticky && !cmsg) {
 #ifdef DIAGNOSTIC
 		printf("ip6_setpktopt: impossible case\n");
 #endif
 		return (EINVAL);
 	}
 
 	/*
 	 * IPV6_2292xxx is for backward compatibility to RFC2292, and should
 	 * not be specified in the context of RFC3542.  Conversely,
 	 * RFC3542 types should not be specified in the context of RFC2292.
 	 */
 	if (!cmsg) {
 		switch (optname) {
 		case IPV6_2292PKTINFO:
 		case IPV6_2292HOPLIMIT:
 		case IPV6_2292NEXTHOP:
 		case IPV6_2292HOPOPTS:
 		case IPV6_2292DSTOPTS:
 		case IPV6_2292RTHDR:
 		case IPV6_2292PKTOPTIONS:
 			return (ENOPROTOOPT);
 		}
 	}
 	if (sticky && cmsg) {
 		switch (optname) {
 		case IPV6_PKTINFO:
 		case IPV6_HOPLIMIT:
 		case IPV6_NEXTHOP:
 		case IPV6_HOPOPTS:
 		case IPV6_DSTOPTS:
 		case IPV6_RTHDRDSTOPTS:
 		case IPV6_RTHDR:
 		case IPV6_USE_MIN_MTU:
 		case IPV6_DONTFRAG:
 		case IPV6_TCLASS:
 		case IPV6_PREFER_TEMPADDR: /* XXX: not an RFC3542 option */
 			return (ENOPROTOOPT);
 		}
 	}
 
 	switch (optname) {
 	case IPV6_2292PKTINFO:
 	case IPV6_PKTINFO:
 	{
 		struct ifnet *ifp = NULL;
 		struct in6_pktinfo *pktinfo;
 
 		if (len != sizeof(struct in6_pktinfo))
 			return (EINVAL);
 
 		pktinfo = (struct in6_pktinfo *)buf;
 
 		/*
 		 * An application can clear any sticky IPV6_PKTINFO option by
 		 * doing a "regular" setsockopt with ipi6_addr being
 		 * in6addr_any and ipi6_ifindex being zero.
 		 * [RFC 3542, Section 6]
 		 */
 		if (optname == IPV6_PKTINFO && opt->ip6po_pktinfo &&
 		    pktinfo->ipi6_ifindex == 0 &&
 		    IN6_IS_ADDR_UNSPECIFIED(&pktinfo->ipi6_addr)) {
 			ip6_clearpktopts(opt, optname);
 			break;
 		}
 
 		if (uproto == IPPROTO_TCP && optname == IPV6_PKTINFO &&
 		    sticky && !IN6_IS_ADDR_UNSPECIFIED(&pktinfo->ipi6_addr)) {
 			return (EINVAL);
 		}
 		if (IN6_IS_ADDR_MULTICAST(&pktinfo->ipi6_addr))
 			return (EINVAL);
 		/* validate the interface index if specified. */
 		if (pktinfo->ipi6_ifindex) {
 			ifp = ifnet_byindex(pktinfo->ipi6_ifindex);
 			if (ifp == NULL)
 				return (ENXIO);
 		}
 		if (ifp != NULL && (ifp->if_afdata[AF_INET6] == NULL ||
 		    (ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED) != 0))
 			return (ENETDOWN);
 
 		if (ifp != NULL &&
 		    !IN6_IS_ADDR_UNSPECIFIED(&pktinfo->ipi6_addr)) {
 			struct in6_ifaddr *ia;
 
 			/* The address must be configured on that interface. */
 			in6_setscope(&pktinfo->ipi6_addr, ifp, NULL);
 			ia = in6ifa_ifpwithaddr(ifp, &pktinfo->ipi6_addr);
 			if (ia == NULL)
 				return (EADDRNOTAVAIL);
 			ifa_free(&ia->ia_ifa);
 		}
 		/*
 		 * We store the address anyway, and let in6_selectsrc()
 		 * validate the specified address.  This is because ipi6_addr
 		 * may not have enough information about its scope zone, and
 		 * we may need additional information (such as outgoing
 		 * interface or the scope zone of a destination address) to
 		 * disambiguate the scope.
 		 * XXX: the delay of the validation may confuse the
 		 * application when it is used as a sticky option.
 		 */
 		if (opt->ip6po_pktinfo == NULL) {
 			opt->ip6po_pktinfo = malloc(sizeof(*pktinfo),
 			    M_IP6OPT, M_NOWAIT);
 			if (opt->ip6po_pktinfo == NULL)
 				return (ENOBUFS);
 		}
 		bcopy(pktinfo, opt->ip6po_pktinfo, sizeof(*pktinfo));
 		break;
 	}
 
 	case IPV6_2292HOPLIMIT:
 	case IPV6_HOPLIMIT:
 	{
 		int *hlimp;
 
 		/*
 		 * RFC 3542 deprecated the usage of sticky IPV6_HOPLIMIT
 		 * to simplify the ordering among hoplimit options.
 		 */
 		if (optname == IPV6_HOPLIMIT && sticky)
 			return (ENOPROTOOPT);
 
 		if (len != sizeof(int))
 			return (EINVAL);
 		hlimp = (int *)buf;
 		if (*hlimp < -1 || *hlimp > 255)
 			return (EINVAL);
 
 		opt->ip6po_hlim = *hlimp;
 		break;
 	}
 
 	case IPV6_TCLASS:
 	{
 		int tclass;
 
 		if (len != sizeof(int))
 			return (EINVAL);
 		tclass = *(int *)buf;
 		if (tclass < -1 || tclass > 255)
 			return (EINVAL);
 
 		opt->ip6po_tclass = tclass;
 		break;
 	}
 
 	case IPV6_2292NEXTHOP:
 	case IPV6_NEXTHOP:
 		if (cred != NULL) {
 			error = priv_check_cred(cred, PRIV_NETINET_SETHDROPTS);
 			if (error)
 				return (error);
 		}
 
 		if (len == 0) {	/* just remove the option */
 			ip6_clearpktopts(opt, IPV6_NEXTHOP);
 			break;
 		}
 
 		/* check if cmsg_len is large enough for sa_len */
 		if (len < sizeof(struct sockaddr) || len < *buf)
 			return (EINVAL);
 
 		switch (((struct sockaddr *)buf)->sa_family) {
 		case AF_INET6:
 		{
 			struct sockaddr_in6 *sa6 = (struct sockaddr_in6 *)buf;
 
 			if (sa6->sin6_len != sizeof(struct sockaddr_in6))
 				return (EINVAL);
 
 			if (IN6_IS_ADDR_UNSPECIFIED(&sa6->sin6_addr) ||
 			    IN6_IS_ADDR_MULTICAST(&sa6->sin6_addr)) {
 				return (EINVAL);
 			}
 			if ((error = sa6_embedscope(sa6, V_ip6_use_defzone))
 			    != 0) {
 				return (error);
 			}
 			break;
 		}
 		case AF_LINK:	/* should eventually be supported */
 		default:
 			return (EAFNOSUPPORT);
 		}
 
 		/* turn off the previous option, then set the new option. */
 		ip6_clearpktopts(opt, IPV6_NEXTHOP);
 		opt->ip6po_nexthop = malloc(*buf, M_IP6OPT, M_NOWAIT);
 		if (opt->ip6po_nexthop == NULL)
 			return (ENOBUFS);
 		bcopy(buf, opt->ip6po_nexthop, *buf);
 		break;
 
 	case IPV6_2292HOPOPTS:
 	case IPV6_HOPOPTS:
 	{
 		struct ip6_hbh *hbh;
 		int hbhlen;
 
 		/*
 		 * XXX: We don't allow a non-privileged user to set ANY HbH
 		 * options, since per-option restriction has too much
 		 * overhead.
 		 */
 		if (cred != NULL) {
 			error = priv_check_cred(cred, PRIV_NETINET_SETHDROPTS);
 			if (error)
 				return (error);
 		}
 
 		if (len == 0) {
 			ip6_clearpktopts(opt, IPV6_HOPOPTS);
 			break;	/* just remove the option */
 		}
 
 		/* message length validation */
 		if (len < sizeof(struct ip6_hbh))
 			return (EINVAL);
 		hbh = (struct ip6_hbh *)buf;
 		hbhlen = (hbh->ip6h_len + 1) << 3;
 		if (len != hbhlen)
 			return (EINVAL);
 
 		/* turn off the previous option, then set the new option. */
 		ip6_clearpktopts(opt, IPV6_HOPOPTS);
 		opt->ip6po_hbh = malloc(hbhlen, M_IP6OPT, M_NOWAIT);
 		if (opt->ip6po_hbh == NULL)
 			return (ENOBUFS);
 		bcopy(hbh, opt->ip6po_hbh, hbhlen);
 
 		break;
 	}
 
 	case IPV6_2292DSTOPTS:
 	case IPV6_DSTOPTS:
 	case IPV6_RTHDRDSTOPTS:
 	{
 		struct ip6_dest *dest, **newdest = NULL;
 		int destlen;
 
 		if (cred != NULL) { /* XXX: see the comment for IPV6_HOPOPTS */
 			error = priv_check_cred(cred, PRIV_NETINET_SETHDROPTS);
 			if (error)
 				return (error);
 		}
 
 		if (len == 0) {
 			ip6_clearpktopts(opt, optname);
 			break;	/* just remove the option */
 		}
 
 		/* message length validation */
 		if (len < sizeof(struct ip6_dest))
 			return (EINVAL);
 		dest = (struct ip6_dest *)buf;
 		destlen = (dest->ip6d_len + 1) << 3;
 		if (len != destlen)
 			return (EINVAL);
 
 		/*
 		 * Determine the position that the destination options header
 		 * should be inserted; before or after the routing header.
 		 */
 		switch (optname) {
 		case IPV6_2292DSTOPTS:
 			/*
 			 * The old advanced API is ambiguous on this point.
 			 * Our approach is to determine the position based
 			 * according to the existence of a routing header.
 			 * Note, however, that this depends on the order of the
 			 * extension headers in the ancillary data; the 1st
 			 * part of the destination options header must appear
 			 * before the routing header in the ancillary data,
 			 * too.
 			 * RFC3542 solved the ambiguity by introducing
 			 * separate ancillary data or option types.
 			 */
 			if (opt->ip6po_rthdr == NULL)
 				newdest = &opt->ip6po_dest1;
 			else
 				newdest = &opt->ip6po_dest2;
 			break;
 		case IPV6_RTHDRDSTOPTS:
 			newdest = &opt->ip6po_dest1;
 			break;
 		case IPV6_DSTOPTS:
 			newdest = &opt->ip6po_dest2;
 			break;
 		}
 
 		/*
 		 * Turn off the previous option, then set the new option.
 		 * The chosen slot is released directly rather than through
 		 * ip6_clearpktopts(opt, optname): that routine has no case
 		 * for IPV6_2292DSTOPTS, so a header already stored in the
 		 * slot would otherwise be overwritten and leaked.
 		 */
 		if (*newdest != NULL) {
 			free(*newdest, M_IP6OPT);
 			*newdest = NULL;
 		}
 		*newdest = malloc(destlen, M_IP6OPT, M_NOWAIT);
 		if (*newdest == NULL)
 			return (ENOBUFS);
 		bcopy(dest, *newdest, destlen);
 
 		break;
 	}
 
 	case IPV6_2292RTHDR:
 	case IPV6_RTHDR:
 	{
 		struct ip6_rthdr *rth;
 		int rthlen;
 
 		if (len == 0) {
 			ip6_clearpktopts(opt, IPV6_RTHDR);
 			break;	/* just remove the option */
 		}
 
 		/* message length validation */
 		if (len < sizeof(struct ip6_rthdr))
 			return (EINVAL);
 		rth = (struct ip6_rthdr *)buf;
 		rthlen = (rth->ip6r_len + 1) << 3;
 		if (len != rthlen)
 			return (EINVAL);
 
 		switch (rth->ip6r_type) {
 		case IPV6_RTHDR_TYPE_0:
 			if (rth->ip6r_len == 0)	/* must contain one addr */
 				return (EINVAL);
 			if (rth->ip6r_len % 2) /* length must be even */
 				return (EINVAL);
 			if (rth->ip6r_len / 2 != rth->ip6r_segleft)
 				return (EINVAL);
 			break;
 		default:
 			return (EINVAL);	/* not supported */
 		}
 
 		/* turn off the previous option */
 		ip6_clearpktopts(opt, IPV6_RTHDR);
 		opt->ip6po_rthdr = malloc(rthlen, M_IP6OPT, M_NOWAIT);
 		if (opt->ip6po_rthdr == NULL)
 			return (ENOBUFS);
 		bcopy(rth, opt->ip6po_rthdr, rthlen);
 
 		break;
 	}
 
 	case IPV6_USE_MIN_MTU:
 		if (len != sizeof(int))
 			return (EINVAL);
 		minmtupolicy = *(int *)buf;
 		if (minmtupolicy != IP6PO_MINMTU_MCASTONLY &&
 		    minmtupolicy != IP6PO_MINMTU_DISABLE &&
 		    minmtupolicy != IP6PO_MINMTU_ALL) {
 			return (EINVAL);
 		}
 		opt->ip6po_minmtu = minmtupolicy;
 		break;
 
 	case IPV6_DONTFRAG:
 		if (len != sizeof(int))
 			return (EINVAL);
 
 		if (uproto == IPPROTO_TCP || *(int *)buf == 0) {
 			/*
 			 * we ignore this option for TCP sockets.
 			 * (RFC3542 leaves this case unspecified.)
 			 */
 			opt->ip6po_flags &= ~IP6PO_DONTFRAG;
 		} else
 			opt->ip6po_flags |= IP6PO_DONTFRAG;
 		break;
 
 	case IPV6_PREFER_TEMPADDR:
 		if (len != sizeof(int))
 			return (EINVAL);
 		preftemp = *(int *)buf;
 		if (preftemp != IP6PO_TEMPADDR_SYSTEM &&
 		    preftemp != IP6PO_TEMPADDR_NOTPREFER &&
 		    preftemp != IP6PO_TEMPADDR_PREFER) {
 			return (EINVAL);
 		}
 		opt->ip6po_prefer_tempaddr = preftemp;
 		break;
 
 	default:
 		return (ENOPROTOOPT);
 	} /* end of switch */
 
 	return (0);
 }
 
 /*
  * Loop a copy of an outgoing IPv6 multicast packet back to the input queue
  * of the given interface.  Called from ip6_output().  The copy is handed to
  * the output routine of the loopback "driver" with an interface pointer
  * that might NOT be &loif -- easier than replicating that code here.
  * Allocation failures are silent: the copy is simply not delivered.
  */
 void
 ip6_mloopback(struct ifnet *ifp, struct mbuf *m)
 {
 	struct ip6_hdr *hdr;
 	struct mbuf *dup;
 
 	dup = m_copym(m, 0, M_COPYALL, M_NOWAIT);
 	if (dup == NULL)
 		return;
 
 	/*
 	 * The IPv6 header is overwritten below, so it has to sit in
 	 * writable, contiguous storage of its own (the data may live in an
 	 * mbuf cluster); pull it up when that is not already the case.
 	 */
 	if (dup->m_len < sizeof(struct ip6_hdr) || !M_WRITABLE(dup)) {
 		dup = m_pullup(dup, sizeof(struct ip6_hdr));
 		if (dup == NULL)
 			return;
 	}
 	hdr = mtod(dup, struct ip6_hdr *);
 
 	/*
 	 * Strip embedded scope identifiers; in6_clearscope() only touches
 	 * an address when that is necessary.
 	 */
 	in6_clearscope(&hdr->ip6_src);
 	in6_clearscope(&hdr->ip6_dst);
 
 	/* A checksum that was still pending is marked as already valid. */
 	if ((dup->m_pkthdr.csum_flags & CSUM_DELAY_DATA_IPV6) != 0) {
 		dup->m_pkthdr.csum_data = 0xffff;
 		dup->m_pkthdr.csum_flags |= CSUM_DATA_VALID_IPV6 |
 		    CSUM_PSEUDO_HDR;
 	}
 
 	if_simloop(ifp, dup, AF_INET6, 0);
 }
 
 /*
  * Chop IPv6 header off from the payload.
  */
 static int
 ip6_splithdr(struct mbuf *m, struct ip6_exthdrs *exthdrs)
 {
 	struct mbuf *mh;
 	struct ip6_hdr *ip6;
 
 	ip6 = mtod(m, struct ip6_hdr *);
 	if (m->m_len > sizeof(*ip6)) {
 		mh = m_gethdr(M_NOWAIT, MT_DATA);
 		if (mh == NULL) {
 			m_freem(m);
 			return ENOBUFS;
 		}
 		m_move_pkthdr(mh, m);
 		M_ALIGN(mh, sizeof(*ip6));
 		m->m_len -= sizeof(*ip6);
 		m->m_data += sizeof(*ip6);
 		mh->m_next = m;
 		m = mh;
 		m->m_len = sizeof(*ip6);
 		bcopy((caddr_t)ip6, mtod(m, caddr_t), sizeof(*ip6));
 	}
 	exthdrs->ip6e_ip6 = m;
 	return 0;
 }
 
 /*
  * Compute IPv6 extension header length.
  */
 int
 ip6_optlen(struct inpcb *inp)
 {
 	int len;
 
 	if (!inp->in6p_outputopts)
 		return 0;
 
 	len = 0;
 #define elen(x) \
     (((struct ip6_ext *)(x)) ? (((struct ip6_ext *)(x))->ip6e_len + 1) << 3 : 0)
 
 	len += elen(inp->in6p_outputopts->ip6po_hbh);
 	if (inp->in6p_outputopts->ip6po_rthdr)
 		/* dest1 is valid with rthdr only */
 		len += elen(inp->in6p_outputopts->ip6po_dest1);
 	len += elen(inp->in6p_outputopts->ip6po_rthdr);
 	len += elen(inp->in6p_outputopts->ip6po_dest2);
 	return len;
 #undef elen
 }
diff --git a/sys/netipsec/xform_tcp.c b/sys/netipsec/xform_tcp.c
index ce2552f0a205..42c63813e63c 100644
--- a/sys/netipsec/xform_tcp.c
+++ b/sys/netipsec/xform_tcp.c
@@ -1,429 +1,429 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 2003 Bruce M. Simpson <bms@spc.org>
  * Copyright (c) 2016 Andrey V. Elsukov <ae@FreeBSD.org>
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  *
  * 1. Redistributions of source code must retain the above copyright
  *   notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *   notice, this list of conditions and the following disclaimer in the
  *   documentation and/or other materials provided with the distribution.
  * 3. The name of the author may not be used to endorse or promote products
  *   derived from this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 /* TCP MD5 Signature Option (RFC2385) */
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_ipsec.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/mbuf.h>
 #include <sys/lock.h>
 #include <sys/md5.h>
 #include <sys/rmlock.h>
 #include <sys/socket.h>
 #include <sys/sockopt.h>
 #include <sys/kernel.h>
 #include <sys/module.h>
 #include <sys/protosw.h>
 
 #include <netinet/in.h>
 #include <netinet/in_pcb.h>
 #include <netinet/in_systm.h>
 #include <netinet/ip.h>
 #include <netinet/ip_var.h>
 #include <netinet/tcp.h>
 #include <netinet/tcp_var.h>
 
 #include <net/vnet.h>
 
 #include <netipsec/ipsec.h>
 #include <netipsec/ipsec_support.h>
 #include <netipsec/xform.h>
 
 #ifdef INET6
 #include <netinet/ip6.h>
 #include <netipsec/ipsec6.h>
 #endif
 
 #include <netipsec/key.h>
 #include <netipsec/key_debug.h>
 
 #define	TCP_SIGLEN	16	/* length of computed digest in bytes */
 #define	TCP_KEYLEN_MIN	1	/* minimum length of TCP-MD5 key */
 #define	TCP_KEYLEN_MAX	80	/* maximum length of TCP-MD5 key */
 
 /*
  * Socket option handler for TCP_MD5SIG.
  *
  * Any other option name is rejected with ENOPROTOOPT.  For SOPT_GET the
  * state of TF_SIGNATURE in the tcpcb is copied out as 0 or 1.  Otherwise
  * an int is copied in, and TF_SIGNATURE is set when the value is positive
  * and cleared when it is not.  ECONNRESET is returned when the inp_flags
  * check made under the inpcb lock fails.
  */
 static int
 tcp_ipsec_pcbctl(struct inpcb *inp, struct sockopt *sopt)
 {
 	struct tcpcb *tp;
 	int error, optval;
 
 	if (sopt->sopt_name != TCP_MD5SIG) {
 		return (ENOPROTOOPT);
 	}
 
 	if (sopt->sopt_dir == SOPT_GET) {
 		INP_RLOCK(inp);
-		if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
+		if (inp->inp_flags & INP_DROPPED) {
 			INP_RUNLOCK(inp);
 			return (ECONNRESET);
 		}
 		tp = intotcpcb(inp);
 		optval = (tp->t_flags & TF_SIGNATURE) ? 1 : 0;
 		INP_RUNLOCK(inp);
 
 		/* The inpcb read lock is already released; copy out unlocked. */
 		return (sooptcopyout(sopt, &optval, sizeof(optval)));
 	}
 
 	error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval));
 	if (error != 0)
 		return (error);
 
 	/* INP_WLOCK_RECHECK */
 	INP_WLOCK(inp);
-	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
+	if (inp->inp_flags & INP_DROPPED) {
 		INP_WUNLOCK(inp);
 		return (ECONNRESET);
 	}
 	tp = intotcpcb(inp);
 	if (optval > 0)
 		tp->t_flags |= TF_SIGNATURE;
 	else
 		tp->t_flags &= ~TF_SIGNATURE;
 
 	INP_WUNLOCK(inp);
 	/* error is necessarily 0 here: the copyin failure returned above. */
 	return (error);
 }
 
 /*
  * Callback function invoked by m_apply() to digest TCP segment data
  * contained within an mbuf chain.
  *
  * fstate	MD5 context to update (tcp_signature_compute() passes &ctx)
  * data		start of the contiguous chunk to hash
  * len		number of bytes in the chunk
  *
  * Always returns 0, i.e. it never reports an error back to m_apply().
  */
 static int
 tcp_signature_apply(void *fstate, void *data, u_int len)
 {
 
 	MD5Update(fstate, (u_char *)data, len);
 	return (0);
 }
 
 #ifdef INET
 /*
  * Feed the IPv4 pseudo-header for the packet in 'm' into the MD5 context.
  *
  * The pseudo-header is built from the source and destination addresses of
  * the IP header at the start of 'm', a zero pad, IPPROTO_TCP, and the
  * packet length minus the IP header length (in network byte order).
  *
  * Returns the IP header length in bytes (ip_hl << 2).
  */
 static int
 ip_pseudo_compute(struct mbuf *m, MD5_CTX *ctx)
 {
 	struct ippseudo ipp;
 	struct ip *ip;
 
 	ip = mtod(m, struct ip *);
 	ipp.ippseudo_src.s_addr = ip->ip_src.s_addr;
 	ipp.ippseudo_dst.s_addr = ip->ip_dst.s_addr;
 	ipp.ippseudo_p = IPPROTO_TCP;
 	ipp.ippseudo_pad = 0;
 	ipp.ippseudo_len = htons(m->m_pkthdr.len - (ip->ip_hl << 2));
 	MD5Update(ctx, (char *)&ipp, sizeof(ipp));
 	return (ip->ip_hl << 2);
 }
 #endif
 
 #ifdef INET6
 /*
  * Feed the IPv6 pseudo-header for the packet in 'm' into the MD5 context.
  *
  * The pseudo-header holds the source and destination addresses of the
  * IPv6 header at the start of 'm', the packet length minus the fixed
  * IPv6 header size, and IPPROTO_TCP, the latter two as 32-bit values in
  * network byte order.
  *
  * Returns sizeof(struct ip6_hdr); extension headers are not accounted
  * for (see the XXX below).
  */
 static int
 ip6_pseudo_compute(struct mbuf *m, MD5_CTX *ctx)
 {
 	struct ip6_pseudo {
 		struct in6_addr src, dst;
 		uint32_t len;
 		uint32_t nxt;
 	} ip6p __aligned(4);
 	struct ip6_hdr *ip6;
 
 	ip6 = mtod(m, struct ip6_hdr *);
 	ip6p.src = ip6->ip6_src;
 	ip6p.dst = ip6->ip6_dst;
 	ip6p.len = htonl(m->m_pkthdr.len - sizeof(*ip6)); /* XXX: ext headers */
 	ip6p.nxt = htonl(IPPROTO_TCP);
 	MD5Update(ctx, (char *)&ip6p, sizeof(ip6p));
 	return (sizeof(*ip6));
 }
 #endif
 
 /*
  * Compute the TCP-MD5 digest of the segment in 'm' and store it in 'buf'.
  *
  * m	mbuf chain starting at the IP or IPv6 header
  * th	TCP header; th_sum is zeroed while it is hashed and then restored
  * sav	security association supplying the address family and the key
  * buf	output for MD5Final()
  *
  * Returns EAFNOSUPPORT, without writing 'buf', when the address family
  * of the SA destination is not handled here; returns 0 otherwise.  On
  * success the transfer is also recorded via key_sa_recordxfer().
  */
 static int
 tcp_signature_compute(struct mbuf *m, struct tcphdr *th,
     struct secasvar *sav, u_char *buf)
 {
 	MD5_CTX ctx;
 	int len;
 	u_short csum;
 
 	MD5Init(&ctx);
 	 /* Step 1: Update MD5 hash with IP(v6) pseudo-header. */
 	switch (sav->sah->saidx.dst.sa.sa_family) {
 #ifdef INET
 	case AF_INET:
 		len = ip_pseudo_compute(m, &ctx);
 		break;
 #endif
 #ifdef INET6
 	case AF_INET6:
 		len = ip6_pseudo_compute(m, &ctx);
 		break;
 #endif
 	default:
 		return (EAFNOSUPPORT);
 	}
 	/*
 	 * Step 2: Update MD5 hash with TCP header, excluding options.
 	 * The TCP checksum must be set to zero.
 	 */
 	csum = th->th_sum;
 	th->th_sum = 0;
 	MD5Update(&ctx, (char *)th, sizeof(struct tcphdr));
 	th->th_sum = csum;
 	/*
 	 * Step 3: Update MD5 hash with TCP segment data.
 	 * Use m_apply() to avoid an early m_pullup().
 	 *
 	 * 'len' becomes the IP header length plus th_off << 2, so hashing
 	 * starts after the whole TCP header and the options are skipped.
 	 */
 	len += (th->th_off << 2);
 	if (m->m_pkthdr.len - len > 0)
 		m_apply(m, len, m->m_pkthdr.len - len,
 		    tcp_signature_apply, &ctx);
 	/*
 	 * Step 4: Update MD5 hash with shared secret.
 	 */
 	MD5Update(&ctx, sav->key_auth->key_data, _KEYLEN(sav->key_auth));
 	MD5Final(buf, &ctx);
 	key_sa_recordxfer(sav, m);
 	return (0);
 }
 
 /*
  * Fill 'src' and 'dst' with the addresses of the packet in 'm'.
  *
  * The IP version field at the start of the mbuf selects between
  * ipsec4_setsockaddrs() and ipsec6_setsockaddrs().  For any version that
  * is not handled (including one compiled out), both outputs are zeroed.
  */
 static void
 setsockaddrs(const struct mbuf *m, union sockaddr_union *src,
     union sockaddr_union *dst)
 {
 	struct ip *ip;
 
 	IPSEC_ASSERT(m->m_len >= sizeof(*ip), ("unexpected mbuf len"));
 
 	/* Only ip_v is read through this pointer, so it is used for IPv6 too. */
 	ip = mtod(m, struct ip *);
 	switch (ip->ip_v) {
 #ifdef INET
 	case IPVERSION:
 		ipsec4_setsockaddrs(m, src, dst);
 		break;
 #endif
 #ifdef INET6
 	case (IPV6_VERSION >> 4):
 		ipsec6_setsockaddrs(m, src, dst);
 		break;
 #endif
 	default:
 		bzero(src, sizeof(*src));
 		bzero(dst, sizeof(*dst));
 	}
 }
 
 /*
  * Compute TCP-MD5 hash of an *INBOUND* TCP segment and verify it.
  * Parameters:
  * m		pointer to head of mbuf chain
  * th		pointer to TCP header
  * buf		pointer to the digest the computed one is compared against;
  *		NULL is counted as tcps_sig_err_nosigopt and rejected
  *
  * Return 0 if the first TCP_SIGLEN bytes of buf equal the computed
  * digest.  Otherwise return ENOENT when no SA is found for the packet
  * addresses, or EACCES when buf is NULL or the digests differ.
  */
 static int
 tcp_ipsec_input(struct mbuf *m, struct tcphdr *th, u_char *buf)
 {
 	char tmpdigest[TCP_SIGLEN];
 	struct secasindex saidx;
 	struct secasvar *sav;
 
 	setsockaddrs(m, &saidx.src, &saidx.dst);
 	saidx.proto = IPPROTO_TCP;
 	saidx.mode = IPSEC_MODE_TCPMD5;
 	saidx.reqid = 0;
 	sav = key_allocsa_tcpmd5(&saidx);
 	if (sav == NULL) {
 		KMOD_TCPSTAT_INC(tcps_sig_err_buildsig);
 		return (ENOENT);
 	}
 	if (buf == NULL) {
 		key_freesav(&sav);
 		KMOD_TCPSTAT_INC(tcps_sig_err_nosigopt);
 		return (EACCES);
 	}
 	/*
 	 * tcp_input() operates with TCP header fields in host
 	 * byte order. We expect them in network byte order.
 	 */
 	tcp_fields_to_net(th);
 	/*
 	 * NOTE(review): the return value of tcp_signature_compute() is
 	 * ignored.  On its EAFNOSUPPORT path tmpdigest is never written,
 	 * so the bcmp() below would read uninitialized stack bytes.
 	 * Confirm that path cannot be reached with an SA returned by
 	 * key_allocsa_tcpmd5(), or check the result here.
 	 */
 	tcp_signature_compute(m, th, sav, tmpdigest);
 	tcp_fields_to_host(th);
 	key_freesav(&sav);
 	if (bcmp(buf, tmpdigest, TCP_SIGLEN) != 0) {
 		KMOD_TCPSTAT_INC(tcps_sig_rcvbadsig);
 		return (EACCES);
 	}
 	KMOD_TCPSTAT_INC(tcps_sig_rcvgoodsig);
 	return (0);
 }
 
 /*
  * Compute TCP-MD5 hash of an *OUTBOUND* TCP segment.
  * Parameters:
  * m		pointer to head of mbuf chain
  * th		pointer to TCP header
  * buf		pointer to storage for computed MD5 digest
  *
  * Return 0 once an SA has been found, or ENOENT (counted as
  * tcps_sig_err_buildsig) when there is none for the packet addresses.
  */
 static int
 tcp_ipsec_output(struct mbuf *m, struct tcphdr *th, u_char *buf)
 {
 	struct secasindex saidx;
 	struct secasvar *sav;
 
 	setsockaddrs(m, &saidx.src, &saidx.dst);
 	saidx.proto = IPPROTO_TCP;
 	saidx.mode = IPSEC_MODE_TCPMD5;
 	saidx.reqid = 0;
 	sav = key_allocsa_tcpmd5(&saidx);
 	if (sav == NULL) {
 		KMOD_TCPSTAT_INC(tcps_sig_err_buildsig);
 		return (ENOENT);
 	}
 	/*
 	 * NOTE(review): the result of tcp_signature_compute() is dropped,
 	 * so its EAFNOSUPPORT path (buf left unwritten) is still reported
 	 * as success here.  Confirm that path is unreachable for this SA.
 	 */
 	tcp_signature_compute(m, th, sav, buf);
 	key_freesav(&sav);
 	return (0);
 }
 
 /*
  * Initialize a TCP-MD5 SA. Called when the SA is being set up.
  *
  * We don't need to set up the tdb prefixed fields, as we don't use the
  * opencrypto code; we just perform a key length check.
  *
  * XXX: Currently we have used single 'magic' SPI and need to still
  * support this.
  *
  * This allows per-host granularity without affecting the userland
  * interface, which is a simple socket option toggle switch,
  * TCP_SIGNATURE_ENABLE.
  *
  * To allow per-service granularity requires that we have a means
  * of mapping port to SPI. The mandated way of doing this is to
  * use SPD entries to specify packet flows which get the TCP-MD5
  * treatment, however the code to do this is currently unstable
  * and unsuitable for production use.
  *
  * Therefore we use this compromise in the meantime.
  *
  * Returns EINVAL when the SA does not use SADB_X_AALG_TCP_MD5, has no
  * authentication key, or has a key length outside
  * [TCP_KEYLEN_MIN, TCP_KEYLEN_MAX].  On success records 'xsp' in
  * sav->tdb_xform and returns 0.
  */
 static int
 tcpsignature_init(struct secasvar *sav, struct xformsw *xsp)
 {
 	int keylen;
 
 	if (sav->alg_auth != SADB_X_AALG_TCP_MD5) {
 		DPRINTF(("%s: unsupported authentication algorithm %u\n",
 		    __func__, sav->alg_auth));
 		return (EINVAL);
 	}
 	if (sav->key_auth == NULL) {
 		DPRINTF(("%s: no authentication key present\n", __func__));
 		return (EINVAL);
 	}
 	keylen = _KEYLEN(sav->key_auth);
 	if ((keylen < TCP_KEYLEN_MIN) || (keylen > TCP_KEYLEN_MAX)) {
 		DPRINTF(("%s: invalid key length %u\n", __func__, keylen));
 		return (EINVAL);
 	}
 	sav->tdb_xform = xsp;
 	return (0);
 }
 
 /*
  * Called when the SA is deleted.
  *
  * Intentionally empty: tcpsignature_init() only validates the SA and
  * stores a pointer, so there is nothing to release here.
  */
 static void
 tcpsignature_cleanup(struct secasvar *sav)
 {
 }
 
 /*
  * Transform switch entry for TCP-MD5; attached and detached with
  * xform_attach()/xform_detach() from tcpmd5_modevent().
  */
 static struct xformsw tcpsignature_xformsw = {
 	.xf_type =	XF_TCPSIGNATURE,
 	.xf_name =	"TCP-MD5",
 	.xf_init =	tcpsignature_init,
 	.xf_cleanup =	tcpsignature_cleanup,
 };
 
 /*
  * TCP-MD5 entry points of this file: inbound verification, outbound
  * signing and the TCP_MD5SIG socket option handler.
  */
 static const struct tcpmd5_methods tcpmd5_methods = {
 	.input = tcp_ipsec_input,
 	.output = tcp_ipsec_output,
 	.pcbctl = tcp_ipsec_pcbctl,
 };
 
 #ifndef KLD_MODULE
 /* TCP-MD5 support is built into the kernel */
 static const struct tcpmd5_support tcpmd5_ipsec = {
 	.enabled = IPSEC_MODULE_ENABLED,
 	.methods = &tcpmd5_methods
 };
 /* Exported handle pointing at the statically initialized support block. */
 const struct tcpmd5_support * const tcp_ipsec_support = &tcpmd5_ipsec;
 #endif /* !KLD_MODULE */
 
 /*
  * Module event handler.
  *
  * MOD_LOAD attaches the TCP-MD5 xform and, in the KLD_MODULE build,
  * enables the method table.  MOD_UNLOAD undoes both in the reverse
  * order.  Any other event type yields EOPNOTSUPP; handled events
  * return 0.
  */
 static int
 tcpmd5_modevent(module_t mod, int type, void *data)
 {
 
 	switch (type) {
 	case MOD_LOAD:
 		xform_attach(&tcpsignature_xformsw);
 #ifdef KLD_MODULE
 		tcpmd5_support_enable(&tcpmd5_methods);
 #endif
 		break;
 	case MOD_UNLOAD:
 #ifdef KLD_MODULE
 		tcpmd5_support_disable();
 #endif
 		xform_detach(&tcpsignature_xformsw);
 		break;
 	default:
 		return (EOPNOTSUPP);
 	}
 	return (0);
 }
 
 /* Module glue: "tcpmd5" is driven by tcpmd5_modevent(). */
 static moduledata_t tcpmd5_mod = {
 	"tcpmd5",
 	tcpmd5_modevent,
 	0
 };
 
 DECLARE_MODULE(tcpmd5, tcpmd5_mod, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY);
 MODULE_VERSION(tcpmd5, 1);
 /* Only the KLD_MODULE build declares a dependency on ipsec_support. */
 #ifdef KLD_MODULE
 MODULE_DEPEND(tcpmd5, ipsec_support, 1, 1, 1);
 #endif